# -*- coding:utf-8 -*- ############################################################################# # Copyright (c) 2020 Huawei Technologies Co.,Ltd. # # openGauss is licensed under Mulan PSL v2. # You can use this software according to the terms # and conditions of the Mulan PSL v2. # You may obtain a copy of Mulan PSL v2 at: # # http://license.coscl.org.cn/MulanPSL2 # # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, # WITHOUT WARRANTIES OF ANY KIND, # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- # Description : ExpansionImpl.py ############################################################################# from enum import Flag import subprocess import sys import re import os import getpass import pwd import datetime import weakref import time import grp import socket import stat from multiprocessing import Process, Value sys.path.append(sys.path[0] + "/../../../../") from gspylib.common.DbClusterInfo import dbClusterInfo from gspylib.threads.SshTool import SshTool from gspylib.common.ErrorCode import ErrorCode from gspylib.common.Common import DefaultValue from gspylib.common.GaussLog import GaussLog from gspylib.os.gsOSlib import g_OSlib import impl.upgrade.UpgradeConst as Const from gspylib.common.OMCommand import OMCommand from gspylib.os.gsfile import g_file from domain_utils.cluster_file.cluster_dir import ClusterDir from base_utils.os.env_util import EnvUtil #boot/build mode MODE_PRIMARY = "primary" MODE_STANDBY = "standby" MODE_NORMAL = "normal" MODE_CASCADE = "cascade_standby" # instance local_role ROLE_NORMAL = "normal" ROLE_PRIMARY = "primary" ROLE_STANDBY = "standby" ROLE_CASCADE = "cascade standby" #db state STATE_NORMAL = "normal" STATE_STARTING = "starting" STATE_CATCHUP = "catchup" # master MASTER_INSTANCE = 0 # standby STANDBY_INSTANCE = 1 # statu failed STATUS_FAIL = "Failure" BASE_ID_DATANODE = 6001 MAX_DATANODE_NUM = 9 ACTION_INSTALL_CLUSTER = "install_cluster" class ExpansionImpl(): """ class for expansion standby node. step: 1. preinstall database on new standby node 2. install as single-node database 3. establish primary-standby relationship of all node """ def __init__(self, expansion): """ """ self.context = expansion self.user = self.context.user self.group = self.context.group self.existingHosts = [] self.expansionSuccess = {} for newHost in self.context.newHostList: self.expansionSuccess[newHost] = False self.logger = self.context.logger envFile = EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH") if envFile: self.envFile = envFile else: userpath = pwd.getpwnam(self.user).pw_dir mpprcFile = os.path.join(userpath, ".bashrc") self.envFile = mpprcFile currentTime = str(datetime.datetime.now()).replace(" ", "_").replace( ".", "_").replace(":", "_") self.commonGsCtl = GsCtlCommon(expansion) self.tempFileDir = "/tmp/gs_expansion_%s" % (currentTime) self.remote_pkg_dir = os.path.join(self.tempFileDir, "pkg") self.logger.debug("tmp expansion dir is %s ." % self.tempFileDir) # primary's wal_keep_segments value self.walKeepSegments = -1 self._finalizer = weakref.finalize(self, self.final) globals()["paramiko"] = __import__("paramiko") def queryPrimaryWalKeepSegments(self): """ query primary's wal_keep_segments, when current user is root """ primaryHostName = self.getPrimaryHostName() primaryHostIp = self.context.clusterInfoDict[primaryHostName]["backIp"] primaryDataNode = self.context.clusterInfoDict[primaryHostName]["dataNode"] status, walKeepSegments = self.commonGsCtl.queryGucParaValue( primaryHostIp, self.envFile, primaryDataNode, "wal_keep_segments", self.user) if status != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50021"] % "wal_keep_segments") return walKeepSegments def rollbackPrimaryWalKeepSegments(self): """ rollback primary's wal_keep_segments, when current user is root """ self.logger.debug("Start to rollback primary's wal_keep_segments") primary = self.getPrimaryHostName() primaryDataNode = self.context.clusterInfoDict[primary]["dataNode"] status = self.commonGsCtl.setGucPara(primary, self.envFile, primaryDataNode, "wal_keep_segments", self.walKeepSegments, self.user) if status != DefaultValue.SUCCESS: self.logger.log("Failed to rollback wal_keep_segments, please manually " "set it to original value %s." % self.walKeepSegments) else: self.reloadPrimaryConf(self.user) def final(self): """ function: 1. Make sure primary's wal_keep_segments is restored to its original value if it has been changed, 2. rollback, 3. clear temp file input : NA output: NA """ if self.walKeepSegments != -1: currentWalKeepSegments = self.queryPrimaryWalKeepSegments() if currentWalKeepSegments != "NULL" \ and self.walKeepSegments != int(currentWalKeepSegments): self.rollbackPrimaryWalKeepSegments() self.rollback() self.clearTmpFile() def sendSoftToHosts(self, send_pkg=True): """ create software dir and send it on each nodes """ self.logger.log("Start to send soft to each standby nodes.") srcFile = self.context.packagepath pkgfiles = self.generatePackages(srcFile) time_out = self.context.time_out if self.context.time_out else 300 for host in self.context.newHostList: sshTool = SshTool([host], timeout=time_out) # mkdir package dir and send package to remote nodes. sshTool.executeCommand("umask 0022;mkdir -m a+x -p %s; chown %s:%s %s" % \ (self.remote_pkg_dir, self.user, self.group, self.tempFileDir), DefaultValue.SUCCESS, [host]) if send_pkg: for file in pkgfiles: if not os.path.exists(file): GaussLog.exitWithError("Package [%s] is not found." % file) sshTool.scpFiles(file, self.remote_pkg_dir, [host]) sshTool.executeCommand("cd %s;tar -xf %s" % (self.remote_pkg_dir, os.path.basename(pkgfiles[0])), DefaultValue.SUCCESS, [host]) self.cleanSshToolFile(sshTool) self.logger.log("End to send soft to each standby nodes.") def generatePackages(self, pkgdir): bz2_file = g_OSlib.getBz2FilePath() bz2_sha_file = g_OSlib.getSHA256FilePath() upgrade_sql_file = os.path.join(pkgdir, Const.UPGRADE_SQL_FILE) upgrade_sha_file = os.path.join(pkgdir, Const.UPGRADE_SQL_SHA) om_file = bz2_sha_file.replace(".sha256", "-om.tar.gz") return [om_file, bz2_file, bz2_sha_file, upgrade_sql_file, upgrade_sha_file] def generateAndSendXmlFile(self): """ """ self.logger.debug("Start to generateAndSend XML file.\n") tempXmlFile = "%s/clusterconfig.xml" % self.tempFileDir cmd = "mkdir -p %s; touch %s; cat /dev/null > %s" % \ (self.tempFileDir, tempXmlFile, tempXmlFile) (status, output) = subprocess.getstatusoutput(cmd) cmd = "chown -R %s:%s %s" % (self.user, self.group, self.tempFileDir) (status, output) = subprocess.getstatusoutput(cmd) newHosts = self.context.newHostList for host in newHosts: # create single deploy xml file for each standby node xmlContent = self.__generateXml(host) with os.fdopen(os.open("%s" % tempXmlFile, os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR),'w') as fo: fo.write(xmlContent) fo.close() # send single deploy xml file to each standby node sshTool = SshTool([host]) retmap, output = sshTool.getSshStatusOutput("mkdir -p %s" % self.tempFileDir , [host], self.envFile) retmap, output = sshTool.getSshStatusOutput("chown %s:%s %s" % (self.user, self.group, self.tempFileDir), [host], self.envFile) sshTool.scpFiles("%s" % tempXmlFile, "%s" % tempXmlFile, [host], self.envFile) self.cleanSshToolFile(sshTool) self.logger.debug("End to generateAndSend XML file.\n") def __generateXml(self, backIp): """ """ nodeName = self.context.backIpNameMap[backIp] nodeInfo = self.context.clusterInfoDict[nodeName] clusterName = self.context.clusterInfo.name backIp = nodeInfo["backIp"] sshIp = nodeInfo["sshIp"] port = nodeInfo["port"] dataNode = nodeInfo["dataNode"] appPath = self.context.clusterInfoDict["appPath"] logPath = self.context.clusterInfoDict["logPath"] corePath = self.context.clusterInfoDict["corePath"] toolPath = self.context.clusterInfoDict["toolPath"] mppdbconfig = "" tmpMppdbPath = EnvUtil.getEnv("PGHOST") if tmpMppdbPath: mppdbconfig = '' % tmpMppdbPath azName = self.context.hostAzNameMap[backIp] azPriority = nodeInfo["azPriority"] xmlConfig = """\ {mappdbConfig} """.format(clusterName = clusterName, nodeName = nodeName, backIp = backIp, appPath = appPath, logPath = logPath, toolPath = toolPath, corePath = corePath, sshIp = sshIp, port = port, dataNode = dataNode, azName = azName, azPriority = azPriority, mappdbConfig = mppdbconfig) return xmlConfig def changeUser(self): user = self.user try: pw_record = pwd.getpwnam(user) except Exception: GaussLog.exitWithError(ErrorCode.GAUSS_503["GAUSS_50300"] % user) user_name = pw_record.pw_name user_uid = pw_record.pw_uid user_gid = pw_record.pw_gid os.setgid(user_gid) os.setuid(user_uid) os.environ["HOME"] = pw_record.pw_dir os.environ["USER"] = user_name os.environ["LOGNAME"] = user_name os.environ["SHELL"] = pw_record.pw_shell def hasNormalStandbyInAZOfCascade(self, cascadeIp, existingStandbys): """ check whether there are normal standbies in hostAzNameMap[cascadeIp] azZone """ hasStandbyWithSameAZ = False hostAzNameMap = self.context.hostAzNameMap for existingStandby in existingStandbys: existingStandbyName = self.context.backIpNameMap[existingStandby] existingStandbyDataNode = \ self.context.clusterInfoDict[existingStandbyName]["dataNode"] insType, dbState = self.commonGsCtl.queryInstanceStatus( existingStandby, existingStandbyDataNode, self.envFile) if dbState != STATE_NORMAL: continue if hostAzNameMap[cascadeIp] != hostAzNameMap[existingStandby]: continue hasStandbyWithSameAZ = True break return hasStandbyWithSameAZ def getIncreaseAppNames(self, num): """ the default new database application_name is 'dn_6001' which same with primary host. It case standby node cannot set synchronization by name. """ clusterInfo = dbClusterInfo() appPath = self.context.clusterInfoDict["appPath"] staticFile = os.path.join(appPath, "bin", "cluster_static_config") clusterInfo.initFromStaticConfigWithoutUser(staticFile) dbNodes = clusterInfo.dbNodes newInsIds = [] existInsIds = [] for dbNode in dbNodes: for dnInst in dbNode.datanodes: existInsIds.append(int(dnInst.instanceId)) idx = 0 while idx <= MAX_DATANODE_NUM and num > 0: insId = BASE_ID_DATANODE + idx if insId not in existInsIds: newInsIds.append(insId) existInsIds.append(insId) num -= 1 idx += 1 return newInsIds def installDatabaseOnHosts(self): """ install database on each standby node """ standbyHosts = self.context.newHostList tempXmlFile = "%s/clusterconfig.xml" % self.tempFileDir primaryHostName = self.getPrimaryHostName() primaryHostIp = self.context.clusterInfoDict[primaryHostName]["backIp"] existingStandbys = list(set(self.existingHosts) - (set([primaryHostIp]))) failedInstallHosts = [] notInstalledCascadeHosts = [] for newHost,appName in zip(standbyHosts, \ self.getIncreaseAppNames(len(standbyHosts))): if not self.expansionSuccess[newHost]: continue log_path = ClusterDir.getUserLogDirWithUser(self.user) log_dir = "%s/pg_log/dn_%d" % (log_path, appName) audit_dir = "%s/pg_audit/dn_%d" % (log_path, appName) hostName = self.context.backIpNameMap[newHost] sshIp = self.context.clusterInfoDict[hostName]["sshIp"] port = self.context.clusterInfoDict[hostName]["port"] if self.context.newHostCasRoleMap[newHost] == "on": # check whether there are normal standbies in hostAzNameMap[host] azZone hasStandbyWithSameAZ = self.hasNormalStandbyInAZOfCascade(newHost, existingStandbys) if not hasStandbyWithSameAZ: notInstalledCascadeHosts.append(newHost) self.expansionSuccess[newHost] = False continue ssh_tool = SshTool([sshIp], timeout=300) # installing applications cmd = "source %s;" % self.envFile cmd += "%s -t %s -U %s -X %s -R %s -c %s -l %s" % ( OMCommand.getLocalScript("Local_Install"), ACTION_INSTALL_CLUSTER, self.user + ":" + self.group, tempXmlFile, self.context.clusterInfoDict["appPath"], EnvUtil.getEnvironmentParameterValue("GS_CLUSTER_NAME", self.user), self.context.localLog) self.logger.debug( "Command for installing application: %s" % cmd) result_map, output = ssh_tool.getSshStatusOutput(cmd, [], self.envFile) if result_map[sshIp] != DefaultValue.SUCCESS: self.logger.debug("install application failed: %s %s" % (newHost, output)) self.expansionSuccess[newHost] = False failedInstallHosts.append(newHost) continue # send ca file dir ca_file_dir = os.path.realpath(os.path.join( self.context.clusterInfoDict["appPath"], "share", "sslcert")) self.logger.debug( "Command for sending ca file dir: %s" % ca_file_dir) ssh_tool.scpFiles(ca_file_dir, os.path.dirname(ca_file_dir), [sshIp]) # init database datanode cmd = "source {0}; " \ "{1} -U {2} -l {3}".format(self.envFile, OMCommand.getLocalScript("Local_Init_Instance"), self.user, self.context.localLog) self.logger.debug( "Command for installing database datanode: %s" % cmd) result_map, output = ssh_tool.getSshStatusOutput(cmd, [], self.envFile) if result_map[sshIp] != DefaultValue.SUCCESS: self.logger.debug("install datanode failed: %s %s" % (newHost, output)) self.expansionSuccess[newHost] = False failedInstallHosts.append(newHost) continue # set guc config inst_dir = self.context.clusterInfoDict[hostName]["dataNode"] guc_path = os.path.join(self.context.clusterInfoDict["appPath"], "bin", "gs_guc") para_str = " -c \"application_name='dn_{0}'\" " \ "-c \"log_directory='{1}'\" " \ " -c \"audit_directory='{2}'\" " \ " -c \"listen_addresses='localhost,{3}'\"" \ " -c \"port='{4}'\"" \ "".format(appName, log_dir, audit_dir, newHost, port) cmd = "source {0}; {1} set -D {2} {3}".format(self.envFile, guc_path, inst_dir, para_str) self.logger.debug( "Command for set guc params: %s" % cmd) self.guc_executor(ssh_tool, cmd, sshIp) self.logger.log("%s install success." % newHost) if self.context.newHostCasRoleMap[newHost] == "off": existingStandbys.append(newHost) if notInstalledCascadeHosts: self.logger.log("OpenGauss won't be installed on cascade_standby" " %s, because there is no Normal standby in the same azZone." % ", ".join(notInstalledCascadeHosts)) if failedInstallHosts: self.logger.log(ErrorCode.GAUSS_527["GAUSS_52707"] % ", ".join(failedInstallHosts)) self.logger.log("Finish to install database on all nodes.") if self._isAllFailed(): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] % "install") def preInstallOnHosts(self): """ execute preinstall step """ self.logger.log("Start to preinstall database step.") tempXmlFile = "%s/clusterconfig.xml" % self.tempFileDir preinstallCmd = "{softPath}/script/gs_preinstall -U {user} -G {group} -X {xmlFile} " \ "--non-interactive".format(softPath=self.remote_pkg_dir, user=self.user, group=self.group, xmlFile=tempXmlFile) if EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH"): preinstallCmd += " --sep-env-file={envFile}".format(envFile = self.envFile) if not os.listdir(os.path.join(EnvUtil.getEnv("GPHOME"),"lib")): preinstallCmd += " --unused-third-party" preinstallCmd += " --skip-hostname-set 2>&1" failedPreinstallHosts = [] for host in self.context.newHostList: sshTool = SshTool([host], timeout = 300) resultMap, output = sshTool.getSshStatusOutput(preinstallCmd, [], self.envFile) self.logger.debug(resultMap) self.logger.debug(output) if resultMap[host] == DefaultValue.SUCCESS: self.expansionSuccess[host] = True self.logger.log("Preinstall %s success" % host) else: failedPreinstallHosts.append(host) self.cleanSshToolFile(sshTool) if failedPreinstallHosts: self.logger.log("Failed to preinstall on: \n%s" % ", ".join(failedPreinstallHosts)) self.logger.log("End to preinstall database step.") def buildStandbyRelation(self): """ func: after install single database on standby nodes. build the relation with primary and standby nodes. step: 1. set all nodes' guc config parameter: replconninfo, available_zone(only for new) 2. add trust on all hosts 3. generate GRPC cert on new hosts, and primary if current cluster is single instance 4. build new hosts : (1) restart new instance with standby mode (2) build new instances 5. generate cluster static file and send to each node. """ self.setGucConfig() self.addTrust() if DefaultValue.is_create_grpc(self.logger, self.context.clusterInfo.appPath): self.generateGRPCCert() self.distributeCipherFile() self.buildStandbyHosts() self.generateClusterStaticFile() def getExistingHosts(self, isRootUser=True): """ get the exiting hosts """ self.logger.debug("Get the existing hosts.") primaryHost = self.getPrimaryHostName() command = "" if EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH"): command = "source %s;gs_om -t status --detail" % self.envFile else: command = "source /etc/profile;source %s;"\ "gs_om -t status --detail" % self.envFile if isRootUser: command = "su - %s -c '%s'" % (self.user, command) self.logger.debug(command) sshTool = SshTool([primaryHost]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [primaryHost], self.envFile) self.cleanSshToolFile(sshTool) self.logger.debug("Expansion cluster status result:{0}".format(resultMap)) self.logger.debug("Expansion cluster status output:{0}".format(outputCollect)) if resultMap[primaryHost] != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"]) instances = re.split('(?:\|)|(?:\n)', outputCollect) self.existingHosts = [] pattern = re.compile('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*') for inst in instances: existing_hosts_ip = pattern.findall(inst) if len(existing_hosts_ip) != 0: self.existingHosts.append(existing_hosts_ip[0]) self.existingHosts = list(set(self.existingHosts)) def setGucConfig(self): """ set replconninfo on all hosts """ self.logger.debug("Start to set GUC config on all hosts.\n") gucDict = self.getGUCConfig() tempShFile = "%s/guc.sh" % self.tempFileDir hostIpList = list(self.existingHosts) for host in self.expansionSuccess: if self.expansionSuccess[host]: hostIpList.append(host) nodeDict = self.context.clusterInfoDict backIpNameMap = self.context.backIpNameMap hostAzNameMap = self.context.hostAzNameMap for host in hostIpList: hostName = backIpNameMap[host] # set Available_zone for the new standby if host in self.context.newHostList: dataNode = nodeDict[hostName]["dataNode"] gucDict[hostName] += """\ gs_guc set -D {dn} -c "available_zone='{azName}'" """.format(dn=dataNode, azName=hostAzNameMap[host]) command = "source %s ; " % self.envFile + gucDict[hostName] self.logger.debug("[%s] guc command is:%s" % (host, command)) sshTool = SshTool([host]) # create temporary dir to save guc command bashfile. mkdirCmd = "mkdir -m a+x -p %s; chown %s:%s %s" % \ (self.tempFileDir, self.user, self.group, self.tempFileDir) sshTool.getSshStatusOutput(mkdirCmd, [host], self.envFile) exitcode, output = subprocess.getstatusoutput("if [ ! -e '%s' ]; then mkdir -m a+x -p %s;" " fi; touch %s; cat /dev/null > %s" % (self.tempFileDir, self.tempFileDir, tempShFile, tempShFile)) if exitcode != 0: self.expansionSuccess[host] = False self.logger.debug("Failed to create temp file guc.sh.") self.logger.debug(exitcode) self.logger.debug(output) continue with os.fdopen(os.open("%s" % tempShFile, os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fo: fo.write("#bash\n") fo.write(command) fo.close() # send guc command bashfile to each host and execute it. sshTool.scpFiles("%s" % tempShFile, "%s" % tempShFile, [host], self.envFile) resultMap, outputCollect = sshTool.getSshStatusOutput( "sh %s" % tempShFile, [host], self.envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) self.cleanSshToolFile(sshTool) self.logger.debug("Set guc result: {0}".format(self.expansionSuccess)) if self._isAllFailed(): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] % "set guc") def addTrust(self): """ add authentication rules about new host ip in existing hosts and add authentication rules about other all hosts ip in new hosts """ self.logger.debug("Start to set host trust on all node.") allHosts = list(self.existingHosts) for host in self.context.newHostList: if self.expansionSuccess[host]: allHosts.append(host) for hostExec in allHosts: hostExecName = self.context.backIpNameMap[hostExec] dataNode = self.context.clusterInfoDict[hostExecName]["dataNode"] cmd = "source %s;gs_guc set -D %s" % (self.envFile, dataNode) if hostExec in self.existingHosts: for hostParam in self.context.newHostList: cmd += " -h 'host all all %s/32 trust'" % \ hostParam else: for hostParam in allHosts: if hostExec != hostParam: cmd += " -h 'host all all %s/32 trust'" % \ hostParam self.logger.debug("[%s] trustCmd:%s" % (hostExec, cmd)) sshTool = SshTool([hostExec]) sshTool.getSshStatusOutput(cmd, [hostExec], self.envFile) self.cleanSshToolFile(sshTool) self.logger.debug("End to set host trust on all node.") def generateGRPCCert(self): """ generate GRPC cert for single node """ primaryHost = self.getPrimaryHostName() dataNode = self.context.clusterInfoDict[primaryHost]["dataNode"] needGRPCHosts = [] for host in self.expansionSuccess: if self.expansionSuccess[host]: needGRPCHosts.append(host) insType, _ = self.commonGsCtl.queryInstanceStatus(primaryHost, dataNode,self.envFile) if insType != MODE_PRIMARY: primaryHostIp = self.context.clusterInfoDict[primaryHost]["backIp"] needGRPCHosts.append(primaryHostIp) self.logger.debug("Start to generate GRPC cert.") if needGRPCHosts: self.context.initSshTool(needGRPCHosts, DefaultValue.TIMEOUT_PSSH_INSTALL) self.context.createGrpcCa(needGRPCHosts) self.logger.debug("End to generate GRPC cert.") def distributeCipherFile(self): """ distribute cipher file to new host """ hostList = [] for host in self.expansionSuccess: if self.expansionSuccess[host]: hostList.append(host) if len(hostList) == 0: return self.logger.debug("Start to distribute cipher file.") cipherFileList = ["datasource.key.cipher", "datasource.key.rand", "usermapping.key.cipher", "usermapping.key.rand", "subscription.key.cipher", "subscription.key.rand"] sshTool = SshTool(hostList) appPath = self.context.clusterInfoDict["appPath"] filePath = os.path.join(appPath, "bin") for cipherFile in cipherFileList: scpFile = os.path.join(filePath, "%s" % cipherFile) self.logger.debug("try to send file: %s" % scpFile) if os.path.exists(scpFile): sshTool.scpFiles(scpFile, filePath, hostList) self.logger.debug("End to distribute cipher file.") def reloadPrimaryConf(self, user=""): """ """ primaryHost = self.getPrimaryHostName() dataNode = self.context.clusterInfoDict[primaryHost]["dataNode"] command = "" if user: command = "su - %s -c 'source %s;gs_ctl reload -D %s'" % \ (user, self.envFile, dataNode) else: command = "gs_ctl reload -D %s " % dataNode sshTool = SshTool([primaryHost]) self.logger.debug(command) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [primaryHost], self.envFile) self.logger.debug(outputCollect) self.cleanSshToolFile(sshTool) def getPrimaryHostName(self): """ """ primaryHost = "" for nodeName in self.context.nodeNameList: if self.context.clusterInfoDict[nodeName]["instanceType"] \ == MASTER_INSTANCE: primaryHost = nodeName break return primaryHost def buildStandbyHosts(self): """ stop the new standby host`s database and build it as standby mode """ self.logger.debug("Start to build new nodes.") standbyHosts = self.context.newHostList hostAzNameMap = self.context.hostAzNameMap primaryHostName = self.getPrimaryHostName() primaryHost = self.context.clusterInfoDict[primaryHostName]["backIp"] existingStandbys = list(set(self.existingHosts).difference(set([primaryHost]))) primaryDataNode = self.context.clusterInfoDict[primaryHostName]["dataNode"] walKeepSegmentsChanged = False status, synchronous_commit = self.commonGsCtl.queryGucParaValue( primaryHost, self.envFile, primaryDataNode, "synchronous_commit") if status != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50021"] % "synchronous_commit") if synchronous_commit == "off" and self.walKeepSegments < 1024: status = self.commonGsCtl.setGucPara(primaryHost, self.envFile, primaryDataNode, "wal_keep_segments", 1024) if status != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50007"] % "wal_keep_segments") walKeepSegmentsChanged = True self.reloadPrimaryConf() time.sleep(10) insType, dbState = self.commonGsCtl.queryInstanceStatus( primaryHost, primaryDataNode, self.envFile) primaryExceptionInfo = "" if insType != ROLE_PRIMARY: primaryExceptionInfo = ErrorCode.GAUSS_357["GAUSS_35709"] % \ ("local_role", "primary", "primary") if dbState != STATE_NORMAL: primaryExceptionInfo = ErrorCode.GAUSS_357["GAUSS_35709"] % \ ("db_state", "primary", "Normal") if primaryExceptionInfo != "": GaussLog.exitWithError(primaryExceptionInfo) waitChars = ["\\", "|", "/", "-"] for host in standbyHosts: if not self.expansionSuccess[host]: continue hostName = self.context.backIpNameMap[host] dataNode = self.context.clusterInfoDict[hostName]["dataNode"] buildMode = "" hostRole = "" if self.context.newHostCasRoleMap[host] == "on": buildMode = MODE_CASCADE hostRole = ROLE_CASCADE # check whether there are normal standbies in hostAzNameMap[host] azZone hasStandbyWithSameAZ = self.hasNormalStandbyInAZOfCascade(host, existingStandbys) if not hasStandbyWithSameAZ: self.logger.log("There is no Normal standby in %s" % hostAzNameMap[host]) self.expansionSuccess[host] = False continue else: buildMode = MODE_STANDBY hostRole = ROLE_STANDBY self.logger.log("Start to build %s %s." % (hostRole, host)) self.checkTmpDir(hostName) # start new host as standby mode self.commonGsCtl.stopInstance(hostName, dataNode, self.envFile) result, output = self.commonGsCtl.startInstanceWithMode(host, dataNode, MODE_STANDBY, self.envFile) if result[host] != DefaultValue.SUCCESS: if "Uncompleted build is detected" not in output: self.expansionSuccess[host] = False self.logger.log("Failed to start %s as standby " "before building." % host) continue else: self.logger.debug("Uncompleted build is detected on %s." % host) else: insType, dbState = self.commonGsCtl.queryInstanceStatus( hostName, dataNode, self.envFile) if insType != ROLE_STANDBY: self.logger.log("Build %s failed." % host) self.expansionSuccess[host] = False continue # build new host sshTool = SshTool([host]) tempShFile = "%s/buildStandby.sh" % self.tempFileDir # create temporary dir to save gs_ctl build command bashfile. mkdirCmd = "mkdir -m a+x -p %s; chown %s:%s %s" % \ (self.tempFileDir, self.user, self.group, self.tempFileDir) sshTool.getSshStatusOutput(mkdirCmd, [host], self.envFile) subprocess.getstatusoutput("touch %s; cat /dev/null > %s" % (tempShFile, tempShFile)) buildCmd = "gs_ctl build -D %s -M %s" % (dataNode, buildMode) gs_ctlBuildCmd = "source %s ;nohup " % self.envFile + buildCmd + " 1>/dev/null 2>/dev/null &" self.logger.debug("[%s] gs_ctlBuildCmd: %s" % (host, gs_ctlBuildCmd)) with os.fdopen(os.open("%s" % tempShFile, os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR),'w') as fo: fo.write("#bash\n") fo.write(gs_ctlBuildCmd) fo.close() # send gs_ctlBuildCmd bashfile to the standby host and execute it. sshTool.scpFiles(tempShFile, tempShFile, [host], self.envFile) resultMap, outputCollect = sshTool.getSshStatusOutput("sh %s" % \ tempShFile, [host], self.envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[host] != DefaultValue.SUCCESS: self.expansionSuccess[host] = False self.logger.debug("Failed to send gs_ctlBuildCmd bashfile " "to %s." % host) self.logger.log("Build %s %s failed." % (hostRole, host)) continue # check whether build process has finished checkProcessExistCmd = "ps x" while True: resultMap, outputCollect = sshTool.getSshStatusOutput( checkProcessExistCmd, [host]) if buildCmd not in outputCollect: self.logger.debug("Build %s complete." % host) break timeFlush = 0.5 for i in range(0, int(60 / timeFlush)): index = i % 4 print("\rThe program is running {}".format(waitChars[index]), end="") time.sleep(timeFlush) # check build result after build process finished while True: timeFlush = 0.5 for i in range(0, int(60 / timeFlush)): index = i % 4 print("\rThe program is running {}".format(waitChars[index]), end="") time.sleep(timeFlush) insType, dbState = self.commonGsCtl.queryInstanceStatus( hostName, dataNode, self.envFile) if dbState not in [STATE_STARTING, STATE_CATCHUP]: self.logger.debug("%s starting and catchup complete." % host) break insType, dbState = self.commonGsCtl.queryInstanceStatus( hostName, dataNode, self.envFile) if insType == hostRole and dbState == STATE_NORMAL: if self.context.newHostCasRoleMap[host] == "off": existingStandbys.append(host) self.logger.log("\rBuild %s %s success." % (hostRole, host)) else: self.expansionSuccess[host] = False self.logger.log("\rBuild %s %s failed." % (hostRole, host)) if walKeepSegmentsChanged: self.logger.debug("Start to rollback primary's wal_keep_segments") status = self.commonGsCtl.setGucPara(primaryHost, self.envFile, primaryDataNode, "wal_keep_segments", self.walKeepSegments) if status != DefaultValue.SUCCESS: self.logger.debug(ErrorCode.GAUSS_500["GAUSS_50007"] % "wal_keep_segments") self.reloadPrimaryConf() if self._isAllFailed(): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] % "build") def checkTmpDir(self, hostName): """ if the tmp dir id not exist, create it. """ tmpDir = os.path.realpath(EnvUtil.getTmpDirFromEnv()) checkCmd = 'if [ ! -d "%s" ]; then exit 1;fi;' % (tmpDir) sshTool = SshTool([hostName]) resultMap, outputCollect = sshTool.getSshStatusOutput(checkCmd, [hostName], self.envFile) ret = resultMap[hostName] if ret == STATUS_FAIL: self.logger.debug("Node [%s] does not have tmp dir. need to fix.") fixCmd = "mkdir -p %s" % (tmpDir) sshTool.getSshStatusOutput(fixCmd, [hostName], self.envFile) self.cleanSshToolFile(sshTool) def check_new_node_state(self, is_root_user): """ Check new node state. """ self.logger.log("Expansion results:") self.getExistingHosts(is_root_user) for newHost in self.context.newHostList: if newHost in self.existingHosts: self.logger.log("%s:\tSuccess" % newHost) else: self.logger.log("%s:\tFailed" % newHost) def generateClusterStaticFile(self): """ generate static_config_files and send to all hosts """ self.logger.log("Start to generate and send cluster static file.") primaryHost = self.getPrimaryHostName() result = self.commonGsCtl.queryOmCluster(primaryHost, self.envFile) for nodeName in self.context.nodeNameList: nodeInfo = self.context.clusterInfoDict[nodeName] nodeIp = nodeInfo["backIp"] dataNode = nodeInfo["dataNode"] exist_reg = r"(.*)%s[\s]*%s(.*)%s(.*)" % (nodeName, nodeIp, dataNode) dbNode = self.context.clusterInfo.getDbNodeByName(nodeName) if not re.search(exist_reg, result) and nodeIp not in self.context.newHostList: self.logger.debug("The node ip [%s] will not be added to cluster." % nodeIp) self.context.clusterInfo.dbNodes.remove(dbNode) if nodeIp in self.context.newHostList and not self.expansionSuccess[nodeIp]: self.context.clusterInfo.dbNodes.remove(dbNode) toolPath = self.context.clusterInfoDict["toolPath"] appPath = self.context.clusterInfoDict["appPath"] static_config_dir = "%s/script/static_config_files" % toolPath if not os.path.exists(static_config_dir): os.makedirs(static_config_dir) # valid if dynamic config file exists on primary node. dynamic_file = os.path.join(appPath, "bin", "cluster_dynamic_config") dynamic_file_exist = False if os.path.exists(dynamic_file): dynamic_file_exist = True for dbNode in self.context.clusterInfo.dbNodes: hostName = dbNode.name staticConfigPath = "%s/script/static_config_files/cluster_static_config_%s" % \ (toolPath, hostName) self.context.clusterInfo.saveToStaticConfig(staticConfigPath, dbNode.id) srcFile = staticConfigPath if not os.path.exists(srcFile): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35710"] % srcFile) targetFile = "%s/bin/cluster_static_config" % appPath # if dynamic config file exists on primary node, refreshconf on each host. # if not, remove it on standby nodes if exists. dynamic_opt_cmd = "" if dynamic_file_exist: dynamic_opt_cmd = "gs_om -t refreshconf" else: dynamic_opt_cmd = "if [ -f '%s' ];then rm %s;fi" % \ (dynamic_file, dynamic_file) if hostName != socket.gethostname(): hostSsh = SshTool([hostName]) hostSsh.scpFiles(srcFile, targetFile, [hostName], self.envFile) hostSsh.getSshStatusOutput(dynamic_opt_cmd, [hostName], self.envFile) self.cleanSshToolFile(hostSsh) else: scpcmd = "cp %s %s" % (srcFile, targetFile) (status, output) = subprocess.getstatusoutput(scpcmd) if status != 0: GaussLog.exitWithError("Copy file faild. %s" % output) self.logger.log("End to generate and send cluster static file.\n") if DefaultValue.get_cm_server_num_from_static(self.context.clusterInfo) > 0: self.logger.debug("Check new host state after restart.") return self.check_new_node_state(False) def getGUCConfig(self): """ get guc config of each node: replconninfo[index] """ clusterInfoDict = self.context.clusterInfoDict hostIpList = list(self.existingHosts) for host in self.expansionSuccess: if self.expansionSuccess[host]: hostIpList.append(host) hostNames = [] for host in hostIpList: hostNames.append(self.context.backIpNameMap[host]) gucDict = {} for hostName in hostNames: localeHostInfo = clusterInfoDict[hostName] index = 1 guc_tempate_str = "source %s; " % self.envFile for remoteHost in hostNames: if remoteHost == hostName: continue remoteHostInfo = clusterInfoDict[remoteHost] guc_repl_template = """\ gs_guc set -D {dn} -c "replconninfo{index}=\ 'localhost={localhost} localport={localport} \ localheartbeatport={localeHeartPort} \ localservice={localservice} \ remotehost={remoteNode} \ remoteport={remotePort} \ remoteheartbeatport={remoteHeartPort} \ remoteservice={remoteservice}'" """.format(dn=localeHostInfo["dataNode"], index=index, localhost=localeHostInfo["backIp"], localport=localeHostInfo["localport"], localeHeartPort=localeHostInfo["heartBeatPort"], localservice=localeHostInfo["localservice"], remoteNode=remoteHostInfo["backIp"], remotePort=remoteHostInfo["localport"], remoteHeartPort=remoteHostInfo["heartBeatPort"], remoteservice=remoteHostInfo["localservice"]) guc_tempate_str += guc_repl_template index += 1 gucDict[hostName] = guc_tempate_str return gucDict def checkGaussdbAndGsomVersionOfStandby(self): """ check whether gaussdb and gs_om version of standby are same with priamry """ standbyHosts = list(self.context.newHostList) envFile = self.envFile if self.context.standbyLocalMode: for host in standbyHosts: self.expansionSuccess[host] = True self.logger.log("Checking gaussdb and gs_om version.") getGaussdbVersionCmd = "source %s;gaussdb --version" % envFile getGsomVersionCmd = "source %s;gs_om --version" % envFile gaussdbVersionPattern = re.compile("gaussdb \((.*)\) .*") gsomVersionPattern = re.compile("gs_om \(.*\) .*") primaryHostName = self.getPrimaryHostName() sshPrimary = SshTool([primaryHostName]) resultMap, outputCollect = sshPrimary.getSshStatusOutput( getGaussdbVersionCmd, [], envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[primaryHostName] != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35707"] % ("gaussdb", "primary")) primaryGaussdbVersion = gaussdbVersionPattern.findall(outputCollect)[0] resultMap, outputCollect = sshPrimary.getSshStatusOutput( getGsomVersionCmd, [], envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[primaryHostName] != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35707"] % ("gs_om", "primary")) primaryGsomVersion = gsomVersionPattern.findall(outputCollect)[0] self.cleanSshToolFile(sshPrimary) failCheckGaussdbVersionHosts = [] failCheckGsomVersionHosts = [] wrongGaussdbVersionHosts = [] wrongGsomVersionHosts = [] for backIp in standbyHosts: if not self.expansionSuccess[backIp]: continue host = self.context.backIpNameMap[backIp] sshTool = SshTool([host]) # get gaussdb version resultMap, outputCollect = sshTool.getSshStatusOutput( getGaussdbVersionCmd, [], envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[host] != DefaultValue.SUCCESS: self.expansionSuccess[host] = False failCheckGaussdbVersionHosts.append(host) else: gaussdbVersion = gaussdbVersionPattern.findall(outputCollect)[0] if gaussdbVersion != primaryGaussdbVersion: self.expansionSuccess[host] = False wrongGaussdbVersionHosts.append(host) self.cleanSshToolFile(sshTool) continue # get gs_om version resultMap, outputCollect = sshTool.getSshStatusOutput( getGsomVersionCmd, [], envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[host] != DefaultValue.SUCCESS: self.expansionSuccess[host] = False failCheckGsomVersionHosts.append(host) else: gsomVersion = gsomVersionPattern.findall(outputCollect)[0] if gsomVersion != primaryGsomVersion: self.expansionSuccess[host] = False wrongGsomVersionHosts.append(host) self.cleanSshToolFile(sshTool) if failCheckGaussdbVersionHosts: self.logger.log(ErrorCode.GAUSS_357["GAUSS_35707"] % ("gaussdb", ", ".join(failCheckGaussdbVersionHosts))) if failCheckGsomVersionHosts: self.logger.log(ErrorCode.GAUSS_357["GAUSS_35707"] % ("gs_om", ", ".join(failCheckGsomVersionHosts))) if wrongGaussdbVersionHosts: self.logger.log(ErrorCode.GAUSS_357["GAUSS_35708"] % ("gaussdb", ", ".join(wrongGaussdbVersionHosts))) if wrongGsomVersionHosts: self.logger.log(ErrorCode.GAUSS_357["GAUSS_35708"] % ("gs_om", ", ".join(wrongGsomVersionHosts))) self.logger.log("End to check gaussdb and gs_om version.\n") if self._isAllFailed(): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] % "check gaussdb and gs_om version") def preInstall(self): """ preinstall on new hosts. """ self.logger.log("Start to preinstall database on new nodes.") self.sendSoftToHosts() self.generateAndSendXmlFile() self.preInstallOnHosts() self.logger.log("End to preinstall database on new nodes.\n") if self._isAllFailed(): GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] % "preinstall") def clearTmpFile(self): """ clear temporary file after expansion success """ self.logger.debug("start to delete temporary file %s" % self.tempFileDir) clearCmd = "if [ -d '%s' ];then rm -rf %s;fi" % \ (self.tempFileDir, self.tempFileDir) hosts = list(set(self.existingHosts + self.context.newHostList)) try: sshTool = SshTool(hosts) result, output = sshTool.getSshStatusOutput(clearCmd, hosts, self.envFile) self.logger.debug(output) self.cleanSshToolFile(sshTool) except Exception as e: self.logger.debug(str(e)) self.cleanSshToolFile(sshTool) def cleanSshToolFile(self, sshTool): """ """ try: sshTool.clenSshResultFiles() except Exception as e: self.logger.debug(str(e)) def guc_executor(self, ssh_tool, guc_command, host_name): """ Execute gs_guc command """ current_time = str(datetime.datetime.now()).replace(" ", "_").replace( ".", "_") temp_file_dir = "/tmp/gs_expansion_%s" % (current_time) temp_sh_file = os.path.join(temp_file_dir, "guc.sh") command = "source %s ; %s" % (self.envFile, guc_command) self.logger.debug("[%s] ready to run guc command is:%s" % (host_name, command)) # create temporary dir to save guc command bashfile. try: mkdir_cmd = "mkdir -m a+x -p %s; chown %s:%s %s" % \ (temp_file_dir, self.user, self.group, temp_file_dir) ssh_tool.getSshStatusOutput(mkdir_cmd, hostList=[host_name], env_file=self.envFile) local_create_file_cmd = "if [ ! -e '{0}' ]; then mkdir -m a+x -p {0};" \ "fi; touch {0}; cat /dev/null > {0}; " \ "chown {1}:{2} {0}".format(temp_file_dir, self.user, self.group) status, output = subprocess.getstatusoutput(local_create_file_cmd) if status != 0: self.logger.debug("Failed to create temp file guc.sh.") self.logger.debug("guc command result status: {0}".format(status)) self.logger.debug("guc command result output: {0}".format(output)) raise Exception(ErrorCode.GAUSS_535["GAUSS_53506"]) with os.fdopen(os.open("%s" % temp_sh_file, os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fo: fo.write("#bash\n") fo.write(command) fo.close() # send guc command bashfile to each host and execute it. if socket.gethostname() != host_name: ssh_tool.scpFiles("%s" % temp_sh_file, "%s" % temp_sh_file, [host_name], self.envFile) result_map, output_collect = \ ssh_tool.getSshStatusOutput("sh %s" % temp_sh_file, hostList=[host_name], env_file=self.envFile) self.logger.debug("Execute gs_guc command output: {0}".format(output_collect)) if [fail_flag for fail_flag in result_map.values() if not fail_flag]: self.logger.debug("Execute gs_guc command failed. " "result_map is : {0}".format(result_map)) raise Exception(ErrorCode.GAUSS_535["GAUSS_53507"] % command) else: status, output = subprocess.getstatusoutput("sh %s" % temp_sh_file) if status != 0: self.logger.debug("Local execute gs_guc command failed. " "output is : {0}".format(output)) raise Exception(ErrorCode.GAUSS_535["GAUSS_53507"] % command) except Exception as exp: raise Exception(str(exp)) finally: ssh_tool.getSshStatusOutput( g_file.SHELL_CMD_DICT["deleteDir"] % (temp_file_dir, temp_file_dir), hostList=[host_name]) def checkNodesDetail(self): """ """ self.checkNetworkDelay() self.checkUserAndGroupExists() self.checkXmlFileAccessToUser() self.checkClusterStatus() self.validNodeInStandbyList() self.checkXMLConsistency() self.checkDnDirEmpty() def checkNetworkDelay(self): """ check if network delay greater than 1000ms """ backips = self.context.newHostList for backip in backips: ck_net_delay = "ping -s 8192 -c 5 -i 0.3 %s | "\ "awk -F / '{print $5}'| awk '{print $1}'" % backip (status, output) = subprocess.getstatusoutput(ck_net_delay) if status == 0: try: delay_val = float(output.strip()) # if delay greater than 1000ms, it need to warn. if delay_val > 1000: self.logger.warn("[WARNING] The node[%s] has a high "\ "latency[%s ms]." % (backip, delay_val)) except ValueError: self.logger.debug("The node[%s] failed to query\ the delay" % backip) def checkDnDirEmpty(self): """ Check whether datanode dir is empty on new nodes. If not empty, we assume that the datanode directory exists with other database. We should exit and check it. """ if self.context.standbyLocalMode: return excepNodes = [] for node in self.context.newHostList: nodename = self.context.backIpNameMap[node] dn_dir = self.context.clusterInfoDict[nodename]["dataNode"] cmd = """ if [ ! -d "%s" ]; then echo ""; else ls %s; fi; """ % (dn_dir,dn_dir) sshTool = SshTool([node]) (statusMap, output) = sshTool.getSshStatusOutput(cmd, env_file="/etc/profile") if statusMap[node] == DefaultValue.SUCCESS: prefix = '[%s] %s:' % ("SUCCESS", node) result = output[len(prefix):] if result.strip(): excepNodes.append(node) if len(excepNodes) > 0: self.logger.log("The datanode dir of [%s] is not empty.\ Please check it." % ",".join(excepNodes)) sys.exit(1) self.logger.debug("Successfully Check datanode dir is empty.") def checkXMLConsistency(self): """ Check whether XML information is consistent with cluster information """ self.logger.debug("Checking whether XML information is " "consistent with cluster information") self._checkDataNodes() self._checkAvailableZone() def _checkDataNodes(self): """ check datanodes """ self.logger.debug("Checking the consistence of datanodes.") primaryName = self.getPrimaryHostName() cmd = "" if EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH"): cmd = "su - %s -c 'source %s;gs_om -t status --detail'" % \ (self.user, self.envFile) else: cmd = "su - %s -c 'source /etc/profile;source %s;"\ "gs_om -t status --detail'" % (self.user, self.envFile) sshTool = SshTool([primaryName]) resultMap, outputCollect = sshTool.getSshStatusOutput(cmd, [primaryName], self.envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) if resultMap[primaryName] != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"]) self.cleanSshToolFile(sshTool) pos = outputCollect.rfind("-----") pos += len("-----") + 1 allNodesState = outputCollect[pos:] nodeStates = re.split('(?:\|)|(?:\n)', allNodesState) dataNodes = {} for nodeState in nodeStates: pattern = re.compile("[ ]+[^ ]+[ ]+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[ ]+[^ ]+[ ]+[^ ]+[ ]+([^ ]+)[ ]+") result = pattern.findall(nodeState) if len(result) != 0: result = result[0] if len(result) != 0: dataNodes[result[0]] = result[1] clusterInfoDict = self.context.clusterInfoDict backIpNameMap = self.context.backIpNameMap for hostIp in self.existingHosts: hostName = backIpNameMap[hostIp] dataNode = clusterInfoDict[hostName]["dataNode"] if dataNode != dataNodes[hostIp]: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35711"] % ("dataNode of %s" % hostIp)) def _checkAvailableZone(self): """ check available_zone """ self.logger.debug("Checking the consistence of azname") clusterInfoDict = self.context.clusterInfoDict backIpNameMap = self.context.backIpNameMap hostAzNameMap = self.context.hostAzNameMap primary = self.getPrimaryHostName() for hostIp in self.existingHosts: hostName = backIpNameMap[hostIp] if hostName == primary: continue dataNode = clusterInfoDict[hostName]["dataNode"] if EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH"): cmd = "su - %s -c 'source %s;" \ "gs_guc check -D %s -c \"available_zone\"'" % \ (self.user, self.envFile, dataNode) else: cmd = "su - %s -c 'source /etc/profile;source %s;" \ "gs_guc check -D %s -c \"available_zone\"'" % \ (self.user, self.envFile, dataNode) sshTool = SshTool([hostIp]) resultMap, output = sshTool.getSshStatusOutput(cmd, [hostIp], self.envFile) self.logger.debug(resultMap) self.logger.debug(output) if resultMap[hostIp] != DefaultValue.SUCCESS: GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"]) self.cleanSshToolFile(sshTool) azPattern = re.compile("available_zone='(.*)'") azName = azPattern.findall(output) if len(azName) != 0: azName = azName[0] if azName != hostAzNameMap[hostIp]: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35711"] % ("azName of %s" % hostIp)) def checkClusterStatus(self): """ Check whether the cluster status is normal before expand. """ self.logger.debug("Start to check cluster status.") curHostName = socket.gethostname() command = "" if EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH"): command = "su - %s -c 'source %s;gs_om -t status --detail'" % \ (self.user, self.envFile) else: command = "su - %s -c 'source /etc/profile;source %s;"\ "gs_om -t status --detail'" % (self.user, self.envFile) sshTool = SshTool([curHostName]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [curHostName], self.envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) self.cleanSshToolFile(sshTool) if outputCollect.find("Primary Normal") == -1: GaussLog.exitWithError((ErrorCode.GAUSS_357["GAUSS_35709"] % ("status", "primary", "Normal")) + "\nExpansion failed.") self.logger.debug("The primary database is normal.\n") currentWalKeepSegments = self.queryPrimaryWalKeepSegments() if currentWalKeepSegments != "NULL": self.walKeepSegments = int(currentWalKeepSegments) else: self.walKeepSegments = 16 def _adjustOrderOfNewHostList(self): """ Adjust the order of hostlist so that standby comes first and cascade standby comes last """ newHostList = self.context.newHostList newHostCasRoleMap = self.context.newHostCasRoleMap i, j = 0, len(newHostList) - 1 while i < j: while i < j and newHostCasRoleMap[newHostList[i]] == "off": i += 1 while i < j and newHostCasRoleMap[newHostList[j]] == "on": j -= 1 newHostList[i], newHostList[j] = newHostList[j], newHostList[i] i += 1 j -= 1 def validNodeInStandbyList(self): """ check if the node has been installed in the cluster. """ self.logger.debug("Start to check if the nodes in standby list.") self.getExistingHosts() newHostList = self.context.newHostList existedNewHosts = \ [host for host in newHostList if host in self.existingHosts] if existedNewHosts: newHostList = \ [host for host in newHostList if host not in existedNewHosts] self.context.newHostList = newHostList self.expansionSuccess = {} for host in newHostList: self.expansionSuccess[host] = False self.logger.log("These nodes [%s] are already in the cluster. " "Skip expand these nodes." % ",".join(existedNewHosts)) if len(newHostList) == 0: self.logger.log("There is no node can be expanded.") sys.exit(0) self._adjustOrderOfNewHostList() def checkXmlFileAccessToUser(self): """ Check if the xml config file has readable access to user. """ userInfo = pwd.getpwnam(self.user) uid = userInfo.pw_uid gid = userInfo.pw_gid xmlFile = self.context.xmlFile fstat = os.stat(xmlFile) mode = fstat[stat.ST_MODE] if (fstat[stat.ST_UID] == uid and (mode & stat.S_IRUSR > 0)) or \ (fstat[stat.ST_GID] == gid and (mode & stat.S_IRGRP > 0)): pass else: self.logger.debug(ErrorCode.GAUSS_501["GAUSS_50100"] % (xmlFile, self.user)) os.chown(xmlFile, uid, gid) os.chmod(xmlFile, stat.S_IRUSR) def checkUserAndGroupExists(self): """ check system user and group exists and be same on primary and standby nodes """ inputUser = self.user inputGroup = self.group user_group_id = "" isUserExits = False localHost = socket.gethostname() for user in pwd.getpwall(): if user.pw_name == self.user: user_group_id = user.pw_gid isUserExits = True break if not isUserExits: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35704"] \ % ("User", self.user, localHost)) isGroupExits = False group_id = "" for group in grp.getgrall(): if group.gr_name == self.group: group_id = group.gr_gid isGroupExits = True if not isGroupExits: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35704"] \ % ("Group", self.group, localHost)) if user_group_id != group_id: GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35712"] % (self.user, self.group)) hostNames = self.context.newHostList envfile = self.envFile sshTool = SshTool(hostNames) #get username in the other standy nodes getUserNameCmd = "cat /etc/passwd | grep -w %s" % inputUser resultMap, outputCollect = sshTool.getSshStatusOutput(getUserNameCmd, [], envfile) for hostKey in resultMap: if resultMap[hostKey] == STATUS_FAIL: self.cleanSshToolFile(sshTool) GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35704"] \ % ("User", self.user, hostKey)) #get groupname in the other standy nodes getGroupNameCmd = "cat /etc/group | grep -w %s" % inputGroup resultMap, outputCollect = sshTool.getSshStatusOutput(getGroupNameCmd, [], envfile) for hostKey in resultMap: if resultMap[hostKey] == STATUS_FAIL: self.cleanSshToolFile(sshTool) GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35704"] \ % ("Group", self.group, hostKey)) self.cleanSshToolFile(sshTool) def installAndExpansion(self): """ install database and expansion standby node with db om user """ pvalue = Value('i', 0) proc = Process(target=self.installProcess, args=(pvalue,)) proc.start() proc.join() if not pvalue.value: sys.exit(1) else: proc.terminate() def installProcess(self, pvalue): # change to db manager user. the below steps run with db manager user. self.changeUser() if not self.context.standbyLocalMode: self.logger.log("Start to install database on new nodes.") self.installDatabaseOnHosts() self.logger.log("Database on standby nodes installed finished.\n") self.checkGaussdbAndGsomVersionOfStandby() self.logger.log("Start to establish the relationship.") self.buildStandbyRelation() # process success pvalue.value = 1 def rollback(self): """ rollback all hosts' replconninfo about failed hosts """ self.getExistingHosts() failedHosts = list(set(self.context.newHostList) - set(self.existingHosts)) clusterInfoDict = self.context.clusterInfoDict for failedHost in failedHosts: # rollback GRPC cert on failed hosts self.logger.debug("Start to rollback GRPC cert of %s" % failedHost) appPath = ClusterDir.getInstallDir(self.user) removeGRPCCertCmd = "ls %s/share/sslcert/grpc/* | grep -v openssl.cnf | " \ "xargs rm -rf" % appPath sshTool = SshTool([failedHost]) sshTool.getSshStatusOutput(removeGRPCCertCmd, [failedHost]) self.cleanSshToolFile(sshTool) for host in self.expansionSuccess: if not self.expansionSuccess[host]: sshTool = SshTool([host]) sshTool.getSshStatusOutput(removeGRPCCertCmd, [host], self.envFile) self.cleanSshToolFile(sshTool) self.logger.debug("Start to rollback replconninfo about %s" % failedHost) for host in self.existingHosts: dataNode = clusterInfoDict[self.context.backIpNameMap[host]]["dataNode"] confFile = os.path.join(dataNode, "postgresql.conf") rollbackReplconninfoCmd = "sed -i '/remotehost=%s/s/^/#&/' %s" \ % (failedHost, confFile) self.logger.debug("[%s] rollbackReplconninfoCmd:%s" % (host, rollbackReplconninfoCmd)) sshTool = SshTool([host]) sshTool.getSshStatusOutput(rollbackReplconninfoCmd, [host]) rollbackPg_hbaCmd = "sed -i '/%s/s/^/#&/' %s" \ % (failedHost, os.path.join(dataNode, "pg_hba.conf")) self.logger.debug("[%s] rollbackPg_hbaCmd:%s" % (host, rollbackPg_hbaCmd)) sshTool.getSshStatusOutput(rollbackPg_hbaCmd, [host]) reloadGUCCommand = "su - %s -c 'source %s; gs_ctl reload " \ "-D %s'" % (self.user, self.envFile, dataNode) self.logger.debug(reloadGUCCommand) resultMap, outputCollect = sshTool.getSshStatusOutput( reloadGUCCommand, [host], self.envFile) self.logger.debug(resultMap) self.logger.debug(outputCollect) self.cleanSshToolFile(sshTool) def _isAllFailed(self): """ check whether all new hosts preinstall/install/build failed """ for host in self.expansionSuccess: if self.expansionSuccess[host]: return False return True def _parse_ssh_tool_output_collect(self, collect_result_str): """ Parse SshTool getSshStatusOutput method result """ self.logger.debug("Start parse SshTool output collect result.") collect_result_list = collect_result_str.split("\n") # node_name_str like this : [[SUCCESS] pekpomdev00006: key_list = [node_name_str.split()[-1].strip(":") for node_name_str in collect_result_list[::2] if node_name_str] # gsql version display like (gsql (openGauss x.x.0 build xxxxxxx) # compiled at 2029-02-26 02:07:00 commit 0 last mr xxxx) value_list = [output_str.split(")")[0].split()[-1] for output_str in collect_result_list[1::2] if output_str] parse_result = dict(zip(key_list, value_list)) self.logger.debug("Parse result is: {0}".format(parse_result)) return parse_result def run(self): """ start expansion """ self.checkNodesDetail() # preinstall on standby nodes with root user. if not self.context.standbyLocalMode: self.preInstall() self.installAndExpansion() self.logger.log("Expansion Finish.") class GsCtlCommon: def __init__(self, expansion): """ """ self.logger = expansion.logger self.user = expansion.user def queryInstanceStatus(self, host, datanode, env): """ """ command = "source %s ; gs_ctl query -D %s" % (env, datanode) sshTool = SshTool([datanode]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [host], env) self.logger.debug(outputCollect) localRole = re.findall(r"local_role.*: (.*?)\n", outputCollect) db_state = re.findall(r"db_state.*: (.*?)\n", outputCollect) insType = "" if(len(localRole)) == 0: insType = "" else: insType = localRole[0] dbStatus = "" if(len(db_state)) == 0: dbStatus = "" else: dbStatus = db_state[0] self.cleanSshToolTmpFile(sshTool) return insType.strip().lower(), dbStatus.strip().lower() def stopInstance(self, host, datanode, env): """ """ command = "source %s ; gs_ctl stop -D %s" % (env, datanode) sshTool = SshTool([host]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [host], env) self.logger.debug(host) self.logger.debug(outputCollect) self.cleanSshToolTmpFile(sshTool) def startInstanceWithMode(self, host, datanode, mode, env): """ """ command = "source %s ; gs_ctl start -D %s -M %s" % (env, datanode, mode) self.logger.debug(command) sshTool = SshTool([host]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [host], env) self.logger.debug(host) self.logger.debug(outputCollect) self.cleanSshToolTmpFile(sshTool) return resultMap, outputCollect def queryOmCluster(self, host, env): """ query om cluster detail with command: gs_om -t status --detail """ command = "source %s ; gs_om -t status --detail" % env sshTool = SshTool([host]) resultMap, outputCollect = sshTool.getSshStatusOutput(command, [host], env) self.logger.debug(host) self.logger.debug(outputCollect) if resultMap[host] == STATUS_FAIL: GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"] + "Please check the cluster status or source the environmental" " variables of user [%s]." % self.user) self.cleanSshToolTmpFile(sshTool) return outputCollect def queryGucParaValue(self, host, env, datanode, para, user=""): """ query guc parameter value """ value = "" command = "" if user: command = "su - %s -c 'source %s; gs_guc check -D %s -c \"%s\"'" % \ (user, env, datanode, para) else: command = "source %s; gs_guc check -D %s -c \"%s\"" % \ (env, datanode, para) sshTool = SshTool([host]) resultMap, outputCollect = sshTool.getSshStatusOutput( command, [host], env) self.logger.debug(host) self.logger.debug(outputCollect) if resultMap[host] == STATUS_FAIL: return resultMap[host], "" self.cleanSshToolTmpFile(sshTool) paraPattern = re.compile(" %s=(.+)" % para) value = paraPattern.findall(outputCollect) if len(value) != 0: value = value[0] else: value = "NULL" return resultMap[host], value def setGucPara(self, host, env, datanode, para, value, user=""): """ set guc parameter """ command = "" if not user: command = "source %s; gs_guc set -D %s -c \"%s=%s\"" % \ (env, datanode, para, value) else: command = "su - %s -c 'source %s; gs_guc set -D %s -c \"%s=%s\"'" % \ (user, env, datanode, para, value) sshTool = SshTool([host]) resultMap, outputCollect = sshTool.getSshStatusOutput( command, [host], env) self.logger.debug(host) self.logger.debug(outputCollect) self.cleanSshToolTmpFile(sshTool) return resultMap[host] def cleanSshToolTmpFile(self, sshTool): """ """ try: sshTool.clenSshResultFiles() except Exception as e: self.logger.debug(str(e))