修复在线扩容bug并优化代码

This commit is contained in:
xue_meng_en
2021-03-11 20:43:37 +08:00
parent 80df4ce336
commit e594191677
3 changed files with 125 additions and 66 deletions

View File

@ -21,6 +21,7 @@
import os
import sys
import pwd
import subprocess
import socket
@ -222,7 +223,7 @@ General options:
backIpList = self.clusterInfo.getClusterBackIps()
for nodeIp in self.newHostList:
if nodeIp not in backIpList:
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35702"] % \
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35702"] %
nodeIp)
def _getBackIpNameMap(self):
@ -231,11 +232,17 @@ General options:
self.backIpNameMap[backip] = \
self.clusterInfo.getNodeNameByBackIp(backip)
def checkExecutingUserAndHost(self):
# check whether current user executing this command is root
def checkExecutingUser(self):
    """
    Verify that the command is being executed by root (uid 0).

    When the effective caller is not root, the GAUSS_50104 error is
    reported through GaussLog.exitWithError.
    """
    runByRoot = os.getuid() == 0
    if runByRoot:
        return
    GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50104"])
# check whether current host is primary host
def checkExecutingHost(self):
"""
check whether current host is primary host
"""
currentHost = socket.gethostname()
primaryHost = ""
for nodeName in self.nodeNameList:
@ -244,8 +251,8 @@ General options:
primaryHost = nodeName
break
if currentHost != primaryHost:
GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50110"] % \
(currentHost + ", which is not primary."))
GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50110"] %
(currentHost + ", which is not primary"))
def checkTrust(self, hostList = None):
"""
@ -255,18 +262,15 @@ General options:
hostList = self.nodeNameList
rootSSHExceptionHosts = []
individualSSHExceptionHosts = []
sshTool = SshTool(hostList, timeout = 0)
retmap, output = sshTool.getSshStatusOutput("pwd")
for host in hostList:
# check root's trust
if retmap[host] != DefaultValue.SUCCESS:
checkRootTrustCmd = "ssh %s -o ConnectTimeout=10 \"pwd\"" % host
(status, output) = subprocess.getstatusoutput(checkRootTrustCmd)
if status != 0:
rootSSHExceptionHosts.append(host)
try:
sshTool.clenSshResultFiles()
except Exception as e:
self.logger.debug(str(e))
# check individual user's trust
checkUserTrustCmd = "su - %s -c 'ssh %s \"pwd\"'" % (self.user, host)
checkUserTrustCmd = "su - %s -c 'ssh %s -o " \
"ConnectTimeout=10 \"pwd\"'" % (self.user, host)
(status, output) = subprocess.getstatusoutput(checkUserTrustCmd)
if status != 0:
individualSSHExceptionHosts.append(host)
@ -284,6 +288,20 @@ General options:
GaussLog.exitWithError(ErrorCode.GAUSS_511["GAUSS_51100"] %
sshExceptionInfo)
def checkEnvfile(self):
    """
    Make sure the GPHOME environment variable is available to this process.

    If GPHOME is not set, the ".bashrc" in the home directory of
    self.user is sourced in a bash child process and the environment that
    results from it is imported into os.environ. If GPHOME is still not
    set afterwards, the GAUSS_51802 error is reported through
    GaussLog.exitWithError.
    """
    if DefaultValue.getEnv("GPHOME"):
        return
    userpath = pwd.getpwnam(self.user).pw_dir
    envFile = os.path.join(userpath, ".bashrc")
    # os.system("source <file>") runs in a throw-away child shell (and
    # "source" is a bash builtin that plain sh may not provide), so
    # nothing it exports ever reaches this Python process. Instead, source
    # the file in bash, dump the resulting environment NUL-separated and
    # copy it into os.environ.
    dumpEnvCmd = ["bash", "-c",
                  'source "$1" >/dev/null 2>&1; env -0',
                  "bash", envFile]
    try:
        rawEnv = subprocess.check_output(dumpEnvCmd,
                                         stderr=subprocess.DEVNULL)
    except (OSError, subprocess.CalledProcessError):
        # Best effort, like the original: the GPHOME check below reports
        # the failure.
        rawEnv = b""
    for entry in rawEnv.split(b"\0"):
        name, sep, value = entry.partition(b"=")
        if sep and name:
            os.environ[os.fsdecode(name)] = os.fsdecode(value)
    # NOTE(review): assumes DefaultValue.getEnv reads the current process
    # environment (os.environ) - confirm against its implementation.
    if not DefaultValue.getEnv("GPHOME"):
        GaussLog.exitWithError(ErrorCode.GAUSS_518["GAUSS_51802"] % "file")
def _getHostAzNameMap(self):
"""
get azName of all hosts
@ -326,13 +344,14 @@ if __name__ == "__main__":
"""
"""
expansion = Expansion()
expansion.checkExecutingUser()
expansion.parseCommandLine()
expansion.checkParameters()
expansion.initLogs()
expansion.checkEnvfile()
expansion.getExpansionInfo()
expansion.checkXmlIncludeNewHost()
expansion.checkExecutingUserAndHost()
expansion.checkExecutingHost()
expansion.checkTrust()
expImpl = ExpansionImpl(expansion)
expImpl.run()

View File

@ -1113,7 +1113,11 @@ class ErrorCode():
"GAUSS_35704": "[GAUSS-35704] %s [%s] does not exist on node [%s].",
"GAUSS_35705": "[GAUSS-35705] Error, the database version is "
"inconsistent in %s: %s",
"GAUSS_35706": "[GAUSS-35706] All new hosts %s failed."
"GAUSS_35706": "[GAUSS-35706] Fail to %s on all new hosts.",
"GAUSS_35707": "[GAUSS-35707] Fail to check %s version on:\n%s",
"GAUSS_35708": "[GAUSS-35708] Inconsistent %s version with primary on \n%s",
"GAUSS_35709": "[GAUSS-35709] The %s of %s is not %s.",
"GAUSS_35710": "[GAUSS-35710] Generate static file [%s] not found."
}
##########################################################################

View File

@ -433,7 +433,7 @@ class ExpansionImpl():
[primaryHost], self.envFile)
self.logger.debug(outputCollect)
if resultMap[primaryHost] != DefaultValue.SUCCESS:
GaussLog.exitWithError("Unable to query current cluster state.")
GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"])
instances = re.split('(?:\|)|(?:\n)', outputCollect)
self.existingHosts = []
pattern = re.compile('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*')
@ -495,7 +495,10 @@ gs_guc set -D {dn} -c "available_zone='{azName}'"
add authentication rules about other all hosts ip in new hosts
"""
self.logger.debug("Start to set host trust on all node.")
allHosts = self.existingHosts + self.context.newHostList
allHosts = list(self.existingHosts)
for host in self.context.newHostList:
if self.expansionSuccess[host]:
allHosts.append(host)
for hostExec in allHosts:
hostExecName = self.context.backIpNameMap[hostExec]
dataNode = self.context.clusterInfoDict[hostExecName]["dataNode"]
@ -511,8 +514,7 @@ gs_guc set -D {dn} -c "available_zone='{azName}'"
hostParam
self.logger.debug("[%s] trustCmd:%s" % (hostExec, cmd))
sshTool = SshTool([hostExec])
resultMap, outputCollect = sshTool.getSshStatusOutput(cmd,
[hostExec], self.envFile)
sshTool.getSshStatusOutput(cmd, [hostExec], self.envFile)
self.cleanSshToolFile(sshTool)
self.logger.debug("End to set host trust on all node.")
@ -580,11 +582,14 @@ gs_guc set -D {dn} -c "available_zone='{azName}'"
primaryHost, primaryDataNode, self.envFile)
primaryExceptionInfo = ""
if insType != ROLE_PRIMARY:
primaryExceptionInfo = "The server mode of primary host" \
"is not primary."
primaryExceptionInfo = ErrorCode.GAUSS_357["GAUSS_35709"] % \
("local_role", "primary", "primary")
if dbStat != STAT_NORMAL:
primaryExceptionInfo = "The primary is not in Normal state."
primaryExceptionInfo = ErrorCode.GAUSS_357["GAUSS_35709"] % \
("db_state", "primary", "Normal")
if primaryExceptionInfo != "":
for host in standbyHosts:
self.expansionSuccess[host] = False
self.rollback()
GaussLog.exitWithError(primaryExceptionInfo)
@ -737,7 +742,7 @@ gs_guc set -D {dn} -c "available_zone='{azName}'"
self.context.clusterInfo.saveToStaticConfig(staticConfigPath, dbNode.id)
srcFile = staticConfigPath
if not os.path.exists(srcFile):
GaussLog.exitWithError("Generate static file [%s] not found." % srcFile)
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35710"] % srcFile)
hostSsh = SshTool([hostName])
targetFile = "%s/bin/cluster_static_config" % appPath
hostSsh.scpFiles(srcFile, targetFile, [hostName], self.envFile)
@ -802,50 +807,85 @@ remoteservice={remoteservice}'"
gucDict[hostName] = guc_tempate_str
return gucDict
def checkLocalModeOnStandbyHosts(self):
def checkGaussdbAndGsomVersionOfStandby(self):
"""
expansion the installed standby node. check standby database.
1. if the database is installed correctly
2. if the databases version are same before existing and new
check whether the gaussdb and gs_om versions of the standby are the same as those of the primary
"""
standbyHosts = self.context.newHostList
envFile = self.envFile
if self.context.standbyLocalMode:
for host in standbyHosts:
self.expansionSuccess[host] = True
self.logger.log("Checking if the database is installed correctly with local mode.")
getversioncmd = "source %s;gaussdb --version" % envFile
self.logger.log("Checking gaussdb and gs_om version.")
getGaussdbVersionCmd = "source %s;gaussdb --version" % envFile
getGsomVersionCmd = "source %s;gs_om --version" % envFile
gaussdbVersionPattern = re.compile("gaussdb \((.*)\) .*")
gsomVersionPattern = re.compile("gs_om \(.*\) .*")
primaryHostName = self.getPrimaryHostName()
sshPrimary = SshTool([primaryHostName])
resultMap, outputCollect = sshPrimary.getSshStatusOutput(
getversioncmd, [], envFile)
getGaussdbVersionCmd, [], envFile)
if resultMap[primaryHostName] != DefaultValue.SUCCESS:
GaussLog.exitWithError("Fail to check the version of primary.")
ipPattern = re.compile("\[.*\] (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):")
versionPattern = re.compile("gaussdb \((.*)\) .*")
primaryVersion = versionPattern.findall(outputCollect)[0]
notInstalledHosts = []
wrongVersionHosts = []
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35707"] %
("gaussdb", "primary"))
primaryGaussdbVersion = gaussdbVersionPattern.findall(outputCollect)[0]
resultMap, outputCollect = sshPrimary.getSshStatusOutput(
getGsomVersionCmd, [], envFile)
if resultMap[primaryHostName] != DefaultValue.SUCCESS:
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35707"] %
("gs_om", "primary"))
primaryGsomVersion = gsomVersionPattern.findall(outputCollect)[0]
self.cleanSshToolFile(sshPrimary)
failCheckGaussdbVersionHosts = []
failCheckGsomVersionHosts = []
wrongGaussdbVersionHosts = []
wrongGsomVersionHosts = []
for host in standbyHosts:
hostName = self.context.backIpNameMap[host]
dataNode = self.context.clusterInfoDict[hostName]["dataNode"]
if not self.expansionSuccess[host]:
continue
sshTool = SshTool([host])
# get gaussdb version
resultMap, outputCollect = sshTool.getSshStatusOutput(
getversioncmd, [], envFile)
getGaussdbVersionCmd, [], envFile)
if resultMap[host] != DefaultValue.SUCCESS:
self.expansionSuccess[host] = False
notInstalledHosts.append(host)
failCheckGaussdbVersionHosts.append(host)
else:
version = versionPattern.findall(outputCollect)[0]
if version != primaryVersion:
gaussdbVersion = gaussdbVersionPattern.findall(outputCollect)[0]
if gaussdbVersion != primaryGaussdbVersion:
self.expansionSuccess[host] = False
wrongVersionHosts.append(host)
if notInstalledHosts:
self.logger.log("In local mode, database is not installed "
"correctly on these nodes:\n%s" % ", ".join(notInstalledHosts))
if wrongVersionHosts:
self.logger.log("In local mode, the database version is not same "
"with primary on these nodes:\n%s" % ", ".join(wrongVersionHosts))
self.logger.log("End to check the database with locale mode.")
wrongGaussdbVersionHosts.append(host)
self.cleanSshToolFile(sshTool)
continue
# get gs_om version
resultMap, outputCollect = sshTool.getSshStatusOutput(
getGsomVersionCmd, [], envFile)
if resultMap[host] != DefaultValue.SUCCESS:
self.expansionSuccess[host] = False
failCheckGsomVersionHosts.append(host)
else:
gsomVersion = gsomVersionPattern.findall(outputCollect)[0]
if gsomVersion != primaryGsomVersion:
self.expansionSuccess[host] = False
wrongGsomVersionHosts.append(host)
self.cleanSshToolFile(sshTool)
if failCheckGaussdbVersionHosts:
self.logger.log(ErrorCode.GAUSS_357["GAUSS_35707"] %
("gaussdb", ", ".join(failCheckGaussdbVersionHosts)))
if failCheckGsomVersionHosts:
self.logger.log(ErrorCode.GAUSS_357["GAUSS_35707"] %
("gs_om", ", ".join(failCheckGsomVersionHosts)))
if wrongGaussdbVersionHosts:
self.logger.log(ErrorCode.GAUSS_357["GAUSS_35708"] %
("gaussdb", ", ".join(wrongGaussdbVersionHosts)))
if wrongGsomVersionHosts:
self.logger.log(ErrorCode.GAUSS_357["GAUSS_35708"] %
("gs_om", ", ".join(wrongGsomVersionHosts)))
self.logger.log("End to check gaussdb and gs_om version.\n")
if self._isAllFailed():
GaussLog.exitWithError(ErrorCode.GAUSS_357["GAUSS_35706"] %
"check gaussdb and gs_om version")
def preInstall(self):
"""
@ -899,7 +939,7 @@ remoteservice={remoteservice}'"
"""
Check whether the cluster status is normal before expand.
"""
self.logger.debug("Start to check cluster status.\n")
self.logger.debug("Start to check cluster status.")
curHostName = socket.gethostname()
command = ""
@ -913,9 +953,7 @@ remoteservice={remoteservice}'"
resultMap, outputCollect = sshTool.getSshStatusOutput(command,
[curHostName], self.envFile)
if outputCollect.find("Primary Normal") == -1:
GaussLog.exitWithError("Unable to query current cluster status. " + \
"Please import environment variables or " +\
"check whether the cluster status is normal.")
GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"])
self.logger.debug("The primary database is normal.\n")
@ -975,8 +1013,8 @@ remoteservice={remoteservice}'"
(fstat[stat.ST_GID] == gid and (mode & stat.S_IRGRP > 0)):
pass
else:
self.logger.debug("User %s has no access right for file %s" \
% (self.user, xmlFile))
self.logger.debug(ErrorCode.GAUSS_501["GAUSS_50100"]
% (xmlFile, self.user))
os.chown(xmlFile, uid, gid)
os.chmod(xmlFile, stat.S_IRUSR)
@ -1060,10 +1098,8 @@ remoteservice={remoteservice}'"
if not self.context.standbyLocalMode:
self.logger.log("Start to install database on new nodes.")
self.installDatabaseOnHosts()
else:
self.checkLocalModeOnStandbyHosts()
self.logger.log("Database on standby nodes installed finished.\n")
self.checkGaussdbAndGsomVersionOfStandby()
self.logger.log("Start to establish the relationship.")
self.buildStandbyRelation()
# process success
@ -1221,9 +1257,9 @@ class GsCtlCommon:
self.logger.debug(host)
self.logger.debug(outputCollect)
if resultMap[host] == STATUS_FAIL:
GaussLog.exitWithError("Query cluster failed. Please check " \
"the cluster status or " \
"source the environmental variables of user [%s]." % self.user)
GaussLog.exitWithError(ErrorCode.GAUSS_516["GAUSS_51600"] +
"Please check the cluster status or source the environmental"
" variables of user [%s]." % self.user)
self.cleanSshToolTmpFile(sshTool)
return outputCollect