From 2ca588a3d8ee39abcc501d41d09524438923abca Mon Sep 17 00:00:00 2001 From: gyt0221 <846772234@qq.com> Date: Wed, 26 Aug 2020 16:06:27 +0800 Subject: [PATCH] =?UTF-8?q?om=20630=E5=90=8E=E4=BF=AE=E6=94=B9=E5=90=88?= =?UTF-8?q?=E5=85=A5=20=E4=BA=91=E5=AF=B9=E6=8E=A5=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=20=E6=89=93=E5=8C=85=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package/package.sh | 5 + src/bin/gs_guc/cluster_guc.conf | 24 +- src/manager/om/script/gs_check | 340 ++++++++++-------- src/manager/om/script/gs_sshexkey | 14 +- .../om/script/gspylib/common/Common.py | 10 +- .../om/script/gspylib/common/DbClusterInfo.py | 21 +- .../script/gspylib/component/Kernel/Kernel.py | 20 +- .../om/script/gspylib/hardware/gsdisk.py | 2 +- .../gspylib/inspection/common/CheckResult.py | 8 +- .../items/cluster/CheckMpprcFile.py | 11 +- .../inspection/items/network/CheckNetSpeed.py | 4 +- .../gspylib/inspection/items/os/CheckNTPD.py | 4 +- src/manager/om/script/gspylib/os/gsOSlib.py | 9 +- .../om/script/gspylib/threads/SshTool.py | 8 +- .../om/script/impl/om/OLAP/OmImplOLAP.py | 19 + .../script/impl/preinstall/PreinstallImpl.py | 4 +- src/manager/om/script/local/LocalCollect.py | 94 +++-- 17 files changed, 342 insertions(+), 255 deletions(-) diff --git a/package/package.sh b/package/package.sh index 59fdc5d0b..1b02896ac 100644 --- a/package/package.sh +++ b/package/package.sh @@ -303,6 +303,11 @@ function install_gaussdb() export LD_LIBRARY_PATH=$GAUSSHOME/lib:$LD_LIBRARY_PATH commitid=$(LD_PRELOAD='' ${BUILD_DIR}/bin/gaussdb -V | awk '{print $5}' | cut -d ")" -f 1) + if [ -z $commitid ] + then + commitid=$(date "+%Y%m%d%H%M%S") + commitid=${commitid:4:8} + fi echo "${commitid}" >>${SCRIPT_DIR}/version.cfg echo "End insert commitid into version.cfg" >> "$LOG_FILE" 2>&1 } diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 43564816f..9726396ca 100644 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -571,25 +571,34 @@ cmserver_ha_status_interval|int|0,2147483647|NULL|NULL| cmserver_self_vote_timeout|int|0,2147483647|NULL|This parameter works only when cmserver_self_vote_timeout >= cmserver_ha_heartbeat_timeout, otherwise, it will work based on cmserver_ha_heartbeat_timeout.| enable_transaction_read_only|bool|0,0|NULL|NULL| datastorage_threshold_check_interval|int|1,2592000|NULL|NULL| -alarm_report_max_count|int|5,2592000|NULL|NULL| +alarm_report_max_count|int|1,2592000|NULL|NULL| datastorage_threshold_value_check|int|1,99|NULL|NULL| max_datastorage_threshold_check|int|1,2592000|NULL|NULL| -coordinator_heartbeat_timeout|int|0,2147483647|NULL|NULL| +phony_dead_effective_time|int|0,2147483647|NULL|NULL| +instance_keep_heartbeat_timeout|int|0,2147483647|NULL|NULL| cm_server_arbitrate_delay_base_time_out|int|0,2147483647|NULL|NULL| cm_server_arbitrate_delay_incrememtal_time_out|int|0,2147483647|NULL|NULL| enable_az_auto_switchover|int|0,1|NULL|NULL| cm_auth_method|enum|trust,gss|NULL|NULL| cm_krb_server_keyfile|string|0,0|NULL|NULL| +switch_rto|int|0,2147483647|NULL|NULL| +force_promote|int|0,1|NULL|NULL| +az_switchover_threshold|int|1,100|NULL|NULL| +az_check_and_arbitrate_interval|int|1,2147483647|NULL|NULL| +az_connect_check_interval|int|1,2147483647|NULL|NULL| +az_connect_check_delay_time|int|1,2147483647|NULL|NULL| +cmserver_demote_delay_on_etcd_fault|int|1,2147483647|NULL|NULL| +instance_phony_dead_restart_interval|int|1,2147483647|NULL|NULL| [cmagent] log_dir|string|0,0|NULL|NULL| log_file_size|int|0,2147483647|MB|NULL| log_min_messages|enum|debug5,debug1,log,warning,error,fatal|NULL|NULL| -log_max_size|int|0,1024|NULL|NULL| -log_max_count|int|0,100000|NULL|NULL| +log_max_size|int|0,2147483647|NULL|NULL| +log_max_count|int|0,10000|NULL|NULL| log_saved_days|int|0,1000|NULL|NULL| enable_log_compress|bool|0,0|NULL|NULL| alarm_report_interval|int|0,2147483647|NULL|NULL| -alarm_report_max_count|int|5,2592000|NULL|NULL| +alarm_report_max_count|int|1,2592000|NULL|NULL| alarm_component|string|0,0|NULL|NULL| incremental_build|bool|0,0|NULL|NULL| agent_report_interval|int|0,2147483647|NULL|NULL| @@ -597,12 +606,15 @@ agent_heartbeat_timeout|int|0,2147483647|NULL|NULL| agent_connect_timeout|int|0,2147483647|NULL|NULL| agent_connect_retries|int|0,2147483647|NULL|NULL| agent_check_interval|int|0,2147483647|NULL|NULL| +agent_kill_instance_timeout|int|0,2147483647|NULL|NULL| +log_threshold_check_interval|int|0,2147483647|NULL|NULL| +dilatation_shard_count_for_disk_capacity_alarm|int|0,2147483647|NULL|NULL| security_mode|bool|0,0|NULL|NULL| upgrade_from|int|0,4294967295|NULL|For upgrading, specify which version we are upgrading from.| unix_socket_directory|string|0,0|NULL|NULL| enable_xc_maintenance_mode|bool|0,0|NULL|NULL| process_cpu_affinity|int|0,2|NULL|NULL| -enable_cn_auto_repair|bool|0,0|NULL|NULL| +agent_phony_dead_check_interval|int|0,2147483647|NULL|NULL| [lcname] allow_concurrent_tuple_update|bool|0,0|NULL|NULL| prefetch_quantity|int|128,131072|kB|NULL| diff --git a/src/manager/om/script/gs_check b/src/manager/om/script/gs_check index ab3bc6bb2..f342c62b2 100644 --- a/src/manager/om/script/gs_check +++ b/src/manager/om/script/gs_check @@ -83,12 +83,32 @@ DEFAULT_TIMEOUT = 1500 # because single clusters don't need to perform consistency checks and # internal communication class checks SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer", - "CheckNTPD", - "CheckNoCheckSum", "CheckCpuCount", + "CheckNTPD", "CheckNoCheckSum", "CheckCpuCount", "CheckMemInfo", "CheckDiskConfig", "CheckUpVer", "CheckPgxcgroup", "CheckPing", - "CheckNetWorkDrop", - "CheckNetSpeed"] + "CheckNetWorkDrop", "CheckNetSpeed"] + +SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD", + "CheckSshdService", "CheckNoCheckSum", "CheckEtcHosts", + "CheckCpuCount", "CheckHyperThread", "CheckMemInfo", + "CheckKernelVer", "CheckEncoding", "CheckBootItems", + "CheckDropCache", "CheckFilehandle", "CheckKeyProAdj", + "CheckDiskFormat", "CheckInodeUsage", "CheckSpaceUsage", + "CheckDiskConfig", "CheckXid", "CheckSysTabSize", + "CheckClusterState", "CheckConfigFileDiff", "CheckUpVer", + "CheckEnvProfile", "CheckGaussVer", "CheckPortRange", + "CheckReadonlyMode", "CheckCatchup", "CheckProcessStatus", + "CheckSpecialFile", "CheckCollector", "CheckLargeFile", + "CheckProStartTime", "CheckMpprcFile", "CheckLockNum", + "CheckCurConnCount", "CheckCursorNum", "CheckPgxcgroup", + "CheckLockState", "CheckIdleSession", "CheckDBConnection", + "CheckSysTable", "CheckSysTabSize", "CheckTableSpace", + "CheckTableSkew", "CheckDNSkew", "CheckCreateView", + "CheckHashIndex", "CheckNextvalInDefault", "CheckPgxcRedistb", + "CheckReturnType", "CheckSysadminUser", "CheckTDDate", + "CheckDropColumn", "CheckDiskFailure", "CheckPing", + "CheckNetWorkDrop", "CheckUsedPort", "CheckNICModel", + "CheckRouting", "CheckNetSpeed", "CheckDataDiskUsage"] class CmdOptions(): @@ -358,10 +378,10 @@ class CheckContext(): input : remote host name and password map output : NA ''' - if (len(hosts) == 0 or g_opts.isSingle): + if len(hosts) == 0 or g_opts.isSingle: return fileName = self.getCacheFile() - if (not os.path.isfile(fileName)): + if not os.path.isfile(fileName): raise CheckException("File %s is not exist or invalid" % fileName) try: pool = ThreadPool(DefaultValue.getCpuSet()) @@ -390,7 +410,8 @@ Usage: General options: -i Health check item number. - OLAP Example: -i CheckCPU,CheckMTU,CheckPing. + OLAP Example: -i CheckCPU,CheckMTU, + CheckPing. -e Health check scene name. OLAP Example: -e inspect/upgrade/slow_node/ binary_upgrade/health/install/longtime @@ -398,17 +419,19 @@ General options: -L Run the command as local mode. -l Path of log file. -o Save the result to the specified directory. - --cid The check ID used for identify a check process, - only for internal use. + --cid The check ID used for identify a check + process, only for internal use. --skip-root-items Skip the items with root privileges. --disk-threshold Set disk threshold for checking disk usage, only for CheckDataDiskUsage. --format Set the format of the result report. --set Set abnormal items if supported - --time-out Set the timeout for scene check, default 1500 seconds. + --time-out Set the timeout for scene check, default + 1500 seconds. --routing The network segment with business ip, example: 192.168.1.1:255.255.255.0 - --skip-items Skip the specified check item or setting item with scene check + --skip-items Skip the specified check item or setting + item with scene check Example: --skip-items CheckCPU,CheckMTU -?, --help Show help information for this utility, and exit the command line mode. @@ -453,7 +476,7 @@ def parseCommandLine(): g_opts = CmdOptions() ParaObj = Parameter() ParaDict = ParaObj.ParameterCommandLine("check") - if ("helpFlag" in list(ParaDict.keys())): + if "helpFlag" in list(ParaDict.keys()): usage() sys.exit(0) @@ -475,50 +498,51 @@ def parseCommandLine(): raise UseBothParameterException( (paraNameMap[para], paraNameMap[irrelevantPara[para]])) - if ("itemstr" in list(ParaDict.keys())): + if "itemstr" in list(ParaDict.keys()): g_opts.items = ParaDict["itemstr"] - if ("scenes" in list(ParaDict.keys())): + if "scenes" in list(ParaDict.keys()): g_opts.scene = ParaDict["scenes"] - if ("outFile" in list(ParaDict.keys())): + if "outFile" in list(ParaDict.keys()): g_context.outPath = ParaDict["outFile"] - if ("logFile" in list(ParaDict.keys())): + if "logFile" in list(ParaDict.keys()): g_opts.logFile = ParaDict["logFile"] - if ("user" in list(ParaDict.keys())): + if "user" in list(ParaDict.keys()): g_context.user = ParaDict["user"] - if ("hostfile" in list(ParaDict.keys())): + if "hostfile" in list(ParaDict.keys()): for node in g_file.readFile(ParaDict["hostfile"]): g_opts.nodes.append(node.strip()) - if ("cid" in list(ParaDict.keys())): + if "cid" in list(ParaDict.keys()): g_context.setCheckID(ParaDict["cid"]) g_opts.distributing = True - if ("localMode" in list(ParaDict.keys())): + if "localMode" in list(ParaDict.keys()): g_opts.localMode = True - if ("skipRootItems" in list(ParaDict.keys())): + if "skipRootItems" in list(ParaDict.keys()): g_opts.skipRootItems = True - if ("disk-threshold" in list(ParaDict.keys())): + if "disk-threshold" in list(ParaDict.keys()): g_context.thresholdDn = ParaDict["disk-threshold"] - if ("set" in list(ParaDict.keys())): + if "set" in list(ParaDict.keys()): g_context.set = True - if ("routing" in list(ParaDict.keys())): + if "routing" in list(ParaDict.keys()): g_opts.routing = ParaDict["routing"] - if ("skipItems" in list(ParaDict.keys())): + if "skipItems" in list(ParaDict.keys()): g_opts.skipItems = ParaDict["skipItems"] - if ("nodegroup_name" in list(ParaDict.keys())): + if "nodegroup_name" in list(ParaDict.keys()): g_context.LCName = ParaDict["nodegroup_name"] - if ("shrinkNodes" in list(ParaDict.keys())): + if "shrinkNodes" in list(ParaDict.keys()): g_context.ShrinkNodes = ParaDict["shrinkNodes"] - if ("time_out" in list(ParaDict.keys())): + if "time_out" in list(ParaDict.keys()): try: g_opts.timeout = int(ParaDict["time_out"]) except Exception: raise CheckException("The parameter timeout set invalid value") - if (g_opts.timeout < DEFAULT_TIMEOUT): + if g_opts.timeout < DEFAULT_TIMEOUT: raise CheckException( - "The timeout parameter must be set larger than default value 1500 seconds") + "The timeout parameter must be set larger than default " + "value 1500 seconds") setTimeOut() - if ("format" in list(ParaDict.keys())): + if "format" in list(ParaDict.keys()): g_opts.format = ParaDict["format"] - if (g_opts.format not in formatList): + if g_opts.format not in formatList: raise CheckException( "Format %s is not available,the valid format is %s" % ( g_opts.format, ",".join(formatList))) @@ -546,13 +570,13 @@ def checkParameter(): def checkuser(): # The new node scenario does not need the -U parameter - if (__isRoot() and not g_opts.localMode): + if __isRoot() and not g_opts.localMode: g_context.user = None return # Default mode -U for the current user - if (not __isRoot() and not g_context.user): + if not __isRoot() and not g_context.user: g_context.user = SharedFuncs.getCurrentUser() - if (g_context.user): + if g_context.user: if not __isRoot() and g_context.user != SharedFuncs.getCurrentUser(): raise CheckException( "The user %s is not current user" % g_context.user) @@ -561,22 +585,22 @@ def checkuser(): except Exception: raise CheckException( "The user %s is not a effective user." % g_context.user) - if (user_uid == 0): + if user_uid == 0: raise CheckException("The -U parameter can not be the root user.") isClusterUser = SharedFuncs.checkClusterUser(g_context.user, __getMpprcFile()) - if (isClusterUser): + if isClusterUser: # get cluster information g_context.mpprc = __getMpprcFile() clusterInfo = g_context.loadClusterInfo(g_context.user) - if (clusterInfo): + if clusterInfo: g_opts.cluster = clusterInfo else: isClusterUser = False - if (not isClusterUser): + if not isClusterUser: raise CheckException( "The user %s is not valid cluster user" % g_context.user) - if (g_opts.localMode or g_opts.distributing): + if g_opts.localMode or g_opts.distributing: return # Check cluster user trust @@ -588,14 +612,14 @@ def checkuser(): psshPath = os.path.join(appPath, 'script/gspylib/pssh/bin/pssh') cmd = "%s -H %s 'id' " % (psshPath, " -H ".join(dbNameList)) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: errorNode = [] for result in output.split('\n'): - if (result.strip() == ""): + if result.strip() == "": continue resultInfo = result.split() # Analyze the results - if (len(resultInfo) > 3 and resultInfo[2] == "[SUCCESS]"): + if len(resultInfo) > 3 and resultInfo[2] == "[SUCCESS]": continue elif (len(resultInfo) > 3 and resultInfo[2] == "[FAILURE]" and resultInfo[3] in dbNameList): @@ -604,7 +628,7 @@ def checkuser(): raise CheckException( "Failed to check user trust. commands: %s Error:/n%s" % (cmd, output)) - if (errorNode): + if errorNode: raise CheckException( "Failed to check user trust with %s" % errorNode) else: @@ -613,20 +637,20 @@ def checkuser(): def createPath(path, user=""): - if (path == "/dev/null"): + if path == "/dev/null": return - if (os.path.isdir(path)): + if os.path.isdir(path): # test write permissions - if (not g_file.checkDirWriteable(path)): + if not g_file.checkDirWriteable(path): raise CheckException( "Failed to create or delete file in the [%s]." % path) - elif (os.path.isfile(path)): + elif os.path.isfile(path): raise CheckException("The out path [%s] must be a directory." % path) else: # path is not exist. recursively create the path g_file.createDirectory(path, True, DefaultValue.KEY_DIRECTORY_MODE) # Modify the file owner - if (__isRoot() and user): + if __isRoot() and user: g_file.changeOwner(user, path) @@ -651,11 +675,12 @@ def initLogFile(): output: NA """ global g_context, g_logger - # load the context when the script ruuning on local mode and the context was cached before + # load the context when the script ruuning on local mode and the context + # was cached before g_context.tmpPath = getTmpPath() - if (g_context.isCached()): + if g_context.isCached(): g_context = g_context.load() - if (__getLocalNode(g_context.nodes) in g_context.newNodes): + if __getLocalNode(g_context.nodes) in g_context.newNodes: g_context.mpprc = None g_context.user = None g_context.cluster = None @@ -667,9 +692,9 @@ def initLogFile(): else: # Parameter specified first, followed by default GAUSSLOG, # last temporary directory - if (g_opts.logFile): + if g_opts.logFile: g_context.logFile = os.path.realpath(g_opts.logFile) - elif (g_opts.cluster): + elif g_opts.cluster: g_context.logFile = os.path.join(g_opts.cluster.logPath, '%s/om/gs_check.log' % g_context.user) @@ -687,16 +712,16 @@ def initLogFile(): # Load support check items by parsing the project folder g_context.loadSupportItems() # load the scene configuration - if (g_opts.scene): + if g_opts.scene: g_context.loadSceneConfiguration(g_opts.scene) # load cluster info - if (g_opts.cluster): + if g_opts.cluster: g_context.cluster = g_opts.cluster g_context.oldNodes = g_opts.cluster.getClusterSshIps()[0] # load nodes - if (g_opts.nodes): + if g_opts.nodes: for node in g_opts.nodes: - if (node not in g_context.oldNodes): + if node not in g_context.oldNodes: g_context.newNodes.append(node) g_context.nodes = g_context.oldNodes + g_context.newNodes @@ -716,11 +741,11 @@ def getRootUserPwd(): g_logger.debug("Ask user input password interactive") for host in g_context.nodes: isPwdOk = SharedFuncs.verifyPasswd(host, rootuser, rootpwd) - if (not isPwdOk): + if not isPwdOk: # try to connect remote node again rootpwd = __retryConnection(host, rootuser) g_opts.pwdMap[host] = (rootuser, rootpwd) - if (pwd.getpwnam(rootuser).pw_uid != 0): + if pwd.getpwnam(rootuser).pw_uid != 0: raise CheckException("Enter the user [%s] does not have" " root privileges." % rootuser) # print message on screen @@ -735,7 +760,7 @@ def parseCheckContext(): """ global g_context initLogFile() - if (g_context.isCached()): + if g_context.isCached(): return g_logger.debug("Start to parse the check items config file") items_all = [] @@ -744,7 +769,7 @@ def parseCheckContext(): failedItems = [] singleSkipList = [] # generate the items from scene configuration - if (g_opts.scene): + if g_opts.scene: items_oldNode, failedItems = __parseScene(g_opts.scene) items_all += items_oldNode # generate the items from -i parameter value @@ -756,23 +781,23 @@ def parseCheckContext(): else: items_all.append(item) for item in items_all: - if (not g_context.set and item['name'] in g_opts.skipItems): + if not g_context.set and item['name'] in g_opts.skipItems: items_all.remove(item) continue - if (g_context.set and item['set_permission'] == 'root'): + if g_context.set and item['set_permission'] == 'root': g_context.rootItems.append(item) - if (g_opts.skipRootItems and item['permission'] == 'root'): + if g_opts.skipRootItems and item['permission'] == 'root': items_all.remove(item) continue - if (item['permission'] == 'root'): + if item['permission'] == 'root': g_context.rootItems.append(item) - if (g_opts.isSingle and item['name'] in SINGLE_SKIP): + if g_opts.isSingle and item['name'] in SINGLE_SKIP: singleSkipList.append(item['name']) continue - if (item['name'] == "CheckRouting"): - if (g_opts.routing): + if item['name'] == "CheckRouting": + if g_opts.routing: g_context.routing = g_opts.routing - elif (g_opts.cluster): + elif g_opts.cluster: workIP = g_opts.cluster.getDbNodeByName( DefaultValue.GetHostIpOrName()).backIps[0] g_context.routing = "%s:%s" % ( @@ -781,16 +806,16 @@ def parseCheckContext(): raise CheckException( "The --routing is required when cluster dosen't exist") g_context.items.append(item) - if (len(singleSkipList) != 0): + if len(singleSkipList) != 0: __printOnScreen( "The following items are skipped when the type of cluster is" " single:\n[%s]" % ",".join(singleSkipList)) - if (not items_newNode): + if not items_newNode: g_context.oldItems = g_context.items else: g_context.oldItems = items_oldNode g_context.newItems = items_newNode - if (g_context.set and items_all): + if g_context.set and items_all: # Settings will have a big impact and need to be confirmed confirmItem = { "CheckCrontabLeft": "Clear om_monitor in crond service", @@ -798,36 +823,41 @@ def parseCheckContext(): "'/var/log/Bigdata/','/home/omm/'", "CheckProcessLeft": "Kill all process with gaussdb and omm user", "CheckOmmUserExist": "Delete system user omm", - "CheckPortConflict": "kill all process with occupies the 25xxx port" + "CheckPortConflict": "kill all process with occupies " + "the 25xxx port" } confirmMsg = "" for item in items_all: - if (item['name'] in list(confirmItem.keys())): + if item['name'] in list(confirmItem.keys()): confirmMsg += confirmItem[item['name']] + "\n" - if (confirmMsg): - confirmMsg = "Warning: Executing the settings will do the following at the [%s] node:\n" % \ + if item['name'] in SETITEM_SKIP: + g_context.skipSetItem.append(item['name']) + + if confirmMsg: + confirmMsg = "Warning: Executing the settings will do " \ + "the following at the [%s] node:\n" % \ ','.join(g_context.newNodes) + confirmMsg __printOnScreen(confirmMsg) flag = input("Execution settings? (Y/N):") - while (True): + while True: # If it is not yes or all, it has been imported - if (not flag.upper() in ("Y", "N", "YES", "NO")): + if not flag.upper() in ("Y", "N", "YES", "NO"): flag = input("Please type 'yes' or 'no': ") continue break - if (flag.upper() in ("Y", "YES")): + if flag.upper() in ("Y", "YES"): pass - if (flag.upper() in ("N", "NO")): - skipSetItem = [] + if flag.upper() in ("N", "NO"): for Item in g_context.newItems: - if (Item['name'] in list(confirmItem.keys())): + if Item['name'] in list(confirmItem.keys()): g_context.newItems.remove(Item) - skipSetItem.append(Item['name']) + g_context.skipSetItem.append(Item['name']) __printOnScreen( - 'Skip the settings for [%s]' % ','.join(skipSetItem)) - if (failedItems): + 'Skip the settings for [%s]' + % ','.join(g_context.skipSetItem)) + if failedItems: raise ParseItemException(failedItems) - if (not g_context.items): + if not g_context.items: raise CheckException("No check item can be performed," " please confirm the input parameters.") @@ -853,7 +883,7 @@ def __printOnScreen(msg): """ function: print message on screen """ - if (g_opts.localMode or g_opts.distributing): + if g_opts.localMode or g_opts.distributing: return g_logger.info(msg) @@ -884,7 +914,7 @@ def __getLocalNode(nodes): """ if nodes: for n in nodes: - if (SharedFuncs.is_local_node(n)): + if SharedFuncs.is_local_node(n): return n return DefaultValue.GetHostIpOrName() @@ -893,7 +923,7 @@ def __getSeparatedValue(value, separator=","): ''' get command line value which were separated by "," ''' - if (separator not in value): + if separator not in value: return [value] return value.split(separator) @@ -924,7 +954,7 @@ def __retryConnection(host, user): "Please enter password for user[%s] on the node[%s]:" % (user, host)) isOK = SharedFuncs.verifyPasswd(host, user, passwd) - if (isOK): + if isOK: return passwd else: continue @@ -938,27 +968,27 @@ def __getMpprcFile(): """ # get mpprc file envValue = DefaultValue.getEnv("MPPDB_ENV_SEPARATE_PATH") - if (envValue is not None and os.path.isfile(envValue)): + if envValue is not None and os.path.isfile(envValue): return envValue - elif (not __isRoot() and DefaultValue.getEnv('GAUSS_ENV')): + elif not __isRoot() and DefaultValue.getEnv('GAUSS_ENV'): cmd = "echo ~ 2>/dev/null" (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: raise CheckException( "Fetching user environment variable file failed." " Please setup environment variables." + "The cmd is %s" % cmd) else: return os.path.join(output, ".bashrc") - elif (__isRoot() and g_context.user): + elif __isRoot() and g_context.user: cmd = "su - %s -c 'echo ~ 2>/dev/null'" % g_context.user (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: raise CheckException( "Failed to get user [%s] home directory. Error: %s\n" % ( g_context.user, output) + "The cmd is %s" % cmd) else: return os.path.join(output, ".bashrc") - elif (__isRoot()): + elif __isRoot(): return "" else: raise CheckException("The separated mpprc file was not found." @@ -986,7 +1016,7 @@ def __parseScene(sceneName): ''' function: parse scene configure file ''' - if (not sceneName): + if not sceneName: raise NotEmptyException("scene name") # Get scene xml xmlFile = "%s/config/scene_%s.xml" % (g_context.basePath, sceneName) @@ -1003,7 +1033,7 @@ def __parseScene(sceneName): for elem in rootNode.findall('allowitems/item'): elemName = elem.attrib['name'] # check the check item whether exist or not - if (elemName not in list(g_context.supportItems.keys())): + if elemName not in list(g_context.supportItems.keys()): raise NotExistException("elemName", "support items") # save threshold as text and parse them later subElem = elem.find('threshold') @@ -1014,7 +1044,7 @@ def __parseScene(sceneName): # parse categories and get all items for category in rootNode.findall('allowcategories/category'): cpath = "%s/items/%s" % (g_context.basePath, category.attrib['name']) - if (os.path.isdir(cpath)): + if os.path.isdir(cpath): itemNames.extend(x[:-3] for x in os.listdir(cpath) if x[:-3] not in itemNames and x.endswith(".py")) @@ -1032,10 +1062,10 @@ def __parseScene(sceneName): failedItems.append(i) # overwrite the threshold parameters - if (thresholds and i in list(thresholds.keys())): + if thresholds and i in list(thresholds.keys()): # parse the threshold of check item sceneThreshold = __parseThreshold(thresholds[i]) - if (item['threshold']): + if item['threshold']: item['threshold'] = dict(item['threshold'], **sceneThreshold) else: item['threshold'] = sceneThreshold @@ -1047,14 +1077,14 @@ def __parseOneItem(itemName): ''' function: parse one check item and get the full information ''' - if (not itemName): + if not itemName: raise NotEmptyException("Item name") item = {} # try to load check item configuration from xml file xmlFile = "%s/config/items.xml" % g_context.basePath for event, elem in ETree.iterparse(xmlFile): - if (event == 'end'): - if (elem.tag == 'checkitem' and elem.attrib['name'] == itemName): + if event == 'end': + if elem.tag == 'checkitem' and elem.attrib['name'] == itemName: # Parse the xml file item['id'] = elem.attrib['id'] item['name'] = elem.attrib['name'] @@ -1076,7 +1106,7 @@ def __parseOneItem(itemName): 'default') # Get the threshold threshold = elem.find('threshold') - if (threshold is not None and threshold.text is not None): + if threshold is not None and threshold.text is not None: # parse the threshold of check item item["threshold"] = __parseThreshold( threshold.text.strip()) @@ -1089,7 +1119,7 @@ def __parseAttr(elem, attr, language='zh'): function: parse the xml attr with language ''' val = elem.find('/'.join([attr, language])) - if (val is not None and val.text is not None): + if val is not None and val.text is not None: return val.text.strip().encode('utf-8') return "" @@ -1100,7 +1130,7 @@ def __parseProperty(elem, propertyName, defaultValue): ''' prop = elem.find(propertyName) result = defaultValue - if (prop is not None and prop.text is not None): + if prop is not None and prop.text is not None: result = prop.text.strip() return result @@ -1110,10 +1140,10 @@ def __parseThreshold(value, separator=";"): function: parse the threshold of check item ''' result = {} - if (separator not in value and "=" not in value): + if separator not in value and "=" not in value: return result - if (separator not in value and "=" in value): + if separator not in value and "=" in value: d = value.strip().split('=') result[d[0]] = d[1] else: @@ -1137,11 +1167,11 @@ def getMTUValue(node): sshIp = node # get all network card information cmd1 = """printf \"\n\n`/sbin/ifconfig -a`\n\n\" """ - if (not g_opts.pwdMap): + if not g_opts.pwdMap: output = SharedFuncs.runSshCmd(cmd1, sshIp, g_context.user) else: username, passwd = g_opts.pwdMap[node] - if (username is None or passwd is None): + if username is None or passwd is None: raise CheckException("Retrive username and password error.") output = SharedFuncs.runSshCmdWithPwd(cmd1, sshIp, username, passwd) # Separate each network card @@ -1151,37 +1181,37 @@ def getMTUValue(node): mtuValue = "" # find network card by IP for eachNet in networkInfoList: - if (eachNet.find(addr) > 0 and eachNet.find('inet') > 0): + if eachNet.find(addr) > 0 and eachNet.find('inet') > 0: networkInfo = eachNet break - if (not networkInfo): + if not networkInfo: raise CheckException( "Failed to get network card information with '%s'." % node) # get network number networkNum = networkInfo.split()[0] # Remove : if it exists - if (networkNum[-1] == ":"): + if networkNum[-1] == ":": networkNum = networkNum[:-1] for eachLine in networkInfo.split('\n'): # get mtu Value with SuSE and redHat6.x - if (eachLine.find('MTU') > 0): + if eachLine.find('MTU') > 0: mtuValue = eachLine.split(':')[1].split(' ')[0].strip() break # get mtu Value with redHat7.x - elif (eachLine.find('mtu') > 0): + elif eachLine.find('mtu') > 0: mtuValue = eachLine.split()[-1] break else: continue - if (not networkNum): + if not networkNum: raise CheckException( "Failed to get network card number with '%s'." % node) - if (not mtuValue): + if not mtuValue: raise CheckException( "Failed to get network card mtu value with '%s' '%s'." % (node, networkNum)) # The nodes are grouped by MTU value - if (not mtuValue in list(g_mtuMap.keys())): + if not mtuValue in list(g_mtuMap.keys()): g_mtuMap[mtuValue] = ["%s-%s" % (node, networkNum)] else: g_mtuMap[mtuValue].append("%s-%s" % (node, networkNum)) @@ -1194,10 +1224,10 @@ def preCheck(): output: NA """ # patch ssh config - if (__isRoot()): + if __isRoot(): cmd = "grep -E '^MaxStartups[\ \t]+1000' /etc/ssh/sshd_config" (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: cmd = "sed -i '/MaxStartups/d' /etc/ssh/sshd_config &&" \ " echo 'MaxStartups 1000' >> /etc/ssh/sshd_config &&" \ " service sshd reload" @@ -1215,7 +1245,7 @@ def preCheck(): except Exception as e: raise Exception(str(e)) # According to the number of groups to determine whether the same - if (len(list(g_mtuMap.keys())) > 1): + if len(list(g_mtuMap.keys())) > 1: warningMsg = "Warning: The MTU value is inconsistent on all node," \ " maybe checking will be slower or hang." for mtuValue in list(g_mtuMap.keys()): @@ -1248,8 +1278,8 @@ def doCheck(): output: NA """ # Local mode - if (g_opts.localMode): - if (__isDistributing()): + if g_opts.localMode: + if __isDistributing(): # load check item dynamic and get the execute result doRunCheck() else: @@ -1288,23 +1318,23 @@ def doCheck(): # Time to hit the log LogCount = 0 lastTimeProgress = -1 - while (len(nodes) and datetime.now() <= g_endTime): + while len(nodes) and datetime.now() <= g_endTime: totleCount = 0 slowNode = [] for node in nodes: # Get user and password username, passwd = __getUserAndPwd(node) - if (node in g_context.oldNodes): + if node in g_context.oldNodes: itemCount_node = len(g_context.oldItems) else: itemCount_node = len(g_context.newItems) # Local execution - if (SharedFuncs.is_local_node(node)): + if SharedFuncs.is_local_node(node): checkCount = SharedFuncs.checkComplete( checkID, node, g_context.hostMapping[node], g_context.user, g_context.tmpPath) # Executed in new node scene - elif (node in g_context.newNodes): + elif node in g_context.newNodes: checkCount = SharedFuncs.checkComplete( checkID, node, g_context.hostMapping[node], username, g_context.tmpPath, passwd) @@ -1318,14 +1348,14 @@ def doCheck(): checkCount = 0 # If there is a node check completed, # some nodes just started,record slow node - if (overNodes > 0 and checkCount < 2): + if overNodes > 0 and checkCount < 2: slowNode.append(node) - if (checkCount == itemCount_node): + if checkCount == itemCount_node: nodes.remove(node) # Record the number of completed nodes overNodes += 1 - if (not SharedFuncs.is_local_node(node)): - if (node in g_context.newNodes): + if not SharedFuncs.is_local_node(node): + if node in g_context.newNodes: outItems = [] for i in itemsName: outItems.append("%s/%s_%s_%s.out" % ( @@ -1353,23 +1383,23 @@ def doCheck(): # Update execution progress progressInfo = totleCount // len(g_context.nodes) # Refresh only as the schedule changes - if (lastTimeProgress < progressInfo <= itemCount): + if lastTimeProgress < progressInfo <= itemCount: progress_manager.update("Checking...", progressInfo) lastTimeProgress = progressInfo # Suggest the slow node to log every 30 seconds - if (slowNode and itemCount > 1 and LogCount % 30 == 0): + if slowNode and itemCount > 1 and LogCount % 30 == 0: logMsg = "Warning: The node [%s] check progress" \ " is slow." % ",".join(slowNode) g_logger.debug(logMsg) for t in threads: - if (t.exitcode == 1): + if t.exitcode == 1: raise ThreadCheckException(t.name, t.exception) for t in threads: t.join(1) - if (datetime.now() > g_endTime): + if datetime.now() > g_endTime: raise TimeoutException(nodes) __printOnScreen("Start to analysis the check result") @@ -1395,11 +1425,11 @@ def doRunCheck(): """ outputPath = g_context.tmpPath localHost = __getLocalNode(g_context.nodes) - if (localHost in g_context.newNodes): + if localHost in g_context.newNodes: items = g_context.newItems else: items = g_context.oldItems - if (g_context.hostMapping): + if g_context.hostMapping: localHost = g_context.hostMapping[localHost] for item in items: content = "" @@ -1416,9 +1446,9 @@ def doRunCheck(): itemResult = __analysisResult(content, item['name']) g_result.append(itemResult) # run the check process distributing and no need to clean the resource - if (__isDistributing()): + if __isDistributing(): g_logger.debug("run check items done and exit the command") - if (g_opts.format == 'default'): + if g_opts.format == 'default': # Initialize the self.clusterInfo variable print(g_result.outputRaw()) @@ -1448,11 +1478,11 @@ def __prepareCmd(items, user, checkid): userParam = "" checkIdParam = "" routingParam = "" - if (user): + if user: userParam = " -U %s " % user - if (checkid): + if checkid: checkIdParam = " --cid=%s " % checkid - if (g_context.routing): + if g_context.routing: routingParam = "--routing %s" % g_context.routing cmd = "%s/gs_check -i %s %s %s -L %s -o %s -l %s" % ( cmdPath, ",".join(itemsName), userParam, checkIdParam, @@ -1472,7 +1502,7 @@ def doLocalCheck(host): g_context.checkID) else: cmd = __prepareCmd(g_context.newItems, "", g_context.checkID) - if (SharedFuncs.is_local_node(host)): + if SharedFuncs.is_local_node(host): if __hasRootItems(): SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0], g_opts.pwdMap[host][1], g_context.mpprc) @@ -1484,7 +1514,7 @@ def doLocalCheck(host): else: # get username and password for certain node username, passwd = __getUserAndPwd(host) - if (host in g_context.newNodes): + if host in g_context.newNodes: SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd) else: SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd, @@ -1496,7 +1526,7 @@ def __analysisResult(output, itemName): function: analysis the check result """ item_result = ItemResult.parse(output) - if (not item_result): + if not item_result: raise CheckException("analysis result occurs error") try: # load support item @@ -1616,11 +1646,15 @@ def __checkFileExist(path, filePattern): def killChildProcess(node): checkID = g_context.checkID # cmd with switch users - cmd_switch = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'|awk '{print \$2}'` """ % checkID - cmd_switch += """ && (if [ X\"$proc_pid_list\" != X\"\" ]; then echo \"$proc_pid_list\" | xargs kill -9 ; fi)""" + cmd_switch = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \ + """|awk '{print \$2}'` """ % checkID + cmd_switch += """ && (if [ X\"$proc_pid_list\" != X\"\" ]; """ \ + """then echo \"$proc_pid_list\" | xargs kill -9 ; fi)""" # cmd with not switch users - cmd_current = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'|awk "{print \\\$2}"` """ % checkID - cmd_current += """ && (if [ X"$proc_pid_list" != X"" ]; then echo "$proc_pid_list" | xargs kill -9 ; fi)""" + cmd_current = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \ + """|awk "{print \\\$2}"` """ % checkID + cmd_current += """ && (if [ X"$proc_pid_list" != X"" ]; then """ \ + """echo "$proc_pid_list" | xargs kill -9 ; fi)""" username, passwd = __getUserAndPwd(node) if SharedFuncs.is_local_node(node) and not __hasRootItems(): @@ -1633,7 +1667,7 @@ def killChildProcess(node): def cleanTmpDir(node): # clean tmp files in all the nodes - cmd = r"rm -rf %s" % (g_context.tmpPath) + cmd = r"rm -rf %s" % g_context.tmpPath if SharedFuncs.is_local_node(node): SharedFuncs.runShellCmd(cmd) else: @@ -1665,7 +1699,7 @@ def cleanEnvironment(skiplog=False): g_logger.debug(str(e)) # clean tmp files in all the nodes - cmd = r"rm -rf %s" % (g_context.tmpPath) + cmd = r"rm -rf %s" % g_context.tmpPath if g_opts.localMode: SharedFuncs.runShellCmd(cmd) else: diff --git a/src/manager/om/script/gs_sshexkey b/src/manager/om/script/gs_sshexkey index b27ae8829..1f2dcf27f 100644 --- a/src/manager/om/script/gs_sshexkey +++ b/src/manager/om/script/gs_sshexkey @@ -150,8 +150,8 @@ gs_sshexkey is a utility to create SSH trust among nodes in a cluster. Usage: gs_sshexkey -? | --help gs_sshexkey -V | --version - gs_sshexkey -f HOSTFILE [-W PASSWORD] [...] [--skip-hostname-set] - [-l LOGFILE] + gs_sshexkey -f HOSTFILE [-l LOGFILE] [--skip-hostname-set] + General options: -f Host file containing the IP address of nodes. @@ -478,10 +478,8 @@ General options: if (not os.path.exists("/etc/hosts")): raise Exception(ErrorCode.GAUSS_512["GAUSS_51221"] + " Error: \nThe /etc/hosts does not exist.") - (status, output) = g_OSlib.getGrepValue("-v", - " #Gauss.* IP Hosts " - "Mapping", - '/etc/hosts') + cmd = "grep -v '" + HOSTS_MAPPING_FLAG + "' /etc/hosts" + (status, output) = subprocess.getstatusoutput(cmd) try: g_file.createFile(tmpHostIpName) g_file.changeMode(DefaultValue.KEY_FILE_MODE, tmpHostIpName) @@ -536,7 +534,7 @@ General options: ssh.close() raise Exception(ErrorCode.GAUSS_503["GAUSS_50317"] + " Error: \n%s" % str(e)) - cmd = "grep -v '%s' %s > %s && cp %s %s && rm -rf %s" \ + cmd = "grep -v '%s' %s > %s ; cp %s %s && rm -rf %s" \ % (" #Gauss.* IP Hosts Mapping", '/etc/hosts', tmpHostIpName, tmpHostIpName, '/etc/hosts', tmpHostIpName) channel = ssh.open_session() @@ -631,7 +629,7 @@ General options: if (boolInvalidIp): boolInvalidIp = False continue - cmd = "grep -v '%s' %s > %s && cp %s %s && rm -rf %s" % ( + cmd = "grep -v '%s' %s > %s ; cp %s %s && rm -rf %s" % ( " #Gauss.* IP Hosts Mapping", '/etc/hosts', tmpHostIpName, tmpHostIpName, '/etc/hosts', tmpHostIpName) channel = ssh.open_session() diff --git a/src/manager/om/script/gspylib/common/Common.py b/src/manager/om/script/gspylib/common/Common.py index f56c6243c..32ca331b1 100644 --- a/src/manager/om/script/gspylib/common/Common.py +++ b/src/manager/om/script/gspylib/common/Common.py @@ -774,11 +774,11 @@ class DefaultValue(): netWorkNum = "" netWorkInfo = psutil.net_if_addrs() for nic_num in netWorkInfo.keys(): - netInfo = netWorkInfo[nic_num][0] - if (netInfo.address == ipAddress): - netWorkNum = nic_num - break - if (netWorkNum == ""): + for netInfo in netWorkInfo[nic_num]: + if netInfo.address == ipAddress: + netWorkNum = nic_num + break + if netWorkNum == "": raise Exception(ErrorCode.GAUSS_506["GAUSS_50604"] % ipAddress) return netWorkNum except Exception as e: diff --git a/src/manager/om/script/gspylib/common/DbClusterInfo.py b/src/manager/om/script/gspylib/common/DbClusterInfo.py index 5a832a32b..afea93ed6 100644 --- a/src/manager/om/script/gspylib/common/DbClusterInfo.py +++ b/src/manager/om/script/gspylib/common/DbClusterInfo.py @@ -1525,7 +1525,12 @@ class dbClusterInfo(): "sync_state : %s\n" % syncInfo.syncState) if syncInfo.secondPeerRole == "": - syncInfo.secondPeerRole = "Unknown" + outText = outText + "\n------------------------" \ + "---------------" \ + "--------------------------------\n\n" + continue + if syncInfo.secondSyncState == "": + syncInfo.secondSyncState = "Unknown" outText = outText + ( "secondary_state : %s\n" % syncInfo.secondPeerRole) @@ -1553,8 +1558,6 @@ class dbClusterInfo(): outText = outText + ( "receiver_replay_location : %s\n" % syncInfo.secondReceiverReplayLocation) - if syncInfo.secondSyncState == "": - syncInfo.secondSyncState = "Unknown" outText = outText + ( "sync_state : %s\n" % syncInfo.secondSyncState) @@ -1847,8 +1850,7 @@ class dbClusterInfo(): else: outText = outText + " " outText = outText + ( - "%s " % self.__getDnRole(roleStatusArray[i], - dnInst.instanceType)) + "%s " % self.__getDnRole(dnInst.instanceType)) if dnNodeCount == 1: outText = outText + ("%-7s" % "Primary") else: @@ -1863,9 +1865,12 @@ class dbClusterInfo(): except Exception as e: raise Exception(ErrorCode.GAUSS_516["GAUSS_51652"] % str(e)) - def __getDnRole(self, roleStatus, instanceType): - if roleStatus in DN_ROLE_MAP.keys(): - return DN_ROLE_MAP[roleStatus] + def __getDnRole(self, instanceType): + """ + function : Get DnRole by instanceType + input : Int + output : String + """ if instanceType == MASTER_INSTANCE: return "P" elif instanceType == STANDBY_INSTANCE: diff --git a/src/manager/om/script/gspylib/component/Kernel/Kernel.py b/src/manager/om/script/gspylib/component/Kernel/Kernel.py index 784f02a6d..b0eecd7f8 100644 --- a/src/manager/om/script/gspylib/component/Kernel/Kernel.py +++ b/src/manager/om/script/gspylib/component/Kernel/Kernel.py @@ -82,24 +82,28 @@ class Kernel(BaseComponent): cmd += " -o \'--securitymode\'" self.logger.debug("start cmd = %s" % cmd) (status, output) = subprocess.getstatusoutput(cmd) - if status != 0: + if status != 0 or re.search("start failed", output): raise Exception(ErrorCode.GAUSS_516["GAUSS_51607"] % "instance" + " Error: Please check the gs_ctl log for " - "failure details.") + "failure details." + "\n" + output) + if re.search("another server might be running", output): + self.logger.log(output) def stop(self, stopMode="", time_out=300): """ """ + cmd = "%s/gs_ctl stop -D %s " % ( + self.binPath, self.instInfo.datadir) if not self.isPidFileExist(): - return - cmd = "%s/gs_ctl stop -D %s " % (self.binPath, self.instInfo.datadir) - # check stop mode - if (stopMode != ""): - cmd += " -m %s" % stopMode + cmd += " -m immediate" + else: + # check stop mode + if stopMode != "": + cmd += " -m %s" % stopMode cmd += " -t %s" % time_out self.logger.debug("stop cmd = %s" % cmd) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % "instance" + " Error: \n%s." % output) diff --git a/src/manager/om/script/gspylib/hardware/gsdisk.py b/src/manager/om/script/gspylib/hardware/gsdisk.py index bfea057f6..8b361187f 100644 --- a/src/manager/om/script/gspylib/hardware/gsdisk.py +++ b/src/manager/om/script/gspylib/hardware/gsdisk.py @@ -180,7 +180,7 @@ class diskInfo(): dev_info = os.statvfs(path) used = dev_info.f_blocks - dev_info.f_bfree valueable = dev_info.f_bavail + used - percent = math.ceil((float(used) // valueable) * 100) + percent = math.ceil((float(used) / valueable) * 100) except Exception as e: raise Exception(ErrorCode.GAUSS_530["GAUSS_53011"] + " disk space." + "Error: %s" % str(e)) diff --git a/src/manager/om/script/gspylib/inspection/common/CheckResult.py b/src/manager/om/script/gspylib/inspection/common/CheckResult.py index 6e146515f..c40d8b9e2 100644 --- a/src/manager/om/script/gspylib/inspection/common/CheckResult.py +++ b/src/manager/om/script/gspylib/inspection/common/CheckResult.py @@ -25,6 +25,12 @@ from gspylib.inspection.common import SharedFuncs from gspylib.common.Common import DefaultValue from gspylib.inspection.common.Log import LoggerFactory +class GsCheckEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, bytes): + return str(obj, encoding='utf-8') + return json.JSONEncoder.default(self, obj) + class ResultStatus(object): OK = "OK" @@ -242,4 +248,4 @@ class CheckResult(object): local['raw'] = localitem.raw localList.append(local) resultDic['hosts'] = localList - return json.dumps(resultDic, indent=2) + return json.dumps(resultDic, cls=GsCheckEncoder, indent=2) diff --git a/src/manager/om/script/gspylib/inspection/items/cluster/CheckMpprcFile.py b/src/manager/om/script/gspylib/inspection/items/cluster/CheckMpprcFile.py index 6f9218f13..54d6b9dc3 100644 --- a/src/manager/om/script/gspylib/inspection/items/cluster/CheckMpprcFile.py +++ b/src/manager/om/script/gspylib/inspection/items/cluster/CheckMpprcFile.py @@ -39,8 +39,7 @@ class CheckMpprcFile(BaseItem): return try: with open(mpprcFile, 'r') as fp: - mpp_content = fp.read() - env_list = mpp_content.split('\n') + env_list = fp.readlines() while '' in env_list: env_list.remove('') # get ec content @@ -67,13 +66,17 @@ class CheckMpprcFile(BaseItem): "GAUSS_ENV", "KRB5_CONFIG", "PGKRBSRVNAME", "KRBHOSTNAME", "ETCD_UNSUPPORTED_ARCH"] # black elements - list_black = ["|", ";", "&", "$", "<", ">", "`", "\\", "'", "\"", - "{", "}", "(", ")", "[", "]", "~", "*", "?", " ", + list_black = ["|", ";", "&", "<", ">", "`", "\\", "'", "\"", + "{", "}", "(", ")", "[", "]", "~", "*", "?", "!", "\n"] for env in env_list: env = env.strip() if env == "": continue + if len(env.split()) != 2: + return + if env.split()[0] == "umask" and env.split()[1] == "077": + continue for black in list_black: flag = env.find(black) if flag >= 0: diff --git a/src/manager/om/script/gspylib/inspection/items/network/CheckNetSpeed.py b/src/manager/om/script/gspylib/inspection/items/network/CheckNetSpeed.py index 2835e12df..355d81b5a 100644 --- a/src/manager/om/script/gspylib/inspection/items/network/CheckNetSpeed.py +++ b/src/manager/om/script/gspylib/inspection/items/network/CheckNetSpeed.py @@ -75,14 +75,14 @@ class CheckNetSpeed(BaseItem): def runClient(self, self_index, ipList): base_listen_port = DEFAULT_LISTEN_PORT max_server = 10 - group = self_index / max_server + group = self_index // max_server path = self.context.basePath port = base_listen_port + self_index % max_server for ip in ipList: index = ipList.index(ip) if (index == self_index): continue - if (index / max_server != group): + if (index // max_server != group): continue try: p = subprocess.Popen([path + "/lib/checknetspeed/speed_test", diff --git a/src/manager/om/script/gspylib/inspection/items/os/CheckNTPD.py b/src/manager/om/script/gspylib/inspection/items/os/CheckNTPD.py index 47598d2ce..9979a9032 100644 --- a/src/manager/om/script/gspylib/inspection/items/os/CheckNTPD.py +++ b/src/manager/om/script/gspylib/inspection/items/os/CheckNTPD.py @@ -88,9 +88,9 @@ class CheckNTPD(BaseItem): def postAnalysis(self, itemResult, category="", name=""): errors = [] for i in itemResult.getLocalItems(): - if (i.rst == ResultStatus.NG): + if i.rst == ResultStatus.NG : errors.append("%s: %s" % (i.host, i.val)) - if (len(errors) > 0): + if len(errors) > 0: itemResult.rst = ResultStatus.NG itemResult.analysis = "\n".join(errors) return itemResult diff --git a/src/manager/om/script/gspylib/os/gsOSlib.py b/src/manager/om/script/gspylib/os/gsOSlib.py index 5d1de5a90..66ec04f7a 100644 --- a/src/manager/om/script/gspylib/os/gsOSlib.py +++ b/src/manager/om/script/gspylib/os/gsOSlib.py @@ -295,9 +295,12 @@ class PlatformCommand(): """ pidList = [] for pid in psutil.pids(): - p = psutil.Process(pid) - if procName == p.name(): - pidList.append(pid) + try: + p = psutil.Process(pid) + if procName == p.name(): + pidList.append(pid) + except psutil.NoSuchProcess: + pass return pidList def killProcessByProcName(self, procName, killType=2): diff --git a/src/manager/om/script/gspylib/threads/SshTool.py b/src/manager/om/script/gspylib/threads/SshTool.py index 20f87bfc2..be9b198cc 100644 --- a/src/manager/om/script/gspylib/threads/SshTool.py +++ b/src/manager/om/script/gspylib/threads/SshTool.py @@ -230,9 +230,15 @@ class SshTool(): g_file.removeFile(tmp_hosts) if output is not None: output = str(output, encoding='utf-8') - GaussLog.printMessage(output.strip()) if re.search("\[GAUSS\-", output): + if re.search("Please enter password", output): + GaussLog.printMessage( + ErrorCode.GAUSS_503["GAUSS_50306"] % user) + else: + GaussLog.printMessage(output.strip()) sys.exit(1) + else: + GaussLog.printMessage(output.strip()) else: sys.exit(1) except Exception as e: diff --git a/src/manager/om/script/impl/om/OLAP/OmImplOLAP.py b/src/manager/om/script/impl/om/OLAP/OmImplOLAP.py index 87659992e..4da2de6be 100644 --- a/src/manager/om/script/impl/om/OLAP/OmImplOLAP.py +++ b/src/manager/om/script/impl/om/OLAP/OmImplOLAP.py @@ -19,6 +19,8 @@ ############################################################################# import subprocess import sys +import re +import time sys.path.append(sys.path[0] + "/../../../../") from gspylib.common.DbClusterInfo import dbClusterInfo, queryCmd @@ -221,11 +223,28 @@ class OmImplOLAP(OmImpl): self.context.g_opts.security_mode) if self.dataDir != "": cmd += " -D %s" % self.dataDir + starttime = time.time() (statusMap, output) = self.sshTool.getSshStatusOutput(cmd, hostList) for nodeName in hostList: if statusMap[nodeName] != 'Success': raise Exception( ErrorCode.GAUSS_536["GAUSS_53600"] % (cmd, output)) + if re.search("another server might be running", output): + self.logger.log(output) + if startType == "cluster": + cmd = "source %s; gs_om -t status|grep cluster_state|grep Normal" \ + % self.context.g_opts.mpprcFile + while time.time() <= time_out + starttime: + status = subprocess.getstatusoutput(cmd)[0] + if status != 0: + self.logger.log("Waiting for check cluster state...") + time.sleep(5) + else: + break + if time.time() > time_out + starttime: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % "cluster" + + "Start timeout, please check the process" + " status manually") self.logger.log("=========================================") self.logger.log("Successfully started.") self.logger.debug("Operation succeeded: Start.") diff --git a/src/manager/om/script/impl/preinstall/PreinstallImpl.py b/src/manager/om/script/impl/preinstall/PreinstallImpl.py index 0d44a1142..4a2add8d2 100644 --- a/src/manager/om/script/impl/preinstall/PreinstallImpl.py +++ b/src/manager/om/script/impl/preinstall/PreinstallImpl.py @@ -509,7 +509,7 @@ class PreinstallImpl: # the temporary Files for /etc/hosts tmp_hostipname = "./tmp_hostsiphostname_%d" % os.getpid() # Delete the line with 'HOSTS_MAPPING_FLAG' in the /etc/hosts - cmd = "grep -v '%s' %s > %s && cp %s %s && rm -rf '%s'" % \ + cmd = "grep -v '%s' %s > %s ; cp %s %s && rm -rf '%s'" % \ ("#Gauss.* IP Hosts Mapping", '/etc/hosts', tmp_hostipname, tmp_hostipname, '/etc/hosts', tmp_hostipname) (status, output) = DefaultValue.retryGetstatusoutput(cmd) @@ -559,7 +559,7 @@ class PreinstallImpl: tmp_hostipname = "./tmp_hostsiphostname_%d" % os.getpid() # Delete the line with 'HOSTS_MAPPING_FLAG' in the /etc/hosts cmd = "if [ -f '%s' ]; then grep -v '%s' %s > %s " \ - "&& cp %s %s && rm -rf '%s'; fi" % \ + "; cp %s %s ; rm -rf '%s'; fi" % \ ('/etc/hosts', "#Gauss.* IP Hosts Mapping", '/etc/hosts', tmp_hostipname, tmp_hostipname, '/etc/hosts', tmp_hostipname) # exec the cmd on all remote nodes diff --git a/src/manager/om/script/local/LocalCollect.py b/src/manager/om/script/local/LocalCollect.py index be2c86feb..d55678312 100644 --- a/src/manager/om/script/local/LocalCollect.py +++ b/src/manager/om/script/local/LocalCollect.py @@ -166,7 +166,7 @@ def sendLogFiles(): cmd = "%s && (if [ -f '%s'/'%s' ];then rm -rf '%s'/'%s';fi)" % \ (cmd, g_tmpdir, tarName, g_tmpdir, tarName) (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.logExit("Failed to delete %s." % "%s and %s" % ( g_resultdir, tarName) + " Error:\n%s" % output) g_logger.logExit("All collection tasks failed") @@ -174,16 +174,16 @@ def sendLogFiles(): cmd = "cd '%s' && tar -zcf '%s' '%s' && chmod %s '%s'" % \ (g_tmpdir, tarName, HOSTNAME, DefaultValue.FILE_MODE, tarName) (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.logExit("Failed to compress %s." % ("directory %s/%s" % \ (g_tmpdir, HOSTNAME)) + " Error: \n%s" % output) - if (g_opts.nodeName != ""): + if g_opts.nodeName != "": # send backup file which is compressed to the node that is # currently performing the backup - if (g_opts.nodeName == DefaultValue.GetHostIpOrName()): + if g_opts.nodeName == DefaultValue.GetHostIpOrName(): if int(g_opts.speedLimitFlag) == 1: cmd = "rsync --bwlimit=%d '%s'/'%s' '%s'/" % \ (g_opts.speedLimitKBs, g_tmpdir, tarName, @@ -198,7 +198,7 @@ def sendLogFiles(): g_opts.speedLimitKBs * 8, g_opts.nodeName, g_tmpdir, tarName, g_opts.outputDir) (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.logExit( "Failed to copy %s." % tarName + " Error:\n%s" % output) @@ -208,7 +208,7 @@ def sendLogFiles(): cmd = "%s && (if [ -f '%s'/'%s' ];then rm -rf '%s'/'%s';fi)" % \ (cmd, g_tmpdir, tarName, g_tmpdir, tarName) (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.logExit("Failed to delete %s. %s" % ( "%s and %s" % (g_resultdir, tarName), " Error:\n%s" % output)) @@ -219,7 +219,7 @@ def checkParameterEmpty(parameter, parameterName): input : parameter, parameterName output : NA """ - if (parameter == ""): + if parameter == "": GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50001"] % parameterName) @@ -239,7 +239,7 @@ def parseCommandLine(): except getopt.GetoptError as e: # Error exit if an illegal parameter exists GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50000"] % str(e)) - if (len(args) > 0): + if len(args) > 0: # Error exit if an illegal parameter exists GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50000"] % str(args[0])) @@ -253,7 +253,7 @@ def parseCommandLine(): parameter_keys = parameter_map.keys() for key, value in opts: - if (key in parameter_keys): + if key in parameter_keys: if key == "-C": value = value.replace("#", "\"") parameter_map[key] = value.strip() @@ -278,18 +278,18 @@ def parseCommandLine(): checkParameterEmpty(g_opts.user, "U") DefaultValue.checkUser(g_opts.user, False) # check log file - if (g_opts.logFile == ""): + if g_opts.logFile == "": g_opts.logFile = DefaultValue.getOMLogPath(DefaultValue.LOCAL_LOG_FILE, g_opts.user, "", "") - if (not os.path.isabs(g_opts.logFile)): + if not os.path.isabs(g_opts.logFile): GaussLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50213"] % "log") - if (int(g_opts.speedLimitKBs) < 0): + if int(g_opts.speedLimitKBs) < 0: GaussLog.exitWithError(ErrorCode.GAUSS_526["GAUSS_53032"]) g_opts.speedLimitKBs = int(g_opts.speedLimitKBs) # 1048576 KB/s = 1GB/s, which means unlimited. - if (g_opts.speedLimitKBs == 0): + if g_opts.speedLimitKBs == 0: g_opts.speedLimitKBs = 1048576 @@ -370,7 +370,7 @@ def create_temp_result_folder(): DefaultValue.KEY_DIRECTORY_MODE, g_resultdir) g_logger.debug("Command for creating output directory: %s" % cmd) (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.logExit("Failed to create the %s directory." % \ ("%s/logfiles and %s/configfiles" % ( g_resultdir, g_resultdir)) + " Error:\n%s" % output) @@ -460,7 +460,7 @@ def basic_info_check(): # file for cmd in cmds: (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.debug( ("Failed to collect basic information. Error:\n%s." % output) + ("The cmd is %s " % cmd)) @@ -496,7 +496,7 @@ def system_check(): cmd = cmd.replace("\n", " ") if "echo" in cmd: continue - if (status != 0): + if status != 0: if "Permission denied" in output: output = "can not print info to file: Permission denied" g_jobInfo.failedTask[cmd] = replaceInvalidStr(output) @@ -682,17 +682,17 @@ def matchFile(begin_t, end_t, fileTime): and the end time. """ # both of begin_time and end_time - if (begin_t and end_t): + if begin_t and end_t: for t in fileTime: - if (compareTime(t, begin_t) and compareTime(end_t, t)): + if compareTime(t, begin_t) and compareTime(end_t, t): return True # only begin_time - elif (begin_t and (not end_t)): + elif begin_t and (not end_t): for t in fileTime: if compareTime(t, begin_t): return True # only end_time - elif ((not begin_t) and end_t): + elif (not begin_t) and end_t: for t in fileTime: if compareTime(end_t, t): return True @@ -858,7 +858,7 @@ def log_copy_for_zenith(): g_logger.log(json.dumps(g_jobInfo.__dict__)) raise Exception("") - if (g_opts.key): + if g_opts.key: # Look for keyword matching in the dir and write to the specified file cmd = "echo \"\" > %s/logfiles/%s; for f in `find %s -type f`;" \ " do grep -ai '%s' $f >> %s/logfiles/%s; done" % ( @@ -896,7 +896,7 @@ def log_copy(): deleteCmd = "cd $GAUSSLOG && if [ -d tmp_gs_collector ];" \ "then rm -rf tmp_gs_collector; fi" - if (g_opts.key is not None and g_opts.key != ""): + if g_opts.key is not None and g_opts.key != "": g_logger.debug( "Keyword for collecting log in base64 encode [%s]." % g_opts.key) g_opts.key = base64.b64decode(g_opts.key) @@ -907,7 +907,7 @@ def log_copy(): "Speed limit to copy log files is %d KB/s." % g_opts.speedLimitKBs) # Filter the log files, if has keyword, do not collect prf file - if (g_opts.key is not None and g_opts.key != ""): + if g_opts.key is not None and g_opts.key != "": cmd = "cd $GAUSSLOG && if [ -d tmp_gs_collector ];" \ "then rm -rf tmp_gs_collector; " \ "fi && (find . -type f -iname '*.log' -print)" \ @@ -1010,7 +1010,7 @@ def log_copy(): (DefaultValue.DIRECTORY_MODE, zipdir, zipFileName, zipdir) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_jobInfo.failedTask[ "find log zip files"] = replaceInvalidStr(output) g_logger.log(json.dumps(g_jobInfo.__dict__)) @@ -1022,15 +1022,15 @@ def log_copy(): g_logger.debug("There is no zip files.") # Filter keywords - if (g_opts.key is not None and g_opts.key != ""): - if (len(logs) != 0): + if g_opts.key is not None and g_opts.key != "": + if len(logs) != 0: g_opts.key = g_opts.key.replace('$', '\$') g_opts.key = g_opts.key.replace('\"', '\\\"') cmd = "cd $GAUSSLOG/tmp_gs_collector && " cmd = "%s grep \"%s\" -r * > %s/logfiles/%s" % ( cmd, g_opts.key, g_resultdir, keyword_result) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0 and output != ""): + if status != 0 and output != "": cmd = "rm -rf $GAUSSLOG/tmp_gs_collector" (status1, output1) = DefaultValue.retryGetstatusoutput(cmd) g_jobInfo.failedTask[ @@ -1050,7 +1050,7 @@ def log_copy(): cmd = "touch %s/logfiles/%s && " % (g_resultdir, keyword_result) cmd = "%s rm -rf $GAUSSLOG/tmp_gs_collector" % cmd (status, output) = DefaultValue.retryGetstatusoutput(cmd) - if (status != 0): + if status != 0: g_jobInfo.failedTask["touch keyword file"] = replaceInvalidStr( output) g_logger.log(json.dumps(g_jobInfo.__dict__)) @@ -1071,7 +1071,7 @@ def log_copy(): "&& rm -rf $GAUSSLOG/'%s'" % \ (cmd, logfiletar) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_jobInfo.failedTask[ "copy result file and delete tmp file"] = replaceInvalidStr( output) @@ -1136,7 +1136,7 @@ def xlog_copy(): (g_resultdir, g_current_time, g_current_time, g_current_time) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.debug( "Failed to collect xlog. Command %s \n, Error %s \n", (cmd, output)) @@ -1236,7 +1236,7 @@ def parallel_xlog(Inst): cmd = getXlogCmd(Inst) if len(cmd) > 1: (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.debug( "Failed to collect xlog files. Command: %s.\n Error: %s\n" % ( cmd, output)) @@ -1383,17 +1383,9 @@ def conf_gstack(jobName): try: # Gets all instances of the cluster Instances = [] - for Inst in g_localnodeinfo.gtms: - if "gtm" in ",".join(g_opts.content).lower(): - Instances.append(Inst) - for Inst in g_localnodeinfo.coordinators: - if "cn" in ",".join(g_opts.content).lower(): - Instances.append(Inst) for Inst in g_localnodeinfo.datanodes: if "dn" in ",".join(g_opts.content).lower(): Instances.append(Inst) - for Inst in g_localnodeinfo.gtses: - Instances.append(Inst) # parallel copy configuration files, and get gstack if Instances: pool = ThreadPool(DefaultValue.getCpuSet()) @@ -1447,7 +1439,7 @@ def plan_simulator_check(): "-p %d -D %s/planSimulatorfiles/%s" % \ (cmd, db, cnInst.port, g_resultdir, db) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: g_logger.debug( "Failed to Collect plan simulator. " "Command %s.\n Error: %s.\n" % ( @@ -1470,7 +1462,7 @@ def getBakConfCmd(Inst): """ cmd = "" pidfile = "" - if (Inst.instanceRole == DefaultValue.INSTANCE_ROLE_GTM): + if Inst.instanceRole == DefaultValue.INSTANCE_ROLE_GTM: if g_need_gstack == 0: cmd = "mkdir -p -m %s '%s/configfiles/config_%s/gtm_%s'" % \ ( @@ -1508,7 +1500,7 @@ def getBakConfCmd(Inst): "collect gtm_%s process stack info" % Inst.instanceId] = \ ErrorCode.GAUSS_535["GAUSS_53511"] % 'GTM' - elif (Inst.instanceRole == DefaultValue.INSTANCE_ROLE_COODINATOR): + elif Inst.instanceRole == DefaultValue.INSTANCE_ROLE_COODINATOR: if g_need_gstack == 0: cmd = "mkdir -p -m %s '%s/configfiles/config_%s/cn_%s'" % \ ( @@ -1542,7 +1534,7 @@ def getBakConfCmd(Inst): "collect cn_%s process stack info" % Inst.instanceId] = \ ErrorCode.GAUSS_535["GAUSS_53511"] % 'CN' - elif (Inst.instanceRole == DefaultValue.INSTANCE_ROLE_DATANODE): + elif Inst.instanceRole == DefaultValue.INSTANCE_ROLE_DATANODE: if g_need_gstack == 0: cmd = "mkdir -p -m %s '%s/configfiles/config_%s/dn_%s'" % \ ( @@ -1584,7 +1576,7 @@ def parallel_conf_gstack(Inst): """ (cmd, pidfile) = getBakConfCmd(Inst) (status, output) = subprocess.getstatusoutput(cmd) - if (status != 0): + if status != 0: if "command not found" in output: g_jobInfo.failedTask["collect process stack info"] = \ ErrorCode.GAUSS_535["GAUSS_53512"] @@ -1625,7 +1617,7 @@ def parseConfig(): input : NA output: NA """ - if (g_opts.config != ""): + if g_opts.config != "": d = json.loads(g_opts.config) g_opts.content = d['Content'].split(",") @@ -1645,24 +1637,24 @@ def main(): elif g_opts.action == "create_dir": create_temp_result_folder() # Get system information - elif (g_opts.action == "system_check"): + elif g_opts.action == "system_check": system_check() # Gets the database information - elif (g_opts.action == "database_check"): + elif g_opts.action == "database_check": database_check() # Make a copy of the log file - elif (g_opts.action == "log_copy"): + elif g_opts.action == "log_copy": log_copy() # Copy configuration files, and get g stack - elif (g_opts.action == "Config"): + elif g_opts.action == "Config": conf_gstack("Config") - elif (g_opts.action == "Gstack"): + elif g_opts.action == "Gstack": global g_need_gstack g_need_gstack = 1 conf_gstack("Gstack") g_need_gstack = 0 # Send all log files we collected to the command node. - elif (g_opts.action == "copy_file"): + elif g_opts.action == "copy_file": sendLogFiles() elif g_opts.action == "xlog_copy": xlog_copy()