#!/usr/bin/env python3 # -*- coding:utf-8 -*- ############################################################################# # Copyright (c) 2020 Huawei Technologies Co.,Ltd. # # openGauss is licensed under Mulan PSL v2. # You can use this software according to the terms # and conditions of the Mulan PSL v2. # You may obtain a copy of Mulan PSL v2 at: # # http://license.coscl.org.cn/MulanPSL2 # # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, # WITHOUT WARRANTIES OF ANY KIND, # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. # ---------------------------------------------------------------------------- # Description : gs_check is a utility to check cluster and database status ############################################################################# import subprocess import os import sys import re import getopt import getpass import time import pwd import grp import pickle package_path = os.path.dirname(os.path.realpath(__file__)) ld_path = package_path + "/gspylib/clib" if 'LD_LIBRARY_PATH' not in os.environ: os.environ['LD_LIBRARY_PATH'] = ld_path os.execve(os.path.realpath(__file__), sys.argv, os.environ) if not os.environ.get('LD_LIBRARY_PATH').startswith(ld_path): os.environ['LD_LIBRARY_PATH'] = \ ld_path + ":" + os.environ['LD_LIBRARY_PATH'] os.execve(os.path.realpath(__file__), sys.argv, os.environ) import xml.etree.cElementTree as ETree from itertools import combinations from datetime import datetime, timedelta from multiprocessing.dummy import Pool as ThreadPool from gspylib.inspection.common.Exception import CheckException, \ ParameterException, UnknownParameterException, \ EmptyParameterException, \ UseBothParameterException, AvailableParameterException, \ SceneNotFoundException, ParseItemException, \ NotEmptyException, \ NotExistException, InterruptException, ThreadCheckException, \ ContextDumpException, ContextLoadException, \ 
TimeoutException from gspylib.common.Common import DefaultValue from gspylib.common.GaussLog import GaussLog from gspylib.common.ErrorCode import ErrorCode from gspylib.common.ParameterParsecheck import Parameter from gspylib.inspection.common import SharedFuncs from gspylib.inspection.common.Log import LoggerFactory from gspylib.inspection.common.TaskPool import Watcher, CheckThread from gspylib.inspection.common.CheckResult import CheckResult, ItemResult from gspylib.inspection.common.CheckItem import CheckItemFactory from gspylib.inspection.common.ProgressBar import MultiProgressManager, \ LineProgress from gspylib.os.gsfile import g_file from gspylib.os.gsplatform import g_Platform from gspylib.common.VersionInfo import VersionInfo from gspylib.common.DbClusterInfo import dbClusterInfo ############################################################################# # Global variables # g_opts: global option # g_logger: global logger # g_context :global context # g_result : global result # g_endTime : global endTime # DIRECTORY_MODE: global directory mode # MPPDB_VERSION_R5 : mppdb version # DEFAULT_TIMEOUT : time out ############################################################################# g_logger = None g_opts = None g_context = None g_result = None g_endTime = None g_mtuMap = {} g_itemResult = {} DEFAULT_TIMEOUT = 1500 # single cluster will skip these items # because single clusters don't need to perform consistency checks and # internal communication class checks SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer", "CheckNTPD", "CheckCpuCount", "CheckMemInfo", "CheckDiskConfig", "CheckUpVer", "CheckPgxcgroup", "CheckPing", "CheckNetWorkDrop", "CheckNetSpeed"] SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD", "CheckSshdService", "CheckEtcHosts", "CheckCpuCount", "CheckHyperThread", "CheckMemInfo", "CheckKernelVer", "CheckEncoding", "CheckBootItems", "CheckDropCache", "CheckFilehandle", "CheckKeyProAdj", "CheckDiskFormat", 
               "CheckInodeUsage", "CheckSpaceUsage", "CheckDiskConfig",
               "CheckXid", "CheckSysTabSize", "CheckClusterState",
               "CheckConfigFileDiff", "CheckUpVer", "CheckEnvProfile",
               "CheckGaussVer", "CheckPortRange", "CheckReadonlyMode",
               "CheckCatchup", "CheckProcessStatus", "CheckSpecialFile",
               "CheckCollector", "CheckLargeFile", "CheckProStartTime",
               "CheckMpprcFile", "CheckLockNum", "CheckCurConnCount",
               "CheckCursorNum", "CheckPgxcgroup", "CheckLockState",
               "CheckIdleSession", "CheckDBConnection", "CheckSysTable",
               # NOTE(review): "CheckSysTabSize" appears twice in this list
               # (also above) -- harmless for membership tests, but redundant.
               "CheckSysTabSize", "CheckTableSpace", "CheckTableSkew",
               "CheckDNSkew", "CheckCreateView", "CheckHashIndex",
               "CheckNextvalInDefault", "CheckPgxcRedistb", "CheckReturnType",
               "CheckSysadminUser", "CheckTDDate", "CheckDropColumn",
               "CheckDiskFailure", "CheckPing", "CheckNetWorkDrop",
               "CheckUsedPort", "CheckNICModel", "CheckRouting",
               "CheckNetSpeed", "CheckDataDiskUsage"]


class CmdOptions():
    """
    Container for all command-line options of gs_check.
    Populated by parseCommandLine(); read-only afterwards.
    """

    def __init__(self):
        # initialize variable
        self.user = None                  # -U cluster user
        self.localMode = False            # -L run only on this node
        self.distributing = False         # internal: started via --cid
        self.skipRootItems = False        # --skip-root-items
        self.set = False                  # --set fix abnormal items
        self.language = 'zh'              # report language
        self.format = 'default'           # report format ('default'/'json')
        self.scene = None                 # -e scene name
        self.items = None                 # -i item name list
        self.nodes = []                   # nodes read from --hosts file
        self.cluster = None               # dbClusterInfo of current cluster
        self.timeout = DEFAULT_TIMEOUT    # --time-out (seconds)
        self.pwdMap = {}                  # host -> (root user, password)
        self.thresholdDn = None           # --disk-threshold
        self.outPath = None               # -o output directory
        self.logFile = None               # -l log file path
        self.isSingle = False             # single-node cluster flag
        self.routing = None               # --routing ip:netmask
        self.skipItems = []               # --skip-items
        self.LCName = None                # --nodegroup-name
        self.ShrinkNodes = None           # --ShrinkNodes
        self.nonPrinting = False          # --non-print


class CheckContext():
    """
    check execution context

    Holds everything a check run needs (items, nodes, cluster info,
    paths). The context is pickled to a cache file and shipped to the
    remote nodes so that each node runs with an identical context.
    """

    def __init__(self):
        """
        Constructor
        """
        # Initialize the self.clusterInfo variable
        self.basePath = os.path.join(
            os.path.split(os.path.realpath(__file__))[0],
            'gspylib', 'inspection')
        self.user = None                  # cluster user name
        self.set = None                   # True when --set was given
        self.log = None                   # logger callable (set later)
        self.postAnalysis = False
        self.supportItems = {}            # item name -> item script path
        self.supportScenes = {}           # scene name -> scene xml path
        self.items = []                   # parsed items to execute
        self.rootItems = []               # items requiring root privileges
        self.cluster = None               # dbClusterInfo
        self.nodes = []                   # all node ips/names to check
        self.mpprc = None                 # separated env variable file
        self.checkID = self.genCheckID()  # unique id for this check run
        self.thresholdDn = None
        self.outPath = os.path.join(self.basePath, "output")
        self.logFile = None               # resolved log file path
        self.tmpPath = None               # /tmp/check_<checkID> work dir
        self.hostMapping = None           # host ip -> hostname map
        self.routing = None               # business network segment
        self.skipSetItem = []             # set-items skipped by user choice
        self.oldNodes = []                # nodes already in the cluster
        self.newNodes = []                # nodes being added (expansion)
        self.oldItems = []
        self.newItems = []
        self.LCName = None
        self.ShrinkNodes = None

    def genCheckID(self):
        '''
        function : generate the check ID which is unique for once checking
                   (date string + seconds-of-day + pid)
        input : NA
        output : checkID
        '''
        # Get Time
        t = time.localtime(time.time())
        dateString = time.strftime("%Y%m%d", t)
        seconds = timedelta(hours=t.tm_hour, minutes=t.tm_min,
                            seconds=t.tm_sec).seconds
        pidString = str(os.getpid())
        return dateString + str(seconds) + pidString

    def setCheckID(self, checkID):
        '''
        function : set check id (used when started with --cid so every
                   node shares the same id)
        '''
        self.checkID = checkID

    def getCacheFile(self):
        # Path of the pickled context cache for this check run.
        return "%s/context_%s.cache" % (self.tmpPath, self.checkID)

    def checkMPPDBVersion(self):
        '''
        function : check mppdb version by parsing "gsql -V" output
        input : NA
        output : version string matching VxxxRxxxCxx
                 (raises AttributeError if the pattern is absent --
                 NOTE(review): .search() result is not None-checked)
        '''
        # check the version number
        cmd = "gsql -V"
        output = SharedFuncs.runShellCmd(cmd, self.user, self.mpprc)
        return re.compile(r'V[0-9]{3}R[0-9]{3}C[0-9]{2}').search(
            output).group()

    def loadClusterInfo(self, user=None):
        '''
        function : load cluster info from static config file
        input : user
        output : dbClusterInfo on success, None when no user is known or
                 the static config cannot be read (best-effort by design)
        '''
        # Get the user
        u = user if user is not None else self.user
        if (u is None):
            return None
        try:
            # Init cluster info
            clusterInfo = dbClusterInfo()
            # Initialize the self.clusterInfo variable
            clusterInfo.initFromStaticConfig(u)
            return clusterInfo
        except Exception:
            # deliberate best-effort: caller treats None as "no cluster"
            return None

    def loadSupportItems(self):
        '''
        function : load support items by scanning the disk files
                   (every Check*.py under <basePath>/items/)
        input : NA
        output : NA (fills self.supportItems; raises when none found)
        '''
        # Get check items
        itemPath = "%s/items/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(itemPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                if (fileName.find("Check") == 0 and suffix == ".py"):
                    self.supportItems[fileName] = os.path.join(dirpath, f)
        if (not self.supportItems):
            raise NotEmptyException("support items")

    def loadSupportScene(self):
        '''
        function : load support scene by scanning the scene configuration files in
        config folder
        input : NA
        output : NA
        '''
        configPath = "%s/config/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(configPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                # scene files are named scene_<name>.xml; strip the prefix
                if (fileName.find("scene_") == 0 and suffix == ".xml"):
                    self.supportScenes[fileName[6:]] =\
                        os.path.join(dirpath, f)
        if (not self.supportScenes):
            raise NotEmptyException("support scenes")

    def loadSceneConfiguration(self, scene):
        '''
        function : load certain scene configuration in xml file;
                   every <configuration> child element becomes an
                   attribute of this context (setattr by tag name)
        input : NA
        output : NA
        '''
        # Get scene xml
        configFile = "%s/config/scene_%s.xml" % (self.basePath, scene)
        if not os.path.isfile(configFile):
            raise SceneNotFoundException(scene, self.supportScenes)
        # root node
        rootNode = ETree.parse(configFile).getroot()
        configElem = rootNode.find('configuration')
        if configElem is not None:
            for elem in list(configElem):
                setattr(self, elem.tag, elem.text.strip())

    def isCached(self):
        '''
        function : whether the context was serialized to disk
        input : NA
        output : boolean
        '''
        # Check if Cache file exists
        if os.path.isfile(self.getCacheFile()):
            return True
        else:
            return False

    def clean(self):
        '''
        function : clean the cache file
        input : NA
        output : boolean
        '''
        # Delete Cache files
        cmd = "rm -rf %s" % self.getCacheFile()
        SharedFuncs.runShellCmd(cmd)

    def dump(self):
        '''
        function : serialize the check context to disk
        input : NA
        output : NA
        '''
        self.clean()
        try:
            pickle.dump(self, open(self.getCacheFile(), "wb"), True)
            SharedFuncs.chmodFile(self.getCacheFile(),
                                  DefaultValue.KEY_FILE_MODE)
        except Exception as e:
            raise ContextDumpException(e)

    def load(self, fileName=None):
        '''
        function : load the check context from disk
        input : path of the context file
        output : CheckContext
        '''
        f = fileName if fileName is not None else self.getCacheFile()
        result = None
        if self.isCached():
            try:
                # NOTE(review): pickle.load executes arbitrary code if the
                # cache file is tampered with; the file is created with
                # KEY_FILE_MODE in dump() -- presumably trusted, confirm.
                result = pickle.load(open(f, "rb"))
            except Exception as e:
                raise ContextLoadException(e)
        return result

    def getNodeName(self, host):
        # Resolve the hostname of one node (prefers $HOST_IP if exported)
        # and record it in self.hostMapping.
        if "HOST_IP" in list(os.environ.keys()):
            cmd = "echo $HOST_IP"
        else:
            cmd = "hostname"
        if SharedFuncs.is_local_node(host):
            output = SharedFuncs.runShellCmd(cmd)
        else:
            output = SharedFuncs.runSshCmd(cmd, host, self.user)
        # keep only the last output line (ssh may print banners first)
        hostname = output.strip().split('\n')[-1].strip()
        self.hostMapping[host] = hostname

    def getMapping(self):
        '''
        function : get the ip to hostname mapping with all host
        input : remote host name and password map
        output : NA
        '''
        self.hostMapping = {}
        if (not self.nodes):
            return
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(self.getNodeName, self.nodes)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))

    def sendTmpFile(self, host):
        # Ensure the remote tmp dir exists, then copy the cache file over.
        cmd = "if [ ! -d %s ]; then mkdir %s -p -m %s;fi" % (
            self.tmpPath, self.tmpPath, DefaultValue.KEY_DIRECTORY_MODE)
        SharedFuncs.runSshCmd(cmd, host, self.user)
        SharedFuncs.sendFile(self.getCacheFile(), host, self.user,
                             self.tmpPath)

    def dispatch(self, hosts):
        '''
        function : send the serialized context file to remote host
        input : remote host name and password map
        output : NA
        '''
        if len(hosts) == 0 or g_opts.isSingle:
            return
        fileName = self.getCacheFile()
        if not os.path.isfile(fileName):
            raise CheckException("File %s is not exist or invalid"
                                 % fileName)
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(self.sendTmpFile, hosts)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))


#############################################################################
# Parse and check parameters
#############################################################################
def usage():
    """
    gs_check is a utility to check the health status of a cluster.

    Usage:
      gs_check -? | --help

    Example:
      gs_check -i ITEM [...] [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
               [--skip-root-items] [--set] [--routing]
      gs_check -e SCENE_NAME [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
               [--skip-root-items] [--set] [--time-out=SECS] [--routing]
               [--skip-items]

    General options:
      -i                 Health check item number.
OLAP Example: -i CheckCPU,CheckMTU, CheckPing. -e Health check scene name. OLAP Example: -e inspect/upgrade/slow_node/ binary_upgrade/health/install/longtime -U Cluster user. -L Run the command as local mode. -l Path of log file. -o Save the result to the specified directory. --cid The check ID used for identify a check process, only for internal use. --skip-root-items Skip the items with root privileges. --disk-threshold Set disk threshold for checking disk usage, only for CheckDataDiskUsage. --format Set the format of the result report. --set Set abnormal items if supported --time-out Set the timeout for scene check, default 1500 seconds. --routing The network segment with business ip, example: 192.168.1.1:255.255.255.0 --skip-items Skip the specified check item or setting item with scene check Example: --skip-items CheckCPU,CheckMTU --non-print Do not print output result. -?, --help Show help information for this utility, and exit the command line mode. -V, --version Show version information. 
""" print(usage.__doc__) def version(): ''' function : get the version the check tool input : NA output: NA ''' print(SharedFuncs.getVersion()) ######################################################### # Init global log ######################################################### def initGlobal(): """ function: initialize the global variable input : NA output: NA """ # state global variable global g_opts, g_context, g_result g_opts = CmdOptions() g_context = CheckContext() g_result = CheckResult() def parseCommandLine(): """ function: Parse command line and save to global variable input : NA output: NA """ # Resolves the command line global g_opts g_opts = CmdOptions() ParaObj = Parameter() ParaDict = ParaObj.ParameterCommandLine("check") if "helpFlag" in list(ParaDict.keys()): usage() sys.exit(0) # command line parameter group definition for gs_check irrelevantPara = {"scenes": "itemstr", "time_out": "itemstr", "skipItems": "itemstr", "cid": "scenes", "nodegroup_name": "scenes", "shrinkNodes": "scenes"} paraNameMap = {"itemstr": "i", "scenes": "e", "time_out": "-time-out", "skipItems": "-skip-items", "cid": "-cid", "nodegroup_name": "-nodegroup-name", "shrinkNodes": "-ShrinkNodes"} formatList = ['default', 'json'] # position parameter can not be set at the same time for para in list(irrelevantPara.keys()): if (para in list(ParaDict.keys()) and irrelevantPara[para] in list(ParaDict.keys())): raise UseBothParameterException( (paraNameMap[para], paraNameMap[irrelevantPara[para]])) if "itemstr" in list(ParaDict.keys()): g_opts.items = ParaDict["itemstr"] if "scenes" in list(ParaDict.keys()): g_opts.scene = ParaDict["scenes"] if "outFile" in list(ParaDict.keys()): g_context.outPath = ParaDict["outFile"] if "logFile" in list(ParaDict.keys()): g_opts.logFile = ParaDict["logFile"] if "user" in list(ParaDict.keys()): g_context.user = ParaDict["user"] if "hostfile" in list(ParaDict.keys()): for node in g_file.readFile(ParaDict["hostfile"]): g_opts.nodes.append(node.strip()) if 
"cid" in list(ParaDict.keys()): g_context.setCheckID(ParaDict["cid"]) g_opts.distributing = True if "localMode" in list(ParaDict.keys()): g_opts.localMode = True if "skipRootItems" in list(ParaDict.keys()): g_opts.skipRootItems = True if "disk-threshold" in list(ParaDict.keys()): g_context.thresholdDn = ParaDict["disk-threshold"] if "set" in list(ParaDict.keys()): g_context.set = True if "routing" in list(ParaDict.keys()): g_opts.routing = ParaDict["routing"] if "skipItems" in list(ParaDict.keys()): g_opts.skipItems = ParaDict["skipItems"] if "nodegroup_name" in list(ParaDict.keys()): g_context.LCName = ParaDict["nodegroup_name"] if "shrinkNodes" in list(ParaDict.keys()): g_context.ShrinkNodes = ParaDict["shrinkNodes"] if "time_out" in list(ParaDict.keys()): try: g_opts.timeout = int(ParaDict["time_out"]) except Exception: raise CheckException("The parameter timeout set invalid value") if g_opts.timeout < DEFAULT_TIMEOUT: raise CheckException( "The timeout parameter must be set larger than default " "value 1500 seconds") setTimeOut() if "format" in list(ParaDict.keys()): g_opts.format = ParaDict["format"] if g_opts.format not in formatList: raise CheckException( "Format %s is not available,the valid format is %s" % ( g_opts.format, ",".join(formatList))) if "nonPrinting" in list(ParaDict.keys()): g_opts.nonPrinting = True def checkParameter(): ########################################################## if g_opts.nodes: raise CheckException("The --hosts parameter is not available") if __isRoot() and not __isDistributing(): if not g_opts.localMode: raise CheckException( "The command must be running with cluster user") ######################################################## # Get the -U parameter ######################################################## checkuser() if (g_opts.outPath and not g_opts.localMode): ######################################################## # create output path ######################################################## createPath(g_opts.outPath, 
g_context.user) def checkuser(): # The new node scenario does not need the -U parameter if __isRoot() and not g_opts.localMode: g_context.user = None return # Default mode -U for the current user if not __isRoot() and not g_context.user: g_context.user = SharedFuncs.getCurrentUser() if g_context.user: if not __isRoot() and g_context.user != SharedFuncs.getCurrentUser(): raise CheckException( "The user %s is not current user" % g_context.user) try: user_uid = pwd.getpwnam(g_context.user).pw_uid except Exception: raise CheckException( "The user %s is not a effective user." % g_context.user) if user_uid == 0: raise CheckException("The -U parameter can not be the root user.") isClusterUser = SharedFuncs.checkClusterUser(g_context.user, __getMpprcFile()) if isClusterUser: # get cluster information g_context.mpprc = __getMpprcFile() clusterInfo = g_context.loadClusterInfo(g_context.user) if clusterInfo: g_opts.cluster = clusterInfo else: isClusterUser = False if not isClusterUser: raise CheckException( "The user %s is not valid cluster user" % g_context.user) if g_opts.localMode or g_opts.distributing: return # Check cluster user trust dbNameList = g_opts.cluster.getClusterNodeNames() if (len(dbNameList) == 1 and dbNameList[0] == DefaultValue.GetHostIpOrName()): return appPath = DefaultValue.getEnv('GPHOME', g_opts.cluster.appPath) psshPath = os.path.join(appPath, 'script/gspylib/pssh/bin/pssh') cmd = "%s -H %s 'id' " % (psshPath, " -H ".join(dbNameList)) (status, output) = subprocess.getstatusoutput(cmd) if status != 0: errorNode = [] for result in output.split('\n'): if result.strip() == "": continue resultInfo = result.split() # Analyze the results if len(resultInfo) > 3 and resultInfo[2] == "[SUCCESS]": continue elif (len(resultInfo) > 3 and resultInfo[2] == "[FAILURE]" and resultInfo[3] in dbNameList): errorNode.append(resultInfo[3]) else: raise CheckException( "Failed to check user trust. 
commands: %s Error:/n%s" % (cmd, output)) if errorNode: raise CheckException( "Failed to check user trust with %s" % errorNode) else: raise CheckException( "Failed to check user trust. Error:/n%s" % output) def createPath(path, user=""): if path == "/dev/null": return if os.path.isdir(path): # test write permissions if not g_file.checkDirWriteable(path): raise CheckException( "Failed to create or delete file in the [%s]." % path) elif os.path.isfile(path): raise CheckException("The out path [%s] must be a directory." % path) else: # path is not exist. recursively create the path g_file.createDirectory(path, True, DefaultValue.KEY_DIRECTORY_MODE) # Modify the file owner if __isRoot() and user: g_file.changeOwner(user, path) def getTmpPath(): """ function: Get and return temporary directory. input : NA output: String """ tmpPath = os.path.join("/tmp", "check_%s" % g_context.checkID) # Get the tmp file path createPath(tmpPath, g_context.user) createPath(os.path.join(tmpPath, "log"), g_context.user) createPath(os.path.join(tmpPath, "nodes"), g_context.user) return tmpPath def initLogFile(): """ function: Get and return temporary directory. 
    input : NA
    output: NA
    """
    global g_context, g_logger
    # load the context when the script running on local mode and the context
    # was cached before
    g_context.tmpPath = getTmpPath()
    if g_context.isCached():
        g_context = g_context.load()
        # a new (expansion) node has no cluster user/env yet
        if __getLocalNode(g_context.nodes) in g_context.newNodes:
            g_context.mpprc = None
            g_context.user = None
            g_context.cluster = None
        (g_logger, logFile) = LoggerFactory.getLogger('gs_check',
                                                      g_context.logFile,
                                                      g_context.user)
        g_context.log = g_logger.debug
        g_logger.debug("Load check context from cache file")
    else:
        # Parameter specified first, followed by default GAUSSLOG,
        # last temporary directory
        if g_opts.logFile:
            g_context.logFile = os.path.realpath(g_opts.logFile)
        elif g_opts.cluster:
            g_context.logFile = os.path.join(g_opts.cluster.logPath,
                                             '%s/om/gs_check.log'
                                             % g_context.user)
        else:
            g_context.logFile = os.path.join(g_context.tmpPath,
                                             'log/gs_check.log')
        (g_logger, g_context.logFile) = LoggerFactory.getLogger(
            'gs_check', g_context.logFile, g_context.user)
        # clean the cache files for reentry the command
        g_context.clean()
        # set mpprc file
        g_context.mpprc = __getMpprcFile()
        # Load support scene by parsing project folder
        g_context.loadSupportScene()
        # Load support check items by parsing the project folder
        g_context.loadSupportItems()
        # load the scene configuration
        if g_opts.scene:
            g_context.loadSceneConfiguration(g_opts.scene)
        # load cluster info
        if g_opts.cluster:
            g_context.cluster = g_opts.cluster
            g_context.oldNodes = g_opts.cluster.getClusterSshIps()[0]
        # load nodes: anything in --hosts not already in the cluster is new
        if g_opts.nodes:
            for node in g_opts.nodes:
                if node not in g_context.oldNodes:
                    g_context.newNodes.append(node)
        g_context.nodes = g_context.oldNodes + g_context.newNodes


def getRootUserPwd():
    # ask user input root password interactive when in new node scene or
    # contains items with root permission; verified passwords are stored
    # in g_opts.pwdMap[host] = (user, password)
    if __hasRootItems() and not __isRoot():
        rootItems = [i['name'] for i in g_context.rootItems]
        __printOnScreen(
            "The below items require root privileges to execute:[%s]"
            % " ".join(rootItems))
        rootuser = input("Please enter root privileges user[root]:")\
                   or "root"
        rootpwd = getpass.getpass("Please enter password for user[%s]:"
                                  % rootuser)
        g_logger.debug("Ask user input password interactive")
        for host in g_context.nodes:
            isPwdOk = SharedFuncs.verifyPasswd(host, rootuser, rootpwd)
            if not isPwdOk:
                # try to connect remote node again
                rootpwd = __retryConnection(host, rootuser)
            g_opts.pwdMap[host] = (rootuser, rootpwd)
        if pwd.getpwnam(rootuser).pw_uid != 0:
            raise CheckException("Enter the user [%s] does not have"
                                 " root privileges." % rootuser)
        # print message on screen
        __printOnScreen("Check root password connection successfully")


def parseCheckContext():
    """
    function: Parse check context and initialize all the context value
    input : NA
    output: NA
    """
    global g_context
    initLogFile()
    # a cached context was already fully populated by the coordinator
    if g_context.isCached():
        return
    g_logger.debug("Start to parse the check items config file")
    items_all = []
    items_oldNode = []
    items_newNode = []
    failedItems = []
    singleSkipList = []
    # generate the items from scene configuration
    if g_opts.scene:
        items_oldNode, failedItems = __parseScene(g_opts.scene)
        items_all += items_oldNode
    # generate the items from -i parameter value
    elif (g_opts.items):
        for i in g_opts.items:
            item = __parseOneItem(i)
            if (not item):
                failedItems.append(i)
            else:
                items_all.append(item)
    # iterate over a copy because items_all is mutated inside the loop
    for item in items_all[:]:
        if not g_context.set and item['name'] in g_opts.skipItems:
            items_all.remove(item)
            continue
        if g_context.set and item['set_permission'] == 'root':
            g_context.rootItems.append(item)
        if g_opts.skipRootItems and item['permission'] == 'root':
            items_all.remove(item)
            continue
        if item['permission'] == 'root':
            g_context.rootItems.append(item)
        if g_opts.isSingle and item['name'] in SINGLE_SKIP:
            singleSkipList.append(item['name'])
            continue
        if item['name'] == "CheckRouting":
            # routing comes from --routing, else derived from the cluster
            if g_opts.routing:
                g_context.routing = g_opts.routing
            elif g_opts.cluster:
                workIP = g_opts.cluster.getDbNodeByName(
                    DefaultValue.GetHostIpOrName()).backIps[0]
                g_context.routing = "%s:%s" % (
                    workIP, SharedFuncs.getMaskByIP(workIP))
            else:
                raise CheckException(
                    "The --routing is required when cluster dosen't exist")
        g_context.items.append(item)
    if len(singleSkipList) != 0:
        __printOnScreen(
            "The following items are skipped when the type of cluster is"
            " single:\n[%s]" % ",".join(singleSkipList))
    # NOTE(review): items_newNode is never filled in this function --
    # presumably populated in a code path outside this view; confirm.
    if not items_newNode:
        g_context.oldItems = g_context.items
    else:
        g_context.oldItems = items_oldNode
        g_context.newItems = items_newNode
    if g_context.set and items_all:
        # Settings will have a big impact and need to be confirmed
        confirmItem = {
            "CheckCrontabLeft": "Clear om_monitor in crond service",
            "CheckDirLeft": "Delete all file in '/opt/huawei/Bigdata/',"
                            "'/var/log/Bigdata/','/home/omm/'",
            "CheckProcessLeft": "Kill all process with gaussdb and omm user",
            "CheckOmmUserExist": "Delete system user omm",
            "CheckPortConflict": "kill all process with occupies "
                                 "the 25xxx port"
        }
        confirmMsg = ""
        for item in items_all:
            if item['name'] in list(confirmItem.keys()):
                confirmMsg += confirmItem[item['name']] + "\n"
            if item['name'] in SETITEM_SKIP:
                g_context.skipSetItem.append(item['name'])
        if confirmMsg:
            confirmMsg = "Warning: Executing the settings will do " \
                         "the following at the [%s] node:\n" % \
                         ','.join(g_context.newNodes) + confirmMsg
            __printOnScreen(confirmMsg)
            flag = input("Execution settings? (Y/N):")
            while True:
                # If it is not yes or all, it has been imported
                if not flag.upper() in ("Y", "N", "YES", "NO"):
                    flag = input("Please type 'yes' or 'no': ")
                    continue
                break
            if flag.upper() in ("Y", "YES"):
                pass
            if flag.upper() in ("N", "NO"):
                # NOTE(review): removing from g_context.newItems while
                # iterating it can skip adjacent entries -- confirm intent.
                for Item in g_context.newItems:
                    if Item['name'] in list(confirmItem.keys()):
                        g_context.newItems.remove(Item)
                        g_context.skipSetItem.append(Item['name'])
                __printOnScreen(
                    'Skip the settings for [%s]'
                    % ','.join(g_context.skipSetItem))
    if failedItems:
        raise ParseItemException(failedItems)
    if not g_context.items:
        raise CheckException("No check item can be performed,"
                             " please confirm the input parameters.")
    # print message on screen
    __printOnScreen("Parsing the check items config file successfully")
    getRootUserPwd()
    g_context.getMapping()
    g_context.dump()


def dispatchCached():
    # dispatch the context file to remote node
    if (not g_opts.localMode and not g_opts.distributing
            and not g_opts.isSingle):
        g_logger.debug("Start to distributing the check context dump file")
        g_context.dispatch(__getRemoteNodes(g_context.nodes))
        # print message on screen
        __printOnScreen(
            "Distribute the context file to remote hosts successfully")


def __printOnScreen(msg):
    """
    function: print message on screen
    """
    if g_opts.localMode or g_opts.distributing:
        return
    g_logger.info(msg)


def __isRoot():
    """
    function: whether the item running under root user
    """
    return os.getuid() == 0


def __hasRootItems():
    """
    function: whether the items required root privileges
    """
    return g_context.rootItems is not None and len(g_context.rootItems) > 0


def __isDistributing():
    """
    function: whether execution is distributing
    """
    return g_opts.distributing


def __getLocalNode(nodes):
    """
    function: get local node
    """
    if nodes:
        for n in nodes:
            if SharedFuncs.is_local_node(n):
                return n
    return DefaultValue.GetHostIpOrName()


def __getSeparatedValue(value, separator=","):
    '''
    get command line value which were separated by ","
    '''
    if separator not in value:
        return [value]
    return value.split(separator)


def __getNodesFromFile(fileName):
    """
    function: get nodes information from hostFile
              (skips blanks, duplicates and '#' comment lines)
    """
    lines = []
    try:
        with open(fileName, 'r') as fp:
            for line in [line.strip().rstrip('\n') for line in fp]:
                if not line or line in lines or line.startswith('#'):
                    continue
                lines.append(line.strip())
    except Exception as e:
        raise Exception(str(e))
    return lines


def __retryConnection(host, user):
    """
    function: try to connect remote node again
    """
    # Try connecting to the remote node three times
    for i in range(3):
        passwd = getpass.getpass(
            "Please enter password for user[%s] on the node[%s]:"
            % (user, host))
        isOK = SharedFuncs.verifyPasswd(host, user, passwd)
        if isOK:
            return passwd
        else:
            continue
    raise CheckException(
        "Verify password failed for user[%s] on the node[%s]"
        % (user, host))


def __getMpprcFile():
    """
    function: get separated environment variables
    """
    # get mpprc file
    envValue = DefaultValue.getEnv("MPPDB_ENV_SEPARATE_PATH")
    if envValue is not None and os.path.isfile(envValue):
        return envValue
    elif not __isRoot() and DefaultValue.getEnv('GAUSS_ENV'):
        cmd = "echo ~ 2>/dev/null"
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Fetching user environment variable file failed."
                " Please setup environment variables."
                + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot() and g_context.user:
        # resolve the cluster user's home directory via a login shell
        cmd = "su - %s -c 'echo ~ 2>/dev/null'" % g_context.user
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Failed to get user [%s] home directory. Error: %s\n" % (
                    g_context.user, output) + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot():
        return ""
    else:
        raise CheckException("The separated mpprc file was not found."
                             " Please setup environment variables")


def __getUserAndPwd(node):
    """
    function: get username and password for certain node
    """
    if __hasRootItems() and not __isRoot():
        return (g_opts.pwdMap[node][0], g_opts.pwdMap[node][1])
    else:
        return (g_context.user, None)


def __getRemoteNodes(hosts):
    '''
    function: get the remote host ignore the local host
    '''
    return [h for h in hosts if not SharedFuncs.is_local_node(h)]


def __parseScene(sceneName):
    '''
    function: parse scene configure file
              (allowitems + allowcategories - denyitems)
    '''
    if not sceneName:
        raise NotEmptyException("scene name")
    # Get scene xml
    xmlFile = "%s/config/scene_%s.xml" % (g_context.basePath, sceneName)
    if not os.path.isfile(xmlFile):
        raise SceneNotFoundException(sceneName, g_context.supportScenes)
    domTree = ETree.parse(xmlFile)
    rootNode = domTree.getroot()
    itemNames = []
    thresholds = {}
    # parse items from allow items
    for elem in rootNode.findall('allowitems/item'):
        elemName = elem.attrib['name']
        # check the check item whether exist or not
        if elemName not in list(g_context.supportItems.keys()):
            raise NotExistException("elemName", "support items")
        # save threshold as text and parse them later
        subElem = elem.find('threshold')
        if subElem is not None:
            thresholds[elemName] = subElem.text.strip()
        itemNames.append(elemName)
    # parse categories and get all items (every Check*.py in the
    # category directory, minus those already listed)
    for category in rootNode.findall('allowcategories/category'):
        cpath = "%s/items/%s" % (g_context.basePath,
                                 category.attrib['name'])
        if os.path.isdir(cpath):
            itemNames.extend(x[:-3] for x in os.listdir(cpath)
                             if x[:-3] not in itemNames
                             and x.endswith(".py"))
    # parse deny items
    for elem in rootNode.findall('denyitems/item'):
        elemName = elem.attrib['name']
        if elemName in itemNames:
            itemNames.remove(elemName)
    items = []
    failedItems = []
    for i in itemNames:
        item = __parseOneItem(i)
        if (not item):
            # NOTE(review): a failed (empty) item is recorded here but
            # still flows into the threshold merge and items list below;
            # the caller raises ParseItemException when failedItems is
            # non-empty, so the stale entries are never executed -- confirm.
            failedItems.append(i)
        # overwrite the threshold parameters
        if thresholds and i in list(thresholds.keys()):
            # parse the threshold of check item
            sceneThreshold = __parseThreshold(thresholds[i])
            # NOTE(review): assumes __parseOneItem set 'threshold';
            # raises KeyError otherwise -- confirm items.xml always has it
            # when the scene file supplies one.
            if item['threshold']:
                item['threshold'] = dict(item['threshold'],
                                         **sceneThreshold)
            else:
                item['threshold'] = sceneThreshold
        items.append(item)
    return (items, failedItems)


def __parseOneItem(itemName):
    '''
    function: parse one check item and get the full information
              from config/items.xml; returns {} when the item is
              not present in the file
    '''
    if not itemName:
        raise NotEmptyException("Item name")
    item = {}
    # try to load check item configuration from xml file
    xmlFile = "%s/config/items.xml" % g_context.basePath
    for event, elem in ETree.iterparse(xmlFile):
        if event == 'end':
            if elem.tag == 'checkitem' and elem.attrib['name'] == itemName:
                # Parse the xml file
                item['id'] = elem.attrib['id']
                item['name'] = elem.attrib['name']
                item['title_zh'] = __parseAttr(elem, "title", "zh")
                item['title_en'] = __parseAttr(elem, "title", "en")
                item['suggestion_zh'] = __parseAttr(elem, "suggestion",
                                                    "zh")
                item['suggestion_en'] = __parseAttr(elem, "suggestion",
                                                    "en")
                item['standard_zh'] = __parseAttr(elem, "standard", "zh")
                item['standard_en'] = __parseAttr(elem, "standard", "en")
                item['category'] = __parseProperty(elem, 'category',
                                                   'other')
                item['permission'] = __parseProperty(elem, 'permission',
                                                     'user')
                item['set_permission'] = __parseProperty(elem,
                                                         'set_permission',
                                                         'user')
                item['scope'] = __parseProperty(elem, 'scope', 'all')
                item['analysis'] = __parseProperty(elem, 'analysis',
                                                   'default')
                # Get the threshold
                threshold = elem.find('threshold')
                if threshold is not None and threshold.text is not None:
                    # parse the threshold of check item
                    item["threshold"] = __parseThreshold(
                        threshold.text.strip())
                break
    return item


def __parseAttr(elem, attr, language='zh'):
    '''
    function: parse the xml attr with language
              NOTE(review): .encode('utf-8') returns *bytes* under
              Python 3, while the fallback is a str "" -- looks like a
              Python 2 leftover; confirm how consumers render it.
    '''
    val = elem.find('/'.join([attr, language]))
    if val is not None and val.text is not None:
        return val.text.strip().encode('utf-8')
    return ""


def __parseProperty(elem, propertyName, defaultValue):
    '''
    function: parse the property of check item
    '''
    prop = elem.find(propertyName)
    result = defaultValue
    if prop is not None and prop.text is not None:
        result = prop.text.strip()
    return result


def __parseThreshold(value, separator=";"):
    '''
    function: parse the threshold of check item, e.g.
              "a=1;b=2" -> {'a': '1', 'b': '2'}
              NOTE(review): a segment without '=' raises IndexError --
              assumes well-formed configuration files.
    '''
    result = {}
    if separator not in value and "=" not in value:
        return result
    if separator not in value and "=" in value:
        d = value.strip().split('=')
        result[d[0]] = d[1]
    else:
        for v in value.strip().split(separator):
            d = v.strip().split('=')
            result[d[0]] = d[1]
    return result


def getMTUValue(node):
    # Determine the MTU of the network card carrying the node's back IP
    # and group nodes by MTU value in g_mtuMap (for consistency check).
    global g_mtuMap
    # get ip address
    # maybe backIP has no trust
    nodeName = g_context.hostMapping[node]
    if (g_context.cluster and
            nodeName in g_context.cluster.getClusterNodeNames()):
        addr = g_context.cluster.getDbNodeByName(nodeName).backIps[0]
        sshIp = g_context.cluster.getDbNodeByName(nodeName).sshIps[0]
    else:
        addr = node
        sshIp = node
    # get all network card information
    cmd1 = """printf \"\n\n`/sbin/ifconfig -a`\n\n\" """
    if not g_opts.pwdMap:
        output = SharedFuncs.runSshCmd(cmd1, sshIp, g_context.user)
    else:
        username, passwd = g_opts.pwdMap[node]
        if username is None or passwd is None:
            # NOTE(review): "Retrive" is a typo for "Retrieve" in this
            # runtime message -- left untouched in this doc-only pass.
            raise CheckException("Retrive username and password error.")
        output = SharedFuncs.runSshCmdWithPwd(cmd1, sshIp, username,
                                              passwd)
    # Separate each network card
    networkInfoList = output.strip().split('\n\n')
    networkInfo = ""
    mtuValue = ""
    # find network card by IP
    for eachNet in networkInfoList:
        if eachNet.find(addr) > 0 and eachNet.find('inet') > 0:
            networkInfo = eachNet
            break
    if not networkInfo:
        raise CheckException(
            "Failed to get network card information with '%s'." % node)
    # get network number
    networkNum = networkInfo.split()[0]
    # Remove : if it exists
    if networkNum[-1] == ":":
        networkNum = networkNum[:-1]
    for eachLine in networkInfo.split('\n'):
        # get mtu Value with SuSE and redHat6.x
        if eachLine.find('MTU') > 0:
            mtuValue = eachLine.split(':')[1].split(' ')[0].strip()
            break
        # get mtu Value with redHat7.x
        elif eachLine.find('mtu') > 0:
            mtuValue = eachLine.split()[-1]
            break
        else:
            continue
    if not networkNum:
        raise CheckException(
            "Failed to get network card number with '%s'."
% node) if not mtuValue: raise CheckException( "Failed to get network card mtu value with '%s' '%s'." % (node, networkNum)) # The nodes are grouped by MTU value if not mtuValue in list(g_mtuMap.keys()): g_mtuMap[mtuValue] = ["%s-%s" % (node, networkNum)] else: g_mtuMap[mtuValue].append("%s-%s" % (node, networkNum)) def preCheck(): """ function: preCheck for different scene input : NA output: NA """ # patch ssh config if __isRoot(): cmd = "grep -E '^MaxStartups[\ \t]+1000' /etc/ssh/sshd_config" (status, output) = subprocess.getstatusoutput(cmd) if status != 0: cmd = "sed -i '/MaxStartups/d' /etc/ssh/sshd_config &&" \ " echo 'MaxStartups 1000' >> /etc/ssh/sshd_config &&" \ " service sshd reload" SharedFuncs.runShellCmd(cmd) if (g_opts.distributing or g_opts.localMode or g_opts.isSingle or not g_context.nodes): return # Check all node MTU value try: pool = ThreadPool(DefaultValue.getCpuSet()) results = pool.map(getMTUValue, g_context.nodes) pool.close() pool.join() except Exception as e: raise Exception(str(e)) # According to the number of groups to determine whether the same if len(list(g_mtuMap.keys())) > 1: warningMsg = "Warning: The MTU value is inconsistent on all node," \ " maybe checking will be slower or hang." 
for mtuValue in list(g_mtuMap.keys()): warningMsg += "\n%s: [%s]" % ( mtuValue, ','.join(g_mtuMap[mtuValue])) __printOnScreen(warningMsg) def analysisResult(item): global g_itemResult outputPath = g_context.tmpPath checkID = g_context.checkID itemName = item['name'] files = [] content = "" for n in g_context.nodes: fileName = "%s/%s_%s_%s.out" % ( outputPath, itemName, g_context.hostMapping[n], checkID) files.append(fileName) for f in files: content += "".join(g_file.readFile(f)) itemResult = __analysisResult(content, itemName) g_itemResult[itemName] = [itemResult, itemResult.formatOutput()] def doCheck(): """ function: do check process input : NA output: NA """ # Local mode if g_opts.localMode: if __isDistributing(): # load check item dynamic and get the execute result doRunCheck() else: if not __hasRootItems() or __isRoot(): # load check item dynamic and get the execute result doRunCheck() else: # check with root privileges doRootCheck() else: # watching the threads and response for Ctrl+C signal Watcher() threads = [] __printOnScreen( "Start to health check for the cluster. 
Total Items:%s Nodes:%s" % (len(g_context.items), len(g_context.nodes))) for n in g_context.nodes: t = CheckThread("%s Thread" % n, doLocalCheck, n) threads.append(t) items = g_context.items itemCount = len(items) itemsName = [i['name'] for i in items] outputPath = g_context.tmpPath nodes = g_context.nodes[:] checkID = g_context.checkID # init progress display progress_manager = MultiProgressManager() progress_manager.put('Checking...', LineProgress(total=itemCount, title='Checking...')) # fix the display format for progress bar newLine = '\n' print(newLine) # Check the number of completed nodes overNodes = 0 # Time to hit the log LogCount = 0 lastTimeProgress = -1 while len(nodes) and datetime.now() <= g_endTime: totleCount = 0 slowNode = [] for node in nodes: # Get user and password username, passwd = __getUserAndPwd(node) if node in g_context.oldNodes: itemCount_node = len(g_context.oldItems) else: itemCount_node = len(g_context.newItems) # Local execution if SharedFuncs.is_local_node(node): checkCount = SharedFuncs.checkComplete( checkID, node, g_context.hostMapping[node], g_context.user, g_context.tmpPath) # Executed in new node scene elif node in g_context.newNodes: checkCount = SharedFuncs.checkComplete( checkID, node, g_context.hostMapping[node], username, g_context.tmpPath, passwd) else: checkCount = SharedFuncs.checkComplete( checkID, node, g_context.hostMapping[node], g_context.user, g_context.tmpPath) try: checkCount = int(checkCount.strip()) except Exception: checkCount = 0 # If there is a node check completed, # some nodes just started,record slow node if overNodes > 0 and checkCount < 2: slowNode.append(node) if checkCount == itemCount_node: nodes.remove(node) # Record the number of completed nodes overNodes += 1 if not SharedFuncs.is_local_node(node): if node in g_context.newNodes: outItems = [] for i in itemsName: outItems.append("%s/%s_%s_%s.out" % ( outputPath, i, g_context.hostMapping[node], checkID)) SharedFuncs.receiveFile(outItems, node, 
username, outputPath, passwd) else: fileName = "%s/*_%s_%s.out" % ( outputPath, g_context.hostMapping[node], checkID) # Delete Files SharedFuncs.receiveFile(fileName, node, g_context.user, outputPath) else: totleCount += checkCount # All nodes check the number of completed totleCount += itemCount * overNodes # Timed and counted time.sleep(1) LogCount += 1 # Update execution progress progressInfo = totleCount // len(g_context.nodes) # Refresh only as the schedule changes if lastTimeProgress < progressInfo <= itemCount: progress_manager.update("Checking...", progressInfo) lastTimeProgress = progressInfo # Suggest the slow node to log every 30 seconds if slowNode and itemCount > 1 and LogCount % 30 == 0: logMsg = "Warning: The node [%s] check progress" \ " is slow." % ",".join(slowNode) g_logger.debug(logMsg) for t in threads: if t.exitcode == 1: raise ThreadCheckException(t.name, t.exception) for t in threads: t.join(1) if datetime.now() > g_endTime: raise TimeoutException(nodes) __printOnScreen("Start to analysis the check result") try: pool = ThreadPool(DefaultValue.getCpuSet()) results = pool.map(analysisResult, g_context.items) pool.close() pool.join() except Exception as e: raise Exception(str(e)) for item in g_context.items: g_result.append(g_itemResult[item['name']][0]) print(g_itemResult[item['name']][1]) __printOnScreen("Analysis the check result successfully") def doRunCheck(): """ function: load check item dynamic and get the execute result input : NA output: NA """ outputPath = g_context.tmpPath localHost = __getLocalNode(g_context.nodes) if localHost in g_context.newNodes: items = g_context.newItems else: items = g_context.oldItems if g_context.hostMapping: localHost = g_context.hostMapping[localHost] for item in items: content = "" modPath = g_context.supportItems[item['name']] checker = CheckItemFactory.createItem(item['name'], modPath, item['scope'], item['analysis']) checker.runCheck(g_context, g_logger) # for local run get the content fileName = 
"%s/%s_%s_%s.out" % ( outputPath, item['name'], localHost, g_context.checkID) content += "".join(g_file.readFile(fileName)) itemResult = __analysisResult(content, item['name']) g_result.append(itemResult) # run the check process distributing and no need to clean the resource if __isDistributing(): g_logger.debug("run check items done and exit the command") if g_opts.format == 'default' and not g_opts.nonPrinting: # Initialize the self.clusterInfo variable print(g_result.outputRaw()) def doRootCheck(): """ function: check with root privileges input : NA output: NA """ # get local node host = __getLocalNode(g_context.nodes) # prepare the command for running check cmd = __prepareCmd(g_context.items, g_context.user, g_context.checkID, True) # run root cmd output = SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0], g_opts.pwdMap[host][1], g_context.mpprc) print(output.decode()) def __prepareCmd(items, user, checkid, print_output=False): """ function: prepare the command for running check """ cmdPath = os.path.realpath(os.path.dirname(__file__)) itemsName = [i['name'] for i in items] userParam = "" checkIdParam = "" routingParam = "" printParam = "" if not print_output: printParam = "--non-print" if user: userParam = " -U %s " % user if checkid: checkIdParam = " --cid=%s " % checkid if g_context.routing: routingParam = "--routing %s" % g_context.routing cmd = "%s/gs_check -i %s %s %s -L %s -o %s -l %s %s" % ( cmdPath, ",".join(itemsName), userParam, checkIdParam, routingParam, g_context.tmpPath, g_context.logFile, printParam) return cmd def doLocalCheck(host): """ function: running check on different threads input : NA output: NA """ # prepare the command for running check if host in g_context.oldNodes: cmd = __prepareCmd(g_context.oldItems, g_context.user, g_context.checkID) else: cmd = __prepareCmd(g_context.newItems, "", g_context.checkID) if SharedFuncs.is_local_node(host): if __hasRootItems(): SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0], 
g_opts.pwdMap[host][1], g_context.mpprc) else: SharedFuncs.runShellCmd(cmd, g_context.user, g_context.mpprc) else: if not __hasRootItems(): SharedFuncs.runSshCmd(cmd, host, g_context.user, g_context.mpprc) else: # get username and password for certain node username, passwd = __getUserAndPwd(host) if host in g_context.newNodes: SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd) else: SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd, g_context.mpprc) def __analysisResult(output, itemName): """ function: analysis the check result """ item_result = ItemResult.parse(output) if not item_result: raise CheckException("analysis result occurs error") try: # load support item mod_path = g_context.supportItems[itemName] checker = CheckItemFactory.createFrom(itemName, mod_path, g_context) # analysis the item result got from each node item_result = checker.postAnalysis(item_result) except Exception as e: raise CheckException(str(e)) return item_result def moveLogFile(host): tmpLog = os.path.join(g_context.tmpPath, "log/gs_check.log") SharedFuncs.receiveFile(g_context.logFile, host, g_context.user, tmpLog[:-4] + "_" + host + ".log") def formatOutput(): """ function: format and zip the result package input : NA output: NA """ if g_opts.distributing or not g_result: return try: # output the result to a file resultFile = os.path.join(g_context.tmpPath, "CheckResult_%s" % g_context.checkID) g_file.createFile(resultFile, True) g_file.writeFile(resultFile, [g_result.outputResult()]) except Exception as e: if os.path.exists(resultFile): g_file.removeFile(resultFile) g_logger.info("Warning! 
Generate check result output file failed.") g_logger.debug(str(e)) if g_opts.localMode: return # export the check result to excel file in output folder, # only export excel for certain scene scene = '_' + g_opts.scene if g_opts.scene else "" # collect the log file from remote host tmpLog = os.path.join(g_context.tmpPath, "log/gs_check.log") # Get the log file if g_opts.logFile or g_opts.cluster: g_file.cpFile(g_context.logFile, tmpLog[:-4] + "_" + DefaultValue.GetHostIpOrName() + ".log") else: g_file.moveFile(g_context.logFile, tmpLog[:-4] + "_" + DefaultValue.GetHostIpOrName() + ".log") hosts = __getRemoteNodes(g_context.nodes) if hosts: try: pool = ThreadPool(DefaultValue.getCpuSet()) results = pool.map(moveLogFile, hosts) pool.close() pool.join() except Exception as e: g_logger.info( "Warning! Retrieve log file from remote host failed.") g_logger.debug(str(e)) # move the *.out file to nodes folder outputFolder = g_context.tmpPath checkID = g_context.checkID cmd = "cd %s; find . -name \'*%s.out\' -exec mv {} %s \;"\ % (g_context.tmpPath, checkID, os.path.join(outputFolder, "nodes")) SharedFuncs.runShellCmd(cmd, g_context.user) # No check result is generated when the output is specified as /dev/null if g_context.outPath == "/dev/null": print(g_result.outputStatistic()) print("The inspection report has been cleared by /dev/null.") return tarFile = "%s/CheckReport%s_%s.tar.gz" %\ (g_context.outPath, scene, g_context.checkID) # tar the output for this check tarFiles = '' if (__checkFileExist(os.path.join(outputFolder, "nodes"), '%s.out' % checkID)): tarFiles += ' nodes ' if __checkFileExist(os.path.join(outputFolder, "log"), '.log'): tarFiles += ' log ' if __checkFileExist(outputFolder, '%s.zip' % checkID): tarFiles += ' *%s.zip ' % checkID if __checkFileExist(outputFolder, 'CheckResult_%s' % checkID): tarFiles += ' CheckResult_%s ' % checkID tarcmd = "cd %s;tar -zcf %s %s 2>&1; chmod %s '%s'" \ % (outputFolder, tarFile, tarFiles, DefaultValue.KEY_FILE_MODE, tarFile) 
SharedFuncs.runShellCmd(tarcmd, g_context.user) if g_opts.format == 'default': print(g_result.outputStatistic()) print("For more information please refer to %s" % os.path.join(outputFolder, tarFile)) if g_opts.format == 'json': print(g_result.outputJson()) def __checkFileExist(path, filePattern): # Check the file exists cmd = "cd %s; ls | grep '%s' | wc -l" % (path, filePattern) (status, output) = subprocess.getstatusoutput(cmd) if status == 0 and output != "0": return True else: return False def killChildProcess(node): checkID = g_context.checkID # cmd with switch users cmd_switch = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \ """|awk '{print \$2}'` """ % checkID cmd_switch += """ && (if [ X\"$proc_pid_list\" != X\"\" ]; """ \ """then echo \"$proc_pid_list\" | xargs kill -9 ; fi)""" # cmd with not switch users cmd_current = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \ """|awk "{print \\\$2}"` """ % checkID cmd_current += """ && (if [ X"$proc_pid_list" != X"" ]; then """ \ """echo "$proc_pid_list" | xargs kill -9 ; fi)""" username, passwd = __getUserAndPwd(node) if SharedFuncs.is_local_node(node) and not __hasRootItems(): SharedFuncs.runShellCmd(cmd_current) elif __hasRootItems(): SharedFuncs.runSshCmdWithPwd(cmd_switch, node, username, passwd) else: SharedFuncs.runSshCmd(cmd_current, node, g_context.user) def cleanTmpDir(node): # clean tmp files in all the nodes cmd = r"rm -rf %s" % g_context.tmpPath if SharedFuncs.is_local_node(node): SharedFuncs.runShellCmd(cmd) else: SharedFuncs.runSshCmd(cmd, node, g_context.user) def cleanEnvironment(skiplog=False): """ function: clean the environment input : NA output: NA """ if __isDistributing(): return if not g_context.tmpPath: return if not g_context.nodes: return # kill child process on all hosts when exception(skip log) if skiplog: try: pool = ThreadPool(DefaultValue.getCpuSet()) results = pool.map(killChildProcess, g_context.nodes) pool.close() pool.join() except Exception as e: 
g_logger.info("Warning! Failed to kill child process.") g_logger.debug(str(e)) # clean tmp files in all the nodes cmd = r"rm -rf %s" % g_context.tmpPath if g_opts.localMode: SharedFuncs.runShellCmd(cmd) else: try: pool = ThreadPool(DefaultValue.getCpuSet()) results = pool.map(cleanTmpDir, g_context.nodes) pool.close() pool.join() except Exception as e: g_logger.info("Warning! Failed to clear tmp directory.") g_logger.debug(str(e)) def setTimeOut(): """ function: set time out input : NA output: NA """ global g_endTime # end time g_endTime = datetime.now() + timedelta(seconds=g_opts.timeout) if __name__ == '__main__': # main function try: initGlobal() parseCommandLine() checkParameter() parseCheckContext() preCheck() dispatchCached() doCheck() formatOutput() cleanEnvironment() except (InterruptException, ThreadCheckException, TimeoutException) as e: g_logger.error(str(e)) # clean the environment and child process when using Ctrl+C force or # except or timeout to exit the command cleanEnvironment(True) sys.exit(1) except Exception as e: if not g_logger: sys.stdout = sys.stderr print(str(e)) else: g_logger.error(str(e)) cleanEnvironment() sys.exit(1) else: sys.exit(0)