Files
openGauss-OM/script/gs_check

1765 lines
63 KiB
Python

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : gs_check is a utility to check cluster and database status
#############################################################################
import subprocess
import os
import sys
import re
import getpass
import time
import pwd
import pickle
# Bootstrap: make sure the bundled C libraries (gspylib/clib) are on
# LD_LIBRARY_PATH before any dependent module is imported.  If the variable
# is missing, or does not already start with our clib path, fix the
# environment and re-exec this script so the dynamic loader picks it up.
package_path = os.path.dirname(os.path.realpath(__file__))
ld_path = package_path + "/gspylib/clib"
if 'LD_LIBRARY_PATH' not in os.environ:
    os.environ['LD_LIBRARY_PATH'] = ld_path
    # execve replaces the current process image and does not return
    os.execve(os.path.realpath(__file__), sys.argv, os.environ)
if not os.environ.get('LD_LIBRARY_PATH').startswith(ld_path):
    # prepend our clib path so it wins over any other copies on the path
    os.environ['LD_LIBRARY_PATH'] = \
        ld_path + ":" + os.environ['LD_LIBRARY_PATH']
    os.execve(os.path.realpath(__file__), sys.argv, os.environ)
import xml.etree.cElementTree as ETree
from datetime import datetime, timedelta
from multiprocessing.dummy import Pool as ThreadPool
from gspylib.inspection.common.Exception import CheckException, \
UseBothParameterException, \
SceneNotFoundException, ParseItemException, \
NotEmptyException, \
NotExistException, InterruptException, ThreadCheckException, \
ContextDumpException, ContextLoadException, \
TimeoutException
from gspylib.common.Common import DefaultValue
from gspylib.common.ParameterParsecheck import Parameter
from gspylib.inspection.common import SharedFuncs
from gspylib.inspection.common.Log import LoggerFactory
from gspylib.inspection.common.TaskPool import Watcher, CheckThread
from gspylib.inspection.common.CheckResult import CheckResult, ItemResult
from gspylib.inspection.common.CheckItem import CheckItemFactory
from gspylib.inspection.common.ProgressBar import MultiProgressManager, \
LineProgress
from gspylib.common.DbClusterInfo import dbClusterInfo
from base_utils.os.env_util import EnvUtil
from base_utils.os.file_util import FileUtil
from base_utils.os.net_util import NetUtil
from domain_utils.domain_common.cluster_constants import ClusterConstants
#############################################################################
# Global variables
# g_opts: global option
# g_logger: global logger
# g_context :global context
# g_result : global result
# g_endTime : global endTime
# DIRECTORY_MODE: global directory mode
# MPPDB_VERSION_R5 : mppdb version
# DEFAULT_TIMEOUT : time out
#############################################################################
# Global runtime state, populated by initGlobal()/parseCommandLine()
g_logger = None          # global logger instance
g_opts = None            # parsed command-line options (CmdOptions)
g_context = None         # check execution context (CheckContext)
g_result = None          # aggregated check result (CheckResult)
g_endTime = None         # deadline of the whole check run
g_mtuMap = {}            # node -> MTU value cache
g_itemResult = {}        # item name -> result cache
DEFAULT_TIMEOUT = 1500   # default scene-check timeout, in seconds
# single cluster will skip these items
# because single clusters don't need to perform consistency checks and
# internal communication class checks
SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer",
               "CheckNTPD", "CheckCpuCount",
               "CheckMemInfo", "CheckDiskConfig",
               "CheckUpVer", "CheckPgxcgroup", "CheckPing",
               "CheckNetWorkDrop", "CheckNetSpeed"]
# items whose automatic "--set" repair is not supported and is skipped
SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD",
                "CheckSshdService", "CheckEtcHosts",
                "CheckCpuCount", "CheckHyperThread", "CheckMemInfo",
                "CheckKernelVer", "CheckEncoding", "CheckBootItems",
                "CheckDropCache", "CheckFilehandle", "CheckKeyProAdj",
                "CheckDiskFormat", "CheckInodeUsage", "CheckSpaceUsage",
                "CheckDiskConfig", "CheckXid", "CheckSysTabSize",
                "CheckClusterState", "CheckConfigFileDiff", "CheckUpVer",
                "CheckEnvProfile", "CheckGaussVer", "CheckPortRange",
                "CheckReadonlyMode", "CheckCatchup", "CheckProcessStatus",
                "CheckSpecialFile", "CheckCollector", "CheckLargeFile",
                "CheckProStartTime", "CheckMpprcFile", "CheckLockNum",
                "CheckCurConnCount", "CheckCursorNum", "CheckPgxcgroup",
                "CheckLockState", "CheckIdleSession", "CheckDBConnection",
                "CheckSysTable", "CheckSysTabSize", "CheckTableSpace",
                "CheckTableSkew", "CheckDNSkew", "CheckCreateView",
                "CheckHashIndex", "CheckNextvalInDefault", "CheckPgxcRedistb",
                "CheckReturnType", "CheckSysadminUser", "CheckTDDate",
                "CheckDropColumn", "CheckDiskFailure", "CheckPing",
                "CheckNetWorkDrop", "CheckUsedPort", "CheckNICModel",
                "CheckRouting", "CheckNetSpeed", "CheckDataDiskUsage"]
class CmdOptions():
    """
    Container for the parsed command-line options of gs_check.
    All fields are populated by parseCommandLine().
    """

    def __init__(self):
        # initialize variable
        self.user = None             # cluster user (-U)
        self.localMode = False       # -L: run on the local node only
        self.distributing = False    # True on the remote side of a --cid run
        self.skipRootItems = False   # --skip-root-items
        self.set = False             # --set: repair abnormal items
        self.language = 'zh'         # report language
        self.format = 'default'     # report format: 'default' or 'json'
        self.scene = None            # -e scene name
        self.items = None            # -i item list
        self.nodes = []              # hosts read from --hosts file
        self.cluster = None          # dbClusterInfo of the local cluster
        self.timeout = DEFAULT_TIMEOUT  # --time-out, seconds
        self.pwdMap = {}             # host -> (user, password) for root items
        self.thresholdDn = None      # --disk-threshold
        self.outPath = None          # -o output directory
        self.logFile = None          # -l log file path
        self.isSingle = False        # True for a single-node cluster
        self.routing = None          # --routing net segment
        self.skipItems = []          # --skip-items
        self.LCName = None           # --nodegroup-name
        self.ShrinkNodes = None      # --ShrinkNodes
        self.nonPrinting = False     # --non-print
class CheckContext():
    """
    Execution context of one check run.

    Holds the parsed configuration, cluster topology, selected check items
    and the per-run pickle cache file that is shipped to remote nodes so
    they can re-create the same context.
    """

    def __init__(self):
        """
        Constructor: initialize every context field to its default.
        """
        # root of the inspection package, used to locate items/ and config/
        self.basePath = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'gspylib',
            'inspection')
        self.user = None            # cluster user
        self.set = None             # whether --set was requested
        self.log = None             # bound to g_logger.debug once ready
        self.postAnalysis = False
        self.supportItems = {}      # item name -> source file path
        self.supportScenes = {}     # scene name -> xml file path
        self.items = []             # items selected for this run
        self.rootItems = []         # items that need root privileges
        self.cluster = None         # dbClusterInfo
        self.nodes = []             # all node ips/names of this run
        self.mpprc = None           # separated env file, if any
        # unique id of this run; also keys the cache file name
        self.checkID = self.genCheckID()
        self.thresholdDn = None
        self.outPath = os.path.join(self.basePath, "output")
        self.logFile = None
        self.tmpPath = None         # /tmp/check_<checkID>
        self.hostMapping = None     # ip -> hostname mapping
        self.routing = None
        self.skipSetItem = []
        self.oldNodes = []
        self.newNodes = []
        self.oldItems = []
        self.newItems = []
        self.LCName = None
        self.ShrinkNodes = None

    def genCheckID(self):
        '''
        function : generate the check ID which is unique for once checking
                   (local date + seconds of day + pid)
        input : NA
        output : checkID string
        '''
        t = time.localtime(time.time())
        dateString = time.strftime("%Y%m%d", t)
        seconds = timedelta(hours=t.tm_hour, minutes=t.tm_min,
                            seconds=t.tm_sec).seconds
        pidString = str(os.getpid())
        return dateString + str(seconds) + pidString

    def setCheckID(self, checkID):
        '''
        function : overwrite the generated check id (used with --cid)
        '''
        self.checkID = checkID

    def getCacheFile(self):
        # path of the pickled context cache for this run
        return "%s/context_%s.cache" % (self.tmpPath, self.checkID)

    def checkMPPDBVersion(self):
        '''
        function : run "gsql -V" and extract the MPPDB version string
        input : NA
        output : version string such as "V500R002C00"
        '''
        cmd = "gsql -V"
        output = SharedFuncs.runShellCmd(cmd, self.user, self.mpprc)
        return re.compile(r'V[0-9]{3}R[0-9]{3}C[0-9]{2}').search(
            output).group()

    def loadClusterInfo(self, user=None):
        '''
        function : load cluster info from the static config file
        input : user (defaults to self.user)
        output : dbClusterInfo instance, or None when unavailable
        '''
        u = user if user is not None else self.user
        if u is None:
            return None
        try:
            clusterInfo = dbClusterInfo()
            clusterInfo.initFromStaticConfig(u)
            return clusterInfo
        except Exception:
            # a missing/broken static config simply means "no cluster"
            return None

    def loadSupportItems(self):
        '''
        function : load supported check items by scanning the items/ folder
                   for Check*.py files
        input : NA
        output : NA (fills self.supportItems)
        '''
        itemPath = "%s/items/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(itemPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                if fileName.find("Check") == 0 and suffix == ".py":
                    self.supportItems[fileName] = os.path.join(dirpath, f)
        if not self.supportItems:
            raise NotEmptyException("support items")

    def loadSupportScene(self):
        '''
        function : load supported scenes by scanning scene_*.xml files in
                   the config folder
        input : NA
        output : NA (fills self.supportScenes)
        '''
        configPath = "%s/config/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(configPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                if fileName.find("scene_") == 0 and suffix == ".xml":
                    # strip the "scene_" prefix to get the scene name
                    self.supportScenes[fileName[6:]] = \
                        os.path.join(dirpath, f)
        if not self.supportScenes:
            raise NotEmptyException("support scenes")

    def loadSceneConfiguration(self, scene):
        '''
        function : load one scene configuration xml and copy every child of
                   its <configuration> element onto this context
        input : scene name
        output : NA
        '''
        configFile = "%s/config/scene_%s.xml" % (self.basePath, scene)
        if not os.path.isfile(configFile):
            raise SceneNotFoundException(scene, self.supportScenes)
        rootNode = ETree.parse(configFile).getroot()
        configElem = rootNode.find('configuration')
        if configElem is not None:
            for elem in list(configElem):
                setattr(self, elem.tag, elem.text.strip())

    def isCached(self):
        '''
        function : whether the context was serialized to disk
        input : NA
        output : boolean
        '''
        return os.path.isfile(self.getCacheFile())

    def clean(self):
        '''
        function : remove the cache file of this run
        input : NA
        output : NA
        '''
        cmd = "rm -rf %s" % self.getCacheFile()
        SharedFuncs.runShellCmd(cmd)

    def dump(self):
        '''
        function : serialize the check context to disk
        input : NA
        output : NA
        '''
        self.clean()
        try:
            # BUGFIX: the file object was previously never closed; use a
            # context manager so the handle is released even on failure.
            with open(self.getCacheFile(), "wb") as fp:
                pickle.dump(self, fp, True)
            SharedFuncs.chmodFile(self.getCacheFile(),
                                  DefaultValue.KEY_FILE_MODE)
        except Exception as e:
            raise ContextDumpException(e)

    def load(self, fileName=None):
        '''
        function : load the check context from disk
        input : path of the context file (defaults to this run's cache)
        output : CheckContext, or None when no cache exists
        '''
        f = fileName if fileName is not None else self.getCacheFile()
        result = None
        if self.isCached():
            try:
                # BUGFIX: close the cache file deterministically
                with open(f, "rb") as fp:
                    result = pickle.load(fp)
            except Exception as e:
                raise ContextLoadException(e)
        return result

    def getNodeName(self, host):
        # Resolve the hostname of one node and record it in hostMapping.
        # HOST_IP in the environment indicates a container-style deployment
        # where $HOST_IP, not `hostname`, identifies the node.
        if "HOST_IP" in list(os.environ.keys()):
            cmd = "echo $HOST_IP"
        else:
            cmd = "hostname"
        if SharedFuncs.is_local_node(host):
            output = SharedFuncs.runShellCmd(cmd)
        else:
            output = SharedFuncs.runSshCmd(cmd, host, self.user)
        # last non-empty line guards against login banners in ssh output
        hostname = output.strip().split('\n')[-1].strip()
        self.hostMapping[host] = hostname

    def getMapping(self):
        '''
        function : build the ip -> hostname mapping for all nodes in parallel
        input : NA
        output : NA
        '''
        self.hostMapping = {}
        if not self.nodes:
            return
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(self.getNodeName, self.nodes)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))

    def sendTmpFile(self, host):
        # create the remote tmp dir (if needed) and copy the cache file over
        cmd = "if [ ! -d %s ]; then mkdir %s -p -m %s;fi" % (
            self.tmpPath, self.tmpPath, DefaultValue.KEY_DIRECTORY_MODE)
        SharedFuncs.runSshCmd(cmd, host, self.user)
        SharedFuncs.sendFile(self.getCacheFile(), host, self.user,
                             self.tmpPath)

    def dispatch(self, hosts):
        '''
        function : send the serialized context file to the remote hosts
        input : list of remote host names
        output : NA
        '''
        if len(hosts) == 0 or g_opts.isSingle:
            return
        fileName = self.getCacheFile()
        if not os.path.isfile(fileName):
            raise CheckException("File %s is not exist or invalid" % fileName)
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(self.sendTmpFile, hosts)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))
#############################################################################
# Parse and check parameters
#############################################################################
def usage():
    """
gs_check is a utility to check the health status of a cluster.
Usage:
  gs_check -? | --help
Example:
  gs_check -i ITEM [...] [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
           [--skip-root-items] [--set] [--routing]
  gs_check -e SCENE_NAME [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
           [--skip-root-items] [--set] [--time-out=SECS]
           [--routing] [--skip-items]
General options:
  -i                 Health check item number.
                     OLAP Example: -i CheckCPU,CheckMTU,
                     CheckPing.
  -e                 Health check scene name.
                     OLAP Example: -e inspect/upgrade/slow_node/
                     binary_upgrade/health/install/longtime
  -U                 Cluster user.
  -L                 Run the command as local mode.
  -l                 Path of log file.
  -o                 Save the result to the specified directory.
  --cid              The check ID used for identify a check
                     process, only for internal use.
  --skip-root-items  Skip the items with root privileges.
  --disk-threshold   Set disk threshold for checking disk usage,
                     only for CheckDataDiskUsage.
  --format           Set the format of the result report.
  --set              Set abnormal items if supported
  --time-out         Set the timeout for scene check, default
                     1500 seconds.
  --routing          The network segment with business ip,
                     example: 192.168.1.1:255.255.255.0
  --skip-items       Skip the specified check item or setting
                     item with scene check
                     Example: --skip-items CheckCPU,CheckMTU
  --non-print        Do not print output result.
  -?, --help         Show help information for this utility,
                     and exit the command line mode.
  -V, --version      Show version information.
    """
    # the docstring above IS the help text shown to the user
    print(usage.__doc__)
def version():
    '''
    function : print the version string of the check tool
    input : NA
    output: NA
    '''
    ver = SharedFuncs.getVersion()
    print(ver)
#########################################################
# Init global log
#########################################################
def initGlobal():
    """
    function: create the global option, context and result objects
    input : NA
    output: NA
    """
    global g_opts, g_context, g_result
    g_opts = CmdOptions()
    g_context = CheckContext()
    g_result = CheckResult()
def parseCommandLine():
    """
    function: Parse the command line and store the values on the global
              option (g_opts) and context (g_context) objects.
    input : NA
    output: NA
    raises: UseBothParameterException for mutually exclusive parameters,
            CheckException for invalid --time-out or --format values
    """
    # Resolves the command line
    global g_opts
    g_opts = CmdOptions()
    ParaObj = Parameter()
    ParaDict = ParaObj.ParameterCommandLine("check")
    if "helpFlag" in ParaDict:
        usage()
        sys.exit(0)
    # command line parameter group definition for gs_check:
    # each key must not be used together with the parameter it maps to
    irrelevantPara = {"scenes": "itemstr", "time_out": "itemstr",
                      "skipItems": "itemstr",
                      "cid": "scenes", "nodegroup_name": "scenes",
                      "shrinkNodes": "scenes"}
    paraNameMap = {"itemstr": "i", "scenes": "e", "time_out": "-time-out",
                   "skipItems": "-skip-items",
                   "cid": "-cid", "nodegroup_name": "-nodegroup-name",
                   "shrinkNodes": "-ShrinkNodes"}
    formatList = ['default', 'json']
    # position parameter can not be set at the same time
    for para, other in irrelevantPara.items():
        if para in ParaDict and other in ParaDict:
            raise UseBothParameterException(
                (paraNameMap[para], paraNameMap[other]))
    if "itemstr" in ParaDict:
        g_opts.items = ParaDict["itemstr"]
    if "scenes" in ParaDict:
        g_opts.scene = ParaDict["scenes"]
    if "outFile" in ParaDict:
        # NOTE(review): stored on g_context while checkParameter() tests
        # g_opts.outPath -- confirm which attribute downstream code uses.
        g_context.outPath = ParaDict["outFile"]
    if "logFile" in ParaDict:
        g_opts.logFile = ParaDict["logFile"]
    if "user" in ParaDict:
        g_context.user = ParaDict["user"]
    if "hostfile" in ParaDict:
        for node in FileUtil.readFile(ParaDict["hostfile"]):
            g_opts.nodes.append(node.strip())
    if "cid" in ParaDict:
        # a check ID from the command line means we are the remote
        # (distributing) side of an already-running check
        g_context.setCheckID(ParaDict["cid"])
        g_opts.distributing = True
    if "localMode" in ParaDict:
        g_opts.localMode = True
    if "skipRootItems" in ParaDict:
        g_opts.skipRootItems = True
    if "disk-threshold" in ParaDict:
        g_context.thresholdDn = ParaDict["disk-threshold"]
    if "set" in ParaDict:
        g_context.set = True
    if "routing" in ParaDict:
        g_opts.routing = ParaDict["routing"]
    if "skipItems" in ParaDict:
        g_opts.skipItems = ParaDict["skipItems"]
    if "nodegroup_name" in ParaDict:
        g_context.LCName = ParaDict["nodegroup_name"]
    if "shrinkNodes" in ParaDict:
        g_context.ShrinkNodes = ParaDict["shrinkNodes"]
    if "time_out" in ParaDict:
        try:
            g_opts.timeout = int(ParaDict["time_out"])
        except Exception:
            raise CheckException("The parameter timeout set invalid value")
        if g_opts.timeout < DEFAULT_TIMEOUT:
            raise CheckException(
                "The timeout parameter must be set larger than default "
                "value 1500 seconds")
        setTimeOut()
    if "format" in ParaDict:
        g_opts.format = ParaDict["format"]
        if g_opts.format not in formatList:
            raise CheckException(
                "Format %s is not available,the valid format is %s" % (
                    g_opts.format, ",".join(formatList)))
    if "nonPrinting" in ParaDict:
        g_opts.nonPrinting = True
def checkParameter():
    """
    function: validate parsed parameters, resolve the cluster user and
              prepare the output directory
    input : NA
    output: NA
    """
    if g_opts.nodes:
        raise CheckException("The --hosts parameter is not available")
    # a root caller must either be the distributing side or use -L
    if __isRoot() and not __isDistributing() and not g_opts.localMode:
        raise CheckException(
            "The command must be running with cluster user")
    # Get the -U parameter
    checkuser()
    # create output path
    if g_opts.outPath and not g_opts.localMode:
        createPath(g_opts.outPath, g_context.user)
def checkuser():
    """
    function: validate the -U user and, for multi-node clusters, verify
              passwordless ssh trust between all cluster nodes via pssh
    input : NA
    output: NA
    raises: CheckException when the user is invalid or trust is broken
    """
    # The new node scenario does not need the -U parameter
    if __isRoot() and not g_opts.localMode:
        g_context.user = None
        return
    # Default mode -U for the current user
    if not __isRoot() and not g_context.user:
        g_context.user = SharedFuncs.getCurrentUser()
    if g_context.user:
        if not __isRoot() and g_context.user != SharedFuncs.getCurrentUser():
            raise CheckException(
                "The user %s is not current user" % g_context.user)
        try:
            user_uid = pwd.getpwnam(g_context.user).pw_uid
        except Exception:
            raise CheckException(
                "The user %s is not a effective user." % g_context.user)
        if user_uid == 0:
            raise CheckException("The -U parameter can not be the root user.")
        isClusterUser = SharedFuncs.checkClusterUser(g_context.user,
                                                     __getMpprcFile())
        if isClusterUser:
            # get cluster information
            g_context.mpprc = __getMpprcFile()
            clusterInfo = g_context.loadClusterInfo(g_context.user)
            if clusterInfo:
                g_opts.cluster = clusterInfo
            else:
                # user exists but the cluster config cannot be loaded
                isClusterUser = False
        if not isClusterUser:
            raise CheckException(
                "The user %s is not valid cluster user" % g_context.user)
    if g_opts.localMode or g_opts.distributing:
        return
    # Check cluster user trust
    dbNameList = g_opts.cluster.getClusterNodeNames()
    # a single local node needs no trust relationship
    if (len(dbNameList) == 1 and
            dbNameList[0] == NetUtil.GetHostIpOrName()):
        return
    appPath = EnvUtil.getEnv('GPHOME', g_opts.cluster.appPath)
    psshPath = os.path.join(appPath, 'script/gspylib/pssh/bin/pssh')
    cmd = "%s -H %s 'id' " % (psshPath, " -H ".join(dbNameList))
    (status, output) = subprocess.getstatusoutput(cmd)
    if status != 0:
        errorNode = []
        for result in output.split('\n'):
            if result.strip() == "":
                continue
            resultInfo = result.split()
            # Analyze the results: pssh prints "<ts> [SUCCESS|FAILURE] <host>"
            if len(resultInfo) > 3 and resultInfo[2] == "[SUCCESS]":
                continue
            elif (len(resultInfo) > 3 and resultInfo[2] == "[FAILURE]" and
                    resultInfo[3] in dbNameList):
                errorNode.append(resultInfo[3])
            else:
                # BUGFIX: the message used a literal "/n" instead of "\n"
                raise CheckException(
                    "Failed to check user trust. commands: %s Error:\n%s"
                    % (cmd, output))
        if errorNode:
            raise CheckException(
                "Failed to check user trust with %s" % errorNode)
        else:
            # BUGFIX: the message used a literal "/n" instead of "\n"
            raise CheckException(
                "Failed to check user trust. Error:\n%s" % output)
def createPath(path, user=""):
    """
    Ensure *path* exists as a writable directory and, when running as
    root with a user supplied, hand ownership to that user.
    """
    if path == ClusterConstants.DEV_NULL:
        return
    # a regular file at the target location is unacceptable
    if os.path.isfile(path):
        raise CheckException("The out path [%s] must be a directory." % path)
    if os.path.isdir(path):
        # existing directory: verify write permission only
        if not FileUtil.checkDirWriteable(path):
            raise CheckException(
                "Failed to create or delete file in the [%s]." % path)
    else:
        # path does not exist: recursively create it
        FileUtil.createDirectory(path, True, DefaultValue.KEY_DIRECTORY_MODE)
    # Modify the file owner
    if user and __isRoot():
        FileUtil.changeOwner(user, path)
def getTmpPath():
    """
    function: create (if needed) and return this run's temporary directory
              /tmp/check_<checkID> together with its log/ and nodes/ subdirs
    input : NA
    output: String
    """
    base = os.path.join("/tmp", "check_%s" % g_context.checkID)
    createPath(base, g_context.user)
    for sub in ("log", "nodes"):
        createPath(os.path.join(base, sub), g_context.user)
    return base
def initLogFile():
    """
    function: initialize the global logger; when a serialized context from
              a previous phase exists on disk, reload it instead of the
              freshly parsed one.
    input : NA
    output: NA
    """
    global g_context, g_logger
    # load the context when the script running on local mode and the context
    # was cached before
    g_context.tmpPath = getTmpPath()
    if g_context.isCached():
        g_context = g_context.load()
        # a node that is new to the cluster has no local env/cluster config
        if __getLocalNode(g_context.nodes) in g_context.newNodes:
            g_context.mpprc = None
            g_context.user = None
            g_context.cluster = None
        (g_logger, logFile) = LoggerFactory.getLogger('gs_check',
                                                      g_context.logFile,
                                                      g_context.user)
        g_context.log = g_logger.debug
        g_logger.debug("Load check context from cache file")
    else:
        # Parameter specified first, followed by default GAUSSLOG,
        # last temporary directory
        if g_opts.logFile:
            g_context.logFile = os.path.realpath(g_opts.logFile)
        elif g_opts.cluster:
            g_context.logFile = os.path.join(g_opts.cluster.logPath,
                                             '%s/om/gs_check.log'
                                             % g_context.user)
        else:
            g_context.logFile = os.path.join(g_context.tmpPath,
                                             'log/gs_check.log')
        (g_logger, g_context.logFile) = LoggerFactory.getLogger(
            'gs_check', g_context.logFile, g_context.user)
        # clean the cache files for reentry the command
        g_context.clean()
        # set mpprc file
        g_context.mpprc = __getMpprcFile()
        # Load support scene by parsing project folder
        g_context.loadSupportScene()
        # Load support check items by parsing the project folder
        g_context.loadSupportItems()
        # load the scene configuration
        if g_opts.scene:
            g_context.loadSceneConfiguration(g_opts.scene)
        # load cluster info
        if g_opts.cluster:
            g_context.cluster = g_opts.cluster
            g_context.oldNodes = g_opts.cluster.getClusterSshIps()[0]
        # load nodes: anything on the command line that is not already a
        # cluster member is treated as a new node
        if g_opts.nodes:
            for node in g_opts.nodes:
                if node not in g_context.oldNodes:
                    g_context.newNodes.append(node)
        g_context.nodes = g_context.oldNodes + g_context.newNodes
def getRootUserPwd():
    """
    function: interactively collect root credentials when some selected
              items require root privileges and we are not running as root;
              verified credentials are stored in g_opts.pwdMap per host.
    input : NA
    output: NA
    """
    # ask user input root password interactive when in new node scene or
    # contains items with root permission
    if __hasRootItems() and not __isRoot():
        rootItems = [i['name'] for i in g_context.rootItems]
        __printOnScreen(
            "The below items require root privileges to execute:[%s]"
            % " ".join(rootItems))
        rootuser = input("Please enter root privileges user[root]:")\
                   or "root"
        rootpwd = getpass.getpass("Please enter password for user[%s]:"
                                  % rootuser)
        g_logger.debug("Ask user input password interactive")
        for host in g_context.nodes:
            isPwdOk = SharedFuncs.verifyPasswd(host, rootuser, rootpwd)
            if not isPwdOk:
                # try to connect remote node again
                rootpwd = __retryConnection(host, rootuser)
            g_opts.pwdMap[host] = (rootuser, rootpwd)
        # NOTE(review): the uid check runs only after all hosts were
        # verified -- a non-root account fails here even with valid
        # passwords; confirm that this late check is intentional.
        if pwd.getpwnam(rootuser).pw_uid != 0:
            raise CheckException("Enter the user [%s] does not have"
                                 " root privileges." % rootuser)
        # print message on screen
        __printOnScreen("Check root password connection successfully")
def parseCheckContext():
    """
    function: Resolve the final list of check items from -e/-i, apply the
              skip/root/single-cluster filters, confirm destructive --set
              operations with the user, then cache the context to disk.
    input : NA
    output: NA
    """
    global g_context
    initLogFile()
    # a cached context means this was all decided by the driving process
    if g_context.isCached():
        return
    g_logger.debug("Start to parse the check items config file")
    items_all = []
    items_oldNode = []
    items_newNode = []
    failedItems = []
    singleSkipList = []
    # generate the items from scene configuration
    if g_opts.scene:
        items_oldNode, failedItems = __parseScene(g_opts.scene)
        items_all += items_oldNode
    # generate the items from -i parameter value
    elif (g_opts.items):
        for i in g_opts.items:
            item = __parseOneItem(i)
            if (not item):
                failedItems.append(i)
            else:
                items_all.append(item)
    # iterate over a copy since items_all is mutated inside the loop
    for item in items_all[:]:
        if not g_context.set and item['name'] in g_opts.skipItems:
            items_all.remove(item)
            continue
        if g_context.set and item['set_permission'] == 'root':
            g_context.rootItems.append(item)
        if g_opts.skipRootItems and item['permission'] == 'root':
            items_all.remove(item)
            continue
        # NOTE(review): an item with both set_permission and permission
        # equal to 'root' is appended to rootItems twice -- verify.
        if item['permission'] == 'root':
            g_context.rootItems.append(item)
        if g_opts.isSingle and item['name'] in SINGLE_SKIP:
            singleSkipList.append(item['name'])
            continue
        if item['name'] == "CheckRouting":
            # CheckRouting needs a routing segment from --routing or the
            # local cluster configuration
            if g_opts.routing:
                g_context.routing = g_opts.routing
            elif g_opts.cluster:
                workIP = g_opts.cluster.getDbNodeByName(
                    NetUtil.GetHostIpOrName()).backIps[0]
                g_context.routing = "%s:%s" % (
                    workIP, SharedFuncs.getMaskByIP(workIP))
            else:
                raise CheckException(
                    "The --routing is required when cluster dosen't exist")
        g_context.items.append(item)
    if len(singleSkipList) != 0:
        __printOnScreen(
            "The following items are skipped when the type of cluster is"
            " single:\n[%s]" % ",".join(singleSkipList))
    if not items_newNode:
        g_context.oldItems = g_context.items
    else:
        g_context.oldItems = items_oldNode
        g_context.newItems = items_newNode
    if g_context.set and items_all:
        # Settings will have a big impact and need to be confirmed
        confirmItem = {
            "CheckCrontabLeft": "Clear om_monitor in crond service",
            "CheckDirLeft": "Delete all file in '/opt/huawei/Bigdata/',"
                            "'/var/log/Bigdata/','/home/omm/'",
            "CheckProcessLeft": "Kill all process with gaussdb and omm user",
            "CheckOmmUserExist": "Delete system user omm",
            "CheckPortConflict": "kill all process with occupies "
                                 "the 25xxx port"
        }
        confirmMsg = ""
        for item in items_all:
            if item['name'] in list(confirmItem.keys()):
                confirmMsg += confirmItem[item['name']] + "\n"
            if item['name'] in SETITEM_SKIP:
                g_context.skipSetItem.append(item['name'])
        if confirmMsg:
            confirmMsg = "Warning: Executing the settings will do " \
                         "the following at the [%s] node:\n" % \
                         ','.join(g_context.newNodes) + confirmMsg
            __printOnScreen(confirmMsg)
            flag = input("Execution settings? (Y/N):")
            while True:
                # If it is not yes or all, it has been imported
                if not flag.upper() in ("Y", "N", "YES", "NO"):
                    flag = input("Please type 'yes' or 'no': ")
                    continue
                break
            # 'yes' intentionally falls through: keep all settings
            if flag.upper() in ("Y", "YES"):
                pass
            if flag.upper() in ("N", "NO"):
                # declined: skip every destructive setting item
                for Item in items_all:
                    if Item['name'] in list(confirmItem.keys()):
                        g_context.skipSetItem.append(Item['name'])
                __printOnScreen(
                    'Skip the settings for [%s]'
                    % ','.join(g_context.skipSetItem))
    if failedItems:
        raise ParseItemException(failedItems)
    if not g_context.items:
        raise CheckException("No check item can be performed,"
                             " please confirm the input parameters.")
    # print message on screen
    __printOnScreen("Parsing the check items config file successfully")
    getRootUserPwd()
    g_context.getMapping()
    g_context.dump()
def dispatchCached():
    """
    function: ship the cached context file to every remote node; skipped in
              local mode, on the distributing side, and for single clusters
    input : NA
    output: NA
    """
    if g_opts.localMode or g_opts.distributing or g_opts.isSingle:
        return
    g_logger.debug("Start to distributing the check context dump file")
    g_context.dispatch(__getRemoteNodes(g_context.nodes))
    # print message on screen
    __printOnScreen(
        "Distribute the context file to remote hosts successfully")
def __printOnScreen(msg):
    """
    function: log a message at info level (which shows on screen), unless
              running in local or distributing mode where output is muted
    """
    muted = g_opts.localMode or g_opts.distributing
    if not muted:
        g_logger.info(msg)
def __isRoot():
    """
    function: whether the current process is running as the root user
    """
    uid = os.getuid()
    return uid == 0
def __hasRootItems():
    """
    function: whether any selected item requires root privileges
    """
    # bool() covers both the None and the empty-list cases
    return bool(g_context.rootItems)
def __isDistributing():
    """
    function: whether this process is the remote (distributing) side
    """
    distributing = g_opts.distributing
    return distributing
def __getLocalNode(nodes):
    """
    function: return the entry of *nodes* that refers to the local host,
              falling back to the local hostname/IP when none matches
    """
    for candidate in (nodes or []):
        if SharedFuncs.is_local_node(candidate):
            return candidate
    return NetUtil.GetHostIpOrName()
def __getSeparatedValue(value, separator=","):
    '''
    function: split a command-line value on *separator* into a list.
    str.split already returns [value] when the separator is absent, so the
    original special case for that situation was redundant and is removed.
    '''
    return value.split(separator)
def __getNodesFromFile(fileName):
    """
    function: read unique, non-empty, non-comment lines from a host file
    input : path of the host file
    output: list of node names/ips in file order
    """
    nodes = []
    try:
        with open(fileName, 'r') as fp:
            for raw in fp:
                entry = raw.strip()
                # skip blanks, comments and duplicates
                if not entry or entry.startswith('#') or entry in nodes:
                    continue
                nodes.append(entry)
    except Exception as e:
        raise Exception(str(e))
    return nodes
def __retryConnection(host, user):
    """
    function: prompt up to three times for the password of *user* on *host*
              and return the first one that verifies; raise on exhaustion
    """
    max_attempts = 3
    for _ in range(max_attempts):
        passwd = getpass.getpass(
            "Please enter password for user[%s] on the node[%s]:"
            % (user, host))
        if SharedFuncs.verifyPasswd(host, user, passwd):
            return passwd
    raise CheckException(
        "Verify password failed for user[%s] on the node[%s]" % (user, host))
def __getMpprcFile():
    """
    function: locate the separated environment variable (mpprc) file:
              MPPDB_ENV_SEPARATE_PATH first, then the invoking user's
              ~/.bashrc, then the -U user's ~/.bashrc when running as root
    output : path of the environment file, or "" for root without a -U user
    raises : CheckException when no usable environment file can be found
    """
    # get mpprc file
    envValue = EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH")
    if envValue is not None and os.path.isfile(envValue):
        return envValue
    elif not __isRoot() and EnvUtil.getEnv('GAUSS_ENV'):
        # non-root with a gauss environment: fall back to ~/.bashrc
        cmd = "echo ~ 2>/dev/null"
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Fetching user environment variable file failed."
                " Please setup environment variables." + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot() and g_context.user:
        # root with -U: use that user's home via a login shell
        cmd = "su - %s -c 'echo ~ 2>/dev/null'" % g_context.user
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Failed to get user [%s] home directory. Error: %s\n" % (
                    g_context.user, output) + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot():
        # plain root without -U: no separated env file applies
        return ""
    else:
        raise CheckException("The separated mpprc file was not found."
                             " Please setup environment variables")
def __getUserAndPwd(node):
    """
    function: return the (user, password) pair to use for *node*: the
              collected root credentials when root items are pending and we
              are not root, otherwise the cluster user with no password
    """
    if not __isRoot() and __hasRootItems():
        credentials = g_opts.pwdMap[node]
        return (credentials[0], credentials[1])
    return (g_context.user, None)
def __getRemoteNodes(hosts):
    '''
    function: filter the host list down to the hosts that are not local
    '''
    remote = []
    for host in hosts:
        if not SharedFuncs.is_local_node(host):
            remote.append(host)
    return remote
def __parseScene(sceneName):
    '''
    function: parse a scene configuration file and resolve its check items
    input : scene name
    output: (items, failedItems) -- resolved item dicts and the names that
            could not be resolved
    '''
    if not sceneName:
        raise NotEmptyException("scene name")
    # Get scene xml
    xmlFile = "%s/config/scene_%s.xml" % (g_context.basePath, sceneName)
    if not os.path.isfile(xmlFile):
        raise SceneNotFoundException(sceneName, g_context.supportScenes)
    domTree = ETree.parse(xmlFile)
    rootNode = domTree.getroot()
    itemNames = []
    thresholds = {}
    # parse items from allow items
    for elem in rootNode.findall('allowitems/item'):
        elemName = elem.attrib['name']
        # check the check item whether exist or not
        if elemName not in g_context.supportItems:
            # BUGFIX: pass the actual item name, not the literal "elemName"
            raise NotExistException(elemName, "support items")
        # save threshold as text and parse them later
        subElem = elem.find('threshold')
        if subElem is not None:
            thresholds[elemName] = subElem.text.strip()
        itemNames.append(elemName)
    # parse categories and get all items they contain
    for category in rootNode.findall('allowcategories/category'):
        cpath = "%s/items/%s" % (g_context.basePath, category.attrib['name'])
        if os.path.isdir(cpath):
            itemNames.extend(x[:-3] for x in os.listdir(cpath) if
                             x[:-3] not in itemNames and x.endswith(".py"))
    # parse deny items
    for elem in rootNode.findall('denyitems/item'):
        elemName = elem.attrib['name']
        if elemName in itemNames:
            itemNames.remove(elemName)
    items = []
    failedItems = []
    for name in itemNames:
        item = __parseOneItem(name)
        if not item:
            # BUGFIX: an unresolved item used to fall through, be merged
            # with thresholds (KeyError) and appended as an empty dict
            # that crashed the caller before ParseItemException fired.
            failedItems.append(name)
            continue
        # overwrite the item threshold with the scene-level one
        if name in thresholds:
            sceneThreshold = __parseThreshold(thresholds[name])
            existing = item.get('threshold')
            if existing:
                item['threshold'] = dict(existing, **sceneThreshold)
            else:
                item['threshold'] = sceneThreshold
        items.append(item)
    return (items, failedItems)
def __parseOneItem(itemName):
    '''
    function: parse one check item from items.xml and return its full
              description; an empty dict means the item is unknown
    input : item name
    output: dict with id/name/title_*/suggestion_*/standard_*/category/
            permission/set_permission/scope/analysis/threshold keys
    '''
    if not itemName:
        raise NotEmptyException("Item name")
    item = {}
    # try to load check item configuration from xml file;
    # iterparse streams the file and we stop at the first match
    xmlFile = "%s/config/items.xml" % g_context.basePath
    for event, elem in ETree.iterparse(xmlFile):
        if event == 'end':
            if elem.tag == 'checkitem' and elem.attrib['name'] == itemName:
                # Parse the xml file
                item['id'] = elem.attrib['id']
                item['name'] = elem.attrib['name']
                item['title_zh'] = __parseAttr(elem, "title", "zh")
                item['title_en'] = __parseAttr(elem, "title", "en")
                item['suggestion_zh'] = __parseAttr(elem, "suggestion", "zh")
                item['suggestion_en'] = __parseAttr(elem, "suggestion", "en")
                item['standard_zh'] = __parseAttr(elem, "standard", "zh")
                item['standard_en'] = __parseAttr(elem, "standard", "en")
                item['category'] = __parseProperty(elem, 'category', 'other')
                item['permission'] = __parseProperty(elem, 'permission',
                                                     'user')
                item['set_permission'] = __parseProperty(elem,
                                                         'set_permission',
                                                         'user')
                item['scope'] = __parseProperty(elem, 'scope', 'all')
                item['analysis'] = __parseProperty(elem, 'analysis',
                                                   'default')
                # Get the threshold
                threshold = elem.find('threshold')
                if threshold is not None and threshold.text is not None:
                    # parse the threshold of check item
                    item["threshold"] = __parseThreshold(
                        threshold.text.strip())
                else:
                    # BUGFIX: always provide the key so callers reading
                    # item['threshold'] (e.g. scene overrides) never hit
                    # a KeyError for items without a configured threshold.
                    # Only set inside the matched branch so an unknown
                    # item still returns the empty ("not found") dict.
                    item["threshold"] = {}
                break
    return item
def __parseAttr(elem, attr, language='zh'):
'''
function: parse the xml attr with language
'''
val = elem.find('/'.join([attr, language]))
if val is not None and val.text is not None:
return val.text.strip().encode('utf-8')
return ""
def __parseProperty(elem, propertyName, defaultValue):
'''
function: parse the property of check item
'''
prop = elem.find(propertyName)
result = defaultValue
if prop is not None and prop.text is not None:
result = prop.text.strip()
return result
def __parseThreshold(value, separator=";"):
'''
function: parse the threshold of check item
'''
result = {}
if separator not in value and "=" not in value:
return result
if separator not in value and "=" in value:
d = value.strip().split('=')
result[d[0]] = d[1]
else:
for v in value.strip().split(separator):
d = v.strip().split('=')
result[d[0]] = d[1]
return result
def getMTUValue(node):
    """
    function: query the MTU of the network card bound to *node*'s back
              IP (via remote `ifconfig -a`) and record it in the global
              g_mtuMap, grouping nodes by MTU value so preCheck can
              report inconsistencies
    input : node - node address as listed in g_context.nodes
    output: NA (mutates g_mtuMap)
    raises: CheckException when credentials, the network card, its name
            or its MTU value cannot be determined
    """
    global g_mtuMap
    # get ip address
    # maybe backIP has no trust
    nodeName = g_context.hostMapping[node]
    if (g_context.cluster and
            nodeName in g_context.cluster.getClusterNodeNames()):
        addr = g_context.cluster.getDbNodeByName(nodeName).backIps[0]
        sshIp = g_context.cluster.getDbNodeByName(nodeName).sshIps[0]
    else:
        addr = node
        sshIp = node
    # get all network card information
    # the printf wrapper pads blank lines around the output so the
    # per-card split on '\n\n' below also isolates the first/last card
    cmd1 = """printf \"\n\n`/sbin/ifconfig -a`\n\n\" """
    if not g_opts.pwdMap:
        output = SharedFuncs.runSshCmd(cmd1, sshIp, g_context.user)
    else:
        username, passwd = g_opts.pwdMap[node]
        if username is None or passwd is None:
            raise CheckException("Retrive username and password error.")
        output = SharedFuncs.runSshCmdWithPwd(cmd1, sshIp, username, passwd)
    # Separate each network card
    networkInfoList = output.strip().split('\n\n')
    networkInfo = ""
    mtuValue = ""
    # find network card by IP
    for eachNet in networkInfoList:
        if eachNet.find(addr) > 0 and eachNet.find('inet') > 0:
            networkInfo = eachNet
            break
    if not networkInfo:
        raise CheckException(
            "Failed to get network card information with '%s'." % node)
    # get network number
    networkNum = networkInfo.split()[0]
    # Remove : if it exists
    if networkNum[-1] == ":":
        networkNum = networkNum[:-1]
    for eachLine in networkInfo.split('\n'):
        # get mtu Value with SuSE and redHat6.x
        if eachLine.find('MTU') > 0:
            mtuValue = eachLine.split(':')[1].split(' ')[0].strip()
            break
        # get mtu Value with redHat7.x
        elif eachLine.find('mtu') > 0:
            mtuValue = eachLine.split()[-1]
            break
        else:
            continue
    if not networkNum:
        raise CheckException(
            "Failed to get network card number with '%s'." % node)
    if not mtuValue:
        raise CheckException(
            "Failed to get network card mtu value with '%s' '%s'."
            % (node, networkNum))
    # The nodes are grouped by MTU value
    # NOTE(review): this function is run concurrently via ThreadPool in
    # preCheck and mutates the shared dict without a lock; safe for these
    # operations under the CPython GIL, but worth confirming.
    if not mtuValue in list(g_mtuMap.keys()):
        g_mtuMap[mtuValue] = ["%s-%s" % (node, networkNum)]
    else:
        g_mtuMap[mtuValue].append("%s-%s" % (node, networkNum))
def preCheck():
    """
    function: preCheck for different scene
    input : NA
    output: NA
    """
    # patch ssh config
    # Raise sshd MaxStartups to 1000 so the many concurrent ssh sessions
    # opened during a cluster-wide check are not throttled (root only).
    if __isRoot():
        cmd = "grep -E '^MaxStartups[\ \t]+1000' /etc/ssh/sshd_config"
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            # setting absent or different: rewrite it and reload sshd
            cmd = "sed -i '/MaxStartups/d' /etc/ssh/sshd_config &&" \
                  " echo 'MaxStartups 1000' >> /etc/ssh/sshd_config &&" \
                  " service sshd reload"
            SharedFuncs.runShellCmd(cmd)
    # MTU comparison is meaningless for distributed workers, local mode,
    # single-node installs, or when there is no node list at all
    if (g_opts.distributing or g_opts.localMode or
            g_opts.isSingle or not g_context.nodes):
        return
    # Check all node MTU value
    try:
        pool = ThreadPool(DefaultValue.getCpuSet())
        results = pool.map(getMTUValue, g_context.nodes)
        pool.close()
        pool.join()
    except Exception as e:
        raise Exception(str(e))
    # According to the number of groups to determine whether the same
    # (getMTUValue groups nodes by MTU into g_mtuMap)
    if len(list(g_mtuMap.keys())) > 1:
        warningMsg = "Warning: The MTU value is inconsistent on all node," \
                     " maybe checking will be slower or hang."
        for mtuValue in list(g_mtuMap.keys()):
            warningMsg += "\n%s: [%s]" % (
                mtuValue, ','.join(g_mtuMap[mtuValue]))
        __printOnScreen(warningMsg)
def analysisResult(item):
    """
    function: gather one item's per-node output files, analyse the
              merged content and store both the result object and its
              formatted text in the global g_itemResult map
    """
    global g_itemResult
    name = item['name']
    # one output file per node: <tmpPath>/<item>_<host>_<checkID>.out
    paths = ["%s/%s_%s_%s.out" % (g_context.tmpPath, name,
                                  g_context.hostMapping[node],
                                  g_context.checkID)
             for node in g_context.nodes]
    merged = ""
    for path in paths:
        merged += "".join(FileUtil.readFile(path))
    result = __analysisResult(merged, name)
    g_itemResult[name] = [result, result.formatOutput()]
def doCheck():
    """
    function: do check process.
              Local mode runs the items in-process (or re-runs itself
              with root privileges); cluster mode spawns one worker
              thread per node, polls each node's completion count until
              all nodes finish or g_endTime passes, pulls remote output
              files back, then analyses all results in a thread pool.
    input : NA
    output: NA
    raises: ThreadCheckException when a worker thread failed,
            TimeoutException when nodes are still pending at g_endTime
    """
    # Local mode
    if g_opts.localMode:
        if __isDistributing():
            # load check item dynamic and get the execute result
            doRunCheck()
        else:
            if not __hasRootItems() or __isRoot():
                # load check item dynamic and get the execute result
                doRunCheck()
            else:
                # check with root privileges
                doRootCheck()
    else:
        # watching the threads and response for Ctrl+C signal
        Watcher()
        threads = []
        __printOnScreen(
            "Start to health check for the cluster. Total Items:%s Nodes:%s"
            % (len(g_context.items), len(g_context.nodes)))
        for n in g_context.nodes:
            t = CheckThread("%s Thread" % n, doLocalCheck, n)
            threads.append(t)
        items = g_context.items
        itemCount = len(items)
        itemsName = [i['name'] for i in items]
        outputPath = g_context.tmpPath
        nodes = g_context.nodes[:]
        checkID = g_context.checkID
        # init progress display
        progress_manager = MultiProgressManager()
        progress_manager.put('Checking...', LineProgress(total=itemCount,
                                                         title='Checking...'))
        # fix the display format for progress bar
        newLine = '\n'
        print(newLine)
        # Check the number of completed nodes
        overNodes = 0
        # Time to hit the log
        LogCount = 0
        lastTimeProgress = -1
        while len(nodes) and datetime.now() <= g_endTime:
            totalCount = 0
            slowNode = []
            # Iterate over a snapshot: finished nodes are removed from
            # 'nodes' inside the loop, and removing from the list being
            # iterated would silently skip the next node this round.
            for node in list(nodes):
                # Get user and password
                username, passwd = __getUserAndPwd(node)
                if node in g_context.oldNodes:
                    itemCount_node = len(g_context.oldItems)
                else:
                    itemCount_node = len(g_context.newItems)
                # Local execution
                if SharedFuncs.is_local_node(node):
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node],
                        g_context.user, g_context.tmpPath)
                # Executed in new node scene
                elif node in g_context.newNodes:
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node], username,
                        g_context.tmpPath, passwd)
                else:
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node],
                        g_context.user, g_context.tmpPath)
                try:
                    checkCount = int(checkCount.strip())
                except Exception:
                    # unparsable remote answer: treat as no progress yet
                    checkCount = 0
                # If there is a node check completed,
                # some nodes just started, record slow node
                if overNodes > 0 and checkCount < 2:
                    slowNode.append(node)
                if checkCount == itemCount_node:
                    nodes.remove(node)
                    # Record the number of completed nodes
                    overNodes += 1
                    # pull the finished node's *.out files back locally
                    if not SharedFuncs.is_local_node(node):
                        if node in g_context.newNodes:
                            outItems = []
                            for i in itemsName:
                                outItems.append("%s/%s_%s_%s.out" % (
                                    outputPath, i,
                                    g_context.hostMapping[node],
                                    checkID))
                            SharedFuncs.receiveFile(outItems, node, username,
                                                    outputPath, passwd)
                        else:
                            fileName = "%s/*_%s_%s.out" % (
                                outputPath, g_context.hostMapping[node],
                                checkID)
                            # Delete Files
                            SharedFuncs.receiveFile(fileName, node,
                                                    g_context.user,
                                                    outputPath)
                else:
                    totalCount += checkCount
            # All nodes check the number of completed
            totalCount += itemCount * overNodes
            # Timed and counted
            time.sleep(1)
            LogCount += 1
            # Update execution progress
            progressInfo = totalCount // len(g_context.nodes)
            # Refresh only as the schedule changes
            if lastTimeProgress < progressInfo <= itemCount:
                progress_manager.update("Checking...", progressInfo)
                lastTimeProgress = progressInfo
            # Suggest the slow node to log every 30 seconds
            if slowNode and itemCount > 1 and LogCount % 30 == 0:
                logMsg = "Warning: The node [%s] check progress" \
                         " is slow." % ",".join(slowNode)
                g_logger.debug(logMsg)
            # surface worker-thread failures immediately
            for t in threads:
                if t.exitcode == 1:
                    raise ThreadCheckException(t.name, t.exception)
            for t in threads:
                t.join(1)
        if datetime.now() > g_endTime:
            raise TimeoutException(nodes)
        __printOnScreen("Start to analysis the check result")
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(analysisResult, g_context.items)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))
        for item in g_context.items:
            g_result.append(g_itemResult[item['name']][0])
            print(g_itemResult[item['name']][1])
        __printOnScreen("Analysis the check result successfully")
def doRunCheck():
    """
    function: load check item dynamic and get the execute result
    input : NA
    output: NA
    """
    outPath = g_context.tmpPath
    localNode = __getLocalNode(g_context.nodes)
    # new nodes run the new item set, everything else the old set
    if localNode in g_context.newNodes:
        pending = g_context.newItems
    else:
        pending = g_context.oldItems
    if g_context.hostMapping:
        localNode = g_context.hostMapping[localNode]
    for entry in pending:
        modPath = g_context.supportItems[entry['name']]
        checker = CheckItemFactory.createItem(entry['name'], modPath,
                                              entry['scope'],
                                              entry['analysis'])
        checker.runCheck(g_context, g_logger)
        # for local run get the content written by this check item
        outFile = "%s/%s_%s_%s.out" % (
            outPath, entry['name'], localNode, g_context.checkID)
        raw = "".join(FileUtil.readFile(outFile))
        g_result.append(__analysisResult(raw, entry['name']))
    # run the check process distributing and no need to clean the resource
    if __isDistributing():
        g_logger.debug("run check items done and exit the command")
    if g_opts.format == 'default' and not g_opts.nonPrinting:
        # Initialize the self.clusterInfo variable
        print(g_result.outputRaw())
def doRootCheck():
    """
    function: check with root privileges by re-invoking gs_check for the
              local node through SharedFuncs.runRootCmd and printing its
              output
    input : NA
    output: NA
    """
    # get local node
    host = __getLocalNode(g_context.nodes)
    # prepare the command for running check
    cmd = __prepareCmd(g_context.items, g_context.user, g_context.checkID, True)
    # run root cmd
    # NOTE(review): output is decoded below, so runRootCmd presumably
    # returns bytes here -- confirm against SharedFuncs.runRootCmd
    output = SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0],
                                    g_opts.pwdMap[host][1], g_context.mpprc)
    print(output.decode())
def __prepareCmd(items, user, checkid, print_output=False):
    """
    function: prepare the gs_check command line re-running the given
              items, optionally carrying user / check-id / routing
              parameters; '--non-print' is appended unless the caller
              wants the raw output printed
    """
    cmdPath = os.path.realpath(os.path.dirname(__file__))
    names = ",".join(i['name'] for i in items)
    userParam = " -U %s " % user if user else ""
    checkIdParam = " --cid=%s " % checkid if checkid else ""
    routingParam = ("--routing %s" % g_context.routing
                    if g_context.routing else "")
    printParam = "" if print_output else "--non-print"
    return "%s/gs_check -i %s %s %s -L %s -o %s -l %s %s" % (
        cmdPath, names, userParam, checkIdParam,
        routingParam, g_context.tmpPath, g_context.logFile, printParam)
def doLocalCheck(host):
    """
    function: running check on different threads, dispatching one node's
              check command over the right transport (local shell, local
              root shell, ssh, or ssh with password)
    input : host - the node to run on
    output: NA
    """
    # old nodes run under the cluster user; new nodes carry no user
    if host in g_context.oldNodes:
        cmd = __prepareCmd(g_context.oldItems, g_context.user,
                           g_context.checkID)
    else:
        cmd = __prepareCmd(g_context.newItems, "", g_context.checkID)
    rootNeeded = __hasRootItems()
    if SharedFuncs.is_local_node(host):
        if rootNeeded:
            SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0],
                                   g_opts.pwdMap[host][1], g_context.mpprc)
        else:
            SharedFuncs.runShellCmd(cmd, g_context.user, g_context.mpprc)
        return
    if not rootNeeded:
        SharedFuncs.runSshCmd(cmd, host, g_context.user, g_context.mpprc)
        return
    # get username and password for certain node
    username, passwd = __getUserAndPwd(host)
    if host in g_context.newNodes:
        SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd)
    else:
        SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd,
                                     g_context.mpprc)
def __analysisResult(output, itemName):
    """
    function: analysis the check result

    input : output - concatenated raw output of one item from all nodes
            itemName - name of the check item
    output: the ItemResult after the item's own postAnalysis step
    raises: CheckException when parsing fails or postAnalysis errors
    """
    item_result = ItemResult.parse(output)
    if not item_result:
        raise CheckException("analysis result occurs error")
    try:
        # load support item
        mod_path = g_context.supportItems[itemName]
        checker = CheckItemFactory.createFrom(itemName, mod_path, g_context)
        # analysis the item result got from each node
        item_result = checker.postAnalysis(item_result)
    except Exception as e:
        # chain the original exception so the real cause and traceback
        # are preserved for debugging; callers still see CheckException
        raise CheckException(str(e)) from e
    return item_result
def moveLogFile(host):
    """Fetch the remote gs_check log from *host* into tmpPath/log,
    appending the host name to the file name."""
    base = os.path.join(g_context.tmpPath, "log/gs_check.log")
    # strip the trailing '.log' and re-append it after the host suffix
    dest = "%s_%s.log" % (base[:-4], host)
    SharedFuncs.receiveFile(g_context.logFile, host, g_context.user, dest)
def formatOutput():
    """
    function: format and zip the result package: write the result file,
              gather local and remote logs under tmpPath/log, move all
              per-node *.out files under tmpPath/nodes, tar everything
              into the output path and print the summary
    input : NA
    output: NA
    """
    # distributed workers and empty results produce no report
    if g_opts.distributing or not g_result:
        return
    try:
        # output the result to a file
        resultFile = os.path.join(g_context.tmpPath,
                                  "CheckResult_%s" % g_context.checkID)
        FileUtil.createFile(resultFile, True)
        FileUtil.writeFile(resultFile, [g_result.outputResult()])
    except Exception as e:
        # best effort: a failed result file must not abort the report
        if os.path.exists(resultFile):
            FileUtil.removeFile(resultFile)
        g_logger.info("Warning! Generate check result output file failed.")
        g_logger.debug(str(e))
    if g_opts.localMode:
        return
    # export the check result to excel file in output folder,
    # only export excel for certain scene
    scene = '_' + g_opts.scene if g_opts.scene else ""
    # collect the log file from remote host
    tmpLog = os.path.join(g_context.tmpPath, "log/gs_check.log")
    # Get the log file (keep the original when a caller-supplied log
    # file or cluster run must not lose it; otherwise move it)
    if g_opts.logFile or g_opts.cluster:
        FileUtil.cpFile(g_context.logFile, tmpLog[:-4] + "_" +
                        NetUtil.GetHostIpOrName() + ".log")
    else:
        FileUtil.moveFile(g_context.logFile, tmpLog[:-4] + "_" +
                          NetUtil.GetHostIpOrName() + ".log")
    hosts = __getRemoteNodes(g_context.nodes)
    if hosts:
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(moveLogFile, hosts)
            pool.close()
            pool.join()
        except Exception as e:
            g_logger.info(
                "Warning! Retrieve log file from remote host failed.")
            g_logger.debug(str(e))
    # move the *.out file to nodes folder
    outputFolder = g_context.tmpPath
    checkID = g_context.checkID
    cmd = "cd %s; find . -name \'*%s.out\' -exec mv {} %s \;"\
          % (g_context.tmpPath, checkID, os.path.join(outputFolder, "nodes"))
    SharedFuncs.runShellCmd(cmd, g_context.user)
    # No check result is generated when the output is specified as /dev/null
    if g_context.outPath == ClusterConstants.DEV_NULL:
        print(g_result.outputStatistic())
        print("The inspection report has been cleared by /dev/null.")
        return
    tarFile = "%s/CheckReport%s_%s.tar.gz" %\
              (g_context.outPath, scene, g_context.checkID)
    # tar the output for this check
    # only include the pieces that actually exist on disk
    tarFiles = ''
    if (__checkFileExist(os.path.join(outputFolder, "nodes"),
                         '%s.out' % checkID)):
        tarFiles += ' nodes '
    if __checkFileExist(os.path.join(outputFolder, "log"), '.log'):
        tarFiles += ' log '
    if __checkFileExist(outputFolder, '%s.zip' % checkID):
        tarFiles += ' *%s.zip ' % checkID
    if __checkFileExist(outputFolder, 'CheckResult_%s' % checkID):
        tarFiles += ' CheckResult_%s ' % checkID
    tarcmd = "cd %s;tar -zcf %s %s 2>&1; chmod %s '%s'" \
             % (outputFolder, tarFile, tarFiles,
                DefaultValue.KEY_FILE_MODE, tarFile)
    SharedFuncs.runShellCmd(tarcmd, g_context.user)
    if g_opts.format == 'default':
        print(g_result.outputStatistic())
        print("For more information please refer to %s"
              % os.path.join(outputFolder, tarFile))
    if g_opts.format == 'json':
        print(g_result.outputJson())
def __checkFileExist(path, filePattern):
# Check the file exists
cmd = "cd %s; ls | grep '%s' | wc -l" % (path, filePattern)
(status, output) = subprocess.getstatusoutput(cmd)
if status == 0 and output != "0":
return True
else:
return False
def killChildProcess(node):
    """
    function: kill any leftover gs_check child processes on one node,
              identified by the '--cid=<checkID>' marker in their
              command line
    input : node - the node to clean up
    output: NA
    """
    checkID = g_context.checkID
    # cmd with switch users
    # (escaping differs from cmd_current because this variant passes
    # through an additional shell layer on the remote side)
    cmd_switch = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \
                 """|awk '{print \$2}'` """ % checkID
    cmd_switch += """ && (if [ X\"$proc_pid_list\" != X\"\" ]; """ \
                  """then echo \"$proc_pid_list\" | xargs kill -9 ; fi)"""
    # cmd with not switch users
    cmd_current = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \
                  """|awk "{print \\\$2}"` """ % checkID
    cmd_current += """ && (if [ X"$proc_pid_list" != X"" ]; then """ \
                   """echo "$proc_pid_list" | xargs kill -9 ; fi)"""
    username, passwd = __getUserAndPwd(node)
    # pick the transport matching how the check was launched on this node
    if SharedFuncs.is_local_node(node) and not __hasRootItems():
        SharedFuncs.runShellCmd(cmd_current)
    elif __hasRootItems():
        SharedFuncs.runSshCmdWithPwd(cmd_switch, node, username, passwd)
    else:
        SharedFuncs.runSshCmd(cmd_current, node, g_context.user)
def cleanTmpDir(node):
    """Remove the temporary check directory on one node, locally or
    over ssh for remote nodes."""
    removeCmd = r"rm -rf %s" % g_context.tmpPath
    if not SharedFuncs.is_local_node(node):
        SharedFuncs.runSshCmd(removeCmd, node, g_context.user)
    else:
        SharedFuncs.runShellCmd(removeCmd)
def cleanEnvironment(skiplog=False):
    """
    function: clean the environment: optionally kill leftover child
              processes on every node, then delete the temporary
              directory on all nodes
    input : skiplog - True when cleaning after an exception/interrupt
    output: NA
    """
    # nothing to do for distributed workers or an incomplete context
    if __isDistributing() or not g_context.tmpPath or not g_context.nodes:
        return
    # kill child process on all hosts when exception(skip log)
    if skiplog:
        try:
            workers = ThreadPool(DefaultValue.getCpuSet())
            workers.map(killChildProcess, g_context.nodes)
            workers.close()
            workers.join()
        except Exception as e:
            g_logger.info("Warning! Failed to kill child process.")
            g_logger.debug(str(e))
    # clean tmp files in all the nodes
    if g_opts.localMode:
        SharedFuncs.runShellCmd(r"rm -rf %s" % g_context.tmpPath)
    else:
        try:
            workers = ThreadPool(DefaultValue.getCpuSet())
            workers.map(cleanTmpDir, g_context.nodes)
            workers.close()
            workers.join()
        except Exception as e:
            g_logger.info("Warning! Failed to clear tmp directory.")
            g_logger.debug(str(e))
def setTimeOut():
    """
    function: set time out
    input : NA
    output: NA
    """
    global g_endTime
    # deadline for the whole check run: now + configured timeout seconds
    deadline = timedelta(seconds=g_opts.timeout)
    g_endTime = datetime.now() + deadline
if __name__ == '__main__':
    # main function
    try:
        # build global state, parse and validate the command line and
        # check context, then run the full check pipeline
        initGlobal()
        parseCommandLine()
        checkParameter()
        parseCheckContext()
        preCheck()
        dispatchCached()
        doCheck()
        formatOutput()
        cleanEnvironment()
    except (InterruptException, ThreadCheckException, TimeoutException) as e:
        g_logger.error(str(e))
        # clean the environment and child process when using Ctrl+C force or
        # except or timeout to exit the command
        cleanEnvironment(True)
        sys.exit(1)
    except Exception as e:
        # logger may not exist yet when the failure happened during
        # initialization; fall back to printing on stderr
        if not g_logger:
            sys.stdout = sys.stderr
            print(str(e))
        else:
            g_logger.error(str(e))
        cleanEnvironment()
        sys.exit(1)
    else:
        sys.exit(0)