1765 lines
63 KiB
Python
1765 lines
63 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding:utf-8 -*-
|
|
#############################################################################
|
|
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
|
#
|
|
# openGauss is licensed under Mulan PSL v2.
|
|
# You can use this software according to the terms
|
|
# and conditions of the Mulan PSL v2.
|
|
# You may obtain a copy of Mulan PSL v2 at:
|
|
#
|
|
# http://license.coscl.org.cn/MulanPSL2
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OF ANY KIND,
|
|
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
# See the Mulan PSL v2 for more details.
|
|
# ----------------------------------------------------------------------------
|
|
# Description : gs_check is a utility to check cluster and database status
|
|
#############################################################################
|
|
|
|
import subprocess
|
|
import os
|
|
import sys
|
|
import re
|
|
import getpass
|
|
import time
|
|
import pwd
|
|
import pickle
|
|
# Location of this script and of the bundled C libraries shipped with it.
package_path = os.path.dirname(os.path.realpath(__file__))
ld_path = package_path + "/gspylib/clib"
# Ensure the bundled libraries are first on LD_LIBRARY_PATH. The dynamic
# loader reads this variable only at process start, so after changing the
# environment the script re-executes itself with os.execve (which replaces
# the current process and does not return).
if 'LD_LIBRARY_PATH' not in os.environ:
    os.environ['LD_LIBRARY_PATH'] = ld_path
    os.execve(os.path.realpath(__file__), sys.argv, os.environ)
if not os.environ.get('LD_LIBRARY_PATH').startswith(ld_path):
    # LD_LIBRARY_PATH exists but does not start with our clib path:
    # prepend it and re-exec once more.
    os.environ['LD_LIBRARY_PATH'] = \
        ld_path + ":" + os.environ['LD_LIBRARY_PATH']
    os.execve(os.path.realpath(__file__), sys.argv, os.environ)
|
|
|
|
import xml.etree.cElementTree as ETree
|
|
from datetime import datetime, timedelta
|
|
from multiprocessing.dummy import Pool as ThreadPool
|
|
from gspylib.inspection.common.Exception import CheckException, \
|
|
UseBothParameterException, \
|
|
SceneNotFoundException, ParseItemException, \
|
|
NotEmptyException, \
|
|
NotExistException, InterruptException, ThreadCheckException, \
|
|
ContextDumpException, ContextLoadException, \
|
|
TimeoutException
|
|
from gspylib.common.Common import DefaultValue
|
|
from gspylib.common.ParameterParsecheck import Parameter
|
|
from gspylib.inspection.common import SharedFuncs
|
|
from gspylib.inspection.common.Log import LoggerFactory
|
|
from gspylib.inspection.common.TaskPool import Watcher, CheckThread
|
|
from gspylib.inspection.common.CheckResult import CheckResult, ItemResult
|
|
from gspylib.inspection.common.CheckItem import CheckItemFactory
|
|
from gspylib.inspection.common.ProgressBar import MultiProgressManager, \
|
|
LineProgress
|
|
from gspylib.common.DbClusterInfo import dbClusterInfo
|
|
from base_utils.os.env_util import EnvUtil
|
|
from base_utils.os.file_util import FileUtil
|
|
from base_utils.os.net_util import NetUtil
|
|
from domain_utils.domain_common.cluster_constants import ClusterConstants
|
|
|
|
|
|
#############################################################################
|
|
# Global variables
|
|
# g_opts: global option
|
|
# g_logger: global logger
|
|
# g_context :global context
|
|
# g_result : global result
|
|
# g_endTime : global endTime
|
|
# DIRECTORY_MODE: global directory mode
|
|
# MPPDB_VERSION_R5 : mppdb version
|
|
# DEFAULT_TIMEOUT : time out
|
|
#############################################################################
|
|
g_logger = None      # global logger, created in initLogFile()
g_opts = None        # global CmdOptions instance, filled by parseCommandLine()
g_context = None     # global CheckContext instance
g_result = None      # global CheckResult instance
g_endTime = None     # global end time of the check run
g_mtuMap = {}        # node -> MTU value cache
g_itemResult = {}    # item name -> ItemResult cache

# default timeout (seconds) for a scene check; --time-out may only raise it
DEFAULT_TIMEOUT = 1500
# single cluster will skip these items
# because single clusters don't need to perform consistency checks and
# internal communication class checks
SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer",
               "CheckNTPD", "CheckCpuCount",
               "CheckMemInfo", "CheckDiskConfig",
               "CheckUpVer", "CheckPgxcgroup", "CheckPing",
               "CheckNetWorkDrop", "CheckNetSpeed"]

# items that are check-only: they are skipped when --set asks for the
# abnormal items to be corrected automatically
SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD",
                "CheckSshdService", "CheckEtcHosts",
                "CheckCpuCount", "CheckHyperThread", "CheckMemInfo",
                "CheckKernelVer", "CheckEncoding", "CheckBootItems",
                "CheckDropCache", "CheckFilehandle", "CheckKeyProAdj",
                "CheckDiskFormat", "CheckInodeUsage", "CheckSpaceUsage",
                "CheckDiskConfig", "CheckXid", "CheckSysTabSize",
                "CheckClusterState", "CheckConfigFileDiff", "CheckUpVer",
                "CheckEnvProfile", "CheckGaussVer", "CheckPortRange",
                "CheckReadonlyMode", "CheckCatchup", "CheckProcessStatus",
                "CheckSpecialFile", "CheckCollector", "CheckLargeFile",
                "CheckProStartTime", "CheckMpprcFile", "CheckLockNum",
                "CheckCurConnCount", "CheckCursorNum", "CheckPgxcgroup",
                "CheckLockState", "CheckIdleSession", "CheckDBConnection",
                "CheckSysTable", "CheckSysTabSize", "CheckTableSpace",
                "CheckTableSkew", "CheckDNSkew", "CheckCreateView",
                "CheckHashIndex", "CheckNextvalInDefault", "CheckPgxcRedistb",
                "CheckReturnType", "CheckSysadminUser", "CheckTDDate",
                "CheckDropColumn", "CheckDiskFailure", "CheckPing",
                "CheckNetWorkDrop", "CheckUsedPort", "CheckNICModel",
                "CheckRouting", "CheckNetSpeed", "CheckDataDiskUsage"]
|
|
|
|
|
|
class CmdOptions():
    """
    Container for the command-line options of gs_check.

    All fields are plain attributes; parseCommandLine() fills them in
    from the parsed parameter dictionary.
    """

    def __init__(self):
        # --- identity / execution mode ---
        self.user = None
        self.localMode = False
        self.distributing = False
        self.isSingle = False
        # --- item selection ---
        self.scene = None
        self.items = None
        self.skipItems = []
        self.skipRootItems = False
        self.set = False
        # --- output control ---
        self.language = 'zh'
        self.format = 'default'
        self.outPath = None
        self.logFile = None
        self.nonPrinting = False
        # --- cluster / node information ---
        self.nodes = []
        self.cluster = None
        self.routing = None
        self.LCName = None
        self.ShrinkNodes = None
        # --- tuning / credentials ---
        self.timeout = DEFAULT_TIMEOUT
        self.thresholdDn = None
        self.pwdMap = {}
|
|
|
|
|
|
class CheckContext():
    """
    Execution context shared by one check run.

    The context carries everything a check needs (user, cluster, nodes,
    items, paths ...) and can serialize itself to a cache file so that
    local and remote executions of the same check share one state.
    """

    def __init__(self):
        """
        Constructor: set every context field to its default value.
        """
        # base directory of the inspection framework
        self.basePath = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'gspylib',
            'inspection')
        self.user = None
        self.set = None
        self.log = None
        self.postAnalysis = False
        # check item name -> item script path (see loadSupportItems)
        self.supportItems = {}
        # scene name -> scene xml path (see loadSupportScene)
        self.supportScenes = {}
        self.items = []
        self.rootItems = []
        self.cluster = None
        self.nodes = []
        self.mpprc = None
        # unique identifier of this check run (date + seconds-of-day + pid)
        self.checkID = self.genCheckID()
        self.thresholdDn = None
        self.outPath = os.path.join(self.basePath, "output")
        self.logFile = None
        self.tmpPath = None
        # node ip/name -> real hostname (see getMapping)
        self.hostMapping = None
        self.routing = None
        self.skipSetItem = []
        self.oldNodes = []
        self.newNodes = []
        self.oldItems = []
        self.newItems = []
        self.LCName = None
        self.ShrinkNodes = None

    def genCheckID(self):
        '''
        function : generate the check ID which is unique for once checking
        input : NA
        output : checkID string (YYYYMMDD + seconds of day + pid)
        '''
        t = time.localtime(time.time())
        dateString = time.strftime("%Y%m%d", t)
        seconds = timedelta(hours=t.tm_hour, minutes=t.tm_min,
                            seconds=t.tm_sec).seconds
        pidString = str(os.getpid())
        return dateString + str(seconds) + pidString

    def setCheckID(self, checkID):
        '''
        function : set check id (used by distributed workers via --cid)
        input : checkID
        output : NA
        '''
        self.checkID = checkID

    def getCacheFile(self):
        '''
        function : build the path of the serialized context cache file
        input : NA
        output : path string
        '''
        return "%s/context_%s.cache" % (self.tmpPath, self.checkID)

    def checkMPPDBVersion(self):
        '''
        function : check mppdb version
        input : NA
        output : version string shaped like VxxxRxxxCxx
        '''
        cmd = "gsql -V"
        output = SharedFuncs.runShellCmd(cmd, self.user, self.mpprc)
        # NOTE(review): search() returns None when the pattern is absent,
        # which would raise AttributeError here -- original behavior kept.
        return re.compile(r'V[0-9]{3}R[0-9]{3}C[0-9]{2}').search(
            output).group()

    def loadClusterInfo(self, user=None):
        '''
        function : load cluster info from static config file
        input : user (defaults to self.user)
        output : dbClusterInfo instance, or None on any failure
        '''
        u = user if user is not None else self.user
        if u is None:
            return None
        try:
            clusterInfo = dbClusterInfo()
            clusterInfo.initFromStaticConfig(u)
            return clusterInfo
        except Exception:
            # best effort: an unreadable static config means "no cluster"
            return None

    def loadSupportItems(self):
        '''
        function : load support items by scanning the disk files
        input : NA
        output : NA
        raises : NotEmptyException when no item script is found
        '''
        itemPath = "%s/items/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(itemPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                # item scripts are named Check*.py
                if (fileName.find("Check") == 0 and suffix == ".py"):
                    self.supportItems[fileName] = os.path.join(dirpath, f)
        if (not self.supportItems):
            raise NotEmptyException("support items")

    def loadSupportScene(self):
        '''
        function : load support scene by scanning the scene
                   configuration files in config folder
        input : NA
        output : NA
        raises : NotEmptyException when no scene file is found
        '''
        configPath = "%s/config/" % self.basePath
        for (dirpath, dirnames, filenames) in os.walk(configPath):
            for f in filenames:
                (fileName, suffix) = os.path.splitext(f)
                # scene files are named scene_<name>.xml
                if (fileName.find("scene_") == 0 and suffix == ".xml"):
                    self.supportScenes[fileName[6:]] = \
                        os.path.join(dirpath, f)
        if (not self.supportScenes):
            raise NotEmptyException("support scenes")

    def loadSceneConfiguration(self, scene):
        '''
        function : load certain scene configuration in xml file; every tag
                   under <configuration> becomes an attribute of the context
        input : scene name
        output : NA
        raises : SceneNotFoundException when the scene xml does not exist
        '''
        configFile = "%s/config/scene_%s.xml" % (self.basePath, scene)
        if not os.path.isfile(configFile):
            raise SceneNotFoundException(scene, self.supportScenes)
        rootNode = ETree.parse(configFile).getroot()
        configElem = rootNode.find('configuration')
        if configElem is not None:
            for elem in list(configElem):
                setattr(self, elem.tag, elem.text.strip())

    def isCached(self):
        '''
        function : whether the context was serialized to disk
        input : NA
        output : boolean
        '''
        return os.path.isfile(self.getCacheFile())

    def clean(self):
        '''
        function : clean the cache file
        input : NA
        output : NA
        '''
        cmd = "rm -rf %s" % self.getCacheFile()
        SharedFuncs.runShellCmd(cmd)

    def dump(self):
        '''
        function : serialize the check context to disk
        input : NA
        output : NA
        raises : ContextDumpException on any failure
        '''
        self.clean()
        try:
            # context manager closes the handle even when pickling fails
            # (the original left the file object unclosed)
            with open(self.getCacheFile(), "wb") as fp:
                pickle.dump(self, fp, True)
            SharedFuncs.chmodFile(self.getCacheFile(),
                                  DefaultValue.KEY_FILE_MODE)
        except Exception as e:
            raise ContextDumpException(e)

    def load(self, fileName=None):
        '''
        function : load the check context from disk
        input : path of the context file (defaults to the cache file)
        output : CheckContext instance, or None when no cache exists
        raises : ContextLoadException on any failure
        '''
        f = fileName if fileName is not None else self.getCacheFile()
        result = None
        if self.isCached():
            try:
                # context manager closes the handle even on error
                with open(f, "rb") as fp:
                    result = pickle.load(fp)
            except Exception as e:
                raise ContextLoadException(e)
        return result

    def getNodeName(self, host):
        '''
        function : resolve the real hostname of one node and record it
                   into self.hostMapping
        input : host ip or name
        output : NA
        '''
        if "HOST_IP" in list(os.environ.keys()):
            cmd = "echo $HOST_IP"
        else:
            cmd = "hostname"
        if SharedFuncs.is_local_node(host):
            output = SharedFuncs.runShellCmd(cmd)
        else:
            output = SharedFuncs.runSshCmd(cmd, host, self.user)
        # keep only the last non-empty line of the command output
        hostname = output.strip().split('\n')[-1].strip()
        self.hostMapping[host] = hostname

    def getMapping(self):
        '''
        function : get the ip to hostname mapping with all host
        input : NA
        output : NA
        '''
        self.hostMapping = {}
        if not self.nodes:
            return
        try:
            # resolve all nodes concurrently with a thread pool
            pool = ThreadPool(DefaultValue.getCpuSet())
            pool.map(self.getNodeName, self.nodes)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))

    def sendTmpFile(self, host):
        '''
        function : create the tmp directory on one remote host and copy
                   the context cache file into it
        input : host
        output : NA
        '''
        cmd = "if [ ! -d %s ]; then mkdir %s -p -m %s;fi" % (
            self.tmpPath, self.tmpPath, DefaultValue.KEY_DIRECTORY_MODE)
        SharedFuncs.runSshCmd(cmd, host, self.user)
        SharedFuncs.sendFile(self.getCacheFile(), host, self.user,
                             self.tmpPath)

    def dispatch(self, hosts):
        '''
        function : send the serialized context file to remote host
        input : remote host list
        output : NA
        raises : CheckException when the cache file is missing
        '''
        if len(hosts) == 0 or g_opts.isSingle:
            return
        fileName = self.getCacheFile()
        if not os.path.isfile(fileName):
            raise CheckException("File %s is not exist or invalid" % fileName)
        try:
            # copy the cache file to all hosts concurrently
            pool = ThreadPool(DefaultValue.getCpuSet())
            pool.map(self.sendTmpFile, hosts)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))
|
|
|
|
|
|
#############################################################################
|
|
# Parse and check parameters
|
|
#############################################################################
|
|
def usage():
    """
    gs_check is a utility to check the health status of a cluster.

    Usage:
      gs_check -? | --help
    Example:
      gs_check -i ITEM [...] [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
               [--skip-root-items] [--set] [--routing]
      gs_check -e SCENE_NAME [-U USER] [-L] [-l LOGFILE] [-o OUTPUTDIR]
               [--skip-root-items] [--set] [--time-out=SECS]
               [--routing] [--skip-items]

    General options:
      -i                           Health check item number.
                                   OLAP Example: -i CheckCPU,CheckMTU,
                                   CheckPing.
      -e                           Health check scene name.
                                   OLAP Example: -e inspect/upgrade/slow_node/
                                   binary_upgrade/health/install/longtime
      -U                           Cluster user.
      -L                           Run the command as local mode.
      -l                           Path of log file.
      -o                           Save the result to the specified directory.
      --cid                        The check ID used for identify a check
                                   process, only for internal use.
      --skip-root-items            Skip the items with root privileges.
      --disk-threshold             Set disk threshold for checking disk usage,
                                   only for CheckDataDiskUsage.
      --format                     Set the format of the result report.
      --set                        Set abnormal items if supported
      --time-out                   Set the timeout for scene check, default
                                   1500 seconds.
      --routing                    The network segment with business ip,
                                   example: 192.168.1.1:255.255.255.0
      --skip-items                 Skip the specified check item or setting
                                   item with scene check
                                   Example: --skip-items CheckCPU,CheckMTU
      --non-print                  Do not print output result.
      -?, --help                   Show help information for this utility,
                                   and exit the command line mode.
      -V, --version                Show version information.
    """
    # The docstring above IS the runtime help text (printed verbatim),
    # so any wording change here changes the program's output.
    print(usage.__doc__)
|
|
|
|
|
|
def version():
    '''
    function : print the version of the check tool on stdout
    input : NA
    output: NA
    '''
    versionString = SharedFuncs.getVersion()
    print(versionString)
|
|
|
|
|
|
#########################################################
|
|
# Init global log
|
|
#########################################################
|
|
def initGlobal():
    """
    function: initialize the global variable
    input : NA
    output: NA
    """
    global g_opts, g_context, g_result
    # fresh option, context and result holders for this run
    g_opts, g_context, g_result = CmdOptions(), CheckContext(), CheckResult()
|
|
|
|
|
|
def parseCommandLine():
    """
    function: Parse command line and save to global variable
    input : NA
    output: NA
    raises: UseBothParameterException when mutually exclusive options are
            combined; CheckException on invalid --time-out / --format
    """
    global g_opts
    g_opts = CmdOptions()
    paraObj = Parameter()
    paraDict = paraObj.ParameterCommandLine("check")
    if "helpFlag" in paraDict:
        usage()
        sys.exit(0)

    # command line parameter group definition for gs_check:
    # each key conflicts with its value
    irrelevantPara = {"scenes": "itemstr", "time_out": "itemstr",
                      "skipItems": "itemstr",
                      "cid": "scenes", "nodegroup_name": "scenes",
                      "shrinkNodes": "scenes"}
    # internal parameter name -> option name shown in error messages
    paraNameMap = {"itemstr": "i", "scenes": "e", "time_out": "-time-out",
                   "skipItems": "-skip-items",
                   "cid": "-cid", "nodegroup_name": "-nodegroup-name",
                   "shrinkNodes": "-ShrinkNodes"}
    # supported report formats
    formatList = ['default', 'json']

    # position parameter can not be set at the same time
    for para, conflict in irrelevantPara.items():
        if para in paraDict and conflict in paraDict:
            raise UseBothParameterException(
                (paraNameMap[para], paraNameMap[conflict]))

    if "itemstr" in paraDict:
        g_opts.items = paraDict["itemstr"]
    if "scenes" in paraDict:
        g_opts.scene = paraDict["scenes"]
    if "outFile" in paraDict:
        g_context.outPath = paraDict["outFile"]
    if "logFile" in paraDict:
        g_opts.logFile = paraDict["logFile"]
    if "user" in paraDict:
        g_context.user = paraDict["user"]
    if "hostfile" in paraDict:
        for node in FileUtil.readFile(paraDict["hostfile"]):
            g_opts.nodes.append(node.strip())
    if "cid" in paraDict:
        # an internal check id means this process is a distributed worker
        g_context.setCheckID(paraDict["cid"])
        g_opts.distributing = True
    if "localMode" in paraDict:
        g_opts.localMode = True
    if "skipRootItems" in paraDict:
        g_opts.skipRootItems = True
    if "disk-threshold" in paraDict:
        g_context.thresholdDn = paraDict["disk-threshold"]
    if "set" in paraDict:
        g_context.set = True
    if "routing" in paraDict:
        g_opts.routing = paraDict["routing"]
    if "skipItems" in paraDict:
        g_opts.skipItems = paraDict["skipItems"]
    if "nodegroup_name" in paraDict:
        g_context.LCName = paraDict["nodegroup_name"]
    if "shrinkNodes" in paraDict:
        g_context.ShrinkNodes = paraDict["shrinkNodes"]
    if "time_out" in paraDict:
        try:
            g_opts.timeout = int(paraDict["time_out"])
        except (ValueError, TypeError):
            raise CheckException("The parameter timeout set invalid value")
        # --time-out may only increase the default timeout
        if g_opts.timeout < DEFAULT_TIMEOUT:
            raise CheckException(
                "The timeout parameter must be set larger than default "
                "value 1500 seconds")
        setTimeOut()
    if "format" in paraDict:
        g_opts.format = paraDict["format"]
        if g_opts.format not in formatList:
            raise CheckException(
                "Format %s is not available,the valid format is %s" % (
                    g_opts.format, ",".join(formatList)))
    if "nonPrinting" in paraDict:
        g_opts.nonPrinting = True
|
|
|
|
def checkParameter():
    """
    function: validate the parsed command line options and prepare the
              cluster user and output directory
    input : NA
    output: NA
    raises: CheckException on invalid combinations
    """
    ##########################################################
    if g_opts.nodes:
        raise CheckException("The --hosts parameter is not available")
    # root may only run in local or distributing mode
    if __isRoot() and not __isDistributing():
        if not g_opts.localMode:
            raise CheckException(
                "The command must be running with cluster user")
    ########################################################
    # Get the -U parameter
    ########################################################
    checkuser()

    # NOTE(review): parseCommandLine stores -o into g_context.outPath,
    # while this condition reads g_opts.outPath (initialized to None in
    # CmdOptions) -- as written this branch appears unreachable unless
    # g_opts.outPath is set elsewhere; confirm against callers.
    if (g_opts.outPath and not g_opts.localMode):
        ########################################################
        # create output path
        ########################################################
        createPath(g_opts.outPath, g_context.user)
|
|
|
|
|
|
def checkuser():
    """
    function: validate the cluster user (-U) and, unless running in local
              or distributing mode, verify ssh trust between all cluster
              nodes for that user
    input : NA
    output: NA
    raises: CheckException on any invalid user or broken trust
    """
    # Running as root and not in local mode (the new node scenario):
    # no -U parameter is needed
    if __isRoot() and not g_opts.localMode:
        g_context.user = None
        return
    # Default mode -U for the current user
    if not __isRoot() and not g_context.user:
        g_context.user = SharedFuncs.getCurrentUser()
    if g_context.user:
        # a non-root caller may only check as himself
        if not __isRoot() and g_context.user != SharedFuncs.getCurrentUser():
            raise CheckException(
                "The user %s is not current user" % g_context.user)
        try:
            user_uid = pwd.getpwnam(g_context.user).pw_uid
        except Exception:
            raise CheckException(
                "The user %s is not a effective user." % g_context.user)
        if user_uid == 0:
            raise CheckException("The -U parameter can not be the root user.")
        isClusterUser = SharedFuncs.checkClusterUser(g_context.user,
                                                     __getMpprcFile())
        if isClusterUser:
            # get cluster information
            g_context.mpprc = __getMpprcFile()
            clusterInfo = g_context.loadClusterInfo(g_context.user)
            if clusterInfo:
                g_opts.cluster = clusterInfo
            else:
                # looks like a cluster user but the static configuration
                # could not be loaded
                isClusterUser = False
        if not isClusterUser:
            raise CheckException(
                "The user %s is not valid cluster user" % g_context.user)
    if g_opts.localMode or g_opts.distributing:
        return

    # Check cluster user trust; a single-node cluster that is the local
    # host needs no trust check
    dbNameList = g_opts.cluster.getClusterNodeNames()
    if (len(dbNameList) == 1 and
            dbNameList[0] == NetUtil.GetHostIpOrName()):
        return
    appPath = EnvUtil.getEnv('GPHOME', g_opts.cluster.appPath)
    psshPath = os.path.join(appPath, 'script/gspylib/pssh/bin/pssh')
    cmd = "%s -H %s 'id' " % (psshPath, " -H ".join(dbNameList))
    (status, output) = subprocess.getstatusoutput(cmd)
    if status != 0:
        errorNode = []
        for result in output.split('\n'):
            if result.strip() == "":
                continue
            resultInfo = result.split()
            # Analyze the results
            if len(resultInfo) > 3 and resultInfo[2] == "[SUCCESS]":
                continue
            elif (len(resultInfo) > 3 and resultInfo[2] == "[FAILURE]" and
                    resultInfo[3] in dbNameList):
                errorNode.append(resultInfo[3])
            else:
                # fixed: message used a literal "/n" instead of a newline
                raise CheckException(
                    "Failed to check user trust. commands: %s Error:\n%s"
                    % (cmd, output))
        if errorNode:
            raise CheckException(
                "Failed to check user trust with %s" % errorNode)
        else:
            # fixed: message used a literal "/n" instead of a newline
            raise CheckException(
                "Failed to check user trust. Error:\n%s" % output)
|
|
|
|
|
|
def createPath(path, user=""):
    """
    function: make sure *path* exists as a writable directory, creating
              it when missing; chown it to *user* when running as root
    input : path, optional owner user
    output: NA
    raises: CheckException when path is a file or not writable
    """
    # /dev/null needs no preparation at all
    if path == ClusterConstants.DEV_NULL:
        return
    if os.path.isfile(path):
        raise CheckException("The out path [%s] must be a directory." % path)
    if os.path.isdir(path):
        # existing directory: verify we may create and delete files in it
        if not FileUtil.checkDirWriteable(path):
            raise CheckException(
                "Failed to create or delete file in the [%s]." % path)
    else:
        # missing: recursively create the directory tree
        FileUtil.createDirectory(path, True, DefaultValue.KEY_DIRECTORY_MODE)
    # hand the directory over to the cluster user when running as root
    if __isRoot() and user:
        FileUtil.changeOwner(user, path)
|
|
|
|
|
|
def getTmpPath():
    """
    function: Create (if needed) and return the temporary directory of
              this check run, including its log/ and nodes/ subfolders.
    input : NA
    output: String -- path of the temporary directory
    """
    tmpDir = os.path.join("/tmp", "check_%s" % g_context.checkID)
    for directory in (tmpDir,
                      os.path.join(tmpDir, "log"),
                      os.path.join(tmpDir, "nodes")):
        createPath(directory, g_context.user)
    return tmpDir
|
|
|
|
|
|
def initLogFile():
    """
    function: Initialize the global logger and the check context: reuse a
              cached context when one exists, otherwise build a fresh one
              from the parsed command line options.
    input : NA
    output: NA
    """
    global g_context, g_logger
    # load the context when the script ruuning on local mode and the context
    # was cached before
    g_context.tmpPath = getTmpPath()
    if g_context.isCached():
        # rebind g_context to the deserialized instance from the cache file
        g_context = g_context.load()
        # a node that is "new" has no cluster environment yet, so drop the
        # user/cluster information loaded from the cache
        if __getLocalNode(g_context.nodes) in g_context.newNodes:
            g_context.mpprc = None
            g_context.user = None
            g_context.cluster = None
        (g_logger, logFile) = LoggerFactory.getLogger('gs_check',
                                                      g_context.logFile,
                                                      g_context.user)
        g_context.log = g_logger.debug
        g_logger.debug("Load check context from cache file")
    else:
        # Parameter specified first, followed by default GAUSSLOG,
        # last temporary directory
        if g_opts.logFile:
            g_context.logFile = os.path.realpath(g_opts.logFile)
        elif g_opts.cluster:
            g_context.logFile = os.path.join(g_opts.cluster.logPath,
                                             '%s/om/gs_check.log'
                                             % g_context.user)
        else:
            g_context.logFile = os.path.join(g_context.tmpPath,
                                             'log/gs_check.log')
        (g_logger, g_context.logFile) = LoggerFactory.getLogger(
            'gs_check', g_context.logFile, g_context.user)
        # clean the cache files for reentry the command
        g_context.clean()
        # set mpprc file
        g_context.mpprc = __getMpprcFile()
        # Load support scene by parsing project folder
        g_context.loadSupportScene()
        # Load support check items by parsing the project folder
        g_context.loadSupportItems()
        # load the scene configuration
        if g_opts.scene:
            g_context.loadSceneConfiguration(g_opts.scene)
        # load cluster info
        if g_opts.cluster:
            g_context.cluster = g_opts.cluster
            g_context.oldNodes = g_opts.cluster.getClusterSshIps()[0]
        # load nodes: anything not already in the cluster is a "new" node
        if g_opts.nodes:
            for node in g_opts.nodes:
                if node not in g_context.oldNodes:
                    g_context.newNodes.append(node)
        g_context.nodes = g_context.oldNodes + g_context.newNodes
|
|
|
|
|
|
def getRootUserPwd():
    """
    function: interactively collect a root-privileged user name and its
              password, verify the password against every node, and store
              the credentials in g_opts.pwdMap.
    input : NA
    output: NA
    raises: CheckException when the entered user is not uid 0
    """
    # ask user input root password interactive when in new node scene or
    # contains items with root permission
    if __hasRootItems() and not __isRoot():
        rootItems = [i['name'] for i in g_context.rootItems]
        __printOnScreen(
            "The below items require root privileges to execute:[%s]"
            % " ".join(rootItems))
        # empty input defaults to "root"
        rootuser = input("Please enter root privileges user[root]:")\
                   or "root"
        rootpwd = getpass.getpass("Please enter password for user[%s]:"
                                  % rootuser)
        g_logger.debug("Ask user input password interactive")
        for host in g_context.nodes:
            isPwdOk = SharedFuncs.verifyPasswd(host, rootuser, rootpwd)
            if not isPwdOk:
                # try to connect remote node again (up to 3 prompts)
                rootpwd = __retryConnection(host, rootuser)
            g_opts.pwdMap[host] = (rootuser, rootpwd)
        # the entered user must really be uid 0 on the local system
        if pwd.getpwnam(rootuser).pw_uid != 0:
            raise CheckException("Enter the user [%s] does not have"
                                 " root privileges." % rootuser)
        # print message on screen
        __printOnScreen("Check root password connection successfully")
|
|
|
|
|
|
def parseCheckContext():
    """
    function: Parse check context and initialize all the context value:
              collect items from the scene or -i parameter, filter them
              (skip list, root items, single-cluster skips), confirm
              destructive settings with the user and persist the context.
    input : NA
    output: NA
    raises: ParseItemException / CheckException on invalid items
    """
    global g_context
    initLogFile()
    # a cached context means everything below already happened
    if g_context.isCached():
        return
    g_logger.debug("Start to parse the check items config file")
    items_all = []
    items_oldNode = []
    items_newNode = []
    failedItems = []
    singleSkipList = []
    # generate the items from scene configuration
    if g_opts.scene:
        items_oldNode, failedItems = __parseScene(g_opts.scene)
        items_all += items_oldNode
    # generate the items from -i parameter value
    elif (g_opts.items):
        for i in g_opts.items:
            item = __parseOneItem(i)
            if (not item):
                failedItems.append(i)
            else:
                items_all.append(item)
    # iterate a copy (items_all[:]) so removals do not disturb iteration
    for item in items_all[:]:
        if not g_context.set and item['name'] in g_opts.skipItems:
            items_all.remove(item)
            continue
        if g_context.set and item['set_permission'] == 'root':
            g_context.rootItems.append(item)
        if g_opts.skipRootItems and item['permission'] == 'root':
            items_all.remove(item)
            continue
        if item['permission'] == 'root':
            g_context.rootItems.append(item)
        # single clusters skip consistency/communication items
        if g_opts.isSingle and item['name'] in SINGLE_SKIP:
            singleSkipList.append(item['name'])
            continue
        # CheckRouting needs a routing value: --routing wins, otherwise
        # derive it from the local back ip of the cluster
        if item['name'] == "CheckRouting":
            if g_opts.routing:
                g_context.routing = g_opts.routing
            elif g_opts.cluster:
                workIP = g_opts.cluster.getDbNodeByName(
                    NetUtil.GetHostIpOrName()).backIps[0]
                g_context.routing = "%s:%s" % (
                    workIP, SharedFuncs.getMaskByIP(workIP))
            else:
                raise CheckException(
                    "The --routing is required when cluster dosen't exist")
        g_context.items.append(item)
    if len(singleSkipList) != 0:
        __printOnScreen(
            "The following items are skipped when the type of cluster is"
            " single:\n[%s]" % ",".join(singleSkipList))
    # NOTE(review): items_newNode is never populated in this function, so
    # the else branch below appears dead as written -- confirm whether a
    # new-node item split was intended here.
    if not items_newNode:
        g_context.oldItems = g_context.items
    else:
        g_context.oldItems = items_oldNode
        g_context.newItems = items_newNode
    if g_context.set and items_all:
        # Settings will have a big impact and need to be confirmed
        confirmItem = {
            "CheckCrontabLeft": "Clear om_monitor in crond service",
            "CheckDirLeft": "Delete all file in '/opt/huawei/Bigdata/',"
                            "'/var/log/Bigdata/','/home/omm/'",
            "CheckProcessLeft": "Kill all process with gaussdb and omm user",
            "CheckOmmUserExist": "Delete system user omm",
            "CheckPortConflict": "kill all process with occupies "
                                 "the 25xxx port"
        }
        confirmMsg = ""
        for item in items_all:
            if item['name'] in list(confirmItem.keys()):
                confirmMsg += confirmItem[item['name']] + "\n"
            if item['name'] in SETITEM_SKIP:
                g_context.skipSetItem.append(item['name'])

        if confirmMsg:
            confirmMsg = "Warning: Executing the settings will do " \
                         "the following at the [%s] node:\n" % \
                         ','.join(g_context.newNodes) + confirmMsg
            __printOnScreen(confirmMsg)
            flag = input("Execution settings? (Y/N):")
            while True:
                # If it is not yes or all, it has been imported
                if not flag.upper() in ("Y", "N", "YES", "NO"):
                    flag = input("Please type 'yes' or 'no': ")
                    continue
                break
            if flag.upper() in ("Y", "YES"):
                pass
            # declining moves every destructive item into the skip list
            if flag.upper() in ("N", "NO"):
                for Item in items_all:
                    if Item['name'] in list(confirmItem.keys()):
                        g_context.skipSetItem.append(Item['name'])
                __printOnScreen(
                    'Skip the settings for [%s]'
                    % ','.join(g_context.skipSetItem))
    if failedItems:
        raise ParseItemException(failedItems)
    if not g_context.items:
        raise CheckException("No check item can be performed,"
                             " please confirm the input parameters.")

    # print message on screen
    __printOnScreen("Parsing the check items config file successfully")
    getRootUserPwd()
    g_context.getMapping()
    g_context.dump()
|
|
|
|
|
|
def dispatchCached():
    """
    function: send the cached context file to every remote node, unless
              running in local, distributing or single mode
    input : NA
    output: NA
    """
    skipDispatch = (g_opts.localMode or g_opts.distributing or
                    g_opts.isSingle)
    if skipDispatch:
        return
    g_logger.debug("Start to distributing the check context dump file")
    g_context.dispatch(__getRemoteNodes(g_context.nodes))
    # confirm the distribution to the operator
    __printOnScreen(
        "Distribute the context file to remote hosts successfully")
|
|
|
|
|
|
def __printOnScreen(msg):
    """
    function: log *msg* at info level, except when running in local or
              distributing mode (workers stay silent)
    """
    if not (g_opts.localMode or g_opts.distributing):
        g_logger.info(msg)
|
|
|
|
|
|
def __isRoot():
    """
    function: whether the current process runs as root (uid 0)
    """
    uid = os.getuid()
    return uid == 0
|
|
|
|
|
|
def __hasRootItems():
    """
    function: whether any selected item requires root privileges
    """
    # rootItems may be None or an empty list; both mean "no root items"
    return bool(g_context.rootItems)
|
|
|
|
|
|
def __isDistributing():
    """
    function: whether this process is a distributed worker (--cid given)
    """
    return bool(g_opts.distributing)
|
|
|
|
def __getLocalNode(nodes):
    """
    function: return the entry of *nodes* that designates the local host,
              falling back to the local host name/ip when none matches
    """
    for candidate in nodes or []:
        if SharedFuncs.is_local_node(candidate):
            return candidate
    return NetUtil.GetHostIpOrName()
|
|
|
|
|
|
def __getSeparatedValue(value, separator=","):
    '''
    function: split a command line value on *separator*
    input : value string, optional separator (default ",")
    output: list of substrings; a value without the separator yields a
            single-element list
    '''
    # str.split already returns [value] when the separator is absent,
    # so the former explicit membership check was redundant
    return value.split(separator)
|
|
|
|
|
|
def __getNodesFromFile(fileName):
    """
    function: read node names from a host file, one per line, ignoring
              blanks, comments ('#') and duplicates
    input : fileName
    output: list of node names in file order
    """
    nodes = []
    try:
        with open(fileName, 'r') as fp:
            for raw in fp:
                entry = raw.strip()
                if not entry or entry.startswith('#') or entry in nodes:
                    continue
                nodes.append(entry)
    except Exception as e:
        raise Exception(str(e))
    return nodes
|
|
|
|
|
|
def __retryConnection(host, user):
    """
    function: re-prompt for the password of *user* on *host* and verify
              it, up to three attempts
    input : host, user
    output: the verified password
    raises: CheckException after three failed attempts
    """
    maxAttempts = 3
    for _ in range(maxAttempts):
        passwd = getpass.getpass(
            "Please enter password for user[%s] on the node[%s]:"
            % (user, host))
        if SharedFuncs.verifyPasswd(host, user, passwd):
            return passwd
    raise CheckException(
        "Verify password failed for user[%s] on the node[%s]" % (user, host))
|
|
|
|
|
|
def __getMpprcFile():
    """
    function: get separated environment variables

    Resolution order:
      1. MPPDB_ENV_SEPARATE_PATH when it points to an existing file;
      2. non-root user with GAUSS_ENV set: the current user's ~/.bashrc;
      3. root with a configured cluster user: that user's ~/.bashrc;
      4. root without a cluster user: "" (no separated mpprc file);
      otherwise a CheckException is raised.
    """
    # get mpprc file
    envValue = EnvUtil.getEnv("MPPDB_ENV_SEPARATE_PATH")
    if envValue is not None and os.path.isfile(envValue):
        return envValue
    elif not __isRoot() and EnvUtil.getEnv('GAUSS_ENV'):
        # resolve the current user's home directory through the shell
        cmd = "echo ~ 2>/dev/null"
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Fetching user environment variable file failed."
                " Please setup environment variables." + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot() and g_context.user:
        # resolve the cluster user's home directory by switching users
        cmd = "su - %s -c 'echo ~ 2>/dev/null'" % g_context.user
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise CheckException(
                "Failed to get user [%s] home directory. Error: %s\n" % (
                    g_context.user, output) + "The cmd is %s" % cmd)
        else:
            return os.path.join(output, ".bashrc")
    elif __isRoot():
        return ""
    else:
        raise CheckException("The separated mpprc file was not found."
                             " Please setup environment variables")
|
|
|
|
|
|
def __getUserAndPwd(node):
    """
    function: get username and password for certain node

    Stored per-node credentials are only needed when root items must
    run without root privileges; otherwise the cluster user is used
    without a password.
    """
    needStoredPwd = __hasRootItems() and not __isRoot()
    if needStoredPwd:
        entry = g_opts.pwdMap[node]
        return (entry[0], entry[1])
    return (g_context.user, None)
|
|
|
|
|
|
def __getRemoteNodes(hosts):
    '''
    function: get the remote host ignore the local host
    '''
    remoteHosts = []
    for host in hosts:
        if SharedFuncs.is_local_node(host):
            continue
        remoteHosts.append(host)
    return remoteHosts
|
|
|
|
|
|
def __parseScene(sceneName):
    '''
    function: parse scene configure file

    Resolves config/scene_<name>.xml, expands allowed items and
    categories, removes denied items and returns a tuple of
    (parsed items, names of items that failed to parse).

    :raise NotEmptyException:      when sceneName is empty
    :raise SceneNotFoundException: when the scene xml does not exist
    :raise NotExistException:      when an allowed item is unsupported
    '''
    if not sceneName:
        raise NotEmptyException("scene name")
    # Get scene xml
    xmlFile = "%s/config/scene_%s.xml" % (g_context.basePath, sceneName)
    if not os.path.isfile(xmlFile):
        raise SceneNotFoundException(sceneName, g_context.supportScenes)

    domTree = ETree.parse(xmlFile)
    rootNode = domTree.getroot()

    itemNames = []
    thresholds = {}

    # parse items from allow items
    for elem in rootNode.findall('allowitems/item'):
        elemName = elem.attrib['name']
        # check the check item whether exist or not
        if elemName not in list(g_context.supportItems.keys()):
            # report the offending item name itself (the old code passed
            # the literal string "elemName")
            raise NotExistException(elemName, "support items")
        # save threshold as text and parse them later
        subElem = elem.find('threshold')
        if subElem is not None:
            thresholds[elemName] = subElem.text.strip()
        itemNames.append(elemName)

    # parse categories and get all items
    for category in rootNode.findall('allowcategories/category'):
        cpath = "%s/items/%s" % (g_context.basePath, category.attrib['name'])
        if os.path.isdir(cpath):
            itemNames.extend(x[:-3] for x in os.listdir(cpath) if
                             x[:-3] not in itemNames and x.endswith(".py"))

    # parse deny items
    for elem in rootNode.findall('denyitems/item'):
        elemName = elem.attrib['name']
        if elemName in itemNames:
            itemNames.remove(elemName)

    items = []
    failedItems = []
    for i in itemNames:
        item = __parseOneItem(i)
        if not item:
            # record the failure and do NOT append the empty dict to the
            # runnable item list (the old code appended it anyway, and the
            # threshold merge below would then raise KeyError)
            failedItems.append(i)
            continue

        # overwrite the threshold parameters
        if thresholds and i in list(thresholds.keys()):
            # parse the threshold of check item
            sceneThreshold = __parseThreshold(thresholds[i])
            # an item may have no default threshold key at all, so use
            # get() instead of direct indexing
            if item.get('threshold'):
                item['threshold'] = dict(item['threshold'], **sceneThreshold)
            else:
                item['threshold'] = sceneThreshold
        items.append(item)
    return (items, failedItems)
|
|
|
|
|
|
def __parseOneItem(itemName):
    '''
    function: parse one check item and get the full information

    Scans config/items.xml for the <checkitem> whose name attribute
    matches `itemName` and returns a dict with its id, name, localized
    texts, properties and (when present) threshold. Returns an empty
    dict when the item is not found.

    :param itemName: name of the check item
    :raise NotEmptyException: when itemName is empty
    '''
    if not itemName:
        raise NotEmptyException("Item name")
    item = {}
    # try to load check item configuration from xml file
    xmlFile = "%s/config/items.xml" % g_context.basePath
    # iterparse streams the xml so the whole file is never held in memory
    for event, elem in ETree.iterparse(xmlFile):
        if event == 'end':
            if elem.tag == 'checkitem' and elem.attrib['name'] == itemName:
                # Parse the xml file
                item['id'] = elem.attrib['id']
                item['name'] = elem.attrib['name']

                # localized display texts (Chinese/English)
                item['title_zh'] = __parseAttr(elem, "title", "zh")
                item['title_en'] = __parseAttr(elem, "title", "en")
                item['suggestion_zh'] = __parseAttr(elem, "suggestion", "zh")
                item['suggestion_en'] = __parseAttr(elem, "suggestion", "en")
                item['standard_zh'] = __parseAttr(elem, "standard", "zh")
                item['standard_en'] = __parseAttr(elem, "standard", "en")
                # properties with fallback defaults
                item['category'] = __parseProperty(elem, 'category', 'other')
                item['permission'] = __parseProperty(elem, 'permission',
                                                     'user')
                item['set_permission'] = __parseProperty(elem,
                                                         'set_permission',
                                                         'user')
                item['scope'] = __parseProperty(elem, 'scope', 'all')
                item['analysis'] = __parseProperty(elem, 'analysis',
                                                   'default')
                # Get the threshold
                threshold = elem.find('threshold')
                if threshold is not None and threshold.text is not None:
                    # parse the threshold of check item
                    item["threshold"] = __parseThreshold(
                        threshold.text.strip())
                break
    return item
|
|
|
|
|
|
def __parseAttr(elem, attr, language='zh'):
    '''
    function: parse the xml attr with language

    Looks up the child element "<attr>/<language>" and returns its
    stripped text, or "" when the element or its text is missing.

    :param elem:     parent xml Element
    :param attr:     child element name, e.g. "title"
    :param language: language sub-element, "zh" or "en"
    :return: str (the old .encode('utf-8') made the found branch return
             bytes while the fallback returned str — inconsistent types)
    '''
    val = elem.find('/'.join([attr, language]))
    if val is not None and val.text is not None:
        return val.text.strip()
    return ""
|
|
|
|
|
|
def __parseProperty(elem, propertyName, defaultValue):
    '''
    function: parse the property of check item

    Returns the stripped text of the child element `propertyName`,
    falling back to `defaultValue` when the element is absent or empty.
    '''
    prop = elem.find(propertyName)
    if prop is None or prop.text is None:
        return defaultValue
    return prop.text.strip()
|
|
|
|
|
|
def __parseThreshold(value, separator=";"):
    '''
    function: parse the threshold of check item

    Parses "key=value" pairs separated by `separator` into a dict.
    Returns {} when no "=" is present. Segments without "=" are skipped
    (the old code raised IndexError on inputs such as "a=1;b"), and a
    value may itself contain "=" (only the first one splits the pair).
    '''
    result = {}
    if "=" not in value:
        return result
    for pair in value.strip().split(separator):
        pair = pair.strip()
        # skip empty or malformed segments instead of crashing
        if "=" not in pair:
            continue
        # split on the first '=' only, so values may contain '='
        key, _, val = pair.partition("=")
        result[key] = val
    return result
|
|
|
|
|
|
def getMTUValue(node):
    """
    function: collect the MTU value of the network card that carries
    `node`'s back IP and group the node into the global g_mtuMap,
    keyed by MTU value ("node-cardname" entries).

    :param node: node identifier (back IP or host name)
    :raise CheckException: when the network card, its name or its MTU
                           value cannot be determined
    """
    global g_mtuMap
    # get ip address
    # maybe backIP has no trust
    nodeName = g_context.hostMapping[node]
    if (g_context.cluster and
            nodeName in g_context.cluster.getClusterNodeNames()):
        addr = g_context.cluster.getDbNodeByName(nodeName).backIps[0]
        sshIp = g_context.cluster.getDbNodeByName(nodeName).sshIps[0]
    else:
        addr = node
        sshIp = node
    # get all network card information
    cmd1 = """printf \"\n\n`/sbin/ifconfig -a`\n\n\" """
    if not g_opts.pwdMap:
        output = SharedFuncs.runSshCmd(cmd1, sshIp, g_context.user)
    else:
        # stored credentials are required when a password map exists
        username, passwd = g_opts.pwdMap[node]
        if username is None or passwd is None:
            raise CheckException("Retrive username and password error.")
        output = SharedFuncs.runSshCmdWithPwd(cmd1, sshIp, username, passwd)
    # Separate each network card
    networkInfoList = output.strip().split('\n\n')

    networkInfo = ""
    mtuValue = ""
    # find network card by IP
    for eachNet in networkInfoList:
        if eachNet.find(addr) > 0 and eachNet.find('inet') > 0:
            networkInfo = eachNet
            break
    if not networkInfo:
        raise CheckException(
            "Failed to get network card information with '%s'." % node)
    # get network number
    networkNum = networkInfo.split()[0]
    # Remove : if it exists
    if networkNum[-1] == ":":
        networkNum = networkNum[:-1]
    for eachLine in networkInfo.split('\n'):
        # get mtu Value with SuSE and redHat6.x
        if eachLine.find('MTU') > 0:
            mtuValue = eachLine.split(':')[1].split(' ')[0].strip()
            break
        # get mtu Value with redHat7.x
        elif eachLine.find('mtu') > 0:
            mtuValue = eachLine.split()[-1]
            break
        else:
            continue
    if not networkNum:
        raise CheckException(
            "Failed to get network card number with '%s'." % node)
    if not mtuValue:
        raise CheckException(
            "Failed to get network card mtu value with '%s' '%s'."
            % (node, networkNum))
    # The nodes are grouped by MTU value
    if not mtuValue in list(g_mtuMap.keys()):
        g_mtuMap[mtuValue] = ["%s-%s" % (node, networkNum)]
    else:
        g_mtuMap[mtuValue].append("%s-%s" % (node, networkNum))
|
|
|
|
|
|
def preCheck():
    """
    function: preCheck for different scene
    input : NA
    output: NA

    When running as root, raises sshd's MaxStartups to 1000 so many
    parallel ssh sessions are accepted. For multi-node runs, gathers
    every node's MTU value in parallel and warns when they differ.
    """
    # patch ssh config
    if __isRoot():
        cmd = "grep -E '^MaxStartups[\ \t]+1000' /etc/ssh/sshd_config"
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            # MaxStartups not yet set to 1000: rewrite it and reload sshd
            cmd = "sed -i '/MaxStartups/d' /etc/ssh/sshd_config &&" \
                  " echo 'MaxStartups 1000' >> /etc/ssh/sshd_config &&" \
                  " service sshd reload"
            SharedFuncs.runShellCmd(cmd)

    # the MTU comparison only makes sense for a multi-node check
    if (g_opts.distributing or g_opts.localMode or
            g_opts.isSingle or not g_context.nodes):
        return
    # Check all node MTU value
    try:
        pool = ThreadPool(DefaultValue.getCpuSet())
        results = pool.map(getMTUValue, g_context.nodes)
        pool.close()
        pool.join()
    except Exception as e:
        raise Exception(str(e))
    # According to the number of groups to determine whether the same
    if len(list(g_mtuMap.keys())) > 1:
        warningMsg = "Warning: The MTU value is inconsistent on all node," \
                     " maybe checking will be slower or hang."
        for mtuValue in list(g_mtuMap.keys()):
            warningMsg += "\n%s: [%s]" % (
                mtuValue, ','.join(g_mtuMap[mtuValue]))
        __printOnScreen(warningMsg)
|
|
|
|
|
|
def analysisResult(item):
    """
    function: concatenate the per-node .out files of one check item,
    analyse them and cache the result in the global g_itemResult map.
    """
    global g_itemResult
    outputPath = g_context.tmpPath
    checkID = g_context.checkID
    itemName = item['name']
    # concatenate every node's output file for this item
    content = ""
    for node in g_context.nodes:
        outFile = "%s/%s_%s_%s.out" % (
            outputPath, itemName, g_context.hostMapping[node], checkID)
        content += "".join(FileUtil.readFile(outFile))
    itemResult = __analysisResult(content, itemName)
    g_itemResult[itemName] = [itemResult, itemResult.formatOutput()]
|
|
|
|
|
|
def doCheck():
    """
    function: do check process
    input : NA
    output: NA

    Local mode runs the items in-process (possibly via a root shell);
    otherwise one thread per node drives the remote checks while this
    function polls completion, updates the progress bar and finally
    analyses the collected results.
    """
    # Local mode
    if g_opts.localMode:
        if __isDistributing():
            # load check item dynamic and get the execute result
            doRunCheck()
        else:
            if not __hasRootItems() or __isRoot():
                # load check item dynamic and get the execute result
                doRunCheck()
            else:
                # check with root privileges
                doRootCheck()
    else:
        # watching the threads and response for Ctrl+C signal
        Watcher()
        threads = []
        __printOnScreen(
            "Start to health check for the cluster. Total Items:%s Nodes:%s"
            % (len(g_context.items), len(g_context.nodes)))
        for n in g_context.nodes:
            t = CheckThread("%s Thread" % n, doLocalCheck, n)
            threads.append(t)

        items = g_context.items
        itemCount = len(items)
        itemsName = [i['name'] for i in items]
        outputPath = g_context.tmpPath
        nodes = g_context.nodes[:]
        checkID = g_context.checkID
        # init progress display
        progress_manager = MultiProgressManager()
        progress_manager.put('Checking...', LineProgress(total=itemCount,
                                                         title='Checking...'))
        # fix the display format for progress bar
        newLine = '\n'
        print(newLine)
        # Check the number of completed nodes
        overNodes = 0
        # Time to hit the log
        LogCount = 0
        lastTimeProgress = -1
        while len(nodes) and datetime.now() <= g_endTime:
            totleCount = 0
            slowNode = []
            # BUGFIX: iterate over a snapshot — nodes.remove(node) below
            # would otherwise skip the element following each removed node
            for node in nodes[:]:
                # Get user and password
                username, passwd = __getUserAndPwd(node)
                if node in g_context.oldNodes:
                    itemCount_node = len(g_context.oldItems)
                else:
                    itemCount_node = len(g_context.newItems)
                # Local execution
                if SharedFuncs.is_local_node(node):
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node],
                        g_context.user, g_context.tmpPath)
                # Executed in new node scene
                elif node in g_context.newNodes:
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node], username,
                        g_context.tmpPath, passwd)
                else:
                    checkCount = SharedFuncs.checkComplete(
                        checkID, node, g_context.hostMapping[node],
                        g_context.user, g_context.tmpPath)
                try:
                    checkCount = int(checkCount.strip())
                except Exception:
                    # a non-numeric reply means no items finished yet
                    checkCount = 0
                # If there is a node check completed,
                # some nodes just started,record slow node
                if overNodes > 0 and checkCount < 2:
                    slowNode.append(node)
                if checkCount == itemCount_node:
                    nodes.remove(node)
                    # Record the number of completed nodes
                    overNodes += 1
                    if not SharedFuncs.is_local_node(node):
                        if node in g_context.newNodes:
                            outItems = []
                            for i in itemsName:
                                outItems.append("%s/%s_%s_%s.out" % (
                                    outputPath, i,
                                    g_context.hostMapping[node],
                                    checkID))
                            SharedFuncs.receiveFile(outItems, node, username,
                                                    outputPath, passwd)
                        else:
                            fileName = "%s/*_%s_%s.out" % (
                                outputPath, g_context.hostMapping[node],
                                checkID)
                            # Delete Files
                            SharedFuncs.receiveFile(fileName, node,
                                                    g_context.user,
                                                    outputPath)
                else:
                    totleCount += checkCount
            # All nodes check the number of completed
            totleCount += itemCount * overNodes

            # Timed and counted
            time.sleep(1)
            LogCount += 1
            # Update execution progress
            progressInfo = totleCount // len(g_context.nodes)
            # Refresh only as the schedule changes
            if lastTimeProgress < progressInfo <= itemCount:
                progress_manager.update("Checking...", progressInfo)
                lastTimeProgress = progressInfo
            # Suggest the slow node to log every 30 seconds
            if slowNode and itemCount > 1 and LogCount % 30 == 0:
                logMsg = "Warning: The node [%s] check progress" \
                         " is slow." % ",".join(slowNode)
                g_logger.debug(logMsg)

        for t in threads:
            if t.exitcode == 1:
                raise ThreadCheckException(t.name, t.exception)

        for t in threads:
            t.join(1)

        if datetime.now() > g_endTime:
            raise TimeoutException(nodes)

        __printOnScreen("Start to analysis the check result")
        try:
            # analyse every item's combined output in parallel
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(analysisResult, g_context.items)
            pool.close()
            pool.join()
        except Exception as e:
            raise Exception(str(e))
        for item in g_context.items:
            g_result.append(g_itemResult[item['name']][0])
            print(g_itemResult[item['name']][1])

        __printOnScreen("Analysis the check result successfully")
|
|
|
|
|
|
def doRunCheck():
    """
    function: load check item dynamic and get the execute result
    input : NA
    output: NA
    """
    outputPath = g_context.tmpPath
    localHost = __getLocalNode(g_context.nodes)
    # nodes listed as "new" run the new-item set, others the old set
    if localHost in g_context.newNodes:
        items = g_context.newItems
    else:
        items = g_context.oldItems
    if g_context.hostMapping:
        localHost = g_context.hostMapping[localHost]
    for item in items:
        content = ""
        modPath = g_context.supportItems[item['name']]
        # instantiate the checker for this item and execute it locally
        checker = CheckItemFactory.createItem(item['name'], modPath,
                                              item['scope'], item['analysis'])
        checker.runCheck(g_context, g_logger)

        # for local run get the content
        fileName = "%s/%s_%s_%s.out" % (
            outputPath, item['name'], localHost, g_context.checkID)

        content += "".join(FileUtil.readFile(fileName))
        itemResult = __analysisResult(content, item['name'])
        g_result.append(itemResult)
    # run the check process distributing and no need to clean the resource
    if __isDistributing():
        g_logger.debug("run check items done and exit the command")
        if g_opts.format == 'default' and not g_opts.nonPrinting:
            # Initialize the self.clusterInfo variable
            print(g_result.outputRaw())
|
|
|
|
|
|
def doRootCheck():
    """
    function: check with root privileges
    input : NA
    output: NA

    Re-invokes gs_check locally through a root shell using the stored
    credentials of the local node, then prints the child's output.
    """
    # get local node
    host = __getLocalNode(g_context.nodes)
    # prepare the command for running check
    cmd = __prepareCmd(g_context.items, g_context.user, g_context.checkID, True)
    # run root cmd
    output = SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0],
                                    g_opts.pwdMap[host][1], g_context.mpprc)
    # output is decoded before printing; runRootCmd presumably returns
    # bytes — TODO confirm against SharedFuncs
    print(output.decode())
|
|
|
|
|
|
def __prepareCmd(items, user, checkid, print_output=False):
    """
    function: prepare the command for running check

    Builds the gs_check command line that re-runs the given items with
    the current context's paths, optionally suppressing output.
    """
    cmdPath = os.path.realpath(os.path.dirname(__file__))
    itemsName = [i['name'] for i in items]
    # optional fragments stay empty unless their value is set
    printParam = "" if print_output else "--non-print"
    userParam = " -U %s " % user if user else ""
    checkIdParam = " --cid=%s " % checkid if checkid else ""
    if g_context.routing:
        routingParam = "--routing %s" % g_context.routing
    else:
        routingParam = ""
    cmd = "%s/gs_check -i %s %s %s -L %s -o %s -l %s %s" % (
        cmdPath, ",".join(itemsName), userParam, checkIdParam,
        routingParam, g_context.tmpPath, g_context.logFile, printParam)
    return cmd
|
|
|
|
|
|
def doLocalCheck(host):
    """
    function: running check on different threads
    input : NA
    output: NA
    """
    # prepare the command for running check
    if host in g_context.oldNodes:
        cmd = __prepareCmd(g_context.oldItems, g_context.user,
                           g_context.checkID)
    else:
        # new nodes run the new-item set without a user parameter
        cmd = __prepareCmd(g_context.newItems, "", g_context.checkID)
    if SharedFuncs.is_local_node(host):
        if __hasRootItems():
            # root items: run through a root shell with stored credentials
            SharedFuncs.runRootCmd(cmd, g_opts.pwdMap[host][0],
                                   g_opts.pwdMap[host][1], g_context.mpprc)
        else:
            SharedFuncs.runShellCmd(cmd, g_context.user, g_context.mpprc)
    else:
        if not __hasRootItems():
            SharedFuncs.runSshCmd(cmd, host, g_context.user, g_context.mpprc)
        else:
            # get username and password for certain node
            username, passwd = __getUserAndPwd(host)
            if host in g_context.newNodes:
                # new nodes: run without sourcing the mpprc file
                SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd)
            else:
                SharedFuncs.runSshCmdWithPwd(cmd, host, username, passwd,
                                             g_context.mpprc)
|
|
|
|
|
|
def __analysisResult(output, itemName):
    """
    function: analysis the check result

    :param output:   concatenated raw output of the item on all nodes
    :param itemName: name of the check item
    :return: the ItemResult after the item's postAnalysis hook ran
    :raise CheckException: when parsing or post-analysis fails
    """
    item_result = ItemResult.parse(output)
    if not item_result:
        raise CheckException("analysis result occurs error")
    try:
        # load support item
        mod_path = g_context.supportItems[itemName]
        checker = CheckItemFactory.createFrom(itemName, mod_path, g_context)
        # analysis the item result got from each node
        item_result = checker.postAnalysis(item_result)
    except Exception as e:
        raise CheckException(str(e))
    return item_result
|
|
|
|
|
|
def moveLogFile(host):
    """Fetch `host`'s gs_check log and store it with a per-host suffix."""
    tmpLog = os.path.join(g_context.tmpPath, "log/gs_check.log")
    destLog = tmpLog[:-4] + "_" + host + ".log"
    SharedFuncs.receiveFile(g_context.logFile, host, g_context.user, destLog)
|
|
|
|
|
|
def formatOutput():
    """
    function: format and zip the result package
    input : NA
    output: NA

    Writes the result file, collects local and remote logs, gathers the
    per-node .out files and tars everything into a CheckReport archive
    (unless the output path is /dev/null).
    """
    # nothing to do on a distributing worker or when no result exists
    if g_opts.distributing or not g_result:
        return

    try:
        # output the result to a file
        resultFile = os.path.join(g_context.tmpPath,
                                  "CheckResult_%s" % g_context.checkID)
        FileUtil.createFile(resultFile, True)
        FileUtil.writeFile(resultFile, [g_result.outputResult()])
    except Exception as e:
        # best effort: a failed result file must not abort the check
        if os.path.exists(resultFile):
            FileUtil.removeFile(resultFile)
        g_logger.info("Warning! Generate check result output file failed.")
        g_logger.debug(str(e))

    if g_opts.localMode:
        return

    # export the check result to excel file in output folder,
    # only export excel for certain scene
    scene = '_' + g_opts.scene if g_opts.scene else ""

    # collect the log file from remote host
    tmpLog = os.path.join(g_context.tmpPath, "log/gs_check.log")
    # Get the log file
    if g_opts.logFile or g_opts.cluster:
        FileUtil.cpFile(g_context.logFile, tmpLog[:-4] + "_" +
                        NetUtil.GetHostIpOrName() + ".log")
    else:
        FileUtil.moveFile(g_context.logFile, tmpLog[:-4] + "_" +
                          NetUtil.GetHostIpOrName() + ".log")
    hosts = __getRemoteNodes(g_context.nodes)
    if hosts:
        try:
            # pull each remote node's log in parallel
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(moveLogFile, hosts)
            pool.close()
            pool.join()
        except Exception as e:
            g_logger.info(
                "Warning! Retrieve log file from remote host failed.")
            g_logger.debug(str(e))

    # move the *.out file to nodes folder
    outputFolder = g_context.tmpPath
    checkID = g_context.checkID
    cmd = "cd %s; find . -name \'*%s.out\' -exec mv {} %s \;"\
          % (g_context.tmpPath, checkID, os.path.join(outputFolder, "nodes"))
    SharedFuncs.runShellCmd(cmd, g_context.user)

    # No check result is generated when the output is specified as /dev/null
    if g_context.outPath == ClusterConstants.DEV_NULL:
        print(g_result.outputStatistic())
        print("The inspection report has been cleared by /dev/null.")
        return
    tarFile = "%s/CheckReport%s_%s.tar.gz" %\
              (g_context.outPath, scene, g_context.checkID)
    # tar the output for this check
    tarFiles = ''
    if (__checkFileExist(os.path.join(outputFolder, "nodes"),
                         '%s.out' % checkID)):
        tarFiles += ' nodes '
    if __checkFileExist(os.path.join(outputFolder, "log"), '.log'):
        tarFiles += ' log '
    if __checkFileExist(outputFolder, '%s.zip' % checkID):
        tarFiles += ' *%s.zip ' % checkID
    if __checkFileExist(outputFolder, 'CheckResult_%s' % checkID):
        tarFiles += ' CheckResult_%s ' % checkID
    tarcmd = "cd %s;tar -zcf %s %s 2>&1; chmod %s '%s'" \
             % (outputFolder, tarFile, tarFiles,
                DefaultValue.KEY_FILE_MODE, tarFile)
    SharedFuncs.runShellCmd(tarcmd, g_context.user)

    if g_opts.format == 'default':
        print(g_result.outputStatistic())
        print("For more information please refer to %s"
              % os.path.join(outputFolder, tarFile))

    if g_opts.format == 'json':
        print(g_result.outputJson())
|
|
|
|
|
|
def __checkFileExist(path, filePattern):
    """
    function: report whether directory `path` contains an entry whose
    name matches `filePattern` (grep basic-regex semantics).
    """
    # Check the file exists
    cmd = "cd %s; ls | grep '%s' | wc -l" % (path, filePattern)
    (status, output) = subprocess.getstatusoutput(cmd)
    # a successful pipeline with a non-zero count means "exists"
    return status == 0 and output != "0"
|
|
|
|
|
|
def killChildProcess(node):
    """
    function: kill any check processes still running on `node`,
    identified by the 'cid=<checkID>' marker in their command line.
    """
    checkID = g_context.checkID
    # cmd with switch users
    # (escaping differs between the two variants because one is passed
    # through an extra shell layer when switching users)
    cmd_switch = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \
                 """|awk '{print \$2}'` """ % checkID
    cmd_switch += """ && (if [ X\"$proc_pid_list\" != X\"\" ]; """ \
                  """then echo \"$proc_pid_list\" | xargs kill -9 ; fi)"""
    # cmd with not switch users
    cmd_current = """proc_pid_list=`ps -ef | grep 'cid=%s'| grep -v 'grep'""" \
                  """|awk "{print \\\$2}"` """ % checkID
    cmd_current += """ && (if [ X"$proc_pid_list" != X"" ]; then """ \
                   """echo "$proc_pid_list" | xargs kill -9 ; fi)"""

    username, passwd = __getUserAndPwd(node)
    if SharedFuncs.is_local_node(node) and not __hasRootItems():
        SharedFuncs.runShellCmd(cmd_current)
    elif __hasRootItems():
        SharedFuncs.runSshCmdWithPwd(cmd_switch, node, username, passwd)
    else:
        SharedFuncs.runSshCmd(cmd_current, node, g_context.user)
|
|
|
|
|
|
def cleanTmpDir(node):
    """Remove the temporary working directory on one node."""
    cmd = r"rm -rf %s" % g_context.tmpPath
    if SharedFuncs.is_local_node(node):
        SharedFuncs.runShellCmd(cmd)
        return
    SharedFuncs.runSshCmd(cmd, node, g_context.user)
|
|
|
|
|
|
def cleanEnvironment(skiplog=False):
    """
    function: clean the environment
    input : NA
    output: NA

    :param skiplog: True when invoked from an abnormal exit path; child
                    check processes are killed first, best-effort
    """
    # a distributing worker must not clean shared state
    if __isDistributing():
        return
    if not g_context.tmpPath:
        return
    if not g_context.nodes:
        return

    # kill child process on all hosts when exception(skip log)
    if skiplog:
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(killChildProcess, g_context.nodes)
            pool.close()
            pool.join()
        except Exception as e:
            # best effort: a failed kill must not mask the original error
            g_logger.info("Warning! Failed to kill child process.")
            g_logger.debug(str(e))

    # clean tmp files in all the nodes
    cmd = r"rm -rf %s" % g_context.tmpPath
    if g_opts.localMode:
        SharedFuncs.runShellCmd(cmd)
    else:
        try:
            pool = ThreadPool(DefaultValue.getCpuSet())
            results = pool.map(cleanTmpDir, g_context.nodes)
            pool.close()
            pool.join()
        except Exception as e:
            g_logger.info("Warning! Failed to clear tmp directory.")
            g_logger.debug(str(e))
|
|
|
|
|
|
def setTimeOut():
    """
    function: set time out
    input : NA
    output: NA
    """
    global g_endTime
    # deadline = now + user-specified timeout in seconds
    deadline = datetime.now() + timedelta(seconds=g_opts.timeout)
    g_endTime = deadline
|
|
|
|
|
|
if __name__ == '__main__':
    # main function: run the full check pipeline, exit 0 on success,
    # 1 on any failure after cleaning the environment
    try:
        initGlobal()
        parseCommandLine()
        checkParameter()
        parseCheckContext()
        preCheck()
        dispatchCached()
        doCheck()
        formatOutput()
        cleanEnvironment()
    except (InterruptException, ThreadCheckException, TimeoutException) as e:
        g_logger.error(str(e))
        # clean the environment and child process when using Ctrl+C force or
        # except or timeout to exit the command
        cleanEnvironment(True)
        sys.exit(1)
    except Exception as e:
        # the logger may not exist yet if initialization itself failed
        if not g_logger:
            sys.stdout = sys.stderr
            print(str(e))
        else:
            g_logger.error(str(e))
        cleanEnvironment()
        sys.exit(1)
    else:
        sys.exit(0)
|