# coding: UTF-8
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
#          http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------

import sys
import time
import subprocess
import base64
import json

sys.path.append(sys.path[0] + "/../../../")
from gspylib.common.Common import DefaultValue
from gspylib.common.OMCommand import OMCommand
from gspylib.common.ErrorCode import ErrorCode
from gspylib.os.gsfile import g_file
from gspylib.os.gsplatform import g_Platform
from gspylib.os.gsOSlib import g_OSlib
from impl.collect.CollectImpl import CollectImpl


class CollectImplOLAP(CollectImpl):
    """
    The class is used to collect log files.
    """

    def __init__(self, collectObj):
        """
        function: constructor
        input : collectObj
        output: NA
        """
        self.jobInfo = {}
        self.nodeJobInfo = {}
        super(CollectImplOLAP, self).__init__(collectObj)

    def parseConfigFile(self):
        """
        function: parse the configuration file
        input : NA
        output: NA
        """
        try:
            # Init the cluster information
            self.context.initClusterInfoFromStaticFile(self.context.user)
            self.context.appPath = self.context.clusterInfo.appPath

            # Obtain the cluster installation directory owner and group
            (self.context.user, self.context.group) = g_OSlib.getPathOwner(
                self.context.appPath)
            if self.context.user == "" or self.context.group == "":
                self.context.logger.logExit(ErrorCode.GAUSS_503["GAUSS_50308"])

            # Match the corresponding node
            for nodename in self.context.nodeName:
                if not self.context.clusterInfo.getDbNodeByName(nodename):
                    self.context.logger.logExit(
                        ErrorCode.GAUSS_516["GAUSS_51619"] % nodename)

            if len(self.context.nodeName) == 0:
                self.context.nodeName = \
                    self.context.clusterInfo.getClusterNodeNames()

            self.context.initSshTool(self.context.nodeName,
                                     DefaultValue.TIMEOUT_PSSH_COLLECTOR)
            if (len(self.context.nodeName) == 1 and
                    self.context.nodeName[0] ==
                    DefaultValue.GetHostIpOrName()):
                self.context.localMode = True
        except Exception as e:
            raise Exception(str(e))
        self.context.logger.log("Successfully parsed the configuration file.")

    # Single and double quotes are stripped when the parameter is passed
    # in from outside, so '#' is used as a stand-in for double quotes.
    def formatJsonString(self, check):
        """
        function: format a json string
        input : string
        output: json string
        """
        if self.context.isSingle or self.context.localMode:
            return "\'" + json.dumps(check).replace("\"", "#") + "\'"
        else:
            return "\'" \
                   + json.dumps(check).replace("$", "\$").replace("\"", "#") \
                   + "\'"

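    # Illustrative example (not in the original source): in local mode,
    #   formatJsonString({"TypeName": "Config"})
    # returns the shell argument '{#TypeName#: #Config#}', and the local
    # collector script is expected to map '#' back to '"' before json.loads().
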
    def checkTmpDir(self):
        """
        function: check the tmp dir; create it if it does not exist
        input : TmpDirFromEnv
        output: NA
        """
        try:
            # Create the temporary directory
            tmpDir = DefaultValue.getTmpDirFromEnv()
            cmd = "(if [ ! -d '%s' ];then mkdir -p '%s' -m %s;fi)" \
                  % (tmpDir, tmpDir, DefaultValue.KEY_DIRECTORY_MODE)
            DefaultValue.execCommandWithMode(
                cmd,
                "Check temporary directory",
                self.context.sshTool,
                self.context.isSingle or self.context.localMode,
                self.context.mpprcFile)
        except Exception as e:
            self.context.logger.logExit(str(e))

    def checkCommand(self):
        """
        function: check that the rsync command exists
        output: NA
        """
        self.context.logger.log("Checking the rsync command.")
        # Check for the rsync command on each node
        cmd = "source %s; %s -t check_command -U %s -S %d -l %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.speedLimitFlag,
            self.context.localLog)
        flag = 0
        failedNodeList = []
        if self.context.isSingle or self.context.localMode:
            (status, output) = subprocess.getstatusoutput(cmd)
            if status != 0:
                flag = 1
                self.context.logger.log("The cmd is %s " % cmd)
                self.context.logger.logExit(
                    "rsync command not found on %s. "
                    "Error:\n%s\nThe --speed-limit parameter cannot be used" %
                    (self.context.nodeName[0], output))
        else:
            (status, output) = self.context.sshTool.getSshStatusOutput(
                cmd, self.context.nodeName)
            self.context.sshTool.parseSshOutput(self.context.nodeName)
            # Get the execution result
            for node in self.context.nodeName:
                if status[node] != DefaultValue.SUCCESS:
                    flag = 1
                    failedNodeList.append(node)
        if flag == 0:
            self.context.logger.log("Successfully checked the rsync command.")
        else:
            self.context.logger.logExit(
                "rsync command not found on hosts: %s.\n "
                "The --speed-limit parameter cannot be used "
                % str(failedNodeList))

    def createStoreDir(self):
        """
        function: create the directory that stores the collection results
        output: (currentTime, targetdir, resultdir)
        """
        resultdir = ""
        # Get the current time
        currentTime = time.strftime("%Y%m%d_%H%M%S")
        if self.context.outFile is not None and self.context.outFile != "":
            resultdir = self.context.outFile
        else:
            resultdir = DefaultValue.getTmpDirFromEnv()

        # Remove any leftover temporary collector directories
        cmd = \
            "if [ -d '%s'/collector_tmp_* ];" \
            "then rm -rf '%s'/collector_tmp_*; fi" % (
                resultdir, resultdir)
        (status, output) = DefaultValue.retryGetstatusoutput(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
                            + "Error:\n%s" % output)
        # Build the default path
        targetdir = "%s/collector_tmp_%s" % (resultdir, currentTime)
        self.context.outFile = "%s/collector_%s" % (targetdir, currentTime)
        # Create a folder to store log information
        g_file.createDirectory(self.context.outFile)
        g_file.changeMode(DefaultValue.KEY_DIRECTORY_MODE,
                          self.context.outFile, True)
        return (currentTime, targetdir, resultdir)

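    # Resulting layout (illustrative, assuming outFile is unset and the tmp
    # dir is /tmp/om): results are written under
    #   /tmp/om/collector_tmp_<YYYYmmdd_HHMMSS>/collector_<YYYYmmdd_HHMMSS>/
    # and tarResultFiles() later packs them into collector_<timestamp>.tar.gz.
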
    def createDir(self):
        """
        function: create the collection directory on each node
        output: Successfully created dir
        """
        self.context.logger.log("Creating the directory.")
        # Create the directory on each node
        cmd = "source %s; %s -t create_dir -U %s -l %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.localLog)
        flag = 0
        if self.context.isSingle or self.context.localMode:
            (status, output) = subprocess.getstatusoutput(cmd)
            if status != 0:
                flag = 1
                self.context.logger.log("The cmd is %s " % cmd)
                self.context.logger.log(
                    "Failed to create dir on %s. Error:\n%s" %
                    (self.context.nodeName[0], output))
        else:
            (status, output) = self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            for node in self.context.nodeName:
                if status[node] != DefaultValue.SUCCESS:
                    flag = 1
                    self.context.logger.log(
                        "Failed to create dir on %s. Error:\n%s" %
                        (node, str(outputMap[node])))
        if flag == 0:
            self.context.logger.log("Successfully created the directory.")

    def printSummaryInfo(self, resultdir, currentTime):
        maxNamelen = len("SUCCESS HOSTNAME")
        maxJoblen = len("success")
        jobCount = 0
        jobNameList = []
        info = ""

        for host in self.context.nodeName:
            maxNamelen = max(maxNamelen, len(host))
        for jobName, jobInfo in self.jobInfo.items():
            subJobCount = len(jobInfo)
            jobCount += subJobCount
            while subJobCount > 0:
                job = "%s-%s" % (jobName, str(subJobCount))
                maxJoblen = max(maxJoblen, len(job))
                jobNameList.append(job)
                subJobCount -= 1
        maxJoblen = maxJoblen + 4
        maxNamelen = maxNamelen + 4

        title = "%s%s%s%s%s%s%s" % ("|", "TASK NAME".center(maxJoblen), "|",
                                    "SUCCESS HOSTNAME".center(maxNamelen), "|",
                                    "FAILED HOSTNAME".center(maxNamelen), "|")
        tag = "-" * len(title)
        info = "%s%s%s" % (info, tag, "\n")
        info = "%s%s%s%s%s%s%s%s" % (
            info, "|", "".center(maxJoblen), "|", "".center(maxNamelen), "|",
            "".center(maxNamelen), "|\n")
        info = "%s%s%s" % (info, title, "\n")
        info = "%s%s%s%s%s%s%s%s" % (
            info, "|", "".center(maxJoblen), "|", "".center(maxNamelen), "|",
            "".center(maxNamelen), "|\n")
        info = "%s%s%s" % (info, tag, "\n")
        for job in jobNameList:
            jobName = str(job.split("-")[0])
            i = int(job.split("-")[1])
            len_s = len(self.jobInfo[jobName][i - 1]["successNodes"])
            len_f = len(self.jobInfo[jobName][i - 1]["failedNodes"])
            # Pad the shorter list with None so the two columns zip evenly
            if len_s >= len_f:
                self.jobInfo[jobName][i - 1]["failedNodes"] += [None] * (
                        len_s - len_f)
            else:
                self.jobInfo[jobName][i - 1]["successNodes"] += [None] * (
                        len_f - len_s)

            isInitTitle = 0
            for s, f in zip(self.jobInfo[jobName][i - 1]["successNodes"],
                            self.jobInfo[jobName][i - 1]["failedNodes"]):
                # Print the job name only on the first row of each job
                if isInitTitle == 1:
                    job = ""
                if str(s) == "None":
                    s = ""
                if str(f) == "None":
                    f = ""
                info = "%s%s%s%s%s%s%s%s%s" % (
                    info, "|", job.ljust(maxJoblen), "|",
                    str(s).center(maxNamelen), "|", str(f).center(maxNamelen),
                    "|", "\n")
                isInitTitle = 1
            info = "%s%s%s" % (info, tag, "\n")

        cmd = " echo '%s\n' >> %s/collector_tmp_%s/collector_%s/Summary.log" \
              % (info, resultdir, currentTime, currentTime)
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            print("Generate Summary Info Failed.")
            self.context.logger.debug("The cmd is %s " % cmd)
            self.context.logger.debug(
                "Generate Summary Info Failed %s." % output)

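    # Summary.log sketch (illustrative): printSummaryInfo() renders one row
    # per job occurrence and node, e.g.
    #   -----------------------------------------------------
    #   |  TASK NAME  | SUCCESS HOSTNAME | FAILED HOSTNAME  |
    #   -----------------------------------------------------
    #   | log_copy-1  |      node1       |      node2       |
    #   -----------------------------------------------------
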
    def printDetailSummaryInfo(self, resultdir, currentTime):
        statusFieldLen = len("SuccessfulTask") + 4
        for node, jobList in self.nodeJobInfo.items():
            for job in jobList:
                jsonJob = json.loads(job)
                successLen = 0
                jobName = jsonJob["jobName"]
                successInfoList = []
                failedInfolist = []
                Info = ""
                successTask = jsonJob["successTask"]

                # Group successful tasks five per row
                for i in range(0, len(successTask), 5):
                    Task = "; ".join(successTask[i: i + 5])
                    successLen = max(successLen, len(Task))
                    successInfoList.append(Task)

                failedLen = 0
                for failedJob, reason in jsonJob["failedTask"].items():
                    failedInfo = failedJob + ": " + reason
                    failedLen = max(len(failedInfo), failedLen)
                    failedInfolist.append(failedInfo)

                title = "%s - %s - %s" % (
                    node, jobName, "Success" if failedLen == 0 else "Failed")
                taskMaxLen = max(failedLen + 4 + 2, successLen + 4 + 2)
                maxLen = max(taskMaxLen, len(title))
                titleLen = maxLen + statusFieldLen + 1
                totalLen = titleLen + 2
                tag = "-" * totalLen
                Info = "%s%s%s" % (Info, tag, "\n")
                Info = "%s%s%s%s%s" % (
                    Info, "|", " ".center(titleLen), "|", "\n")
                Info = "%s%s%s%s%s" % (
                    Info, "|", title.center(titleLen), "|", "\n")
                Info = "%s%s%s%s%s" % (
                    Info, "|", " ".center(titleLen), "|", "\n")
                Info = "%s%s%s" % (Info, tag, "\n")
                for s in successInfoList:
                    Info = "%s%s%s%s%s%s%s" % (
                        Info, "|", "SuccessfulTask".center(statusFieldLen),
                        "|", s.center(maxLen), "|", "\n")
                Info = "%s%s%s" % (Info, tag, "\n")
                for f in failedInfolist:
                    Info = "%s%s%s%s%s%s%s" % (
                        Info, "|", "FailedTask".center(statusFieldLen), "|",
                        f.center(maxLen), "|", "\n")
                Info = "%s%s%s" % (Info, tag, "\n")
                Info = "%s%s" % (Info, "\n\n")
                cmd = \
                    " echo '%s' " \
                    ">> %s/collector_tmp_%s/collector_%s/Detail.log" % (
                        Info, resultdir, currentTime, currentTime)
                (status, output) = subprocess.getstatusoutput(cmd)
                if status != 0:
                    print("Generate Detail Summary Info Failed")
                    self.context.logger.debug("The cmd is %s " % cmd)
                    self.context.logger.debug(
                        "Generate Detail Summary Info Failed %s." % output)

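    # Detail.log sketch (illustrative): printDetailSummaryInfo() renders one
    # boxed block per node and job, e.g.
    #   ----------------------------------------------
    #   |                                            |
    #   |         node1 - log_copy - Failed          |
    #   |                                            |
    #   ----------------------------------------------
    #   | SuccessfulTask |  task1; task2             |
    #   ----------------------------------------------
    #   |   FailedTask   |  task3: reason            |
    #   ----------------------------------------------
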
    def generalJobInfo(self, jobName, nodeList):
        if jobName in self.jobInfo:
            self.jobInfo[jobName].append(nodeList)
        else:
            self.jobInfo[jobName] = [nodeList]

    def generalDetailInfo(self, nodeName, job):
        if nodeName in self.nodeJobInfo:
            self.nodeJobInfo[nodeName].append(job)
        else:
            self.nodeJobInfo[nodeName] = [job]

    def generalSummary(self, resultdir, currentTime):
        self.printSummaryInfo(resultdir, currentTime)
        self.printDetailSummaryInfo(resultdir, currentTime)

    def resultCheck(self, output):
        isFailed = 0
        nodeList = {}
        successNodeList = []
        failedNodeList = []
        jobName = "UNKNOWN"
        try:
            if self.context.isSingle or self.context.localMode:
                if len(json.loads(output)["failedTask"]) > 0:
                    isFailed = 1
                    failedNodeList.append(self.context.nodeName[0])
                else:
                    successNodeList.append(self.context.nodeName[0])
                self.generalDetailInfo(self.context.nodeName[0], output)
                jobName = json.loads(output)["jobName"]
            else:
                for node in self.context.nodeName:
                    if len(json.loads(str(output[node]))["failedTask"]) > 0:
                        isFailed = 1
                        failedNodeList.append(node)
                    else:
                        successNodeList.append(node)
                    self.generalDetailInfo(node, str(output[node]))
                    jobName = json.loads(output[node])["jobName"]
            nodeList["successNodes"] = successNodeList
            nodeList["failedNodes"] = failedNodeList

            self.generalJobInfo(jobName, nodeList)
            return isFailed
        except Exception as e:
            self.context.logger.debug("check result failed %s." % str(e))
            return 1

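    # Assumed per-node payload (illustrative, inferred from the fields read
    # here and in printDetailSummaryInfo): each local script prints a JSON
    # document of the form
    #   {"jobName": "...", "successTask": ["..."],
    #    "failedTask": {"task": "reason"}}
    # A node counts as failed when its "failedTask" map is non-empty.
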
    def planResultCheck(self, output):
        isFailed = 1
        nodeList = {}
        successNodeList = []
        failedNodeList = []
        jobName = "UNKNOWN"
        try:
            if self.context.isSingle or self.context.localMode:
                if len(json.loads(output)["failedTask"]) == 0:
                    isFailed = 0
                    successNodeList.append(self.context.nodeName[0])
                else:
                    failedNodeList.append(self.context.nodeName[0])

                self.generalDetailInfo(self.context.nodeName[0], output)
                jobName = json.loads(output)["jobName"]
            else:
                for node in self.context.nodeName:
                    if len(json.loads(str(output[node]))["failedTask"]) == 0:
                        isFailed = 0
                        successNodeList.append(node)
                    else:
                        failedNodeList.append(node)

                    self.generalDetailInfo(node, str(output[node]))
                    jobName = json.loads(output[node])["jobName"]
            nodeList["successNodes"] = successNodeList
            nodeList["failedNodes"] = failedNodeList
            self.generalJobInfo(jobName, nodeList)
            return isFailed
        except Exception as e:
            self.context.logger.debug("check plan result failed %s." % str(e))
            return 1

    def systemCheck(self, sysInfo):
        """
        function: collect OS information
        output: Successfully collected OS information
        """
        self.context.logger.log("Collecting OS information.")
        # Check the system information on each node
        cmd = "source %s; %s -t system_check -U %s -l %s -C %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.localLog,
            self.formatJsonString(sysInfo))

        if self.context.isSingle or self.context.localMode:
            output = subprocess.getstatusoutput(cmd)[1]
            flag = self.resultCheck(output)
        else:
            self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            flag = self.resultCheck(outputMap)
        if flag == 0:
            self.context.logger.log("Successfully collected OS information.")
        else:
            self.context.logger.log("The cmd is %s " % cmd)
            self.context.logger.log("Failed to collect OS information.")

    def databaseCheck(self, data):
        """
        function: collect catalog statistics
        output: Successfully collected catalog statistics
        """
        self.context.logger.log("Collecting catalog statistics.")
        # Collect catalog statistics on each node
        cmd = "source %s; %s -t database_check -U %s -l %s -C %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.localLog,
            self.formatJsonString(data))

        if self.context.isSingle or self.context.localMode:
            output = subprocess.getstatusoutput(cmd)[1]
            flag = self.resultCheck(output)
        else:
            self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            flag = self.resultCheck(outputMap)
        if flag == 0:
            self.context.logger.log(
                "Successfully collected catalog statistics.")
        else:
            self.context.logger.log("The cmd is %s " % cmd)
            self.context.logger.log("Failed to collect catalog statistics.")

    def logCopy(self, log, l):
        """
        function: collect log files
        output: Successfully collected log files
        """
        self.context.logger.log("Collecting %s files." % log)
        # Copy the log information on each node
        self.context.keyword = base64.b64encode(
            bytes(self.context.keyword, 'utf-8')).decode()
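        # Illustrative note: a keyword such as 'ERROR' becomes 'RVJST1I='
        # above, so it survives shell quoting on the way to the local
        # collector script, which is assumed to base64-decode it before use.
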
        cmd = \
            "source %s; " \
            "%s -t %s -U %s -b '%s' -e '%s' -k '%s' -l %s -s %d -S %d -C %s" \
            % (self.context.mpprcFile,
               OMCommand.getLocalScript("Local_Collect"),
               "log_copy" if log == "Log" else (
                   "xlog_copy" if log == "XLog" else "core_copy"),
               self.context.user,
               self.context.begintime,
               self.context.endtime,
               self.context.keyword,
               self.context.localLog,
               # For local collection,
               # use the max speed limit, as all nodes do it individually.
               self.context.speedLimit * 1024,
               self.context.speedLimitFlag,
               self.formatJsonString(l)
               )

        if self.context.isSingle or self.context.localMode:
            output = subprocess.getstatusoutput(cmd)[1]
            flag = self.resultCheck(output)
        else:
            timeout = int(
                self.context.LOG_SIZE_PER_DAY_ONE_NODE
                * self.context.duration // self.context.speedLimit
                + self.context.LOG_SIZE_PER_DAY_ONE_NODE
                * self.context.duration // self.context.TAR_SPEED)
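            # Worked example (illustrative, assuming sizes in MB and speeds
            # in MB/s): with LOG_SIZE_PER_DAY_ONE_NODE = 1024, duration = 2,
            # speedLimit = 10 and TAR_SPEED = 100, the timeout is
            # 2048 // 10 + 2048 // 100 = 204 + 20 = 224 seconds, which the
            # clamp below raises to the 10-minute floor.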
            # The timeout value should be in [10 min, 1 hour]
            if timeout < DefaultValue.TIMEOUT_PSSH_COLLECTOR:
                timeout = DefaultValue.TIMEOUT_PSSH_COLLECTOR
            elif timeout > 3600:
                timeout = 3600
            self.context.sshTool.setTimeOut(timeout)
            self.context.logger.debug(
                "Collection will time out in %ds." % timeout)
            self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            flag = self.resultCheck(outputMap)
        if flag == 0:
            self.context.logger.log("Successfully collected %s files." % log)
        else:
            self.context.logger.log("The cmd is %s " % cmd)
            self.context.logger.log("Failed to collect %s files." % log)

    def confGstack(self, check, s):
        """
        function: collect configuration files or process stack information
        output: Successfully collected configuration files
                or process stack information
        """
        self.context.logger.log("Collecting %s files." % s["TypeName"])
        # Collect configuration files
        # or process stack information on each node
        cmd = "source %s; %s -t %s -U %s -l %s -C %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            check,
            self.context.user,
            self.context.localLog,
            self.formatJsonString(s))

        if self.context.isSingle or self.context.localMode:
            output = subprocess.getstatusoutput(cmd)[1]
            flag = self.resultCheck(output)
        else:
            self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            flag = self.resultCheck(outputMap)
        if flag == 0:
            self.context.logger.log(
                "Successfully collected %s files." % s["TypeName"])
        else:
            self.context.logger.log("The cmd is %s " % cmd)
            self.context.logger.log(
                "Failed to collect %s files." % s["TypeName"])

    def planSimulator(self, data):
        """
        function: collect plan simulator files
        output: Successfully collected files
        """
        self.context.logger.log("Collecting plan simulator statistics.")
        # Collect plan simulator statistics on each node
        cmd = "source %s; %s -t plan_simulator_check -U %s -l %s -C %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.localLog,
            self.formatJsonString(data))

        if self.context.isSingle or self.context.localMode:
            output = subprocess.getstatusoutput(cmd)[1]
            flag = self.planResultCheck(output)
        else:
            (status, output) = self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName)
            outputMap = self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result
            flag = self.planResultCheck(outputMap)
        if flag == 0:
            self.context.logger.log("Successfully collected plan simulator.")
        else:
            self.context.logger.log("The cmd is %s " % cmd)
            self.context.logger.log("Failed to collect plan simulator.")

    def copyFile(self):
        """
        function: collect result files
        output: Successfully collected files
        """
        self.context.logger.log("Collecting files.")
        # Collect result files on each node
        cmd = "source %s; %s -t copy_file -U %s -o %s -h %s -l %s" % (
            self.context.mpprcFile,
            OMCommand.getLocalScript("Local_Collect"),
            self.context.user,
            self.context.outFile,
            DefaultValue.GetHostIpOrName(),
            self.context.localLog)

        flag = 0
        if self.context.isSingle or self.context.localMode:
            # Note: the parentheses around the multiplication are required;
            # without them "%" binds first and the resulting string is
            # repeated 1024 times instead of passing the KB/s value.
            cmd = cmd + (" -s %d" % (self.context.speedLimit * 1024))
            cmd = cmd + (" -S %d" % self.context.speedLimitFlag)
            (status, output) = subprocess.getstatusoutput(cmd)
            if status != 0:
                flag = 0
                self.context.logger.log("The cmd is %s " % cmd)
                self.context.logger.log(
                    "Failed to collect files on %s. Error:\n%s" %
                    (self.context.nodeName[0], output))
            else:
                flag = 1
        else:
            parallelNum = DefaultValue.getCpuSet()
            if len(self.context.nodeName) < parallelNum:
                parallelNum = len(self.context.nodeName)

            speedLimitEachNodeKBs = int(
                self.context.speedLimit * 1024 // parallelNum)

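            # Worked example (illustrative): with speedLimit = 8 MB/s and
            # parallelNum = 4 nodes copied at once, each node is capped at
            # 8 * 1024 // 4 = 2048 KB/s so the total stays within the limit.
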
            # In parallel mode,
            # set a bandwidth limit for collecting log files from other nodes
            # to avoid too much IO on the net card, which is risky for CM.
            cmd = cmd + (" -s %d" % speedLimitEachNodeKBs)
            cmd = cmd + (" -S %d" % self.context.speedLimitFlag)

            # The timeout value for the remote copy.
            timeout = self.context.LOG_SIZE_PER_DAY_ONE_NODE \
                      * self.context.duration * 1024 // speedLimitEachNodeKBs
            # The timeout value should be in [10 min, 1 hour]
            if timeout < DefaultValue.TIMEOUT_PSSH_COLLECTOR:
                timeout = DefaultValue.TIMEOUT_PSSH_COLLECTOR
            elif timeout > 3600:
                timeout = 3600
            self.context.sshTool.setTimeOut(timeout)
            self.context.logger.debug(
                "Log copy will time out in %ds." % timeout)
            (status, output) = self.context.sshTool.getSshStatusOutput(
                cmd,
                self.context.nodeName,
                parallel_num=parallelNum)
            self.context.sshTool.parseSshOutput(
                self.context.nodeName)
            # Get the execution result; success on any node counts
            for node in self.context.nodeName:
                if status[node] == DefaultValue.SUCCESS:
                    flag = 1

        if flag == 0:
            self.context.logger.log(
                "Failed to collect files: All collection tasks failed")
        else:
            self.context.logger.log("Successfully collected files.")

    def tarResultFiles(self, currentTime, targetdir, resultdir):
        """
        function: tar the result files and delete the directory
        output: NA
        """
        try:
            # tar the result and delete the directory
            tarFile = "collector_%s.tar.gz" % currentTime
            destDir = "collector_%s" % currentTime
            cmd = "%s && %s" % (g_Platform.getCdCmd(targetdir),
                                g_Platform.getCompressFilesCmd(tarFile,
                                                               destDir))
            cmd += " && %s" % g_Platform.getChmodCmd(
                str(DefaultValue.KEY_FILE_MODE), tarFile)
            cmd += " && %s" % g_Platform.getMoveFileCmd(tarFile, "../")
            cmd += " && %s '%s'" % (
                g_Platform.getRemoveCmd("directory"), targetdir)
            DefaultValue.execCommandLocally(cmd)
            self.context.logger.log(
                "All results are stored in %s/collector_%s.tar.gz." % (
                    resultdir, currentTime))
        except Exception as e:
            raise Exception(str(e))

    def getCycle(self, sysInfo):
        """
        function: parse interval and count
        input : sysInfo
        output: interval, count
        """
        interval = 0
        if 'Interval' in sysInfo:
            interval = int(sysInfo['Interval'].replace(" ", ""))
        count = int(sysInfo['Count'].replace(" ", ""))
        return interval, count

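    # Example (illustrative): getCycle({'Interval': ' 30 ', 'Count': '3'})
    # returns (30, 3); when no 'Interval' key is present, the interval is 0.
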
    def doCollector(self):
        """
        function: collect information
        input : NA
        output: NA
        """
        # Parse the configuration file
        self.parseConfigFile()

        # Check the rsync command
        if self.context.speedLimitFlag == 1:
            self.checkCommand()

        # Check the tmp directory
        self.checkTmpDir()

        self.createDir()
        # Create the store dir
        (currentTime, targetdir, resultdir) = self.createStoreDir()

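        # Assumed shape of self.context.config (illustrative, inferred from
        # the lookups below): a dict keyed by collection type, each value a
        # list of item dicts that may carry repeat settings, e.g.
        #   {"System": [{"Count": "3", "Interval": "5"}],
        #    "Log": [{"Count": "1"}]}
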
        # Collect OS information
        if 'System' in self.context.config:
            sysList = self.context.config['System']
            for sysInfo in sysList:
                if 'Count' in sysInfo:
                    (interval, count) = self.getCycle(sysInfo)
                    print("do system check interval %s : count %s" % (
                        str(interval), str(count)))
                    while count:
                        count -= 1
                        self.systemCheck(sysInfo)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.systemCheck(sysInfo)

        # Collect catalog statistics
        if 'Database' in self.context.config:
            dataList = self.context.config['Database']
            for data in dataList:
                if 'Count' in data:
                    (interval, count) = self.getCycle(data)
                    print("do database check interval %s : count %s" % (
                        str(interval), str(count)))
                    while count:
                        count -= 1
                        self.databaseCheck(data)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.databaseCheck(data)

        # Collect log files
        if 'Log' in self.context.config:
            logList = self.context.config['Log']
            for l in logList:
                if 'Count' in l:
                    (interval, count) = self.getCycle(l)
                    print("do log check interval %s : count %s" % (
                        str(interval), str(count)))
                    # Log files are collected at most once per run
                    if count > 1:
                        self.context.logger.log(
                            ErrorCode.GAUSS_512["GAUSS_51246"] % "Log")
                        count = 1
                    while count:
                        count -= 1
                        self.logCopy("Log", l)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.logCopy("Log", l)

        # Collect xlog files
        if 'XLog' in self.context.config:
            xloglist = self.context.config['XLog']
            for l in xloglist:
                if 'Count' in l:
                    (interval, count) = self.getCycle(l)
                    print("do XLog check " + str(interval) + ":" + str(count))
                    if count > 1:
                        self.context.logger.log(
                            ErrorCode.GAUSS_512["GAUSS_51246"] % "XLog")
                        count = 1
                    while count:
                        count -= 1
                        self.logCopy("XLog", l)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.logCopy("XLog", l)

        # Collect core dump files
        if 'CoreDump' in self.context.config:
            corelist = self.context.config['CoreDump']
            for l in corelist:
                if 'Count' in l:
                    (interval, count) = self.getCycle(l)
                    print("do CoreDump check " + str(interval) + ":" + str(
                        count))
                    if count > 1:
                        self.context.logger.log(
                            ErrorCode.GAUSS_512["GAUSS_51246"] % "CoreDump")
                        count = 1
                    while count:
                        count -= 1
                        self.logCopy("CoreDump", l)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.logCopy("CoreDump", l)

        # Collect configuration files
        if 'Config' in self.context.config:
            clist = self.context.config['Config']
            for c in clist:
                if 'Count' in c:
                    (interval, count) = self.getCycle(c)
                    print("do Config check " + str(interval) + ":" + str(
                        count))
                    if count > 1:
                        self.context.logger.log(
                            ErrorCode.GAUSS_512["GAUSS_51246"] % "Config")
                        count = 1
                    while count:
                        count -= 1
                        self.confGstack("Config", c)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.confGstack("Config", c)

        # Collect process stack information
        if 'Gstack' in self.context.config:
            stacklist = self.context.config['Gstack']
            for s in stacklist:
                if 'Count' in s:
                    (interval, count) = self.getCycle(s)
                    print("do Gstack check " + str(interval) + ":" + str(
                        count))

                    while count:
                        count -= 1
                        self.confGstack("Gstack", s)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.confGstack("Gstack", s)

        # Trace is a placeholder; only a message is printed
        if 'Trace' in self.context.config:
            print("do config check")

        # Collect plan simulator files
        if 'Plan' in self.context.config:
            dbList = self.context.config['Plan']
            for s in dbList:
                if 'Count' in s:
                    (interval, count) = self.getCycle(s)
                    print("do Plan check " + str(interval) + ":" + str(count))

                    while count:
                        count -= 1
                        self.planSimulator(s)
                        if count > 0 and interval > 0:
                            time.sleep(interval)
                else:
                    self.planSimulator(s)

        # Collect result files
        self.copyFile()

        # Generate summary info
        self.generalSummary(resultdir, currentTime)

        # tar the result and delete the directory
        self.tarResultFiles(currentTime, targetdir, resultdir)