2023-07-24 21:40:45 +08:00

979 lines
43 KiB
Python

# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
#############################################################################
try:
import sys
import os
import subprocess
import time
import datetime
import pwd
from datetime import datetime, timedelta
sys.path.append(sys.path[0] + "/../../../../")
from gspylib.common.ErrorCode import ErrorCode
from gspylib.common.Common import DefaultValue, ClusterCommand
from gspylib.common.DbClusterStatus import DbClusterStatus
from gspylib.os.gsfile import g_file
from gspylib.component.CM.CM import CM, CmResAttr, CmResCtrlCmd, DssInstAttr
from gspylib.component.CM.CM import VipInstAttr, VipAddInst, VipDelInst
from gspylib.component.CM.CM import VipCmResCtrlCmd, VipResAttr
from gspylib.common.DbClusterInfo import dbClusterInfo
from base_utils.os.crontab_util import CrontabUtil
from base_utils.os.env_util import EnvUtil
from base_utils.os.grep_util import GrepUtil
from base_utils.os.cmd_util import CmdUtil
from base_utils.os.file_util import FileUtil
from base_utils.common.fast_popen import FastPopen
from domain_utils.cluster_file.cluster_dir import ClusterDir
from domain_utils.cluster_file.cluster_log import ClusterLog
from gspylib.component.DSS.dss_comp import DssInst, UdevContext
from gspylib.component.DSS.dss_checker import DssConfig
except ImportError as e:
sys.exit("[GAUSS-52200] : Unable to import module: %s." % str(e))
class CM_OLAP(CM):
    '''
    The class is used to define cluster manager component for olap database.
    '''
    # 0 means "do not pass -t": cm_ctl falls back to its own default timeout.
    DEFAULT_TIMEOUT = 0
    # 0 means use cm_ctl's default timeout for node-group restarts as well.
    DEFAULT_RESTART_NODEGROUP_TIMEOUT = 0
    # Minimum number of CM cert files expected after CA generation succeeds.
    CM_CERT_FILES_NUM = 8
    # Number of attempts when generating CM CA/cert files.
    RETRY_COUNT = 3
def __init__(self):
'''
Constructor
'''
super(CM_OLAP, self).__init__()
self.cluster_info = None
def init_globals(self):
user = pwd.getpwuid(os.getuid()).pw_name
self.cluster_info = dbClusterInfo()
if os.path.isfile(
self.cluster_info.get_staic_conf_path(user, ignore_err=True)):
self.cluster_info.initFromStaticConfig(user)
    def init_cm_server(self):
        """
        Init cm server: copy the sample cm_server.conf into the instance data
        directory (creating it with mode 700, owned by the current user, if
        missing) and write the log-dir / DSS GUC settings into it.
        Raises Exception (GAUSS_51615) when the shell copy command fails.
        """
        user = pwd.getpwuid(os.getuid()).pw_name
        server_simple_conf_file = os.path.realpath(os.path.join(self.binPath, "..",
                                                                "share/config",
                                                                "cm_server.conf.sample"))
        server_dest_conf_file = os.path.realpath(os.path.join(self.instInfo.datadir,
                                                              "cm_server.conf"))
        # Create the data dir only when missing, then always copy the sample
        # configuration over any existing cm_server.conf.
        cmd = "if [ ! -d %s ] ; then mkdir -p %s ; chmod 700 %s ; " \
              "chown %s:%s %s; fi ; cp %s %s " % (self.instInfo.datadir,
                                                  self.instInfo.datadir,
                                                  self.instInfo.datadir,
                                                  user, user,
                                                  self.instInfo.datadir,
                                                  server_simple_conf_file,
                                                  server_dest_conf_file)
        self.logger.debug("Command for copy CM server cnf file command is: %s" % cmd)
        (status, output) = CmdUtil.retryGetstatusoutput(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51615"] +
                            " Command:%s. Error:\n%s" % (cmd, output))
        log_path = EnvUtil.getEnvironmentParameterValue("GAUSSLOG", user)
        server_para_dict = {"log_dir": os.path.realpath(os.path.join(log_path,
                                                                     "cm", "cm_server"))}
        if self.dss_mode:
            # Shared-storage (DSS) mode: arbitrate through the shared disk and
            # the per-user vote/share disk devices.
            cm_vote_disk = UdevContext.CM_VOTE_NAME % user
            cm_share_disk = UdevContext.CM_SHARE_NAME % user
            server_para_dict.update({
                'share_disk_path': cm_share_disk,
                'voting_disk_path': cm_vote_disk,
                'dn_arbitrate_mode': 'share_disk',
                'ddb_type': '2'
            })
        self.setGucConfig(server_para_dict)
        self.logger.debug("Initializing cm_server instance successfully.")
    def init_cm_agent(self):
        """
        Init cm agent: copy the sample cm_agent.conf into the instance data
        directory (creating it with mode 700, owned by the current user, if
        missing) and write the socket-dir / log-dir / DSS GUC settings into it.
        Raises Exception (GAUSS_51615) when the shell copy command fails.
        """
        user = pwd.getpwuid(os.getuid()).pw_name
        agent_simple_conf_file = os.path.realpath(os.path.join(self.binPath, "..",
                                                               "share/config",
                                                               "cm_agent.conf.sample"))
        agent_dest_conf_file = os.path.realpath(os.path.join(self.instInfo.datadir,
                                                             "cm_agent.conf"))
        # Create the data dir only when missing, then always copy the sample
        # configuration over any existing cm_agent.conf.
        cmd = "if [ ! -d %s ] ; then mkdir -p %s ; chmod 700 %s ; " \
              "chown %s:%s %s; fi ; cp %s %s " % (self.instInfo.datadir,
                                                  self.instInfo.datadir,
                                                  self.instInfo.datadir,
                                                  user, user,
                                                  self.instInfo.datadir,
                                                  agent_simple_conf_file,
                                                  agent_dest_conf_file)
        self.logger.debug("Command for copy CM agent config file: %s" % cmd)
        (status, output) = CmdUtil.retryGetstatusoutput(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51615"] +
                            " Command:%s. Error:\n%s" % (cmd, output))
        log_path = EnvUtil.getEnvironmentParameterValue("GAUSSLOG", user)
        # unix_socket_directory points at the install dir (parent of bin/).
        agent_para_dict = {"unix_socket_directory": os.path.dirname(self.binPath),
                           "log_dir": os.path.realpath(os.path.join(log_path, "cm", "cm_agent"))}
        if self.dss_mode:
            # DSS mode: the agent also needs the voting disk device path.
            cm_vote_disk = UdevContext.CM_VOTE_NAME % user
            agent_para_dict.update({
                'voting_disk_path': cm_vote_disk
            })
        self.setGucConfig(agent_para_dict)
        self.logger.debug("Initializing cm_agent instance successfully.")
def initInstance(self):
"""
function : install a single cm component
input : NA
output : NA
"""
if not self.instInfo.datadir:
raise Exception("Data directory of instance is invalid.")
if self.instInfo.datadir == "/cm_agent" and not os.path.exists(self.instInfo.datadir):
self.logger.debug("No cm configuration, no need to init CM.")
return
if not os.path.exists(self.instInfo.datadir):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % ("cm data directory [%s]" %
self.instInfo.datadir))
if self.instInfo.instanceRole == DefaultValue.INSTANCE_ROLE_CMSERVER:
self.init_cm_server()
elif self.instInfo.instanceRole == DefaultValue.INSTANCE_ROLE_CMAGENT:
if self.dss_mode:
self.init_globals()
self.init_cm_res_json()
self.init_cm_agent()
def uninstall(self):
"""
function: uninstall the cm component
input : NA
output: NA
"""
data_dir = self.instInfo.datadir
dcf_data_dir = os.path.realpath(os.path.join(os.path.dirname(self.instInfo.datadir),
"dcf_data"))
gstor_dir = os.path.realpath(os.path.join(os.path.dirname(self.instInfo.datadir),
"gstor"))
if os.path.exists(data_dir) and DefaultValue.non_root_owner(data_dir):
FileUtil.removeDirectory(data_dir)
if os.path.exists(dcf_data_dir) and DefaultValue.non_root_owner(dcf_data_dir):
FileUtil.removeDirectory(dcf_data_dir)
if os.path.exists(gstor_dir) and DefaultValue.non_root_owner(gstor_dir):
FileUtil.removeDirectory(gstor_dir)
    def _out_dot_and_wait_normal(self, node_id, cluster_normal_status,
                                 start_type, time_set):
        """
        Poll cluster/node status every 5 seconds until healthy or the deadline
        passes, printing a progress dot per poll (newline every 12 dots).

        node_id: 0 checks the whole cluster, >0 checks a single node.
        time_set: (end_time, timeout) — end_time of None means wait forever.
        Returns True if the expected status was reached, False on timeout.
        """
        end_time, timeout = time_set
        dot_count = 0
        start_status = 0
        start_result = ""
        # 1 -> failed
        # 0 -> success
        is_success = False
        # Wait for the cluster to start completely
        while True:
            # A point is output every 5 seconds
            time.sleep(5)
            sys.stdout.write(".")
            dot_count += 1
            # A line break is output per minute
            if dot_count >= 12:
                dot_count = 0
                sys.stdout.write("\n")
            start_status = 0
            start_result = ""
            # The cluster status is checked every 5 seconds
            (start_status, start_result) = self.doCheckStaus(node_id, cluster_normal_status)
            if start_status == 0:
                # Output successful start information
                if dot_count != 0:
                    sys.stdout.write("\n")
                self.logger.log("Successfully started %s." % start_type)
                is_success = True
                break
            # The output prompts when the timeout does not start successfully;
            # the start continues in the background even after we give up here.
            if end_time is not None and datetime.now() >= end_time:
                if dot_count != 0:
                    sys.stdout.write("\n")
                self.logger.log("Failed to start %s " % start_type + " in (%s)s." % timeout)
                self.logger.log("It will continue to start in the background.")
                self.logger.log("If you want to see the cluster status, "
                                "please try command gs_om -t status.")
                self.logger.log("If you want to stop the cluster, "
                                "please try command gs_om -t stop.")
                break
        # Always dump the last collected status report for diagnostics.
        self.logger.log("=" * 70)
        self.logger.log(start_result)
        return is_success
def _cluster_switchover(self, is_success, para_set):
"""
Cluster switchover
"""
is_switch_over, is_single, user, timeout = para_set
if is_switch_over:
self.logger.debug("Ready to switchover cluster.")
if is_success and is_switch_over and not is_single:
# Perform the switch reset operation
cmd = CM_OLAP.get_reset_switchover_cmd(user, timeout)
self.logger.debug("Swithover command: {0}".format(cmd))
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
self.logger.log(
"Failed to reset switchover the cluster. "
"Command: \"%s\".\nOutput: \"%s\"." % (cmd, output))
@staticmethod
def get_start_cmd(nodeId=0, timeout=DEFAULT_TIMEOUT, datadir="", azName=""):
"""
function : Start all cluster or a node
input : String, int, String, String
output : String
"""
user_profile = EnvUtil.getMpprcFile()
cmd = "source %s ; cm_ctl start" % user_profile
# check node id
if nodeId > 0:
cmd += " -n %d" % nodeId
# check data directory
if datadir != "":
cmd += " -D %s" % datadir
# check timeout
if timeout > 0:
cmd += " -t %d" % timeout
# azName
if azName != "":
cmd += " -z%s" % azName
return cmd
    def startCluster(self, user, nodeId=0, timeout=DEFAULT_TIMEOUT,
                     isSwitchOver=True, isSingle=False,
                     cluster_normal_status=None, isSinglePrimaryMultiStandbyCluster=False,
                     azName="", datadir="", retry_times=3):
        """
        Start the cluster / a node / an instance / an AZ via cm_ctl, then wait
        for it to reach normal status and optionally reset switchover state.

        Returns True on the early-return paths (AZ start, nodeId > 0);
        otherwise returns the result of the status-wait loop.
        Raises Exception (GAUSS_51607) when cm_ctl start itself fails.
        """
        # Derive a human-readable label from the most specific scope given.
        start_type = "cluster"
        if nodeId > 0:
            start_type = "node"
        if datadir != "":
            start_type = "instance"
        if azName != "":
            start_type = azName
        # The output starts the screen-printing information of the group
        self.logger.log("Starting %s." % start_type)
        self.logger.log("======================================================================")
        # Call cm_ctl to start the
        cmd = CM_OLAP.get_start_cmd(nodeId, timeout=timeout, datadir=datadir, azName=azName)
        result_set = subprocess.getstatusoutput(cmd)
        # The output prompts when the failure to start
        if result_set[0] != 0:
            self.logger.error(ErrorCode.GAUSS_516["GAUSS_51607"] % start_type +
                              " Error: \n%s" % result_set[1])
            self.logger.log("The cluster may continue to start in the background.")
            self.logger.log("If you want to see the cluster status, "
                            "please try command gs_om -t status.")
            self.logger.log("If you want to stop the cluster, please try command gs_om -t stop.")
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51607"] % start_type +
                            " Error: \n%s" % result_set[1])
        if isSingle:
            self.logger.log("Successfully started cluster. "
                            "Waiting for cluster status to become Normal.")
            self.logger.log("=" * 70)
        elif isSinglePrimaryMultiStandbyCluster:
            if azName:
                # AZ start does not wait for status; report success and return.
                self.logger.log("Successfully started %s." % start_type)
                self.logger.log("=" * 70)
                self.logger.log("End start %s." % start_type)
                return True
            else:
                self.logger.log("Successfully started single primary multi standby. "
                                "Wait for standby instance.")
                self.logger.log("=" * 70)
        # Output the startup instance success information
        elif nodeId == 0:
            self.logger.log("Successfully started primary instance. Wait for standby instance.")
            self.logger.log("=" * 70)
        else:
            # Single-node start does not wait for status; return immediately.
            self.logger.log("Successfully started %s." % start_type)
            self.logger.log("=" * 70)
            self.logger.log("End start %s." % start_type)
            return True
        # NOTE(review): with timeout == 0 (the DEFAULT_TIMEOUT sentinel) the
        # deadline below is "now", so the wait loop gives up after one poll —
        # confirm this is the intended no-timeout semantics.
        is_success = self._out_dot_and_wait_normal(nodeId,
                                                   cluster_normal_status,
                                                   start_type,
                                                   (datetime.now() + timedelta(seconds=timeout),
                                                    timeout))
        self._cluster_switchover(is_success, (isSwitchOver, isSingle, user, timeout))
        return is_success
    def stop_cluster(self, stop_args_set):
        """
        Stop the cluster / a node / an instance / an AZ via cm_ctl; when a
        whole-cluster stop fails, retry once with immediate ("i") mode.

        stop_args_set: (node_id, stop_mode, timeout, data_dir, az_name).
        Raises Exception (GAUSS_51610) when a scoped (non-cluster) stop fails.
        """
        node_id, stop_mode, timeout, data_dir, az_name = stop_args_set
        # Derive a human-readable label from the most specific scope given.
        stop_type = "cluster"
        # Specifies the stop node
        # Gets the specified node id
        if node_id > 0:
            stop_type = "node"
        if data_dir != "":
            stop_type = "instance"
        if az_name != "":
            stop_type = az_name
        # Perform a stop operation
        self.logger.log("Stopping %s." % stop_type)
        self.logger.log("=========================================")
        # NOTE(review): DEFAULT_TIMEOUT is 0, so this reassignment is a no-op,
        # and the first stop command below passes timeout=0 explicitly (cm_ctl
        # default) regardless — confirm the intended timeout handling.
        timeout = timeout if timeout != 0 else CM_OLAP.DEFAULT_TIMEOUT
        cmd = ClusterCommand.getStopCmd(node_id, stop_mode, timeout=0,
                                        datadir=data_dir, azName=az_name)
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0 and stop_type == "cluster":
            # Whole-cluster stop failed: fall back to a forcible immediate stop.
            self.logger.log("Failed to stop %s." % stop_type +
                            " Try to stop it forcibly." + " Error:\n%s" % output)
            cmd = ClusterCommand.getStopCmd(node_id, "i", timeout, data_dir, az_name)
            (status, output) = subprocess.getstatusoutput(cmd)
            if status != 0:
                self.logger.debug(output)
                self.logger.log("Failed to stop %s forcibly." % stop_type)
            else:
                self.logger.log("Successfully stopped %s forcibly." % stop_type)
        elif status != 0:
            # Scoped stop (node/instance/AZ) failures are fatal.
            self.logger.log(output)
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % stop_type)
        else:
            self.logger.log("Successfully stopped %s." % stop_type)
        self.logger.log("=========================================")
        self.logger.log("End stop %s." % stop_type)
    def queryClusterStatus(self, outFile="", isFormat=False):
        """
        Query cluster status via cm_ctl.

        isFormat=True: return cm_ctl's formatted text output directly
        (optionally redirected to outFile by the command itself).
        isFormat=False: dump status to a temp file and return a parsed
        DbClusterStatus object.
        Raises Exception (GAUSS_51640 / GAUSS_51601) when the query fails.
        """
        if isFormat:
            cmd = self.getQueryStatusCmd(0, outFile, True, True)
        else:
            # query and save status into a file
            status_file = \
                "%s/gauss_check_status_%d.dat" % (EnvUtil.getTmpDirFromEnv(), os.getpid())
            FileUtil.cleanTmpFile(status_file)
            cmd = self.getQueryStatusCmd(0, status_file)
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0 and not isFormat:
            # The file holds whatever cm_ctl managed to write before failing.
            # NOTE(review): if cm_ctl failed before creating the file, this
            # open() raises FileNotFoundError — confirm that is acceptable.
            with open(status_file, 'r') as fp:
                output = fp.read()
            FileUtil.cleanTmpFile(status_file)
            if output.find("cm_ctl: can't connect to cm_server.") >= 0:
                raise Exception(ErrorCode.GAUSS_516["GAUSS_51640"])
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"] % "cluster" +
                            " Error: \n%s" % output)
        if isFormat:
            return output
        # check cluster status
        cluster_status = DbClusterStatus()
        cluster_status.initFromFile(status_file)
        FileUtil.cleanTmpFile(status_file)
        return cluster_status
def switchOver(self, user, nodeId, datadir):
"""
function: Switch instances on a node to standby
input : string, int, string
output: NA
"""
cmd = self._get_switch_over_cmd(user, nodeId, datadir)
self.logger.debug("Switch over command: %s" % cmd)
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_530["GAUSS_53007"] % "instances" +
" Error: \n%s" % output)
def _kill_om_monitor(self):
"""
Kill om_monitor process
"""
# etcd log directory is not created, kill om_monitor.
kill_cmd = DefaultValue.killInstProcessCmd("om_monitor", False)
status, _ = CmdUtil.retryGetstatusoutput(kill_cmd)
if status == 0:
self.logger.debug("Kill om_monitor force: %s" % kill_cmd)
    def setMonitor(self, user):
        """
        Install the om_monitor crontab entry (plus an hourly corefile-cleanup
        entry when a core path is configured), then start om_monitor once in
        the background immediately.

        Raises Exception (GAUSS_50803) on crontab read failure and
        (GAUSS_51607) when the immediate om_monitor start fails.
        """
        self.logger.log("Set CRON.")
        app_path = ClusterDir.getInstallDir(user)
        log_path = ClusterLog.getOMLogPath(DefaultValue.OM_MONITOR_DIR_FILE,
                                           user,
                                           app_path)
        mpprc_file = EnvUtil.getEnv(DefaultValue.MPPRC_FILE_ENV)
        cron_file = "%s/gauss_cron_%d" % (EnvUtil.getTmpDirFromEnv(), os.getpid())
        # get all content by crontab command
        (status, output) = CrontabUtil.getAllCrontab()
        if status == 0:
            # Overwrite cronFile, make it empty, then keep the user's existing
            # entries minus any previous om_monitor line.
            FileUtil.createFile(cron_file, True, DefaultValue.KEY_FILE_MODE)
            content_cron_file = [output]
            if output != "":
                FileUtil.writeFile(cron_file, content_cron_file)
                FileUtil.deleteLine(cron_file, "\\/bin\\/om_monitor")
        elif status not in [256, 1]:  # status==256 means this user has no cron
            raise Exception(ErrorCode.GAUSS_508["GAUSS_50803"] + " Error: \n%s" % output)
        # Build the per-minute om_monitor entry, sourcing the separated env
        # file (mpprc) when one is configured.
        if mpprc_file != "" and mpprc_file is not None:
            cron_content = "*/1 * * * * source /etc/profile;(if [ -f ~/.profile ];" \
                           "then source ~/.profile;fi);source ~/.bashrc;source %s;" \
                           "nohup %s/bin/om_monitor -L %s >>/dev/null 2>&1 &" % (mpprc_file,
                                                                                 app_path,
                                                                                 log_path)
            content_cron_file = [cron_content]
        else:
            cron_content = "*/1 * * * * source /etc/profile;(if [ -f ~/.profile ];" \
                           "then source ~/.profile;fi);source ~/.bashrc;" \
                           "nohup %s/bin/om_monitor -L %s >>/dev/null 2>&1 &" % (app_path,
                                                                                 log_path)
            content_cron_file = [cron_content]
        # Set cron for clear corefiles
        gp_home = EnvUtil.getEnvironmentParameterValue("GPHOME", "")
        default_xml = "%s/cluster_default_agent.xml" % gp_home
        if os.path.exists(default_xml):
            corefile = dbClusterInfo.readClustercorePath(default_xml)
            # NOTE(review): cleanup entry is added only when the core path does
            # NOT exist yet — confirm this condition is intended.
            if not os.path.exists(corefile):
                cron_content_coreclear = "* */1 * * * diskUse=$(df %s -h | grep -v 'Use' | " \
                                         "awk '{print $5}');" \
                                         "fileNum=$(expr $(ls %s | wc -l) - 5);" % (corefile,
                                                                                    corefile)
                cron_content_coreclear += "(if [ ${diskUse%%%%%%} -gt %s -a $fileNum -gt 5 ];" \
                                          "then ls -tr | head -$fileNum | " \
                                          "xargs -i -n$fileNum rm -rf {};fi) >>/dev/null " \
                                          "2>&1 &" % DefaultValue.CORE_PATH_DISK_THRESHOLD
                content_cron_file.append(cron_content_coreclear)
        # Install the new crontab and clean up the temp file.
        FileUtil.writeFile(cron_file, content_cron_file)
        CrontabUtil.execCrontab(cron_file)
        FileUtil.removeFile(cron_file)
        # Restart om_monitor immediately instead of waiting for the next cron tick.
        self._kill_om_monitor()
        if mpprc_file != "" and mpprc_file is not None:
            cmd = "source /etc/profile;(if [ -f ~/.profile ];then source ~/.profile;fi);" \
                  "source ~/.bashrc;source %s; nohup %s/bin/om_monitor -L %s " \
                  ">>/dev/null 2>&1 &" % (mpprc_file, app_path, log_path)
        else:
            cmd = "source /etc/profile;(if [ -f ~/.profile ];then source ~/.profile;fi);" \
                  "source ~/.bashrc; nohup %s/bin/om_monitor -L %s >>" \
                  "/dev/null 2>&1 &" % (app_path, log_path)
        self.logger.debug("Command for start om_monitor: %s" % cmd)
        status = os.system(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51607"] % "om_monitor")
    def setGucConfig(self, para_dict, setMode='set'):
        """
        Write key = value pairs into the CM config file (cm_server.conf for a
        server instance, cm_agent.conf otherwise) via an in-place sed script:
        append the setting when the key is absent, otherwise rewrite the
        existing (possibly commented) line.

        Raises Exception (GAUSS_50007) when the sed command fails.
        """
        user_profile = EnvUtil.getMpprcFile()
        EnvUtil.source(user_profile)
        self.logger.debug("setMode is {0}".format(setMode))
        if self.instInfo.instanceRole == DefaultValue.INSTANCE_ROLE_CMSERVER:
            config_file = os.path.join(self.instInfo.datadir, "cm_server.conf")
        else:
            config_file = os.path.join(self.instInfo.datadir, "cm_agent.conf")
        for key in para_dict.keys():
            # Escape '/' and "'" so the value can be embedded in sed patterns.
            # NOTE(review): the escaped value is used only in the sed-replace
            # branches; the append (echo) branch writes the raw value — confirm
            # values never need escaping there.
            value = para_dict[key].replace("/", "{0}/".format("\\")).replace("'", "\\'")
            # Shell script: normalize tabs to spaces; if no active line with
            # this key exists, append "key = value"; if the key exists only as
            # a comment, uncomment-and-set it; otherwise rewrite it in place.
            cmd = "sed -i 's/\\t/ /g' %s && \
                if [ 0 -eq `grep '%s' %s | grep -v '^[ ]*#' | wc -l` ]; \
                then \
                    echo \"%s = %s\" >> %s; \
                else \
                    if [ 0 -ne `grep '%s' %s | grep '#' | wc -l` ]; then \
                        sed -i \"s/^[ \\t]*%s.*=.*\\([ ]*#.*\\)/%s = %s \\1/g\" %s; \
                    else \
                        sed -i \"s/^[ \\t]*%s.*=.*\\([ ]*\\)/%s = %s \\1/g\" %s; \
                    fi \
                fi" % \
                  (config_file,
                   key, config_file,
                   key, para_dict[key], config_file,
                   key, config_file,
                   key, key, value, config_file,
                   key, key, value, config_file)
            self.logger.debug("Command for setting cm parameter: %s" % cmd)
            (status, output) = CmdUtil.retryGetstatusoutput(cmd)
            if status != 0:
                raise Exception(ErrorCode.GAUSS_500["GAUSS_50007"] % "GUC" +
                                " Error: \n%s" % str(output))
def getGucConfig(self, paraList):
"""
function: get cm guc parameter value from file (cm_server.conf/cm_agent.conf)
input : NA
output: NA
"""
if self.instInfo.instanceRole == DefaultValue.INSTANCE_ROLE_CMSERVER:
gucFile = "%s/cm_server.conf" % self.instInfo.datadir
else:
gucFile = "%s/cm_agent.conf" % self.instInfo.datadir
paraDict = {}
for paraName in paraList:
(_, output) = GrepUtil.getGrepValue("-E", "^[ \\t]*%s ", gucFile)
output_list = output.strip().split("\n")
for line in output_list:
if line.find("=") < 0:
continue
value_line = line.split("=")[1]
para_value = value_line.split("#")[0].strip()
paraDict[paraName] = para_value
return paraDict
    def get_cm_dict(self, user, item_type=None, alarm_component=None, guc_xml=(False, "")):
        """
        Build the GUC parameter dict for this CM instance (cms or cma).

        guc_xml: (enabled, xml_path) — when enabled, merge dynamic GUC values
        parsed from the XML via DefaultValue.dynamicGuc.
        item_type == "ConfigInstance" additionally sets alarm_component and,
        for cm_agent, the unix socket directory.
        Returns the assembled parameter dict.
        """
        if self.instInfo.instanceRole == DefaultValue.INSTANCE_ROLE_CMSERVER:
            instance_role = "cm_server"
            instance_simple = "cms"
        else:
            instance_role = "cm_agent"
            instance_simple = "cma"
        tmp_dict = dict()
        # Per-role log directory is always set.
        tmp_dict["log_dir"] = os.path.join(ClusterDir.getUserLogDirWithUser(user),
                                           "cm", instance_role)
        if guc_xml[0]:
            self.logger.log("start to get %s param from guc xml: %s" % (instance_role,
                                                                        guc_xml[-1]))
            tmp_guc_path = EnvUtil.getTmpDirFromEnv(user)
            tmp_guc_file = os.path.realpath(os.path.join(tmp_guc_path, "tmp_guc"))
            # Dynamic values are only merged when the tmp_guc file exists.
            if os.path.isfile(tmp_guc_file):
                self.logger.debug("Check if dict is not null.")
                dynamic_dict = DefaultValue.dynamicGuc(user, self.logger, instance_simple,
                                                       tmp_guc_file, guc_xml)
                if dynamic_dict:
                    tmp_dict.update(dynamic_dict)
        if item_type == "ConfigInstance":
            tmp_dict["alarm_component"] = "%s" % alarm_component
            if instance_role == "cm_agent":
                tmp_dict["unix_socket_directory"] = "'%s'" % EnvUtil.getTmpDirFromEnv()
        return tmp_dict
def configInstance(self, user, configItemType=None, alarm_component=None, cmsConfig=None,
guc_xml=(False, "")):
"""
function: Get CMAgent configuration
input : user, configItemType, alarm_component
output: NA
"""
tmp_dict = self.get_cm_dict(user, configItemType, alarm_component, guc_xml)
tmp_dict.update(cmsConfig)
self.setGucConfig(tmp_dict)
    def doCheckStaus(self, nodeId, cluster_normal_status=None, expected_redistributing=""):
        """
        Check cluster (nodeId == 0) or single-node (nodeId > 0) health via a
        cm_ctl status dump parsed into DbClusterStatus.

        Returns (status, output): status 0 means healthy, 1 means not;
        output is a human-readable status summary.
        Raises Exception (GAUSS_51619) when the node id is unknown.
        """
        status_file = \
            "%s/gauss_check_status_%d.dat" % (EnvUtil.getTmpDirFromEnv(), os.getpid())
        FileUtil.cleanTmpFile(status_file)
        cmd = self.getQueryStatusCmd(0, status_file, showDetail=True, isFormat=False)
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            # Query itself failed: propagate cm_ctl's status/output directly.
            FileUtil.cleanTmpFile(status_file)
            return (status, output)
        cluster_status = DbClusterStatus()
        cluster_status.initFromFile(status_file)
        FileUtil.cleanTmpFile(status_file)
        status = 0
        output = ""
        status_rep = None
        if nodeId > 0:
            nodeStatus = cluster_status.getDbNodeStatusById(nodeId)
            if nodeStatus is None:
                raise Exception(ErrorCode.GAUSS_516["GAUSS_51619"] % nodeId)
            status = 0 if nodeStatus.isNodeHealthy() else 1
            status_rep = nodeStatus.getNodeStatusReport()
        else:
            # Whole-cluster check: healthy AND (redistributing matches the
            # expectation, or no expectation was given).
            status = 0 if cluster_status.isAllHealthy(cluster_normal_status) \
                          and (cluster_status.redistributing == expected_redistributing
                               or expected_redistributing == "") else 1
            status_rep = cluster_status.getClusterStatusReport()
        self.logger.debug("status_result is : {0}".format(status_rep))
        # Summarize datanode counts by state for the caller's log output.
        output += "cluster_state : %s\n" % cluster_status.clusterStatus
        output += "redistributing : %s\n" % cluster_status.redistributing
        output += "node_count : %d\n" % status_rep.nodeCount
        output += "Datanode State\n"
        output += " primary : %d\n" % status_rep.dnPrimary
        output += " standby : %d\n" % status_rep.dnStandby
        output += " secondary : %d\n" % status_rep.dnDummy
        output += " cascade_standby : %d\n" % status_rep.dn_cascade_standby
        output += " building : %d\n" % status_rep.dnBuild
        output += " abnormal : %d\n" % status_rep.dnAbnormal
        output += " down : %d\n" % status_rep.dnDown
        return (status, output)
@staticmethod
def _get_switch_over_cmd(user, nodeId, datadir):
"""
function : Get the command of switching over standby instance
input : String,int,String
output : String
"""
user_profile = EnvUtil.getMpprcFile()
cmd = "source %s ; cm_ctl switchover -n %d -D %s" % (user_profile, nodeId, datadir)
# build shell command
if user and os.getuid() == 0:
cmd = "su - %s -c 'source %s;%s'" % (user, user_profile, cmd)
return cmd
@staticmethod
def get_reset_switchover_cmd(user, timeout):
"""
function : Reset Switch over
input : String,String
output : String
"""
user_profile = EnvUtil.getMpprcFile()
cmd = "source %s ; cm_ctl switchover -a" % user_profile
if timeout > 0:
cmd += (" -t %d" % timeout)
# build shell command
if user and os.getuid() == 0:
cmd = "su - %s -c 'source %s;%s'" % (user, user_profile, cmd)
return cmd
@staticmethod
def getQueryStatusCmd(nodeId=0, outFile="", showDetail=True, isFormat=True):
"""
function : Get the command of querying status of cluster or node
input : String
output : String
"""
user_profile = EnvUtil.getMpprcFile()
cmd = "source %s ; cm_ctl query" % user_profile
# check node id
if nodeId > 0:
cmd += " -n %d" % nodeId
# check -v
if showDetail:
cmd += " -v"
# status format
if isFormat:
cmd += " -C -i -d"
# check out put file
if outFile != "":
cmd += " > %s" % outFile
return cmd
def killProcess(self):
"""
function : kill cm server instance process
input : NA
output : NA
"""
cmd = DefaultValue.killInstProcessCmd("cm_server", False)
self.logger.debug("Kill cm_server cmd is: {0}".format(cmd))
(status, output) = CmdUtil.retryGetstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51606"] %
"cm_server" + " Error: \n%s" % str(output))
self.logger.debug("Kill cm_server process successfully.")
@staticmethod
def encryptor(secret, path, logger):
"""
encrypt secret
"""
gauss_home = DefaultValue.getPathFileOfENV("GAUSSHOME")
gp_home = DefaultValue.getPathFileOfENV("GPHOME")
expect_sh = os.path.realpath(os.path.join(gp_home, "script", "local", "expect.sh"))
cm_ctl = os.path.join(gauss_home, "bin/cm_ctl")
cmd1 = "%s encrypt -M server -D %s" % (cm_ctl, path)
cmd2 = "%s encrypt -M client -D %s" % (cm_ctl, path)
if not gauss_home:
raise Exception(ErrorCode.GAUSS_518["GAUSS_51802"] % "CM_HOME")
for cmd in [cmd1, cmd2]:
expect_key_word = "please enter the password*"
expect_cmd = 'echo "{0}" | sh {1} "{2}" "{3}"'.format(secret,
expect_sh,
expect_key_word,
cmd)
logger.debug("Enryptor execute command: {0}".format(cmd))
proc = FastPopen(expect_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, _ = proc.communicate(expect_cmd)
if proc.returncode != 0:
raise Exception("Generate key file with cm_ctl failed. {0}".format(stdout))
logger.debug("Generate key file successfully.")
    def _retry_generate_ca(self, target_dir, retry_count):
        """
        Retry generating CA files for the CM component up to retry_count times.

        Returns the EncryptedOpenssl instance on success, or None (implicitly)
        when every attempt failed — callers must check the return value.
        """
        # Imported locally to avoid a module-level dependency on openssl tooling.
        from gspylib.common.encrypted_openssl import EncryptedOpenssl
        current_retry_count = 0
        while current_retry_count < retry_count:
            current_retry_count += 1
            self.logger.debug("Retry [{0}] time to "
                              "generate CA for CM component.".format(current_retry_count))
            try:
                openssl = EncryptedOpenssl(target_dir, self.logger, pw_len=12)
                # Passwords are produced through the cm_ctl-based encryptor.
                openssl.set_encryptor(CM_OLAP.encryptor)
                openssl.generate()
                self.logger.debug("Generate CA files successfully.")
                return openssl
            except Exception as exp:
                # Swallow and retry; the failure reason is kept in debug logs.
                self.logger.debug("Retry [{0}] time failed. "
                                  "Error info : {1}".format(current_retry_count, str(exp)))
                continue
        self.logger.debug("Retry generate CA for CM component failed.")
    def create_cm_ca(self, ssh_tool, ca_org='cm'):
        """
        Create CA files for the CM/DSS component under
        $GAUSSHOME/share/sslcert/<ca_org> and distribute them via ssh_tool.

        ssh_tool: may be falsy to skip distribution to other nodes.
        Raises Exception (GAUSS_53507) when the target dir cannot be created.
        """
        self.logger.log("Create CA files for {} beginning.".format(ca_org))
        current_user = pwd.getpwuid(os.getuid()).pw_name
        gp_home = EnvUtil.getEnvironmentParameterValue("GPHOME", current_user)
        create_ca_script = os.path.realpath(os.path.join(gp_home, "script", "gspylib",
                                                         "common", "encrypted_openssl.py"))
        expect_sh = os.path.realpath(os.path.join(gp_home, "script", "local", "expect.sh"))
        target_dir = os.path.realpath(os.path.join(self.binPath, "..", "share", "sslcert", ca_org))
        if os.path.isfile(create_ca_script) and os.path.isfile(expect_sh):
            create_cmd = g_file.SHELL_CMD_DICT["createDir"] % (target_dir,
                                                               target_dir,
                                                               DefaultValue.KEY_DIRECTORY_MODE)
            status, _ = subprocess.getstatusoutput(create_cmd)
            if status != 0:
                raise Exception(ErrorCode.GAUSS_535["GAUSS_53507"] % create_cmd)
            openssl = None
            # NOTE(review): this loop has no attempt cap — if CA generation
            # keeps failing it never terminates; confirm that is acceptable.
            while True:
                FileUtil.cleanDirectoryContent(target_dir)
                openssl = self._retry_generate_ca(target_dir, CM_OLAP.RETRY_COUNT)
                # number of assurance certificate files.
                if openssl and len(os.listdir(target_dir)) > CM_OLAP.CM_CERT_FILES_NUM:
                    break
            if (ssh_tool):
                openssl.distribute_cert(ssh_tool)
            self.logger.log("Create CA files on directory [{0}]. "
                            "file list: {1}".format(target_dir, os.listdir(target_dir)))
        else:
            self.logger.log("There is not exists [%s]." % create_ca_script)
    def get_init_cm_cmd(self):
        """
        Build the shell command that registers the dms_res and dss resources
        with CM and edits the per-node dss instance attributes.

        Returns a single "source <env>; cmd1 ;cmd2 ..." string.
        """
        user = pwd.getpwuid(os.getuid()).pw_name
        gauss_home = EnvUtil.getEnvironmentParameterValue('GAUSSHOME', user)
        dss_home = EnvUtil.getEnvironmentParameterValue('DSS_HOME', user)
        # not use realpath
        dms_contrl = os.path.join(gauss_home, 'bin/dms_contrl.sh')
        dss_contrl = os.path.join(gauss_home, 'bin/dss_contrl.sh')
        # First register both resources with their control scripts.
        cmd = [
            str(CmResCtrlCmd(name='dms_res', attr=CmResAttr(dms_contrl))),
            str(
                CmResCtrlCmd(name='dss',
                             attr=CmResAttr(dss_contrl, res_type='APP')))
        ]
        # Then add one per-node edit of the dss resource: the dss id is looked
        # up from the (base64-decoded) dss_nodes_list in the DSS config, and
        # dss_home is "<DSS_HOME>;<first datanode datadir>".
        for db_inst in self.cluster_info.dbNodes:
            cmd.append(
                str(
                    CmResCtrlCmd(action='edit',
                                 name='dss',
                                 attr=DssInstAttr(
                                     node_id=db_inst.id,
                                     dss_id=DssInst.get_current_dss_id(
                                         dss_home, db_inst,
                                         DssConfig.get_value_b64_handler(
                                             'dss_nodes_list',
                                             self.dss_config,
                                             action='decode')),
                                     dss_home="{};{}".format(
                                         dss_home,
                                         db_inst.datanodes[0].datadir)))))
        return "source {}; {}".format(EnvUtil.getMpprcFile(), ' ;'.join(cmd))
def init_cm_res_json(self, rm_cm_json=True):
cm_resource = os.path.realpath(
os.path.join(self.instInfo.datadir, 'cm_resource.json'))
if rm_cm_json and os.path.isfile(cm_resource):
os.remove(cm_resource)
cmd = self.get_init_cm_cmd()
sts, out = subprocess.getstatusoutput(cmd)
if sts != 0:
raise Exception(
'Failed to initialize the CM resource file. Error: {}'.format(
str(out)))
    def get_add_cm_res_cmd(self, cm_res_info):
        """
        Get add CM resource information cmd for VIP.

        cm_res_info: dict mapping a VIP resource name to a list of tuples;
        assumes each tuple is (res_attr, base_ip, inst_arg1, inst_arg2) as
        consumed by VipResAttr / VipInstAttr / VipAddInst — TODO confirm
        against the caller.
        Only resources/instances not already registered in CM are added.
        Returns the assembled "source <env>; ..." command string.
        """
        cmd_list = []
        for res_name, _list in cm_res_info.items():
            # Add the resource itself only when cm_ctl does not know it yet.
            check_res = "cm_ctl res --list --res_name=\"%s\"" % res_name
            stat, out = subprocess.getstatusoutput(check_res)
            if stat != 0 or not out:
                cmd_list.append(str(VipCmResCtrlCmd("add_res", res_name,
                                                    attr=VipResAttr(_list[0][0]))))
            for _tup in _list:
                # Add each instance only when its base_ip is not listed yet.
                check_inst = "cm_ctl res --list --res_name=\"%s\" --list_inst" \
                             " | grep \"%s\"" % (res_name, _tup[1])
                stat, out = subprocess.getstatusoutput(check_inst)
                if stat != 0 or not out:
                    cmd_list.append(str(VipCmResCtrlCmd("add_inst", res_name,
                                                        inst=VipAddInst(_tup[2], _tup[3]),
                                                        attr=VipInstAttr(_tup[1]))))
        cmd = "source %s; %s" % (EnvUtil.getMpprcFile(), ' ;'.join(cmd_list))
        self.logger.log("Add cm resource information cmd: \n%s" % cmd)
        return cmd
def _get_cm_res_info(self, base_ip):
"""
Get the CM resource info for reducing
"""
# Get resource name
cmd = "cm_ctl res --list | grep \"VIP\" | awk -F \"|\" '{print $1}'" \
" | xargs -i cm_ctl res --list --res_name={} --list_inst" \
" | grep \"base_ip=%s\" | awk -F \"|\" '{print $1}'" % base_ip
stat, out= subprocess.getstatusoutput(cmd)
if stat != 0:
raise Exception("Failed to get res name. Cmd: \n%s" % cmd)
if not out:
return "", -1, -1
res_name = out.strip()
# Get intance ID
cmd = "cm_ctl res --list --res_name=\"%s\" --list_inst | grep " \
"\"base_ip=%s\" | awk -F \"|\" '{print $4}'" % (res_name, base_ip)
stat, out= subprocess.getstatusoutput(cmd)
if stat != 0 or not out:
raise Exception("Failed to get the intance ID. Cmd: \n%s" % cmd)
inst_id = int(out.strip())
# Get the number of instances contained in a resource
cmd = "cm_ctl res --list --res_name=\"%s\" --list_inst" \
" | grep \"VIP\" | wc -l" % res_name
stat, out= subprocess.getstatusoutput(cmd)
if stat != 0 or not out:
raise Exception("Failed to get the number of instances. Cmd: %s" % cmd)
inst_num = int(out.strip())
self.logger.log("CM resource info: res_name=%s,inst_id=%d,inst_num=%d q" \
"" % (res_name, inst_id, inst_num))
return res_name, inst_id, inst_num
def get_reduce_cm_res_cmd(self, base_ips):
"""
Get reduce cm resource information cmd for VIP
"""
cmd_list = []
for base_ip in base_ips:
res_name, inst_id, inst_num = self._get_cm_res_info(base_ip)
if not res_name:
self.logger.log("The base IP is not found: %s" % base_ip)
continue
if inst_num > 1:
cmd_list.append(str(VipCmResCtrlCmd("del_inst", res_name,
inst=VipDelInst(inst_id))))
else:
cmd_list.append(str(VipCmResCtrlCmd("del_res", res_name)))
cmd = "source %s; %s" % (EnvUtil.getMpprcFile(), ' ;'.join(cmd_list))
self.logger.log("Reduce cm resource information cmd: \n%s" % cmd)
return cmd
    def config_cm_res_json(self, base_ips, cm_res_info):
        """
        Config cm resource file for vip: first remove instances/resources for
        base_ips, then add those described by cm_res_info, then run
        "cm_ctl res --check" to validate the result.

        Raises Exception when either parameter set is empty or any step fails.
        """
        if not base_ips and not cm_res_info:
            raise Exception("The parameters cannot be empty at the same time")
        # Step 1: remove stale VIP instances/resources.
        cmd = self.get_reduce_cm_res_cmd(base_ips)
        stat, out = subprocess.getstatusoutput(cmd)
        if stat != 0:
            raise Exception("Failed to reduce the CM resource for VIP." \
                            " Cmd: \n%s, Error: \n%s" % (cmd, str(out)))
        # Step 2: add the requested VIP resources/instances.
        cmd = self.get_add_cm_res_cmd(cm_res_info)
        stat, out = subprocess.getstatusoutput(cmd)
        if stat != 0:
            raise Exception("Failed to add the CM resource for VIP." \
                            " Cmd: \n%s, Error: \n%s" % (cmd, str(out)))
        # Step 3: let cm_ctl validate the resulting resource file.
        cmd = "cm_ctl res --check"
        stat, out = subprocess.getstatusoutput(cmd)
        if stat != 0:
            raise Exception("Failed to config the CM resource file for VIP." \
                            " Cmd: \n%s, Error: \n%s" % (cmd, str(out)))