# -*- coding:utf-8 -*-
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
import os
import sys
import subprocess
import time
import timeit
import json
import csv
import traceback
import copy
from datetime import datetime, timedelta
from gspylib.common.Common import DefaultValue, ClusterCommand, \
ClusterInstanceConfig
from gspylib.common.DbClusterInfo import instanceInfo, \
dbNodeInfo, dbClusterInfo, compareObject
from gspylib.common.OMCommand import OMCommand
from gspylib.common.ErrorCode import ErrorCode
from gspylib.threads.SshTool import SshTool
from gspylib.common.DbClusterStatus import DbClusterStatus
from gspylib.os.gsfile import g_file
from gspylib.inspection.common import SharedFuncs
from gspylib.component.CM.CM_OLAP.CM_OLAP import CM_OLAP
from impl.upgrade.UpgradeConst import GreyUpgradeStep
import impl.upgrade.UpgradeConst as const
from base_utils.executor.cmd_executor import CmdExecutor
from base_utils.executor.local_remote_cmd import LocalRemoteCmd
from base_utils.os.cmd_util import CmdUtil
from domain_utils.cluster_file.cluster_dir import ClusterDir
from base_utils.os.env_util import EnvUtil
from base_utils.os.file_util import FileUtil
from domain_utils.cluster_file.package_info import PackageInfo
from domain_utils.cluster_file.version_info import VersionInfo
from domain_utils.sql_handler.sql_result import SqlResult
from base_utils.os.net_util import NetUtil
class OldVersionModules():
"""
class: old version modules
"""
def __init__(self):
"""
function: constructor
"""
# old cluster information
self.oldDbClusterInfoModule = None
# old cluster status
self.oldDbClusterStatusModule = None
class UpgradeImpl:
"""
    Class: The class is used to perform upgrade
"""
def __init__(self, upgrade):
"""
function: constructor
"""
self.dnInst = None
self.dnStandbyInsts = []
self.context = upgrade
self.newCommitId = ""
self.oldCommitId = ""
self.isLargeInplaceUpgrade = False
self.__upgrade_across_64bit_xid = False
self.action = upgrade.action
def exitWithRetCode(self, action, succeed=True, msg=""):
"""
        function: should be called after command line parameter check
        input : action, succeed, msg
output: NA
"""
        #########################################
        # doUpgrade return codes:
        #   binary-upgrade   success: 0  failure: 1
        #   binary-rollback  success: 2  failure: 3
        #   commit-upgrade   success: 5  failure: 1
        #
        # choseStrategy return codes:
        #   success: 4  failure: 1
        #########################################
if not succeed:
if action == const.ACTION_AUTO_ROLLBACK:
retCode = 3
else:
retCode = 1
elif action in [const.ACTION_SMALL_UPGRADE,
const.ACTION_LARGE_UPGRADE,
const.ACTION_INPLACE_UPGRADE]:
retCode = 0
elif action == const.ACTION_AUTO_ROLLBACK:
retCode = 2
elif action == const.ACTION_CHOSE_STRATEGY:
retCode = 4
elif action == const.ACTION_COMMIT_UPGRADE:
retCode = 5
else:
retCode = 1
if msg != "":
if self.context.logger is not None:
if succeed:
self.context.logger.log(msg)
else:
self.context.logger.error(msg)
else:
print(msg)
sys.exit(retCode)
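    # Illustrative note: per the table above, a failed auto-rollback exits
    # with 3 and a successful commit-upgrade with 5, so a wrapper script
    # can branch on the return value of gs_upgradectl.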
def initGlobalInfos(self):
"""
function: init global infos
input : NA
output: NA
"""
self.context.logger.debug("Init global infos", "addStep")
self.context.sshTool = SshTool(
self.context.clusterNodes, self.context.localLog,
DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE)
self.initClusterConfig()
self.context.logger.debug("Successfully init global infos", "constant")
def setClusterDetailInfo(self):
"""
function: set cluster detail info
input : NA
output : NA
"""
for dbNode in self.context.clusterInfo.dbNodes:
dbNode.setDnDetailNum()
#self.context.clusterInfo.setClusterDnCount()
def removeOmRollbackProgressFile(self):
"""
        function: remove om rollback progress file
input : NA
output : NA
"""
self.context.logger.debug("Remove the om rollback"
" record progress file.")
fileName = os.path.join(self.context.tmpDir,
".upgrade_task_om_rollback_result")
cmd = "(if [ -f '%s' ];then rm -f '%s';fi)" % (fileName, fileName)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
def initOmRollbackProgressFile(self):
"""
        function: init om rollback progress file
input : NA
output : NA
"""
filePath = os.path.join(self.context.tmpDir,
".upgrade_task_om_rollback_result")
cmd = "echo \"OM:RUN\" > %s" % filePath
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
self.context.logger.debug("The cmd is %s " % cmd)
raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] % filePath
+ "Error: \n%s" % str(output))
if (not self.context.isSingle):
# send file to remote nodes
self.context.sshTool.scpFiles(filePath, self.context.tmpDir)
self.context.logger.debug("Successfully write file %s." % filePath)
def run(self):
"""
function: Do upgrade
input : NA
output: NA
"""
# the action may be changed in each step,
# if failed in auto-rollback,
# we will check if we need to rollback
action = self.context.action
# upgrade backup path
self.context.tmpDir = EnvUtil.getTmpDirFromEnv(self.context.user)
if self.context.tmpDir == "":
raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"] % "$PGHOST")
self.context.upgradeBackupPath = \
"%s/%s" % (self.context.tmpDir, "binary_upgrade")
try:
self.initGlobalInfos()
self.removeOmRollbackProgressFile()
self.commonCheck()
# 4. get upgrade type
# After choseStrategy, it will assign action to self.context.action
# to do full-upgrade or binary-upgrade
if self.context.action == const.ACTION_AUTO_UPGRADE:
self.context.action = self.choseStrategy()
self.context.logger.debug(
"%s execution takes %s steps in total" % (
const.GS_UPGRADECTL, ClusterCommand.countTotalSteps(
const.GS_UPGRADECTL, self.context.action)))
# If get upgrade strategy failed,
# then try to get rollback strategy.
# Set strategyFlag as True to check
# upgrade parameter is correct or not
if self.context.action in [const.ACTION_LARGE_UPGRADE,
const.ACTION_SMALL_UPGRADE]:
self.doGreyBinaryUpgrade()
else:
self.doInplaceBinaryUpgrade()
# After choseStrategy, it will assign action to self.context.action
elif self.context.action == const.ACTION_AUTO_ROLLBACK:
# because if we rollback with auto rollback,
# we will rollback all the nodes,
# but if we rollback under upgrade,
# we will only rollback specified nodes
self.context.action = self.choseStrategy()
self.context.rollback = True
if self.context.oldClusterNumber < const.RELMAP_4K_VERSION and self.context.forceRollback:
errMsg = "could not do force rollback in this version: %s" % self.context.oldClusterNumber
self.context.logger.log(errMsg)
self.exitWithRetCode(action, False, errMsg)
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK,
self.doInplaceBinaryRollback())
else:
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK,
self.doGreyBinaryRollback(
const.ACTION_AUTO_ROLLBACK))
elif self.context.action == const.ACTION_COMMIT_UPGRADE:
self.context.action = self.choseStrategy()
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.doInplaceCommitUpgrade()
else:
self.doGreyCommitUpgrade()
else:
self.doChoseStrategy()
except Exception as e:
self.context.logger.debug(traceback.format_exc() + str(e))
if not self.context.sshTool:
self.context.sshTool = SshTool(
self.context.clusterNodes, self.context.logger,
DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE)
if action == const.ACTION_AUTO_ROLLBACK and \
self.checkBakPathNotExists():
self.context.logger.log("No need to rollback.")
self.exitWithRetCode(action, True)
else:
self.context.logger.error(str(e))
self.exitWithRetCode(action, False, str(e))
def commonCheck(self):
"""
Check in the common process.
:return:
"""
self.checkReadOnly()
if self.context.is_grey_upgrade:
self.getOneDNInst(checkNormal=True)
self.checkUpgradeMode()
def checkReadOnly(self):
"""
        under grey upgrade, grey upgrade commit or grey upgrade rollback,
        check whether the cluster is in read-only mode; if it is, set
        default_transaction_read_only to off
"""
try:
self.context.logger.debug("Check if in read only mode.")
greyUpgradeFlagFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
# only used under grey upgrade, grey upgrade commit or grey upgrade
# rollback if under grey upgrade, the flag file
# greyUpgradeFlagFile has not been created
# so we use is_inplace_upgrade to judge the mode
if (self.context.action == const.ACTION_AUTO_UPGRADE and
not self.context.is_inplace_upgrade or
(os.path.isfile(greyUpgradeFlagFile) and
self.context.action in [const.ACTION_AUTO_ROLLBACK,
const.ACTION_COMMIT_UPGRADE])):
if self.unSetClusterReadOnlyMode() != 0:
raise Exception("NOTICE: "
+ ErrorCode.GAUSS_529["GAUSS_52907"])
except Exception as e:
raise Exception(str(e))
def checkUpgradeMode(self):
"""
        used to check whether upgrade_mode is 0 before upgrade;
        if not, we set it to 0
"""
tempPath = self.context.upgradeBackupPath
filePath = os.path.join(tempPath, const.INPLACE_UPGRADE_STEP_FILE)
if self.context.action == const.ACTION_AUTO_UPGRADE \
and not os.path.exists(filePath):
try:
self.setUpgradeMode(0)
self.context.logger.log(
"Successfully set upgrade_mode to 0.")
except Exception as e:
self.context.logger.log("Failed to set upgrade_mode to 0, "
"please set it manually, "
"or rollback first.")
raise Exception(str(e))
def checkBakPathNotExists(self):
"""
        check whether the binary_upgrade backup path exists on any node
        :return: True if it does not exist on any node
"""
try:
cmd = "if [ -d '%s' ]; then echo 'GetDir'; else echo 'NoDir'; fi" \
% self.context.upgradeBackupPath
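            # Each node echoes 'GetDir' or 'NoDir'; the ssh output is
            # aggregated, so finding 'GetDir' anywhere means the backup
            # path still exists on at least one node.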
self.context.logger.debug("Command for checking if upgrade bak "
"path exists: %s" % cmd)
outputCollect = self.context.sshTool.getSshStatusOutput(cmd)[1]
if outputCollect.find('GetDir') >= 0:
self.context.logger.debug("Checking result: %s"
% outputCollect)
return False
self.context.logger.debug("Path %s does not exists on all node."
% self.context.upgradeBackupPath)
return True
except Exception:
self.context.logger.debug("Failed to check upgrade bak path.")
return False
def doChoseStrategy(self):
"""
        function: choose the strategy for upgrade
input : NA
output: NA
"""
self.context.logger.debug("Choosing strategy.")
try:
self.context.action = self.choseStrategy()
# we only support binary-upgrade.
if self.context.action in [const.ACTION_SMALL_UPGRADE,
const.ACTION_LARGE_UPGRADE]:
self.exitWithRetCode(const.ACTION_CHOSE_STRATEGY,
True,
"Upgrade strategy: %s."
% self.context.action)
# Use inplace upgrade under special case
else:
self.exitWithRetCode(const.ACTION_CHOSE_STRATEGY,
True,
"Upgrade strategy: %s."
% self.context.action)
except Exception as e:
self.exitWithRetCode(const.ACTION_CHOSE_STRATEGY, False, str(e))
self.context.logger.debug("Successfully got the upgrade strategy.")
def choseStrategy(self):
"""
        function: choose upgrade strategy
input : NA
output: NA
"""
upgradeAction = None
try:
# get new cluster info
newVersionFile = VersionInfo.get_version_file()
newClusterVersion, newClusterNumber, newCommitId = \
VersionInfo.get_version_info(newVersionFile)
gaussHome = ClusterDir.getInstallDir(self.context.user)
if gaussHome == "":
raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"]
% "$GAUSSHOME")
if not os.path.islink(gaussHome):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52915"])
newPath = gaussHome + "_%s" % newCommitId
# new app dir should exist after preinstall,
# then we can use chose strategy
if not os.path.exists(newPath):
if self.context.action != const.ACTION_AUTO_ROLLBACK:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"]
% newPath)
self.context.logger.debug(
"Successfully obtained version information"
" of new clusters by %s." % newVersionFile)
            # get the old cluster info; if binary_upgrade does not exist,
            # try to copy it from other nodes
oldPath = self.getClusterAppPath(const.OLD)
if oldPath == "":
self.context.logger.debug("Cannot get the old install "
"path from table and file.")
oldPath = os.path.realpath(gaussHome)
self.context.logger.debug("Old cluster app path is %s" % oldPath)
oldVersionFile = "%s/bin/upgrade_version" % oldPath
try:
(oldClusterVersion, oldClusterNumber, oldCommitId) = \
VersionInfo.get_version_info(oldVersionFile)
self.context.logger.debug("Successfully obtained version"
" information of old clusters by %s."
% oldVersionFile)
except Exception as e:
if os.path.exists(self.context.upgradeBackupPath):
# if upgradeBackupPath exist,
# it means that we do rollback first.
# and we get cluster version from the backup file
                    possibleOldVersionFile = "%s/old_upgrade_version" \
                                             % self.context.upgradeBackupPath
                    self.context.logger.debug(str(e))
                    self.context.logger.debug(
                        "Try to get the version information from %s."
                        % possibleOldVersionFile)
                    (oldClusterVersion, oldClusterNumber, oldCommitId) = \
                        VersionInfo.get_version_info(possibleOldVersionFile)
else:
raise Exception(str(e))
# if last success commit upgrade_type is grey upgrade,
# the symbolic link should point to the
# old app path with old commit id
if oldCommitId == newCommitId:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52901"])
self.context.logger.debug(
"Successfully obtained version information of new and old "
"clusters.\n The old cluster number:%s, the new "
"cluster number:%s." % (oldClusterNumber, newClusterNumber))
self.canDoRollbackOrCommit()
if oldClusterVersion > newClusterVersion:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52902"]
% (oldClusterVersion, newClusterVersion))
self.checkLastUpgrade(newCommitId)
if float(newClusterNumber) < float(oldClusterNumber):
raise Exception(ErrorCode.GAUSS_516["GAUSS_51629"]
% newClusterNumber)
elif float(newClusterNumber) == float(oldClusterNumber):
if self.context.is_inplace_upgrade:
upgradeAction = const.ACTION_INPLACE_UPGRADE
else:
upgradeAction = const.ACTION_SMALL_UPGRADE
else:
if int(float(newClusterNumber)) > int(float(oldClusterNumber)):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52904"]
+ "This cluster version is "
"not supported upgrade.")
elif ((float(newClusterNumber) - int(float(newClusterNumber)))
> (float(oldClusterNumber) -
int(float(oldClusterNumber)))):
if self.context.is_inplace_upgrade:
upgradeAction = const.ACTION_INPLACE_UPGRADE
self.isLargeInplaceUpgrade = True
else:
upgradeAction = const.ACTION_LARGE_UPGRADE
else:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51629"]
% newClusterNumber)
self.context.logger.debug("The matched upgrade strategy is: %s."
% upgradeAction)
self.context.newClusterVersion = newClusterVersion
self.context.newClusterNumber = newClusterNumber
self.context.oldClusterVersion = oldClusterVersion
self.context.oldClusterNumber = oldClusterNumber
self.context.newClusterAppPath = newPath
self.context.oldClusterAppPath = oldPath
self.newCommitId = newCommitId
self.oldCommitId = oldCommitId
return upgradeAction
except Exception as e:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52900"] % str(e)
+ " Do nothing this time.")
def canDoRollbackOrCommit(self):
"""
Check whether rollback or commit is required.
:return:
"""
try:
if self.context.action == const.ACTION_AUTO_ROLLBACK or \
self.context.action == const.ACTION_COMMIT_UPGRADE:
inplaceUpgradeFlagFile = os.path.join(
self.context.upgradeBackupPath,
const.INPLACE_UPGRADE_FLAG_FILE)
grayUpgradeFlagFile = os.path.join(
self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
self.context.is_inplace_upgrade = False
# we do rollback by the backup directory
if os.path.isfile(inplaceUpgradeFlagFile):
self.context.logger.debug("inplace upgrade flag exists, "
"use inplace rollback or commit.")
self.context.is_inplace_upgrade = True
if os.path.isfile(grayUpgradeFlagFile):
self.context.logger.debug("grey upgrade flag exists, "
"use grey rollback or commit.")
self.context.is_grey_upgrade = True
if not (self.context.is_inplace_upgrade or
self.context.is_grey_upgrade):
if self.context.action == const.ACTION_AUTO_ROLLBACK \
and not self.checkBakPathNotExists():
self.cleanBinaryUpgradeBakFiles(True)
exitMsg = "No need to {0}".format(self.context.action)
self.exitWithRetCode(self.context.action, True, exitMsg)
except Exception as e:
raise Exception("Failed to check whether the rollback or commit."
" Error {0}".format(str(e)))
def checkLastUpgrade(self, newCommitId):
"""
        check that the last failed upgrade type and upgrade version are
        the same as this time. If the last attempt was an inplace upgrade,
        we must roll back first; if it was a grey upgrade, we can simply
        upgrade again
"""
if self.context.action == const.ACTION_AUTO_UPGRADE:
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
cmd = "if [ -f '%s' ]; then echo 'True';" \
" else echo 'False'; fi" % stepFile
(resultMap, outputCollect) = \
self.context.sshTool.getSshStatusOutput(cmd)
self.context.logger.debug(
"The result of checking grey upgrade step flag"
" file on all nodes is:\n%s" % outputCollect)
if self.context.is_inplace_upgrade:
                # if a grey upgrade rollback failed, the step file still
                # exists, so inplace upgrade cannot be performed now
if outputCollect.find('True') >= 0:
ermsg = ErrorCode.GAUSS_502["GAUSS_50200"] \
% const.GREY_UPGRADE_STEP_FILE \
+ "In grey upgrade process, " \
"cannot do inplace upgrade!"
raise Exception(str(ermsg))
else:
inplace_upgrade_flag_file =\
"%s/inplace_upgrade_flag" % self.context.upgradeBackupPath
if os.path.isfile(inplace_upgrade_flag_file):
ermsg = ErrorCode.GAUSS_502["GAUSS_50200"] % \
inplace_upgrade_flag_file + \
"In inplace upgrade process, " \
"cannot do grey upgrade!"
raise Exception(ermsg)
# it may have remaining when last upgrade use
# --force to forceRollback
self.checkBakPathAndTable(outputCollect)
self.checkNewCommitid(newCommitId)
elif self.context.action == const.ACTION_AUTO_ROLLBACK or \
self.context.action == const.ACTION_COMMIT_UPGRADE:
self.checkNewCommitid(newCommitId)
def checkBakPathAndTable(self, outputCollect):
"""
        if the step record file does not exist on any node but the record
        table exists, the table is left over from the last upgrade and the
        support schema is dropped; if both the table and the step file
        exist, check whether the content is correct
        :param outputCollect:
        :return:
"""
# no need to check and drop schema under force upgrade
if not self.existTable(const.RECORD_NODE_STEP):
return
output = outputCollect.split('\n')
output = output[:-1]
findBakPath = False
for record in output:
            # a 'True' record means the step file exists on that node
if record.find('True') >= 0:
findBakPath = True
break
if not findBakPath:
self.dropSupportSchema()
return
def checkNewCommitid(self, newCommitId):
"""
        the commit id comes from version.cfg; it should match the commit
        id recorded in the new app directory record file
        :param newCommitId: commit id from version.cfg (line 3)
:return: NA
"""
newPath = self.getClusterAppPath(const.NEW)
if newPath != "":
LastNewCommitId = newPath[-8:]
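            # The app path is built as gaussHome + "_<commitId>" (see
            # choseStrategy), so its trailing 8 characters are the
            # recorded commit id.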
# When repeatedly run gs_upgradectl script,
# this time upgrade version should be same
# with last record upgrade version
if newCommitId != LastNewCommitId:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52935"])
def setGUCValue(self, guc_key, guc_value, action_type="reload"):
"""
        function: set a GUC parameter via gs_guc
        input : guc_key - parameter name
                guc_value - parameter value
                action_type - guc action type (set/reload), default 'reload'
"""
tmp_file = ""
if guc_value != "":
guc_str = "%s='%s'" % (guc_key, guc_value)
else:
guc_str = "%s" % guc_key
try:
self.context.logger.debug("Start to set GUC value %s." % guc_str)
cmd = "%s -t %s -U %s --upgrade_bak_path=%s --guc_string=\"%s\" -l %s --setType=%s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_SET_GUC_VALUE,
self.context.user,
self.context.upgradeBackupPath,
guc_str,
self.context.localLog,
action_type)
if action_type == "reload":
tmp_file = os.path.join(EnvUtil.getTmpDirFromEnv(self.context.user),
const.TMP_DYNAMIC_DN_INFO)
self.generateDynamicInfoFile(tmp_file)
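                # "reload" targets a running cluster, so a temporary
                # dynamic DN info file is generated first; it is cleaned
                # up in the finally block below.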
self.context.logger.debug("Cmd for setting parameter: %s." % cmd)
host_list = copy.deepcopy(self.context.clusterNodes)
self.context.execCommandInSpecialNode(cmd, host_list)
self.context.logger.debug("Successfully set guc value.")
except Exception as er:
if self.context.forceRollback:
self.context.logger.debug("WARNING: failed to set value %s." % guc_str)
else:
raise Exception(str(er))
finally:
if os.path.exists(tmp_file):
delete_cmd = "(if [ -f '%s' ]; then rm -f '%s'; fi) " % \
(tmp_file, tmp_file)
host_list = copy.deepcopy(self.context.clusterNodes)
self.context.execCommandInSpecialNode(delete_cmd, host_list)
def setClusterReadOnlyMode(self):
"""
function: Set the cluster read-only mode
input : NA
output: 0 successfully
1 failed
"""
try:
self.context.logger.debug("Setting up the cluster read-only mode.")
self.setGUCValue("default_transaction_read_only", "true")
self.context.logger.debug("successfully set the cluster read-only mode.")
return 0
except Exception as e:
self.context.logger.debug("WARNING: Failed to set default_transaction_read_only "
"parameter. %s" % str(e))
return 1
def unSetClusterReadOnlyMode(self):
"""
function: Canceling the cluster read-only mode
input : NA
output: 0 successfully
1 failed
"""
try:
self.context.logger.debug("Canceling the cluster read-only mode.")
self.setGUCValue("default_transaction_read_only", "false")
self.context.logger.debug("Successfully cancelled the cluster read-only mode.")
return 0
except Exception as e:
self.context.logger.debug("WARNING: Failed to set default_transaction_read_only "
"parameter. %s" % str(e))
return 1
def stopCluster(self):
"""
function: Stopping the cluster
input : NA
output: NA
"""
self.context.logger.debug("Stopping the cluster.", "addStep")
# Stop cluster applications
cmd = "%s -U %s -R %s -t %s" % (
OMCommand.getLocalScript("Local_StopInstance"),
self.context.user, self.context.clusterInfo.appPath,
const.UPGRADE_TIMEOUT_CLUSTER_STOP)
self.context.logger.debug("Command for stop cluster: %s" % cmd)
CmdExecutor.execCommandWithMode(
cmd, self.context.sshTool,
self.context.isSingle or self.context.localMode,
self.context.mpprcFile)
self.context.logger.debug("Successfully stopped cluster.")
def startCluster(self):
"""
function: start cluster
input : NA
output: NA
"""
versionFile = os.path.join(
self.context.oldClusterAppPath, "bin/upgrade_version")
if os.path.exists(versionFile):
_, number, _ = VersionInfo.get_version_info(versionFile)
cmd = "%s -U %s -R %s -t %s --cluster_number=%s" % (
OMCommand.getLocalScript("Local_StartInstance"),
self.context.user, self.context.clusterInfo.appPath,
const.UPGRADE_TIMEOUT_CLUSTER_START, number)
else:
cmd = "%s -U %s -R %s -t %s" % (
OMCommand.getLocalScript("Local_StartInstance"),
self.context.user, self.context.clusterInfo.appPath,
const.UPGRADE_TIMEOUT_CLUSTER_START)
CmdExecutor.execCommandWithMode(
cmd, self.context.sshTool,
self.context.isSingle or self.context.localMode,
self.context.mpprcFile)
self.context.logger.log("Successfully started cluster.")
def createCommitFlagFile(self):
"""
        function: create a flag file; if this file exists, it means the
                  user has called the commit interface but it has not yet
                  finished. If creation fails, the script should exit.
input : NA
output: NA
"""
commitFlagFile = "%s/commitFlagFile" % self.context.upgradeBackupPath
self.context.logger.debug("Start to create the commit flag file.")
try:
cmd = "(if [ -d '%s' ]; then touch '%s'; fi) " % (
self.context.upgradeBackupPath, commitFlagFile)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50206"]
% ("commit flag file: %s" % str(e)))
self.context.logger.debug("Successfully created the commit flag file.")
def checkCommitFlagFile(self):
"""
function: check if commit flag file exists.
input : NA
output: return 0, If there is the file commitFlagFile.
else, return 1
"""
commitFlagFile = "%s/commitFlagFile" % self.context.upgradeBackupPath
if (os.path.isfile(commitFlagFile)):
return 0
else:
return 1
def createInplaceUpgradeFlagFile(self):
"""
        function: create the inplace upgrade flag file on
                  all nodes if doing inplace upgrade
1.check if is inplace upgrade
2.get new and old cluster version number
3.write file
Input: NA
output : NA
"""
self.context.logger.debug("Start to create inplace upgrade flag file.")
try:
newClusterNumber = self.context.newClusterNumber
oldClusterNumber = self.context.oldClusterNumber
inplace_upgrade_flag_file = "%s/inplace_upgrade_flag" % \
self.context.upgradeBackupPath
FileUtil.createFile(inplace_upgrade_flag_file)
FileUtil.writeFile(inplace_upgrade_flag_file,
["newClusterNumber:%s" % newClusterNumber], 'a')
FileUtil.writeFile(inplace_upgrade_flag_file,
["oldClusterNumber:%s" % oldClusterNumber], 'a')
if (not self.context.isSingle):
self.context.sshTool.scpFiles(inplace_upgrade_flag_file,
self.context.upgradeBackupPath)
if float(self.context.oldClusterNumber) <= float(
const.UPGRADE_VERSION_64bit_xid) < \
float(self.context.newClusterNumber):
self.__upgrade_across_64bit_xid = True
self.context.logger.debug("Successfully created inplace"
" upgrade flag file.")
except Exception as e:
raise Exception(str(e))
def setUpgradeFromParam(self, cluster_version_number, is_check=True):
"""
        function: set the cmagent upgrade_from parameter
        Input : cluster_version_number, is_check
output : NA
"""
if not DefaultValue.get_cm_server_num_from_static(self.context.oldClusterInfo) > 0:
self.context.logger.debug("No need to set cm parameter.")
return
self.context.logger.debug("Set upgrade_from guc parameter.")
working_grand_version = int(float(cluster_version_number) * 1000)
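        # The cluster number is scaled to an integer for the GUC; e.g. a
        # cluster_version_number of "92.5" yields upgrade_from=92500
        # (value illustrative).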
cmd = "gs_guc set -Z cmagent -N all -I all -c 'upgrade_from=%s'" % working_grand_version
self.context.logger.debug("setting cmagent parameter: %s." % cmd)
try:
(status, output) = CmdUtil.retryGetstatusoutput(cmd)
if status != 0:
self.context.logger.debug("Set upgrade_from failed. "
"cmd:%s\nOutput:%s" % (cmd, str(output)))
raise Exception(
ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error: \n%s" % str(output))
if is_check:
gucStr = "%s:%s" % ("upgrade_from", str(working_grand_version).strip())
self.checkParam(gucStr, True)
self.context.logger.debug("Successfully set cmagent parameter "
"upgrade_from=%s." % working_grand_version)
except Exception as er:
if self.context.action == const.ACTION_INPLACE_UPGRADE or \
not self.context.forceRollback:
raise Exception(str(er))
self.context.logger.log("NOTICE: Failed to set upgrade_from, "
"please set it manually with command: \n%s" % str(cmd))
def setUpgradeMode(self, mode, set_type="reload"):
"""
        function: set the upgrade_mode parameter
        Input : mode, set_type
output : NA
"""
try:
self.setUpgradeModeGuc(mode, set_type)
except Exception as er:
if self.context.action != const.ACTION_INPLACE_UPGRADE and \
not self.context.forceRollback:
raise Exception(str(er))
try:
self.setUpgradeModeGuc(mode, "set")
except Exception as _:
self.context.logger.log("NOTICE: Failed to set upgrade_mode to {0}, "
"please set it manually.".format(mode))
def setUpgradeModeGuc(self, mode, set_type="reload"):
"""
function: set upgrade mode guc
input : mode, setType
output : NA
"""
self.context.logger.debug("Set upgrade_mode guc parameter.")
cmd = "gs_guc %s -Z datanode -I all -c 'upgrade_mode=%d'" % (set_type, mode)
self.context.logger.debug("Command for setting database"
" node parameter: %s." % cmd)
retry_count = 0
while retry_count < 5:
try:
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool)
break
except Exception as _:
retry_count += 1
if retry_count < 5:
time.sleep(5)
continue
guc_str = "upgrade_mode:%d" % mode
self.checkParam(guc_str)
self.context.logger.debug("Successfully set "
"upgrade_mode to %d." % mode)
def checkParam(self, gucStr, fromFile=False):
"""
function: check the cmagent guc value
Input : gucStr the guc key:value string
output : NA
"""
self.context.logger.debug("Start to check GUC value %s." % gucStr)
try:
# send cmd to that node and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s" \
" --guc_string=\"%s\" -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CHECK_GUC,
self.context.user,
self.context.upgradeBackupPath,
gucStr,
self.context.localLog)
if fromFile:
cmd += " --fromFile"
self.context.logger.debug("Command for checking"
" parameter: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.debug("Successfully checked guc value.")
except Exception as e:
raise Exception(str(e))
def floatMoreThan(self, numOne, numTwo):
"""
function: float more than
input : numOne, numTwo
output : True/False
"""
if float(numOne) - float(numTwo) > float(const.DELTA_NUM):
return True
return False
def floatEqualTo(self, numOne, numTwo):
"""
function: float equal to
input: numOne, numTwo
output: True/False
"""
if float(-const.DELTA_NUM) < (float(numOne) - float(numTwo)) \
< float(const.DELTA_NUM):
return True
return False
def floatGreaterOrEqualTo(self, numOne, numTwo):
"""
function: float greater or equal to
input: numOne, numTwo
output: True/False
"""
if self.floatMoreThan(numOne, numTwo) or \
self.floatEqualTo(numOne, numTwo):
return True
return False
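    # These three helpers compare cluster numbers with a tolerance of
    # const.DELTA_NUM instead of exact float equality, so values such as
    # "92.298" parsed from different files compare as expected despite
    # binary floating-point rounding.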
def reloadVacuumDeferCleanupAge(self):
"""
        function: reload the guc parameter vacuum_defer_cleanup_age on
                  inplace upgrade or grey large upgrade
input : NA
"""
self.setGUCValue("vacuum_defer_cleanup_age", "100000", "reload")
def doGreyBinaryUpgrade(self):
"""
        function: do grey binary upgrade, which essentially replaces the
        binary files. For versions newer than 91.255 this strategy is
        supported instead of inplace binary upgrade: symbolic links are
        used to switch the binary directory rather than installing the
        new bin in the same directory. A minority of nodes is chosen to
        upgrade first, then observed, to decide whether to upgrade the
        remaining nodes or roll back the grey nodes
input : NA
output: NA
"""
upgradeAgain = False
try:
            # 1. distribute the xml configuration file to every node.
self.distributeXml()
            # 2. check that the app path is ready, the sha256 is right, etc.
self.checkUpgrade()
# 4. check the cluster pressure
self.HASyncReplayCheck()
            # 5. before doing grey binary upgrade, we must make sure the
            # cluster is Normal and the database can be
            # connected; if not, exit.
(status, output) = self.doHealthCheck(const.OPTION_PRECHECK)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"] %
"cluster" + "Detail: " + output)
            # 6. choose the node name list that satisfies the condition
            # as upgrade nodes
self.chooseUpgradeNodes()
            # check if it satisfies upgrade-again; if this is the second
            # upgrade loop, it can go to the upgrade-again branch
upgradeAgain = self.canUpgradeAgain()
except Exception as e:
            # before this step, the upgrade process has done nothing to
            # the cluster, so this run leaves nothing behind
self.context.logger.debug(traceback.format_exc())
self.context.logger.log(ErrorCode.GAUSS_529["GAUSS_52934"] +
"Nodes are the old version.\n" +
"Error: %s." % str(e) +
" Do nothing this time.")
self.exitWithRetCode(self.action, False, str(e))
if not upgradeAgain:
try:
if not self.doGreyBinaryRollback():
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False)
self.removeOmRollbackProgressFile()
self.context.logger.log(
"The directory %s will be deleted after commit-upgrade, "
"please make sure there is no personal data." %
self.context.oldClusterAppPath)
# 7. prepare upgrade function for sync and table
# RECORD_NODE_STEP, init the step of all nodes as 0
self.prepareGreyUpgrade()
# 8. install the new bin in the appPath which has been
# prepared in the preinstall
self.installNewBin()
# decompress the catalog upgrade_sql.tar.gz to temp dir,
# include upgrade sql file and guc set
self.prepareUpgradeSqlFolder()
self.recordNodeStep(GreyUpgradeStep.STEP_UPDATE_CATALOG)
                # 9. if we updated the catalog after switching to the new
                # bin, the system would raise "cannot find catalog or
                # column" errors until updateCatalog finished, and we
                # could not tell whether the column is really missing or
                # the error is just caused by the old version. So we
                # update the catalog while still on the old version
if self.context.action == const.ACTION_LARGE_UPGRADE:
self.updateCatalog()
self.recordNodeStep(GreyUpgradeStep.STEP_SWITCH_NEW_BIN)
self.CopyCerts()
self.upgradeAgain()
except Exception as e:
errmsg = ErrorCode.GAUSS_529["GAUSS_52934"] + \
"You can use --grey to upgrade or manually rollback."
self.context.logger.log(errmsg + str(e))
self.exitWithRetCode(self.context.action, False)
else:
self.upgradeAgain()
self.exitWithRetCode(self.context.action, True)
def upgradeAgain(self):
try:
self.context.logger.debug(
"From this step, you can use -h to upgrade again if failed.")
            # we have guaranteed that the specified nodes have the same
            # step, so we only need to get one node's step
currentStep = self.getOneNodeStep(self.context.nodeNames[0])
self.context.logger.debug("Current node step is %d" % currentStep)
            # the first time grey upgrade executes, the step is recorded
            # for all the nodes; when upgrading the remaining nodes we
            # reenter the upgrade process, do not roll back automatically,
            # and just upgrade again
if currentStep < GreyUpgradeStep.STEP_UPGRADE_PROCESS:
self.backupHotpatch()
# 10. sync Cgroup configure and etc.
# use the symbolic link to change the bin dir
                # sync old config to the new bin path; pg_plugin saves the
                # C function .so files (but not ending with .so),
                # so if a C function is created in the old appPath after
                # the copy to the newAppPath but before switching to the
                # new bin, the new version may not recognize it
self.greySyncGuc()
self.greyUpgradeSyncOldConfigToNew()
# 11. switch the cluster version to new version
self.getOneDNInst(checkNormal=True)
self.switchBin(const.NEW)
# create CA for CM
self.create_ca_for_cm()
self.setNewVersionGuc()
self.recordNodeStep(GreyUpgradeStep.STEP_UPGRADE_PROCESS)
if currentStep < GreyUpgradeStep.STEP_UPDATE_POST_CATALOG:
# 12. kill the old existing process, will judge whether
# each process is the required version
self.switchExistsProcess()
self.recordNodeStep(GreyUpgradeStep.STEP_UPDATE_POST_CATALOG)
except Exception as e:
self.context.logger.log("Failed to upgrade, can use --grey to "
"upgrade again after rollback. Error: "
"%s" % str(e))
self.context.logger.debug(traceback.format_exc())
self.exitWithRetCode(self.context.action, False, str(e))
self.context.logger.log(
"The nodes %s have been successfully upgraded to new version. "
"Then do health check." % self.context.nodeNames)
try:
# 13. check the cluster status, the cluster status can be degraded
(status, output) = self.doHealthCheck(const.OPTION_POSTCHECK)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"] %
"cluster" + output)
if self.isNodeSpecifyStep(GreyUpgradeStep.STEP_UPDATE_POST_CATALOG):
# 14. exec post upgrade script
if self.context.action == const.ACTION_LARGE_UPGRADE:
self.waitClusterForNormal()
# backup global relmap file before doing upgrade-post
self.backupGlobalRelmapFile()
self.prepareSql("rollback-post")
self.execRollbackUpgradedCatalog(scriptType="rollback-post")
self.prepareSql("upgrade-post")
self.execRollbackUpgradedCatalog(scriptType="upgrade-post")
self.getLsnInfo()
hosts = copy.deepcopy(self.context.clusterNodes)
self.recordNodeStep(
GreyUpgradeStep.STEP_PRE_COMMIT, nodes=hosts)
self.printPrecommitBanner()
except Exception as e:
hintInfo = "Nodes are new version. " \
"Please check the cluster status. ERROR: \n"
self.context.logger.log(hintInfo + str(e))
self.context.logger.debug(traceback.format_exc())
self.exitWithRetCode(self.context.action, False, hintInfo + str(e))
self.context.logger.log("Successfully upgrade nodes.")
self.exitWithRetCode(self.context.action, True)
def getOneNodeStep(self, nodeName):
"""
get the node's step
"""
currentStep = self.getOneNodeStepInFile(nodeName)
return currentStep
def getOneNodeStepInFile(self, nodeName):
"""
get the node's step from step file
"""
try:
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
self.context.logger.debug(
"trying to get one node step in file %s" % stepFile)
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row['node_host'] == nodeName:
step = int(row['step'])
break
self.context.logger.debug("successfully got one node step {0} "
"in file {1}".format(step, stepFile))
return step
except Exception as e:
exitMsg = "Failed to get node step in step file. ERROR {0}".format(
str(e))
self.exitWithRetCode(self.action, False, exitMsg)
def greySyncGuc(self):
"""
delete the old version guc
"""
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_GREY_SYNC_GUC,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("Command for sync GUC in upgrade: %s" % cmd)
hostList = copy.deepcopy(self.context.nodeNames)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
self.context.logger.debug("Successfully sync guc.")
def greyUpgradeSyncOldConfigToNew(self):
"""
function: sync old cluster config to the new cluster install path
input : NA
output: NA
"""
# restore list:
# etc/gscgroup_xxx.cfg
# lib/postgresql/pg_plugin
# initdb_param
# server.key.cipher
# server.key.rand
# /share/sslsert/ca.key
# /share/sslsert/etcdca.crt
self.context.logger.log("Sync cluster configuration.")
try:
# backup DS libs and gds file
cmd = "%s -t %s -U %s -V %d --old_cluster_app_path=%s " \
"--new_cluster_app_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_GREY_UPGRADE_CONFIG_SYNC,
self.context.user,
int(float(self.context.oldClusterNumber) * 1000),
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.localLog)
self.context.logger.debug("Command for syncing config files: %s"
% cmd)
hostList = copy.deepcopy(self.context.nodeNames)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
# change the owner of application
cmd = "chown -R %s:%s '%s'" % \
(self.context.user, self.context.group,
self.context.newClusterAppPath)
hostList = copy.deepcopy(self.context.nodeNames)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
except Exception as e:
raise Exception(str(e) + " Failed to sync configuration.")
self.context.logger.log("Successfully synced cluster configuration.")
def _check_and_start_cluster(self):
"""
Check cluster state and start cluster
"""
self.context.logger.log("Check cluster state.")
cmd = "source {0};gs_om -t query".format(self.context.userProfile)
status, output = subprocess.getstatusoutput(cmd)
if status != 0:
self.context.logger.debug("Check cluster state failed. Output: {0}".format(output))
if "cluster_state : Degraded" in output or "cluster_state : Normal" in output:
self.context.logger.log("Cluster state: {0}".format(output))
return
self.context.logger.log("Cluster need start now.")
cmd = "source {0};gs_om -t start".format(self.context.userProfile)
status, output = subprocess.getstatusoutput(cmd)
if status != 0:
self.context.logger.debug("Start cluster state failed. Output: {0}".format(output))
return
self.context.logger.log("Cluster is started now.")
def switchExistsProcess(self, isRollback=False):
"""
        switch all the processes
:param isRollback:
:return:
"""
self.context.logger.log("Switching all db processes.", "addStep")
self._check_and_start_cluster()
if DefaultValue.get_cm_server_num_from_static(self.context.oldClusterInfo) > 0:
self.setUpgradeFromParam(self.context.oldClusterNumber)
self.reloadCmAgent()
self.reload_cmserver()
self.createCheckpoint()
self.switchDn(isRollback)
try:
self.waitClusterNormalDegrade()
except Exception as e:
# can't promise normal status in force upgrade or forceRollback
if self.context.forceRollback:
self.context.logger.log("WARNING: Failed to wait "
"cluster normal or degrade.")
else:
raise Exception(str(e))
self.context.logger.log("Successfully switch all process version",
"constant")
def createCheckpoint(self):
try:
self.context.logger.log("Create checkpoint before switching.")
start_time = timeit.default_timer()
# create checkpoint
sql = "CHECKPOINT;"
for i in range(10):
(status, output) = self.execSqlCommandInPrimaryDN(sql)
# no need to retry under force upgrade
if status == 0:
break
self.context.logger.debug("Waring: checkpoint creation fails "
"for the %s time. Fail message:%s."
"try again at one second intervals" %
(str(i), str(output)))
time.sleep(1)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
elapsed = timeit.default_timer() - start_time
self.context.logger.debug("Time to create checkpoint: %s" %
self.getTimeFormat(elapsed))
except Exception as e:
if self.context.forceRollback:
self.context.logger.log(
"WARNING: Failed to create checkpoint, "
"the switch process may use more time.")
else:
raise Exception(str(e))
def need_rolling(self, is_roll_back):
"""
        Determine whether the UDF subprocess needs a rolling switch,
        based on whether the cluster includes CM server instances
"""
self.context.logger.debug("Start check need rolling.")
new_static_config = os.path.realpath(os.path.join(self.context.newClusterAppPath,
"bin", "cluster_static_config"))
old_static_config = os.path.realpath(os.path.join(self.context.oldClusterAppPath,
"bin", "cluster_static_config"))
cluster_info = dbClusterInfo()
if is_roll_back:
self.context.logger.debug("This check need rolling for rollback.")
if not os.path.isfile(new_static_config):
self.context.logger.debug("Rollback not found new static config file [{0}]. "
"No need to switch UDF.".format(new_static_config))
return False
cluster_info.initFromStaticConfig(self.context.user, new_static_config)
            if cluster_info.cmscount > 0:
                self.context.logger.debug("Rollback cluster info includes a CMS instance. "
                                          "So we need to switch UDF.")
                return True
            self.context.logger.debug("Rollback new version cluster does not include a CMS "
                                      "instance. So no need to switch UDF.")
            return False
self.context.logger.debug("This check need rolling for upgrade.")
cluster_info.initFromStaticConfig(self.context.user, old_static_config)
        if cluster_info.cmscount > 0:
            self.context.logger.debug("Old cluster includes a CMS instance. "
                                      "So we need to switch UDF.")
            return True
        self.context.logger.debug("Old cluster does not include a CMS instance. "
                                  "So no need to switch UDF.")
        return False
def switchDn(self, isRollback):
self.context.logger.log("Switching DN processes.")
start_time = timeit.default_timer()
# under upgrade, kill the process from old cluster app path,
# rollback: kill from new cluster app path
cmd = "%s -t %s -U %s -V %d --old_cluster_app_path=%s " \
"--new_cluster_app_path=%s -X '%s' -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_SWITCH_DN,
self.context.user,
int(float(self.context.oldClusterNumber) * 1000),
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.xmlFile,
self.context.localLog)
if isRollback:
cmd += " --rollback"
if self.context.forceRollback:
cmd += " --force"
if self.need_rolling(isRollback):
cmd += " --rolling"
self.context.logger.debug(
"Command for switching DN processes: %s" % cmd)
hostList = copy.deepcopy(self.context.nodeNames)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
start_cluster_time = timeit.default_timer()
self.greyStartCluster()
end_cluster_time = timeit.default_timer() - start_cluster_time
self.context.logger.debug("Time to start cluster is %s" %
self.getTimeFormat(end_cluster_time))
elapsed = timeit.default_timer() - start_time
self.context.logger.debug("Time to switch DN process version: %s"
% self.getTimeFormat(elapsed))
def greyStartCluster(self):
"""
start cluster in grey upgrade
:return:
"""
self.context.logger.log("Ready to grey start cluster.")
versionFile = os.path.join(
self.context.oldClusterAppPath, "bin/upgrade_version")
if os.path.exists(versionFile):
_, number, _ = VersionInfo.get_version_info(versionFile)
cmd = "gs_om -t start --cluster-number='%s'" % (number)
else:
cmd = "gs_om -t start"
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
"Command:%s. Error:\n%s" % (cmd, output))
self.context.logger.log("Grey start cluster successfully.")
def isNodeSpecifyStep(self, step, nodes=None):
"""
        check whether all the specified nodes are at the given step
"""
return self.isNodeSpecifyStepInFile(step, nodes)
def isNodeSpecifyStepInFile(self, step=-1, nodes=None):
"""
        step = -1 means we just check whether the step is the same on all
        the specified nodes; otherwise, we check whether all the specified
        nodes are at the given step
"""
try:
if nodes:
self.context.logger.debug(
"check if the nodes %s step is %s" % (nodes, step))
else:
self.context.logger.debug(
"check if all the nodes step is %s" % step)
nodes = copy.deepcopy(self.context.clusterNodes)
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
self.context.logger.debug(
"no step file, which means nodes %s step is same" % nodes)
return True
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row['node_host'] in nodes:
if step == -1:
step = int(row['step'])
else:
if step == int(row['step']):
continue
else:
self.context.logger.debug(
"the nodes %s step is not all %s" % (
nodes, step))
return False
self.context.logger.debug(
"the nodes %s step is all %s" % (nodes, step))
return True
except Exception as e:
exitMsg = \
"Failed to check node step in file. ERROR {0}".format(str(e))
self.exitWithRetCode(self.action, False, exitMsg)
def getLsnInfo(self):
"""
Obtain the maximum LSN of each DN instance.
"""
self.context.logger.debug("Start to get lsn info.")
try:
# prepare dynamic cluster info file in every node
self.getOneDNInst(checkNormal=True)
execHosts = [self.dnInst.hostname]
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_GET_LSN_INFO,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("Command for geting lsn info: %s." % cmd)
self.context.sshTool.executeCommand(cmd, hostList=execHosts)
            self.context.logger.debug(
                "Successfully got lsn info on the instance node.")
except Exception as e:
if self.context.forceRollback:
                self.context.logger.debug(
                    "Failed to get lsn info in the force scenario.")
                return
            raise Exception(
                "Failed to get lsn info on the instance node. "
                "Error:{0}".format(str(e)))
def chooseUpgradeNodes(self):
        # self.context.nodesNum has already been set to 1
        # when the node number and node names are empty
self.context.logger.debug("Choose the nodes to be upgraded.")
self.setClusterDetailInfo()
self.context.nodeNames = self.context.clusterNodes
self.context.logger.log("Upgrade all nodes.")
def getUpgradedNodeNames(self, step=GreyUpgradeStep.STEP_INIT_STATUS):
"""
        by default, return the upgraded nodes;
        otherwise, return the nodes whose step is greater than the given
        step. Under force upgrade, we only get the step from the file
"""
return self.getUpgradedNodeNamesInFile(step)
def getUpgradedNodeNamesInFile(self, step=GreyUpgradeStep.STEP_INIT_STATUS):
"""
        get the upgraded nodes from the step file
        by default, return the upgraded nodes;
        otherwise, return the nodes whose step is greater than the given step
"""
try:
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
self.context.logger.debug(
"trying to get upgraded nodes from %s" % (stepFile))
if not os.path.isfile(stepFile):
return []
greyNodeNames = []
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if int(row['step']) > step:
greyNodeNames.append(row['node_host'])
self.context.logger.debug("upgraded nodes are {0}".format(
greyNodeNames))
return greyNodeNames
except Exception as e:
exitMsg = "Failed to get upgraded nodes from step file. " \
"ERROR {0}".format(str(e))
self.exitWithRetCode(self.action, False, exitMsg)
def existTable(self, relname):
"""
        function: check whether the table exists in pg_class
input : NA
output: NA
"""
try:
sql = "select count(*) from pg_catalog.pg_class c, " \
"pg_catalog.pg_namespace n " \
"where n.nspname = '%s' AND relname = '%s' " \
"AND c.relnamespace = n.oid;" % (
const.UPGRADE_SCHEMA, relname)
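            # The query counts matching pg_class rows inside
            # UPGRADE_SCHEMA; an output of '0' means the record table was
            # never created or was already dropped.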
self.context.logger.debug("Sql to query if has the table: %s" % sql)
(status, output) = self.execSqlCommandInPrimaryDN(sql)
if status != 0 or SqlResult.findErrorInSql(output):
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] %
sql + " Error: \n%s" % str(output))
if output == '0':
self.context.logger.debug("Table does not exist.")
return False
self.context.logger.debug("Table exists.")
return True
except Exception as e:
raise Exception(str(e))
def canUpgradeAgain(self):
"""
        judge whether we should roll back or can upgrade again:
        we can upgrade again if any node's step is at or past switching
        to the new bin
"""
self.context.logger.debug("Check if we can upgrade again.")
greyNodeNames = self.getUpgradedNodeNames(
GreyUpgradeStep.STEP_SWITCH_NEW_BIN)
if len(greyNodeNames) > 0:
            self.context.logger.debug(
                "Some nodes have a step greater than or equal to %d. "
                "Can upgrade again." % GreyUpgradeStep.STEP_SWITCH_NEW_BIN)
return True
        self.context.logger.debug(
            "No node has a step greater than or equal to %d. "
            "Cannot upgrade again." % GreyUpgradeStep.STEP_SWITCH_NEW_BIN)
return False
def prepareGreyUpgrade(self):
"""
        function: do pre-upgrade preparation for the primary and standby
                  HA sync check, and create a table to record the step
input : NA
output: NA
"""
if self.context.upgrade_remain:
self.context.logger.debug("No need to create pre-upgrade stuffs")
return
self.context.logger.debug("Start to create pre-upgrade stuffs")
# under force upgrade, we only prepare the files
self.prepareGreyUpgradeFiles()
# all stuffs done successfully, return 0
self.context.logger.debug("Successfully created pre-upgrade stuffs.")
def prepareGreyUpgradeFiles(self):
        # the bak path is created in checkUpgrade, but may be deleted
        # during rollback, so we need to check it
try:
self.context.logger.debug("start to prepare grey upgrade files")
self.createBakPath()
self.initNodeStepInCsv()
self.initUpgradeProcessStatus()
self.recordDirFile()
self.copyBakVersion()
self.context.logger.debug(
"successfully prepared grey upgrade files")
except Exception as e:
self.context.logger.debug("failed to prepare grey upgrade files")
raise Exception(str(e))
def initNodeStepInCsv(self):
bakStepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE + "_bak")
self.context.logger.debug("Create and init the file %s." % bakStepFile)
FileUtil.createFile(bakStepFile, True, DefaultValue.KEY_FILE_MODE)
header = ["node_host", "upgrade_action", "step"]
FileUtil.createFileInSafeMode(bakStepFile)
writeInfo = []
for dbNode in self.context.clusterInfo.dbNodes:
writeInfo.append([('%s' % dbNode.name),
('%s' % self.context.action),
('%s' % GreyUpgradeStep.STEP_INIT_STATUS)])
with open(bakStepFile, "w") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(header)
writer.writerows(writeInfo)
finalStepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
FileUtil.rename(bakStepFile, finalStepFile)
# so if we can get the step file, we can get the step information
self.context.logger.debug("Rename the file %s to %s." % (
bakStepFile, finalStepFile))
self.distributeFile(finalStepFile)
self.context.logger.debug("Successfully inited the file %s and "
"send it to each node." % finalStepFile)
def initUpgradeProcessStatus(self):
stepFile = os.path.join(self.context.upgradeBackupPath,
const.INPLACE_UPGRADE_STEP_FILE)
self.context.logger.debug("Create and init the file %s" % stepFile)
FileUtil.removeFile(stepFile, "python")
FileUtil.createFile(stepFile, True, DefaultValue.KEY_FILE_MODE)
self.recordNodeStepInplace(self.context.action,
GreyUpgradeStep.STEP_INIT_STATUS)
self.context.logger.debug("Successfully inited the file %s "
"and send it to each node" % stepFile)
def recordNodeStep(self, step, nodes=None):
"""
        under normal rollback, if the binary_upgrade dir does not exist,
        recordNodeStepInplace would create a file named binary_upgrade,
        so we raise an error instead and require the force rollback mode.
        For commit upgrade, we should create the dir to record the
        cannot-rollback flag to avoid node inconsistency
        :param step: upgrade or rollback step
        :param nodes: the nodes that should be set to the step
        :return: NA
"""
cmd = "if [ -d '%s' ]; then echo 'True'; else echo 'False'; fi" %\
self.context.upgradeBackupPath
hostList = copy.deepcopy(self.context.clusterNodes)
(resultMap, outputCollect) = self.context.sshTool.getSshStatusOutput(
cmd, hostList)
self.context.logger.debug(
"The result of checking distribute directory is:\n%s" %
outputCollect)
if outputCollect.find('False') >= 0:
if step != GreyUpgradeStep.STEP_BEGIN_COMMIT:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
self.context.upgradeBackupPath)
self.createBakPath()
self.recordNodeStepInplace(self.context.action, step)
# under force upgrade, we only record step to file
self.recordNodeStepInCsv(step, nodes)
self.context.logger.debug(
"Successfully record node step %s." % str(step))
def recordNodeStepInCsv(self, step, nodes=None):
if nodes is None:
nodes = []
self.context.logger.debug("Record node step %s in file" % str(step))
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
stepTempFile = os.path.join(self.context.upgradeBackupPath,
"upgrade_step_temp.csv")
FileUtil.createFileInSafeMode(stepTempFile)
with open(stepFile, 'r') as csvfile, \
open(stepTempFile, 'w') as tempfile:
header = ["node_host", "upgrade_action", "step"]
reader = csv.DictReader(csvfile)
writer = csv.writer(tempfile)
writer.writerow(header)
writeInfo = []
if not nodes:
nodes = self.context.nodeNames
if nodes:
for row in reader:
if row['node_host'] in nodes:
writeInfo.append([row['node_host'], row[
'upgrade_action'], str(step)])
else:
writeInfo.append([row['node_host'], row[
'upgrade_action'], row['step']])
else:
for row in reader:
writeInfo.append([row['node_host'],
row['upgrade_action'], str(step)])
writer.writerows(writeInfo)
FileUtil.removeFile(stepFile)
FileUtil.rename(stepTempFile, stepFile)
FileUtil.changeMode(DefaultValue.KEY_FILE_MODE, stepFile)
# distribute the node step file to each node
self.distributeFile(stepFile)
def doInplaceBinaryUpgrade(self):
"""
        function: do binary upgrade, which essentially replaces the binary files
input : NA
output: NA
"""
        # 1. distribute the new package to every node.
self.distributeXml()
# 2. check whether we should do rollback or not.
if not self.doInplaceBinaryRollback():
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False)
try:
self.checkUpgrade()
            # 3. before doing binary upgrade, we must make sure the
            # cluster is Normal and the database can be connected;
            # if not, exit.
self.start_strategy(is_final=False)
# uninstall kerberos if has already installed
pghost_path = EnvUtil.getEnvironmentParameterValue(
'PGHOST', self.context.user)
kerberosflagfile = "%s/kerberos_upgrade_flag" % pghost_path
if os.path.exists(kerberosflagfile):
self.stop_strategy(is_final=False)
self.context.logger.log("Starting uninstall Kerberos.",
"addStep")
cmd = "source %s && " % self.context.userProfile
cmd += "%s -m uninstall -U %s" % (OMCommand.getLocalScript(
"Local_Kerberos"), self.context.user)
self.context.sshTool.executeCommand(cmd)
self.context.logger.log("Successfully uninstall Kerberos.")
self.start_strategy(is_final=False)
if self.unSetClusterReadOnlyMode() != 0:
raise Exception("NOTICE: "
+ ErrorCode.GAUSS_529["GAUSS_52907"])
self.recordNodeStepInplace(const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_INIT_STATUS)
(status, output) = self.doHealthCheck(const.OPTION_PRECHECK)
if status != 0:
self.exitWithRetCode(const.ACTION_INPLACE_UPGRADE, False,
ErrorCode.GAUSS_516["GAUSS_51601"]
% "cluster" + output)
self.getOneDNInst()
            # 4. record the old and new app dir in a file
self.recordDirFile()
if self.isLargeInplaceUpgrade:
self.recordLogicalClusterName()
# 6. reload vacuum_defer_cleanup_age to new value
if self.isLargeInplaceUpgrade:
if self.__upgrade_across_64bit_xid:
self.reloadVacuumDeferCleanupAge()
if self.setClusterReadOnlyMode() != 0:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52908"])
# after checkUpgrade, the bak path is ready, we can use it now
# create inplace upgrade flag file if is doing inplace upgrade
self.createInplaceUpgradeFlagFile()
            # 7. backup current application and configuration.
            # This step is only used by binary upgrade;
            # to ensure transaction atomicity,
            # it is used together with checkUpgrade().
self.backupNodeVersion()
# For inplace upgrade, we have to perform additional checks
# and then backup catalog files.
if self.isLargeInplaceUpgrade:
self.prepareUpgradeSqlFolder()
self.HASyncReplayCheck()
self.backupOldClusterDBAndRelInfo()
# 8. stop old cluster
self.recordNodeStepInplace(const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_STOP_NODE)
self.context.logger.debug("Start to stop all instances"
" on the node.", "addStep")
self.stop_strategy(is_final=False)
self.context.logger.debug("Successfully stop all"
" instances on the node.", "constant")
            # 9. back up cluster config, including:
# cluster_static_config
# cluster_dynamic_config
# etc/gscgroup_xxx.cfg
# lib/postgresql/pg_plugin
# server.key.cipher
# server.key.rand
# Data Studio lib files
# gds files
# physical catalog files if performing inplace upgrade
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_BACKUP_VERSION)
self.backupClusterConfig()
# 10. Upgrade application on node
# install new bin file
self.recordNodeStepInplace(const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_UPGRADE_APP)
self.installNewBin()
            # 11. restore the cluster config, including:
# cluster_static_config
# cluster_dynamic_config
# etc/gscgroup_xxx.cfg
# lib/postgresql/pg_plugin
# server.key.cipher
# server.key.rand
# Data Studio lib files
# gds files
# cn cert files
# At the same time, sync newly added guc for instances
self.restoreClusterConfig()
self.syncNewGUC()
# unset cluster readonly
self.start_strategy(is_final=False)
if self.unSetClusterReadOnlyMode() != 0:
raise Exception("NOTICE: "
+ ErrorCode.GAUSS_529["GAUSS_52907"])
# flush new app dynamic configuration
dynamicConfigFile = "%s/bin/cluster_dynamic_config" % \
self.context.newClusterAppPath
if os.path.exists(dynamicConfigFile) \
and self.isLargeInplaceUpgrade:
self.refresh_dynamic_config_file()
self.context.logger.debug(
"Successfully refresh dynamic config file")
self.stop_strategy(is_final=False)
if os.path.exists(dynamicConfigFile) \
and self.isLargeInplaceUpgrade:
self.restore_dynamic_config_file()
# 12. modify GUC parameter unix_socket_directory
self.modifySocketDir()
# 13. start new cluster
self.recordNodeStepInplace(const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_START_NODE)
self.context.logger.debug("Start to start all instances"
" on the node.", "addStep")
# update catalog
# start cluster in normal mode
if self.isLargeInplaceUpgrade:
self.touchRollbackCatalogFlag()
self.updateCatalog()
self.CopyCerts()
if DefaultValue.is_create_grpc(self.context.logger, self.context.oldClusterAppPath):
self.context.createGrpcCa()
self.context.logger.debug("Successfully createGrpcCa.")
# stop cluster for switch new bin
self.stop_strategy(is_final=False)
self.switchBin(const.NEW)
# create CA for CM
self.create_ca_for_cm()
self.start_strategy(is_final=False)
if self.isLargeInplaceUpgrade:
self.modifyPgProcIndex()
self.context.logger.debug("Start to exec post upgrade script")
self.doUpgradeCatalog(postUpgrade=True)
self.context.logger.debug(
"Successfully exec post upgrade script")
self.context.logger.debug("Successfully start all "
"instances on the node.", "constant")
if self.setClusterReadOnlyMode() != 0:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52908"])
# 14. check the cluster status
(status, output) = self.doHealthCheck(const.OPTION_POSTCHECK)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"]
% "cluster" + output)
# 15. record precommit step status
self.recordNodeStepInplace(const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_PRE_COMMIT)
self.printPrecommitBanner()
except Exception as e:
self.context.logger.error(str(e))
self.context.logger.log("Binary upgrade failed. Rollback"
" to the original cluster.")
# do rollback
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK,
self.doInplaceBinaryRollback())
self.exitWithRetCode(const.ACTION_INPLACE_UPGRADE, True)
def backupGlobalRelmapFile(self):
"""
        Wait and check if all standbys have replayed up to the flushed xlog
        positions of primaries, then back up global/pg_filenode.map.
        If the old cluster version num >= RELMAP_4K_VERSION, no backup is needed
"""
if self.context.oldClusterNumber >= const.RELMAP_4K_VERSION:
self.context.logger.debug("no need to backup global relmap file")
return
# perform a checkpoint and wait standby catchup
self.createCheckpoint()
self.getAllStandbyDnInsts()
# wait standby catchup first
self.HASyncReplayCheck(False)
# then wait all cascade standby(if any)
for standby in self.dnStandbyInsts:
self.HASyncReplayCheck(False, standby)
# send cmd to all node and exec
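        # Note: the -V flag encodes the cluster version string (e.g. "92.298",
        # illustrative value) as an integer like 92298 via
        # int(float(...) * 1000); the local utility expects this integer form.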
cmd = "%s -t %s -U %s -l %s -V %d" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_BACKUP_GLOBAL_RELMAP_FILE,
self.context.user,
self.context.localLog,
int(float(self.context.oldClusterNumber) * 1000))
self.context.logger.debug("backup global relmap file: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.debug("Successfully backup global relmap file.")
def cleanTmpGlobalRelampFile(self):
"""
        remove the temporary global/pg_filenode.map at commit time; if the old
        cluster version num >= RELMAP_4K_VERSION, there is no need to remove it.
"""
if self.context.oldClusterNumber >= const.RELMAP_4K_VERSION:
self.context.logger.debug("no need to clean tmp global relmap file")
return
# send cmd to all node and exec
cmd = "%s -t %s -U %s -l %s -V %d" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_TMP_GLOBAL_RELMAP_FILE,
self.context.user,
self.context.localLog,
int(float(self.context.oldClusterNumber) * 1000))
self.context.logger.debug("clean tmp global relmap file when commit or rollback: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.debug("Successfully clean tmp global relmap file.")
def restoreGlobalRelampFile(self):
"""
        restore global/pg_filenode.map on rollback; if the old cluster
        version num >= RELMAP_4K_VERSION, there is no need to restore.
        Use pg_filenode.old.map to recover pg_filenode.map and
        pg_filenode.map.backup
"""
if self.context.oldClusterNumber >= const.RELMAP_4K_VERSION:
self.context.logger.debug("no need to restore global relmap file")
return
# perform checkpoint and wait standby sync before rollback
self.createCheckpoint()
self.getAllStandbyDnInsts()
# wait standby catchup first
self.HASyncReplayCheck(False)
# then wait all cascade standby(if any)
for standby in self.dnStandbyInsts:
self.HASyncReplayCheck(False, standby)
# send cmd to all node and exec
cmd = "%s -t %s -U %s -l %s -V %d" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RESTORE_GLOBAL_RELMAP_FILE,
self.context.user,
self.context.localLog,
int(float(self.context.oldClusterNumber) * 1000))
self.context.logger.debug("restore global relmap file when commit: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.debug("Successfully restore global relmap file.")
def doInplaceCommitUpgrade(self):
"""
function: commit binary upgrade and clean up backup files
1. unset read-only
2. drop old PMK schema
3. restore UDF
4. clean backup catalog physical
files if doing inplace upgrade
5. clean up other upgrade tmp files
input : NA
output: NA
"""
self.context.logger.log("NOTICE: Start to commit binary upgrade.")
self.context.logger.log("Start to check whether can be committed.", "addStep")
if self.getNodeStepInplace() != const.BINARY_UPGRADE_STEP_PRE_COMMIT:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52916"]
+ " Please check if previous upgrade"
" operation was successful or if"
" upgrade has already been committed.")
self.context.logger.log("Can be committed.", "constant")
self.context.logger.log("Start to set commit flag.", "addStep")
# create commit flag file
self.createCommitFlagFile()
self.context.logger.log("Set commit flag succeeded.", "constant")
self.context.logger.log("Start to do operations that cannot be rollback.", "addStep")
# variable to indicate whether we should keep step file
# and cleanup list file for re-entry
cleanUpSuccess = True
# drop table and index after large upgrade
if self.isLargeInplaceUpgrade and self.check_upgrade_mode():
self.drop_table_or_index()
# 1.unset read-only
if self.isLargeInplaceUpgrade:
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.reloadCmAgent()
self.setUpgradeMode(0)
if self.unSetClusterReadOnlyMode() != 0:
self.context.logger.log("NOTICE: "
+ ErrorCode.GAUSS_529["GAUSS_52907"])
cleanUpSuccess = False
if self.isLargeInplaceUpgrade:
self.cleanCsvFile()
# 2. drop old PMK schema
# we sleep 10 seconds first because DB might be updating
# ha status after unsetting read-only
self.context.logger.log("Cancel the upgrade status succeeded.", "constant")
self.context.logger.log("Start to clean temp files for upgrade.", "addStep")
time.sleep(10)
# 3. clean backup catalog physical files if doing inplace upgrade
if self.cleanBackupedCatalogPhysicalFiles() != 0:
self.context.logger.debug(
"Failed to clean backup files in directory %s. "
% self.context.upgradeBackupPath)
if not cleanUpSuccess:
self.context.logger.log("NOTICE: Cleanup is incomplete during commit. "
"Please re-commit upgrade once again or cleanup manually")
else:
            # 4. clean up other upgrade tmp files
# and uninstall inplace upgrade support functions
self.cleanInstallPath(const.OLD)
self.cleanBinaryUpgradeBakFiles()
if self.isLargeInplaceUpgrade:
self.stop_strategy(is_final=False)
self.start_strategy(is_final=False)
# install Kerberos
self.install_kerberos()
self.context.logger.log("Clean temp files for upgrade succeeded.", "constant")
self.context.logger.log("NOTICE: Commit binary upgrade succeeded.")
# remove global relmap file
self.cleanTmpGlobalRelampFile()
self.exitWithRetCode(const.ACTION_INPLACE_UPGRADE, cleanUpSuccess)
def install_kerberos(self):
"""
install kerberos after upgrade
:return:NA
"""
pghost_path = EnvUtil.getEnvironmentParameterValue(
'PGHOST', self.context.user)
kerberosflagfile = "%s/kerberos_upgrade_flag" % pghost_path
if os.path.exists(kerberosflagfile):
# install kerberos
cmd = "source %s &&" % self.context.userProfile
cmd += "gs_om -t stop && "
cmd += "%s -m install -U %s --krb-server" % (
OMCommand.getLocalScript("Local_Kerberos"),
self.context.user)
(status, output) = CmdUtil.retryGetstatusoutput(cmd, 3, 5)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
"Command:%s. Error:\n%s" % (cmd, output))
cmd = "source %s && " % self.context.userProfile
cmd += "%s -m install -U %s --krb-client " % (
OMCommand.getLocalScript("Local_Kerberos"), self.context.user)
self.context.sshTool.executeCommand(
cmd, hostList=self.context.clusterNodes)
self.context.logger.log("Successfully install Kerberos.")
cmd = "source %s && gs_om -t start" % self.context.userProfile
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
"Command:%s. Error:\n%s" % (cmd, output))
os.remove(kerberosflagfile)
def refresh_dynamic_config_file(self):
"""
refresh dynamic config file
:return:
"""
cmd = "source %s ;gs_om -t refreshconf" % self.context.userProfile
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
"Command:%s. Error:\n%s" % (cmd, output))
def restore_dynamic_config_file(self):
"""
restore dynamic config file
:return:
"""
cmd = "%s -t %s -U %s -V %d --upgrade_bak_path=%s " \
"--old_cluster_app_path=%s --new_cluster_app_path=%s " \
"-l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RESTORE_DYNAMIC_CONFIG_FILE,
self.context.user,
int(float(self.context.oldClusterNumber) * 1000),
self.context.upgradeBackupPath,
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.localLog)
self.context.logger.debug("Command for restoring "
"config files: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
def cleanCsvFile(self):
"""
clean csv file
:return:
"""
clusterNodes = self.context.clusterInfo.dbNodes
for dbNode in clusterNodes:
if len(dbNode.datanodes) == 0:
continue
dnInst = dbNode.datanodes[0]
dndir = dnInst.datadir
pg_proc_csv_path = \
'%s/pg_copydir/tbl_pg_proc_oids.csv' % dndir
new_pg_proc_csv_path = \
'%s/pg_copydir/new_tbl_pg_proc_oids.csv' % dndir
if os.path.exists(pg_proc_csv_path):
FileUtil.removeFile(pg_proc_csv_path)
if os.path.exists(new_pg_proc_csv_path):
FileUtil.removeFile(new_pg_proc_csv_path)
def check_upgrade_mode(self):
"""
        check the upgrade_mode GUC value on all instances
        :return: True if upgrade_mode is not 0, otherwise False
"""
cmd = "source %s ; gs_guc check -N all -I all -c 'upgrade_mode'" % \
self.context.userProfile
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_500[
"GAUSS_50010"] % 'upgrade_mode' +
"Error: \n%s" % str(output))
if output.find("upgrade_mode=0") >= 0:
return False
else:
return True
def cleanBackupedCatalogPhysicalFiles(self, isRollBack=False):
"""
        function : clean backed-up catalog physical files
input : isRollBack, default is False
output: return 0, if the operation is done successfully.
return 1, if the operation failed.
"""
try:
if self.isLargeInplaceUpgrade:
self.context.logger.log("Clean up backup catalog files.")
# send cmd to all node and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -X '%s' -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_OLD_CLUSTER_CATALOG_PHYSICAL_FILES,
self.context.user,
self.context.upgradeBackupPath,
self.context.xmlFile,
self.context.localLog)
if isRollBack:
cmd += " --rollback --oldcluster_num='%s'" % \
self.context.oldClusterNumber
self.context.logger.debug(
"Command for cleaning up physical catalog files: %s." % cmd)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug(
"Successfully cleaned up backup catalog files.")
return 0
except Exception as e:
if isRollBack:
raise Exception(
"Fail to clean up backup catalog files: %s" % str(e))
else:
self.context.logger.debug(
"Fail to clean up backup catalog files. " +
"Please re-commit upgrade once again or clean up manually.")
return 1
def recordLogicalClusterName(self):
"""
        function: record the logical node group names in bakpath,
                  so that we can restore the specific names from bakpath;
                  used in restoreCgroup and when refreshing the Cgroup
                  configuration
input : NA
output: NA
"""
lcgroupfile = "%s/oldclusterinfo.json" % self.context.tmpDir
try:
self.context.logger.debug(
"Write and send logical cluster info file.")
# check whether file is exists
if os.path.isfile(lcgroupfile):
return 0
# check whether it is lc cluster
sql = """SELECT true AS group_kind
FROM pg_class c, pg_namespace n, pg_attribute attr
WHERE c.relname = 'pgxc_group' AND n.nspname = 'pg_catalog'
AND attr.attname = 'group_kind' AND c.relnamespace =
n.oid AND attr.attrelid = c.oid; """
self.context.logger.debug(
"Check if the cluster type is a logical cluster.")
(status, output) = ClusterCommand.remoteSQLCommand(
sql,
self.context.user,
self.dnInst.hostname,
self.dnInst.port,
False,
DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513[
"GAUSS_51300"] % sql + " Error: \n%s" % str(
output))
if not output or output.strip() != 't':
self.context.logger.debug(
"The old cluster is not logical cluster.")
return 0
self.context.logger.debug("The old cluster is logical cluster.")
# get lc group name lists
sql = "SELECT group_name FROM pgxc_group WHERE group_kind = 'v';"
self.context.logger.debug(
"Getting the list of logical cluster names.")
(status, output) = ClusterCommand.remoteSQLCommand(
sql,
self.context.user,
self.dnInst.hostname,
self.dnInst.port,
False,
DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513[
"GAUSS_51300"] % sql + " Error: \n%s" % str(
output))
lcgroupnames = output.split("\n")
self.context.logger.debug(
"The list of logical cluster names: %s." % lcgroupnames)
# create the file
FileUtil.createFile(lcgroupfile)
FileUtil.changeOwner(self.context.user, lcgroupfile)
FileUtil.changeMode(DefaultValue.KEY_FILE_MODE, lcgroupfile)
# write result to file
with open(lcgroupfile, "w") as fp_json:
json.dump({"lcgroupnamelist": lcgroupnames}, fp_json)
# send file to remote nodes
if not self.context.isSingle:
self.context.sshTool.scpFiles(lcgroupfile, self.context.tmpDir)
self.context.logger.debug(
"Successfully to write and send logical cluster info file.")
return 0
except Exception as e:
cmd = "(if [ -f '%s' ]; then rm -f '%s'; fi)" % (
lcgroupfile, lcgroupfile)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
raise Exception(str(e))
def prepareUpgradeSqlFolder(self):
"""
        function: verify upgrade_sql.tar.gz and extract it to the binary
                  backup path; because every node needs to set GUCs,
                  we decompress it on all nodes
input : NA
output: NA
"""
self.context.logger.debug("Preparing upgrade sql folder.")
if self.context.action == const.ACTION_INPLACE_UPGRADE:
hostName = NetUtil.GetHostIpOrName()
hosts = [hostName]
else:
hosts = self.context.clusterNodes
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -X %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_UPGRADE_SQL_FOLDER,
self.context.user,
self.context.upgradeBackupPath,
self.context.xmlFile,
self.context.localLog)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile,
hosts)
def HASyncReplayCheck(self, catchupFailedOk=True, host=None):
"""
        function: Wait and check if all standbys have replayed up to the
                  flushed xlog positions of primaries. We record the primary
                  xlog flush position at the start of the check and wait
                  until the standby replays up to that point.
                  Attention: If autovacuum is turned on, the primary xlog
                  flush position may increase during the check. We do not
                  check such newly added xlog because it will not change
                  catalog physical file positions.
        Input: catchupFailedOk, whether it is acceptable for a standby to
               fail to catch up with the primary
output : NA
"""
        host = self.dnInst if host is None else host
self.context.logger.debug("Start to wait and check if all the standby"
" instances have replayed all xlogs, host: %s" % \
host.hostname)
self.doReplay(catchupFailedOk, host)
self.context.logger.debug("Successfully performed the replay check "
"of the standby instance.")
def doReplay(self, catchupFailedOk, host):
refreshTimeout = 180
waitTimeout = 300
RefreshTime = datetime.now() + timedelta(seconds=refreshTimeout)
EndTime = datetime.now() + timedelta(seconds=waitTimeout)
# wait and check sync status between primary and standby
NeedReplay = True
PosList = []
while NeedReplay:
sql = "SELECT sender_flush_location,receiver_replay_location " \
"from pg_catalog.pg_stat_get_wal_senders() " \
"where peer_role != 'Secondary';"
(status, output) = ClusterCommand.remoteSQLCommand(
sql,
self.context.user,
host.hostname,
host.port,
False,
DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0:
self.context.logger.debug(
"Primary and Standby may be not in sync.")
self.context.logger.debug(
"Sync status: %s. Output: %s" % (str(status), output))
elif output != "":
self.context.logger.debug(
"Sync status: %s. Output: %s" % (str(status), output))
tmpPosList = self.getXlogPosition(output)
if len(PosList) == 0:
PosList = copy.deepcopy(tmpPosList)
self.context.logger.debug(
"Primary and Standby may be not in sync.")
else:
NeedReplay = False
for eachRec in PosList:
for eachTmpRec in tmpPosList:
if self.needReplay(eachRec, eachTmpRec):
NeedReplay = True
self.context.logger.debug(
"Primary and Standby may be not in sync.")
break
if NeedReplay:
break
else:
NeedReplay = False
            # Standby replay position may keep falling behind primary
# flush position if it is at the end of one xlog page and the
# free space is less than xlog record header size.
# We do a checkpoint to avoid such situation.
if datetime.now() > RefreshTime and NeedReplay:
self.context.logger.debug(
"Execute CHECKPOINT to refresh xlog position.")
refreshsql = "set statement_timeout=300000;CHECKPOINT;"
(status, output) = ClusterCommand.remoteSQLCommand(
refreshsql,
self.context.user,
host.hostname,
host.port,
False,
DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0:
raise Exception(
ErrorCode.GAUSS_513["GAUSS_51300"] % refreshsql +
"Error: \n%s" % str(output))
if datetime.now() > EndTime and NeedReplay:
logStr = "WARNING: " + ErrorCode.GAUSS_513["GAUSS_51300"] % sql +\
" Timeout while waiting for standby replay."
if catchupFailedOk:
self.context.logger.log(logStr)
return
raise Exception(logStr)
time.sleep(5)
def getXlogPosition(self, output):
"""
get xlog position from output
"""
tmpPosList = []
resList = output.split('\n')
for eachLine in resList:
tmpRec = {}
(flushPos, replayPos) = eachLine.split('|')
(flushPosId, flushPosOff) = (flushPos.strip()).split('/')
(replayPosId, replayPosOff) = (replayPos.strip()).split('/')
tmpRec['nodeName'] = self.getHAShardingName()
tmpRec['flushPosId'] = flushPosId.strip()
tmpRec['flushPosOff'] = flushPosOff.strip()
tmpRec['replayPosId'] = replayPosId.strip()
tmpRec['replayPosOff'] = replayPosOff.strip()
tmpPosList.append(tmpRec)
return tmpPosList
def getHAShardingName(self):
"""
in centralized cluster, used to get the only one sharding name
"""
peerInsts = self.context.clusterInfo.getPeerInstance(self.dnInst)
(instance_name, _, _) = ClusterInstanceConfig.\
getInstanceInfoForSinglePrimaryMultiStandbyCluster(
self.dnInst, peerInsts)
return instance_name
def needReplay(self, eachRec, eachTmpRec):
"""
        judge whether replay is needed by comparing xlog positions
"""
if eachRec['nodeName'] == eachTmpRec['nodeName'] \
and (int(eachRec['flushPosId'], 16) > int(
eachTmpRec['replayPosId'], 16) or (
int(eachRec['flushPosId'], 16) == int(
eachTmpRec['replayPosId'], 16) and int(
eachRec['flushPosOff'], 16) > int(eachTmpRec['replayPosOff'], 16))):
return True
else:
return False
def backupOldClusterDBAndRelInfo(self):
"""
        function: back up old cluster database and relation info by sending
                  a command to the primary DN nodes
input : NA
output: NA
"""
tmpFile = os.path.join(EnvUtil.getTmpDirFromEnv(
self.context.user), const.TMP_DYNAMIC_DN_INFO)
try:
self.context.logger.debug("Start to backup old cluster database"
" and relation information.")
# prepare backup path
backup_path = os.path.join(
self.context.upgradeBackupPath, "oldClusterDBAndRel")
cmd = "rm -rf '%s' && mkdir '%s' -m '%s' " % \
(backup_path, backup_path, DefaultValue.KEY_DIRECTORY_MODE)
hostList = copy.deepcopy(self.context.clusterNodes)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
# prepare dynamic cluster info file in every node
self.generateDynamicInfoFile(tmpFile)
# get dn primary hosts
dnPrimaryNodes = self.getPrimaryDnListFromDynamicFile()
execHosts = list(set(dnPrimaryNodes))
# send cmd to all node and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -X '%s' -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_BACKUP_OLD_CLUSTER_DB_AND_REL,
self.context.user,
self.context.upgradeBackupPath,
self.context.xmlFile,
self.context.localLog)
self.context.logger.debug(
"Command for backing up old cluster database and "
"relation information: %s." % cmd)
self.context.sshTool.executeCommand(cmd, hostList=execHosts)
self.context.logger.debug("Backing up information of all nodes.")
self.context.logger.debug("Successfully backed up old cluster "
"database and relation information")
except Exception as e:
raise Exception(str(e))
finally:
if os.path.exists(tmpFile):
deleteCmd = "(if [ -f '%s' ]; then rm -f '%s'; fi) " % \
(tmpFile, tmpFile)
hostList = copy.deepcopy(self.context.clusterNodes)
self.context.sshTool.executeCommand(
deleteCmd, hostList=hostList)
def generateDynamicInfoFile(self, tmpFile):
"""
generate dynamic info file and send to every node
:return:
"""
self.context.logger.debug(
"Start to generate dynamic info file and send to every node.")
try:
cmd = ClusterCommand.getQueryStatusCmd("", outFile=tmpFile)
SharedFuncs.runShellCmd(cmd, self.context.user,
self.context.userProfile)
if not os.path.exists(tmpFile):
raise Exception("Can not genetate dynamic info file")
self.context.distributeFileToSpecialNode(tmpFile,
os.path.dirname(tmpFile),
self.context.clusterNodes)
self.context.logger.debug(
"Success to generate dynamic info file and send to every node.")
except Exception as er:
raise Exception("Failed to generate dynamic info file in "
"these nodes: {0}, error: {1}".format(
self.context.clusterNodes, str(er)))
def getPrimaryDnListFromDynamicFile(self):
"""
get primary dn list from dynamic file
:return: primary dn list
"""
try:
self.context.logger.debug(
"Start to get primary dn list from dynamic file.")
tmpFile = os.path.join(EnvUtil.getTmpDirFromEnv(
self.context.user), const.TMP_DYNAMIC_DN_INFO)
if not os.path.exists(tmpFile):
raise Exception(ErrorCode.GAUSS_529["GAUSS_50201"] % tmpFile)
dynamicClusterStatus = DbClusterStatus()
dynamicClusterStatus.initFromFile(tmpFile)
cnAndPrimaryDnNodes = []
# Find the master DN instance
for dbNode in dynamicClusterStatus.dbNodes:
for instance in dbNode.datanodes:
if instance.status == 'Primary':
for staticDBNode in self.context.clusterInfo.dbNodes:
if staticDBNode.id == instance.nodeId:
cnAndPrimaryDnNodes.append(staticDBNode.name)
result = list(set(cnAndPrimaryDnNodes))
self.context.logger.debug("Success to get primary dn list from "
"dynamic file: {0}.".format(result))
return result
except Exception as er:
raise Exception("Failed to get primary dn list from dynamic file. "
"Error:{0}".format(str(er)))
def touchRollbackCatalogFlag(self):
"""
        Before updating the system catalog, touch a flag file.
"""
# touch init flag file
# during rollback, if init flag file has not been touched,
# we do not need to do catalog rollback.
cmd = "touch '%s/touch_init_flag'" % self.context.upgradeBackupPath
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
def updateCatalog(self):
"""
function: update catalog to new version
steps:
1.prepare update sql file and check sql file
2.do update catalog
Input: NA
output : NA
"""
try:
self.prepareSql("upgrade-post")
self.prepareSql("upgrade")
self.prepareSql("rollback-post")
self.prepareSql("rollback")
self.doUpgradeCatalog()
except Exception as e:
raise Exception(
"Failed to execute update sql file. Error: %s" % str(e))
def doUpgradeCatalog(self, postUpgrade=False):
"""
function: update catalog to new version
1.set upgrade_from param
2.start cluster
        3.touch init files and do pre-upgrade steps
4.connect database and update catalog one by one
5.stop cluster
6.unset upgrade_from param
7.start cluster
        Input: postUpgrade
output : NA
"""
self.context.logger.debug("Start upgrade catalog.")
try:
if not postUpgrade:
self.context.logger.debug("Not post upgrade.")
self.setUpgradeFromParam(self.context.oldClusterNumber)
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.setUpgradeMode(1, "set")
self.start_strategy(is_final=False)
self.touchInitFile()
else:
                # the guc parameter upgrade_from needs a cm_agent restart to take effect
self.setUpgradeMode(2)
self.reloadCmAgent()
# kill snapshot thread in kernel
self.context.killKernalSnapshotThread(self.dnInst)
self.execRollbackUpgradedCatalog(scriptType="rollback")
self.execRollbackUpgradedCatalog(scriptType="upgrade")
self.pgxcNodeUpdateLocalhost("upgrade")
else:
self.context.logger.debug("Post upgrade.")
self.waitClusterForNormal()
# backup global relmap file before doing upgrade-post
self.backupGlobalRelmapFile()
self.execRollbackUpgradedCatalog(scriptType="rollback-post")
self.execRollbackUpgradedCatalog(scriptType="upgrade-post")
self.getLsnInfo()
if self.context.action == \
const.ACTION_INPLACE_UPGRADE and not postUpgrade and not \
int(float(self.context.newClusterNumber) * 1000) > 92298:
self.updatePgproc()
except Exception as e:
raise Exception("update catalog failed.ERROR: %s" % str(e))
def updatePgproc(self):
"""
function: update pg_proc during large upgrade
:return:
"""
self.context.logger.debug(
"Start to update pg_proc in inplace large upgrade ")
# generate new csv file
execHosts = [self.dnInst.hostname]
# send cmd to all node and exec
cmd = "%s -t %s -U %s -R '%s' -l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CREATE_NEW_CSV_FILE,
self.context.user,
self.context.tmpDir,
self.context.localLog)
self.context.logger.debug(
"Command for create new csv file: %s." % cmd)
self.context.sshTool.executeCommand(cmd, hostList=execHosts)
self.context.logger.debug(
"Successfully created new csv file.")
# select all databases
database_list = self.getDatabaseList()
# create pg_proc_temp_oids
new_pg_proc_csv_path = '%s/pg_copydir/new_tbl_pg_proc_oids.csv' % \
self.dnInst.datadir
self.createPgprocTempOids(new_pg_proc_csv_path, database_list)
# create pg_proc_temp_oids index
self.createPgprocTempOidsIndex(database_list)
# make checkpoint
self.replyXlog(database_list)
# create pg_proc_mapping.txt to save the mapping between pg_proc
# file path and pg_proc_temp_oids file path
cmd = "%s -t %s -U %s -R '%s' -l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CREATE_PG_PROC_MAPPING_FILE,
self.context.user,
self.context.tmpDir,
self.context.localLog)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug(
"Successfully created file to save mapping between pg_proc file "
"path and pg_proc_temp_oids file path.")
# stop cluster
self.stop_strategy()
# replace pg_proc data file by pg_proc_temp data file
# send cmd to all node and exec
cmd = "%s -t %s -U %s -R '%s' -l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_REPLACE_PG_PROC_FILES,
self.context.user,
self.context.tmpDir,
self.context.localLog)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug(
"Successfully replaced pg_proc data files.")
def createPgprocTempOids(self, new_pg_proc_csv_path, database_list):
"""
create pg_proc_temp_oids
:return:
"""
sql = \
"""START TRANSACTION; SET IsInplaceUpgrade = on;
CREATE TABLE pg_proc_temp_oids (proname name NOT NULL,
pronamespace oid NOT NULL, proowner oid NOT NULL, prolang oid
NOT NULL, procost real NOT NULL, prorows real NOT NULL,
provariadic oid NOT NULL, protransform regproc NOT NULL,
proisagg boolean NOT NULL, proiswindow boolean NOT NULL,
prosecdef boolean NOT NULL, proleakproof boolean NOT NULL,
proisstrict boolean NOT NULL, proretset boolean NOT NULL,
provolatile "char" NOT NULL, pronargs smallint NOT NULL,
pronargdefaults smallint NOT NULL, prorettype oid NOT NULL,
proargtypes oidvector NOT NULL, proallargtypes oid[],
proargmodes "char"[], proargnames text[], proargdefaults
pg_node_tree, prosrc text, probin text, proconfig text[],
proacl aclitem[], prodefaultargpos int2vector,fencedmode boolean,
proshippable boolean, propackage boolean, prokind "char" NOT
NULL) with oids;"""
sql += "copy pg_proc_temp_oids WITH OIDS from '%s' with " \
"delimiter ',' csv header FORCE NOT NULL proargtypes;" % \
new_pg_proc_csv_path
sql += "COMMIT;"
# update proisagg and proiswindow message sql
sql += \
"update pg_proc_temp_oids set proisagg = CASE WHEN prokind = 'a' " \
"THEN True ELSE False END, proiswindow = CASE WHEN prokind = 'w' " \
"THEN True ELSE False END;"
self.context.logger.debug("pg_proc_temp_oids sql is %s" % sql)
        # create table
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
def createPgprocTempOidsIndex(self, database_list):
"""
create index pg_proc_oid_index_temp and
pg_proc_proname_args_nsp_index_temp
:return:
"""
sql = "CREATE UNIQUE INDEX pg_proc_oid_index_temp ON " \
"pg_proc_temp_oids USING btree (oid) TABLESPACE pg_default;"
sql += "CREATE UNIQUE INDEX pg_proc_proname_args_nsp_index_temp ON" \
" pg_proc_temp_oids USING btree (proname, proargtypes," \
" pronamespace) TABLESPACE pg_default;"
        # create index
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
def getDatabaseList(self):
"""
        get the database list in the cluster
:return:
"""
self.context.logger.debug("Get database list in cluster.")
sql = "select datname from pg_database;"
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
if "" == output:
raise Exception("No database objects were found in the cluster!")
reslines = (output.strip()).split('\n')
if (len(reslines) < 3
or "template1" not in reslines
or "template0" not in reslines
or "postgres" not in reslines):
raise Exception("The database list is invalid:%s." % str(reslines))
self.context.logger.debug("Database list in cluster is %s." % reslines)
return reslines
def replyXlog(self, database_list):
"""
        issue CHECKPOINT in each database to flush xlog
:return:
"""
sql = 'CHECKPOINT;'
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
def execRollbackUpgradedCatalog(self, scriptType="rollback"):
"""
function : connect database and rollback/upgrade catalog one by one
1.find a node that has dn instance
2.scp sql files to that node
3.send cmd to that node and exec
input : NA
output: NA
"""
self.context.logger.debug("Start to {0} catalog.".format(scriptType))
try:
dnNodeName = self.dnInst.hostname
if dnNodeName == "":
raise Exception(ErrorCode.GAUSS_526["GAUSS_52602"])
self.context.logger.debug("dn nodes is {0}".format(dnNodeName))
# scp sql files to that node
maindb_sql = "%s/%s_catalog_maindb_tmp.sql" \
% (self.context.upgradeBackupPath, scriptType)
otherdb_sql = "%s/%s_catalog_otherdb_tmp.sql" \
% (self.context.upgradeBackupPath, scriptType)
if "upgrade" == scriptType:
check_upgrade_sql = \
"%s/check_upgrade_tmp.sql" % self.context.upgradeBackupPath
if not os.path.isfile(check_upgrade_sql):
raise Exception(
ErrorCode.GAUSS_502["GAUSS_50210"] % check_upgrade_sql)
self.context.logger.debug("Scp {0} file to nodes {1}".format(
check_upgrade_sql, dnNodeName))
if not self.context.isSingle:
LocalRemoteCmd.scpFile(dnNodeName, check_upgrade_sql,
self.context.upgradeBackupPath)
if not os.path.isfile(maindb_sql):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50210"] % maindb_sql)
if not os.path.isfile(otherdb_sql):
raise Exception(
ErrorCode.GAUSS_502["GAUSS_50210"] % otherdb_sql)
if (not self.context.isSingle):
LocalRemoteCmd.scpFile(dnNodeName, maindb_sql,
self.context.upgradeBackupPath)
LocalRemoteCmd.scpFile(dnNodeName, otherdb_sql,
self.context.upgradeBackupPath)
self.context.logger.debug(
"Scp {0} file and {1} file to nodes {2}".format(
maindb_sql, otherdb_sql, dnNodeName))
# send cmd to that node and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s --script_type=%s -l " \
"%s" % (OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_UPDATE_CATALOG,
self.context.user,
self.context.upgradeBackupPath,
scriptType,
self.context.localLog)
self.context.logger.debug(
"Command for executing {0} catalog.".format(scriptType))
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile,
[dnNodeName])
self.context.logger.debug(
"Successfully {0} catalog.".format(scriptType))
except Exception as e:
self.context.logger.log("Failed to {0} catalog.".format(scriptType))
if not self.context.forceRollback:
raise Exception(str(e))
def pgxcNodeUpdateLocalhost(self, mode):
"""
        This function is used to modify the 'localhost' entries of the
        pgxc_node system table
:param mode:
:return:
"""
try:
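            # Version numbers are compared in integer form (a string like
            # "92.069", illustrative, is encoded as 92069); the rewrite below
            # is only needed for upgrades that cross version 92069.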
if int(float(self.context.newClusterNumber) * 1000) < 92069 or \
int(float(self.context.oldClusterNumber) * 1000) >= 92069:
return
if mode == "upgrade":
self.context.logger.debug("Update localhost in pgxc_node.")
else:
self.context.logger.debug("Rollback localhost in pgxc_node.")
for dbNode in self.context.clusterInfo.dbNodes:
for dn in dbNode.datanodes:
sql = "START TRANSACTION;"
sql += "SET %s = on;" % const.ON_INPLACE_UPGRADE
if mode == "upgrade":
sql += "UPDATE PGXC_NODE SET node_host = '%s', " \
"node_host1 = '%s' WHERE node_host = " \
"'localhost'; " % (dn.listenIps[0],
dn.listenIps[0])
else:
sql += "UPDATE PGXC_NODE SET node_host = " \
"'localhost', node_host1 = 'localhost' WHERE" \
" node_type = 'C' and node_host = '%s';" %\
(dn.listenIps[0])
sql += "COMMIT;"
self.context.logger.debug("Current sql %s." % sql)
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user, dn.hostname, dn.port,
False, DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0:
if self.context.forceRollback:
self.context.logger.debug("In forceRollback, "
"roll back pgxc_node. "
"%s " % str(output))
else:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"]
% sql + " Error: \n%s" %
str(output))
if mode == "upgrade":
self.context.logger.debug(
"Success update localhost in pgxc_node.")
else:
self.context.logger.debug(
"Success rollback localhost in pgxc_node.")
except Exception as e:
raise Exception(str(e))
def touchInitFile(self):
"""
function: touch upgrade init file for every primary/standby and
                  do pre-upgrade steps
input : NA
output: NA
"""
try:
if self.isLargeInplaceUpgrade:
self.context.logger.debug("Start to create upgrade init file.")
# send cmd to all node and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_TOUCH_INIT_FILE,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug(
"Successfully created upgrade init file.")
except Exception as e:
raise Exception(str(e))
def prepareSql(self, mode="rollback"):
"""
        function : prepare the tmp sql files, e.g.
                   rollback_catalog_maindb_tmp.sql and
                   rollback_catalog_otherdb_tmp.sql;
                   for each result file: filter all matched files and merge
                   them into the *_tmp.sql file
        :param mode: can be rollback or upgrade
"""
try:
self.prepareSqlForDb(mode)
self.prepareSqlForDb(mode, "otherdb")
if mode == "upgrade":
self.prepareCheckSql()
except Exception as e:
raise Exception("Failed to prepare %s sql file failed. ERROR: %s"
% (mode, str(e)))
def prepareSqlForDb(self, mode, dbType="maindb"):
self.context.logger.debug(
"Start to prepare {0} sql files for {1}.".format(mode, dbType))
header = self.getSqlHeader()
if "upgrade" in mode:
listName = "upgrade"
else:
listName = "rollback"
fileNameList = self.getFileNameList("{0}_catalog_{1}".format(
listName, dbType), mode)
if "rollback" in mode:
fileNameList.sort(reverse=True)
else:
fileNameList.sort()
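        # The 92_506 scripts are moved to the end so that they always run
        # last, regardless of the version-ordered sort (presumably because
        # they depend on the effects of the other scripts).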
if 'rollback_catalog_maindb_92_506.sql' in fileNameList:
fileNameList.remove('rollback_catalog_maindb_92_506.sql')
fileNameList.append('rollback_catalog_maindb_92_506.sql')
if 'rollback_catalog_otherdb_92_506.sql' in fileNameList:
fileNameList.remove('rollback_catalog_otherdb_92_506.sql')
fileNameList.append('rollback_catalog_otherdb_92_506.sql')
fileName = "{0}_catalog_{1}_tmp.sql".format(mode, dbType)
self.context.logger.debug("The real file list for %s: %s" % (
dbType, fileNameList))
self.togetherFile(header, "{0}_catalog_{1}".format(listName, dbType),
fileNameList, fileName)
self.context.logger.debug("Successfully prepared sql files for %s."
% dbType)
def prepareCheckSql(self):
header = ["START TRANSACTION;"]
fileNameList = self.getFileNameList("check_upgrade")
fileNameList.sort()
if 'rollback_catalog_maindb_92_506.sql' in fileNameList:
fileNameList.remove('rollback_catalog_maindb_92_506.sql')
fileNameList.append('rollback_catalog_maindb_92_506.sql')
if 'rollback_catalog_otherdb_92_506.sql' in fileNameList:
fileNameList.remove('rollback_catalog_otherdb_92_506.sql')
fileNameList.append('rollback_catalog_otherdb_92_506.sql')
self.context.logger.debug("The real file list for checking upgrade: "
"%s" % fileNameList)
self.togetherFile(header, "check_upgrade", fileNameList,
"check_upgrade_tmp.sql")
def togetherFile(self, header, filePathName, fileNameList, executeFileName):
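        """
        merge the header, every file of fileNameList (read from
        upgrade_sql/<filePathName> under the upgrade backup path) and a
        final COMMIT into one executable sql file named executeFileName
        """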
writeFile = ""
try:
filePath = "%s/upgrade_sql/%s" % (self.context.upgradeBackupPath,
filePathName)
self.context.logger.debug("Preparing [%s]." % filePath)
writeFile = "%s/%s" % (self.context.upgradeBackupPath,
executeFileName)
FileUtil.createFile(writeFile)
FileUtil.writeFile(writeFile, header, 'w')
with open(writeFile, 'a') as sqlFile:
for each_file in fileNameList:
each_file_with_path = "%s/%s" % (filePath, each_file)
self.context.logger.debug("Handling file: %s" %
each_file_with_path)
with open(each_file_with_path, 'r') as fp:
for line in fp:
sqlFile.write(line)
sqlFile.write(os.linesep)
FileUtil.writeFile(writeFile, ["COMMIT;"], 'a')
self.context.logger.debug(
"Success to together {0} file".format(writeFile))
if not os.path.isfile(writeFile):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % writeFile)
except Exception as e:
raise Exception("Failed to write {0} sql file. ERROR: {1}".format(
writeFile, str(e)))
def om_stop_cluster(self):
"""
Stop cluster with gs_om
"""
cmd = "source %s ;gs_om -t stop" % self.context.userProfile
status, output = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % "cluster" +
"Output: %s" % output)
self.context.logger.log("Stop cluster with gs_om successfully.")
def om_start_cluster(self):
"""
Start Cluster with om
"""
cmd = "source %s ;gs_om -t start" % self.context.userProfile
status, output = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51607"] % "cluster" +
"Output: %s" % output)
self.context.logger.debug("Start cluster with gs_om successfully.")
def get_cms_num(self, cluster_config_file):
"""
Get cm_server num from static config file
"""
cluster_info = dbClusterInfo()
cluster_info.initFromStaticConfig(self.context.user, cluster_config_file)
return DefaultValue.get_cm_server_num_from_static(cluster_info)
def _get_strategy_with_cm_num(self, old_cm_num, new_cm_num):
"""
Get strategy with CM server instance number
"""
        if new_cm_num == 0 and old_cm_num == 0:
            self.context.logger.debug("No CM instance exists in either the "
                                      "new or the old cluster.")
            return 0
        if new_cm_num > 0 and old_cm_num == 0:
            self.context.logger.debug("The new cluster has CM components "
                                      "but the old cluster does not.")
            return 1
        if new_cm_num > 0 and old_cm_num > 0:
            self.context.logger.debug("Both the old and new clusters have "
                                      "CM components.")
            return 2
else:
return -1
def get_upgrade_cm_strategy(self):
"""
        Get the CM strategy for starting or stopping the cluster
"""
old_cluster_config_file = \
os.path.realpath(os.path.join(self.context.oldClusterAppPath,
"bin", "cluster_static_config"))
new_cluster_config_file = \
os.path.realpath(os.path.join(self.context.newClusterAppPath,
"bin", "cluster_static_config"))
if not os.path.isfile(new_cluster_config_file):
self.context.logger.debug("Start cluster with om tool, "
"[{0}]".format(new_cluster_config_file))
if os.path.isfile(old_cluster_config_file):
if self.get_cms_num(old_cluster_config_file) == 0:
return 0
else:
return 2
return -1
new_cm_num = self.get_cms_num(new_cluster_config_file)
if not os.path.isfile(old_cluster_config_file):
self.context.logger.debug("Not exist old static_config_file "
"[{0}]".format(old_cluster_config_file))
if new_cm_num == 0:
return 0
else:
return 2
old_cm_num = self.get_cms_num(old_cluster_config_file)
return self._get_strategy_with_cm_num(old_cm_num, new_cm_num)
def start_strategy(self, is_final=True):
"""
Start cluster
"""
cm_strategy = self.get_upgrade_cm_strategy()
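        # 0: no CM in either cluster -> startCluster();
        # 1: CM exists only in the new cluster -> gs_om start only for the
        #    final start, otherwise startCluster();
        # otherwise -> start via gs_om.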
        if cm_strategy == 0:
            self.startCluster()
        elif cm_strategy == 1:
            if is_final:
                self.om_start_cluster()
            else:
                self.startCluster()
        else:
            self.om_start_cluster()
def stop_strategy(self, is_final=True):
"""
        Stop cluster
        """
        cm_strategy = self.get_upgrade_cm_strategy()
        if cm_strategy == 0:
            self.stopCluster()
        elif cm_strategy == 1:
            if is_final:
                self.om_stop_cluster()
            else:
                self.stopCluster()
        else:
            self.om_stop_cluster()
def modifyPgProcIndex(self):
"""
        1. execute the sql that rebuilds the pg_proc indexes
2. make checkpoint
3. stop cluster
4. start cluster
:return:
"""
self.context.logger.debug("Begin to modify pg_proc index.")
time.sleep(3)
database_list = self.getDatabaseList()
        # execute the sql that rebuilds the pg_proc indexes
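        # SET LOCAL inplace_upgrade_next_system_object_oids pins the OID of
        # the recreated index (2690 for pg_proc_oid_index, 2691 for
        # pg_proc_proname_args_nsp_index) so each keeps its well-known
        # catalog OID.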
sql = """START TRANSACTION;SET IsInplaceUpgrade = on;
drop index pg_proc_oid_index;SET LOCAL
inplace_upgrade_next_system_object_oids=IUO_CATALOG,false,
true,0,0,0,2690;CREATE UNIQUE INDEX pg_proc_oid_index ON pg_proc
USING btree (oid);SET LOCAL
inplace_upgrade_next_system_object_oids=IUO_CATALOG,false,
true,0,0,0,0;commit;CHECKPOINT;"""
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
sql = """START TRANSACTION;SET IsInplaceUpgrade = on;
drop index pg_proc_proname_args_nsp_index;SET LOCAL
inplace_upgrade_next_system_object_oids=IUO_CATALOG,false,
true,0,0,0,2691;create UNIQUE INDEX pg_proc_proname_args_nsp_index
ON pg_proc USING btree (proname, proargtypes, pronamespace);SET
LOCAL inplace_upgrade_next_system_object_oids=IUO_CATALOG,false,
true,0,0,0,0;commit;CHECKPOINT;"""
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
# stop cluster
self.stop_strategy()
# start cluster
self.start_strategy()
self.context.logger.debug("Successfully modified pg_proc index.")
def setNewVersionGuc(self):
"""
function: set new Version guc
input : NA
output : NA
"""
pass
def setActionFile(self):
"""
        set the action from the step file; if not found, set it to large
        upgrade. If the upgrade type is actually small upgrade but we set it
        to large upgrade, the only cost is restarting the cm_agent, which has
        no effect on transactions. But if the action should be large and we
        do not set upgrade_mode, some new features will not be enabled.
:return: NA
"""
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
self.context.logger.debug("Get the action from file %s." % stepFile)
        if not os.path.isfile(stepFile):
            self.context.logger.debug("Step file does not exist or is not a"
                                      " regular file, cannot get action from"
                                      " it. Set it to large upgrade.")
self.context.action = const.ACTION_LARGE_UPGRADE
return
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
self.context.action = row['upgrade_action']
break
self.context.logger.debug("Set the action to %s"
% self.context.action)
def getClusterAppPath(self, mode=const.OLD):
"""
if cannot get path from table, try to get from the backup file
:param mode:
:return:
"""
self.context.logger.debug("Get the install path from table or file.")
path = self.getClusterAppPathFromFile(mode)
return path
def getClusterAppPathFromFile(self, mode=const.OLD):
"""
get the app path from backup dir, mode is new or old,
:param mode: 'old', 'new'
:return: the real path of appPath
"""
dirFile = "%s/%s" % (self.context.upgradeBackupPath,
const.RECORD_UPGRADE_DIR)
self.context.logger.debug("Get the %s app path from file %s"
% (mode, dirFile))
if mode not in [const.OLD, const.NEW]:
            raise Exception("Invalid mode '%s': must be '%s' or '%s'."
                            % (mode, const.OLD, const.NEW))
if not os.path.exists(dirFile):
self.context.logger.debug(ErrorCode.GAUSS_502["GAUSS_50201"]
% dirFile)
if self.checkBakPathNotExists():
return ""
            # copy the binary_upgrade dir from another node;
            # if one node is damaged, its binary_upgrade dir may disappear.
            # The user may repair that node before commit and send the commit
            # command to the repaired node, so we need to copy the
            # dir from a remote node
cmd = "if [ -f '%s' ]; then echo 'GetFile';" \
" else echo 'NoThisFile'; fi" % dirFile
self.context.logger.debug("Command for checking file: %s" % cmd)
(status, output) = self.context.sshTool.getSshStatusOutput(
cmd, self.context.clusterNodes, self.context.mpprcFile)
outputMap = self.context.sshTool.parseSshOutput(
self.context.clusterNodes)
self.context.logger.debug("Output: %s" % output)
copyNode = ""
for node in self.context.clusterNodes:
if status[node] == DefaultValue.SUCCESS:
if 'GetFile' in outputMap[node]:
copyNode = node
break
if copyNode:
if not os.path.exists(self.context.upgradeBackupPath):
self.context.logger.debug("Create directory %s."
% self.context.tmpDir)
FileUtil.createDirectory(
self.context.upgradeBackupPath, True,
DefaultValue.KEY_DIRECTORY_MODE)
self.context.logger.debug("Copy the directory %s from node %s."
% (self.context.upgradeBackupPath,
copyNode))
cmd = LocalRemoteCmd.getRemoteCopyCmd(
self.context.upgradeBackupPath, self.context.tmpDir,
str(copyNode), False, 'directory')
self.context.logger.debug("Command for copying "
"directory: %s" % cmd)
CmdExecutor.execCommandLocally(cmd)
else:
# binary_upgrade exists, but no step file
return ""
if not os.path.isfile(dirFile):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50210"] % dirFile)
with open(dirFile, 'r') as fp:
retLines = fp.readlines()
if len(retLines) != 2:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50222"] % dirFile)
if mode == const.OLD:
path = retLines[0].strip()
else:
path = retLines[1].strip()
# if can get the path from file, the path must be valid,
# otherwise the file is damaged accidentally
DefaultValue.checkPathVaild(path)
if not os.path.exists(path):
if mode == const.NEW and \
self.context.action == const.ACTION_AUTO_ROLLBACK:
self.context.logger.debug("Under rollback, the new "
"cluster app path does not exists.")
elif mode == const.OLD and \
self.context.action == const.ACTION_COMMIT_UPGRADE:
self.context.logger.debug("Under commit, no need to "
"check the old path exists.")
else:
self.context.logger.debug(traceback.format_exc())
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % path)
self.context.logger.debug("Successfully Get the app"
" path [%s] from file" % path)
return path
def printPrecommitBanner(self):
"""
        function: if in pre-commit status and the commit cmd has not been
                  executed, print this message
input : NA
output: NA
"""
self.context.logger.log("Upgrade main process has been finished,"
" user can do some check now.")
self.context.logger.log("Once the check done, please execute "
"following command to commit upgrade:")
xmlFile = self.context.xmlFile \
if len(self.context.xmlFile) else "XMLFILE"
self.context.logger.log("\n gs_upgradectl -t "
"commit-upgrade -X %s \n" % xmlFile)
def doGreyCommitUpgrade(self):
"""
function: commit binary upgrade and clean up backup files
1. unset read-only
2. drop old PMK schema
3. clean up other upgrade tmp files
input : NA
output: NA
"""
try:
(status, output) = self.doHealthCheck(const.OPTION_POSTCHECK)
if status != 0:
raise Exception(
"NOTICE: " + ErrorCode.GAUSS_516[
"GAUSS_51601"] % "cluster" + output)
if self.unSetClusterReadOnlyMode() != 0:
raise Exception("NOTICE: " + ErrorCode.GAUSS_529["GAUSS_52907"])
if not (self.isNodeSpecifyStep(GreyUpgradeStep.STEP_PRE_COMMIT)
or self.isNodeSpecifyStep(
GreyUpgradeStep.STEP_BEGIN_COMMIT)):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52916"])
            # for a re-entered commit, the schema may have been deleted
if self.existTable(const.RECORD_NODE_STEP):
self.recordNodeStep(GreyUpgradeStep.STEP_BEGIN_COMMIT)
self.setActionFile()
if self.context.action == const.ACTION_LARGE_UPGRADE:
if DefaultValue.get_cm_server_num_from_static(self.context.clusterInfo) > 0:
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.reloadCmAgent()
self.reload_cmserver(is_final=True)
self.setUpgradeMode(0)
time.sleep(10)
if self.dropPMKSchema() != 0:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52917"])
self.clearOtherToolPackage()
self.cleanInstallPath(const.OLD)
self.dropSupportSchema()
self.cleanBinaryUpgradeBakFiles()
self.cleanConfBakOld()
# remove tmp global relmap file
self.cleanTmpGlobalRelampFile()
self.context.logger.log("Commit upgrade succeeded.")
except Exception as e:
self.exitWithRetCode(const.ACTION_COMMIT_UPGRADE, False, str(e))
self.exitWithRetCode(const.ACTION_COMMIT_UPGRADE, True)
def dropPMKSchema(self):
"""
function: Notice: the pmk schema on database postgres
input : NA
output: return 0, if the operation is done successfully.
return 1, if the operation failed.
"""
try:
self.context.logger.debug("Start to drop schema PMK.")
# execute drop commands by the CN instance
sql = "DROP SCHEMA IF EXISTS pmk CASCADE; "
retry_times = 0
while True:
(status, output) = self.execSqlCommandInPrimaryDN(sql)
if status != 0 or SqlResult.findErrorInSql(output):
if retry_times < 12:
self.context.logger.debug(
"ERROR: Failed to DROP SCHEMA pmk for the %d time."
" Error: \n%s" % (retry_times + 1, str(output)))
else:
raise Exception(
ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
else:
break
time.sleep(5)
retry_times += 1
self.context.logger.debug("Succcessfully deleted schema PMK.")
return 0
except Exception as e:
self.context.logger.log(
"NOTICE: Failed to execute SQL command on CN instance, "
+ "please re-commit upgrade once again or " +
"re-execute SQL command 'DROP SCHEMA "
"IF EXISTS pmk CASCADE' manually.")
self.context.logger.debug(str(e))
return 1
def cleanConfBakOld(self):
"""
clean conf.bak.old files in all instances
input : NA
output : NA
"""
try:
cmd = "%s -t %s -U %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_CONF_BAK_OLD,
self.context.user,
self.context.localLog)
hostList = copy.deepcopy(self.context.nodeNames)
self.context.sshTool.executeCommand(cmd, hostList=hostList)
except Exception as e:
raise Exception(str(e))
self.context.logger.debug(
"Successfully cleaned conf.bak.old in all instances.")
def doGreyBinaryRollback(self, action=""):
"""
function: rollback the upgrade of binary
input : NA
output: return True, if the operation is done successfully.
return False, if the operation failed.
"""
self.context.logger.log("Performing grey rollback.")
        # Before the upgrade functions and table are prepared, or after
        # commit, the table does not exist, which means there is nothing to
        # roll back. If we read the step from the file, it means a forced
        # rollback was used and the record in the table may differ from the
        # file, so we can only read the step from the file.
try:
self.distributeXml()
if action == const.ACTION_AUTO_ROLLBACK:
self.clearOtherToolPackage(action)
try:
self.getOneDNInst(True)
except Exception as e:
# don't promise DN is available in force rollback
if self.context.forceRollback:
self.context.logger.debug("Error: %s" % str(e))
else:
raise Exception(str(e))
# if the cluster is degrade and cn is down,
# the set command will be False, ignore the error
if self.unSetClusterReadOnlyMode() != 0:
self.context.logger.log(
"WARNING: Failed to unset cluster read only mode.")
if self.context.forceRollback:
# if one node is uninstalled,
# there will be no binary_upgrade dir
self.createBakPath()
self.setReadStepFromFile()
self.createGphomePack()
            # The first time the user may use force rollback, but the next
            # time they may not, so the step file and step table may differ;
            # we can only read the step from the file (syncing them is not
            # important). Under force upgrade, only read the step from the file.
maxStep = self.getNodeStep()
            # if maxStep is -2, there is no need to execute rollback;
            # under upgrade continue mode it will do upgrade instead of
            # rollback, and it can enter the upgrade process
            # when the binary_upgrade bak dir contains some files
if maxStep == const.BINARY_UPGRADE_NO_NEED_ROLLBACK:
self.cleanBinaryUpgradeBakFiles(True)
self.context.logger.log("No need to rollback.")
return True
elif maxStep == GreyUpgradeStep.STEP_BEGIN_COMMIT:
self.context.logger.log(
ErrorCode.GAUSS_529["GAUSS_52919"] +
" Please commit again! Can not rollback any more.")
return False
# Mark that we leave pre commit status,
# so that if we fail at the first few steps,
# we won't be allowed to commit upgrade any more.
elif maxStep == GreyUpgradeStep.STEP_PRE_COMMIT:
nodes = self.getNodesWithStep(maxStep)
self.recordNodeStep(
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG, nodes)
maxStep = self.getNodeStep()
if maxStep == GreyUpgradeStep.STEP_UPDATE_POST_CATALOG:
self.context.logger.debug(
"Record the step %d to mark it has leaved pre-commit"
" status." % GreyUpgradeStep.STEP_UPDATE_POST_CATALOG)
try:
if self.context.action == const.ACTION_LARGE_UPGRADE\
and \
self.isNodeSpecifyStep(
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG):
self.prepareUpgradeSqlFolder()
self.prepareSql("rollback-post")
self.setUpgradeMode(2)
self.execRollbackUpgradedCatalog(
scriptType="rollback-post")
# restore old relmap file after rollback-post
self.restoreGlobalRelampFile()
except Exception as e:
if self.context.forceRollback:
self.context.logger.debug("Error: %s" % str(e))
else:
raise Exception(str(e))
nodes = self.getNodesWithStep(maxStep)
self.recordNodeStep(GreyUpgradeStep.STEP_UPGRADE_PROCESS, nodes)
# roll back the nodes from maxStep; each node does its own rollback
needSwitchProcess = False
if maxStep >= GreyUpgradeStep.STEP_UPGRADE_PROCESS:
needSwitchProcess = True
if maxStep >= GreyUpgradeStep.STEP_SWITCH_NEW_BIN:
self.greyRestoreConfig()
self.clean_cm_instance()
self.switchBin(const.OLD)
self.greyRestoreGuc()
if needSwitchProcess:
self.rollbackHotpatch()
self.getOneDNInst(checkNormal=True)
self.switchExistsProcess(True)
self.recordNodeStep(GreyUpgradeStep.STEP_UPDATE_CATALOG)
if maxStep >= GreyUpgradeStep.STEP_UPDATE_CATALOG and\
self.context.action == const.ACTION_LARGE_UPGRADE:
self.rollbackCatalog()
self.recordNodeStep(GreyUpgradeStep.STEP_INIT_STATUS)
if maxStep >= GreyUpgradeStep.STEP_INIT_STATUS:
# clean on all the nodes, because the binary_upgrade temp
# dir is created on every node
self.cleanInstallPath(const.NEW)
self.getOneDNInst()
self.dropSupportSchema()
self.initOmRollbackProgressFile()
self.cleanBinaryUpgradeBakFiles(True)
self.cleanTmpGlobalRelampFile()
except Exception as e:
self.context.logger.debug(str(e))
self.context.logger.debug(traceback.format_exc())
self.context.logger.log("Rollback failed. Error: %s" % str(e))
return False
self.context.logger.log("Rollback succeeded.")
return True
def setReadStepFromFile(self):
readFromFileFlag = os.path.join(self.context.upgradeBackupPath,
const.READ_STEP_FROM_FILE_FLAG)
self.context.logger.debug("Under force rollback mode.")
FileUtil.createFile(readFromFileFlag, True, DefaultValue.KEY_FILE_MODE)
self.distributeFile(readFromFileFlag)
self.context.logger.debug("Create file %s. " % readFromFileFlag +
"Only read step from file.")
def getNodeStep(self):
"""
get node step from file or table
"""
maxStep = self.getNodeStepFile()
return maxStep
def getNodeStepFile(self):
if not os.path.exists(self.context.upgradeBackupPath):
self.context.logger.debug("Directory %s does not exist. "
"Only clean remaining files and schema."
% self.context.upgradeBackupPath)
return const.BINARY_UPGRADE_NO_NEED_ROLLBACK
if not os.path.isdir(self.context.upgradeBackupPath):
raise Exception(ErrorCode.GAUSS_513["GAUSS_50211"] %
self.context.upgradeBackupPath)
# because the binary_upgrade dir is used to block expansion,
# we should clean the dir when rolling back
fileList = os.listdir(self.context.upgradeBackupPath)
if not fileList:
return GreyUpgradeStep.STEP_INIT_STATUS
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
if not os.path.exists(stepFile):
self.context.logger.debug(
"No need to rollback. File %s does not exist." % stepFile)
return const.BINARY_UPGRADE_NO_NEED_ROLLBACK
self.context.logger.debug("Get the node step from file %s." % stepFile)
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
maxStep = const.INVALID_UPRADE_STEP
for row in reader:
self.checkStep(row['step'])
maxStep = max(int(row['step']), maxStep)
if row['upgrade_action'] != self.context.action:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50222"] %
stepFile +
"\nIncorrect upgrade strategy, input "
"upgrade type: %s; record upgrade type: %s"
% (self.context.action,
row['upgrade_action']))
self.context.logger.debug("Get the max step [%d] from file." % maxStep)
self.context.logger.debug(
"Successfully get the node step from file %s." % stepFile)
return maxStep
def checkActionInTableOrFile(self):
"""
under force upgrade, the step file and the table may not be
consistent, so we only use the step file
"""
self.checkActionInFile()
def execSqlCommandInPrimaryDN(self, sql, retryTime=3):
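"""
function: execute sql on the primary DN, retrying up to retryTime
times when the command fails or the output contains an error
input : sql, retryTime
output: status, output of the last attempt
"""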
self.context.logger.debug("Start to exec sql {0}.".format(sql))
count = 0
status, output = 1, ""
while count < retryTime:
self.context.logger.debug(
"Exec sql in dn node {0}".format(self.dnInst.hostname))
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True)
self.context.logger.debug(
"Exec sql result is, status:{0}, output is {1}".format(
status, output))
if status != 0 or SqlResult.findErrorInSql(output):
count += 1
continue
else:
break
return status, output
def checkActionInFile(self):
"""
function: check whether the current action is the same
as the recorded action in the file
input : NA
output: NA
"""
try:
self.context.logger.debug("Check the action in file.")
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
self.context.logger.debug(
ErrorCode.GAUSS_502["GAUSS_50201"] % (stepFile))
return
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
upgrade_action = row['upgrade_action']
if self.context.action != upgrade_action:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52925"] % (
self.context.action, upgrade_action))
self.context.logger.debug("Successfully check the action in file.")
return
except Exception as e:
self.context.logger.debug("Failed to check action in table.")
raise Exception(str(e))
def getNodesWithStep(self, step):
"""
get nodes with the given step from step file or table
"""
nodes = self.getNodesWithStepFile(step)
return nodes
def getNodesWithStepFile(self, step):
"""
get nodes with the given step from file upgrade_step.csv
"""
stepFile = os.path.join(self.context.upgradeBackupPath,
const.GREY_UPGRADE_STEP_FILE)
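# upgrade_step.csv stores one row per node; the columns used here
# include node_host, step and upgrade_action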
self.context.logger.debug("Get the node step from file %s." % stepFile)
nodes = []
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if not row['step'].isdigit():
raise Exception(ErrorCode.GAUSS_529["GAUSS_52926"])
if int(row['step']) == step:
nodes.append(row['node_host'])
self.context.logger.debug("Nodes %s is step %d" % (nodes, step))
return nodes
def greyRestoreConfig(self):
"""
deal with the lib/postgresql/pg_plugin dir.
Under rollback, we use the new pg_plugin dir as the base: files in
the new dir but not in the old dir are moved to the old dir, to keep
added C functions, and files dropped in the new dir are removed from
the old dir, to reflect dropped C functions. Configs that may have
been changed by user actions are copied from the new dir to the old dir
"""
cmd = "%s -t %s -U %s --old_cluster_app_path=%s " \
"--new_cluster_app_path=%s -l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_GREY_RESTORE_CONFIG,
self.context.user,
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.localLog)
if self.context.forceRollback:
cmd += " --force"
self.context.logger.debug("Command for restoring config: %s" % cmd)
rollbackList = copy.deepcopy(self.context.clusterNodes)
self.context.sshTool.executeCommand(cmd, hostList=rollbackList)
self.context.logger.debug("Successfully restore config.")
def greyRestoreGuc(self):
"""
restore the old guc in rollback
:return: NA
"""
cmd = "%s -t %s -U %s --old_cluster_app_path=%s -X %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_GREY_RESTORE_GUC,
self.context.user,
self.context.oldClusterAppPath,
self.context.xmlFile,
self.context.localLog)
if self.context.forceRollback:
cmd += " --force"
self.context.logger.debug("Command for restoring GUC: %s" % cmd)
rollbackList = copy.deepcopy(self.context.clusterNodes)
self.context.sshTool.executeCommand(cmd, hostList=rollbackList)
self.context.logger.debug("Successfully restore guc.")
def dropSupportSchema(self):
self.context.logger.debug("Drop schema.")
sql = "DROP SCHEMA IF EXISTS %s CASCADE;" % const.UPGRADE_SCHEMA
retryTime = 0
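# execSqlCommandInPrimaryDN retries internally as well, so the drop
# statement is attempted up to 5 * 3 times before we give up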
try:
while retryTime < 5:
(status, output) = self.execSqlCommandInPrimaryDN(sql)
if status != 0 or SqlResult.findErrorInSql(output):
retryTime += 1
self.context.logger.debug(
"Failed to execute SQL: %s. Error: \n%s. retry" % (
sql, str(output)))
else:
break
if status != 0 or SqlResult.findErrorInSql(output):
self.context.logger.debug(
"Failed to execute SQL: %s. Error: \n%s" % (
sql, str(output)))
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Please drop manually with this command.")
self.context.logger.debug("Successfully drop schema %s cascade." %
const.UPGRADE_SCHEMA)
except Exception as e:
if self.context.forceRollback:
self.context.logger.log(
"Failed to drop schema. Please drop manually "
"with this command: \n %s" % sql)
else:
raise Exception(str(e))
def doInplaceBinaryRollback(self):
"""
function: rollback the upgrade of binary
input : NA
output: return True, if the operation is done successfully.
return False, if the operation failed.
"""
self.context.logger.log("Performing inplace rollback.")
# step flag
# const.BINARY_UPGRADE_NO_NEED_ROLLBACK value is -2
# const.INVALID_UPRADE_STEP value is -1
# const.BINARY_UPGRADE_STEP_INIT_STATUS value is 0
# const.BINARY_UPGRADE_STEP_STOP_NODE value is 2
# const.BINARY_UPGRADE_STEP_BACKUP_VERSION value is 3
# const.BINARY_UPGRADE_STEP_UPGRADE_APP value is 4
# const.BINARY_UPGRADE_STEP_START_NODE value is 5
# const.BINARY_UPGRADE_STEP_PRE_COMMIT value is 6
self.distributeXml()
step = self.getNodeStepInplace()
if step == const.BINARY_UPGRADE_NO_NEED_ROLLBACK:
self.context.logger.log("Rollback succeeded.")
return True
# if step <= -1, it means the step file is broken, exit.
if step <= const.INVALID_UPRADE_STEP:
self.context.logger.debug("Invalid upgrade step: %s." % str(step))
return False
# if the step value is const.BINARY_UPGRADE_STEP_PRE_COMMIT
# and the commit flag file is found,
# it means the user has committed the upgrade,
# so rollback is no longer possible
if step == const.BINARY_UPGRADE_STEP_PRE_COMMIT:
if not self.checkCommitFlagFile():
self.context.logger.log(
"Upgrade has already been committed, "
"can not execute rollback command any more.")
return False
try:
self.checkStaticConfig()
self.start_strategy()
# Mark that we leave pre commit status,
# so that if we fail at the first few steps,
# we won't be allowed to commit upgrade any more.
if step == const.BINARY_UPGRADE_STEP_PRE_COMMIT:
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_START_NODE)
if step >= const.BINARY_UPGRADE_STEP_START_NODE:
# drop table and index after large upgrade
if self.isLargeInplaceUpgrade:
if self.check_upgrade_mode():
self.drop_table_or_index()
self.restoreClusterConfig(True)
self.clean_cm_instance()
self.switchBin(const.OLD)
if self.isLargeInplaceUpgrade:
touchInitFlagFile = os.path.join(
self.context.upgradeBackupPath, "touch_init_flag")
if os.path.exists(touchInitFlagFile):
self.rollbackCatalog()
self.cleanCsvFile()
else:
self.setUpgradeMode(0)
else:
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.stop_strategy()
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_UPGRADE_APP)
if step >= const.BINARY_UPGRADE_STEP_UPGRADE_APP:
self.restoreNodeVersion()
self.restoreClusterConfig(True)
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_BACKUP_VERSION)
if step >= const.BINARY_UPGRADE_STEP_BACKUP_VERSION:
self.cleanBackupedCatalogPhysicalFiles(True)
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_STOP_NODE)
if step >= const.BINARY_UPGRADE_STEP_STOP_NODE:
self.start_strategy()
self.recordNodeStepInplace(
const.ACTION_INPLACE_UPGRADE,
const.BINARY_UPGRADE_STEP_INIT_STATUS)
if step >= const.BINARY_UPGRADE_STEP_INIT_STATUS:
if self.unSetClusterReadOnlyMode() != 0:
raise Exception("NOTICE: " +
ErrorCode.GAUSS_529["GAUSS_52907"])
self.cleanBinaryUpgradeBakFiles(True)
self.cleanInstallPath(const.NEW)
self.cleanTmpGlobalRelampFile()
# install kerberos
self.install_kerberos()
except Exception as e:
self.context.logger.error(str(e))
self.context.logger.log("Rollback failed.")
return False
self.context.logger.log("Rollback succeeded.")
return True
def check_table_or_index_exist(self, name, eachdb):
"""
check whether a table or index exists
:return: True if it exists, False otherwise
"""
sql = "select count(*) from pg_class where relname = '%s';" % name
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0 or SqlResult.findErrorInSql(output):
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
if output == '0':
self.context.logger.debug("Table does not exist.")
return False
self.context.logger.debug("Table exists.")
return True
def drop_table_or_index(self):
"""
drop the upgrade temp table and indexes
:return:
"""
self.context.logger.debug("Start to drop table or index")
database_list = self.getDatabaseList()
# drop table and index
maindb = "postgres"
otherdbs = database_list
otherdbs.remove("postgres")
# check table exist in postgres
table_name = 'pg_proc_temp_oids'
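# if the temp table is absent in postgres, the cleanup is treated
# as already done and the other databases are skipped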
if self.check_table_or_index_exist(table_name, maindb):
self.drop_one_database_table_or_index([maindb])
else:
return
# drop other database table and index
self.drop_one_database_table_or_index(otherdbs)
self.context.logger.debug(
"Successfully droped table or index.")
def drop_one_database_table_or_index(self,
database_list):
"""
drop the temp table and indexes in the given databases
:return:
"""
table_name = 'pg_proc_temp_oids'
delete_table_sql = "START TRANSACTION;SET IsInplaceUpgrade = on;" \
"drop table %s;commit;" % table_name
index_name_list = ['pg_proc_oid_index_temp',
'pg_proc_proname_args_nsp_index_temp']
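# each drop runs in a transaction that first sets IsInplaceUpgrade
# to on, so that these system-catalog temp objects can be dropped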
for eachdb in database_list:
if self.check_table_or_index_exist(table_name, eachdb):
(status, output) = ClusterCommand.remoteSQLCommand(
delete_table_sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(
ErrorCode.GAUSS_513["GAUSS_51300"] % delete_table_sql
+ " Error: \n%s" % str(output))
for index in index_name_list:
if self.check_table_or_index_exist(index, eachdb):
sql = "START TRANSACTION;SET IsInplaceUpgrade = on;" \
"drop index %s;commit;" % index
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
if status != 0:
raise Exception(
ErrorCode.GAUSS_513[
"GAUSS_51300"] % sql + " Error: \n%s" % str(
output))
def rollbackCatalog(self):
"""
function: rollback catalog change
steps:
1.prepare update sql file and check sql file
2.do rollback catalog
input : NA
output: NA
"""
try:
if self.context.action == const.ACTION_INPLACE_UPGRADE and int(
float(self.context.oldClusterNumber) * 1000) <= 93000:
raise Exception("For this old version %s, we only support "
"physical rollback." % str(
self.context.oldClusterNumber))
self.context.logger.log("Rollbacking catalog.")
self.prepareUpgradeSqlFolder()
self.prepareSql()
self.doRollbackCatalog()
self.context.logger.log("Successfully Rollbacked catalog.")
except Exception as e:
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.context.logger.debug(
"Failed to perform rollback operation by rolling "
"back SQL files:\n%s" % str(e))
try:
self.context.logger.debug("Try to recover again using "
"catalog physical files")
self.doPhysicalRollbackCatalog()
except Exception as e:
raise Exception(
"Failed to rollback catalog. ERROR: %s" % str(e))
else:
raise Exception(
"Failed to rollback catalog. ERROR: %s" % str(e))
def doRollbackCatalog(self):
"""
function : rollback catalog change
steps:
stop cluster
set upgrade_from param
start cluster
connect database and rollback catalog changes one by one
stop cluster
unset upgrade_from param
input : NA
output: NA
"""
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.start_strategy(is_final=False)
self.setUpgradeFromParam(self.context.oldClusterNumber)
self.setUpgradeMode(1)
else:
self.setUpgradeFromParam(self.context.oldClusterNumber)
self.setUpgradeMode(2)
self.reloadCmAgent()
self.execRollbackUpgradedCatalog()
self.pgxcNodeUpdateLocalhost("rollback")
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.setUpgradeMode(0)
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.stop_strategy(is_final=False)
else:
self.reloadCmAgent()
def doPhysicalRollbackCatalog(self):
"""
function : roll back catalog by restoring physical files
stop cluster
unset upgrade_from param
restore physical files
input : NA
output: NA
"""
try:
self.start_strategy(is_final=False)
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.setUpgradeMode(0)
self.stop_strategy(is_final=False)
self.execPhysicalRollbackUpgradedCatalog()
except Exception as e:
raise Exception(str(e))
def execPhysicalRollbackUpgradedCatalog(self):
"""
function : roll back catalog by restoring physical files
send cmd to all nodes
input : NA
output: NA
"""
try:
if self.isLargeInplaceUpgrade:
self.context.logger.debug(
"Start to restore physical catalog files.")
# send cmd to all nodes and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s " \
"--oldcluster_num='%s' -X '%s' -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RESTORE_OLD_CLUSTER_CATALOG_PHYSICAL_FILES,
self.context.user,
self.context.upgradeBackupPath,
self.context.oldClusterNumber,
self.context.xmlFile,
self.context.localLog)
self.context.logger.debug(
"Command for restoring physical catalog files: %s." % cmd)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug(
"Successfully restored physical catalog files.")
except Exception as e:
raise Exception(str(e))
def getSqlHeader(self):
"""
function: get sql header
input : NA
output : list of header SQL statements
"""
header = ["START TRANSACTION;"]
header.append("SET %s = on;" % const.ON_INPLACE_UPGRADE)
header.append("SET search_path = 'pg_catalog';")
header.append("SET local client_min_messages = NOTICE;")
header.append("SET local log_min_messages = NOTICE;")
return header
def getFileNameList(self, filePathName, scriptType="_"):
"""
function: get file name list
input : filePathName
output : []
"""
filePath = "%s/upgrade_sql/%s" % (self.context.upgradeBackupPath,
filePathName)
allFileList = os.listdir(filePath)
upgradeFileList = []
if len(allFileList) == 0:
return []
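# a candidate file name has five '_'-separated fields before the
# extension; the last two fields form the script version number, and
# only scripts whose version is greater than the old cluster number
# and not greater than the new cluster number are selected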
for each_sql_file in allFileList:
if not os.path.isfile("%s/%s" % (filePath, each_sql_file)):
continue
prefix = each_sql_file.split('.')[0]
resList = prefix.split('_')
if len(resList) != 5 or scriptType not in resList:
continue
file_num = "%s.%s" % (resList[3], resList[4])
if self.floatMoreThan(float(file_num),
self.context.oldClusterNumber) and \
self.floatGreaterOrEqualTo(self.context.newClusterNumber,
float(file_num)):
upgradeFileList.append(each_sql_file)
return upgradeFileList
def initClusterInfo(self, dbClusterInfoPath):
"""
function: init the cluster
input : dbClusterInfoPath
output: dbClusterInfo
"""
clusterInfoModules = OldVersionModules()
fileDir = os.path.dirname(os.path.realpath(dbClusterInfoPath))
sys.path.insert(0, fileDir)
# init cluster information
gp_home = ClusterDir.getClusterToolPath(self.context.user)
gauss_home = ClusterDir.getInstallDir(self.context.user)
gp_home_version = os.path.join(gp_home, "script", "gspylib", "common", "VersionInfo.py")
gauss_home_version = os.path.join(gauss_home, "bin", "script",
"gspylib", "common", "VersionInfo.py")
if not os.path.isfile(gp_home_version) and os.path.isfile(gauss_home_version):
FileUtil.cpFile(gauss_home_version, gp_home_version)
clusterInfoModules.oldDbClusterInfoModule = __import__('DbClusterInfo')
sys.path.remove(fileDir)
return clusterInfoModules.oldDbClusterInfoModule.dbClusterInfo()
def initOldClusterInfo(self, dbClusterInfoPath):
"""
function: init old cluster information
input : dbClusterInfoPath
output: clusterInfoModules.oldDbClusterInfoModule.dbClusterInfo()
"""
clusterInfoModules = OldVersionModules()
fileDir = os.path.dirname(os.path.realpath(dbClusterInfoPath))
# script and OldDbClusterInfo.py are in the same PGHOST directory
sys.path.insert(0, fileDir)
# V1R8 DbClusterInfo.py is "from gspylib.common.ErrorCode import
# ErrorCode"
sys.path.insert(0, os.path.join(fileDir, "script"))
# init old cluster information
clusterInfoModules.oldDbClusterInfoModule = \
__import__('OldDbClusterInfo')
return clusterInfoModules.oldDbClusterInfoModule.dbClusterInfo()
def initClusterConfig(self):
"""
function: init cluster info
input : NA
output: NA
"""
gaussHome = \
EnvUtil.getEnvironmentParameterValue("GAUSSHOME",
self.context.user)
# $GAUSSHOME must have an available value.
if gaussHome == "":
raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"] % "$GAUSSHOME")
(appPath, appPathName) = os.path.split(gaussHome)
commonDbClusterInfoModule = \
"%s/bin/script/gspylib/common/DbClusterInfo.py" % gaussHome
commonStaticConfigFile = "%s/bin/cluster_static_config" % gaussHome
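# three cases follow: inplace upgrade loads the old cluster info
# from the backup path (or the install path); chose-strategy and
# commit reuse the current cluster info as the old one; upgrade and
# rollback load the old info from the recorded old install path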
try:
if self.context.action == const.ACTION_INPLACE_UPGRADE:
# get DbClusterInfo.py and cluster_static_config from both the
# backup path and the install path
# get oldClusterInfo:
# if the backup files exist, use them;
# else if the install files exist, use them;
# else, we cannot get oldClusterInfo, so exit.
# backup path exists
commonDbClusterInfoModuleBak = "%s/../OldDbClusterInfo.py" % \
self.context.upgradeBackupPath
commonStaticConfigFileBak = "%s/../cluster_static_config" % \
self.context.upgradeBackupPath
# if binary.tar exists, decompress it
if os.path.isfile("%s/%s" % (self.context.upgradeBackupPath,
self.context.binTarName)):
cmd = "cd '%s'&&tar xfp '%s'" % \
(self.context.upgradeBackupPath,
self.context.binTarName)
(status, output) = subprocess.getstatusoutput(cmd)
if (status != 0):
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
cmd + "Error: \n%s" % str(output))
if (os.path.isfile(commonDbClusterInfoModuleBak)
and os.path.isfile(commonStaticConfigFileBak)):
try:
# import old module
# init old cluster config
self.context.oldClusterInfo = \
self.initOldClusterInfo(
commonDbClusterInfoModuleBak)
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, commonStaticConfigFileBak)
except Exception as e:
# maybe the old cluster is the V1R5C00 TR5 version, which does
# not support specifying a static config file
# path for the initFromStaticConfig function,
# so try again with the new cluster format
self.context.oldClusterInfo = dbClusterInfo()
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, commonStaticConfigFileBak)
# if the backup path does not exist, then use the install path
elif (os.path.isfile(commonDbClusterInfoModule)
and os.path.isfile(commonStaticConfigFile)):
# import old module
# init old cluster config
self.context.oldClusterInfo = \
self.initClusterInfo(commonDbClusterInfoModule)
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, commonStaticConfigFile)
else:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
"static config file")
# get the accurate logPath
logPathWithUser = EnvUtil.getEnv("GAUSSLOG")
DefaultValue.checkPathVaild(logPathWithUser)
splitMark = "/%s" % self.context.user
self.context.oldClusterInfo.logPath = \
logPathWithUser[0:(logPathWithUser.rfind(splitMark))]
# init new cluster config
# if xmlFile != "", init it by initFromXml();
# else, using oldClusterInfo
if self.context.xmlFile != "":
# get clusterInfo
# if initializing dbClusterInfo fails, it means the
# DbClusterInfo.py is not correct,
# so we will use the backup file instead
self.context.clusterInfo = dbClusterInfo()
try:
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
self.context.logger.error(str(e))
try:
# init clusterinfo from backup dbclusterinfo
self.context.clusterInfo = \
self.initOldClusterInfo(
commonDbClusterInfoModuleBak)
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
try:
self.context.clusterInfo = \
self.initClusterInfo(
commonDbClusterInfoModule)
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
raise Exception(str(e))
# verify cluster config info between old and new cluster
self.verifyClusterConfigInfo(self.context.clusterInfo,
self.context.oldClusterInfo)
# after doing verifyClusterConfigInfo(),
# the clusterInfo and oldClusterInfo have been changed,
# so we should init them again
self.context.clusterInfo = dbClusterInfo()
try:
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
self.context.logger.debug(str(e))
try:
# init clusterinfo from backup dbclusterinfo
self.context.clusterInfo = \
self.initOldClusterInfo(
commonDbClusterInfoModuleBak)
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
try:
self.context.clusterInfo = \
self.initClusterInfo(
commonDbClusterInfoModule)
self.context.clusterInfo.initFromXml(
self.context.xmlFile)
except Exception as e:
raise Exception(str(e))
else:
self.context.clusterInfo = self.context.oldClusterInfo
elif (self.context.action == const.ACTION_CHOSE_STRATEGY
or self.context.action == const.ACTION_COMMIT_UPGRADE):
# after switching to the new bin, gausshome points to the new
# version, so the oldClusterNumber is the same as
# newClusterNumber and the oldClusterInfo is the same as the new
try:
self.context.oldClusterInfo = self.context.clusterInfo
self.getOneDNInst(True)
if os.path.isfile(commonDbClusterInfoModule) and \
os.path.isfile(commonStaticConfigFile):
# import old module
# init old cluster config
self.context.oldClusterInfo = \
self.initClusterInfo(commonDbClusterInfoModule)
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, commonStaticConfigFile)
else:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
"static config file")
except Exception as e:
# upgrade backup path
if (os.path.exists(
"%s/%s/bin/script/util/DbClusterInfo.py" % (
self.context.upgradeBackupPath, appPathName))):
binaryModuleBak = \
"%s/%s/bin/script/util/DbClusterInfo.py" % \
(self.context.upgradeBackupPath, appPathName)
else:
binaryModuleBak = \
"%s/%s/bin/script/gspylib/common/" \
"DbClusterInfo.py" % \
(self.context.upgradeBackupPath, appPathName)
binaryStaticConfigFileBak = \
"%s/%s/bin/cluster_static_config" % \
(self.context.upgradeBackupPath, appPathName)
if os.path.isfile(binaryModuleBak) and \
os.path.isfile(binaryStaticConfigFileBak):
# import old module
# init old cluster config
commonDbClusterInfoModuleBak = \
"%s/../OldDbClusterInfo.py" % \
self.context.upgradeBackupPath
self.context.oldClusterInfo = \
self.initOldClusterInfo(
commonDbClusterInfoModuleBak)
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, binaryStaticConfigFileBak)
else:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
"static config file")
elif (self.context.action in
[const.ACTION_SMALL_UPGRADE, const.ACTION_AUTO_UPGRADE,
const.ACTION_LARGE_UPGRADE, const.ACTION_AUTO_ROLLBACK]):
# 1. get new cluster info
self.context.clusterInfo = dbClusterInfo()
self.context.clusterInfo.initFromXml(self.context.xmlFile)
# 2. get oldClusterInfo
# under rollback,
# gausshome may point to the old or the new clusterAppPath,
# so we must choose from the record table.
# When upgrading abnormal nodes, gausshome points to
# newClusterAppPath
oldPath = self.getClusterAppPath()
if oldPath != "" and os.path.exists(oldPath):
self.context.logger.debug("The old install path is %s" %
oldPath)
commonDbClusterInfoModule = \
"%s/bin/script/gspylib/common/DbClusterInfo.py" % \
oldPath
commonStaticConfigFile = \
"%s/bin/cluster_static_config" % oldPath
else:
self.context.logger.debug("The old install path is %s"
% os.path.realpath(gaussHome))
if (os.path.isfile(commonDbClusterInfoModule)
and os.path.isfile(commonStaticConfigFile)):
# import old module
# init old cluster config
self.context.oldClusterInfo = \
self.initClusterInfo(commonDbClusterInfoModule)
self.context.oldClusterInfo.initFromStaticConfig(
self.context.user, commonStaticConfigFile)
else:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
"static config file")
staticClusterInfo = dbClusterInfo()
config = os.path.join(gaussHome, "bin/cluster_static_config")
if not os.path.isfile(config):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
os.path.realpath(config))
staticClusterInfo.initFromStaticConfig(self.context.user,
config)
# verify cluster config info between old and new cluster
self.verifyClusterConfigInfo(self.context.clusterInfo,
staticClusterInfo)
# after doing verifyClusterConfigInfo(), the clusterInfo and
# oldClusterInfo have been changed,
# so we should init them again
self.context.clusterInfo = dbClusterInfo()
# we will get the self.context.newClusterAppPath in
# choseStrategy
self.context.clusterInfo.initFromXml(self.context.xmlFile)
if self.context.is_inplace_upgrade or \
self.context.action == const.ACTION_AUTO_ROLLBACK:
self.getOneDNInst()
self.context.logger.debug("Successfully init cluster config.")
else:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50004"] % 't' +
" Value: %s" % self.context.action)
# check whether kerberos has been installed before inplace upgrade
self.context.logger.debug(
"Check whether kerberos has been installed before inplace upgrade.")
xmlfile = os.path.join(os.path.dirname(self.context.userProfile),
DefaultValue.FI_KRB_XML)
if os.path.exists(xmlfile) and \
self.context.action == const.ACTION_AUTO_UPGRADE \
and self.context.is_grey_upgrade:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50200"] % "kerberos")
if os.path.exists(xmlfile) and self.context.is_inplace_upgrade:
pghost_path = EnvUtil.getEnvironmentParameterValue(
'PGHOST', self.context.user)
destfile = "%s/krb5.conf" % os.path.dirname(
self.context.userProfile)
kerberosflagfile = "%s/kerberos_upgrade_flag" % pghost_path
cmd = "cp -rf %s %s " % (destfile, kerberosflagfile)
(status, output) = CmdUtil.retryGetstatusoutput(cmd, 3, 5)
if status != 0:
raise Exception(
ErrorCode.GAUSS_502["GAUSS_50206"] % kerberosflagfile
+ " Error: \n%s" % output)
self.context.logger.debug(
"Successful back up kerberos config file.")
except Exception as e:
self.context.logger.debug(traceback.format_exc())
self.exitWithRetCode(self.context.action, False, str(e))
def getAllStandbyDnInsts(self):
"""
function: find all normal standby dn instances by dbNodes.
input : NA
output: DN instances
"""
try:
self.context.logger.debug("Get all standby DN.")
dnList = []
dnInst = None
clusterNodes = self.context.oldClusterInfo.dbNodes
standbyDn, output = DefaultValue.getStandbyNode(
self.context.userProfile, self.context.logger)
self.context.logger.debug(
"Cluster status information is %s;The standbyDn is %s" % (
output, standbyDn))
if not standbyDn:
self.context.logger.debug("There is no standby dn")
return []
for dbNode in clusterNodes:
if len(dbNode.datanodes) == 0:
continue
dnInst = dbNode.datanodes[0]
if dnInst.hostname not in standbyDn:
continue
dnList.append(dnInst)
(checkStatus, checkResult) = OMCommand.doCheckStaus(
self.context.user, 0)
if checkStatus == 0:
self.context.logger.debug("The cluster status is normal,"
" no need to check standby dn status.")
else:
dnList = []
clusterStatus = \
OMCommand.getClusterStatus()
if clusterStatus is None:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51600"])
clusterInfo = dbClusterInfo()
clusterInfo.initFromXml(self.context.xmlFile)
clusterInfo.dbNodes.extend(clusterNodes)
for dbNode in clusterInfo.dbNodes:
if len(dbNode.datanodes) == 0:
continue
dn = dbNode.datanodes[0]
if dn.hostname not in standbyDn:
continue
dbInst = clusterStatus.getInstanceStatusById(
dn.instanceId)
if dbInst is None:
continue
if dbInst.status == "Normal":
self.context.logger.debug(
"DN from %s is healthy." % dn.hostname)
dnList.append(dn)
else:
self.context.logger.debug(
"DN from %s is unhealthy." % dn.hostname)
if not dnList:
self.context.logger.debug("There is no normal standby dn")
else:
self.context.logger.debug("Successfully get all standby DN: %s" % \
','.join(d.hostname for d in dnList))
self.dnStandbyInsts = dnList
except Exception as e:
self.context.logger.log("Failed to get all standby DN. Error: %s" % str(e))
raise Exception(ErrorCode.GAUSS_516["GAUSS_51624"])
def getOneDNInst(self, checkNormal=False):
"""
function: find a dn instance by dbNodes,
on which we can execute SQL commands
input : NA
output: DN instance
"""
try:
self.context.logger.debug(
"Get one DN. CheckNormal is %s" % checkNormal)
dnInst = None
clusterNodes = self.context.oldClusterInfo.dbNodes
primaryDnNode, output = DefaultValue.getPrimaryNode(
self.context.userProfile, self.context.logger)
self.context.logger.debug(
"Cluster status information is %s;The primaryDnNode is %s" % (
output, primaryDnNode))
if not primaryDnNode:
self.context.logger.error("Get primary DN failed. Please check cluster.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51652"] % "Get primary DN failed.")
for dbNode in clusterNodes:
if len(dbNode.datanodes) == 0:
continue
dnInst = dbNode.datanodes[0]
if dnInst.hostname not in primaryDnNode:
continue
break
if checkNormal:
(checkStatus, checkResult) = OMCommand.doCheckStaus(
self.context.user, 0)
if checkStatus == 0:
self.context.logger.debug("The cluster status is normal,"
" no need to check dn status.")
else:
clusterStatus = \
OMCommand.getClusterStatus()
if clusterStatus is None:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51600"])
clusterInfo = dbClusterInfo()
clusterInfo.initFromXml(self.context.xmlFile)
clusterInfo.dbNodes.extend(clusterNodes)
for dbNode in clusterInfo.dbNodes:
if len(dbNode.datanodes) == 0:
continue
dn = dbNode.datanodes[0]
if dn.hostname not in primaryDnNode:
continue
dbInst = clusterStatus.getInstanceStatusById(
dn.instanceId)
if dbInst is None:
continue
if dbInst.status == "Normal":
self.context.logger.debug(
"DN from %s is healthy." % dn.hostname)
dnInst = dn
break
self.context.logger.debug(
"DN from %s is unhealthy." % dn.hostname)
# check whether a DN instance was found on the nodes
if not dnInst:
raise Exception(ErrorCode.GAUSS_526["GAUSS_52602"])
else:
self.context.logger.debug("Successfully get one DN from %s."
% dnInst.hostname)
self.dnInst = dnInst
except Exception as e:
self.context.logger.log("Failed to get one DN. Error: %s" % str(e))
raise Exception(ErrorCode.GAUSS_516["GAUSS_51624"])
def verifyClusterConfigInfo(self, clusterInfo, oldClusterInfo,
ignoreFlag="upgradectl"):
"""
function: verify cluster config info between xml and static config
input : clusterInfo, oldClusterInfo
output: NA
"""
try:
# should put self.context.clusterInfo before
# self.context.oldClusterInfo,
# because self.context.oldClusterInfo is not an instance of
# dbClusterInfo
# convert new cluster information to the compare format
compnew = self.covertToCompCluster(clusterInfo)
# convert old cluster information to the compare format
compold = self.covertToCompCluster(oldClusterInfo)
# do compare
# if it is not same, print it.
theSame, tempbuffer = compareObject(compnew, compold,
"clusterInfo", [], ignoreFlag)
if (theSame):
self.context.logger.log("Static configuration matched with "
"old static configuration files.")
else:
msg = "Instance[%s] are not the same.\nXml cluster " \
"information: %s\nStatic cluster information: %s\n" % \
(tempbuffer[0], tempbuffer[1], tempbuffer[2])
self.context.logger.debug("The old cluster information is "
"from the cluster_static_config.")
raise Exception(ErrorCode.GAUSS_512["GAUSS_51217"] +
"Error: \n%s" % msg.strip("\n"))
except Exception as e:
raise Exception(str(e))
def covertToCompCluster(self, dbclusterInfo):
"""
function: convert to compare cluster
input : clusterInfo, oldClusterInfo
output: compClusterInfo
"""
# init dbcluster class
compClusterInfo = dbClusterInfo()
# get name
compClusterInfo.name = dbclusterInfo.name
# get appPath
compClusterInfo.appPath = dbclusterInfo.appPath
# get logPath
compClusterInfo.logPath = dbclusterInfo.logPath
for dbnode in dbclusterInfo.dbNodes:
compNodeInfo = dbNodeInfo()
# get datanode instance information
for datanode in dbnode.datanodes:
compNodeInfo.datanodes.append(
self.coverToCompInstance(datanode))
# get node information
compClusterInfo.dbNodes.append(compNodeInfo)
return compClusterInfo
def coverToCompInstance(self, compinstance):
"""
function: convert to compare instance
1. get instanceId
2. get mirrorId
3. get port
4. get datadir
5. get instanceType
6. get listenIps
7. get haIps
input : compinstance
output: covertedInstanceInfo
"""
covertedInstanceInfo = instanceInfo()
# get instanceId
covertedInstanceInfo.instanceId = compinstance.instanceId
# get mirrorId
covertedInstanceInfo.mirrorId = compinstance.mirrorId
# get port
covertedInstanceInfo.port = compinstance.port
# get datadir
covertedInstanceInfo.datadir = compinstance.datadir
# get instanceType
covertedInstanceInfo.instanceType = compinstance.instanceType
# get listenIps
covertedInstanceInfo.listenIps = compinstance.listenIps
# get haIps
covertedInstanceInfo.haIps = compinstance.haIps
return covertedInstanceInfo
def distributeXml(self):
"""
function: distribute the XML configuration file to every host
input : NA
output: NA
"""
self.context.logger.debug("Distributing xml configure file.",
"addStep")
try:
hosts = self.context.clusterInfo.getClusterNodeNames()
hosts.remove(NetUtil.GetHostIpOrName())
# Send xml file to every host
DefaultValue.distributeXmlConfFile(self.context.sshTool,
self.context.xmlFile,
hosts,
self.context.mpprcFile,
self.context.isSingle)
except Exception as e:
raise Exception(str(e))
self.context.logger.debug("Successfully distributed xml "
"configure file.", "constant")
def recordNodeStepInplace(self, action, step):
"""
function: record step info on all nodes
input : action, step
output: NA
"""
try:
# record step info on local node
tempPath = self.context.upgradeBackupPath
filePath = os.path.join(tempPath, const.INPLACE_UPGRADE_STEP_FILE)
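# the step file holds a single record in the form "<action>:<step>",
# which getNodeStepInplace parses back on rollback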
cmd = "echo \"%s:%d\" > %s" % (action, step, filePath)
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] %
filePath + "Error: \n%s" % str(output))
if not self.context.isSingle:
# send file to remote nodes
self.context.sshTool.scpFiles(filePath, tempPath)
self.context.logger.debug("Successfully wrote step file[%s:%d]."
% (action, step))
except Exception as e:
raise Exception(str(e))
def distributeFile(self, step_file):
"""
function: distribute file
input : step_file
output : NA
"""
self.context.logger.debug("Distribute the file %s" % step_file)
# send the file to each node
hosts = self.context.clusterInfo.getClusterNodeNames()
hosts.remove(NetUtil.GetHostIpOrName())
if not self.context.isSingle:
stepDir = os.path.normpath(os.path.dirname(step_file))
self.context.sshTool.scpFiles(step_file, stepDir, hosts)
self.context.logger.debug("Successfully distribute the file %s"
% step_file)
def getNodeStepInplace(self):
"""
function: Get the upgrade step info for inplace upgrade
input : NA
output: the upgrade step info
"""
try:
tempPath = self.context.upgradeBackupPath
# get file path and check file exists
filePath = os.path.join(tempPath, const.INPLACE_UPGRADE_STEP_FILE)
if not os.path.exists(filePath):
self.context.logger.debug("The cluster status is Normal. "
"No need to rollback.")
return const.BINARY_UPGRADE_NO_NEED_ROLLBACK
# read and check record format
stepInfo = FileUtil.readFile(filePath)[0]
stepList = stepInfo.split(":")
if len(stepList) != 2:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50004"] % filePath)
recordType = stepList[0].strip()
recordStep = stepList[1].strip()
# check upgrade type
# the record value must be consistent with the upgrade type
if self.context.action != recordType:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50004"] % "t" +
"Input upgrade type: %s record upgrade type: "
"%s\nMaybe you chose the wrong interface." %
(self.context.action, recordType))
# if record value is not digit, exit.
if not recordStep.isdigit() or int(recordStep) > \
const.BINARY_UPGRADE_STEP_PRE_COMMIT or \
int(recordStep) < const.INVALID_UPRADE_STEP:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51633"] %
recordStep)
except Exception as e:
self.context.logger.error(str(e))
return const.INVALID_UPRADE_STEP
self.context.logger.debug("The rollback step is %s" % recordStep)
return int(recordStep)
def checkStep(self, step):
"""
function: check step
input : step
output : NA
"""
if not step.isdigit() or \
int(step) > GreyUpgradeStep.STEP_BEGIN_COMMIT or \
int(step) < const.INVALID_UPRADE_STEP:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51633"] % str(step))
##########################################################################
# Offline upgrade functions
##########################################################################
def checkUpgrade(self):
"""
function: Check the environment for upgrade
input : action
output: NA
"""
self.context.logger.log("Checking upgrade environment.", "addStep")
try:
# Check the environment for upgrade
cmd = "%s -t %s -R '%s' -l '%s' -N '%s' -X '%s'" % \
(OMCommand.getLocalScript("Local_Check_Upgrade"),
self.context.action,
self.context.oldClusterAppPath,
self.context.localLog,
self.context.newClusterAppPath,
self.context.xmlFile)
self.context.logger.debug("Command for checking upgrade "
"environment: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
self.context.logger.log("Failed to check upgrade environment.",
"constant")
raise Exception(str(e))
if not self.context.forceRollback:
if self.context.oldClusterNumber >= \
const.ENABLE_STREAM_REPLICATION_VERSION:
self.check_gucval_is_inval_given(
const.ENABLE_STREAM_REPLICATION_NAME, const.VALUE_ON)
try:
if self.context.action == const.ACTION_INPLACE_UPGRADE:
self.context.logger.log(
"Successfully checked upgrade environment.", "constant")
return
self.checkActionInTableOrFile()
self.checkDifferentVersion()
self.checkOption()
except Exception as e:
self.context.logger.log(
"Failed to check upgrade environment.", "constant")
raise Exception(str(e))
self.context.logger.log(
"Successfully checked upgrade environment.", "constant")
def check_gucval_is_inval_given(self, guc_name, val_list):
"""
Checks whether the given GUC parameter is set to one of the
given values on all instances.
"""
self.context.logger.debug("checks whether the parameter:{0} is "
"the value:{1}.".format(guc_name, val_list))
guc_str = "{0}:{1}".format(guc_name, ",".join(val_list))
self.checkParam(guc_str)
self.context.logger.debug("Success to check the parameter:{0} value "
"is in the value:{1}.".format(guc_name,
val_list))
def checkDifferentVersion(self):
"""
if the cluster has only one version, there is no need to check.
if the cluster has two versions, each should be the new
version or the old version
:return:
"""
self.context.logger.debug("Check the amount of cluster version.")
failedHost = []
failMsg = ""
gaussHome = ClusterDir.getInstallDir(self.context.user)
# $GAUSSHOME must have an available value.
if gaussHome == "":
raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"] % "$GAUSSHOME")
versionFile = os.path.join(gaussHome, "bin/upgrade_version")
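# the upgrade_version file records the commit id of the installed
# binary; read it on every node so it can be compared with the old
# and new commit ids of this upgrade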
cmd = "sed -n \'3,1p\' %s" % versionFile
hostList = copy.deepcopy(self.context.clusterNodes)
(resultMap, outputCollect) = \
self.context.sshTool.getSshStatusOutput(cmd, hostList)
for key, val in resultMap.items():
if DefaultValue.FAILURE in val:
failedHost.append(key)
failMsg += val
if failedHost:
self.context.recordIgnoreOrFailedNodeInEveryNode(
self.context.failedNodeRecordFile, failedHost)
raise Exception(ErrorCode.GAUSS_529["GAUSS_52929"] + failMsg)
for result in outputCollect:
if self.newCommitId in result or self.oldCommitId in result:
continue
self.context.logger.debug(
"Find the gausssdb version %s is not same with"
" current upgrade version" % str(result))
raise Exception(ErrorCode.GAUSS_529["GAUSS_52935"])
self.context.logger.debug(
"Successfully checked the amount of cluster version.")
def checkOption(self):
"""
if the user uses -g first, and then uses -h <nodes chosen by the last -g>,
we can upgrade again
:return:
"""
if self.context.is_grey_upgrade:
self.check_option_grey()
if len(self.context.nodeNames) != 0:
self.checkOptionH()
elif self.context.upgrade_remain:
self.checkOptionContinue()
else:
self.checkOptionG()
def check_option_grey(self):
"""
if nodes have been upgraded, no need to use --grey to upgrade again
:return:
"""
stepFile = os.path.join(
self.context.upgradeBackupPath, const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
self.context.logger.debug(
"File %s does not exists. No need to check." %
const.GREY_UPGRADE_STEP_FILE)
return
grey_node_names = self.getUpgradedNodeNames()
if grey_node_names:
self.context.logger.log(
"All nodes have been upgrade, no need to upgrade again.")
self.exitWithRetCode(self.action, True)
def checkOptionH(self):
self.checkNodeNames()
stepFile = os.path.join(
self.context.upgradeBackupPath, const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
self.context.logger.debug(
"File %s does not exists. No need to check." %
const.GREY_UPGRADE_STEP_FILE)
return
if not self.isNodesSameStep(self.context.nodeNames):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52909"])
if self.isNodeSpecifyStep(
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG,
self.context.nodeNames):
raise Exception(
ErrorCode.GAUSS_529["GAUSS_52910"] % self.context.nodeNames)
nodes = self.getNodeLessThan(GreyUpgradeStep.STEP_UPDATE_POST_CATALOG)
# compare whether the current upgrade nodes are the same as the
# last unfinished node names
if nodes:
a = [i for i in self.context.nodeNames if i not in nodes]
b = [i for i in nodes if i not in self.context.nodeNames]
if len(a) != 0 or len(b) != 0:
raise Exception(
ErrorCode.GAUSS_529["GAUSS_52911"] % nodes +
" Please upgrade them first.")
def checkNodeNames(self):
self.context.logger.debug(
"Check if the node name is invalid or duplicated.")
clusterNodes = self.context.clusterInfo.getClusterNodeNames()
for nodeName in self.context.nodeNames:
if nodeName not in clusterNodes:
raise Exception(
ErrorCode.GAUSS_500["GAUSS_50011"] % ("-h", nodeName))
undupNodes = set(self.context.nodeNames)
if len(self.context.nodeNames) != len(undupNodes):
self.context.logger.log(
ErrorCode.GAUSS_500["GAUSS_50004"] % (
"h" + "Duplicates node names"))
nodeDict = {}.fromkeys(self.context.nodeNames, 0)
for name in self.context.nodeNames:
nodeDict[name] = nodeDict[name] + 1
for key, value in nodeDict.items():
if value > 1:
self.context.logger.log(
"Duplicates node name %s, "
"only keep one in grey upgrade!" % key)
self.context.nodeNames = list(undupNodes)
def isNodesSameStep(self, nodes):
"""
judge whether the given nodes are at the same step
"""
return self.isNodeSpecifyStepInFile(nodes=nodes)
def getNodeLessThan(self, step):
"""
get the nodes whose step is less than the specified step and is not 0
"""
nodes = self.getNodeLessThanInFile(step)
return nodes
def getNodeLessThanInFile(self, step):
"""
get the nodes whose step is less than the specified step and is not 0
"""
try:
stepFile = os.path.join(
self.context.upgradeBackupPath, const.GREY_UPGRADE_STEP_FILE)
self.context.logger.debug("trying to get nodes that step is "
"less than %s from %s" % (step, stepFile))
if not os.path.isfile(stepFile):
return []
nodes = []
with open(stepFile, 'r') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if int(row['step']) != 0 and int(row['step']) < step:
nodes.append(row['node_host'])
self.context.logger.debug("successfully got nodes that step is "
"less than %s from %s" % (step, stepFile))
return nodes
except Exception as e:
exitMsg = "Failed to get nodes that step is less than {0} " \
"from {1}. ERROR {2}".format(step, stepFile, str(e))
self.exitWithRetCode(self.action, False, exitMsg)
def checkOptionContinue(self):
stepFile = os.path.join(
self.context.upgradeBackupPath, const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52920"] +
"Need to upgrade some nodes first.")
greyNodeNames = self.getUpgradedNodeNames()
# the nodes that have been upgraded should have reached pre-commit
if not self.isNodeSpecifyStep(GreyUpgradeStep.STEP_UPDATE_POST_CATALOG,
greyNodeNames):
raise Exception(ErrorCode.GAUSS_529["GAUSS_52912"])
if len(greyNodeNames) == len(self.context.clusterInfo.dbNodes):
self.printPrecommitBanner()
self.context.logger.debug(
"The node host in table %s.%s is equal to cluster nodes."
% (const.UPGRADE_SCHEMA, const.RECORD_NODE_STEP))
raise Exception(ErrorCode.GAUSS_529["GAUSS_52913"])
if not self.checkVersion(self.newCommitId, greyNodeNames):
raise Exception(
ErrorCode.GAUSS_529["GAUSS_52914"] +
"Please use the same version to upgrade remain nodes.")
def checkOptionG(self):
stepFile = os.path.join(
self.context.upgradeBackupPath, const.GREY_UPGRADE_STEP_FILE)
if not os.path.isfile(stepFile):
self.context.logger.debug(
"File %s does not exists. No need to check." %
const.GREY_UPGRADE_STEP_FILE)
return
# -g only supports two rounds of upgrade; if some nodes have been
# upgraded, -g cannot be used to upgrade the other nodes
greyNodeNames = self.getUpgradedNodeNames()
if not greyNodeNames:
self.context.logger.debug("No node has ever been upgraded.")
return
else:
raise Exception("-g only support if no node has ever been upgraded"
" ,nodes %s have been upgraded, "
"so can use --continue instead of -g to upgrade"
" other nodes" % greyNodeNames)
def backupClusterConfig(self):
"""
function: Backup the cluster config
input : NA
output: NA
"""
# backup list:
# cluster_static_config
# cluster_dynamic_config
# etc/gscgroup_xxx.cfg
# lib/postgresql/pg_plugin
# server.key.cipher
# server.key.rand
# datasource.key.cipher
# datasource.key.rand
# usermapping.key.cipher
# usermapping.key.rand
# subscription.key.cipher
# subscription.key.rand
# utilslib
# /share/sslsert/ca.key
# /share/sslsert/etcdca.crt
# catalog physical files
# Data Studio lib files
# gds files
# javaUDF
# postGIS
# hadoop_odbc_connector extension files
# libsimsearch etc files and lib files
self.context.logger.log("Backing up cluster configuration.", "addStep")
try:
# send cmd to all nodes and exec
cmd = "%s -t %s -U %s -V %d --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_BACKUP_CONFIG,
self.context.user,
int(float(self.context.oldClusterNumber) * 1000),
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("Command for backing up cluster "
"configuration: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
# backup hotpatch info file
self.backupHotpatch()
# backup version file.
self.backup_version_file()
if not self.isLargeInplaceUpgrade:
return
# backup catalog data files if needed
self.backupCatalogFiles()
# backup DS libs and gds file
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_INPLACE_BACKUP,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug(
"Command for backing up gds file: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
except Exception as e:
raise Exception(str(e))
self.context.logger.log("Successfully backed up cluster "
"configuration.", "constant")
def backupCatalogFiles(self):
"""
function: backup physical files of catalog objects
1.check if it is inplace upgrade
2.get database list
3.get catalog objects list
4.backup physical files for each database
5.backup global folder
input : NA
output: NA
"""
try:
# send cmd to all nodes and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s " \
"--oldcluster_num='%s' -X '%s' -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_BACKUP_OLD_CLUSTER_CATALOG_PHYSICAL_FILES,
self.context.user,
self.context.upgradeBackupPath,
self.context.oldClusterNumber,
self.context.xmlFile,
self.context.localLog)
self.context.logger.debug("Command for backing up physical files "
"of catalg objects: %s" % cmd)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
self.context.logger.debug("Successfully backed up catalog "
"physical files for old cluster.")
except Exception as e:
raise Exception(str(e))
def syncNewGUC(self):
"""
function: sync newly added guc during inplace upgrade.
For now, we only sync guc of cm_agent and cm_server
input : NA
output: NA
"""
self.context.logger.debug("Start to sync new guc.", "addStep")
try:
# send cmd to all nodes and exec
cmd = "%s -t %s -U %s --upgrade_bak_path=%s " \
"--new_cluster_app_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_SYNC_CONFIG,
self.context.user,
self.context.upgradeBackupPath,
self.context.newClusterAppPath,
self.context.localLog,)
self.context.logger.debug(
"Command for synchronizing new guc: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
self.context.logger.debug("Failed to synchronize new guc.",
"constant")
raise Exception(str(e))
self.context.logger.debug("Successfully synchronized new guc.",
"constant")
def waitClusterForNormal(self, waitTimeOut=300):
"""
function: Wait for the cluster to become Normal
input : waitTimeOut
output: NA
"""
self.context.logger.log("Waiting for the cluster status to "
"become normal.")
dotCount = 0
# get the end time
endTime = datetime.now() + timedelta(seconds=int(waitTimeOut))
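# poll the cluster status every 5 seconds until it is normal or the
# timeout expires; one progress dot is printed per poll and the dot
# line wraps after every 12 dots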
while True:
time.sleep(5)
sys.stdout.write(".")
dotCount += 1
if dotCount >= 12:
dotCount = 0
sys.stdout.write("\n")
(checkStatus, checkResult) = \
OMCommand.doCheckStaus(self.context.user, 0)
if checkStatus == 0:
if dotCount != 0:
sys.stdout.write("\n")
self.context.logger.log("The cluster status is normal.")
break
if datetime.now() >= endTime:
if dotCount != 0:
sys.stdout.write("\n")
self.context.logger.debug(checkResult)
raise Exception("Timeout." + "\n" +
ErrorCode.GAUSS_516["GAUSS_51602"])
if checkStatus != 0:
self.context.logger.debug(checkResult)
raise Exception(ErrorCode.GAUSS_516["GAUSS_51607"] % "cluster")
def create_ca_for_cm(self):
"""
Create CM CA file
"""
if self.get_upgrade_cm_strategy() != 1:
self.context.logger.debug("No need to create CA for CM.")
return
new_cluster_config_file = \
os.path.realpath(os.path.join(self.context.newClusterAppPath,
"bin", "cluster_static_config"))
self.context.logger.debug("Start create CA for CM.")
new_cluster_info = dbClusterInfo()
new_cluster_info.initFromStaticConfig(self.context.user,
new_cluster_config_file)
local_node = [node for node in new_cluster_info.dbNodes
if node.name == NetUtil.GetHostIpOrName()][0]
agent_component = CM_OLAP()
agent_component.instInfo = local_node.cmagents[0]
agent_component.logger = self.context.logger
agent_component.binPath = os.path.realpath(os.path.join(self.context.newClusterAppPath,
"bin"))
agent_component.create_cm_ca(self.context.sshTool)
self.context.logger.debug("Create CA for CM successfully.")
def reloadCmAgent(self, is_final=False):
"""
Run the 'kill -1' command to make the parameters of all cmagent instances take effect.
:return:
"""
if not DefaultValue.get_cm_server_num_from_static(self.context.oldClusterInfo) > 0 \
and not is_final:
self.context.logger.debug("No need to reload cm configuration.")
return
self.context.logger.debug("Start to reload cmagent")
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RELOAD_CMAGENT,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("reloading all cmagent process: %s" % cmd)
try:
hostList = copy.deepcopy(self.context.clusterNodes)
self.context.execCommandInSpecialNode(cmd, hostList)
# wait for the cluster to be normal
self.waitClusterNormalDegrade()
self.context.logger.debug("Successfully reloaded cmagent")
except Exception as er:
if self.context.action == const.ACTION_INPLACE_UPGRADE or not \
self.context.forceRollback:
raise Exception(str(er))
self.context.logger.debug("Failed to reload cm agent. Warning:{0}".format(str(er)))
def reload_cmserver(self, is_final=False):
"""
Run the 'kill -1' command to make the parameters of all cmserver instances take effect.
:return:
"""
if DefaultValue.get_cm_server_num_from_static(self.context.oldClusterInfo) == 0 \
and not is_final:
self.context.logger.debug("No need to reload cm server configuration.")
return
self.context.logger.debug("Start to reload cmserver")
cm_nodes = []
# Get all the nodes that contain the CMSERVER instance
for dbNode in self.context.clusterInfo.dbNodes:
if len(dbNode.cmservers) > 0:
cm_nodes.append(dbNode.name)
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RELOAD_CMSERVER,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("reloading all cmserver process: %s" % cmd)
try:
self.context.execCommandInSpecialNode(cmd, cm_nodes)
# wait for the cluster to be normal
self.waitClusterNormalDegrade()
self.context.logger.debug("Successfully reloaded cmserver")
except Exception as er:
if self.context.action == const.ACTION_INPLACE_UPGRADE or \
not self.context.forceRollback:
raise Exception(str(er))
self.context.logger.debug("Failed to reload cm server. Warning:{0}".format(str(er)))
def restoreClusterConfig(self, isRollBack=False):
"""
function: Restore the cluster config
input : isRollBack
output: NA
"""
# restore list:
# cluster_dynamic_config
# etc/gscgroup_xxx.cfg
# lib/postgresql/pg_plugin
# server.key.cipher
# server.key.rand
# datasource.key.cipher
# datasource.key.rand
# utilslib
# /share/sslsert/ca.key
# /share/sslsert/etcdca.crt
# Data Studio lib files
# gds files
# javaUDF
# postGIS
# hadoop_odbc_connector extension files
# libsimsearch etc files and lib files
if isRollBack:
self.context.logger.log("Restoring cluster configuration.")
else:
self.context.logger.log("Restoring cluster configuration.",
"addStep")
try:
if isRollBack:
self.rollbackHotpatch()
else:
# restore static configuration
cmd = "%s -t %s -U %s -V %d --upgrade_bak_path=%s " \
"--old_cluster_app_path=%s --new_cluster_app_path=%s " \
"-l %s" % (
OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_RESTORE_CONFIG,
self.context.user,
int(float(self.context.oldClusterNumber) * 1000),
self.context.upgradeBackupPath,
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.localLog)
self.context.logger.debug("Command for restoring "
"config files: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
if self.isLargeInplaceUpgrade:
# backup DS libs and gds file
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_INPLACE_BACKUP,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog)
                self.context.logger.debug(
                    "Command for backing up DS libs and gds file: %s" % cmd)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool,
self.context.isSingle,
self.context.userProfile)
# change the owner of application
cmd = "chown -R %s:%s '%s'" % \
(self.context.user, self.context.group,
self.context.newClusterAppPath)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool, self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(str(e))
if isRollBack:
self.context.logger.log("Successfully restored "
"cluster configuration.")
else:
self.context.logger.log("Successfully restored cluster "
"configuration.", "constant")
def checkStaticConfig(self):
"""
        function: Check if the static config file exists in the bin dir;
                  if it does not exist, restore it from the backup dir
input : NA
output: NA
"""
self.context.logger.log("Checking static configuration files.")
try:
# check static configuration path
staticConfigPath = "%s/bin" % self.context.oldClusterAppPath
# restore static configuration
cmd = "(if [ ! -f '%s/cluster_static_config' ];then cp " \
"%s/cluster_static_config %s/bin;fi)" % \
(staticConfigPath, self.context.upgradeBackupPath,
self.context.oldClusterAppPath)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(str(e))
self.context.logger.log("Successfully checked static "
"configuration files.")
def backupNodeVersion(self):
"""
        function: Back up the current application and configuration.
                  The function is only used by binary upgrade.
                  To ensure transaction atomicity,
                  it is used together with checkUpgrade().
input : NA
output: NA
"""
self.context.logger.log("Backing up current application "
"and configurations.", "addStep")
try:
# back up environment variables
cmd = "cp '%s' '%s'_gauss" % (self.context.userProfile,
self.context.userProfile)
self.context.logger.debug(
"Command for backing up environment file: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
# back up application and configuration
cmd = "%s -U %s -P %s -p -b -l %s" % \
(OMCommand.getLocalScript("Local_Backup"), self.context.user,
self.context.upgradeBackupPath, self.context.localLog)
self.context.logger.debug(
"Command for backing up application: %s" % cmd)
CmdExecutor.execCommandWithMode(
cmd,
self.context.sshTool, self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
# delete binary backup directory
delCmd = g_file.SHELL_CMD_DICT["deleteDir"] % \
(self.context.tmpDir, os.path.join(self.context.tmpDir,
'backupTemp_*'))
CmdExecutor.execCommandWithMode(delCmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
raise Exception(str(e))
self.context.logger.log("Successfully backed up current "
"application and configurations.", "constant")
def restoreNodeVersion(self):
"""
function: Restore the application and configuration
1. restore old version
2. restore environment variables
input : NA
output: NA
"""
self.context.logger.log("Restoring application and configurations.")
try:
# restore old version
cmd = "%s -U %s -P %s -p -b -l %s" % \
(OMCommand.getLocalScript("Local_Restore"),
self.context.user, self.context.upgradeBackupPath,
self.context.localLog)
self.context.logger.debug("Command for restoring "
"old version: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
# restore environment variables
cmd = "(if [ -f '%s'_gauss ];then mv '%s'_gauss '%s';fi)" % \
(self.context.userProfile, self.context.userProfile,
self.context.userProfile)
self.context.logger.debug("Command for restoring environment file:"
" %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(str(e))
self.context.logger.log("Successfully restored application and "
"configuration.")
def modifySocketDir(self):
"""
function: modify unix socket directory
input : NA
output: NA
"""
self.context.logger.log("Modifying the socket path.", "addStep")
try:
# modifying the socket path for all CN/DN instance
self.setGUCValue("unix_socket_directory",
DefaultValue.getTmpDirAppendMppdb(self.context.user), "set")
except Exception as e:
raise Exception(str(e))
self.context.logger.log("Successfully modified socket path.",
"constant")
###########################################################################
# Rollback upgrade functions
###########################################################################
def cleanBackupFiles(self):
"""
function: Clean backup files.
        input : NA
output : NA
"""
try:
# clean backup files
cmd = "(if [ -f '%s/OldDbClusterInfo.py' ]; then rm -f " \
"'%s/OldDbClusterInfo.py'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/OldDbClusterInfo.pyc' ]; then rm -f " \
"'%s/OldDbClusterInfo.pyc'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -d '%s/script' ]; then rm -rf '%s/script'; " \
"fi) &&" % (self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/oldclusterinfo' ]; then rm -f " \
"'%s/oldclusterinfo'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/oldclusterGUC' ]; then rm -f " \
"'%s/oldclusterGUC'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/cluster_static_config' ]; then rm -f " \
"'%s/cluster_static_config'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/c_functionfilelist.dat' ]; then rm -f " \
"'%s/c_functionfilelist.dat'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s'_gauss ]; then rm -f '%s'_gauss ; fi) &&" % \
(self.context.userProfile, self.context.userProfile)
cmd += "(if [ -f '%s/oldclusterinfo.json' ]; then rm -f " \
"'%s/oldclusterinfo.json'; fi) &&" % \
(self.context.tmpDir, self.context.tmpDir)
cmd += "(if [ -f '%s/%s' ]; then rm -f '%s/%s'; fi) &&" % \
(self.context.tmpDir, const.CLUSTER_CNSCONF_FILE,
self.context.tmpDir, const.CLUSTER_CNSCONF_FILE)
cmd += "(rm -f '%s'/gauss_crontab_file_*) &&" % self.context.tmpDir
cmd += "(if [ -d '%s' ]; then rm -rf '%s'; fi) &&" % \
(self.context.upgradeBackupPath,
self.context.upgradeBackupPath)
cmd += "(if [ -f '%s/pg_proc_mapping.txt' ]; then rm -f" \
" '%s/pg_proc_mapping.txt'; fi)" % \
(self.context.tmpDir, self.context.tmpDir)
self.context.logger.debug("Command for clean "
"backup files: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(str(e))
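    # Each clause in the command above follows the same guarded-delete shell
    # idiom: (if [ -f path ]; then rm -f path; fi). A local-Python sketch of
    # the same guard (path is a placeholder):
    #
    #   if os.path.isfile(path):
    #       os.remove(path)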
def cleanBinaryUpgradeBakFiles(self, isRollBack=False):
"""
function: Clean back up files, include cluster_static_config,
cluster_dynamic_config, binary.tar, parameter.tar.
input : isRollBack
output: NA
"""
        if isRollBack:
self.context.logger.debug("Cleaning backup files.")
else:
self.context.logger.debug("Cleaning backup files.", "addStep")
try:
# clean backup files
self.cleanBackupFiles()
except Exception as e:
raise Exception(str(e))
        if isRollBack:
self.context.logger.debug("Successfully cleaned backup files.")
else:
self.context.logger.debug("Successfully cleaned backup files.",
"constant")
###########################################################################
    # Health check functions
###########################################################################
def doHealthCheck(self, checkPosition):
"""
        function: Do health check; return a (status, output) tuple,
                  where status is 0 if healthy, else 1
        input : checkPosition
        output: (status, output)
                status 0 - successful
                status 1 - failed
        """
        #######################################################################
        # When doing binary-upgrade:
        # const.OPTION_PRECHECK  -> cluster Normal
        #                        -> database can connect
        # const.OPTION_POSTCHECK -> cluster Normal
        #                        -> package version Normal
        #                        -> database can connect
#######################################################################
self.context.logger.log("Start to do health check.", "addStep")
status = 0
output = ""
if checkPosition == const.OPTION_PRECHECK:
            if self.checkClusterStatus(checkPosition, True) != 0:
output += "\n Cluster status does not match condition."
if self.checkConnection() != 0:
output += "\n Database could not be connected."
elif checkPosition == const.OPTION_POSTCHECK:
if self.checkClusterStatus(checkPosition) != 0:
output += "\n Cluster status is Abnormal."
if not self.checkVersion(
self.context.newClusterVersion,
self.context.clusterInfo.getClusterNodeNames()):
output += "\n The gaussdb version is inconsistent."
if self.checkConnection() != 0:
output += "\n Database could not be connected."
else:
# Invalid check position
output += "\n Invalid check position."
if output != "":
status = 1
        # return the check status and the collected output
self.context.logger.log("Successfully checked cluster status.",
"constant")
return (status, output)
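    # Hypothetical caller sketch for doHealthCheck: the (status, output) pair
    # is unpacked and the collected messages are reported on failure.
    #
    #   (status, output) = self.doHealthCheck(const.OPTION_PRECHECK)
    #   if status != 0:
    #       raise Exception("Health check failed:%s" % output)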
def checkVersion(self, checkinfo, checknodes):
"""
        function: Check if the nodes have been upgraded; if the gaussdb bin
                  file version is the same on all hosts, return True,
                  else return False
        input : checkinfo, checknodes
        output: True  - successful
                False - failed
"""
self.context.logger.debug(
"Start to check gaussdb version consistency.")
if self.context.isSingle:
self.context.logger.debug("There is single cluster,"
" no need to check it.")
return True
try:
# checking gaussdb bin file version VxxxRxxxCxx or commitid
cmd = "source %s;%s -t %s -v %s -U %s -l %s" % \
(self.context.userProfile,
OMCommand.getLocalScript("Local_Check_Upgrade"),
const.ACTION_CHECK_VERSION,
checkinfo,
self.context.user,
self.context.localLog)
self.context.logger.debug("Command for checking gaussdb version "
"consistency: %s." % cmd)
(status, output) = \
self.context.sshTool.getSshStatusOutput(cmd, checknodes)
for node in status.keys():
failFlag = "Failed to check version information"
if status[node] != DefaultValue.SUCCESS or \
output.find(failFlag) >= 0:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52929"] +
"Error: \n%s" % str(output))
# gaussdb bin file version is same on all host, return 0
self.context.logger.debug("Successfully checked gaussdb"
" version consistency.")
return True
except Exception as e:
self.context.logger.debug(str(e))
return False
def _query_cluster_status(self):
"""
Query cluster status
"""
cmd = "source %s;gs_om -t query" % self.context.userProfile
(status, output) = subprocess.getstatusoutput(cmd)
if "Cascade Need repair" in output:
self.context.logger.debug("Cascade node disconnect , "
"check again after 5 seconds.\n{0}".format(output))
time.sleep(5)
(status, output) = subprocess.getstatusoutput(cmd)
self.context.logger.debug("Retry query cluster status finish. "
"Output:\n{0}".format(output))
return cmd, status, output
def checkClusterStatus(self, checkPosition=const.OPTION_PRECHECK,
doDetailCheck=False):
"""
function: Check cluster status, if NORMAL, return 0, else return 1
                  For grey upgrade, if we have switched to the new bin, we
                  will remove abnormal nodes and then return 0, else return 1
input : checkPosition, doDetailCheck
output: 0 successfully
1 failed
"""
self.context.logger.debug("Start to check cluster status.")
# build query cmd
        # determine success from the exit status and output of the query
cmd, status, output = self._query_cluster_status()
if status != 0:
self.context.logger.debug(
"Failed to execute command %s.\nStatus:%s\nOutput:%s" %
(cmd, status, output))
return 1
self.context.logger.debug(
"Successfully obtained cluster status information. "
"Cluster status information:\n%s" % output)
if output.find("Normal") < 0:
self.context.logger.debug("The cluster_state is Abnormal.")
if checkPosition == const.OPTION_POSTCHECK:
if output.find("Degraded") < 0:
self.context.logger.debug("The cluster_state is not "
"Degraded under postcheck.")
return 1
else:
return 1
# do more check if required
if doDetailCheck:
cluster_state_check = False
redistributing_check = False
for line in output.split('\n'):
if len(line.split(":")) != 2:
continue
(key, value) = line.split(":")
if key.strip() == "cluster_state" and \
value.strip() == "Normal":
cluster_state_check = True
elif key.strip() == "redistributing" and value.strip() == "No":
redistributing_check = True
if cluster_state_check and redistributing_check:
self.context.logger.debug("Cluster_state must be Normal, "
"redistributing must be No.")
return 0
else:
                self.context.logger.debug(
                    "Cluster status information does not meet the upgrade "
                    "condition constraints. When upgrading, cluster_state must"
                    " be Normal and redistributing must be No.")
return 1
# cluster is NORMAL, return 0
return 0
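    # A standalone sketch of the detail check above: gs_om prints
    # "key : value" lines, and only cluster_state and redistributing are
    # inspected (the sample output is hypothetical; partition() is used here
    # instead of split() purely for brevity).
    #
    #   sample = "cluster_state    : Normal\nredistributing   : No"
    #   checks = dict((k.strip(), v.strip())
    #                 for k, _, v in (l.partition(":") for l in sample.split("\n"))
    #                 if v)
    #   ok = (checks.get("cluster_state") == "Normal" and
    #         checks.get("redistributing") == "No")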
def waitClusterNormalDegrade(self, waitTimeOut=300):
"""
function: Check if cluster status is Normal for each main step of
online upgrade
        input : waitTimeOut, default is 300.
output : NA
"""
# get the end time
self.context.logger.log("Wait for the cluster status normal "
"or degrade.")
endTime = datetime.now() + timedelta(seconds=int(waitTimeOut))
while True:
cmd = "source %s;gs_om -t status --detail" % \
self.context.userProfile
(status, output) = subprocess.getstatusoutput(cmd)
if status == 0 and (output.find("Normal") >= 0 or
output.find("Degraded") >= 0):
                self.context.logger.debug(
                    "The cluster status is Normal or Degraded now.")
break
if datetime.now() >= endTime:
self.context.logger.debug("The cmd is %s " % cmd)
raise Exception("Timeout." + "\n" +
ErrorCode.GAUSS_516["GAUSS_51602"])
else:
                self.context.logger.debug(
                    "Cluster status has not reached Normal. Wait for another 3"
                    " seconds.\n%s" % output)
time.sleep(3) # sleep 3 seconds
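    # waitClusterNormalDegrade is an instance of the poll-until-deadline
    # pattern; a generic sketch (check() is a hypothetical callable):
    #
    #   end_time = datetime.now() + timedelta(seconds=wait_timeout)
    #   while not check():
    #       if datetime.now() >= end_time:
    #           raise Exception("Timeout." + "\n" +
    #                           ErrorCode.GAUSS_516["GAUSS_51602"])
    #       time.sleep(3)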
def checkConnection(self):
"""
        function: Check if the cluster accepts connections.
                  Under inplace upgrade, all DBs should be connectable;
                  under grey upgrade, make sure all CNs on nodes that are
                  not in the upgrade process, or extracted abnormal nodes,
                  can be connected.
                  If connections are accepted, return 0, else return 1.
                  1. find a DB instance
                  2. connect to it and execute a SQL command
input : NA
output: 0 successfully
1 failed
"""
self.context.logger.debug("Start to check database connection.")
for dbNode in self.context.clusterInfo.dbNodes:
            if len(dbNode.datanodes) == 0:
continue
for dnInst in dbNode.datanodes:
# connect this DB and exec sql cmd
sql = "SELECT 1;"
(status, output) = \
ClusterCommand.remoteSQLCommand(
sql, self.context.user, dnInst.hostname, dnInst.port,
False, DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
if status != 0 or not output.isdigit():
self.context.logger.debug(
"Failed to execute SQL on [%s]: %s. Error: \n%s" %
(dnInst.hostname, sql, str(output)))
return 1
# all DB accept connection, return 0
self.context.logger.debug("Successfully checked database connection.")
return 0
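    # Illustrative values for the probe above (hypothetical): a healthy
    # instance answers "SELECT 1;" with the bare digit "1", so
    # output.isdigit() doubles as a sanity check on the reply; an error text
    # would fail it.
    #
    #   status, output = 0, "1"
    #   healthy = (status == 0 and output.isdigit())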
def createBakPath(self):
"""
function: create bak path
input : NA
output : NA
"""
cmd = "(if [ ! -d '%s' ]; then mkdir -p '%s'; fi)" % \
(self.context.upgradeBackupPath, self.context.upgradeBackupPath)
cmd += " && (chmod %d -R %s)" % (DefaultValue.KEY_DIRECTORY_MODE,
self.context.upgradeBackupPath)
self.context.logger.debug("Command for creating directory: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
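    # For reference, the create-if-missing-then-chmod shell idiom above has a
    # local-Python equivalent (backup_path and the 0o700 mode are
    # placeholders; os.chmod, unlike `chmod -R`, is not recursive):
    #
    #   os.makedirs(backup_path, exist_ok=True)
    #   os.chmod(backup_path, 0o700)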
def recordDirFile(self):
"""
function: record dir file
input: NA
output: NA
"""
self.context.logger.debug("Create the file to record "
"old and new app directory.")
# write the old cluster number and new cluster number into backup dir
appDirRecord = os.path.join(self.context.upgradeBackupPath,
const.RECORD_UPGRADE_DIR)
FileUtil.createFile(appDirRecord, True, DefaultValue.KEY_FILE_MODE)
FileUtil.writeFile(appDirRecord, [self.context.oldClusterAppPath,
self.context.newClusterAppPath], 'w')
self.distributeFile(appDirRecord)
self.context.logger.debug("Successfully created the file to "
"record old and new app directory.")
def copyBakVersion(self):
"""
        Under commit, if we have already cleaned the old install path while a
        node is disabled, we cannot get the old version from it;
        under choseStrategy, we would then not pass the check.
        :return: NA
"""
versionFile = os.path.join(self.context.oldClusterAppPath,
"bin/upgrade_version")
bakVersionFile = os.path.join(self.context.upgradeBackupPath,
"old_upgrade_version")
cmd = "(if [ -f '%s' ]; then cp -f -p '%s' '%s';fi)" % \
(versionFile, versionFile, bakVersionFile)
cmd += " && (chmod %d %s)" % \
(DefaultValue.KEY_FILE_MODE, bakVersionFile)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
def cleanInstallPath(self, cleanNew=const.NEW):
"""
        function: after grey upgrade succeeds, clean the old or new install path
input : cleanNew
output: NA
"""
self.context.logger.debug("Cleaning %s install path." % cleanNew,
"addStep")
# clean old install path
if cleanNew == const.NEW:
installPath = self.context.newClusterAppPath
elif cleanNew == const.OLD:
installPath = self.context.oldClusterAppPath
else:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52937"])
cmd = "%s -t %s -U %s -R %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_INSTALL_PATH,
self.context.user,
installPath,
self.context.localLog)
if self.context.forceRollback:
cmd += " --force"
self.context.logger.debug("Command for clean %s install path: %s" %
(cleanNew, cmd))
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.log("Successfully cleaned %s install path." %
cleanNew, "constant")
def installNewBin(self):
"""
function: install new binary in a new directory
1. get env GAUSSLOG
2. get env PGHOST
3. install new bin file
4. sync old config to new bin path
5. update env
input: none
output: none
"""
try:
self.context.logger.log("Installing new binary.", "addStep")
# install new bin file
cmd = "%s -t 'install_cluster' -U %s:%s -R '%s' -P %s -c %s" \
" -l '%s' -X '%s' -T -u" % \
(OMCommand.getLocalScript("Local_Install"),
self.context.user,
self.context.group,
self.context.newClusterAppPath,
self.context.tmpDir,
self.context.clusterInfo.name,
self.context.localLog,
self.context.xmlFile)
self.context.logger.debug(
"Command for installing new binary: %s." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
self.context.logger.debug(
"Successfully installed new binary files.")
except Exception as e:
self.context.logger.debug("Failed to install new binary files.")
raise Exception(str(e))
def backupHotpatch(self):
"""
function: backup hotpatch config file patch.info in xxx/data/hotpatch
input : NA
output: NA
"""
self.context.logger.debug("Start to backup hotpatch.")
try:
cmd = "%s -t %s -U %s --upgrade_bak_path=%s " \
"--new_cluster_app_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_BACKUP_HOTPATCH,
self.context.user,
self.context.upgradeBackupPath,
self.context.newClusterAppPath,
self.context.localLog)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(" Failed to backup hotpatch config file." + str(e))
self.context.logger.log("Successfully backup hotpatch config file.")
def rollbackHotpatch(self):
"""
        function: rollback hotpatch config file patch.info in xxx/data/hotpatch
input : NA
output: NA
"""
self.context.logger.debug("Start to rollback hotpatch.")
try:
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -l %s -X '%s'" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_ROLLBACK_HOTPATCH,
self.context.user,
self.context.upgradeBackupPath,
self.context.localLog,
self.context.xmlFile)
if self.context.forceRollback:
cmd += " --force"
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
raise Exception(" Failed to rollback hotpatch config file."
+ str(e))
self.context.logger.log("Successfully rollback hotpatch config file.")
def backup_version_file(self):
"""
Backup the old version file.
"""
oldVersionFile = "%s/bin/%s" % \
(self.context.oldClusterAppPath,
DefaultValue.DEFAULT_DISABLED_FEATURE_FILE_NAME)
oldLicenseFile = "%s/bin/%s" % (self.context.oldClusterAppPath,
DefaultValue.DEFAULT_LICENSE_FILE_NAME)
cmd = "(if [ -d %s ] && [ -f %s ]; then cp -f %s %s; fi) && " % \
(self.context.upgradeBackupPath, oldVersionFile, oldVersionFile,
self.context.upgradeBackupPath)
cmd += "(if [ -d %s ] && [ -f %s ]; then cp -f %s %s; fi)" % \
(self.context.upgradeBackupPath, oldLicenseFile, oldLicenseFile,
self.context.upgradeBackupPath)
self.context.logger.debug(
"Execute command to backup the product version file and the "
"license control file: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
def getTimeFormat(self, seconds):
"""
        format seconds to h-m-s
        input : int
        output: str, or 0 when seconds is 0
"""
seconds = int(seconds)
if seconds == 0:
return 0
        # Convert the seconds to standard time using integer division
        hour = seconds // 3600
        minute = (seconds - hour * 3600) // 60
s = seconds % 60
resultstr = ""
if hour != 0:
resultstr += "%dh" % hour
if minute != 0:
resultstr += "%dm" % minute
return "%s%ds" % (resultstr, s)
def CopyCerts(self):
"""
function: copy certs
input : NA
output : NA
"""
self.context.logger.log("copy certs from %s to %s." % (
self.context.oldClusterAppPath, self.context.newClusterAppPath))
try:
cmd = "%s -t %s -U %s --old_cluster_app_path=%s " \
"--new_cluster_app_path=%s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_COPY_CERTS,
self.context.user,
self.context.oldClusterAppPath,
self.context.newClusterAppPath,
self.context.localLog)
self.context.logger.debug("Command for copy certs: '%s'." % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
self.context.logger.log("Failed to copy certs from %s to %s." %
(self.context.oldClusterAppPath,
self.context.newClusterAppPath))
raise Exception(str(e))
time.sleep(10)
self.context.logger.log("Successfully copy certs from %s to %s." %
(self.context.oldClusterAppPath,
self.context.newClusterAppPath),
"constant")
def clean_cm_instance(self):
"""
Clean CM instance directory
"""
self.context.logger.log("Start roll back CM instance.")
cm_strategy = self.get_upgrade_cm_strategy()
if cm_strategy == 1:
self.context.logger.debug("Rollback need clean cm directory")
cmd = "%s -t %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_CM,
self.context.localLog)
self.context.logger.debug("Roll back CM install command: {0}".format(cmd))
self.context.sshTool.executeCommand(cmd, hostList=self.context.nodeNames)
self.context.logger.debug("Clean cm directory successfully.")
else:
self.context.logger.debug("No need clean CM instance directory.")
def switchBin(self, switchTo=const.OLD):
"""
function: switch bin
input : switchTo
output : NA
"""
self.context.logger.log("Switch symbolic link to %s binary directory."
% switchTo, "addStep")
try:
cmd = "%s -t %s -U %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_SWITCH_BIN,
self.context.user,
self.context.localLog)
if switchTo == const.NEW:
cmd += " -R '%s'" % self.context.newClusterAppPath
else:
cmd += " -R '%s'" % self.context.oldClusterAppPath
if self.context.forceRollback:
cmd += " --force"
self.context.logger.debug("Command for switching binary directory:"
" '%s'." % cmd)
if self.context.is_grey_upgrade:
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile,
self.context.nodeNames)
else:
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
self.context.logger.log("Failed to switch symbolic link to %s "
"binary directory." % switchTo)
raise Exception(str(e))
time.sleep(10)
self.context.logger.log("Successfully switch symbolic link to %s "
"binary directory." % switchTo, "constant")
def clearOtherToolPackage(self, action=""):
"""
function: clear other tool package
input : action
output : NA
"""
        if action == const.ACTION_AUTO_ROLLBACK:
            self.context.logger.debug("Cleaning other tool package files.")
        else:
            self.context.logger.debug(
                "Cleaning other tool package files.", "addStep")
try:
commonPart = PackageInfo.get_package_back_name().rsplit("_", 1)[0]
gphomePath = \
os.listdir(ClusterDir.getClusterToolPath(self.context.user))
commitId = self.newCommitId
if action == const.ACTION_AUTO_ROLLBACK:
commitId = self.oldCommitId
for filePath in gphomePath:
if commonPart in filePath and commitId not in filePath:
toDeleteFilePath = os.path.join(
ClusterDir.getClusterToolPath(self.context.user),
filePath)
deleteCmd = "(if [ -f '%s' ]; then rm -rf '%s'; fi) " % \
(toDeleteFilePath, toDeleteFilePath)
CmdExecutor.execCommandWithMode(
deleteCmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
except Exception as e:
self.context.logger.log(
"Failed to clean other tool package files.")
raise Exception(str(e))
        if action == const.ACTION_AUTO_ROLLBACK:
            self.context.logger.debug(
                "Successfully cleaned other tool package files.")
        else:
            self.context.logger.debug(
                "Successfully cleaned other tool package files.", "constant")
def createGphomePack(self):
"""
function: create Gphome pack
input : NA
output : NA
"""
try:
cmd = "(if [ ! -d '%s' ]; then mkdir -p '%s'; fi)" % \
(ClusterDir.getClusterToolPath(self.context.user),
ClusterDir.getClusterToolPath(self.context.user))
cmd += " && (chmod %d -R %s)" % \
(DefaultValue.KEY_DIRECTORY_MODE,
ClusterDir.getClusterToolPath(self.context.user))
self.context.logger.debug(
"Command for creating directory: %s" % cmd)
CmdExecutor.execCommandWithMode(cmd,
self.context.sshTool,
self.context.isSingle,
self.context.mpprcFile)
oldPackName = "%s-Package-bak_%s.tar.gz" % \
(VersionInfo.PRODUCT_NAME_PACKAGE, self.oldCommitId)
packFilePath = "%s/%s" % (ClusterDir.getClusterToolPath(
self.context.user), oldPackName)
copyNode = ""
cmd = "if [ -f '%s' ]; then echo 'GetFile'; " \
"else echo 'NoThisFile'; fi" % packFilePath
self.context.logger.debug("Command for checking file: %s" % cmd)
(status, output) = self.context.sshTool.getSshStatusOutput(
cmd, self.context.clusterNodes, self.context.mpprcFile)
outputMap = self.context.sshTool.parseSshOutput(
self.context.clusterNodes)
self.context.logger.debug("Output: %s" % output)
for node in self.context.clusterNodes:
if status[node] == DefaultValue.SUCCESS:
if 'GetFile' in outputMap[node]:
copyNode = node
break
if copyNode:
self.context.logger.debug("Copy the file %s from node %s." %
(packFilePath, copyNode))
for node in self.context.clusterNodes:
if status[node] == DefaultValue.SUCCESS:
if 'NoThisFile' in outputMap[node]:
cmd = LocalRemoteCmd.getRemoteCopyCmd(
packFilePath,
ClusterDir.getClusterToolPath(
self.context.user),
str(copyNode), False, 'directory', node)
self.context.logger.debug(
"Command for copying directory: %s" % cmd)
CmdExecutor.execCommandLocally(cmd)
else:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50210"] %
packFilePath)
except Exception as e:
raise Exception(str(e))
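    # Sketch of the locate-then-fan-out pattern used above with hypothetical
    # maps: pick any node that already holds the package, then copy it to
    # every node that reported it missing (scp_from is a made-up helper).
    #
    #   have = [n for n in nodes if 'GetFile' in output_map[n]]
    #   need = [n for n in nodes if 'NoThisFile' in output_map[n]]
    #   if have:
    #       for n in need:
    #           scp_from(have[0], pack_file_path, n)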