!323 Support the streaming disaster recovery feature

Merge pull request !323 from 陈照亮/master
opengauss-bot 2022-08-11 02:38:56 +00:00 committed by Gitee
commit 0220fe5901
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
34 changed files with 5765 additions and 939 deletions

View File

@ -1,855 +0,0 @@
From 812edfeeb0e47f14dff2077ff5f8a69b4773eaef Mon Sep 17 00:00:00 2001
From: twx980514 <tanjinbo3@huawei.com>
Date: Tue, 6 Jul 2021 17:43:08 +0800
Subject: [PATCH] del sctp check
---
script/gs_check | 4 +-
script/gspylib/common/GaussLog.py | 2 +-
script/gspylib/etc/conf/check_list.conf | 11 ---
script/gspylib/etc/conf/check_list_dws.conf | 11 ---
.../inspection/config/check_list_V1R6C10.conf | 10 --
.../inspection/config/check_list_V1R7C00.conf | 11 ---
script/gspylib/inspection/config/items.xml | 38 +-------
script/gspylib/inspection/config/scene_inspect.xml | 2 -
script/gspylib/inspection/config/scene_install.xml | 2 -
script/gspylib/inspection/config/scene_upgrade.xml | 1 -
.../inspection/items/network/CheckNoCheckSum.py | 71 --------------
.../inspection/items/network/CheckUsedPort.py | 17 +---
.../inspection/items/os/CheckSctpService.py | 108 ---------------------
.../gspylib/inspection/items/os/CheckSysParams.py | 32 ++----
script/impl/preinstall/PreinstallImpl.py | 38 --------
script/local/LocalCheck.py | 21 +---
script/local/PreInstallUtility.py | 87 +----------------
17 files changed, 16 insertions(+), 450 deletions(-)
delete mode 100644 script/gspylib/inspection/items/network/CheckNoCheckSum.py
delete mode 100644 script/gspylib/inspection/items/os/CheckSctpService.py
diff --git a/script/gs_check b/script/gs_check
index 05d5625..0f29b31 100644
--- a/script/gs_check
+++ b/script/gs_check
@@ -93,13 +93,13 @@ DEFAULT_TIMEOUT = 1500
# because single clusters don't need to perform consistency checks and
# internal communication class checks
SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer",
- "CheckNTPD", "CheckNoCheckSum", "CheckCpuCount",
+ "CheckNTPD", "CheckCpuCount",
"CheckMemInfo", "CheckDiskConfig",
"CheckUpVer", "CheckPgxcgroup", "CheckPing",
"CheckNetWorkDrop", "CheckNetSpeed"]
SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD",
- "CheckSshdService", "CheckNoCheckSum", "CheckEtcHosts",
+ "CheckSshdService", "CheckEtcHosts",
"CheckCpuCount", "CheckHyperThread", "CheckMemInfo",
"CheckKernelVer", "CheckEncoding", "CheckBootItems",
"CheckDropCache", "CheckFilehandle", "CheckKeyProAdj",
diff --git a/script/gspylib/common/GaussLog.py b/script/gspylib/common/GaussLog.py
index bdfecf1..31957d2 100644
--- a/script/gspylib/common/GaussLog.py
+++ b/script/gspylib/common/GaussLog.py
@@ -55,7 +55,7 @@ PREINSTALL_ACTION = ["prepare_path", "check_os_Version", "create_os_user",
"check_os_user", "create_cluster_paths",
"set_os_parameter", "set_finish_flag", "set_warning_env",
"prepare_user_cron_service", "prepare_user_sshd_service",
- "set_library", "set_sctp", "set_virtualIp",
+ "set_library", "set_virtualIp",
"clean_virtualIp", "check_hostname_mapping",
"init_gausslog", "check_envfile", "check_dir_owner",
"set_user_env", "set_tool_env", "gs_preinstall"]
diff --git a/script/gspylib/etc/conf/check_list.conf b/script/gspylib/etc/conf/check_list.conf
index deba792..77b7c60 100644
--- a/script/gspylib/etc/conf/check_list.conf
+++ b/script/gspylib/etc/conf/check_list.conf
@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_retries2 = 12
-net.sctp.addip_enable = 0
net.core.wmem_max = 21299200
net.core.rmem_max = 21299200
net.core.wmem_default = 21299200
net.core.rmem_default = 21299200
-net.sctp.sctp_mem = 94500000 915000000 927000000
-net.sctp.sctp_rmem = 8192 250000 16777216
-net.sctp.sctp_wmem = 8192 250000 16777216
kernel.sem = 250 6400000 1000 25600
net.ipv4.tcp_rmem = 8192 250000 16777216
net.ipv4.tcp_wmem = 8192 250000 16777216
@@ -33,8 +29,6 @@ kernel.shmmax = 18446744073709551615
# if the parameter value is not equal to the OS's value, print a warning, not an error
[SUGGEST:/etc/sysctl.conf]
-net.sctp.sndbuf_policy = 0
-net.sctp.rcvbuf_policy = 0
net.ipv4.ip_local_port_range = 26000 65535
net.ipv4.tcp_fin_timeout = 60
net.ipv4.tcp_sack = 1
@@ -42,13 +36,8 @@ net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_retries1 = 5
net.ipv4.tcp_syn_retries = 5
net.ipv4.tcp_synack_retries = 5
-net.sctp.path_max_retrans = 10
-net.sctp.max_init_retransmits = 10
-net.sctp.association_max_retrans = 10
-net.sctp.hb_interval = 30000
vm.extfrag_threshold = 500
vm.overcommit_ratio = 90
-SctpChecksumErrors = 0
# open file number, please set it to '1000000'
[/etc/security/limits.conf]
diff --git a/script/gspylib/etc/conf/check_list_dws.conf b/script/gspylib/etc/conf/check_list_dws.conf
index a7f7b7c..a96f7e9 100644
--- a/script/gspylib/etc/conf/check_list_dws.conf
+++ b/script/gspylib/etc/conf/check_list_dws.conf
@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_retries2 = 12
-net.sctp.addip_enable = 0
net.core.wmem_max = 21299200
net.core.rmem_max = 21299200
net.core.wmem_default = 21299200
net.core.rmem_default = 21299200
-net.sctp.sctp_mem = 94500000 915000000 927000000
-net.sctp.sctp_rmem = 8192 250000 16777216
-net.sctp.sctp_wmem = 8192 250000 16777216
kernel.sem = 250 6400000 1000 25600
net.ipv4.tcp_rmem = 8192 250000 16777216
net.ipv4.tcp_wmem = 8192 250000 16777216
@@ -28,8 +24,6 @@ net.ipv4.tcp_max_syn_backlog = 65535
net.core.somaxconn = 65535
net.ipv4.tcp_syncookies = 1
vm.overcommit_memory = 0
-net.sctp.sndbuf_policy = 0
-net.sctp.rcvbuf_policy = 0
net.ipv4.tcp_fin_timeout = 60
kernel.shmall = 1152921504606846720
kernel.shmmax = 18446744073709551615
@@ -38,16 +32,11 @@ net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_retries1 = 10
net.ipv4.tcp_syn_retries = 10
net.ipv4.tcp_synack_retries = 10
-net.sctp.path_max_retrans = 10
-net.sctp.max_init_retransmits = 10
-net.sctp.association_max_retrans = 10
-net.sctp.hb_interval = 30000
vm.extfrag_threshold = 500
vm.overcommit_ratio = 90
# if the parameter value is not equal to the OS's value, print a warning, not an error
[SUGGEST:/etc/sysctl.conf]
-SctpChecksumErrors = 0
# open file number, please set it to '1000000'
[/etc/security/limits.conf]
diff --git a/script/gspylib/inspection/config/check_list_V1R6C10.conf b/script/gspylib/inspection/config/check_list_V1R6C10.conf
index 75a2203..16c3fd2 100644
--- a/script/gspylib/inspection/config/check_list_V1R6C10.conf
+++ b/script/gspylib/inspection/config/check_list_V1R6C10.conf
@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_retries2 = 80
-net.sctp.addip_enable = 0
net.core.wmem_max = 21299200
net.core.rmem_max = 21299200
net.core.wmem_default = 21299200
net.core.rmem_default = 21299200
-net.sctp.sctp_mem = 94500000 915000000 927000000
-net.sctp.sctp_rmem = 8192 250000 16777216
-net.sctp.sctp_wmem = 8192 250000 16777216
kernel.sem = 250 6400000 1000 25600
net.ipv4.tcp_rmem = 8192 250000 16777216
net.ipv4.tcp_wmem = 8192 250000 16777216
@@ -30,8 +26,6 @@ net.ipv4.tcp_syncookies = 1
vm.overcommit_memory = 0
vm.panic_on_oom = 0;
vm.oom_kill_allocating_task = 0;
-net.sctp.sndbuf_policy = 0
-net.sctp.rcvbuf_policy = 0
# if the parameter value is not equal to the OS's value, print a warning, not an error
[SUGGEST:/etc/sysctl.conf]
@@ -41,10 +35,6 @@ net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_retries1 = 5
net.ipv4.tcp_syn_retries = 5
net.ipv4.tcp_synack_retries = 5
-net.sctp.path_max_retrans = 10
-net.sctp.max_init_retransmits = 10
-net.sctp.association_max_retrans = 10
-net.sctp.hb_interval = 30000
# open file number, please set it to '1000000'
[/etc/security/limits.conf]
diff --git a/script/gspylib/inspection/config/check_list_V1R7C00.conf b/script/gspylib/inspection/config/check_list_V1R7C00.conf
index 41c9334..4c150b6 100644
--- a/script/gspylib/inspection/config/check_list_V1R7C00.conf
+++ b/script/gspylib/inspection/config/check_list_V1R7C00.conf
@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_retries2 = 80
-net.sctp.addip_enable = 0
net.core.wmem_max = 21299200
net.core.rmem_max = 21299200
net.core.wmem_default = 21299200
net.core.rmem_default = 21299200
-net.sctp.sctp_mem = 94500000 915000000 927000000
-net.sctp.sctp_rmem = 8192 250000 16777216
-net.sctp.sctp_wmem = 8192 250000 16777216
kernel.sem = 250 6400000 1000 25600
net.ipv4.tcp_rmem = 8192 250000 16777216
net.ipv4.tcp_wmem = 8192 250000 16777216
@@ -30,8 +26,6 @@ net.ipv4.tcp_syncookies = 1
vm.overcommit_memory = 0
vm.panic_on_oom = 0
vm.oom_kill_allocating_task = 0
-net.sctp.sndbuf_policy = 0
-net.sctp.rcvbuf_policy = 0
kernel.shmall = 1152921504606846720
kernel.shmmax = 18446744073709551615
@@ -43,13 +37,8 @@ net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_retries1 = 5
net.ipv4.tcp_syn_retries = 5
net.ipv4.tcp_synack_retries = 5
-net.sctp.path_max_retrans = 10
-net.sctp.max_init_retransmits = 10
-net.sctp.association_max_retrans = 10
-net.sctp.hb_interval = 30000
vm.extfrag_threshold = 500
vm.overcommit_ratio = 90
-SctpChecksumErrors = 0
# open file number, please set it to '1000000'
[/etc/security/limits.conf]
diff --git a/script/gspylib/inspection/config/items.xml b/script/gspylib/inspection/config/items.xml
index 1dbac79..bb4143c 100644
--- a/script/gspylib/inspection/config/items.xml
+++ b/script/gspylib/inspection/config/items.xml
@@ -334,24 +334,6 @@
<analysis>default</analysis>
</checkitem>
- <checkitem id="10026" name="CheckNoCheckSum">
- <title>
- <zh>检查nochecksum值是否为预期值且一致(默认为N,RedHat6.4/6.5且bond是为Y)</zh>
- <en>Check the nochecksum</en>
- </title>
- <threshold/>
- <suggestion>
- <zh>修改nochecksum值为一致的预期值</zh>
- </suggestion>
- <standard>
- <zh>检查nochecksum值,若符合预期且一致则检查项通过,否则检查项不通过</zh>
- </standard>
- <category>network</category>
- <permission>root</permission>
- <scope>all</scope>
- <analysis>consistent</analysis>
- </checkitem>
-
<checkitem id="10027" name="CheckOmmUserExist">
<title>
<zh>检查omm用户是否已删除</zh>
@@ -456,24 +438,6 @@
<analysis>consistent</analysis>
</checkitem>
- <checkitem id="10032" name="CheckSctpService">
- <title>
- <zh>检查sctp服务</zh>
- <en>Check sctp service</en>
- </title>
- <threshold/>
- <suggestion>
- <zh>安装及加载sctp服务</zh>
- </suggestion>
- <standard>
- <zh>stcp服务开启且写在开机自启动文件中则检查项通过,否则检查项不通过</zh>
- </standard>
- <category>os</category>
- <permission>root</permission>
- <scope>all</scope>
- <analysis>default</analysis>
- </checkitem>
-
<checkitem id="10033" name="CheckHyperThread">
<title>
<zh>检查超线程是否打开</zh>
@@ -1841,7 +1805,7 @@
<zh>增大net.ipv4.ip_local_port_range或降低并发</zh>
</suggestion>
<standard>
- <zh>检查net.ipv4.ip_local_port_range,范围大于等于OS默认值通过;检查TCP协议随机端口数,小于总随机端口数的80%通过;检查SCTP协议随机端口数,小于总随机端口数的80%通过</zh>
+ <zh>检查net.ipv4.ip_local_port_range,范围大于等于OS默认值通过;检查TCP协议随机端口数,小于总随机端口数的80%通过</zh>
</standard>
<category>network</category>
<permission>user</permission>
diff --git a/script/gspylib/inspection/config/scene_inspect.xml b/script/gspylib/inspection/config/scene_inspect.xml
index 463e4b7..3ba6da3 100644
--- a/script/gspylib/inspection/config/scene_inspect.xml
+++ b/script/gspylib/inspection/config/scene_inspect.xml
@@ -40,12 +40,10 @@
<item name="CheckSshdConfig"/>
<item name="CheckCrondService"/>
<item name="CheckStack"/>
- <item name="CheckNoCheckSum"/>
<item name="CheckSysPortRange"/>
<item name="CheckMemInfo"/>
<item name="CheckHyperThread"/>
<item name="CheckTableSpace"/>
- <item name="CheckSctpService"/>
<item name="CheckSysadminUser"/>
<item name="CheckGUCConsistent"/>
<item name="CheckMaxProcMemory"/>
diff --git a/script/gspylib/inspection/config/scene_install.xml b/script/gspylib/inspection/config/scene_install.xml
index a189193..42b9547 100644
--- a/script/gspylib/inspection/config/scene_install.xml
+++ b/script/gspylib/inspection/config/scene_install.xml
@@ -12,13 +12,11 @@
<item name="CheckStack"/>
<item name="CheckCrondService"/>
<item name="CheckSshdService"/>
- <item name="CheckSctpService"/>
<item name="CheckSysParams">
<threshold>
version=V1R7C00
</threshold>
</item>
- <item name="CheckNoCheckSum"/>
<item name="CheckDiskFormat"/>
<item name="CheckEtcHosts"/>
<item name="CheckHyperThread"/>
diff --git a/script/gspylib/inspection/config/scene_upgrade.xml b/script/gspylib/inspection/config/scene_upgrade.xml
index 426785a..7356a21 100644
--- a/script/gspylib/inspection/config/scene_upgrade.xml
+++ b/script/gspylib/inspection/config/scene_upgrade.xml
@@ -23,7 +23,6 @@
version=V1R7C00
</threshold>
</item>
- <item name="CheckNoCheckSum"/>
<item name="CheckGUCValue"/>
<item name="CheckStack"/>
<item name="CheckDiskFormat"/>
diff --git a/script/gspylib/inspection/items/network/CheckNoCheckSum.py b/script/gspylib/inspection/items/network/CheckNoCheckSum.py
deleted file mode 100644
index 64d0e52..0000000
--- a/script/gspylib/inspection/items/network/CheckNoCheckSum.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# -*- coding:utf-8 -*-
-# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
-#
-# openGauss is licensed under Mulan PSL v2.
-# You can use this software according to the terms
-# and conditions of the Mulan PSL v2.
-# You may obtain a copy of Mulan PSL v2 at:
-#
-# http://license.coscl.org.cn/MulanPSL2
-#
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
-# WITHOUT WARRANTIES OF ANY KIND,
-# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-# See the Mulan PSL v2 for more details.
-# ----------------------------------------------------------------------------
-import os
-import platform
-from gspylib.inspection.common import SharedFuncs
-from gspylib.inspection.common.CheckItem import BaseItem
-from gspylib.inspection.common.CheckResult import ResultStatus
-from gspylib.os.gsfile import g_file
-from gspylib.os.gsnetwork import g_network
-from gspylib.os.gsfile import g_Platform
-from gspylib.common.ErrorCode import ErrorCode
-
-
-class CheckNoCheckSum(BaseItem):
- def __init__(self):
- super(CheckNoCheckSum, self).__init__(self.__class__.__name__)
-
- def getOSversion(self):
- distname, version, idnum = g_Platform.dist()
- return distname, version
-
- def doCheck(self):
- if (not os.path.isfile("/sys/module/sctp/parameters/no_checksums")):
- self.result.rst = ResultStatus.OK
- self.result.val = "The SCTP service is not used and the" \
- " check item is skipped"
- return
- expect = "N"
- if (self.cluster):
- LocalNodeInfo = self.cluster.getDbNodeByName(self.host)
- serviceIP = LocalNodeInfo.backIps[0]
- else:
- serviceIP = SharedFuncs.getIpByHostName(self.host)
- for network in g_network.getAllNetworkInfo():
- if (network.ipAddress == serviceIP):
- networkCardNum = network.NICNum
- networkBond = network.networkBondModeInfo
- break
- if (not networkCardNum or not networkBond):
- raise Exception(ErrorCode.GAUSS_506["GAUSS_50619"])
- (distname, version) = self.getOSversion()
- if ((distname in ("redhat", "centos")) and
- (version in ("6.4", "6.5")) and
- networkBond != "BondMode Null"):
- expect = "Y"
-
- output = \
- g_file.readFile('/sys/module/sctp/parameters/no_checksums')[0]
- if (output.strip() == expect):
- self.result.rst = ResultStatus.OK
- self.result.val = "Nochecksum value is %s,Check items pass." \
- % output.strip()
- else:
- self.result.rst = ResultStatus.NG
- self.result.val = "Nochecksum value(%s) is not %s," \
- "Check items are not passed." \
- % (output.strip(), expect)
diff --git a/script/gspylib/inspection/items/network/CheckUsedPort.py b/script/gspylib/inspection/items/network/CheckUsedPort.py
index 8a635ed..9718d96 100644
--- a/script/gspylib/inspection/items/network/CheckUsedPort.py
+++ b/script/gspylib/inspection/items/network/CheckUsedPort.py
@@ -46,17 +46,9 @@ class CheckUsedPort(BaseItem):
return int(tcpUsed)
- def getSctpUsedPort(self):
- cmd = "cat /proc/net/sctp/assocs|" \
- "awk '{print $12}'|sort|uniq -c |wc -l"
- sctpUsed = SharedFuncs.runShellCmd(cmd)
-
- return int(sctpUsed)
-
def doCheck(self):
portRange = self.getPortRange()
tcpUsed = self.getTcpUsedPort()
- sctpUsed = self.getSctpUsedPort()
defaultPortRange = 60000 - 32768
if (portRange < defaultPortRange):
self.result.rst = ResultStatus.WARNING
@@ -70,14 +62,7 @@ class CheckUsedPort(BaseItem):
" not passed." % tcpUsed
return
- if (sctpUsed > portRange * 0.8):
- self.result.rst = ResultStatus.WARNING
- self.result.val = "sctp port used is %s," \
- "Check items are not passed." % sctpUsed
- return
-
self.result.rst = ResultStatus.OK
self.result.val = "port range is %s,tcp port used is %s," \
- "sctp port used is %d,Check items pass." \
- % (portRange, tcpUsed, sctpUsed)
+ "Check items pass." % (portRange, tcpUsed)
return
diff --git a/script/gspylib/inspection/items/os/CheckSctpService.py b/script/gspylib/inspection/items/os/CheckSctpService.py
deleted file mode 100644
index 8e00810..0000000
--- a/script/gspylib/inspection/items/os/CheckSctpService.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# -*- coding:utf-8 -*-
-# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
-#
-# openGauss is licensed under Mulan PSL v2.
-# You can use this software according to the terms
-# and conditions of the Mulan PSL v2.
-# You may obtain a copy of Mulan PSL v2 at:
-#
-# http://license.coscl.org.cn/MulanPSL2
-#
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
-# WITHOUT WARRANTIES OF ANY KIND,
-# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-# See the Mulan PSL v2 for more details.
-# ----------------------------------------------------------------------------
-import subprocess
-import platform
-from gspylib.inspection.common.CheckItem import BaseItem
-from gspylib.inspection.common.CheckResult import ResultStatus
-from gspylib.common.Common import DefaultValue
-from gspylib.os.gsfile import g_Platform
-
-
-class CheckSctpService(BaseItem):
- def __init__(self):
- super(CheckSctpService, self).__init__(self.__class__.__name__)
-
- def doCheck(self):
-
- parRes = ""
- flag = "Normal"
- cmd = "ls -l /lib/modules/`uname -r`/kernel/net/sctp/sctp.ko*"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0 or output == "" or output.find(
- "No such file or directory") > 0):
- if DefaultValue.checkDockerEnv():
- return
- flag = "Error"
- parRes += "There is no sctp service."
- else:
- cmd = "modprobe sctp;"
- cmd += "lsmod |grep sctp"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (output == ""):
- flag = "Error"
- parRes += "sctp service is not loaded."
-
- cmd = "cat %s | grep '^insmod.*sctp.ko'" % DefaultValue.getOSInitFile()
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0 or output == ""):
- if (flag == "Normal"):
- flag = "Warning"
- parRes += "Sctp service is not set to boot from power on."
-
- self.result.val = parRes
- self.result.raw = output
- if (flag == "Error"):
- self.result.rst = ResultStatus.NG
- elif (flag == "Warning"):
- self.result.rst = ResultStatus.WARNING
- else:
- self.result.rst = ResultStatus.OK
- self.result.val = "Sctp service is Normal."
-
- def doSet(self):
- self.result.val = ""
- parRes = ""
- sctpFile = ""
- initFileSuse = "/etc/init.d/boot.local"
- initFileRedhat = "/etc/rc.d/rc.local"
- cmd = "ls -l /lib/modules/`uname -r`/kernel/net/sctp/sctp.ko*"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0 or output == "" or output.find(
- "No such file or directory") > 0):
- parRes = "There is no sctp service.\n"
- else:
- sctpFile = output.split()[-1]
- cmd = "modprobe sctp;"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0):
- cmd = "insmod %s >/dev/null 2>&1;lsmod |grep sctp" % sctpFile
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0 or output == "":
- parRes = "Failed to load sctp service.\n"
- distname, version, idnum = g_Platform.dist()
- if (distname in ["redhat", "centos", "euleros", "openEuler"]):
- cmd = "cat %s | grep sctp" % initFileRedhat
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0 or output == ""):
- cmd = "echo 'modprobe sctp' >> /etc/rc.d/rc.local;"
- cmd += "echo" \
- " 'insmod %s >/dev/null 2>&1' >> /etc/rc.d/rc.local " \
- % sctpFile
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0):
- parRes += "Failed to add sctp service to boot.\n"
- else:
- cmd = "cat %s | grep stcp" % initFileSuse
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0 or output == ""):
- cmd = "echo 'modprobe sctp' >> /etc/init.d/boot.local;"
- cmd += "echo '%s >/dev/null 2>&1' >> /etc/init.d/boot.local " \
- % sctpFile
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0):
- parRes += "Failed to add sctp service to boot."
- self.result.val = parRes
diff --git a/script/gspylib/inspection/items/os/CheckSysParams.py b/script/gspylib/inspection/items/os/CheckSysParams.py
index 947ecc6..c15a627 100644
--- a/script/gspylib/inspection/items/os/CheckSysParams.py
+++ b/script/gspylib/inspection/items/os/CheckSysParams.py
@@ -122,21 +122,13 @@ class CheckSysParams(BaseItem):
for key in kernelParameter:
if (patchlevel == "1" and key == "vm.extfrag_threshold"):
continue
- if (key == "sctpchecksumerrors"):
- snmpFile = "/proc/net/sctp/snmp"
- if (os.path.isfile(snmpFile)):
- output = \
- g_file.readFile(snmpFile, 'SctpChecksumErrors')[
- 0].split()[1].strip()
- else:
- continue
- else:
- sysFile = "/proc/sys/%s" % key.replace('.', '/')
- # High version of linux no longer supports tcp_tw_recycle
- if (not os.path.exists(
- sysFile) and key == "net.ipv4.tcp_tw_recycle"):
- continue
- output = g_file.readFile(sysFile)[0].strip()
+
+ sysFile = "/proc/sys/%s" % key.replace('.', '/')
+ # High version of linux no longer supports tcp_tw_recycle
+ if (not os.path.exists(
+ sysFile) and key == "net.ipv4.tcp_tw_recycle"):
+ continue
+ output = g_file.readFile(sysFile)[0].strip()
if (len(output.split()) > 1):
output = ' '.join(output.split())
@@ -184,16 +176,6 @@ class CheckSysParams(BaseItem):
checkResultList = checkResult.split('\'')
setParameterList[checkResultList[1]] = checkResultList[5]
self.result.val = ""
- # The parameter sctpchecksumerrors set method is independent
- if ("sctpchecksumerrors" in setParameterList):
- cmd = "echo 1 > /sys/module/sctp/parameters/no_checksums"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0):
- self.result.val += " " \
- " Failed to enforce sysctl kernel " \
- "variable 'sctpchecksumerrors'. " \
- "Error: %s" % output
- setParameterList.pop("sctpchecksumerrors")
if (len(setParameterList) != 0):
for key in setParameterList:
diff --git a/script/impl/preinstall/PreinstallImpl.py b/script/impl/preinstall/PreinstallImpl.py
index a35e87a..908423f 100644
--- a/script/impl/preinstall/PreinstallImpl.py
+++ b/script/impl/preinstall/PreinstallImpl.py
@@ -54,8 +54,6 @@ ACTION_PREPARE_USER_CRON_SERVICE = "prepare_user_cron_service"
ACTION_PREPARE_USER_SSHD_SERVICE = "prepare_user_sshd_service"
# set the dynamic link library
ACTION_SET_LIBRARY = "set_library"
-# set sctp service
-ACTION_SET_SCTP = "set_sctp"
# set virtual Ip
ACTION_SET_VIRTUALIP = "set_virtualIp"
# clean virtual Ip
@@ -1485,38 +1483,6 @@ class PreinstallImpl:
"""
pass
- def setSctp(self):
- """
- function: setting SCTP service
- input: NA
- output: NA
- """
- self.context.logger.log("Setting SCTP service.", "addStep")
- try:
- # set SCTP service
- cmd = "%s -t %s -u %s -l %s" % (
- OMCommand.getLocalScript("Local_PreInstall"),
- ACTION_SET_SCTP,
- self.context.user,
- self.context.localLog)
- # check the mpprcFile
- if self.context.mpprcFile != "":
- cmd += " -s '%s'" % self.context.mpprcFile
- self.context.logger.debug("Command for setting SCTP: %s" % cmd)
-
- # exec cmd for set SCTP
- DefaultValue.execCommandWithMode(
- cmd,
- "set SCTP",
- self.context.sshTool,
- self.context.localMode or self.context.isSingle,
- self.context.mpprcFile)
- except Exception as e:
- # failed set SCTP service
- raise Exception(str(e))
- # Successfully set SCTP service
- self.context.logger.log("Successfully set SCTP service.", "constant")
-
def setVirtualIp(self):
"""
function: set the virtual IPs
@@ -1893,10 +1859,6 @@ class PreinstallImpl:
self.checkOSVersion()
# create path and set mode
self.createDirs()
-
- # set Sctp
- if not DefaultValue.checkDockerEnv():
- self.setSctp()
# set os parameters
self.setAndCheckOSParameter()
# prepare cron service for user
diff --git a/script/local/LocalCheck.py b/script/local/LocalCheck.py
index 82a9efb..6e5cb6e 100644
--- a/script/local/LocalCheck.py
+++ b/script/local/LocalCheck.py
@@ -47,8 +47,7 @@ actioItemMap = {
docker_no_need_check = ["net.core.wmem_max", "net.core.rmem_max",
"net.core.wmem_default", "net.core.rmem_default",
- "net.sctp.sctp_mem", "net.sctp.sctp_rmem",
- "net.sctp.sctp_wmem", "net.core.netdev_max_backlog",
+ "net.core.netdev_max_backlog",
"net.ipv4.tcp_max_tw_buckets", "net.ipv4.tcp_tw_reuse",
"net.ipv4.tcp_tw_recycle", "net.ipv4.tcp_retries2",
"net.ipv4.ip_local_reserved_ports", "net.ipv4.tcp_rmem",
@@ -239,12 +238,7 @@ def checkSysctlParameter(kernelParameter, isSet):
continue
if (DefaultValue.checkDockerEnv() and key in docker_no_need_check):
continue
- # The parameter sctpchecksumerrors check method is independent
- if (key == "sctpchecksumerrors"):
- cmd = "cat /proc/net/sctp/snmp | grep SctpChecksumErrors" \
- " | awk '{print $2}'"
- else:
- cmd = "cat %s" % ("/proc/sys/%s" % key.replace('.', '/'))
+ cmd = "cat %s" % ("/proc/sys/%s" % key.replace('.', '/'))
(status, output) = subprocess.getstatusoutput(cmd)
if (status == 0):
if (key == "vm.min_free_kbytes"
@@ -315,15 +309,6 @@ def setOSParameter(setParameterList, patchlevel):
# vm.extfrag_threshold parameter, skip set
if ("vm.extfrag_threshold" in setParameterList and patchlevel == "1"):
setParameterList.pop("vm.extfrag_threshold")
- # The parameter sctpchecksumerrors set method is independent
- if ("sctpchecksumerrors" in setParameterList):
- cmd = "echo 1 > /sys/module/sctp/parameters/no_checksums"
- (status, output) = subprocess.getstatusoutput(cmd)
- if (status != 0):
- g_logger.debug("The cmd is %s " % cmd)
- g_logger.log(" Failed to enforce sysctl kernel variable"
- " 'sctpchecksumerrors'. Error: %s" % output)
- setParameterList.pop("sctpchecksumerrors")
if (len(setParameterList) != 0):
g_logger.debug("Setting sysctl parameter.")
@@ -332,7 +317,7 @@ def setOSParameter(setParameterList, patchlevel):
g_logger.log(" Set variable '%s' to '%s'"
% (key, setParameterList[key]))
cmd = "sysctl -p"
- (status, output) = subprocess.getstatusoutput(cmd)
+ (status, _) = subprocess.getstatusoutput(cmd)
if (status != 0):
cmderrorinfo = "sysctl -p | grep 'No such file or directory'"
(status, outputresult) = subprocess.getstatusoutput(cmderrorinfo)
diff --git a/script/local/PreInstallUtility.py b/script/local/PreInstallUtility.py
index cbe2a59..b4071f3 100644
--- a/script/local/PreInstallUtility.py
+++ b/script/local/PreInstallUtility.py
@@ -55,7 +55,6 @@ ACTION_SET_TOOL_ENV = "set_tool_env"
ACTION_PREPARE_USER_CRON_SERVICE = "prepare_user_cron_service"
ACTION_PREPARE_USER_SSHD_SERVICE = "prepare_user_sshd_service"
ACTION_SET_LIBRARY = "set_library"
-ACTION_SET_SCTP = "set_sctp"
ACTION_SET_VIRTUALIP = "set_virtualIp"
ACTION_CHECK_HOSTNAME_MAPPING = "check_hostname_mapping"
ACTION_INIT_GAUSSLOG = "init_gausslog"
@@ -256,7 +255,7 @@ Common options:
GaussLog.exitWithError(str(e))
parameter_list = [ACTION_CHECK_OS_VERSION, ACTION_SET_FINISH_FLAG,
ACTION_SET_USER_ENV, ACTION_SET_LIBRARY, \
- ACTION_SET_SCTP, ACTION_PREPARE_USER_CRON_SERVICE,
+ ACTION_PREPARE_USER_CRON_SERVICE,
ACTION_PREPARE_USER_SSHD_SERVICE, \
ACTION_SET_VIRTUALIP, ACTION_INIT_GAUSSLOG,
ACTION_CHECK_ENVFILE, ACTION_CHECK_OS_SOFTWARE, \
@@ -1981,88 +1980,6 @@ Common options:
self.logger.logExit(str(e))
self.logger.debug("Successfully set ARM Optimization.")
- def setSctp(self):
- """
- function: Setting SCTP
- input : NA
- output: NA
- """
- self.logger.debug("Setting SCTP.")
- try:
-
- key = "install ipv6 \/bin\/true"
- confFile = "/etc/modprobe.d/*ipv6.conf"
-
- initFile = DefaultValue.getOSInitFile()
- cmd = "ls %s" % confFile
- (status, output) = subprocess.getstatusoutput(cmd)
- if status == 0:
- cmd = "sed -i 's/^.*\(%s.*\)/#\\1/g' %s" % (key, confFile)
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_502["GAUSS_50223"]
- % confFile + " Error: \n%s" % output)
- cmd = "modprobe ipv6"
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
- + " Error: \n%s" % output)
- cmd = "modprobe sctp"
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
- + " Error: \n%s" % output)
-
- cmd = "uname -r"
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
- + " Error: \n%s" % output)
-
- # Since redhat7.4 kernel module files ending in .xz
- stcpFile = "/lib/modules/%s/kernel/net/sctp/sctp.ko" \
- % output.strip()
- stcpFileXz = "/lib/modules/%s/kernel/net/sctp/sctp.ko.xz" \
- % output.strip()
- if (not os.path.exists(stcpFile)) and \
- (not os.path.exists(stcpFileXz)):
- output = stcpFile + " and " + stcpFileXz
- self.logger.logExit(ErrorCode.GAUSS_502["GAUSS_50201"]
- % output)
-
- cmd_insmod = "insmod %s >/dev/null 2>&1" % stcpFileXz
- (status, output) = subprocess.getstatusoutput(cmd_insmod)
-
- cmd_insmod = "insmod %s >/dev/null 2>&1" % stcpFile
- (status, output) = subprocess.getstatusoutput(cmd_insmod)
-
- cmd = "lsmod | grep 'sctp ' | wc -l"
- (status, output) = subprocess.getstatusoutput(cmd)
- if not str(output.strip()).isdigit() or int(output.strip()) == 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
- + " Error: \n%s" % output)
-
- init_cmd = "sed -i '/^modprobe sctp$/d' %s &&" % initFile
- init_cmd += "echo \"modprobe sctp\" >> %s &&" % initFile
- init_cmd += "sed -i '/^insmod.*sctp.ko/d' %s &&" % initFile
- init_cmd += "echo \"%s\" >> %s" % (cmd_insmod, initFile)
- (status, output) = subprocess.getstatusoutput(init_cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"]
- % init_cmd + " Error: \n%s" % output)
-
- cmd = "sed -i \"/^sysctl -p/d\" %s &&" % initFile
- cmd += "echo \"sysctl -p\" >> %s" % initFile
- (status, output) = subprocess.getstatusoutput(cmd)
- if status != 0:
- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
- + " Error: \n%s" % output)
-
- except Exception as e:
- self.logger.logExit(str(e))
-
- self.logger.debug("Successfully set Sctp.")
-
def checkVirtualIp(self):
"""
function: Checking virtual IP
@@ -2915,8 +2832,6 @@ Common options:
self.prepareUserSshdService()
elif self.action == ACTION_SET_LIBRARY:
self.setLibrary()
- elif self.action == ACTION_SET_SCTP:
- self.setSctp()
elif self.action == ACTION_SET_VIRTUALIP:
DefaultValue.modifyFileOwnerFromGPHOME(self.logger.logFile)
self.setVirtualIp()
--
2.6.4.windows.1

View File

@ -16,12 +16,16 @@ class SqlCommands:
@staticmethod
def getSQLCommand(port, database=ConstantsBase.DEFAULT_DB_NAME,
gsqlBin="gsql"):
gsqlBin="gsql", user_name="", user_pwd=""):
"""
function : get SQL command
input : port, database
output : cmd
"""
if user_name and user_pwd:
cmd = ConstantsBase.SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER % (
gsqlBin, str(port), database, user_name, user_pwd)
return cmd
cmd = ConstantsBase.SQL_EXEC_COMMAND_WITHOUT_HOST_WITHOUT_USER % (
gsqlBin, str(int(port) + 1), database)
return cmd
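For illustration, a minimal usage sketch of the extended signature (the port, database, and credential values are invented; note that the password-less path keeps the existing port + 1 behavior):

# Hypothetical caller of the extended getSQLCommand (all values invented):
cmd = SqlCommands.getSQLCommand(5432, database="postgres",
                                user_name="hadr_user", user_pwd="secret")
# -> "gsql -p 5432 -d postgres -U hadr_user -W 'secret' "
cmd = SqlCommands.getSQLCommand(5432, database="postgres")
# -> "gsql -p 5433 -d postgres "   (password-less path uses port + 1)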

View File

@ -59,4 +59,4 @@ class ConstantsBase:
#SQL_EXEC_COMMAND
SQL_EXEC_COMMAND_WITHOUT_HOST_WITHOUT_USER = "%s -p %s -d %s "
SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER = "%s -p %s -d %s -U %s -W %s "
SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER = "%s -p %s -d %s -U %s -W '%s' "

View File

@ -24,6 +24,8 @@ import subprocess
import threading
import time
from subprocess import PIPE, Popen
from datetime import datetime
from datetime import timedelta
import pwd
from gspylib.common.ErrorCode import ErrorCode
from base_utils.common.exceptions import CommandNotFoundException
@ -575,6 +577,21 @@ class CmdUtil(object):
break
return status, output
@staticmethod
def retry_util_timeout(cmd, timeout, sleep_time=1):
"""
Retry executing cmd until it succeeds or the given timeout (in seconds) expires.
"""
end_time = datetime.now() + timedelta(seconds=int(timeout))
status, output = 1, 1
while datetime.now() < end_time:
status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd)
if status == 0:
break
else:
time.sleep(sleep_time)
return status, output
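A minimal caller sketch (the command and timeout values are invented for illustration): the helper re-runs cmd, sleeping sleep_time seconds between failed attempts, until it exits 0 or the deadline passes, and returns the last (status, output) pair, so the caller still has to check the final status:

# Hypothetical usage: wait up to 30 seconds for a condition to become true.
status, output = CmdUtil.retry_util_timeout("ls /tmp/flag_file", 30, sleep_time=2)
if status != 0:
    raise Exception("condition not met within 30s: %s" % output)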
@staticmethod
def getstatusoutput_by_fast_popen(cmd):
"""

View File

@ -24,6 +24,7 @@ import stat
import subprocess
import pwd
import grp
import json
from subprocess import PIPE
from base_utils.common.constantsbase import ConstantsBase
@ -299,6 +300,27 @@ class FileUtil(object):
lock.release()
return True
@staticmethod
def write_update_file(file_path, content, authority, is_json=True):
"""
Write or update file, create if not exist.
"""
with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
authority), "w") as fp_write:
if is_json:
json.dump(content, fp_write)
else:
fp_write.write(content)
@staticmethod
def write_add_file(file_path, content, authority):
"""
Write or add content in file, create if not exist.
"""
if not os.path.isfile(file_path):
FileUtil.createFileInSafeMode(file_path, mode=authority)
FileUtil.writeFile(file_path, [content])
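A short usage sketch under the same conventions (paths and payloads invented): write_update_file truncates and rewrites the whole file (serializing to JSON by default), while write_add_file creates the file if needed and delegates to writeFile to add the content:

# Hypothetical callers (paths and contents invented):
FileUtil.write_update_file("/tmp/dr_status.json", {"status": "recovery"}, 0o600)
FileUtil.write_add_file("/tmp/dr_actions.log", "switchover done", 0o600)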
@staticmethod
def withAsteriskPath(path):
"""

View File

@ -6,14 +6,28 @@
# Date : 2021-06-30
# Description : security_checker.py checks security conditions
#############################################################################
import re
from gspylib.common.ErrorCode import ErrorCode
class ValidationError(Exception):
"""
validation base error
"""
def __init__(self, error_info):
super().__init__(self)
self.error_info = error_info
def __str__(self):
return self.error_info
class SecurityChecker(object):
"""check security conditions"""
INJECTION_CHAR_LIST = ["|", ";", "&", "$", "<", ">", "`", "\\", "'", "\"", "{", "}", "(", ")",
"[", "]", "~", "*", "?", " ", "!", "\n"]
PWD_VALIDATION_PATTERN = r'^[A-Za-z0-9~!@#%^*\-_=+?,]+$'
IP_PATTERN = r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$'
@staticmethod
def check_injection_char(check_value):
@ -27,3 +41,83 @@ class SecurityChecker(object):
if any(rac in check_value for rac in SecurityChecker.INJECTION_CHAR_LIST):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % check_value +
" There are illegal characters.")
@staticmethod
def check_is_string(description, value):
"""
Check is string
"""
if not isinstance(value, str):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'string'))
@staticmethod
def check_max_length(description, value, max_length):
"""
Check max length
"""
if len(value) > max_length:
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50023"] % (description, max_length))
@staticmethod
def check_db_injection(description, value):
"""
Check db injection
"""
for rac in SecurityChecker.INJECTION_CHAR_LIST:
if value.find(rac) >= 0:
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50025'] % (rac, description))
@staticmethod
def check_password(description, value):
if not re.match(SecurityChecker.PWD_VALIDATION_PATTERN, value):
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50024"] % description)
@staticmethod
def check_db_user(description, value):
SecurityChecker.check_is_string(description, value)
SecurityChecker.check_max_length(description, value, 256)
SecurityChecker.check_db_injection(description, value)
@staticmethod
def check_db_password(description, value):
SecurityChecker.check_is_string(description, value)
SecurityChecker.check_max_length(description, value, 256)
SecurityChecker.check_password(description, value)
@staticmethod
def check_is_digit(description, value):
if isinstance(value, int):
return
elif isinstance(value, str):
if not value.isdigit():
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022']
% (description, 'integer'))
else:
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022']
% (description, 'int or string'))
@staticmethod
def check_is_list(description, value):
if not isinstance(value, list):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'list'))
@staticmethod
def check_is_dict(description, value):
if not isinstance(value, dict):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'dict'))
@staticmethod
def check_ip_valid(description, value):
SecurityChecker.check_is_string(description, value)
if not re.match(SecurityChecker.IP_PATTERN, value):
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50024"] % description)
@staticmethod
def check_port_valid(description, value):
SecurityChecker.check_is_digit(description, value)
value = int(value) if not isinstance(value, int) else value
if value > 65535 or value < 0:
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022']
% (description, 'between 0 and 65535'))
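A sketch of how a caller might validate untrusted input with these helpers (all values invented): each check raises ValidationError carrying a GAUSS-500xx error code instead of returning a boolean, so validation failures surface as one exception type:

# Hypothetical validation of user-supplied connection parameters:
try:
    SecurityChecker.check_db_user("remote user", "hadr_user")
    SecurityChecker.check_db_password("remote password", "Gauss_234")
    SecurityChecker.check_ip_valid("remote ip", "192.168.1.10")
    SecurityChecker.check_port_valid("remote port", "26000")
except ValidationError as err:
    print(str(err))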

View File

@ -87,7 +87,7 @@ class Checkperf():
def usage(self):
"""
- gs_checkperf is a utility to check the cluster performance and SSD performance.
+ gs_checkperf is a utility to check the cluster performance and SSD performance; streaming disaster recovery clusters are not yet supported.
Usage:
gs_checkperf -? | --help
@ -253,6 +253,11 @@ General options:
binPath = os.path.join(self.clusterInfo.appPath, "bin")
g_opts.databaseSizeFile = os.path.join(binPath,
DefaultValue.DB_SIZE_FILE)
is_disaster_cluster = \
DefaultValue.cm_exist_and_is_disaster_cluster(self.clusterInfo, g_logger)
if is_disaster_cluster:
GaussLog.exitWithError(
ErrorCode.GAUSS_512["GAUSS_51244"] % " Disaster cluster")
except Exception as e:
g_logger.logExit(str(e))

View File

@ -86,7 +86,7 @@ class Dropnode(ParallelBaseOM):
def usage(self):
"""
- gs_dropnode is a utility to delete the standby node from a cluster.
+ gs_dropnode is a utility to delete the standby node from a cluster; streaming disaster recovery clusters are not yet supported.
Usage:
gs_dropnode -? | --help
@ -335,6 +335,7 @@ if __name__ == "__main__":
dropNode = Dropnode()
dropNode.parseCommandLine()
dropNode.initLogs()
DefaultValue.check_is_streaming_dr_cluster()
dropNode.check_repeat_process()
dropNode.checkParameters()
dropNode.checkConnection(list(dropNode.backIpNameMap.keys()),

View File

@ -87,7 +87,7 @@ class Expansion(ParallelBaseOM):
def usage(self):
"""
- gs_expansion is a utility to expansion standby node for a cluster.
+ gs_expansion is a utility to expand standby nodes for a cluster; streaming disaster recovery clusters are not yet supported.
Usage:
gs_expansion -? | --help

script/gs_sdr (new file, 95 lines)
View File

@ -0,0 +1,95 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : gs_sdr is a utility for streaming
#               disaster recovery operations.
#############################################################################
import os
import uuid
from gspylib.common.Common import DefaultValue
from gspylib.common.ErrorCode import ErrorCode
from gspylib.common.GaussLog import GaussLog
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
from base_utils.os.user_util import UserUtil
from domain_utils.cluster_file.cluster_log import ClusterLog
from impl.streaming_disaster_recovery.params_handler import ParamsHandler
from impl.streaming_disaster_recovery.streaming_modules.\
streaming_diaster_recovery_start import StreamingStartHandler
from impl.streaming_disaster_recovery.streaming_modules.\
streaming_disaster_recovery_stop import StreamingStopHandler
from impl.streaming_disaster_recovery.streaming_modules.\
streaming_disaster_recovery_failover import StreamingFailoverHandler
from impl.streaming_disaster_recovery.streaming_modules.\
streaming_disaster_recovery_switchover import StreamingSwitchoverHandler
from impl.streaming_disaster_recovery.streaming_modules.\
streaming_disaster_recovery_query import StreamingQueryHandler
HANDLER_MAPPING = {
"start": StreamingStartHandler,
"stop": StreamingStopHandler,
"switchover": StreamingSwitchoverHandler,
"failover": StreamingFailoverHandler,
"query": StreamingQueryHandler
}
class StreamingDisasterRecoveryBase(object):
def __init__(self):
self.params = None
self.user = None
self.log_file = None
self.logger = None
self.trace_id = uuid.uuid1().hex
StreamingDisasterRecoveryBase.mock_process_user_sensitive_info()
self.__init_globals()
@staticmethod
def mock_process_user_sensitive_info():
"""mock_process_user_sensitive_info"""
cmdline = DefaultValue.get_proc_title("-W")
DefaultValue.set_proc_title(cmdline)
def __init_globals(self):
self.user = UserUtil.getUserInfo()['name']
tmp_logger_file = ClusterLog.getOMLogPath(StreamingConstants.STREAMING_LOG_FILE, self.user)
tmp_logger = GaussLog(tmp_logger_file, 'parse_and_validate_params', trace_id=self.trace_id)
self.params = ParamsHandler(tmp_logger, self.trace_id).get_valid_params()
self.log_file = self.params.logFile if self.params.logFile else \
ClusterLog.getOMLogPath(StreamingConstants.STREAMING_LOG_FILE, self.user)
self.logger = GaussLog(self.log_file, self.params.task, trace_id=self.trace_id)
if __name__ == '__main__':
if os.getuid() == 0:
GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50105"])
base = StreamingDisasterRecoveryBase()
handler = HANDLER_MAPPING[base.params.task](base.params, base.user,
base.logger, base.trace_id, base.log_file)
handler.handle_lock_file(handler.trace_id, 'create')
try:
if base.params.task in StreamingConstants.TASK_EXIST_CHECK:
handler.check_streaming_process_is_running()
handler.run()
except Exception as error:
handler.logger.error(error)
raise Exception(str(error))
finally:
handler.handle_lock_file(handler.trace_id, 'remove')
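Before parsing any parameters, gs_sdr blanks out the value that follows -W in its own command line so the password cannot be read from ps output; the two helpers it calls are the ones added to Common.py further down. A condensed sketch of that masking step:

# Hedged sketch of mock_process_user_sensitive_info's effect:
masked = DefaultValue.get_proc_title("-W")  # cmdline with the "-W <pwd>" pair blanked
DefaultValue.set_proc_title(masked)         # rewrite the process title in place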

View File

@ -45,6 +45,7 @@ import pwd
import grp
import copy
import re
import json
from gspylib.common.Common import DefaultValue
from gspylib.common.GaussLog import GaussLog
@ -60,6 +61,23 @@ from base_utils.os.net_util import NetUtil
from domain_utils.domain_common.cluster_constants import ClusterConstants
class DualUpgradeShareInfo:
"""
Used to record the upgrade status information of the primary and standby clusters
"""
def __init__(self, jsonInfo=None):
# If the Json string is passed in, the Json information is used to initialize the class
if jsonInfo:
self.__dict__ = jsonInfo
else:
self.masterVersion = ""
self.masterUpgradeStatus = 0
self.standbyVersion = ""
self.standbyUpgradeStatus = 0
class Upgrade(ParallelBaseOM):
"""
The class about upgrade
@ -90,6 +108,14 @@ class Upgrade(ParallelBaseOM):
self.oldClusterNumber = None
self.forceRollback = False
self.upgrade_remain = False
# Record the upgrade status information under dual clusters
self.dualUpgradeShareInfo = None
# Record the primary cluster or the standby cluster, dual-primary or dual-standby
self.clusterType = ""
# Whether this is the standby cluster in a dual cluster, kept as a flag for convenient checks
self.standbyCluster = False
# The path to record the information of each cluster upgrade stage in the dual cluster
self.upgradePhaseInfoPath = ""
def usage(self):
"""
@ -153,6 +179,10 @@ Option for grey upgrade
self.upgrade_remain = True
if "force" in ParaDict.keys():
self.forceRollback = True
self.tmpDir = EnvUtil.getTmpDirFromEnv()
if self.tmpDir == "":
raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"] % "$PGHOST")
self.upgradePhaseInfoPath = os.path.join(self.tmpDir, Const.UPGRADE_PHASE_INFO)
def checkUser(self):
"""
@ -299,6 +329,69 @@ Option for grey upgrade
raise Exception(ErrorCode.GAUSS_516["GAUSS_51619"] % nodeName)
self.logger.debug("Successfully init global infos")
# If it is a dual-cluster, initialize the related information of the dual-cluster
self.initDualUpgradeInfo()
def initDualUpgradeInfo(self):
"""
initialize dual cluster upgrade status information
If it is not a dual cluster, do not initialize
:return:
"""
if os.path.exists(self.upgradePhaseInfoPath):
if self.is_inplace_upgrade and self.action not in \
["commit-upgrade", "auto-rollback", "chose-strategy"]:
raise Exception("Dual cluster does not support in-place upgrade")
self.dualUpgradeShareInfo = self.getDualUpgradeInfo(self.upgradePhaseInfoPath,
startPost=0)
if not self.dualUpgradeShareInfo:
self.dualUpgradeShareInfo = DualUpgradeShareInfo()
@staticmethod
def getDualUpgradeInfo(filePath, startPost):
"""
Obtain the dual-cluster upgrade status information from the file,
and return None if there is no record
:return:
"""
if os.path.exists(filePath):
lenInfo = 0
with open(filePath, 'r') as shareInfo:
shareInfo.seek(startPost)
length = shareInfo.read(4)
if length > '':
try:
lenInfo = int(length)
except Exception as _:
lenInfo = 0
if lenInfo > 0:
shareInfo.seek(startPost + 4)
return json.loads(shareInfo.read(lenInfo), object_hook=DualUpgradeShareInfo)
return None
def updateDualUpgradeInfo(self, dualUpgradeShareInfo, filePath, startPost):
"""
Update this cluster's upgrade information in the dual-cluster
shared file /dev/my_disk_sync_disk
:return:
"""
if os.path.exists(filePath):
with os.fdopen(os.open(filePath, os.O_WRONLY, 0o600), "w") as shareInfo:
shareInfo.seek(startPost + Const.LENGTH_STORAGE_INFO_LEN)
shareInfo.write(json.dumps(dualUpgradeShareInfo, default=lambda obj: obj.__dict__))
length = shareInfo.tell() - (startPost + Const.LENGTH_STORAGE_INFO_LEN)
shareInfo.seek(startPost, 0)
shareInfo.write("{0:04d}".format(length))
# After the status file is updated, the standby cluster
# distributes the updated status file to the data directory of the DN.
for dbNode in self.clusterInfo.dbNodes:
for dnInst in dbNode.datanodes:
self.sshTool.scpFiles(filePath, dnInst.datadir,
hostList=[dnInst.hostname])
else:
raise Exception("{0} file does not exist and cannot be updated".format(filePath))
def distributeFileToSpecialNode(self, file, destDir, hostList):
"""
distribute file to special node

View File

@ -17,6 +17,7 @@
# ----------------------------------------------------------------------------
# Description : Common is a utility with a lot of common functions
#############################################################################
import ctypes
import sys
import subprocess
import os
@ -28,6 +29,7 @@ import time
import multiprocessing
import _thread as thread
import pwd
import json
import base64
import secrets
import string
@ -35,6 +37,7 @@ import stat
import csv
import copy
from subprocess import PIPE
from subprocess import Popen
# The installation starts, but the package is not decompressed completely.
# The lib64/libz.so.1 file is incomplete, and the hashlib depends on the
@ -106,6 +109,7 @@ from base_utils.os.cmd_util import CmdUtil
from base_utils.os.env_util import EnvUtil
from base_utils.os.file_util import FileUtil
from domain_utils.cluster_file.version_info import VersionInfo
from domain_utils.cluster_file.cluster_dir import ClusterDir
from domain_utils.security.random_value import RandomValue
from base_utils.os.process_util import ProcessUtil
from domain_utils.sql_handler.sql_executor import SqlExecutor
@ -199,6 +203,7 @@ class DefaultValue():
FILE_MODE = 640
FILE_MODE_PERMISSION = 0o640
KEY_FILE_MODE = 600
KEY_FILE_MODE_IN_OS = 0o600
MIN_FILE_MODE = 400
SPE_FILE_MODE = 500
KEY_DIRECTORY_MODE = 700
@ -318,6 +323,9 @@ class DefaultValue():
# FI_ELK_KRB_XML is used in elk
FI_ELK_KRB_XML = "auth_config/elk-krb-site.xml"
FI_KRB_CONF = "krb5.conf"
# cluster status
CLUSTER_STATUS_NORMAL = "Normal"
CLUSTER_STATUS_DEGRADED = "Degraded"
###########################
# instance role
###########################
@ -615,6 +623,60 @@ class DefaultValue():
return NetWorkConfFile
@staticmethod
def get_remote_ips(host, mpp_file):
"""
Get ips from remote host
"""
cmd = "source %s && pssh -s -t 30 -H %s \"hostname -I\"" % (mpp_file, host)
status, output = subprocess.getstatusoutput(cmd)
if status == 0 and output != "":
ips = output.strip().split()
return ips
else:
raise Exception(ErrorCode.GAUSS_516['GAUSS_51632']
% "check remote ips for node:%s, Error:%s." % (host, output))
@staticmethod
def obtain_file_content(dest_file, deduplicate=True, is_list=True):
"""
function:obtains the content of each line in the file.
input: file dir
:return: file context lines list
"""
result = [] if is_list else None
if not os.path.isfile(dest_file):
return result
with open(dest_file, "r") as fp_read:
if is_list:
for line in fp_read:
result.append(line.strip('\n'))
else:
result = fp_read.read().strip()
if deduplicate and is_list:
result = list(set(result))
return result
@staticmethod
def get_all_dn_num_for_dr(file_path, dn_inst, cluster_info, logger):
"""get_all_dn_num_for_dr_cluster"""
# DN inst supports a maximum of replicaNum=8 in postgresql.conf.
default_num = 8
content = DefaultValue.obtain_file_content(file_path, is_list=False)
if content:
default_num = 0
shards = json.loads(content)['remoteClusterConf']["shards"]
logger.debug("Stream cluster json shards:%s" % shards)
if cluster_info.isSingleInstCluster():
for shard in shards:
default_num += len(shard)
else:
default_num += len(shards[0])
peer_insts = cluster_info.getPeerInstance(dn_inst)
default_num += len(peer_insts)
logger.debug("Get config replconninfo dn num:%s" % default_num)
return default_num
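A usage sketch for obtain_file_content, defined above (paths invented): it returns a deduplicated list of stripped lines by default, the whole stripped text with is_list=False, and an empty list or None when the file is missing, which is the case get_all_dn_num_for_dr relies on for its default of 8. Note that the list(set(...)) deduplication does not preserve line order:

# Hypothetical callers:
hosts = DefaultValue.obtain_file_content("/tmp/host_list")            # list of lines
raw = DefaultValue.obtain_file_content("/tmp/cluster_conf_record",
                                       is_list=False)                 # whole text or None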
@staticmethod
def getIpByHostName():
'''
@ -1616,6 +1678,45 @@ class DefaultValue():
noPassIPs.append(ip)
g_lock.release()
@staticmethod
def fast_ping(node_ip):
"""
ping node with short timeout
"""
cmd = "ping %s -c 1 -w 4" % node_ip
proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE, preexec_fn=os.setsid, close_fds=True)
proc.communicate()
status = proc.returncode
result = (node_ip, True) if status == 0 else (node_ip, False)
return result
@staticmethod
def fast_ping_on_node(on_node, from_ip, to_ip, logger):
"""
Ping on remote node with -I
"""
cmd = "ping %s -c 1 -w 4" % on_node
proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE,
preexec_fn=os.setsid, close_fds=True)
proc.communicate()
status = proc.returncode
if status != 0:
logger.debug("Node:%s ping failed, can not execute remote check." % on_node)
return on_node, False
if on_node == NetUtil.GetHostIpOrName():
cmd_remote = "ping %s -I %s -c 1 -w 4" % (to_ip, from_ip)
else:
cmd_remote = "source %s && pssh -s -H %s 'ping %s -I %s -c 1 -w 4'" \
% (EnvUtil.getMpprcFile(), on_node, to_ip, from_ip)
proc = FastPopen(cmd_remote, stdout=PIPE, stderr=PIPE,
preexec_fn=os.setsid, close_fds=True)
proc.communicate()
status = proc.returncode
result = (to_ip, True) if status == 0 else (to_ip, False)
logger.debug("Remote ping result on node:%s, from ip:%s, to ip:%s, result:%s."
% (on_node, from_ip, to_ip, result))
return result
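fast_ping wraps a single probe (ping -c 1 -w 4, a four-second deadline) and returns an (ip, reachable) tuple, which makes it convenient to map over a node list; a sketch with invented addresses:

# Hypothetical usage: collect the unreachable peer IPs.
ips = ["10.10.0.1", "10.10.0.2"]
dead = [ip for ip, ok in (DefaultValue.fast_ping(ip) for ip in ips) if not ok]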
@staticmethod
def checkIsPing(ips):
"""
@ -2259,7 +2360,7 @@ class DefaultValue():
"Command:%s. Error:\n%s" % (cmd, output))
targetString = output.split("Datanode")[1]
dnPrimary = [x for x in re.split(r"[|\n]", targetString)
- if flagStr in x]
+ if flagStr in x or "Main" in x]
primaryList = []
for dn in dnPrimary:
primaryList.append(list(filter(None, dn.split(" ")))[1])
@ -2866,6 +2967,283 @@ class DefaultValue():
"on node [{0}] successfully.".format(node.name))
logger.log("Remove dynamic_config_file and CM metadata directory on all nodes.")
@staticmethod
def distribute_file_to_node(params):
"""
Distribute file to dest node with path
"""
dest_ip, from_path, to_path, timeout = params
pscp_cmd = "source %s ; pscp -t %s -H %s %s %s" % (
EnvUtil.getMpprcFile(), timeout, dest_ip, from_path, to_path)
status, output = CmdUtil.getstatusoutput_by_fast_popen(pscp_cmd)
return status, output, dest_ip
@staticmethod
def check_is_cm_cluster(logger):
"""
Check cm_ctl is exist.
"""
cmd = "source %s; cm_ctl view | grep cmDataPath" % EnvUtil.getMpprcFile()
status, output = CmdUtil.retryGetstatusoutput(cmd)
if status != 0:
logger.debug("Check cm_ctl is failed msg: %s." % output)
return False
logger.debug("Successfully check cm_ctl is available.")
return True
@staticmethod
def is_disaster_cluster(clusterinfo):
"""
function: determine cluster status normal or disaster
input: NA
output: NA
"""
cmd = "source %s; cm_ctl view | grep cmDataPath | awk -F [:] '{print $2}' | head -n 1" % \
EnvUtil.getMpprcFile()
proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
raise Exception(ErrorCode.GAUSS_514['GAUSS_51400'] % cmd + "Error:\n%s" % stderr)
cm_agent_conf_file = stdout.strip() + "/cm_agent/cm_agent.conf"
if not os.path.isfile(cm_agent_conf_file):
host_list = clusterinfo.getClusterNodeNames()
cm_agent_conf_temp_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "cm_agent_tmp.conf")
for host_ip in host_list:
get_file_cmd = g_file.SHELL_CMD_DICT["scpFileFromRemote"] % \
(host_ip, NetUtil.GetHostIpOrName(), cm_agent_conf_file, cm_agent_conf_temp_file)
proc = FastPopen(get_file_cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()
if not os.path.isfile(cm_agent_conf_temp_file):
continue
else:
break
if os.path.isfile(cm_agent_conf_temp_file):
with open(cm_agent_conf_temp_file, "r") as cma_conf_file:
content = cma_conf_file.read()
ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content)
g_file.removeFile(cm_agent_conf_temp_file)
if ret:
return True
else:
return False
else:
raise Exception(ErrorCode.GAUSS_502['GAUSS_50201'] % cm_agent_conf_file)
with open(cm_agent_conf_file, "r") as cma_conf_file:
content = cma_conf_file.read()
ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content)
if ret:
return True
else:
return False
@staticmethod
def cm_exist_and_is_disaster_cluster(clusterinfo, logger):
"""
check current cluster cm exist and is disaster cluster.
"""
cm_exist = DefaultValue.check_is_cm_cluster(logger)
if not cm_exist:
return False
is_disaster = DefaultValue.is_disaster_cluster(clusterinfo)
if not is_disaster:
return False
return True
@staticmethod
def write_content_on_file(dest_file, content, authority=None):
"""
Write content on file
"""
authority = authority if authority else DefaultValue.KEY_FILE_MODE_IN_OS
with os.fdopen(os.open(dest_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
authority), "w") as fp_write:
fp_write.write(str(content))
@staticmethod
def get_data_ip_info(instance, logger):
"""
Obtain data ip from file or cluster instance.
"""
cluster_conf_record = os.path.join(EnvUtil.getEnv("PGHOST"),
"streaming_cabin/cluster_conf_record")
if not os.path.isfile(cluster_conf_record):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % cluster_conf_record)
with open(cluster_conf_record, 'r') as read_fp:
conf_dict = json.load(read_fp)
if not conf_dict or len(conf_dict) != 2:
logger.debug("Failed obtain data ip list.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check data ip file")
inst_data_ip = ""
local_shards_list = conf_dict["localClusterConf"]["shards"]
for shard_list in local_shards_list:
for shard in shard_list:
if shard["ip"] not in instance.listenIps:
continue
inst_data_ip = shard["dataIp"]
logger.debug("File record:%s, \nGot data ip:%s for instanceId:%s." %
(conf_dict, inst_data_ip, instance.instanceId))
if not inst_data_ip:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain local data ip")
return inst_data_ip
@staticmethod
def obtain_hadr_user_encrypt_str(cluster_info, db_user, logger, mode, ignore_res=False):
"""
Obtain hadr user encrypted string
"""
sql = "select value from gs_global_config where name='hadr_user_info';"
instances = []
for node in cluster_info.dbNodes:
if cluster_info.isSingleInstCluster():
for inst in node.datanodes:
instances.append(inst)
for inst in instances:
logger.debug("Obtain hadr user info string on node:%s with port:%s."
% (inst.hostname, inst.port))
status, output = ClusterCommand.remoteSQLCommand(sql, db_user, inst.hostname,
inst.port, maintenance_mode=mode)
if status == 0 and output:
logger.debug("Successfully obtain hadr user info string.")
return output
if ignore_res:
return
logger.debug("Failed obtain hadr user info string.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain hadr user info")
@staticmethod
def getstatusoutput_hide_pass(joint_cmd):
"""
Hide password of process
"""
proc = Popen(["sh", "-"], stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
stdout, stderr = proc.communicate(joint_cmd)
text = stderr or stdout
sts = proc.returncode
if sts is None:
sts = 0
if text and text[-1:] == '\n':
text = text[:-1]
return sts, text
@staticmethod
def decrypt_hadr_user_info(params):
"""
Decrypt hadr user info
"""
if len(params) != 6:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50000"] % "decrypt hadr user info")
rand_pwd, hadr_str, cluster_info, db_user, logger, mode = params
sql = "select pg_catalog.gs_decrypt_aes128('%s', '%s');" % (hadr_str, rand_pwd)
instances = []
for node in cluster_info.dbNodes:
if cluster_info.isSingleInstCluster():
for inst in node.datanodes:
instances.append(inst)
else:
for inst in node.coordinators:
instances.append(inst)
for inst in instances:
logger.debug("Decrypt hadr user info on node:%s with port:%s."
% (inst.hostname, inst.port))
status, output = ClusterCommand.remoteSQLCommand(sql, db_user, inst.hostname,
inst.port, maintenance_mode=mode)
if status == 0 and output and "|" in output and len(output.split("|")) == 2:
logger.debug("Successfully decrypt hadr user info string.")
hadr_user, hadr_pwd = output.strip().split("|")[0], output.strip().split("|")[1]
return hadr_user, hadr_pwd
logger.debug("Failed decrypt hadr user info string.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "decrypt hadr user info")
@staticmethod
def decrypt_hadr_rand_pwd(logger):
"""
Decrypt hadr rand pwd
"""
db_user = pwd.getpwuid(os.getuid()).pw_name
gauss_home = ClusterDir.getInstallDir(db_user)
bin_path = os.path.join(os.path.realpath(gauss_home), "bin")
if not bin_path:
logger.debug("Failed obtain bin path.")
raise Exception(ErrorCode.GAUSS_518["GAUSS_51802"] % "bin path")
cipher_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade/hadr.key.cipher")
rand_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade/hadr.key.rand")
if os.path.isfile(cipher_file) and os.path.isfile(rand_file):
bin_path = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade")
rand_pwd = AesCbcUtil.aes_cbc_decrypt_with_path(bin_path, bin_path, key_name="hadr")
if rand_pwd:
logger.debug("Successfully decrypt rand pwd.")
return rand_pwd
@staticmethod
def get_proc_title(pwd_para_name):
"""
Obtain the process name after sensitive information is hidden.
"""
cmd = "cat /proc/%s/cmdline" % os.getpid()
status, output = CmdUtil.retryGetstatusoutput(cmd)
if status != 0 or not output:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % "proc title" + " Cmd is:%s." % cmd)
title_str_list = []
for title_str in output.split("\0"):
if "=" in title_str:
title_str_list.extend(title_str.split("="))
else:
title_str_list.extend(title_str.split(" "))
if pwd_para_name in title_str_list:
w_index = title_str_list.index(pwd_para_name)
title_str_list[w_index], title_str_list[w_index + 1] = "", ""
title_name = " ".join(title_str_list).strip()
return title_name
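# Worked example, not part of this patch: /proc/<pid>/cmdline is a
# NUL-separated argv, and the helper above blanks the token that follows the
# password flag. "-W" here is just a hypothetical password option name.
def demo_scrub_title(raw_cmdline, pwd_flag="-W"):
    tokens = []
    for part in raw_cmdline.split("\0"):
        tokens.extend(part.split("=") if "=" in part else part.split(" "))
    if pwd_flag in tokens:
        idx = tokens.index(pwd_flag)
        tokens[idx] = tokens[idx + 1] = ""
    return " ".join(tokens).strip()

# demo_scrub_title("gs_sdr\0-t\0start\0-W\0secret") -> 'gs_sdr -t start'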
@staticmethod
def set_proc_title(name):
"""
set proc title to new name
"""
new_name = name.encode('ascii', 'replace')
try:
libc = ctypes.CDLL('libc.so.6')
proc_name = ctypes.c_char_p.in_dll(libc, '__progname_full')
with open('/proc/self/cmdline') as fp:
old_progname_len = len(fp.readline())
if old_progname_len > len(new_name):
# padding blank chars
new_name += b' ' * (old_progname_len - len(new_name))
# Environment variables are already copied to Python app zone.
# We can get environment variables by `os.environ` module,
# so we can ignore the destroying from the following action.
libc.strcpy(proc_name, ctypes.c_char_p(new_name))
buff = ctypes.create_string_buffer(len(new_name) + 1)
buff.value = new_name
libc.prctl(15, ctypes.byref(buff), 0, 0, 0)
except Exception as err_msg:
raise Exception(ErrorCode.GAUSS_505["GAUSS_50503"] + str(err_msg))
@staticmethod
def check_is_streaming_dr_cluster():
"""check_is_steaming_cluster_cluster"""
stream_file = os.path.realpath(os.path.join(EnvUtil.getEnv("PGHOST"), "streaming_cabin"))
if os.path.exists(stream_file):
sys.exit(ErrorCode.GAUSS_512["GAUSS_51244"] % "current operation on dr cluster")
@staticmethod
def get_primary_dn_instance_id(inst_status="Primary", ignore=False):
"""
function: get Primary/Standby dn instance id for centralized/distribute cluster
:param: inst_status Primary/Standby
return; instance id
"""
cmd = r"source %s; cm_ctl query -v | grep -E 'instance_state\ *:\ %s' " \
r"-B 4 | grep -E 'type\ *:\ Datanode' -B 5 | grep instance_id | awk " \
r"'{print $NF}'" % (EnvUtil.getMpprcFile(), inst_status)
(status, output) = CmdUtil.retryGetstatusoutput(cmd)
if status != 0 or not output:
if ignore is True:
return []
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] %
cmd + " Error: \n%s" % output)
return output.strip().split('\n')
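# Illustrative sketch, not part of this patch: a pure-Python reading of what
# the grep/awk pipeline above extracts from hypothetical `cm_ctl query -v`
# output (the real pipeline also filters on "type : Datanode", omitted here
# for brevity).
DEMO_QUERY_OUTPUT = (
    "instance_id : 6001\n"
    "type : Datanode\n"
    "instance_state : Primary\n"
    "instance_id : 6002\n"
    "type : Datanode\n"
    "instance_state : Standby\n"
)

def demo_instance_ids(text, wanted="Primary"):
    ids, current = [], None
    for line in text.splitlines():
        key, _, value = line.partition(":")
        key, value = key.strip(), value.strip()
        if key == "instance_id":
            current = value
        elif key == "instance_state" and value == wanted and current:
            ids.append(current)
    return ids

# demo_instance_ids(DEMO_QUERY_OUTPUT) -> ['6001']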
@staticmethod
def isgreyUpgradeNodeSpecify(user, step=-1, nodes=None, logger=None):
"""
@@ -2988,6 +3366,29 @@ class ClusterCommand():
# rollback to flag of start cluster
INSTALL_STEP_START = "Start cluster"
@staticmethod
def getStartCmd(nodeId=0, timeout=DefaultValue.TIMEOUT_CLUSTER_START, datadir="", azName = ""):
"""
function : Start all cluster or a node
input : String,int,String,String
output : String
"""
user_profile = EnvUtil.getMpprcFile()
cmd = "%s %s ; cm_ctl start" % (CmdUtil.SOURCE_CMD, user_profile)
# check node id
if nodeId > 0:
cmd += " -n %d" % nodeId
# check data directory
if datadir != "":
cmd += " -D %s" % datadir
# check timeout
if timeout > 0:
cmd += " -t %d" % timeout
# azName
if azName != "":
cmd += " -z%s" % azName
return cmd
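# Usage sketch, with a hypothetical profile path and assuming
# CmdUtil.SOURCE_CMD is "source": each flag is appended only when its
# argument is meaningful, e.g.
#   getStartCmd(nodeId=2, timeout=300, azName="AZ1")
#   -> "source /home/omm/.bashrc ; cm_ctl start -n 2 -t 300 -zAZ1"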
@staticmethod
def getStopCmd(nodeId=0, stopMode="", timeout=0, datadir="", azName = ""):
@@ -3152,7 +3553,8 @@ class ClusterCommand():
@staticmethod
def remoteSQLCommand(sql, user, host, port, ignoreError=True,
database="postgres", useTid=False,
IsInplaceUpgrade=False):
IsInplaceUpgrade=False, maintenance_mode=False,
user_name="", user_pwd=""):
"""
function : Execute sql command on remote host
input : String,String,String,int
@@ -3220,7 +3622,10 @@ class ClusterCommand():
gsql_cmd = SqlCommands.getSQLCommandForInplaceUpgradeBackup(
port, database)
else:
gsql_cmd = SqlCommands.getSQLCommand(port, database)
gsql_cmd = SqlCommands.getSQLCommand(port, database, user_name=user_name,
user_pwd=user_pwd)
if maintenance_mode:
gsql_cmd += " -m "
if str(localHost) != str(host):
sshCmd = CmdUtil.getSshCmd(host)
if os.getuid() == 0 and user != "":
@@ -3233,16 +3638,24 @@ class ClusterCommand():
if ignoreError:
cmd += " 2>/dev/null"
else:
cmd = "%s '" % sshCmd
cmd = ""
if mpprcFile != "" and mpprcFile is not None:
cmd += "source %s;" % mpprcFile
cmd += "%s -f %s --output %s -t -A -X '" % (gsql_cmd,
cmd += "%s -f %s --output %s -t -A -X " % (gsql_cmd,
sqlFile,
queryResultFile)
if user_pwd:
cmd = "echo \"%s\" | %s" % (cmd, sshCmd)
else:
cmd = "%s '%s'" % (sshCmd, cmd)
if ignoreError:
cmd += " 2>/dev/null"
for i in range(RE_TIMES):
(status1, output1) = subprocess.getstatusoutput(cmd)
proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE,
preexec_fn=os.setsid, close_fds=True)
stdout, stderr = proc.communicate()
output1 = stdout + stderr
status1 = proc.returncode
if SqlFile.findErrorInSqlFile(sqlFile, output1):
if SqlFile.findTupleErrorInSqlFile(output1):
time.sleep(1) # find tuple error --> retry
@@ -3278,7 +3691,11 @@ class ClusterCommand():
if (ignoreError):
cmd += " 2>/dev/null"
for i in range(RE_TIMES):
(status1, output1) = subprocess.getstatusoutput(cmd)
proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE,
preexec_fn=os.setsid, close_fds=True)
stdout, stderr = proc.communicate()
output1 = stdout + stderr
status1 = proc.returncode
if SqlFile.findErrorInSqlFile(sqlFile, output1):
if SqlFile.findTupleErrorInSqlFile(output1):
time.sleep(1) # find tuple error --> retry
@@ -3778,6 +4195,83 @@ class ClusterInstanceConfig():
return connInfo1, nodename
@staticmethod
def get_data_from_dcc(cluster_info, logger, user, paralist):
"""
function: get value from dcc
:param cluster_info: cluster info
:param logger: logger obj
:param user: cluster user
:param paralist: paralist
:return: key-value map dict
"""
gausshome = ClusterDir.getInstallDir(user)
cm_ctl = os.path.realpath(os.path.join(gausshome, "bin/cm_ctl"))
if not os.path.isfile(cm_ctl):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % "file cm_ctl")
cms_count = 0
etcd_count = 0
for dbnode in cluster_info.dbNodes:
for _ in dbnode.cmservers:
cms_count += 1
for _ in dbnode.etcds:
etcd_count += 1
if cms_count == 0 or etcd_count > 1:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50011"] % paralist)
para_value_map = {}
for para_key in paralist:
cmd = "source %s; %s ddb --get '%s'" % (EnvUtil.getMpprcFile(), cm_ctl, para_key)
logger.debug("Get dcc value cmd:%s." % cmd)
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error:%s" % output)
logger.debug("Get dcc value:%s." % output)
res = output.strip("\n").split("\n")
if len(res) != 2:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50019"] % res)
if res[-1].find("Key not found") > -1:
para_value_map[para_key] = ""
continue
para_value_map[para_key] = res[-1].split(":")[-1].strip()
logger.debug("Get all values from dcc component res:%s." % para_value_map)
return para_value_map
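# Worked example, not part of this patch, assuming the two-line
# `cm_ctl ddb --get` output format the parser above expects: a banner line
# followed by either "key : value" or a "Key not found" marker.
def demo_parse_ddb_get(output):
    lines = output.strip("\n").split("\n")
    if len(lines) != 2:
        raise ValueError("unexpected ddb output: %s" % lines)
    if "Key not found" in lines[-1]:
        return ""
    return lines[-1].split(":")[-1].strip()

# demo_parse_ddb_get("ddb result\n/key : 2")      -> '2'
# demo_parse_ddb_get("ddb result\nKey not found") -> ''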
@staticmethod
def set_data_on_dcc(cluster_info, logger, user, paradict):
"""
function: set data on dcc
:param cluster_info: cluster info
:param logger: logger obj
:param user: cluster user
:param paradict: paradict
:return: NA
"""
gausshome = ClusterDir.getInstallDir(user)
cm_ctl = os.path.realpath(os.path.join(gausshome, "bin/cm_ctl"))
if not os.path.isfile(cm_ctl):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % "file cm_ctl")
cms_count = 0
etcd_count = 0
for dbnode in cluster_info.dbNodes:
for _ in dbnode.cmservers:
cms_count += 1
for _ in dbnode.etcds:
etcd_count += 1
if cms_count == 0 or etcd_count > 1:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50011"] % paradict)
for para_key in list(paradict.keys()):
cmd = "source %s; %s ddb --put '%s' '%s'" % \
(EnvUtil.getMpprcFile(), cm_ctl, para_key, paradict[para_key])
logger.debug("Set dcc value cmd:%s." % cmd)
(status, output) = subprocess.getstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error:%s" % output)
logger.debug("Set dcc data:%s." % output)
res = output.strip("\n").split("\n")
if len(res) != 2:
raise Exception(ErrorCode.GAUSS_500["GAUSS_50019"] % res)
logger.debug("Successfully set the dcc data information.")
class TempfileManagement():
"""

View File

@@ -37,6 +37,7 @@ from domain_utils.cluster_file.version_info import VersionInfo
from domain_utils.domain_common.cluster_constants import ClusterConstants
from base_utils.common.constantsbase import ConstantsBase
from base_utils.os.env_util import EnvUtil
from base_utils.security.security_checker import SecurityChecker
###########################
# instance role
@@ -958,6 +959,10 @@ class dbClusterInfo():
# add for dcf
self.enable_dcf = ""
self.dcf_config = ""
self.local_stream_ip_map = []
self.remote_stream_ip_map = []
self.remote_dn_base_port = 0
self.local_dn_base_port = 0
def __str__(self):
"""
@@ -1314,7 +1319,7 @@ class dbClusterInfo():
maxAzNameLen = maxAzNameLen if maxAzNameLen > azNameLen \
else azNameLen
dnNodeCount += 1
if roleStatus == "Primary":
if roleStatus in ["Primary", "Main"]:
primaryDbNum += 1
primaryDbState = dbState
else:
@@ -3395,6 +3400,7 @@ class dbClusterInfo():
if self.enable_dcf == "":
i = 0
ssdInfoList[i].extend(ssddirList)
self.parse_stream_cluster_info(masterNode, i)
# dataNode syncNum
key = "dataNode%d_syncNum" % (i + 1)
@@ -3620,6 +3626,48 @@ class dbClusterInfo():
for inst in masterNode.datanodes:
inst.azName = masterNode.azName
def parse_stream_cluster_info(self, masternode, i):
"""parse_stream_cluster_info"""
i = i + 1
local_ip_map = self.__readNodeStrValue(masternode.name,
"localStreamIpmap%s" % i, True)
if not local_ip_map:
return
remote_ip_map = self.__readNodeStrValue(masternode.name,
"remoteStreamIpmap%s" % i, True)
remote_dn_port = self.__readNodeStrValue(masternode.name,
"remotedataPortBase", True)
local_dn_port = self.__readNodeStrValue(masternode.name,
"dataPortBase", True, MASTER_BASEPORT_DATA)
if not all([local_ip_map, remote_ip_map, remote_dn_port]):
raise Exception(
ErrorCode.GAUSS_512["GAUSS_51236"] + " check streamInfo config is correct")
self.local_stream_ip_map.append(dbClusterInfo.append_map_ip_into_global(local_ip_map))
self.remote_stream_ip_map.append(dbClusterInfo.append_map_ip_into_global(remote_ip_map))
if not remote_dn_port.isdigit() or not local_dn_port.isdigit():
raise Exception(
ErrorCode.GAUSS_512["GAUSS_51236"] + " check streamInfo config is correct")
self.remote_dn_base_port = int(remote_dn_port)
self.local_dn_base_port = int(local_dn_port)
@staticmethod
def append_map_ip_into_global(strem_ip_map):
"""append_map_ip_into_global"""
shard_map = []
ip_map_list = [i.strip().strip("),").strip(",(") for i in strem_ip_map.split("(") if i]
for ip_map in ip_map_list:
peer_ip_map = ip_map.split(",")
temp_dict = dict()
if len(peer_ip_map) != 2:
raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
" check localStreamIpmap is correct")
temp_dict["ip"] = peer_ip_map[0].strip()
SecurityChecker.check_ip_valid(temp_dict["ip"], temp_dict["ip"])
temp_dict["dataIp"] = peer_ip_map[1].strip()
SecurityChecker.check_ip_valid(temp_dict["dataIp"], temp_dict["dataIp"])
shard_map.append(temp_dict)
return shard_map
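# Worked example of the "(listenIp,dataIp),(listenIp,dataIp)" value parsed
# above (all IPs hypothetical):
#   "(10.1.1.1,192.168.1.1),(10.1.1.2,192.168.1.2)"
#   split on "(" and strip "),"/",(" -> ["10.1.1.1,192.168.1.1",
#                                        "10.1.1.2,192.168.1.2"]
#   -> [{"ip": "10.1.1.1", "dataIp": "192.168.1.1"},
#       {"ip": "10.1.1.2", "dataIp": "192.168.1.2"}]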
def __readCmaConfig(self, dbNode):
"""
function : Read cm agent config on node.
@@ -4689,3 +4737,14 @@ class dbClusterInfo():
:return:True or False
"""
return self.cmscount < 1
def getDbNodeByID(self, inputid):
"""
function : Get node by id.
input : nodename
output : []
"""
for dbNode in self.dbNodes:
if dbNode.id == inputid:
return dbNode
return None

View File

@@ -21,11 +21,13 @@
import os
import sys
sys.path.append(sys.path[0] + "/../../")
from gspylib.common.Common import DefaultValue, ClusterInstanceConfig
from gspylib.common.DbClusterInfo import dbClusterInfo
from gspylib.common.ErrorCode import ErrorCode
from domain_utils.cluster_os.cluster_user import ClusterUser
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
###########################
# instance type. only for CN/DN
@@ -64,6 +66,7 @@ class StatusReport():
self.dnPrimary = 0
self.dnStandby = 0
self.dn_cascade_standby = 0
self.dn_main_standby = 0
self.dnDummy = 0
self.dnBuild = 0
self.dnAbnormal = 0
@@ -124,6 +127,8 @@ class DbInstanceStatus():
elif self.status == DbClusterStatus.INSTANCE_STATUS_CASCADE_STANDBY:
if self.haStatus != DbClusterStatus.HA_STATUS_NORMAL:
return False
elif self.status == DbClusterStatus.INSTANCE_STATUS_MAIN_STANDBY:
return True
else:
return False
@@ -231,6 +236,8 @@ class DbNodeStatus():
report.dnDummy += 1
elif inst.status == DbClusterStatus.INSTANCE_STATUS_CASCADE_STANDBY:
report.dn_cascade_standby += 1
elif inst.status == DbClusterStatus.INSTANCE_STATUS_MAIN_STANDBY:
report.dn_main_standby += 1
else:
report.dnAbnormal += 1
@@ -400,7 +407,23 @@ class DbClusterStatus():
"Degraded": "Degraded",
"Unknown": "Abnormal"
}
INSTANCE_STATUS_MAP_CHECK_STATUS = {
"Normal": "Primary",
"Unnormal": "Abnormal",
"Primary": "Primary",
"Standby": "Standby",
"Secondary": "Secondary",
"Pending": "Abnormal",
"Down": "Down",
"Unknown": "Abnormal",
"Offline": "Offline",
"Main Standby": "Standby",
"Cascade Standby": "Standby"
}
INSTANCE_STATUS_MAP_CHECK_FAILOVER = {
"Need repair(Disconnected)": "Normal",
"Need repair": "Normal"
}
###################################################################
# instance role
###################################################################
@@ -418,6 +441,7 @@ class DbClusterStatus():
INSTANCE_STATUS_PRIMARY = "Primary"
INSTANCE_STATUS_STANDBY = "Standby"
INSTANCE_STATUS_CASCADE_STANDBY = "Cascade Standby"
INSTANCE_STATUS_MAIN_STANDBY = "Main Standby"
INSTANCE_STATUS_ABNORMAL = "Abnormal"
INSTANCE_STATUS_DOWN = "Down"
INSTANCE_STATUS_DUMMY = "Secondary"
@@ -432,6 +456,7 @@ class DbClusterStatus():
"Standby": "Standby",
"Secondary": "Secondary",
"Cascade Standby": "Cascade Standby",
"Main Standby": "Main Standby",
"Pending": "Abnormal",
"Down": "Down",
"Unknown": "Abnormal"
@@ -611,7 +636,29 @@ class DbClusterStatus():
DbClusterStatus.OM_NODE_STATUS_ABNORMAL)
return statusInfo
def initFromFile(self, filePath, isExpandScene=False):
def init_from_content(self, content, is_expand_scene=False, check_action=None, logger=None):
"""
Init from content
"""
content_list = content.split('\n')
try:
for line in content_list:
line = line.strip()
if line == "":
continue
str_list = line.split(":")
if len(str_list) != 2:
continue
self.__fillField(str_list[0].strip(), str_list[1].strip(),
is_expand_scene, check_action=check_action)
except Exception as error:
if logger:
logger.debug("Failed parse cluster status with error:%s, "
"status content:%s" % (error, content))
raise Exception(
ErrorCode.GAUSS_502["GAUSS_50204"] % "status content" + " Error: \n%s" % str(error))
def initFromFile(self, filePath, isExpandScene=False, check_action=None):
"""
function : Init from status file
input : filePath
@@ -637,12 +684,12 @@ class DbClusterStatus():
continue
self.__fillField(strList[0].strip(), strList[1].strip(),
isExpandScene)
isExpandScene, check_action=check_action)
except Exception as e:
raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] %
"status file" + " Error: \n%s" % str(e))
def __fillField(self, field, value, isExpandScene):
def __fillField(self, field, value, isExpandScene, check_action=None):
"""
function : Fill field
input : field, value
@@ -690,7 +737,10 @@ class DbClusterStatus():
elif value == DbClusterStatus.INSTANCE_TYPE_ETCD:
self.__curNode.etcds.append(self.__curInstance)
elif field == "instance_state":
status = DbClusterStatus.INSTANCE_STATUS_MAP.get(value)
if check_action == DefaultValue.TASK_QUERY_STATUS:
status = DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_STATUS.get(value)
else:
status = DbClusterStatus.INSTANCE_STATUS_MAP.get(value)
self.__curInstance.status = \
DbClusterStatus.INSTANCE_STATUS_ABNORMAL \
if status is None else status
@@ -715,6 +765,11 @@ class DbClusterStatus():
self.__curInstance.status = \
DbClusterStatus.INSTANCE_STATUS_ABNORMAL
self.__curInstance.detail_status = value
if check_action == StreamingConstants.STREAM_DISTRIBUTE_ACTION:
self.__curInstance.status = \
DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get(value, value)
self.__curInstance.detail_status = \
DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get(value, value)
elif field == "HA_state":
haStatus = DbClusterStatus.HA_STATUS_MAP.get(value)
detail_ha = value
@@ -742,5 +797,9 @@ class DbClusterStatus():
if dataStatus is None else dataStatus
elif field == "reason":
self.__curInstance.reason = value
if check_action == StreamingConstants.STREAM_DISTRIBUTE_ACTION and \
hasattr(self.__curInstance, "detail_ha") and value == "Disconnected":
self.__curInstance.detail_ha = \
DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get("Need repair", value)

View File

@@ -101,7 +101,12 @@ class ErrorCode():
'GAUSS_50018': "[GAUSS-50018] : The parameter value of %s is Null.",
'GAUSS_50019': "[GAUSS-50019] : The value of %s is error.",
'GAUSS_50020': "[GAUSS-50020] : The value of %s must be a digit.",
'GAUSS_50021': "[GAUSS-50021] : Failed to query %s parameter."
'GAUSS_50021': "[GAUSS-50021] : Failed to query %s parameter.",
'GAUSS_50022': "[GAUSS-50022] : The parameter '%s' should be %s.",
'GAUSS_50023': "[GAUSS-50023] : The parameter '%s' over max length %s.",
'GAUSS_50024': "[GAUSS-50024] : The parameter '%s' is invalid.",
'GAUSS_50025': "[GAUSS-50025] : There is illegal character '%s' in parameter %s.",
'GAUSS_50026': "[GAUSS-50026] : Failed to check %s parameters in the XML file."
}

View File

@@ -85,7 +85,7 @@ class GaussLog:
Class to handle log file
"""
def __init__(self, logFile, module="", expectLevel=LOG_DEBUG):
def __init__(self, logFile, module="", expectLevel=LOG_DEBUG, trace_id=None):
"""
function: Constructor
input : NA
@@ -104,6 +104,7 @@ class GaussLog:
self.lock = thread.allocate_lock()
self.tmpFile = None
self.ignoreErr = False
self.trace_id = trace_id
logFileList = ""
try:
@@ -419,9 +420,14 @@ class GaussLog:
strTime = datetime.datetime.now()
file_line = self.get_log_file_line()
if (stepFlag == ""):
print("[%s][%d][%s][%s][%s]:%s" % (
strTime, self.pid, file_line, self.moduleName, level, msg),
file=self.fp)
if self.trace_id:
print("[%s][%s][%d][%s][%s]:%s"
% (self.trace_id, strTime, self.pid, self.moduleName,
level, msg), file=self.fp)
else:
print("[%s][%d][%s][%s]:%s" % (
strTime, self.pid, self.moduleName, level, msg),
file=self.fp)
else:
stepnum = self.Step(stepFlag)
print("[%s][%d][%s][%s][%s][Step%d]:%s" % (

View File

@@ -407,7 +407,7 @@ class DN_OLAP(Kernel):
self.modifyDummpyStandbyConfigItem()
def setPghbaConfig(self, clusterAllIpList):
def setPghbaConfig(self, clusterAllIpList, try_reload=False):
"""
"""
principal = None
@@ -446,12 +446,22 @@ class DN_OLAP(Kernel):
GUCParasStrList.append(GUCParasStr)
i = 0
GUCParasStr = ""
# Used only streaming disaster cluster
streaming_dn_ips = self.get_streaming_relate_dn_ips(self.instInfo)
if streaming_dn_ips:
for dn_ip in streaming_dn_ips:
GUCParasStr += "-h \"host all %s %s/32 %s\" " \
% (pg_user, dn_ip, METHOD_TRUST)
GUCParasStr += "-h \"host all all %s/32 %s\" " \
% (dn_ip, METHOD_SHA)
ip_segment = '.'.join(dn_ip.split('.')[:2]) + ".0.0/16"
GUCParasStr += "-h \"host replication all %s sha256\" " % ip_segment
if (GUCParasStr != ""):
GUCParasStrList.append(GUCParasStr)
for parasStr in GUCParasStrList:
self.doGUCConfig("set", parasStr, True)
self.doGUCConfig("set", parasStr, True, try_reload=try_reload)
"""
Desc:

View File

@@ -19,6 +19,8 @@ import sys
import os
import subprocess
import re
import pwd
import json
sys.path.append(sys.path[0] + "/../../../")
from gspylib.common.ErrorCode import ErrorCode
@@ -28,6 +30,7 @@ from gspylib.common.Common import DefaultValue
from base_utils.os.cmd_util import CmdUtil
from base_utils.os.env_util import EnvUtil
from base_utils.os.file_util import FileUtil
from base_utils.security.security_checker import SecurityChecker
from domain_utils.cluster_os.cluster_user import ClusterUser
MAX_PARA_NUMBER = 1000
@@ -403,7 +406,7 @@ class Kernel(BaseComponent):
return tempCommonDict
def doGUCConfig(self, action, GUCParasStr, isHab=False):
def doGUCConfig(self, action, GUCParasStr, isHab=False, try_reload=False):
"""
"""
# check instance data directory
@@ -424,6 +427,16 @@ class Kernel(BaseComponent):
if (not os.path.exists(configFile)):
raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % configFile)
if try_reload:
cmd_reload = "%s/gs_guc %s -D %s %s " % (self.binPath, 'reload',
self.instInfo.datadir, GUCParasStr)
status, output = CmdUtil.retryGetstatusoutput(cmd_reload, 3, 3)
if status != 0:
self.logger.log("Failed to reload guc params with commander:[%s]" % cmd_reload)
else:
self.logger.log("Successfully to reload guc params with commander:[%s]"
% cmd_reload)
return
cmd = "%s/gs_guc %s -D %s %s " % (self.binPath, action,
self.instInfo.datadir, GUCParasStr)
self.logger.debug("gs_guc command is: {0}".format(cmd))
@@ -456,6 +469,39 @@ class Kernel(BaseComponent):
for parasStr in guc_paras_str_list:
self.doGUCConfig(setMode, parasStr, False)
def get_streaming_relate_dn_ips(self, instance):
"""
function: For a streaming disaster cluster, obtain the data IPs of the
remote DNs that belong to the same shard as the given instance.
input: instance
:return: DN data ip list
"""
self.logger.debug("Start parse cluster_conf_record.")
pg_host = EnvUtil.getEnv("PGHOST")
config_param_file = os.path.realpath(
os.path.join(pg_host, "streaming_cabin", "cluster_conf_record"))
if not os.path.isfile(config_param_file):
self.logger.debug("Not found streaming cluster config file.")
return []
with open(config_param_file, "r") as fp_read:
param_dict = json.load(fp_read)
dn_ip_list = []
remote_cluster_conf = param_dict.get("remoteClusterConf")
shards = remote_cluster_conf.get('shards')
for shard in shards:
for node_info in shard:
shard_num = node_info.get("shardNum", '1')
node_ip = node_info.get("dataIp")
SecurityChecker.check_ip_valid("check ip from cluster_conf_record", node_ip)
if not all([shard_num, node_ip]):
raise Exception(ErrorCode.GAUSS_516['GAUSS_51632']
% "obtain remote conf from cluster_conf_record")
if str(shard_num) == str(instance.mirrorId):
dn_ip_list.append(node_ip)
self.logger.debug("Got streaming cluster pg_hba ips %s." % dn_ip_list)
return dn_ip_list
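# Illustrative data, not from this patch: with a remoteClusterConf such as
#   {"shards": [[{"shardNum": "1", "dataIp": "192.168.2.1"}],
#               [{"shardNum": "2", "dataIp": "192.168.2.2"}]]}
# a local DN whose mirrorId is 2 collects ["192.168.2.2"]; only peers of the
# same shard are later opened up in pg_hba for replication.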
def removeIpInfoOnPghbaConfig(self, ipAddressList):
"""
"""

View File

@@ -76,7 +76,7 @@ class CheckperfImplOLAP(CheckperfImpl):
dnInst.instanceId)
if (instStatus is not None and
instStatus.isInstanceHealthy() and
instStatus.status == "Primary"):
instStatus.status in ["Primary"]):
normalDNList.append(dnInst)
if (len(normalDNList) == 0):
@@ -1791,7 +1791,6 @@ class CheckperfImplOLAP(CheckperfImpl):
pmk_last_collect_start_time, last_snapshot_id) = \
self.getMetaData(hostname, port)
self.deleteExpiredSnapShots(hostname, port)
# collect pmk stat
self.collectPMKData(pmk_curr_collect_start_time,
pmk_last_collect_start_time,
@@ -1825,8 +1824,8 @@ class CheckperfImplOLAP(CheckperfImpl):
self.handleNodeStat()
# insert the node stat of all hosts into the cluster
self.insertNodeStat(hostname, port,
pmk_curr_collect_start_time,
pmk_last_collect_start_time, last_snapshot_id)
pmk_curr_collect_start_time,
pmk_last_collect_start_time, last_snapshot_id)
# display pmk stat
showDetail = ""

View File

@@ -370,6 +370,10 @@ class OmImplOLAP(OmImpl):
self.logger.log(
"No need to generate dynamic configuration file for one node.")
return
if DefaultValue.cm_exist_and_is_disaster_cluster(self.context.clusterInfo, self.logger):
self.logger.log(
"Streaming disaster cluster do not need to generate dynamic configuration.")
return
self.logger.log("Generating dynamic configuration file for all nodes.")
hostname = NetUtil.GetHostIpOrName()
sshtool = SshTool(self.context.clusterInfo.getClusterNodeNames())

View File

@@ -0,0 +1,344 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : params_handler.py is a utility for parsing and verifying streaming
# disaster recovery params.
#############################################################################
import os
import sys
import json
import optparse
import getpass
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
from gspylib.common.DbClusterInfo import dbClusterInfo
from gspylib.common.ErrorCode import ErrorCode
from base_utils.security.security_checker import SecurityChecker, ValidationError
from domain_utils.cluster_file.version_info import VersionInfo
def check_streaming_start_mode(mode):
"""
Check start mode
"""
if mode not in ["primary", "disaster_standby"]:
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50011"] % ('-m', mode))
def check_xml_file(file):
"""
Check xml file param
"""
if not file:
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50001'] % 'X')
SecurityChecker.check_is_string('xml file path', file)
if not os.path.isfile(file):
raise ValidationError(ErrorCode.GAUSS_502["GAUSS_50201"] % file)
def check_hadr_user(value):
"""
Check disaster user
"""
description = "disaster username"
SecurityChecker.check_db_user(description, value)
def check_hadr_pwd(value):
"""
Check disaster user password
"""
description = "disaster user password"
# check_db_password will be used in cloud scene
SecurityChecker.check_db_user(description, value)
def check_wait_timeout(value):
"""
Check wait timeout
"""
description = "wait timeout"
SecurityChecker.check_is_digit(description, value)
def check_local_cluster_conf(value):
"""
Check local cluster conf
"""
SecurityChecker.check_is_dict("localClusterConf", value)
port = value.get('port')
SecurityChecker.check_port_valid('port of localClusterConf', port)
shards = value.get('shards')
SecurityChecker.check_is_list('shards of localClusterConf', shards)
for shard in shards:
for node in shard:
ip = node.get('ip')
data_ip = node.get('dataIp')
SecurityChecker.check_ip_valid('ip of localClusterConf', ip)
SecurityChecker.check_ip_valid('dataIp of localClusterConf', data_ip)
def check_remote_cluster_conf(value):
"""
Check remote cluster conf
"""
SecurityChecker.check_is_dict("remoteClusterConf", value)
port = value.get('port')
SecurityChecker.check_port_valid('port of remoteClusterConf', port)
shards = value.get('shards')
SecurityChecker.check_is_list('shards of remoteClusterConf', shards)
for shard in shards:
for node in shard:
ip = node.get('ip')
data_ip = node.get('dataIp')
SecurityChecker.check_ip_valid('ip of remoteClusterConf', ip)
SecurityChecker.check_ip_valid('dataIp of remoteClusterConf', data_ip)
STREAMING_PARAMS_FOR_MODULE = {
"start": {
"mode": check_streaming_start_mode,
"xml_path": check_xml_file,
"hadrUserName": check_hadr_user,
"hadrUserPassword": check_hadr_pwd,
"waitingTimeout": check_wait_timeout,
"localClusterConf": check_local_cluster_conf,
"remoteClusterConf": check_remote_cluster_conf
},
"stop": {
"xml_path": check_xml_file,
"waitingTimeout": check_wait_timeout,
"localClusterConf": check_local_cluster_conf,
"remoteClusterConf": check_remote_cluster_conf
},
"switchover": {
"mode": check_streaming_start_mode,
"waitingTimeout": check_wait_timeout
},
"failover": {
"waitingTimeout": check_wait_timeout,
},
"query": {}
}
HELP_MSG = """
gs_sdr is a utility for streaming disaster recovery operations.
Usage:
gs_sdr -? | --help
gs_sdr -V | --version
gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--json JSONFILE] [--time-out=SECS] [-l LOGFILE]
gs_sdr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE]
gs_sdr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE]
gs_sdr -t failover [-l LOGFILE]
gs_sdr -t query [-l LOGFILE]
General options:
-?, --help Show help information for this utility,
and exit the command line mode.
-V, --version Show version information.
-t Task name, it could be:
"start", "stop", "switchover", "failover", "query".
-m Option mode, it could be:
"primary", "disaster_standby".
-U Disaster recovery user name.
-W Disaster recovery user password.
-X Path of the XML configuration file.
-l Path of log file.
--json Path of params file for streaming options.
--time-out=SECS Maximum waiting time for the main standby to connect to the primary dn,
default value is 1200s.
"""
class ParamsHandler(object):
"""
Parse and check params.
"""
def __init__(self, logger, trace_id):
self.params = None
self.logger = logger
self.trace_id = trace_id
@staticmethod
def option_parser():
"""
parsing parameters
:return: param obj
"""
parser = optparse.OptionParser(conflict_handler='resolve')
parser.disable_interspersed_args()
parser.epilog = "Example: gs_sdr -t " \
"start -m primary -X clusterConfig.xml " \
"--time-out=1200."
parser.add_option('-V', "--version", dest='version_info', action='store_true',
help='-V|--version show version info.')
parser.add_option('-?', "--help", dest='help_info', action='store_true',
help='-?|--help show help message and exit.')
parser.add_option('-t', dest='task', type='string',
help='Task name. It could be "start", "stop", '
'"switchover", "failover", "query"')
parser.add_option('-m', dest='mode', type='string',
help='Cluster run mode. It could be ["primary", "disaster_standby"].')
parser.add_option('-U', dest='hadrusername', type='string',
help='hadr user name.')
parser.add_option('-W', dest='hadruserpasswd', type='string',
help='hadr user password.')
parser.add_option('-X', dest='xml_path', type='string',
help='Cluster config xml path.')
parser.add_option('--json', dest='json_path', type='string',
help='Config json file of streaming options')
parser.add_option('--time-out=', dest='timeout', default="1200", type='string',
help='time out.')
parser.add_option("-l", dest='logFile', type='string',
help='Path of log file.')
return parser
def __print_usage(self):
"""
Print help message
"""
if self.params.help_info:
print(HELP_MSG)
sys.exit(0)
def __print_version_info(self):
"""
Print version info
"""
if self.params.version_info:
print("%s %s" % (sys.argv[0].split("/")[-1],
VersionInfo.COMMON_VERSION))
sys.exit(0)
def __cluster_conf_parser(self, file_path):
"""
Parse params in json file
"""
if self.params.json_path:
if not os.path.isfile(file_path):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010']
% '--json' + " Json file does not exist.")
with open(file_path, 'r') as read_fp:
param_dict = json.load(read_fp)
for key, value in param_dict.items():
if key not in StreamingConstants.STREAMING_JSON_PARAMS[self.params.task]:
continue
setattr(self.params, key, value)
return
cluster_info = dbClusterInfo()
if not self.params.xml_path or not os.path.isfile(self.params.xml_path):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010']
% '-X' + " Neither the XML file nor the json file exists.")
cluster_info.initFromXml(self.params.xml_path)
remote_cluster_conf = dict()
remote_cluster_conf.setdefault("port", cluster_info.remote_dn_base_port)
remote_cluster_conf.setdefault("shards", cluster_info.remote_stream_ip_map)
setattr(self.params, "remoteClusterConf", remote_cluster_conf)
self.logger.debug("Remote stream cluster conf: %s." % str(remote_cluster_conf))
local_cluster_conf = dict()
local_cluster_conf.setdefault("port", cluster_info.local_dn_base_port)
local_cluster_conf.setdefault("shards", cluster_info.local_stream_ip_map)
setattr(self.params, "localClusterConf", local_cluster_conf)
self.logger.debug("Local stream cluster conf: %s." % str(local_cluster_conf))
if not remote_cluster_conf["shards"] or len(remote_cluster_conf["shards"])\
!= len(local_cluster_conf["shards"]):
raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50026'] % "streaming DR")
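# A hypothetical --json params file for `gs_sdr -t start`; only the keys
# whitelisted in STREAMING_JSON_PARAMS["start"] are copied onto the params
# object by the loop above:
# {
#     "localClusterConf":  {"port": 26000,
#                           "shards": [[{"ip": "10.1.1.1",
#                                        "dataIp": "192.168.1.1"}]]},
#     "remoteClusterConf": {"port": 26000,
#                           "shards": [[{"ip": "10.2.2.2",
#                                        "dataIp": "192.168.2.2"}]]}
# }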
def __init_default_params(self):
"""
Init params if need default value
"""
if not self.params.timeout.isdigit():
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % "--time-out")
self.params.waitingTimeout = int(self.params.timeout)
def __parse_args(self):
"""
Parse arguments
"""
parser = ParamsHandler.option_parser()
self.params, _ = parser.parse_args()
self.__print_usage()
self.__print_version_info()
if not hasattr(self.params, 'task') or not self.params.task:
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50001"] % 't' + ".")
if self.params.task not in StreamingConstants.STREAMING_JSON_PARAMS.keys():
raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % 't')
# parse arguments in json/xml file
if StreamingConstants.STREAMING_JSON_PARAMS[self.params.task]:
self.__cluster_conf_parser(self.params.json_path)
def __reload_hadr_user_info(self):
"""
Input hadr user info
"""
if self.params.task not in ["start"]:
return
if self.params.hadrusername and self.params.hadruserpasswd:
self.params.hadrUserName = self.params.hadrusername
self.params.hadrUserPassword = self.params.hadruserpasswd
del self.params.hadruserpasswd
return
user_name = ""
if not self.params.hadrusername:
user_name = input("Please enter disaster user name:")
self.params.hadrUserName = user_name if user_name else self.params.hadrusername
if self.params.hadruserpasswd:
self.params.hadrUserPassword = self.params.hadruserpasswd
del self.params.hadruserpasswd
return
for i in range(3):
user_passwd = getpass.getpass("Please enter password for [%s]:" %
self.params.hadrUserName)
user_passwd_check = getpass.getpass("Please enter the password again for [%s]:"
% self.params.hadrUserName)
if user_passwd == user_passwd_check:
break
if i == 2:
self.logger.logExit("The two passwords entered for too many "
"times are inconsistent. Authentication failed.")
self.logger.error(
ErrorCode.GAUSS_503["GAUSS_50306"] % user_name
+ "The two passwords are different, please enter password again.")
self.params.hadrUserPassword = user_passwd
del user_passwd
del user_passwd_check
self.logger.debug("The hadr user information is successfully loaded.")
def get_valid_params(self):
"""
Check params
"""
try:
self.__parse_args()
self.logger.log(StreamingConstants.LOG_REMARK)
self.logger.log('Streaming disaster recovery ' + self.params.task + ' ' + self.trace_id)
self.logger.log(StreamingConstants.LOG_REMARK)
self.__init_default_params()
self.__reload_hadr_user_info()
for param_name, validate in STREAMING_PARAMS_FOR_MODULE[self.params.task].items():
check_value = getattr(self.params, param_name)
if self.params.task == "stop":
if param_name == "xml_path" and not check_value:
check_value = getattr(self.params, 'json_path')
validate(check_value)
except ValidationError as error:
self.logger.logExit(str(error))
return self.params

File diff suppressed because it is too large

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_constants.py is utility for defining constants
# of streaming disaster recovery.
#############################################################################
class StreamingConstants:
# streaming files
STREAMING_LOG_FILE = "gs_sdr.log"
STREAMING_FILES_DIR = 'streaming_cabin'
STREAMING_CLUSTER_STATUS_TMP_FILE = "cluster_state_tmp"
WAL_KEEP_SEGMENTS = ".wal_keep_segments_record"
STREAMING_CLUSTER_CONF_RECORD = "cluster_conf_record"
GS_SECURE_FILES = "gs_secure_files"
HADR_KEY_CIPHER = "hadr.key.cipher"
HADR_KEY_RAND = "hadr.key.rand"
STREAM_SWITCHOVER_STATE = ".switchover_cluster_state"
MAX_TERM_RECORD = ".max_term_record"
PROCESS_LOCK_FILE = 'streaming_lock_'
STREAMING_CONFIG_XML = "streaming_config.xml"
GUC_BACKUP_FILE = ".streaming_guc_backup"
CLUSTER_USER_RECORD = ".cluster_user_record"
ACTION_START = "start"
ACTION_SWITCHOVER = "switchover"
ACTION_FAILOVER = "failover"
ACTION_ESTABLISH = "establish"
# streaming query temp file
HADR_CLUSTER_STAT = ".hadr_cluster_stat"
HADR_FAILOVER_STAT = ".hadr_failover_stat"
HADR_SWICHOVER_STAT = ".hadr_switchover_stat"
HADR_ESTABLISH_STAT = ".hadr_establish_stat"
STREAM_DISTRIBUTE_ACTION = "distribute_stream_failover"
# GUC CHANGE MAP
GUC_CHANGE_MAP = {"most_available_sync": "on", "synchronous_commit": "on"}
# params in json file for each module
STREAMING_JSON_PARAMS = {
"start": ["localClusterConf", "remoteClusterConf"],
"stop": ["localClusterConf", "remoteClusterConf"],
"switchover": [],
"failover": [],
"query": []
}
# step file of each module
STREAMING_STEP_FILES = {
"start_primary": ".streaming_start_primary.step",
"start_standby": ".streaming_start_standby.step",
"stop": ".streaming_stop.step",
"switchover_primary": ".streaming_switchover_primary.step",
"switchover_standby": ".streaming_switchover_standby.step",
"failover": ".streaming_failover.step",
"query": ".streaming_query.step",
}
# task need check process is exist
TASK_EXIST_CHECK = ["start", "stop", "switchover", "failover"]
# default values
MAX_WAL_KEEP_SEGMENTS = 16384
MAX_REPLICATION_NUMS = 8
MAX_BUILD_TIMEOUT = 1209600
STANDBY_START_TIMEOUT = 3600 * 24 * 7
CHECK_PROCESS_WAIT_TIME = 3
# backup open key
BACKUP_OPEN = "/%s/CMServer/backup_open"
# log remark
LOG_REMARK = "-" * 80

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_disaster_recovery_start.py is utility for creating
# relationship between primary cluster and standby cluster.
import os
from base_utils.security.sensitive_mask import SensitiveMask
from gspylib.common.ErrorCode import ErrorCode
from gspylib.common.Common import DefaultValue, ClusterCommand
from impl.streaming_disaster_recovery.streaming_base import StreamingBase
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
class StreamingStartHandler(StreamingBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _first_step_for_streaming_start(self, step):
"""
First step for streaming start
"""
if step >= 2:
return
self.logger.debug("Start first step of streaming start.")
self.create_streaming_dir(self.streaming_file_dir)
self.check_action_and_mode()
self.init_cluster_status()
def _second_step_for_streaming_start(self, step):
"""
Second step for streaming start
"""
if step >= 2:
return
self.logger.debug("Start second step of streaming start.")
self.check_cluster_status(status_allowed=['Normal'])
self.check_cluster_is_common()
cm_exist = DefaultValue.check_is_cm_cluster(self.logger)
if not cm_exist:
self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] %
"check cm_ctl is available for current cluster")
self.check_is_under_upgrade()
self.check_dn_instance_params()
self.write_streaming_step("2_check_cluster_step")
def _third_step_for_streaming_start(self, step):
"""
Third step for streaming start
"""
if step >= 3:
return
self.logger.debug("Start third step of streaming start.")
self.drop_replication_slot_on_dr_cluster(only_mode="disaster_standby")
self.prepare_gs_secure_files(only_mode='primary')
self.build_and_distribute_key_files(only_mode='disaster_standby')
self.get_default_wal_keep_segments(only_mode='primary')
self.write_streaming_step("3_set_wal_segments_step")
def drop_replication_slot_on_dr_cluster(self, only_mode=None):
"""
Drop replication slot on dr cluster
"""
if only_mode and self.params.mode != only_mode:
self.logger.debug("Drop replication slot opts not for mode:%s." % self.params.mode)
return
sql_check = "select slot_name from pg_get_replication_slots() where slot_type='logical'"
primary_dns = DefaultValue.get_primary_dn_instance_id("Primary", ignore=True)
if not primary_dns:
return
primary_insts = [inst for node in self.cluster_info.dbNodes
for inst in node.datanodes if str(inst.instanceId) in primary_dns]
dn_inst = primary_insts[0]
self.logger.debug("Start drop node %s [%s] slots" % (dn_inst.hostname, dn_inst.instanceId))
status, output = ClusterCommand.remoteSQLCommand(
sql_check, self.user, dn_inst.hostname, dn_inst.port)
self.logger.debug("Get %s all replication slots, status=%d, output: %s." %
(dn_inst.instanceId, status, SensitiveMask.mask_pwd(output)))
if status == 0 and output.strip():
drop_slots = output.strip().split('\n')
for slot in drop_slots:
self.logger.debug("Starting drop node %s %s" % (dn_inst.instanceId, slot.strip()))
sql = "select * from pg_drop_replication_slot('%s');" % slot.strip()
status_dr, output_dr = ClusterCommand.remoteSQLCommand(
sql, self.user, dn_inst.hostname, dn_inst.port)
if status_dr != 0:
self.logger.debug("Failed to remove node %s %s with error: %s" % (
dn_inst.hostname, slot.strip(), SensitiveMask.mask_pwd(output_dr)))
self.logger.debug(
"Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip()))
def _fourth_step_for_streaming_start(self, step):
"""
Fourth step for streaming start
"""
if step >= 4:
return
self.logger.debug("Start fourth step of streaming start.")
self.set_wal_keep_segments(
"reload", StreamingConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary')
self.write_streaming_step("4_set_wal_segments_step")
def _fifth_step_for_streaming_start(self, step):
"""
Fifth step for streaming start
"""
if step >= 5:
return
self.logger.debug("Start fifth step of streaming start.")
self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary')
self.set_data_in_dcc(self.backup_open_key, "2", only_mode='disaster_standby')
self.stop_cluster_by_node(only_mode='disaster_standby')
self.write_streaming_step("5_set_wal_segments_step")
def common_step_for_streaming_start(self):
"""
Common step for streaming start between step 1 and 2
"""
self.logger.debug("Start common config step of streaming start.")
self.distribute_cluster_conf()
self.update_streaming_pg_hba()
self.config_streaming_repl_info()
def _sixth_step_for_streaming_start(self, step):
"""
Sixth step for streaming start
"""
if step >= 6:
return
self.logger.debug("Start sixth step of streaming start.")
self.set_cmserver_guc("backup_open", "2", "set", only_mode='disaster_standby')
self.set_cmagent_guc("agent_backup_open", "2", "set", only_mode='disaster_standby')
self.write_streaming_step("6_set_guc_step")
def _seventh_step_for_streaming_start(self, step):
"""
Seventh step for streaming start
"""
if step >= 7:
return
self.logger.debug("Start seventh step of streaming start.")
self.update_streaming_info("cluster", "restore", only_mode='disaster_standby')
try:
self.build_dn_instance(only_mode='disaster_standby')
except Exception as error:
self.update_streaming_info("cluster", "restore_fail", only_mode='disaster_standby')
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error)
self.write_streaming_step("7_build_dn_instance_step")
def _eighth_step_for_streaming_start(self, step):
"""
Eighth step for streaming start
"""
if step >= 8:
return
self.logger.debug("Start eighth step of streaming start.")
self.start_cluster(cm_timeout=StreamingConstants.STANDBY_START_TIMEOUT,
only_mode='disaster_standby')
self.update_streaming_info("cluster", "full_backup", only_mode='primary')
try:
self.wait_main_standby_connection(only_mode='primary')
except Exception as error:
self.update_streaming_info("cluster", "backup_fail", only_mode='primary')
raise Exception(str(error))
ret = self.check_cluster_status(status_allowed=['Normal'],
only_check=True, check_current=True)
query_status = "recovery" if ret else "recovery_fail"
self.update_streaming_info("cluster", query_status, only_mode='disaster_standby')
self.update_streaming_info("cluster", "archive", only_mode='primary')
self.write_streaming_step("8_start_cluster_step")
def _ninth_step_for_streaming_start(self, step):
"""
ninth step for streaming start
"""
if step >= 9:
return
self.logger.debug("Start ninth step of streaming start.")
self.restore_wal_keep_segments(only_mode='primary')
self.clean_gs_secure_dir()
self.clean_step_file()
def _check_and_refresh_disaster_user_permission(self):
"""check and refresh disaster user permission"""
if self.params.mode != "primary":
return
self.check_hadr_user(only_mode='primary')
self.check_hadr_pwd(only_mode='primary')
self.logger.debug("Encrypt hadr user info to database not "
"for mode:%s." % self.params.mode)
hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher")
hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand")
if not os.path.isfile(hadr_cipher_path) or not os.path.isfile(hadr_rand_path):
self.hadr_key_generator('hadr')
user_info = DefaultValue.obtain_hadr_user_encrypt_str(self.cluster_info, self.user,
self.logger, False, True)
if user_info:
self.clean_global_config()
pass_str = self.encrypt_hadr_user_info(
'hadr', self.params.hadrUserName, self.params.hadrUserPassword)
self.keep_hadr_user_info(pass_str)
def run(self):
self.logger.log("Start create streaming disaster relationship.")
step = self.query_streaming_step()
self._first_step_for_streaming_start(step)
self.parse_cluster_status()
self._check_and_refresh_disaster_user_permission()
self._second_step_for_streaming_start(step)
self.common_step_for_streaming_start()
self._third_step_for_streaming_start(step)
self._fourth_step_for_streaming_start(step)
self._fifth_step_for_streaming_start(step)
self._sixth_step_for_streaming_start(step)
self._seventh_step_for_streaming_start(step)
self._eighth_step_for_streaming_start(step)
self._ninth_step_for_streaming_start(step)
self.logger.log("Successfully do streaming disaster recovery start.")

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_disaster_recovery_failover.py is utility for
# standby cluster failover to primary cluster.
from gspylib.common.Common import DefaultValue
from gspylib.common.ErrorCode import ErrorCode
from impl.streaming_disaster_recovery.streaming_base import StreamingBase
class StreamingFailoverHandler(StreamingBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def run(self):
self.logger.log("Start streaming disaster failover.")
self.check_action_and_mode()
step = self.check_streaming_failover_workable(check_type_step=3, check_status_step=0)
self.check_is_under_upgrade()
self.init_cluster_conf()
try:
self.streaming_failover_single_inst(step)
self.update_streaming_info("cluster", "normal")
self.clean_step_file()
except Exception as error:
self.update_streaming_info("cluster", "promote_fail")
raise Exception(
ErrorCode.GAUSS_516["GAUSS_51632"] % "centralize failover" + "Error:%s" % error)
finally:
self.remove_cluster_maintance_file()
self.clean_streaming_dir()
self.logger.log("Successfully do streaming disaster recovery failover.")
def check_streaming_failover_workable(self, check_type_step=0, check_status_step=0):
"""
Check streaming failover is workable.
"""
self.logger.debug("Streaming disaster distribute cluster failover...")
stream_disaster_step = self.query_streaming_step()
if not DefaultValue.is_disaster_cluster(self.cluster_info) \
and stream_disaster_step < check_type_step:
self.logger.debug("The primary dn exist, do nothing except record the result file.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"streaming disaster cluster failover, Because the primary cluster "
"does not support failover")
cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL,
DefaultValue.CLUSTER_STATUS_DEGRADED]
if stream_disaster_step < check_status_step:
self.init_cluster_status()
self.parse_cluster_status()
if stream_disaster_step < check_status_step:
self.check_cluster_status(cluster_normal_status)
return stream_disaster_step

View File

@@ -0,0 +1,168 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_disaster_recovery_query.py is utility for
# query streaming disaster recovery condition.
import os
from base_utils.security.sensitive_mask import SensitiveMask
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
from gspylib.common.Common import ClusterCommand
from impl.streaming_disaster_recovery.streaming_base import StreamingBase
class StreamingQueryHandler(StreamingBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def get_streaming_cluster_query_value(self, file_name):
"""
Query infos from files.
"""
file_path = os.path.realpath(os.path.join(self.streaming_file_dir, file_name))
if not os.path.isfile(file_path) and file_name in [StreamingConstants.HADR_CLUSTER_STAT]:
return "normal"
if not os.path.isfile(file_path):
return "0%"
with open(file_path, 'r') as read_file:
value = read_file.read().strip()
return value
def check_archive(self, former_status, cluster_status):
"""
Check for archive.
"""
self.logger.log("Start check archive.")
if former_status.strip() not in ["archive", "archive_fail"]:
self.logger.debug("Ignore for status:%s" % former_status)
return
archive_status = "archive_fail"
if cluster_status.lower() not in ["normal", "degraded"]:
self.logger.debug("Cluster status:%s,archive fail." % cluster_status)
return archive_status
if self.main_standby_ids or (not self.primary_dn_ids):
self.logger.debug("Ignore update archive for disaster_standby cluster.")
return archive_status
sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where sync_state" \
"='Async' and peer_role='Standby' and peer_state='Normal';"
dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes
if inst.instanceId in self.primary_dn_ids]
self.logger.debug("Check archive with cmd:%s." % sql_check)
if dn_instances:
status, output = ClusterCommand.remoteSQLCommand(
sql_check, self.user, dn_instances[0].hostname,
dn_instances[0].port)
if status == 0 and output and output.strip():
archive_status = "archive"
self.logger.debug("Successfully check archive, results:%s." %
SensitiveMask.mask_pwd(output))
return archive_status
elif status == 0 and not output.strip():
self.logger.debug("Check archive fail.")
return archive_status
else:
self.logger.debug("Check archive status:%s, output:%s."
% (status, output))
self.logger.debug("Check archive result:%s." % archive_status)
return archive_status
def check_recovery(self, former_status, cluster_status="normal"):
"""
Check for recovery.
"""
self.logger.log("Start check recovery.")
if former_status.strip() not in ["recovery", "recovery_fail"]:
self.logger.debug("Ignore for check recovery status:%s" % former_status)
return
recovery_status = "recovery_fail"
if cluster_status.lower() not in ["normal", "degraded"]:
self.logger.debug("Cluster status:%s,recovery fail." % cluster_status)
return recovery_status
if self.primary_dn_ids or (not self.main_standby_ids):
self.logger.debug("Ignore update recovery for primary cluster.")
return recovery_status
return "recovery"
def get_max_rpo_rto(self):
"""
Get max rpo and rto.
"""
self.logger.log("Start check RPO & RTO.")
rpo_sql = "SELECT current_rpo FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;"
rto_sql = "SELECT current_rto FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;"
rto_rpo_sql = rpo_sql + rto_sql
if not self.primary_dn_ids:
self.logger.debug("Not found primary dn in cluster, cluster status:%s, "
"main standby:%s." % (self.cluster_status, self.main_standby_ids))
return "", ""
log_info = "Execute sql [%s] on node [%s: %s] with result:%s"
dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes
if inst.instanceId in self.primary_dn_ids]
if dn_instances:
status, output = ClusterCommand.remoteSQLCommand(
rto_rpo_sql, self.user, dn_instances[0].hostname, dn_instances[0].port)
if status == 0 and output:
try:
rets = output.strip().split('\n')
length = len(rets) // 2
rpo_list = [int(i) for i in rets[:length]]
rto_list = [int(j) for j in rets[length:]]
max_rpo, max_rto = str(max(rpo_list)), str(max(rto_list))
except ValueError:
return "", ""
self.logger.debug("Successfully get max rpo:%s, rto:%s, output:%s"
% (max_rpo, max_rto, ','.join(output.split('\n'))))
return max_rpo, max_rto
else:
self.logger.debug(log_info % (rto_rpo_sql, dn_instances[0].hostname,
dn_instances[0].port, ','.join(output.split('\n'))))
return "", ""
def run(self):
self.logger.log("Start streaming disaster query.")
cluster_info = self.query_cluster_info()
if cluster_info:
self.parse_cluster_status(current_status=cluster_info)
self.check_is_under_upgrade()
check_cluster_stat = self.get_streaming_cluster_query_value(
StreamingConstants.HADR_CLUSTER_STAT)
archive_status = self.check_archive(check_cluster_stat, self.cluster_status)
recovery_status = self.check_recovery(check_cluster_stat, self.cluster_status)
hadr_cluster_stat = archive_status or recovery_status or check_cluster_stat
hadr_failover_stat = self.get_streaming_cluster_query_value(
StreamingConstants.HADR_FAILOVER_STAT)
hadr_switchover_stat = self.get_streaming_cluster_query_value(
StreamingConstants.HADR_SWICHOVER_STAT)
if hadr_cluster_stat != "promote":
hadr_failover_stat = ""
if hadr_cluster_stat != "switchover":
hadr_switchover_stat = ""
self.logger.debug("Start check max rpo and rto.")
max_rpo, max_rto = self.get_max_rpo_rto()
self.logger.debug("Finished check max rpo and rto.")
values = dict()
values["hadr_cluster_stat"] = hadr_cluster_stat
values["hadr_failover_stat"] = hadr_failover_stat
values["hadr_switchover_stat"] = hadr_switchover_stat
values["RPO"] = max_rpo
values["RTO"] = max_rto
self.logger.log("Successfully executed streaming disaster "
"recovery query, result:\n%s" % values)

View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_disaster_recovery_stop.py is a utility for stopping
# streaming disaster recovery on primary cluster.
from impl.streaming_disaster_recovery.streaming_base import StreamingBase
class StreamingStopHandler(StreamingBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _first_step_for_streaming_stop(self, step):
"""
First step for streaming stop
"""
if step >= 2:
return
self.logger.debug("Start first step of streaming stop.")
self.init_cluster_status()
self.check_action_and_mode()
def _second_step_for_streaming_stop(self, step):
"""
Second step for streaming stop
"""
if step >= 2:
return
self.logger.debug("Start second step of streaming start.")
self.check_cluster_status(status_allowed=['Normal'])
self.check_cluster_type(allowed_type='primary')
self.check_is_under_upgrade()
self.write_streaming_step("2_check_cluster_step")
def _third_step_for_streaming_stop(self, step):
"""
Third step for streaming stop
"""
if step >= 3:
return
self.logger.debug("Start third step of streaming stop.")
self.remove_all_stream_repl_infos(guc_mode="reload")
self.remove_streaming_cluster_file()
self.write_streaming_step("3_remove_config_step")
def _fourth_step_for_streaming_stop(self, step):
"""
Fourth step for streaming stop
"""
if step >= 4:
return
self.logger.debug("Start fourth step of streaming stop.")
self.remove_streaming_pg_hba()
self.restore_guc_params()
self.write_streaming_step("4_remove_pg_hba_step")
def _fifth_step_for_streaming_stop(self, step):
"""
Fifth step for streaming stop
"""
if step >= 5:
return
self.logger.debug("Start fifth step of streaming start.")
self.streaming_clean_replication_slot()
self.write_streaming_step("5_update_config_step")
def _sixth_step_for_streaming_stop(self, step):
"""
Sixth step for streaming stop
"""
if step >= 6:
return
self.logger.debug("Start sixth step of streaming stop.")
self.check_cluster_status(['Normal'])
self.clean_global_config()
self.update_streaming_info("cluster", "normal")
self.clean_streaming_dir()
def run(self):
self.logger.log("Start remove streaming disaster relationship.")
step = self.query_streaming_step()
self._first_step_for_streaming_stop(step)
self.parse_cluster_status()
self._second_step_for_streaming_stop(step)
self._third_step_for_streaming_stop(step)
self._fourth_step_for_streaming_stop(step)
self._fifth_step_for_streaming_stop(step)
self._sixth_step_for_streaming_stop(step)
self.logger.log("Successfully do streaming disaster recovery stop.")

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
# http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description : streaming_disaster_recovery_switchover.py is a utility for
# changing role between primary cluster and standby cluster.
import os
import time
from datetime import datetime, timedelta
from base_utils.os.cmd_util import CmdUtil
from base_utils.os.env_util import EnvUtil
from gspylib.common.Common import DefaultValue, ClusterCommand, ClusterInstanceConfig
from gspylib.common.DbClusterStatus import DbClusterStatus
from gspylib.common.ErrorCode import ErrorCode
from gspylib.threads.parallelTool import parallelTool
from impl.streaming_disaster_recovery.streaming_base import StreamingBase
from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants
class StreamingSwitchoverHandler(StreamingBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def run(self):
"""
streaming disaster recovery switchover
"""
self.logger.log("Start streaming disaster switchover.")
self.check_action_and_mode()
self.check_switchover_workable()
self.init_cluster_conf()
self.check_dn_instance_params()
self.check_is_under_upgrade()
try:
self.streaming_switchover_single_inst()
self.clean_step_file()
except Exception as error:
if self.params.mode == "primary":
self.update_streaming_info("cluster", "promote_fail")
raise Exception(
ErrorCode.GAUSS_516["GAUSS_51632"] % "switchover" + "Error:%s" % str(error))
finally:
self.remove_cluster_maintance_file_for_switchover()
self.remove_cluster_maintance_file()
self.logger.log("Successfully do streaming disaster recovery switchover.")
def streaming_switchover_single_inst(self):
"""
streaming disaster recovery switchover for single_inst cluster
disaster_standby: expect primary cluster becomes standby
primary: expect standby cluster becomes primary
"""
self.create_cluster_maintance_file("streaming switchover")
self.update_streaming_info("cluster", StreamingConstants.ACTION_SWITCHOVER)
stream_disaster_step = self.query_streaming_step()
if self.params.mode == "primary":
end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout)
self.logger.log("Waiting for switchover barrier.")
while True:
switchover_barrier_list = self.check_streaming_disaster_switchover_barrier()
if len(switchover_barrier_list) == len(self.normal_dn_ids):
break
if datetime.now() >= end_time:
self.restart_cluster()
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"check switchover_barrier on all main standby dn" +
" Because check timeout: %ss" %
str(self.params.waitingTimeout))
time.sleep(5)
self.streaming_failover_single_inst(stream_disaster_step,
StreamingConstants.ACTION_SWITCHOVER)
else:
self.add_cluster_maintance_file_for_switchover()
try:
if stream_disaster_step < 1:
self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "10%")
self.stop_cluster()
self.start_cluster()
self.streaming_disaster_set_master_cluster_in_switchover()
self.write_streaming_step("1_streaming_disaster_set_master_in_switchover")
if stream_disaster_step < 2:
self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "30%")
ClusterInstanceConfig.set_data_on_dcc(self.cluster_info,
self.logger, self.user,
{self.backup_open_key: "2"})
self.stop_cluster()
self.write_streaming_step("2_stop_cluster_for_switchover")
if stream_disaster_step < 3:
self.set_cmserver_guc("backup_open", "2", "set")
self.set_cmagent_guc("agent_backup_open", "2", "set")
self.write_streaming_step("3_set_backup_open_2_done")
if stream_disaster_step < 4:
self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "50%")
self.remove_cluster_maintance_file_for_switchover()
self.remove_cluster_maintance_file()
self.start_cluster()
self.write_streaming_step("4_start_cluster_done")
if stream_disaster_step < 5:
self.wait_for_normal(timeout=self.params.waitingTimeout,
streaming_switchover="streaming_switchover")
self.streaming_clean_replication_slot()
self.update_streaming_info("cluster", "recovery")
except Exception as error:
self.logger.error("Failed to do streaming disaster cluster switchover, Error:"
" \n%s" % str(error))
rollback_step = self.query_streaming_step()
self.logger.debug("Roll back switchover step:%s" % rollback_step)
self.remove_cluster_maintance_file_for_switchover()
self.remove_cluster_maintance_file()
if rollback_step < 4 or (rollback_step >= 4 and
self.streaming_switchover_roll_back_condition()):
self.streaming_switchover_roll_back(update_query=True)
self.clean_step_file()
raise Exception(error)
self.remove_hadr_switchover_process_file()
def remove_hadr_switchover_process_file(self):
self.logger.debug("Remove hadr switchover process file for switchover.")
process_file = os.path.realpath(os.path.join(self.streaming_file_dir,
".hadr_switchover_stat"))
cmd = "if [ -f {0} ]; then rm -rf {0}; fi".format(process_file)
self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes)
self.logger.debug("Successfully remove switchover process on all connected nodes.")
@staticmethod
def clean_file_on_node(params):
"""
clean file on dest node with path
"""
dest_ip, dest_path, timeout = params
cmd = "source %s && pssh -s -t %s -H %s 'if [ -f %s ]; then rm -f %s; fi'" % (
EnvUtil.getMpprcFile(), timeout, dest_ip, dest_path, dest_path)
status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd)
return status, output, dest_ip
def restart_cluster(self, restart_timeout=DefaultValue.TIMEOUT_CLUSTER_START):
"""
Restart cluster
"""
self.logger.log("Restart cluster.")
static_config = "%s/bin/cluster_static_config" % self.bin_path
cm_ctl_file = "%s/bin/cm_ctl" % self.bin_path
if not os.path.isfile(static_config):
self.logger.debug("Checked file %s lost." % static_config)
if not os.path.isfile(cm_ctl_file):
self.logger.debug("Checked file %s lost." % cm_ctl_file)
stop_cmd = ClusterCommand.getStopCmd(0, timeout=restart_timeout)
status, output = CmdUtil.retryGetstatusoutput(stop_cmd, retry_time=0)
self.logger.debug("Stop cluster result:[%s][%s]." % (status, output))
start_cmd = ClusterCommand.getStartCmd(0, timeout=restart_timeout)
status, output = CmdUtil.retryGetstatusoutput(start_cmd, retry_time=0)
self.logger.debug("Start cluster result:[%s][%s]." % (status, output))
def remove_cluster_maintance_file_for_switchover(self):
"""
function: remove the cluster_maintance file
:return: NA
"""
self.logger.debug("Remove cluster_maintance file for switchover.")
cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home,
"bin/cluster_maintance"))
host_names = \
self.get_all_connection_node_name("remove_cluster_maintance_file_for_switchover")
try:
pscp_params = []
all_instances = [dn_inst for db_node in self.cluster_info.dbNodes
for dn_inst in db_node.datanodes]
if not self.cluster_info.isSingleInstCluster():
all_instances.extend([dn_inst for db_node in self.cluster_info.dbNodes
for dn_inst in db_node.coordinators])
for dn_inst in all_instances:
if dn_inst.hostname in host_names:
pscp_params.append([dn_inst.hostname, os.path.join(
dn_inst.datadir, os.path.basename(cluster_maintance_file)), 10])
if len(pscp_params) > 0:
results = parallelTool.parallelExecute(self.clean_file_on_node, pscp_params)
for ret in results:
if ret[0] != 0:
self.logger.debug("clean maintance file to node[%s] with status[%s], "
"output[%s]" % (ret[-1], ret[0], ret[1]))
except Exception as error:
self.logger.debug(
"Failed to remove cluster_maintance file for switchover with error: %s"
% str(error))
self.logger.debug("Successfully remove %s cluster_maintance file for switchover."
% host_names)
def add_cluster_maintance_file_for_switchover(self):
"""
add cluster_maintance file for streaming disaster switchover to disaster_standby
"""
self.logger.debug("Start add cluster_maintance file for switchover.")
try:
cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home,
"bin/cluster_maintance"))
host_names = \
self.get_all_connection_node_name("add_cluster_maintance_file_for_switchover", True)
pscp_params = []
all_instances = [dn_inst for db_node in self.cluster_info.dbNodes
for dn_inst in db_node.datanodes]
for dn_inst in all_instances:
if dn_inst.hostname in host_names:
pscp_params.append([dn_inst.hostname, cluster_maintance_file,
os.path.join(dn_inst.datadir, "cluster_maintance"), 10])
if len(pscp_params) > 0:
results = parallelTool.parallelExecute(
DefaultValue.distribute_file_to_node, pscp_params)
for ret in results:
if ret[0] != 0:
self.logger.debug("Distribute maintance file for switchover to node[%s] "
"with status[%s], output[%s]" % (ret[-1], ret[0], ret[1]))
except Exception as error:
self.logger.debug("WARNING: Failed add cluster_maintance file for switchover, "
"error:%s." % (str(error)))
self.logger.debug("Successfully add cluster_maintance file for switchover.")
def streaming_disaster_set_master_cluster_in_switchover(self):
"""
streaming disaster set master cluster in switchover
"""
self.logger.debug("Starting set streaming master cluster in switchover.")
primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes
for dn_inst in db_node.datanodes if
dn_inst.instanceId in self.primary_dn_ids]
if not primary_dns:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
% "obtain primary dns for switchover")
if self.streaming_dr_in_switchover(primary_dns):
if self.streaming_dr_service_truncation_check(primary_dns):
self.logger.debug("Successfully set streaming master cluster in switchover.")
def streaming_dr_service_truncation_check(self, primary_dns_list):
"""
streaming dr service truncation check
"""
self.logger.log("Waiting for truncation.")
results = parallelTool.parallelExecute(self.concurrent_check_dr_service_truncation,
primary_dns_list)
return all(results)
def concurrent_check_dr_service_truncation(self, dn_inst):
"""
Wait for the log playback to complete.
"""
self.logger.debug("Starting check node %s shardNum %s instance %s streaming service "
"truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId))
sql_check = "select * from gs_streaming_dr_service_truncation_check();"
end_time = datetime.now() + timedelta(seconds=1200)
succeed = False
while datetime.now() < end_time:
status, output = ClusterCommand.remoteSQLCommand(sql_check, self.user, dn_inst.hostname,
dn_inst.port)
if status == 0 and output and output.strip() == "t":
succeed = True
break
time.sleep(5)
self.logger.debug("Retry truncation check shardNum %s in node %s instance %s." %
(dn_inst.mirrorId, dn_inst.hostname, dn_inst.instanceId))
if not succeed:
self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_check, output))
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"check truncate service before switchover")
self.logger.debug("Successfully check node %s shardNum %s instance %s streaming service "
"truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId))
return True
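# Editor's sketch: the truncation check above is a poll-until-deadline
# loop. A generic helper with the same shape, assuming check() returns
# True once the standby has replayed the log:
import time
from datetime import datetime, timedelta

def wait_until(check, timeout_s=1200, interval_s=5):
    deadline = datetime.now() + timedelta(seconds=timeout_s)
    while datetime.now() < deadline:
        if check():
            return True
        time.sleep(interval_s)       # retry until the deadline passes
    return False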
def streaming_dr_in_switchover(self, primary_dns_list):
"""
set streaming dr in switchover
"""
results = parallelTool.parallelExecute(self.concurrent_set_dr_in_switchover,
primary_dns_list)
return all(results)
def concurrent_set_dr_in_switchover(self, dn_inst):
"""
Switchover requires log truncation first
"""
self.logger.debug("Starting set shardNum %s node %s streaming dr in switchover." %
(dn_inst.mirrorId, dn_inst.hostname))
sql_cmd = "select * from gs_streaming_dr_in_switchover();"
# We need to use the normal port to transmit service truncation,
# not the OM port.
port = int(dn_inst.port) - 1
(status, output) = ClusterCommand.remoteSQLCommand(sql_cmd,
self.user, dn_inst.hostname, str(port))
self.logger.debug("check streaming in switchover, status=%d, output: %s."
% (status, output))
if status != 0 or self.find_error(output) or output.strip() != "t":
self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_cmd, output))
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"generate switchover barrier before switchover")
self.logger.debug("Successfully set shardNum %s node %s streaming dr in switchover." %
(dn_inst.mirrorId, dn_inst.hostname))
return True
def wait_for_normal(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START,
streaming_switchover=None):
"""
function: Wait for the cluster to become Normal or Degraded
input: NA
output: NA
"""
self.logger.debug("Waiting for cluster status being satisfied.")
end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout)
check_status = 0
while True:
time.sleep(10)
if end_time is not None and datetime.now() >= end_time:
check_status = 1
self.logger.debug("Timeout. The cluster is not available.")
break
# View the cluster status
status_file = "/home/%s/gauss_check_status_%d.dat" % (self.user, os.getpid())
cmd = ClusterCommand.getQueryStatusCmd(outFile=status_file)
(status, output) = CmdUtil.retryGetstatusoutput(cmd, retry_time=0)
if status != 0:
if os.path.exists(status_file):
os.remove(status_file)
self.logger.debug("Failed to obtain the cluster status. Error: \n%s" % output)
continue
# Determine whether the cluster status is normal or degraded
cluster_status = DbClusterStatus()
cluster_status.initFromFile(status_file)
if os.path.exists(status_file):
os.remove(status_file)
if cluster_status.clusterStatus == "Normal":
self.logger.log("The cluster status is Normal.")
break
else:
self.logger.debug("Cluster status is %s(%s)." % (
cluster_status.clusterStatus, cluster_status.clusterStatusDetail))
if check_status != 0:
if streaming_switchover == "streaming_switchover":
raise Exception(
ErrorCode.GAUSS_528["GAUSS_52800"] % (cluster_status.clusterStatus,
cluster_status.clusterStatusDetail))
self.logger.logExit(ErrorCode.GAUSS_528["GAUSS_52800"] % (
cluster_status.clusterStatus, cluster_status.clusterStatusDetail))
self.logger.debug("Successfully wait for cluster status become Normal.", "constant")
def set_auto_csn_barrier_guc(self, guc_mode, action_flag=False, roll_back=False):
"""
auto_csn_barrier : 0 / 1
"""
guc_value = 1 if self.params.mode == "primary" else 0
if action_flag:
guc_value = 0
if roll_back:
guc_value = 1
self.logger.debug("Starting %s auto_csn_barrier is %s." % (guc_mode, guc_value))
cmd = 'source %s && gs_guc %s -Z coordinator -N all -I all ' \
'-c "auto_csn_barrier=%s"' % (self.mpp_file, guc_mode, guc_value)
host_names = self.cluster_info.getClusterNodeNames()
ignore_node = [node for node in host_names if node not in self.normal_node_list]
if ignore_node:
self.logger.debug(
"WARNING: auto_csn_barrier need ignore host name is %s" % ignore_node)
nodes = ",".join(ignore_node)
cmd = cmd + " --ignore-node %s" % nodes
self.logger.debug("Set auto_csn_barrier with cmd:%s" % cmd)
status, output = CmdUtil.retryGetstatusoutput(cmd)
if status != 0:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
% "set auto_csn_barrier" + "Error:%s" % output)
self.logger.debug("Successfully %s auto_csn_barrier is %s." % (guc_mode, guc_value))
def streaming_switchover_roll_back(self, update_query=False):
"""
streaming disaster cluster roll back in switchover
"""
self.logger.log("Roll back streaming disaster cluster switchover...")
ClusterInstanceConfig.set_data_on_dcc(self.cluster_info,
self.logger, self.user,
{self.backup_open_key: "0"})
self.stop_cluster()
self.set_cmserver_guc("backup_open", "0", "set")
self.set_cmagent_guc("agent_backup_open", "0", "set")
self.logger.log("Successfully modify cma and cms parameters to start according to primary "
"cluster mode")
if update_query:
self.update_streaming_info("cluster", "archive")
self.start_cluster()
self.logger.log("Successfully Roll back streaming disaster cluster switchover.")
def check_streaming_disaster_switchover_barrier(self):
"""
Check whether the switchover barrier is obtained on all dns.
"""
self.logger.debug("check streaming disaster switchover barrier...")
sql_cmd = "select * from gs_streaming_dr_get_switchover_barrier();"
switchover_barrier_list = []
for db_node in self.cluster_info.dbNodes:
for dn_inst in db_node.datanodes:
if dn_inst.instanceId not in self.normal_dn_ids:
self.logger.debug("Warning: Not check for abnormal instance %s %s" % (
dn_inst.instanceType, dn_inst.instanceId))
continue
(status, output) = ClusterCommand.remoteSQLCommand(
sql_cmd, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True)
self.logger.debug("Check inst has switchover barrier, status=%d, "
"output: %s." % (status, output))
if status == 0 and output.strip() == "t":
self.logger.debug("Successfully check instance %s %s has switchover "
"barrier." % (dn_inst.instanceType, dn_inst.instanceId))
switchover_barrier_list.append(dn_inst.instanceId)
return switchover_barrier_list
def check_switchover_workable(self):
"""
Check whether switchover is workable.
"""
if not DefaultValue.is_disaster_cluster(self.cluster_info) \
and self.params.mode == "primary":
self.logger.debug("The primary dn exist, do nothing except record the result file.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"streaming disaster cluster switchover, Because the primary cluster "
"[drClusterMode] parameter must be disaster_standby")
if DefaultValue.is_disaster_cluster(self.cluster_info) and \
self.params.mode == "disaster_standby":
self.logger.debug("The primary dn not exist, do nothing except record the result file.")
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] %
"streaming disaster cluster switchover, Because the disaster_standby "
"cluster [drClusterMode] parameter must be primary")
self.logger.log("Waiting for cluster and all instances normal.")
if self.params.mode == "primary":
end_time = datetime.now() + timedelta(seconds=600)
while True:
self.init_cluster_status()
self.parse_cluster_status()
if self.check_cluster_status(status_allowed=['Normal'], only_check=True,
is_log=False) and self.check_instances_ready_for_switchover():
break
if datetime.now() >= end_time:
raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
% "check cluster and instances status"
" with timeout: %ss" % str(600))
time.sleep(5)
self.logger.debug("Retry check stream disaster standby cluster status...")
else:
self.init_cluster_status()
self.parse_cluster_status()
if (not self.check_cluster_status(status_allowed=['Normal'], only_check=True,
is_log=False)) \
or (not self.check_instances_ready_for_switchover()):
raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "check cluster status")
def check_instances_ready_for_switchover(self):
"""
Check whether cns and dns are ready for switchover.
"""
dn_instances = [dn_inst.instanceId for db_node in self.cluster_info.dbNodes
for dn_inst in db_node.datanodes]
if len(dn_instances) != len(self.normal_dn_ids):
self.logger.debug("Not all dn instances is normal.")
return False
self.logger.debug("Successfully check cn and dn instances are normal.")
return True

View File

@@ -107,6 +107,24 @@ BINARY_UPGRADE_STEP_START_NODE = 5
BINARY_UPGRADE_STEP_PRE_COMMIT = 6
# dual cluster stage
class DualClusterStage:
"""
Dual cluster stage upgrade marking
"""
def __init__(self):
pass
(STEP_UPGRADE_END,
STEP_UPGRADE_UNFINISHED,
STEP_UPGRADE_FINISH,
STEP_UPGRADE_COMMIT,
) = list(range(0, 4))
def __str__(self):
pass
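# Editor's note: the tuple unpacking above assigns consecutive integers,
# so STEP_UPGRADE_END = 0, STEP_UPGRADE_UNFINISHED = 1,
# STEP_UPGRADE_FINISH = 2 and STEP_UPGRADE_COMMIT = 3; the dual-cluster
# checks later in this patch compare these stages numerically.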
# grey upgrade
class GreyUpgradeStep:
def __init__(self):
@@ -174,3 +192,17 @@ UPGRADE_VERSION_64bit_xid = 91.208
ENABLE_STREAM_REPLICATION_VERSION = "92.149"
ENABLE_STREAM_REPLICATION_NAME = "enable_stream_replication"
RELMAP_4K_VERSION = "92.420"
# streaming cluster
GS_SECURE_FILES = "gs_secure_files"
UPGRADE_PHASE_INFO = "upgrade_phase_info"
HARD_KEY_CIPHER = "hadr.key.cipher"
HARD_KEY_RAND = "hadr.key.rand"
DISASTER_RECOVERY_GUC = "backup_open"
INSTALL_TYPE_GUC = "install_type"
REMOTE_INFO_GUC = {
"dual-standby-streamDR": "replconninfo",
"dual-primary-streamDR": "replconninfo"
}
LENGTH_STORAGE_INFO_LEN = 4
ACTION_CLEAN_GS_SECURE_FILES = "clean_gs_secure_files"

View File

@@ -23,6 +23,7 @@ import json
import csv
import traceback
import copy
import re
from datetime import datetime, timedelta
@@ -38,6 +39,7 @@ from gspylib.os.gsfile import g_file
from gspylib.inspection.common import SharedFuncs
from gspylib.component.CM.CM_OLAP.CM_OLAP import CM_OLAP
from impl.upgrade.UpgradeConst import GreyUpgradeStep
from impl.upgrade.UpgradeConst import DualClusterStage
import impl.upgrade.UpgradeConst as const
from base_utils.executor.cmd_executor import CmdExecutor
from base_utils.executor.local_remote_cmd import LocalRemoteCmd
@@ -82,6 +84,7 @@ class UpgradeImpl:
self.isLargeInplaceUpgrade = False
self.__upgrade_across_64bit_xid = False
self.action = upgrade.action
self.primaryDn = None
def exitWithRetCode(self, action, succeed=True, msg=""):
"""
@@ -145,9 +148,56 @@
self.context.sshTool = SshTool(
self.context.clusterNodes, self.context.localLog,
DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE)
self.initVersionInfo()
self.initClusterConfig()
self.initClusterType()
self.context.logger.debug("Successfully init global infos", "constant")
def initVersionInfo(self):
"""
Initialize the old and new version information
:return:
"""
newVersionFile = VersionInfo.get_version_file()
newClusterVersion, newClusterNumber, newCommitId = VersionInfo.get_version_info(
newVersionFile)
gaussHome = ClusterDir.getInstallDir(self.context.user)
newPath = gaussHome + "_%s" % newCommitId
oldPath = self.getClusterAppPath(const.OLD)
if oldPath == "":
oldPath = os.path.realpath(gaussHome)
oldVersionFile = "%s/bin/upgrade_version" % oldPath
try:
(oldClusterVersion, oldClusterNumber, oldCommitId) = VersionInfo.get_version_info(
oldVersionFile)
self.context.logger.debug("Successfully obtained version information of "
"old clusters by %s." % oldVersionFile)
except Exception as er:
if os.path.exists(self.context.upgradeBackupPath):
# if upgradeBackupPath exists, it means that we did rollback first,
# and we get the cluster version from the backup file
possibOldVersionFile = "%s/old_upgrade_version" % self.context.upgradeBackupPath
self.context.logger.debug(str(er))
self.context.logger.debug("Try to get the version information "
"from %s." % possibOldVersionFile)
(oldClusterVersion, oldClusterNumber, oldCommitId) = VersionInfo.get_version_info(
possibOldVersionFile)
else:
raise Exception(str(er))
self.context.newClusterVersion = newClusterVersion
self.context.newClusterNumber = newClusterNumber
self.context.oldClusterVersion = oldClusterVersion
self.context.oldClusterNumber = oldClusterNumber
self.context.newClusterAppPath = newPath
self.context.oldClusterAppPath = oldPath
self.newCommitId = newCommitId
self.oldCommitId = oldCommitId
def setClusterDetailInfo(self):
"""
function: set cluster detail info
@@ -268,6 +318,8 @@ class UpgradeImpl:
DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE)
if action == const.ACTION_AUTO_ROLLBACK and \
self.checkBakPathNotExists():
if os.path.isfile(self.context.upgradePhaseInfoPath):
self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END)
self.context.logger.log("No need to rollback.")
self.exitWithRetCode(action, True)
else:
@@ -290,6 +342,11 @@
grey upgrade rollback if not in read only, then record the value of
enable_transaction_read_only and set it to off
"""
# no need to check read only mode and close enable_transaction_read_only
if self.context.standbyCluster:
self.context.logger.debug("no need to check read only in force or"
" standby cluster mode upgrade")
return
try:
self.context.logger.debug("Check if in read only mode.")
greyUpgradeFlagFile = os.path.join(self.context.upgradeBackupPath,
@@ -481,16 +538,9 @@
% newClusterNumber)
self.context.logger.debug("The matched upgrade strategy is: %s."
% upgradeAction)
self.context.newClusterVersion = newClusterVersion
self.context.newClusterNumber = newClusterNumber
self.context.oldClusterVersion = oldClusterVersion
self.context.oldClusterNumber = oldClusterNumber
self.context.newClusterAppPath = newPath
self.context.oldClusterAppPath = oldPath
self.newCommitId = newCommitId
self.oldCommitId = oldCommitId
return upgradeAction
except Exception as e:
self.clean_gs_secure_files()
raise Exception(ErrorCode.GAUSS_529["GAUSS_52900"] % str(e)
+ " Do nothing this time.")
@@ -665,6 +715,10 @@ class UpgradeImpl:
"""
try:
self.context.logger.debug("Setting up the cluster read-only mode.")
if self.context.standbyCluster:
self.context.logger.debug("no need to set cluster "
"read only mode under force or standby cluster upgrade")
return 0
self.setGUCValue("default_transaction_read_only", "true")
self.context.logger.debug("successfully set the cluster read-only mode.")
return 0
@@ -682,6 +736,10 @@
"""
try:
self.context.logger.debug("Canceling the cluster read-only mode.")
if self.context.standbyCluster:
self.context.logger.debug("no need to unset cluster "
"read only mode under force or standby cluster upgrade")
return 0
self.setGUCValue("default_transaction_read_only", "false")
self.context.logger.debug("Successfully cancelled the cluster read-only mode.")
return 0
@@ -887,6 +945,8 @@
Input : gucStr the guc key:value string
output : NA
"""
if "dual-standby" in self.context.clusterType:
return
self.context.logger.debug("Start to check GUC value %s." % gucStr)
try:
# send cmd to that node and exec
@@ -910,6 +970,28 @@
except Exception as e:
raise Exception(str(e))
def backup_disaster_user_file(self):
"""backup_disaster_user_file"""
bin_path = os.path.join(EnvUtil.getEnv("GAUSSHOME"), "bin")
cipher_file = os.path.join(bin_path, "hadr.key.cipher")
if os.path.isfile(cipher_file):
FileUtil.cpFile(cipher_file, "%s/" % self.context.tmpDir)
rand_file = os.path.join(bin_path, "hadr.key.rand")
if os.path.isfile(rand_file):
FileUtil.cpFile(rand_file, "%s/" % self.context.tmpDir)
self.context.logger.debug("Back up rand and cipher file to temp dir.")
def restore_origin_disaster_user_file(self):
"""restore_origin_disaster_user_file"""
bin_path = os.path.join(self.context.newClusterAppPath, "bin")
cipher_file = os.path.join(self.context.tmpDir, "hadr.key.cipher")
if os.path.isfile(cipher_file):
self.context.sshTool.scpFiles(cipher_file, bin_path)
rand_file = os.path.join(self.context.tmpDir, "hadr.key.rand")
if os.path.isfile(rand_file):
self.context.sshTool.scpFiles(rand_file, bin_path)
self.context.logger.debug("Restore rand and cipher file to gausshome.")
def floatMoreThan(self, numOne, numTwo):
"""
function: float more than
@@ -968,8 +1050,10 @@
self.distributeXml()
# 2. check if the app path is ready and sha256 is right and others
self.checkUpgrade()
# 4. check the cluster pressure
self.HASyncReplayCheck()
if self.context.action == const.ACTION_LARGE_UPGRADE and \
"dual-standby" not in self.context.clusterType:
# 4. check the cluster pressure
self.HASyncReplayCheck()
# 5. before do grey binary upgrade, we must make sure the
# cluster is Normal and the database could be
# connected, if not, exit.
@@ -983,6 +1067,12 @@
# check if it satisfies upgrade again; if it is the second loop of the
# upgrade, it can go to the upgrade-again branch
upgradeAgain = self.canUpgradeAgain()
if not upgradeAgain:
self.recordDualClusterStage(self.oldCommitId,
DualClusterStage.STEP_UPGRADE_UNFINISHED)
self.context.logger.log("NOTICE: The directory %s will be deleted after "
"commit-upgrade, please make sure there is no personal "
"data." % self.context.oldClusterAppPath)
except Exception as e:
# before this step, the upgrade process do nothing to the cluster,
# this time has no remaining
@@ -998,6 +1088,8 @@
if not self.doGreyBinaryRollback():
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False)
self.removeOmRollbackProgressFile()
self.recordDualClusterStage(self.oldCommitId,
DualClusterStage.STEP_UPGRADE_UNFINISHED)
self.context.logger.log(
"The directory %s will be deleted after commit-upgrade, "
"please make sure there is no personal data." %
@@ -1021,8 +1113,14 @@
# we can not recognize if it really cannot
# find the column, or just because the old version. So we
# will update the catalog in the old version
if self.context.action == const.ACTION_LARGE_UPGRADE:
if self.context.action == const.ACTION_LARGE_UPGRADE and \
"dual-standby" not in self.context.clusterType:
self.updateCatalog()
elif self.context.action == const.ACTION_LARGE_UPGRADE and \
"dual-standby" in self.context.clusterType:
self.setUpgradeFromParam(self.context.oldClusterNumber)
self.reloadCmAgent()
self.reload_cmserver()
self.recordNodeStep(GreyUpgradeStep.STEP_SWITCH_NEW_BIN)
self.CopyCerts()
self.upgradeAgain()
@@ -1061,6 +1159,7 @@
# 11. switch the cluster version to new version
self.getOneDNInst(checkNormal=True)
self.switchBin(const.NEW)
self.restore_origin_disaster_user_file()
# create CA for CM
self.create_ca_for_cm()
self.setNewVersionGuc()
@@ -1093,14 +1192,16 @@
self.waitClusterForNormal()
# backup global relmap file before doing upgrade-post
self.backupGlobalRelmapFile()
self.prepareSql("rollback-post")
self.execRollbackUpgradedCatalog(scriptType="rollback-post")
self.prepareSql("upgrade-post")
self.execRollbackUpgradedCatalog(scriptType="upgrade-post")
self.getLsnInfo()
if "dual-standby" not in self.context.clusterType:
self.prepareSql("rollback-post")
self.execRollbackUpgradedCatalog(scriptType="rollback-post")
self.prepareSql("upgrade-post")
self.execRollbackUpgradedCatalog(scriptType="upgrade-post")
self.getLsnInfo()
hosts = copy.deepcopy(self.context.clusterNodes)
self.recordNodeStep(
GreyUpgradeStep.STEP_PRE_COMMIT, nodes=hosts)
self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_FINISH)
self.printPrecommitBanner()
except Exception as e:
hintInfo = "Nodes are new version. " \
@@ -1250,6 +1351,9 @@
try:
self.context.logger.log("Create checkpoint before switching.")
start_time = timeit.default_timer()
if self.context.forceRollback or self.context.standbyCluster:
self.context.logger.debug("No need to do checkpoint.")
return
# create checkpoint
sql = "CHECKPOINT;"
for i in range(10):
@@ -1703,6 +1807,10 @@
if not self.doInplaceBinaryRollback():
self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False)
try:
if self.context.action == const.ACTION_LARGE_UPGRADE and \
"dual-standby" not in self.context.clusterType:
# check the cluster pressure
self.HASyncReplayCheck()
self.checkUpgrade()
# 3. before do binary upgrade, we must make sure the cluster is
@@ -2278,6 +2386,9 @@
output: NA
"""
self.context.logger.debug("Preparing upgrade sql folder.")
if self.context.standbyCluster:
self.context.logger.debug("no need prepare upgrade sql folder under force upgrade")
return
hosts = self.context.clusterNodes
cmd = "%s -t %s -U %s --upgrade_bak_path=%s -X %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
@@ -2309,6 +2420,10 @@
self.context.logger.debug("Start to wait and check if all the standby"
" instances have replayed all xlogs, host: %s" % \
host.hostname)
if self.context.standbyCluster or self.context.forceRollback:
self.context.logger.debug("no need to do HA sync replay check "
"under force upgrade/rollback and standby cluster mode")
return
self.doReplay(catchupFailedOk, host)
self.context.logger.debug("Successfully performed the replay check "
"of the standby instance.")
@@ -2754,10 +2869,11 @@
"""
self.context.logger.debug("Get database list in cluster.")
sql = "select datname from pg_database;"
mode = True if "dual-standby" in self.context.clusterType else False
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True)
DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True, maintenance_mode=mode)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
@@ -2777,12 +2893,13 @@
make checkpoint
:return:
"""
mode = True if "dual-standby" in self.context.clusterType else False
sql = 'CHECKPOINT;'
for eachdb in database_list:
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
eachdb, IsInplaceUpgrade=True, maintenance_mode=mode)
if status != 0:
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
@@ -3335,6 +3452,7 @@
input : NA
output: NA
"""
self.checkDualClusterCommit()
try:
(status, output) = self.doHealthCheck(const.OPTION_POSTCHECK)
if status != 0:
@@ -3351,22 +3469,27 @@
# for the reenter commit, the schema may have been deleted
if self.existTable(const.RECORD_NODE_STEP):
self.recordNodeStep(GreyUpgradeStep.STEP_BEGIN_COMMIT)
self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_COMMIT)
self.setActionFile()
if self.context.action == const.ACTION_LARGE_UPGRADE:
if DefaultValue.get_cm_server_num_from_static(self.context.clusterInfo) > 0:
self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM)
self.reloadCmAgent()
self.reload_cmserver(is_final=True)
self.setUpgradeMode(0)
if "dual-standby" not in self.context.clusterType:
self.setUpgradeMode(0)
time.sleep(10)
if self.dropPMKSchema() != 0:
raise Exception(ErrorCode.GAUSS_529["GAUSS_52917"])
self.clearOtherToolPackage()
self.cleanInstallPath(const.OLD)
self.dropSupportSchema()
self.cleanBinaryUpgradeBakFiles()
if "dual-standby" not in self.context.clusterType:
self.dropSupportSchema()
self.cleanConfBakOld()
self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_END)
self.cleanBinaryUpgradeBakFiles()
# remove tmp global relmap file
self.cleanTmpGlobalRelampFile()
self.context.logger.log("Commit upgrade succeeded.")
@@ -3383,6 +3506,9 @@
"""
try:
self.context.logger.debug("Start to drop schema PMK.")
if self.context.standbyCluster:
self.context.logger.debug("no need to delete schema PMK in standby cluster mode.")
return 0
# execute drop commands by the CN instance
sql = "DROP SCHEMA IF EXISTS pmk CASCADE; "
retry_times = 0
@@ -3448,7 +3574,10 @@
try:
self.distributeXml()
if action == const.ACTION_AUTO_ROLLBACK:
self.checkDualClusterRollback()
self.clearOtherToolPackage(action)
self.recordDualClusterStage(self.oldCommitId,
DualClusterStage.STEP_UPGRADE_UNFINISHED)
try:
self.getOneDNInst(True)
except Exception as e:
@@ -3475,12 +3604,14 @@
# consider if need to sync them, not important
# under force upgrade, only read step from file
maxStep = self.getNodeStep()
self.checkDualClusterRollback()
# if -2, it means there is no need to exec rollback
# if under upgrade continue mode, it will do upgrade not rollback,
# it can enter the upgrade process
# when the binary_upgrade bak dir has some files
if maxStep == const.BINARY_UPGRADE_NO_NEED_ROLLBACK:
self.cleanBinaryUpgradeBakFiles(True)
self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END)
self.context.logger.log("No need to rollback.")
return True
@@ -3498,6 +3629,7 @@
self.recordNodeStep(
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG, nodes)
maxStep = self.getNodeStep()
self.checkDualClusterRollback()
if maxStep == GreyUpgradeStep.STEP_UPDATE_POST_CATALOG:
self.context.logger.debug(
"Record the step %d to mark it has leaved pre-commit"
@@ -3506,7 +3638,8 @@
if self.context.action == const.ACTION_LARGE_UPGRADE\
and \
self.isNodeSpecifyStep(
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG):
GreyUpgradeStep.STEP_UPDATE_POST_CATALOG)\
and "dual-standby" not in self.context.clusterType:
self.prepareUpgradeSqlFolder()
self.prepareSql("rollback-post")
self.setUpgradeMode(2)
@@ -3538,7 +3671,8 @@
self.recordNodeStep(GreyUpgradeStep.STEP_UPDATE_CATALOG)
if maxStep >= GreyUpgradeStep.STEP_UPDATE_CATALOG and\
self.context.action == const.ACTION_LARGE_UPGRADE:
self.rollbackCatalog()
if "dual-standby" not in self.context.clusterType:
self.rollbackCatalog()
self.recordNodeStep(GreyUpgradeStep.STEP_INIT_STATUS)
if maxStep >= GreyUpgradeStep.STEP_INIT_STATUS:
@@ -3546,8 +3680,10 @@
# dir will create in every node
self.cleanInstallPath(const.NEW)
self.getOneDNInst()
self.dropSupportSchema()
if "dual-standby" not in self.context.clusterType:
self.dropSupportSchema()
self.initOmRollbackProgressFile()
self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END)
self.cleanBinaryUpgradeBakFiles(True)
self.cleanTmpGlobalRelampFile()
except Exception as e:
@@ -3621,28 +3757,6 @@
"""
self.checkActionInFile()
def execSqlCommandInPrimaryDN(self, sql, retryTime=3):
self.context.logger.debug("Start to exec sql {0}.".format(sql))
count = 0
status, output = 1, ""
while count < retryTime:
self.context.logger.debug(
"Exec sql in dn node {0}".format(self.dnInst.hostname))
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True)
self.context.logger.debug(
"Exec sql result is, status:{0}, output is {1}".format(
status, output))
if status != 0 or SqlResult.findErrorInSql(output):
count += 1
continue
else:
break
return status, output
def checkActionInFile(self):
"""
function: check whether current action is same
@@ -3884,11 +3998,12 @@
check a table exist
:return:
"""
mode = True if "dual-standby" in self.context.clusterType else False
sql = "select count(*) from pg_class where relname = '%s';" % name
(status, output) = ClusterCommand.remoteSQLCommand(
sql, self.context.user,
self.dnInst.hostname, self.dnInst.port, False,
eachdb, IsInplaceUpgrade=True)
eachdb, IsInplaceUpgrade=True, maintenance_mode=mode)
if status != 0 or SqlResult.findErrorInSql(output):
raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql +
" Error: \n%s" % str(output))
@@ -4832,7 +4947,7 @@
self.context.logger.log(
"Failed to check upgrade environment.", "constant")
raise Exception(str(e))
self.checkDualClusterUpgrade()
self.context.logger.log(
"Successfully checked upgrade environment.", "constant")
@@ -5618,6 +5733,9 @@
try:
# clean backup files
self.cleanBackupFiles()
# clean gs_secure_files folder
if self.context.rollback or self.action == "commit-upgrade":
self.clean_gs_secure_files()
except Exception as e:
raise Exception(str(e))
if (isRollBack):
@@ -5838,6 +5956,7 @@
1 failed
"""
self.context.logger.debug("Start to check database connection.")
mode = True if "dual-standby" in self.context.clusterType else False
for dbNode in self.context.clusterInfo.dbNodes:
if len(dbNode.datanodes) == 0 or dbNode.name:
continue
@@ -5848,7 +5967,7 @@
ClusterCommand.remoteSQLCommand(
sql, self.context.user, dnInst.hostname, dnInst.port,
False, DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True)
IsInplaceUpgrade=True, maintenance_mode=mode)
if status != 0 or not output.isdigit():
self.context.logger.debug(
"Failed to execute SQL on [%s]: %s. Error: \n%s" %
@@ -6272,3 +6391,501 @@
packFilePath)
except Exception as e:
raise Exception(str(e))
def getPrimaryDN(self, checkNormal):
"""
find a primary dn in the centralized cluster on which we can execute SQL commands
"""
try:
self.context.logger.debug("start to get primary dn. \n"
"checkNormal is {0}.".format(checkNormal))
if self.context.standbyCluster or self.context.forceRollback:
checkNormal = False
primaryDn = None
if not checkNormal:
clusterNodes = self.context.oldClusterInfo.dbNodes
for dbNode in clusterNodes:
if len(dbNode.datanodes) == 0:
continue
primaryDn = dbNode.datanodes[0]
break
self.primaryDn = primaryDn
else:
primaryList, _ = DefaultValue.getPrimaryNode(self.context.userProfile, self.context.logger)
if primaryList:
primaryDn = primaryList[0]
if not primaryDn:
raise Exception(ErrorCode.GAUSS_526["GAUSS_52635"])
for dbNode in self.context.clusterInfo.dbNodes:
for dn in dbNode.datanodes:
if dn.hostname == primaryDn:
self.primaryDn = dn
self.context.logger.debug("Successfully get primary DN from "
"{0}.".format(self.primaryDn.hostname))
except Exception as er:
self.context.logger.debug("Failed to get Primary dn. Error: %s" % str(er))
raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"] % "primary dn")
def getPrimaryNode(self, instanceType):
"""
:param instanceType:
:return:
"""
try:
self.waitClusterNormalDegrade(waitTimeOut=120)
self.context.logger.debug("Start to get primary node.")
postSplit = ""
primaryFlag = "Primary"
count = 0
cmd, status, output = "", 0, ""
while count < 60:
cmd = "source {0} && cm_ctl query -Cv".format(self.context.userProfile)
(status, output) = CmdUtil.retryGetstatusoutput(cmd, 3, 5)
# no need to retry under force upgrade
if status == 0:
break
time.sleep(2)
count += 1
if status != 0:
raise Exception(
ErrorCode.GAUSS_514["GAUSS_51400"] % "%s. Error:\n%s" % (cmd, output))
self.context.logger.debug("the result of query is {0}, "
"instanceType is {1}.".format(output, instanceType))
targetString = output.split(instanceType)[1]
if instanceType == "Datanode":
dnPrimary = [x for x in re.split(r"[|\n]", targetString) if primaryFlag in x
or "Main" in x]
primaryList = []
for dn in dnPrimary:
primaryList.append(list(filter(None, dn.split(" ")))[1])
return primaryList
if instanceType == "ETCD":
postSplit = "Cluster"
primaryFlag = "StateLeader"
elif instanceType == "CMServer":
postSplit = "ETCD"
elif instanceType == "GTM":
postSplit = "Datanode"
elif instanceType == "Coordinator":
return ""
if postSplit not in targetString:
return ""
primaryInfo = [x for x in re.split(r"[|\n]", targetString.split(postSplit)[0]) if
primaryFlag in x]
if primaryInfo == "" or primaryInfo == []:
return ""
primary = list(filter(None, primaryInfo[0].split(" ")))[1]
self.context.logger.debug("get node {0}".format(primary))
return primary
except Exception as er:
self.context.logger.debug("Failed to get primary node." + str(er))
raise Exception(str(er))
def isGucContainDesignatedVal(self, gucName, result):
"""
The guc value contains the designated string.
:return:
"""
sql = "show {0};".format(gucName)
self.getPrimaryDN(True)
mode = "primary"
is_disaster = DefaultValue.cm_exist_and_is_disaster_cluster(self.context.clusterInfo,
self.context.logger)
if is_disaster:
mode = "standby"
(_, output) = self.execSqlCommandInPrimaryDN(sql, mode=mode)
if result in output:
return True
else:
return False
def execSqlCommandInPrimaryDN(self, sql, retryTime=3, execHost=None, mode="primary"):
"""
execute sql on primary dn
:return:
"""
self.context.logger.debug("Start to exec sql {0}.".format(sql))
count = 0
status, output = 1, ""
mode = True if "dual-standby" in self.context.clusterType or mode == "standby" else False
while count < retryTime:
if not execHost:
self.getPrimaryDN(checkNormal=True)
execHost = self.primaryDn
self.context.logger.debug("Exec sql in dn node {0}".format(execHost.hostname))
(status, output) = ClusterCommand.remoteSQLCommand(sql, self.context.user,
execHost.hostname, execHost.port,
False,
DefaultValue.DEFAULT_DB_NAME,
IsInplaceUpgrade=True,
maintenance_mode=mode)
self.context.logger.debug("Exec sql result "
"is, status:{0}, output is {1}"
"".format(status, output).replace("ERROR", "Log"))
if status != 0 or SqlResult.findErrorInSql(output):
count += 1
continue
else:
break
return status, output
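# Editor's sketch of the retry loop above, assuming run_sql() returns a
# (status, output) pair and failed attempts are simply retried.
def retry_sql(run_sql, retry_time=3):
    status, output = 1, ""
    for _ in range(retry_time):
        status, output = run_sql()
        if status == 0:
            break                    # stop as soon as the SQL succeeds
    return status, output

assert retry_sql(lambda: (0, "ok")) == (0, "ok")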
def initClusterType(self):
"""
If it is a dual cluster, initialize whether the current cluster
is the primary cluster or the standby cluster
:return:
"""
# The value of replconninfo1 must contain 'iscascade' in the DR cluster.
isStrDRCluster = self.isGucContainDesignatedVal("replconninfo1", "iscascade")
if isStrDRCluster:
suffix = "-streamDR"
else:
self.context.logger.debug("Current cluster is not dual cluster.")
return
if self.context.is_inplace_upgrade and self.context.action \
not in ["commit-upgrade", "auto-rollback", "chose-strategy"]:
raise Exception("Dual cluster does not support in-place upgrade")
if self.checkGucValIsInValGiven(const.DISASTER_RECOVERY_GUC, ["2"], fromFile=True):
self.context.standbyCluster = True
self.context.clusterType = "dual-standby" + suffix
elif self.checkGucValIsInValGiven(const.DISASTER_RECOVERY_GUC, ["0"], fromFile=True):
self.context.clusterType = "dual-primary" + suffix
self.context.logger.log("NOTICE: the clusterType is {0}".format(self.context.clusterType))
if not self.context.is_inplace_upgrade:
self.backup_disaster_user_file()
if self.context.forceRollback:
return
self.copyStandbyClusterUpgradeFile()
upgradeInfoTmp = self.context.getDualUpgradeInfo(self.context.upgradePhaseInfoPath, 0)
if upgradeInfoTmp is not None:
if "dual-standby" in self.context.clusterType:
self.context.dualUpgradeShareInfo.masterVersion = upgradeInfoTmp.masterVersion
self.context.dualUpgradeShareInfo.masterUpgradeStatus = \
upgradeInfoTmp.masterUpgradeStatus
else:
self.context.dualUpgradeShareInfo.standbyVersion = upgradeInfoTmp.standbyVersion
self.context.dualUpgradeShareInfo.standbyUpgradeStatus = \
upgradeInfoTmp.standbyUpgradeStatus
self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo,
filePath=self.context.upgradePhaseInfoPath,
startPost=0)
def checkGucValIsInValGiven(self, gucName, valList, fromFile=False):
"""
Checks whether a given parameter is a given value list in a given instance list.
"""
self.context.logger.debug("checks whether the parameter:{0} is "
"the value:{1}.".format(gucName, valList))
gucStr = "{0}:{1}".format(gucName, ",".join(valList))
try:
self.checkParam(gucStr, fromFile)
self.context.logger.debug("Success to check the parameter:{0} value is "
"in the value:{1}.".format(gucName, valList))
return True
except Exception as _:
return False
def copyStandbyClusterUpgradeFile(self):
"""
From the data directory of the standby cluster, copy the upgrade_phase_info file
to the designated instance directory of the primary cluster, and distribute it
to the upgrade backup directory of all nodes
"""
hardUser, hardUserPwd = self.getDisasterRecoveryUser()
if hardUser is None or hardUser == "" or hardUserPwd is None or hardUserPwd == "":
raise Exception("Failed to obtain the streaming disaster build user")
dnInstance = None
for x in range(1, 9):
localRemoteInfo = self.getLocalRemoteHostIpAndPort("{0}{1}".format(
const.REMOTE_INFO_GUC[self.context.clusterType], x))
for dbNode in self.context.clusterInfo.dbNodes:
for dnInst in dbNode.datanodes:
self.context.logger.debug("The instance is {0}".format(dnInst.__dict__))
if "-streamDR" in self.context.clusterType:
dataIp = DefaultValue.get_data_ip_info(dnInst, self.context.logger)
if localRemoteInfo.get("localhost") in dataIp and \
localRemoteInfo.get("localport") == str(dnInst.haPort).strip():
dnInstance = copy.deepcopy(dnInst)
break
if dnInstance is not None:
try:
self.copyAndDistributeUpgradeFile(dnInstance, localRemoteInfo)
except Exception as err:
self.context.logger.error("Cope file failed msg:%s." % err)
dnInstance = None
continue
break
if dnInstance is None:
raise Exception("Unable to find a DN to connect to the standby cluster node")
def checkDualClusterUpgrade(self):
"""
Check whether the dual cluster can be upgraded
:return:
"""
if "dual-standby-streamDR" not in self.context.clusterType or \
self.context.action == const.ACTION_SMALL_UPGRADE:
return
self.context.logger.debug("The status of the dual-cluster standby status is {0}, version "
"is {1}. The status of the dual-cluster master status is {2}, "
"version is {3}".format(
self.context.dualUpgradeShareInfo.standbyUpgradeStatus,
self.context.dualUpgradeShareInfo.standbyVersion,
self.context.dualUpgradeShareInfo.masterUpgradeStatus,
self.context.dualUpgradeShareInfo.masterVersion))
if self.context.dualUpgradeShareInfo.masterUpgradeStatus < 2 or \
self.context.dualUpgradeShareInfo.masterVersion != self.newCommitId:
raise Exception("The status of the dual-cluster master is {0}. "
"the standby cluster cannot be upgrade."
.format(self.context.dualUpgradeShareInfo.masterUpgradeStatus))
def recordDualClusterStage(self, commitVersion, upgradeStage):
"""
Record the upgrade information of the dual cluster
:param commitVersion:
:param upgradeStage:
:return:
"""
if "dual-primary" in self.context.clusterType:
self.context.dualUpgradeShareInfo.masterVersion = commitVersion
self.context.dualUpgradeShareInfo.masterUpgradeStatus = upgradeStage
elif "dual-standby" in self.context.clusterType:
self.context.dualUpgradeShareInfo.standbyVersion = commitVersion
self.context.dualUpgradeShareInfo.standbyUpgradeStatus = upgradeStage
else:
return
self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo,
filePath=self.context.upgradePhaseInfoPath, startPost=0)
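# Editor's note (flow reconstructed from the calls in this patch): each
# cluster records STEP_UPGRADE_UNFINISHED while upgrading or rolling
# back, STEP_UPGRADE_FINISH at pre-commit, STEP_UPGRADE_COMMIT while
# committing, and returns to STEP_UPGRADE_END once commit or rollback
# completes; the peer cluster reads these values from upgrade_phase_info
# to gate its own rollback and commit.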
def checkDualClusterRollback(self):
"""
Check whether the dual cluster can be rolled back
:return:
"""
if "dual-standby" in self.context.clusterType or \
"dual-" not in self.context.clusterType:
return
self.context.logger.debug("The status of the dual-cluster standby status is {0}, version "
"is {1}. The status of the dual-cluster master status is {2}, "
"version is {3}".format(
self.context.dualUpgradeShareInfo.standbyUpgradeStatus,
self.context.dualUpgradeShareInfo.standbyVersion,
self.context.dualUpgradeShareInfo.masterUpgradeStatus,
self.context.dualUpgradeShareInfo.masterVersion))
if not self.context.rollback or \
"dual-primary" in self.context.clusterType or \
self.context.action == const.ACTION_SMALL_UPGRADE or self.context.forceRollback:
return
# master cluster
if "dual-primary" in self.context.clusterType:
if (self.context.dualUpgradeShareInfo.standbyUpgradeStatus > 2 or
self.context.dualUpgradeShareInfo.standbyUpgradeStatus == 0) and \
self.context.dualUpgradeShareInfo.standbyVersion == self.newCommitId:
raise Exception("The status of the dual-cluster standby is {0}. "
"the master cluster cannot be rolled back."
.format(self.context.dualUpgradeShareInfo.standbyUpgradeStatus))
def checkDualClusterCommit(self):
"""
Check whether the dual cluster can be committed
:return:
"""
if "dual-" not in self.context.clusterType:
return
if self.context.action == const.ACTION_SMALL_UPGRADE:
return
self.context.logger.debug("The status of the dual-cluster standby status is {0}, version "
"is {1}. The status of the dual-cluster master status is {2}, "
"version is {3}".format(
self.context.dualUpgradeShareInfo.standbyUpgradeStatus,
self.context.dualUpgradeShareInfo.standbyVersion,
self.context.dualUpgradeShareInfo.masterUpgradeStatus,
self.context.dualUpgradeShareInfo.masterVersion))
# master cluster
if "dual-primary" in self.context.clusterType:
if self.context.dualUpgradeShareInfo.standbyUpgradeStatus != 0 or \
self.context.dualUpgradeShareInfo.standbyVersion != self.newCommitId:
raise Exception("The status of the dual-cluster standby status is {0}, "
"version is {1}. the master cluster cannot be commit."
.format(self.context.dualUpgradeShareInfo.standbyUpgradeStatus,
self.context.dualUpgradeShareInfo.standbyVersion))
if "dual-standby" in self.context.clusterType:
if self.context.dualUpgradeShareInfo.masterUpgradeStatus != 2 or \
self.context.dualUpgradeShareInfo.masterVersion != self.newCommitId:
raise Exception("The status of the dual-cluster master status is {0}, "
"version is {1}. The standby cluster cannot be commit."
.format(self.context.dualUpgradeShareInfo.masterUpgradeStatus,
self.context.dualUpgradeShareInfo.masterVersion))
def copyDirFromRemoteNode(self, remoteHost, remoteDir, targetHost, targetDir):
"""
SSH to the remote node, copy dir from the remote node to the specified node
:param remoteHost:
:param remoteDir:
:param targetHost:
:param targetDir:
:return:
"""
scpcmd = "pssh -s -H {0} 'source {5}; if [ -d '{1}' ];" \
"then pscp -r -H {2} {3} {4}; fi' ".format(remoteHost, remoteDir, targetHost,
remoteDir, targetDir,
self.context.userProfile)
(status, output) = CmdUtil.retryGetstatusoutput(scpcmd, 2, 5)
if status != 0:
raise Exception("File copy failed. Output: {0}".format(output))
def getLocalRemoteHostIpAndPort(self, gucName):
"""
Get the DN instance and the corresponding standby cluster host and port through the
cross_cluster_replconninfo parameter
:param gucName: cross_cluster_replconninfo parameter name
:return: {"localhost":"", "localport":"", "remotehost":"", "remoteport":""}
"""
isLocal = False
localRemoteInfo = dict()
sql = "show {0};".format(gucName)
self.getPrimaryDN(False)
(status, output) = self.execSqlCommandInPrimaryDN(sql)
if status != 0 or output == "":
raise Exception("Failed to get GUC parameter: {0} value. Output: {1}".format(gucName,
output))
localIp = output.split("localhost=")[1].split("localport=")[0].strip()
remoteIp = output.split("remotehost=")[1].split("remoteport=")[0].strip()
self.context.logger.debug("Success get the output {0}".format(output))
if "-streamDR" in self.context.clusterType:
localPort = output.split("localport=")[1].split("localheartbeatport=")[0].strip()
remotePort = output.split("remoteport=")[1].split("remoteheartbeatport=")[0].strip()
for dbNode in self.context.clusterInfo.dbNodes:
if isLocal:
break
for dnInst in dbNode.datanodes:
if remoteIp in dnInst.listenIps or remoteIp in dnInst.hostname:
isLocal = True
break
self.context.logger.debug("The local flag is {0}".format(isLocal))
if isLocal:
localRemoteInfo.setdefault("localhost", "no find remote host")
else:
localRemoteInfo.setdefault("localhost", localIp)
localRemoteInfo.setdefault("localport", localPort)
localRemoteInfo.setdefault("remotehost", remoteIp)
localRemoteInfo.setdefault("remoteport", remotePort)
return localRemoteInfo
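As a worked example, an illustrative streaming-DR value such as
localhost=10.0.0.1 localport=6600 localheartbeatport=6601 remotehost=10.1.0.1 remoteport=6600 remoteheartbeatport=6601
would be split into localIp 10.0.0.1, localPort 6600, remoteIp 10.1.0.1 and remotePort 6600, so the function returns
{"localhost": "10.0.0.1", "localport": "6600", "remotehost": "10.1.0.1", "remoteport": "6600"}
unless the remote IP belongs to a local DN, in which case "localhost" is set to the "remote host not found" marker instead.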
def copyAndDistributeUpgradeFile(self, dnInstance, localRemoteInfo):
"""
copy the upgrade file from the remote cluster and distribute it to all cluster nodes
:return:
"""
hardUser, hardUserPwd = self.getDisasterRecoveryUser()
cmd_remote = 'pssh -s -H {0} \'source {8}; gs_ctl build -D {1} -b copy_upgrade_file ' \
'-Z datanode -U {2} -P "{3}" -C "localhost={4} localport={5} remotehost={6} ' \
'remoteport={7}"\''.format(dnInstance.hostname,
dnInstance.datadir,
hardUser,
hardUserPwd,
localRemoteInfo.get("localhost"),
localRemoteInfo.get("localport"),
localRemoteInfo.get("remotehost"),
localRemoteInfo.get("remoteport"),
self.context.userProfile)
cmd_remote = cmd_remote.replace(" -Z datanode", "")
self.context.logger.debug("Copy upgrade file with cmd: {0}.".
format(cmd_remote.replace(hardUserPwd, "***")))
status, output = DefaultValue.getstatusoutput_hide_pass(cmd_remote)
if status == 0:
self.context.logger.debug("Successfully copy upgrade file")
else:
raise Exception("Failed to copy files from the standby cluster. "
"Ensure that the standby cluster version supports this function. "
"Output: {0}".format(output))
remoteUpgradeInfoPath = os.path.join(dnInstance.datadir, const.UPGRADE_PHASE_INFO)
self.copyFileFromRemoteNode(dnInstance.hostname, remoteUpgradeInfoPath,
NetUtil.GetHostIpOrName(),
self.context.upgradePhaseInfoPath)
if not os.path.exists(self.context.upgradePhaseInfoPath):
FileUtil.createFile(self.context.upgradePhaseInfoPath,
mode=DefaultValue.KEY_FILE_MODE)
self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo,
filePath=self.context.upgradePhaseInfoPath,
startPost=0)
self.context.sshTool.scpFiles(self.context.upgradePhaseInfoPath,
self.context.tmpDir,
hostList=self.context.clusterNodes)
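With illustrative values, the remote command assembled here looks roughly as follows; the password is masked in the debug log, and the " -Z datanode" flag is stripped before execution:
pssh -s -H node1 'source /home/omm/.bashrc; gs_ctl build -D /gaussdb/data/dn1 -b copy_upgrade_file -U hadr_user -P "***" -C "localhost=10.0.0.1 localport=6600 remotehost=10.1.0.1 remoteport=6600"'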
def getDisasterRecoveryUser(self):
"""
Obtain the special user of the streaming disaster recovery cluster used for build
:return: user name and password
"""
mode = True if "dual-standby" in self.context.clusterType else False
user_str = DefaultValue.obtain_hadr_user_encrypt_str(
self.context.clusterInfo, self.context.user, self.context.logger, mode)
rand_pwd = DefaultValue.decrypt_hadr_rand_pwd(self.context.logger)
params = rand_pwd, user_str, self.context.clusterInfo, self.context.user, \
self.context.logger, mode
hardUser, hardUserPwd = DefaultValue.decrypt_hadr_user_info(params)
return hardUser, hardUserPwd
def copyFileFromRemoteNode(self, remoteHost, remoteFile, targetHost, targetFile):
"""
SSH to the remote node, copy files from the remote node to the specified node
:param remoteHost:
:param remoteFile:
:param targetHost:
:param targetFile:
:return:
"""
scpcmd = "pssh -s -H {0} 'source {5}; if [ -f '{1}' ];" \
"then pscp -H {2} {3} {4}; fi' ".format(remoteHost, remoteFile, targetHost,
remoteFile, targetFile,
self.context.userProfile)
(status, output) = CmdUtil.retryGetstatusoutput(scpcmd, 2, 5)
if status != 0:
raise Exception("File copy failed. Output: {0}".format(output))
def clean_gs_secure_files(self):
"""
delete gs_secure_files during rollback or commit
"""
try:
self.context.logger.debug(
"Starting to clean the gs_secure_files folder in the DN data directory.")
cmd = "%s -t %s -U %s -l %s" % \
(OMCommand.getLocalScript("Local_Upgrade_Utility"),
const.ACTION_CLEAN_GS_SECURE_FILES,
self.context.user,
self.context.localLog)
self.context.logger.debug("clean gs_secure_files folder:{0}".format(cmd))
host_list = copy.deepcopy(self.context.clusterNodes)
self.context.execCommandInSpecialNode(cmd, host_list)
except Exception as er:
raise Exception(str(er))
self.context.logger.debug(
"Successfully cleaned the gs_secure_files folder in the DN data directory.")
View File
@ -56,6 +56,7 @@ class CmdOptions():
self.removeIps = []
self.addIps = []
self.dws_mode = False
self.try_reload = False
def usage():
@ -75,6 +76,7 @@ General options:
-r the signal about ignorepgHbaMiss
--remove-ip Remove ip address from pg_hba.conf
--add-ip Add ip address to pg_hba.conf
--try-reload Try to reload GUC params if they cannot be set
--help Show help information for this utility,
and exit the command line mode.
"""
@ -88,7 +90,7 @@ def parseCommandLine():
try:
opts, args = getopt.getopt(sys.argv[1:], "U:X:l:r",
["remove-ip=", "help", "dws-mode",
"add-ip="])
"add-ip=", "try-reload"])
except Exception as e:
usage()
GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50000"] % str(e))
@ -116,6 +118,8 @@ def parseCommandLine():
g_opts.removeIps.append(value)
elif (key == "--dws-mode"):
g_opts.dws_mode = True
elif key == "--try-reload":
g_opts.try_reload = True
elif (key == "--add-ip"):
g_opts.addIps = value.split(',')
Parameter.checkParaVaild(key, value)
@ -154,7 +158,7 @@ class ConfigHba(LocalBaseOM):
"""
def __init__(self, logFile, user, clusterConf, dwsMode=False,
ignorepgHbaMiss=False, removeIps=None):
ignorepgHbaMiss=False, removeIps=None, try_reload=False):
"""
function: configure all instance on local node
"""
@ -178,6 +182,7 @@ class ConfigHba(LocalBaseOM):
if removeIps is None:
removeIps = []
self.removeIps = removeIps
self.try_reload = try_reload
def getAllIps(self):
"""
@ -220,6 +225,16 @@ class ConfigHba(LocalBaseOM):
except Exception as e:
raise Exception(str(e))
def remove_streaming_config(self, component):
"""
remove dn & cn pg_hba for streaming stop
"""
ip_segment_list = list(set(['.'.join(
remove_ip.split('.')[:2]) + ".0.0/16" for remove_ip in self.removeIps]))
for ip_segment in ip_segment_list:
ip_remove_str = "-h \"host replication all %s\" " % ip_segment
component.doGUCConfig("set", ip_remove_str, True)
def __configAnInstance(self, component):
"""
function: set hba config for single component
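To illustrate the /16 aggregation in remove_streaming_config with made-up addresses: a removeIps list of ['10.10.1.5', '10.10.2.6', '192.168.0.9'] collapses to the segments ['10.10.0.0/16', '192.168.0.0/16'], and each segment is then removed through a call equivalent to:
component.doGUCConfig("set", '-h "host replication all 10.10.0.0/16" ', True)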
@ -245,9 +260,10 @@ class ConfigHba(LocalBaseOM):
self.logger.debug("The %s does not exist." % hbaFile)
return
component.setPghbaConfig(self.allIps)
component.setPghbaConfig(self.allIps, try_reload=self.try_reload)
if len(self.removeIps) != 0:
component.removeIpInfoOnPghbaConfig(self.removeIps)
self.remove_streaming_config(component)
if __name__ == '__main__':
@ -266,7 +282,7 @@ if __name__ == '__main__':
# modify Instance
configer = ConfigHba(g_opts.logFile, g_opts.clusterUser,
g_opts.clusterConf, g_opts.dws_mode,
g_opts.ignorepgHbaMiss, g_opts.removeIps)
g_opts.ignorepgHbaMiss, g_opts.removeIps, g_opts.try_reload)
configer.configHba()
except Exception as e:
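A hypothetical invocation of this local script with the new flag (script name, user, log path, and IP address are placeholders for illustration):
python3 ConfigHba.py -U omm -l /var/log/om/config_hba.log --add-ip 10.10.1.5 --try-reload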
View File
@ -2152,6 +2152,35 @@ def backupHotpatch():
for dbInstance in g_dbNode.gtms:
backupInstanceHotpatchConfig(dbInstance.datadir)
def clean_gs_secure_files():
"""
clean gs_secure_files folder
"""
pool = ThreadPool(DefaultValue.getCpuSet())
pool.map(clean_stream_gs_secure, g_dbNode.datanodes)
pool.close()
pool.join()
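clean_gs_secure_files fans the per-instance cleanup out over a thread pool; a minimal self-contained sketch of the same pattern (pool size and instance IDs are illustrative):
from multiprocessing.dummy import Pool as ThreadPool

def clean_one(instance_id):
    # stand-in for clean_stream_gs_secure(dn_inst)
    print("cleaning gs_secure_files for instance %s" % instance_id)

pool = ThreadPool(4)  # degree of parallelism, e.g. the CPU count
pool.map(clean_one, [6001, 6002, 6003])
pool.close()
pool.join()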
def clean_stream_gs_secure(dn_inst):
"""
clean the gs_secure_files directory of one DN instance
"""
temp_dir = EnvUtil.getTmpDirFromEnv()
file_path = os.path.join(dn_inst.datadir, "gs_secure_files")
cmd = "(if [ -d '%s' ]; then rm -rf '%s'; fi) && " % (file_path, file_path)
cmd += "(if [ -f '%s/upgrade_phase_info' ]; then rm -f '%s/upgrade_phase_info'; " \
"fi) &&" % (temp_dir, temp_dir)
cmd += "(if [ -f '%s/hadr.key.cipher' ]; then rm -f '%s/hadr.key.cipher'; " \
"fi) &&" % (temp_dir, temp_dir)
cmd += "(if [ -f '%s/hadr.key.rand' ]; then rm -f '%s/hadr.key.rand'; " \
"fi) &&" % (temp_dir, temp_dir)
cmd += "(if [ -d '%s/gs_secure_files' ]; then rm -f '%s/gs_secure_files'; " \
"fi)" % (temp_dir, temp_dir)
g_logger.debug("Starting clean instance %s gs secure dir, cmd:%s." % (dn_inst.instanceId, cmd))
CmdExecutor.execCommandLocally(cmd)
g_logger.debug("Successfully clean instance %s gs secure dir." % dn_inst.instanceId)
def rollbackInstanceHotpatchConfig(instanceDataDir):
"""
@ -4720,6 +4749,7 @@ def main():
const.ACTION_GREY_SYNC_GUC: greySyncGuc,
const.ACTION_GREY_UPGRADE_CONFIG_SYNC: greyUpgradeSyncConfig,
const.ACTION_SWITCH_DN: switchDnNodeProcess,
const.ACTION_CLEAN_GS_SECURE_FILES: clean_gs_secure_files,
const.ACTION_GET_LSN_INFO: getLsnInfo,
const.ACTION_GREY_RESTORE_CONFIG: greyRestoreConfig,
const.ACTION_GREY_RESTORE_GUC: greyRestoreGuc,