From aa970803c3f100d6011db84fd6f347a16d116295 Mon Sep 17 00:00:00 2001 From: chenzhaoliang1228 Date: Wed, 10 Aug 2022 11:16:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=B5=81=E5=BC=8F=E5=AE=B9?= =?UTF-8?q?=E7=81=BE=E7=89=B9=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 0001-del-sctp-check.patch | 855 ------ script/base_diff/sql_commands.py | 6 +- script/base_utils/common/constantsbase.py | 2 +- script/base_utils/os/cmd_util.py | 17 + script/base_utils/os/file_util.py | 22 + .../base_utils/security/security_checker.py | 96 +- script/gs_checkperf | 7 +- script/gs_dropnode | 3 +- script/gs_expansion | 2 +- script/gs_sdr | 95 + script/gs_upgradectl | 93 + script/gspylib/common/Common.py | 508 +++- script/gspylib/common/DbClusterInfo.py | 61 +- script/gspylib/common/DbClusterStatus.py | 69 +- script/gspylib/common/ErrorCode.py | 7 +- script/gspylib/common/GaussLog.py | 14 +- .../component/Kernel/DN_OLAP/DN_OLAP.py | 14 +- script/gspylib/component/Kernel/Kernel.py | 48 +- .../impl/checkperf/OLAP/CheckperfImplOLAP.py | 7 +- script/impl/om/OLAP/OmImplOLAP.py | 4 + .../streaming_disaster_recovery/__init__.py | 0 .../params_handler.py | 344 +++ .../streaming_base.py | 2484 +++++++++++++++++ .../streaming_constants.py | 92 + .../streaming_modules/__init__.py | 0 .../streaming_diaster_recovery_start.py | 234 ++ .../streaming_disaster_recovery_failover.py | 70 + .../streaming_disaster_recovery_query.py | 168 ++ .../streaming_disaster_recovery_stop.py | 105 + .../streaming_disaster_recovery_switchover.py | 476 ++++ script/impl/upgrade/UpgradeConst.py | 32 + script/impl/upgrade/UpgradeImpl.py | 715 ++++- script/local/ConfigHba.py | 24 +- script/local/UpgradeUtility.py | 30 + 34 files changed, 5765 insertions(+), 939 deletions(-) delete mode 100644 0001-del-sctp-check.patch create mode 100644 script/gs_sdr create mode 100644 script/impl/streaming_disaster_recovery/__init__.py create mode 100644 script/impl/streaming_disaster_recovery/params_handler.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_base.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_constants.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/__init__.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/streaming_diaster_recovery_start.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_failover.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_query.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_stop.py create mode 100644 script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_switchover.py diff --git a/0001-del-sctp-check.patch b/0001-del-sctp-check.patch deleted file mode 100644 index 91131f5..0000000 --- a/0001-del-sctp-check.patch +++ /dev/null @@ -1,855 +0,0 @@ -From 812edfeeb0e47f14dff2077ff5f8a69b4773eaef Mon Sep 17 00:00:00 2001 -From: twx980514 -Date: Tue, 6 Jul 2021 17:43:08 +0800 -Subject: [PATCH] del sctp check - ---- - script/gs_check | 4 +- - script/gspylib/common/GaussLog.py | 2 +- - script/gspylib/etc/conf/check_list.conf | 11 --- - script/gspylib/etc/conf/check_list_dws.conf | 11 --- - .../inspection/config/check_list_V1R6C10.conf | 10 -- - .../inspection/config/check_list_V1R7C00.conf | 11 --- - script/gspylib/inspection/config/items.xml | 38 
+------- - script/gspylib/inspection/config/scene_inspect.xml | 2 - - script/gspylib/inspection/config/scene_install.xml | 2 - - script/gspylib/inspection/config/scene_upgrade.xml | 1 - - .../inspection/items/network/CheckNoCheckSum.py | 71 -------------- - .../inspection/items/network/CheckUsedPort.py | 17 +--- - .../inspection/items/os/CheckSctpService.py | 108 --------------------- - .../gspylib/inspection/items/os/CheckSysParams.py | 32 ++---- - script/impl/preinstall/PreinstallImpl.py | 38 -------- - script/local/LocalCheck.py | 21 +--- - script/local/PreInstallUtility.py | 87 +---------------- - 17 files changed, 16 insertions(+), 450 deletions(-) - delete mode 100644 script/gspylib/inspection/items/network/CheckNoCheckSum.py - delete mode 100644 script/gspylib/inspection/items/os/CheckSctpService.py - -diff --git a/script/gs_check b/script/gs_check -index 05d5625..0f29b31 100644 ---- a/script/gs_check -+++ b/script/gs_check -@@ -93,13 +93,13 @@ DEFAULT_TIMEOUT = 1500 - # because single clusters don't need to perform consistency checks and - # internal communication class checks - SINGLE_SKIP = ["CheckTimeZone", "CheckEncoding", "CheckKernelVer", -- "CheckNTPD", "CheckNoCheckSum", "CheckCpuCount", -+ "CheckNTPD", "CheckCpuCount", - "CheckMemInfo", "CheckDiskConfig", - "CheckUpVer", "CheckPgxcgroup", "CheckPing", - "CheckNetWorkDrop", "CheckNetSpeed"] - - SETITEM_SKIP = ["CheckCPU", "CheckTimeZone", "CheckOSVer", "CheckNTPD", -- "CheckSshdService", "CheckNoCheckSum", "CheckEtcHosts", -+ "CheckSshdService", "CheckEtcHosts", - "CheckCpuCount", "CheckHyperThread", "CheckMemInfo", - "CheckKernelVer", "CheckEncoding", "CheckBootItems", - "CheckDropCache", "CheckFilehandle", "CheckKeyProAdj", -diff --git a/script/gspylib/common/GaussLog.py b/script/gspylib/common/GaussLog.py -index bdfecf1..31957d2 100644 ---- a/script/gspylib/common/GaussLog.py -+++ b/script/gspylib/common/GaussLog.py -@@ -55,7 +55,7 @@ PREINSTALL_ACTION = ["prepare_path", "check_os_Version", "create_os_user", - "check_os_user", "create_cluster_paths", - "set_os_parameter", "set_finish_flag", "set_warning_env", - "prepare_user_cron_service", "prepare_user_sshd_service", -- "set_library", "set_sctp", "set_virtualIp", -+ "set_library", "set_virtualIp", - "clean_virtualIp", "check_hostname_mapping", - "init_gausslog", "check_envfile", "check_dir_owner", - "set_user_env", "set_tool_env", "gs_preinstall"] -diff --git a/script/gspylib/etc/conf/check_list.conf b/script/gspylib/etc/conf/check_list.conf -index deba792..77b7c60 100644 ---- a/script/gspylib/etc/conf/check_list.conf -+++ b/script/gspylib/etc/conf/check_list.conf -@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30 - net.ipv4.tcp_keepalive_intvl = 30 - net.ipv4.tcp_keepalive_probes = 9 - net.ipv4.tcp_retries2 = 12 --net.sctp.addip_enable = 0 - net.core.wmem_max = 21299200 - net.core.rmem_max = 21299200 - net.core.wmem_default = 21299200 - net.core.rmem_default = 21299200 --net.sctp.sctp_mem = 94500000 915000000 927000000 --net.sctp.sctp_rmem = 8192 250000 16777216 --net.sctp.sctp_wmem = 8192 250000 16777216 - kernel.sem = 250 6400000 1000 25600 - net.ipv4.tcp_rmem = 8192 250000 16777216 - net.ipv4.tcp_wmem = 8192 250000 16777216 -@@ -33,8 +29,6 @@ kernel.shmmax = 18446744073709551615 - - # if parameter value is not equal to ths OS's value, print the waring, and not error - [SUGGEST:/etc/sysctl.conf] --net.sctp.sndbuf_policy = 0 --net.sctp.rcvbuf_policy = 0 - net.ipv4.ip_local_port_range = 26000 65535 - net.ipv4.tcp_fin_timeout = 60 - net.ipv4.tcp_sack = 1 -@@ -42,13 
+36,8 @@ net.ipv4.tcp_timestamps = 1 - net.ipv4.tcp_retries1 = 5 - net.ipv4.tcp_syn_retries = 5 - net.ipv4.tcp_synack_retries = 5 --net.sctp.path_max_retrans = 10 --net.sctp.max_init_retransmits = 10 --net.sctp.association_max_retrans = 10 --net.sctp.hb_interval = 30000 - vm.extfrag_threshold = 500 - vm.overcommit_ratio = 90 --SctpChecksumErrors = 0 - - # open file number, please set it to set '1000000' - [/etc/security/limits.conf] -diff --git a/script/gspylib/etc/conf/check_list_dws.conf b/script/gspylib/etc/conf/check_list_dws.conf -index a7f7b7c..a96f7e9 100644 ---- a/script/gspylib/etc/conf/check_list_dws.conf -+++ b/script/gspylib/etc/conf/check_list_dws.conf -@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30 - net.ipv4.tcp_keepalive_intvl = 30 - net.ipv4.tcp_keepalive_probes = 9 - net.ipv4.tcp_retries2 = 12 --net.sctp.addip_enable = 0 - net.core.wmem_max = 21299200 - net.core.rmem_max = 21299200 - net.core.wmem_default = 21299200 - net.core.rmem_default = 21299200 --net.sctp.sctp_mem = 94500000 915000000 927000000 --net.sctp.sctp_rmem = 8192 250000 16777216 --net.sctp.sctp_wmem = 8192 250000 16777216 - kernel.sem = 250 6400000 1000 25600 - net.ipv4.tcp_rmem = 8192 250000 16777216 - net.ipv4.tcp_wmem = 8192 250000 16777216 -@@ -28,8 +24,6 @@ net.ipv4.tcp_max_syn_backlog = 65535 - net.core.somaxconn = 65535 - net.ipv4.tcp_syncookies = 1 - vm.overcommit_memory = 0 --net.sctp.sndbuf_policy = 0 --net.sctp.rcvbuf_policy = 0 - net.ipv4.tcp_fin_timeout = 60 - kernel.shmall = 1152921504606846720 - kernel.shmmax = 18446744073709551615 -@@ -38,16 +32,11 @@ net.ipv4.tcp_timestamps = 1 - net.ipv4.tcp_retries1 = 10 - net.ipv4.tcp_syn_retries = 10 - net.ipv4.tcp_synack_retries = 10 --net.sctp.path_max_retrans = 10 --net.sctp.max_init_retransmits = 10 --net.sctp.association_max_retrans = 10 --net.sctp.hb_interval = 30000 - vm.extfrag_threshold = 500 - vm.overcommit_ratio = 90 - - # if parameter value is not equal to ths OS's value, print the waring, and not error - [SUGGEST:/etc/sysctl.conf] --SctpChecksumErrors = 0 - - # open file number, please set it to set '1000000' - [/etc/security/limits.conf] -diff --git a/script/gspylib/inspection/config/check_list_V1R6C10.conf b/script/gspylib/inspection/config/check_list_V1R6C10.conf -index 75a2203..16c3fd2 100644 ---- a/script/gspylib/inspection/config/check_list_V1R6C10.conf -+++ b/script/gspylib/inspection/config/check_list_V1R6C10.conf -@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30 - net.ipv4.tcp_keepalive_intvl = 30 - net.ipv4.tcp_keepalive_probes = 9 - net.ipv4.tcp_retries2 = 80 --net.sctp.addip_enable = 0 - net.core.wmem_max = 21299200 - net.core.rmem_max = 21299200 - net.core.wmem_default = 21299200 - net.core.rmem_default = 21299200 --net.sctp.sctp_mem = 94500000 915000000 927000000 --net.sctp.sctp_rmem = 8192 250000 16777216 --net.sctp.sctp_wmem = 8192 250000 16777216 - kernel.sem = 250 6400000 1000 25600 - net.ipv4.tcp_rmem = 8192 250000 16777216 - net.ipv4.tcp_wmem = 8192 250000 16777216 -@@ -30,8 +26,6 @@ net.ipv4.tcp_syncookies = 1 - vm.overcommit_memory = 0 - vm.panic_on_oom = 0; - vm.oom_kill_allocating_task = 0; --net.sctp.sndbuf_policy = 0 --net.sctp.rcvbuf_policy = 0 - - # if parameter value is not equal to ths OS's value, print the waring, and not error - [SUGGEST:/etc/sysctl.conf] -@@ -41,10 +35,6 @@ net.ipv4.tcp_timestamps = 1 - net.ipv4.tcp_retries1 = 5 - net.ipv4.tcp_syn_retries = 5 - net.ipv4.tcp_synack_retries = 5 --net.sctp.path_max_retrans = 10 --net.sctp.max_init_retransmits = 10 --net.sctp.association_max_retrans = 
10 --net.sctp.hb_interval = 30000 - - # open file number, please set it to set '1000000' - [/etc/security/limits.conf] -diff --git a/script/gspylib/inspection/config/check_list_V1R7C00.conf b/script/gspylib/inspection/config/check_list_V1R7C00.conf -index 41c9334..4c150b6 100644 ---- a/script/gspylib/inspection/config/check_list_V1R7C00.conf -+++ b/script/gspylib/inspection/config/check_list_V1R7C00.conf -@@ -10,14 +10,10 @@ net.ipv4.tcp_keepalive_time = 30 - net.ipv4.tcp_keepalive_intvl = 30 - net.ipv4.tcp_keepalive_probes = 9 - net.ipv4.tcp_retries2 = 80 --net.sctp.addip_enable = 0 - net.core.wmem_max = 21299200 - net.core.rmem_max = 21299200 - net.core.wmem_default = 21299200 - net.core.rmem_default = 21299200 --net.sctp.sctp_mem = 94500000 915000000 927000000 --net.sctp.sctp_rmem = 8192 250000 16777216 --net.sctp.sctp_wmem = 8192 250000 16777216 - kernel.sem = 250 6400000 1000 25600 - net.ipv4.tcp_rmem = 8192 250000 16777216 - net.ipv4.tcp_wmem = 8192 250000 16777216 -@@ -30,8 +26,6 @@ net.ipv4.tcp_syncookies = 1 - vm.overcommit_memory = 0 - vm.panic_on_oom = 0 - vm.oom_kill_allocating_task = 0 --net.sctp.sndbuf_policy = 0 --net.sctp.rcvbuf_policy = 0 - kernel.shmall = 1152921504606846720 - kernel.shmmax = 18446744073709551615 - -@@ -43,13 +37,8 @@ net.ipv4.tcp_timestamps = 1 - net.ipv4.tcp_retries1 = 5 - net.ipv4.tcp_syn_retries = 5 - net.ipv4.tcp_synack_retries = 5 --net.sctp.path_max_retrans = 10 --net.sctp.max_init_retransmits = 10 --net.sctp.association_max_retrans = 10 --net.sctp.hb_interval = 30000 - vm.extfrag_threshold = 500 - vm.overcommit_ratio = 90 --SctpChecksumErrors = 0 - - # open file number, please set it to set '1000000' - [/etc/security/limits.conf] -diff --git a/script/gspylib/inspection/config/items.xml b/script/gspylib/inspection/config/items.xml -index 1dbac79..bb4143c 100644 ---- a/script/gspylib/inspection/config/items.xml -+++ b/script/gspylib/inspection/config/items.xml -@@ -334,24 +334,6 @@ - default - - -- -- -- <zh>检查nochecksum值是否为预期值且一致(默认为N,RedHat6.4/6.5且bond是为Y)</zh> -- <en>Check the nochecksum</en> -- -- -- -- 修改nochecksum值为一致的预期值 -- -- -- 检查nochecksum值,若符合预期且一致则检查项通过,否则检查项不通过 -- -- network -- root -- all -- consistent -- -- - - - <zh>检查omm用户是否已删除</zh> -@@ -456,24 +438,6 @@ - <analysis>consistent</analysis> - </checkitem> - -- <checkitem id="10032" name="CheckSctpService"> -- <title> -- <zh>检查sctp服务</zh> -- <en>Check sctp service</en> -- -- -- -- 安装及加载sctp服务 -- -- -- stcp服务开启且写在开机自启动文件中则检查项通过,否则检查项不通过 -- -- os -- root -- all -- default -- -- - - - <zh>检查超线程是否打开</zh> -@@ -1841,7 +1805,7 @@ - <zh>增大net.ipv4.ip_local_port_range或降低并发</zh> - </suggestion> - <standard> -- <zh>检查net.ipv4.ip_local_port_range,范围大于等于OS默认值通过;检查TCP协议随机端口数,小于总随机端口数的80%通过;检查SCTP协议随机端口数,小于总随机端口数的80%通过</zh> -+ <zh>检查net.ipv4.ip_local_port_range,范围大于等于OS默认值通过;检查TCP协议随机端口数,小于总随机端口数的80%通过</zh> - </standard> - <category>network</category> - <permission>user</permission> -diff --git a/script/gspylib/inspection/config/scene_inspect.xml b/script/gspylib/inspection/config/scene_inspect.xml -index 463e4b7..3ba6da3 100644 ---- a/script/gspylib/inspection/config/scene_inspect.xml -+++ b/script/gspylib/inspection/config/scene_inspect.xml -@@ -40,12 +40,10 @@ - <item name="CheckSshdConfig"/> - <item name="CheckCrondService"/> - <item name="CheckStack"/> -- <item name="CheckNoCheckSum"/> - <item name="CheckSysPortRange"/> - <item name="CheckMemInfo"/> - <item name="CheckHyperThread"/> - <item name="CheckTableSpace"/> -- <item name="CheckSctpService"/> - <item name="CheckSysadminUser"/> - <item 
name="CheckGUCConsistent"/> - <item name="CheckMaxProcMemory"/> -diff --git a/script/gspylib/inspection/config/scene_install.xml b/script/gspylib/inspection/config/scene_install.xml -index a189193..42b9547 100644 ---- a/script/gspylib/inspection/config/scene_install.xml -+++ b/script/gspylib/inspection/config/scene_install.xml -@@ -12,13 +12,11 @@ - <item name="CheckStack"/> - <item name="CheckCrondService"/> - <item name="CheckSshdService"/> -- <item name="CheckSctpService"/> - <item name="CheckSysParams"> - <threshold> - version=V1R7C00 - </threshold> - </item> -- <item name="CheckNoCheckSum"/> - <item name="CheckDiskFormat"/> - <item name="CheckEtcHosts"/> - <item name="CheckHyperThread"/> -diff --git a/script/gspylib/inspection/config/scene_upgrade.xml b/script/gspylib/inspection/config/scene_upgrade.xml -index 426785a..7356a21 100644 ---- a/script/gspylib/inspection/config/scene_upgrade.xml -+++ b/script/gspylib/inspection/config/scene_upgrade.xml -@@ -23,7 +23,6 @@ - version=V1R7C00 - </threshold> - </item> -- <item name="CheckNoCheckSum"/> - <item name="CheckGUCValue"/> - <item name="CheckStack"/> - <item name="CheckDiskFormat"/> -diff --git a/script/gspylib/inspection/items/network/CheckNoCheckSum.py b/script/gspylib/inspection/items/network/CheckNoCheckSum.py -deleted file mode 100644 -index 64d0e52..0000000 ---- a/script/gspylib/inspection/items/network/CheckNoCheckSum.py -+++ /dev/null -@@ -1,71 +0,0 @@ --# -*- coding:utf-8 -*- --# Copyright (c) 2020 Huawei Technologies Co.,Ltd. --# --# openGauss is licensed under Mulan PSL v2. --# You can use this software according to the terms --# and conditions of the Mulan PSL v2. --# You may obtain a copy of Mulan PSL v2 at: --# --# http://license.coscl.org.cn/MulanPSL2 --# --# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, --# WITHOUT WARRANTIES OF ANY KIND, --# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, --# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. --# See the Mulan PSL v2 for more details. 
--# ---------------------------------------------------------------------------- --import os --import platform --from gspylib.inspection.common import SharedFuncs --from gspylib.inspection.common.CheckItem import BaseItem --from gspylib.inspection.common.CheckResult import ResultStatus --from gspylib.os.gsfile import g_file --from gspylib.os.gsnetwork import g_network --from gspylib.os.gsfile import g_Platform --from gspylib.common.ErrorCode import ErrorCode -- -- --class CheckNoCheckSum(BaseItem): -- def __init__(self): -- super(CheckNoCheckSum, self).__init__(self.__class__.__name__) -- -- def getOSversion(self): -- distname, version, idnum = g_Platform.dist() -- return distname, version -- -- def doCheck(self): -- if (not os.path.isfile("/sys/module/sctp/parameters/no_checksums")): -- self.result.rst = ResultStatus.OK -- self.result.val = "The SCTP service is not used and the" \ -- " check item is skipped" -- return -- expect = "N" -- if (self.cluster): -- LocalNodeInfo = self.cluster.getDbNodeByName(self.host) -- serviceIP = LocalNodeInfo.backIps[0] -- else: -- serviceIP = SharedFuncs.getIpByHostName(self.host) -- for network in g_network.getAllNetworkInfo(): -- if (network.ipAddress == serviceIP): -- networkCardNum = network.NICNum -- networkBond = network.networkBondModeInfo -- break -- if (not networkCardNum or not networkBond): -- raise Exception(ErrorCode.GAUSS_506["GAUSS_50619"]) -- (distname, version) = self.getOSversion() -- if ((distname in ("redhat", "centos")) and -- (version in ("6.4", "6.5")) and -- networkBond != "BondMode Null"): -- expect = "Y" -- -- output = \ -- g_file.readFile('/sys/module/sctp/parameters/no_checksums')[0] -- if (output.strip() == expect): -- self.result.rst = ResultStatus.OK -- self.result.val = "Nochecksum value is %s,Check items pass." \ -- % output.strip() -- else: -- self.result.rst = ResultStatus.NG -- self.result.val = "Nochecksum value(%s) is not %s," \ -- "Check items are not passed." \ -- % (output.strip(), expect) -diff --git a/script/gspylib/inspection/items/network/CheckUsedPort.py b/script/gspylib/inspection/items/network/CheckUsedPort.py -index 8a635ed..9718d96 100644 ---- a/script/gspylib/inspection/items/network/CheckUsedPort.py -+++ b/script/gspylib/inspection/items/network/CheckUsedPort.py -@@ -46,17 +46,9 @@ class CheckUsedPort(BaseItem): - - return int(tcpUsed) - -- def getSctpUsedPort(self): -- cmd = "cat /proc/net/sctp/assocs|" \ -- "awk '{print $12}'|sort|uniq -c |wc -l" -- sctpUsed = SharedFuncs.runShellCmd(cmd) -- -- return int(sctpUsed) -- - def doCheck(self): - portRange = self.getPortRange() - tcpUsed = self.getTcpUsedPort() -- sctpUsed = self.getSctpUsedPort() - defaultPortRange = 60000 - 32768 - if (portRange < defaultPortRange): - self.result.rst = ResultStatus.WARNING -@@ -70,14 +62,7 @@ class CheckUsedPort(BaseItem): - " not passed." % tcpUsed - return - -- if (sctpUsed > portRange * 0.8): -- self.result.rst = ResultStatus.WARNING -- self.result.val = "sctp port used is %s," \ -- "Check items are not passed." % sctpUsed -- return -- - self.result.rst = ResultStatus.OK - self.result.val = "port range is %s,tcp port used is %s," \ -- "sctp port used is %d,Check items pass." \ -- % (portRange, tcpUsed, sctpUsed) -+ "Check items pass." 
% (portRange, tcpUsed) - return -diff --git a/script/gspylib/inspection/items/os/CheckSctpService.py b/script/gspylib/inspection/items/os/CheckSctpService.py -deleted file mode 100644 -index 8e00810..0000000 ---- a/script/gspylib/inspection/items/os/CheckSctpService.py -+++ /dev/null -@@ -1,108 +0,0 @@ --# -*- coding:utf-8 -*- --# Copyright (c) 2020 Huawei Technologies Co.,Ltd. --# --# openGauss is licensed under Mulan PSL v2. --# You can use this software according to the terms --# and conditions of the Mulan PSL v2. --# You may obtain a copy of Mulan PSL v2 at: --# --# http://license.coscl.org.cn/MulanPSL2 --# --# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, --# WITHOUT WARRANTIES OF ANY KIND, --# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, --# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. --# See the Mulan PSL v2 for more details. --# ---------------------------------------------------------------------------- --import subprocess --import platform --from gspylib.inspection.common.CheckItem import BaseItem --from gspylib.inspection.common.CheckResult import ResultStatus --from gspylib.common.Common import DefaultValue --from gspylib.os.gsfile import g_Platform -- -- --class CheckSctpService(BaseItem): -- def __init__(self): -- super(CheckSctpService, self).__init__(self.__class__.__name__) -- -- def doCheck(self): -- -- parRes = "" -- flag = "Normal" -- cmd = "ls -l /lib/modules/`uname -r`/kernel/net/sctp/sctp.ko*" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0 or output == "" or output.find( -- "No such file or directory") > 0): -- if DefaultValue.checkDockerEnv(): -- return -- flag = "Error" -- parRes += "There is no sctp service." -- else: -- cmd = "modprobe sctp;" -- cmd += "lsmod |grep sctp" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (output == ""): -- flag = "Error" -- parRes += "sctp service is not loaded." -- -- cmd = "cat %s | grep '^insmod.*sctp.ko'" % DefaultValue.getOSInitFile() -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0 or output == ""): -- if (flag == "Normal"): -- flag = "Warning" -- parRes += "Sctp service is not set to boot from power on." -- -- self.result.val = parRes -- self.result.raw = output -- if (flag == "Error"): -- self.result.rst = ResultStatus.NG -- elif (flag == "Warning"): -- self.result.rst = ResultStatus.WARNING -- else: -- self.result.rst = ResultStatus.OK -- self.result.val = "Sctp service is Normal." 
-- -- def doSet(self): -- self.result.val = "" -- parRes = "" -- sctpFile = "" -- initFileSuse = "/etc/init.d/boot.local" -- initFileRedhat = "/etc/rc.d/rc.local" -- cmd = "ls -l /lib/modules/`uname -r`/kernel/net/sctp/sctp.ko*" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0 or output == "" or output.find( -- "No such file or directory") > 0): -- parRes = "There is no sctp service.\n" -- else: -- sctpFile = output.split()[-1] -- cmd = "modprobe sctp;" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0): -- cmd = "insmod %s >/dev/null 2>&1;lsmod |grep sctp" % sctpFile -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0 or output == "": -- parRes = "Failed to load sctp service.\n" -- distname, version, idnum = g_Platform.dist() -- if (distname in ["redhat", "centos", "euleros", "openEuler"]): -- cmd = "cat %s | grep sctp" % initFileRedhat -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0 or output == ""): -- cmd = "echo 'modprobe sctp' >> /etc/rc.d/rc.local;" -- cmd += "echo" \ -- " 'insmod %s >/dev/null 2>&1' >> /etc/rc.d/rc.local " \ -- % sctpFile -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0): -- parRes += "Failed to add sctp service to boot.\n" -- else: -- cmd = "cat %s | grep stcp" % initFileSuse -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0 or output == ""): -- cmd = "echo 'modprobe sctp' >> /etc/init.d/boot.local;" -- cmd += "echo '%s >/dev/null 2>&1' >> /etc/init.d/boot.local " \ -- % sctpFile -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0): -- parRes += "Failed to add sctp service to boot." -- self.result.val = parRes -diff --git a/script/gspylib/inspection/items/os/CheckSysParams.py b/script/gspylib/inspection/items/os/CheckSysParams.py -index 947ecc6..c15a627 100644 ---- a/script/gspylib/inspection/items/os/CheckSysParams.py -+++ b/script/gspylib/inspection/items/os/CheckSysParams.py -@@ -122,21 +122,13 @@ class CheckSysParams(BaseItem): - for key in kernelParameter: - if (patchlevel == "1" and key == "vm.extfrag_threshold"): - continue -- if (key == "sctpchecksumerrors"): -- snmpFile = "/proc/net/sctp/snmp" -- if (os.path.isfile(snmpFile)): -- output = \ -- g_file.readFile(snmpFile, 'SctpChecksumErrors')[ -- 0].split()[1].strip() -- else: -- continue -- else: -- sysFile = "/proc/sys/%s" % key.replace('.', '/') -- # High version of linux no longer supports tcp_tw_recycle -- if (not os.path.exists( -- sysFile) and key == "net.ipv4.tcp_tw_recycle"): -- continue -- output = g_file.readFile(sysFile)[0].strip() -+ -+ sysFile = "/proc/sys/%s" % key.replace('.', '/') -+ # High version of linux no longer supports tcp_tw_recycle -+ if (not os.path.exists( -+ sysFile) and key == "net.ipv4.tcp_tw_recycle"): -+ continue -+ output = g_file.readFile(sysFile)[0].strip() - if (len(output.split()) > 1): - output = ' '.join(output.split()) - -@@ -184,16 +176,6 @@ class CheckSysParams(BaseItem): - checkResultList = checkResult.split('\'') - setParameterList[checkResultList[1]] = checkResultList[5] - self.result.val = "" -- # The parameter sctpchecksumerrors set method is independent -- if ("sctpchecksumerrors" in setParameterList): -- cmd = "echo 1 > /sys/module/sctp/parameters/no_checksums" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0): -- self.result.val += " " \ -- " Failed to enforce sysctl kernel " \ -- "variable 'sctpchecksumerrors'. 
" \ -- "Error: %s" % output -- setParameterList.pop("sctpchecksumerrors") - - if (len(setParameterList) != 0): - for key in setParameterList: -diff --git a/script/impl/preinstall/PreinstallImpl.py b/script/impl/preinstall/PreinstallImpl.py -index a35e87a..908423f 100644 ---- a/script/impl/preinstall/PreinstallImpl.py -+++ b/script/impl/preinstall/PreinstallImpl.py -@@ -54,8 +54,6 @@ ACTION_PREPARE_USER_CRON_SERVICE = "prepare_user_cron_service" - ACTION_PREPARE_USER_SSHD_SERVICE = "prepare_user_sshd_service" - # set the dynamic link library - ACTION_SET_LIBRARY = "set_library" --# set sctp service --ACTION_SET_SCTP = "set_sctp" - # set virtual Ip - ACTION_SET_VIRTUALIP = "set_virtualIp" - # clean virtual Ip -@@ -1485,38 +1483,6 @@ class PreinstallImpl: - """ - pass - -- def setSctp(self): -- """ -- function: setting SCTP service -- input: NA -- output: NA -- """ -- self.context.logger.log("Setting SCTP service.", "addStep") -- try: -- # set SCTP service -- cmd = "%s -t %s -u %s -l %s" % ( -- OMCommand.getLocalScript("Local_PreInstall"), -- ACTION_SET_SCTP, -- self.context.user, -- self.context.localLog) -- # check the mpprcFile -- if self.context.mpprcFile != "": -- cmd += " -s '%s'" % self.context.mpprcFile -- self.context.logger.debug("Command for setting SCTP: %s" % cmd) -- -- # exec cmd for set SCTP -- DefaultValue.execCommandWithMode( -- cmd, -- "set SCTP", -- self.context.sshTool, -- self.context.localMode or self.context.isSingle, -- self.context.mpprcFile) -- except Exception as e: -- # failed set SCTP service -- raise Exception(str(e)) -- # Successfully set SCTP service -- self.context.logger.log("Successfully set SCTP service.", "constant") -- - def setVirtualIp(self): - """ - function: set the virtual IPs -@@ -1893,10 +1859,6 @@ class PreinstallImpl: - self.checkOSVersion() - # create path and set mode - self.createDirs() -- -- # set Sctp -- if not DefaultValue.checkDockerEnv(): -- self.setSctp() - # set os parameters - self.setAndCheckOSParameter() - # prepare cron service for user -diff --git a/script/local/LocalCheck.py b/script/local/LocalCheck.py -index 82a9efb..6e5cb6e 100644 ---- a/script/local/LocalCheck.py -+++ b/script/local/LocalCheck.py -@@ -47,8 +47,7 @@ actioItemMap = { - - docker_no_need_check = ["net.core.wmem_max", "net.core.rmem_max", - "net.core.wmem_default", "net.core.rmem_default", -- "net.sctp.sctp_mem", "net.sctp.sctp_rmem", -- "net.sctp.sctp_wmem", "net.core.netdev_max_backlog", -+ "net.core.netdev_max_backlog", - "net.ipv4.tcp_max_tw_buckets", "net.ipv4.tcp_tw_reuse", - "net.ipv4.tcp_tw_recycle", "net.ipv4.tcp_retries2", - "net.ipv4.ip_local_reserved_ports", "net.ipv4.tcp_rmem", -@@ -239,12 +238,7 @@ def checkSysctlParameter(kernelParameter, isSet): - continue - if (DefaultValue.checkDockerEnv() and key in docker_no_need_check): - continue -- # The parameter sctpchecksumerrors check method is independent -- if (key == "sctpchecksumerrors"): -- cmd = "cat /proc/net/sctp/snmp | grep SctpChecksumErrors" \ -- " | awk '{print $2}'" -- else: -- cmd = "cat %s" % ("/proc/sys/%s" % key.replace('.', '/')) -+ cmd = "cat %s" % ("/proc/sys/%s" % key.replace('.', '/')) - (status, output) = subprocess.getstatusoutput(cmd) - if (status == 0): - if (key == "vm.min_free_kbytes" -@@ -315,15 +309,6 @@ def setOSParameter(setParameterList, patchlevel): - # vm.extfrag_threshold parameter, skip set - if ("vm.extfrag_threshold" in setParameterList and patchlevel == "1"): - setParameterList.pop("vm.extfrag_threshold") -- # The parameter sctpchecksumerrors set method is 
independent -- if ("sctpchecksumerrors" in setParameterList): -- cmd = "echo 1 > /sys/module/sctp/parameters/no_checksums" -- (status, output) = subprocess.getstatusoutput(cmd) -- if (status != 0): -- g_logger.debug("The cmd is %s " % cmd) -- g_logger.log(" Failed to enforce sysctl kernel variable" -- " 'sctpchecksumerrors'. Error: %s" % output) -- setParameterList.pop("sctpchecksumerrors") - - if (len(setParameterList) != 0): - g_logger.debug("Setting sysctl parameter.") -@@ -332,7 +317,7 @@ def setOSParameter(setParameterList, patchlevel): - g_logger.log(" Set variable '%s' to '%s'" - % (key, setParameterList[key])) - cmd = "sysctl -p" -- (status, output) = subprocess.getstatusoutput(cmd) -+ (status, _) = subprocess.getstatusoutput(cmd) - if (status != 0): - cmderrorinfo = "sysctl -p | grep 'No such file or directory'" - (status, outputresult) = subprocess.getstatusoutput(cmderrorinfo) -diff --git a/script/local/PreInstallUtility.py b/script/local/PreInstallUtility.py -index cbe2a59..b4071f3 100644 ---- a/script/local/PreInstallUtility.py -+++ b/script/local/PreInstallUtility.py -@@ -55,7 +55,6 @@ ACTION_SET_TOOL_ENV = "set_tool_env" - ACTION_PREPARE_USER_CRON_SERVICE = "prepare_user_cron_service" - ACTION_PREPARE_USER_SSHD_SERVICE = "prepare_user_sshd_service" - ACTION_SET_LIBRARY = "set_library" --ACTION_SET_SCTP = "set_sctp" - ACTION_SET_VIRTUALIP = "set_virtualIp" - ACTION_CHECK_HOSTNAME_MAPPING = "check_hostname_mapping" - ACTION_INIT_GAUSSLOG = "init_gausslog" -@@ -256,7 +255,7 @@ Common options: - GaussLog.exitWithError(str(e)) - parameter_list = [ACTION_CHECK_OS_VERSION, ACTION_SET_FINISH_FLAG, - ACTION_SET_USER_ENV, ACTION_SET_LIBRARY, \ -- ACTION_SET_SCTP, ACTION_PREPARE_USER_CRON_SERVICE, -+ ACTION_PREPARE_USER_CRON_SERVICE, - ACTION_PREPARE_USER_SSHD_SERVICE, \ - ACTION_SET_VIRTUALIP, ACTION_INIT_GAUSSLOG, - ACTION_CHECK_ENVFILE, ACTION_CHECK_OS_SOFTWARE, \ -@@ -1981,88 +1980,6 @@ Common options: - self.logger.logExit(str(e)) - self.logger.debug("Successfully set ARM Optimization.") - -- def setSctp(self): -- """ -- function: Setting SCTP -- input : NA -- output: NA -- """ -- self.logger.debug("Setting SCTP.") -- try: -- -- key = "install ipv6 \/bin\/true" -- confFile = "/etc/modprobe.d/*ipv6.conf" -- -- initFile = DefaultValue.getOSInitFile() -- cmd = "ls %s" % confFile -- (status, output) = subprocess.getstatusoutput(cmd) -- if status == 0: -- cmd = "sed -i 's/^.*\(%s.*\)/#\\1/g' %s" % (key, confFile) -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_502["GAUSS_50223"] -- % confFile + " Error: \n%s" % output) -- cmd = "modprobe ipv6" -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd -- + " Error: \n%s" % output) -- cmd = "modprobe sctp" -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd -- + " Error: \n%s" % output) -- -- cmd = "uname -r" -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd -- + " Error: \n%s" % output) -- -- # Since redhat7.4 kernel module files ending in .xz -- stcpFile = "/lib/modules/%s/kernel/net/sctp/sctp.ko" \ -- % output.strip() -- stcpFileXz = "/lib/modules/%s/kernel/net/sctp/sctp.ko.xz" \ -- % output.strip() -- if (not os.path.exists(stcpFile)) and \ -- (not os.path.exists(stcpFileXz)): -- output = stcpFile + " and " + stcpFileXz -- 
self.logger.logExit(ErrorCode.GAUSS_502["GAUSS_50201"] -- % output) -- -- cmd_insmod = "insmod %s >/dev/null 2>&1" % stcpFileXz -- (status, output) = subprocess.getstatusoutput(cmd_insmod) -- -- cmd_insmod = "insmod %s >/dev/null 2>&1" % stcpFile -- (status, output) = subprocess.getstatusoutput(cmd_insmod) -- -- cmd = "lsmod | grep 'sctp ' | wc -l" -- (status, output) = subprocess.getstatusoutput(cmd) -- if not str(output.strip()).isdigit() or int(output.strip()) == 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd -- + " Error: \n%s" % output) -- -- init_cmd = "sed -i '/^modprobe sctp$/d' %s &&" % initFile -- init_cmd += "echo \"modprobe sctp\" >> %s &&" % initFile -- init_cmd += "sed -i '/^insmod.*sctp.ko/d' %s &&" % initFile -- init_cmd += "echo \"%s\" >> %s" % (cmd_insmod, initFile) -- (status, output) = subprocess.getstatusoutput(init_cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] -- % init_cmd + " Error: \n%s" % output) -- -- cmd = "sed -i \"/^sysctl -p/d\" %s &&" % initFile -- cmd += "echo \"sysctl -p\" >> %s" % initFile -- (status, output) = subprocess.getstatusoutput(cmd) -- if status != 0: -- self.logger.logExit(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd -- + " Error: \n%s" % output) -- -- except Exception as e: -- self.logger.logExit(str(e)) -- -- self.logger.debug("Successfully set Sctp.") -- - def checkVirtualIp(self): - """ - function: Checking virtual IP -@@ -2915,8 +2832,6 @@ Common options: - self.prepareUserSshdService() - elif self.action == ACTION_SET_LIBRARY: - self.setLibrary() -- elif self.action == ACTION_SET_SCTP: -- self.setSctp() - elif self.action == ACTION_SET_VIRTUALIP: - DefaultValue.modifyFileOwnerFromGPHOME(self.logger.logFile) - self.setVirtualIp() --- -2.6.4.windows.1 - diff --git a/script/base_diff/sql_commands.py b/script/base_diff/sql_commands.py index 0a185f2..066eddc 100644 --- a/script/base_diff/sql_commands.py +++ b/script/base_diff/sql_commands.py @@ -16,12 +16,16 @@ class SqlCommands: @staticmethod def getSQLCommand(port, database=ConstantsBase.DEFAULT_DB_NAME, - gsqlBin="gsql"): + gsqlBin="gsql", user_name="", user_pwd=""): """ function : get SQL command input : port, database output : cmd """ + if user_name and user_pwd: + cmd = ConstantsBase.SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER % ( + gsqlBin, str(port), database, user_name, user_pwd) + return cmd cmd = ConstantsBase.SQL_EXEC_COMMAND_WITHOUT_HOST_WITHOUT_USER % ( gsqlBin, str(int(port) + 1), database) return cmd diff --git a/script/base_utils/common/constantsbase.py b/script/base_utils/common/constantsbase.py index 127737b..496abdf 100644 --- a/script/base_utils/common/constantsbase.py +++ b/script/base_utils/common/constantsbase.py @@ -59,4 +59,4 @@ class ConstantsBase: #SQL_EXEC_COMMAND SQL_EXEC_COMMAND_WITHOUT_HOST_WITHOUT_USER = "%s -p %s -d %s " - SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER = "%s -p %s -d %s -U %s -W %s " + SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER = "%s -p %s -d %s -U %s -W '%s' " diff --git a/script/base_utils/os/cmd_util.py b/script/base_utils/os/cmd_util.py index c5a9d74..96465e9 100644 --- a/script/base_utils/os/cmd_util.py +++ b/script/base_utils/os/cmd_util.py @@ -24,6 +24,8 @@ import subprocess import threading import time from subprocess import PIPE, Popen +from datetime import datetime +from datetime import timedelta import pwd from gspylib.common.ErrorCode import ErrorCode from base_utils.common.exceptions import CommandNotFoundException @@ -575,6 +577,21 @@ class CmdUtil(object): break return status, output + 
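# ----------------------------------------------------------------------------
# [Editor's note] Illustrative, standalone sketch -- not part of this patch.
# It restates the getSQLCommand() change from sql_commands.py/constantsbase.py
# earlier in this patch, to show which gsql command string each branch builds.
# The two template strings are copied from the diff; the function name and the
# sample values below are hypothetical.
def sketch_get_sql_command(port, database, gsql_bin="gsql",
                           user_name="", user_pwd=""):
    without_user = "%s -p %s -d %s "             # SQL_EXEC_COMMAND_WITHOUT_HOST_WITHOUT_USER
    with_user = "%s -p %s -d %s -U %s -W '%s' "  # SQL_EXEC_COMMAND_WITHOUT_HOST_WITH_USER
    if user_name and user_pwd:
        # credentials supplied (e.g. a hadr user): connect on the given port as that user
        return with_user % (gsql_bin, str(port), database, user_name, user_pwd)
    # no credentials: original behaviour is kept, local connection on port + 1
    return without_user % (gsql_bin, str(int(port) + 1), database)


# sketch_get_sql_command(25400, "postgres")                      -> "gsql -p 25401 -d postgres "
# sketch_get_sql_command(25400, "postgres", "gsql", "u1", "pwd") -> "gsql -p 25400 -d postgres -U u1 -W 'pwd' "
# ----------------------------------------------------------------------------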
@staticmethod + def retry_util_timeout(cmd, timeout, sleep_time=1): + """ + retry execute cmd with giving timeout. + """ + end_time = datetime.now() + timedelta(seconds=int(timeout)) + status, output = 1, 1 + while datetime.now() < end_time: + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) + if status == 0: + break + else: + time.sleep(sleep_time) + return status, output + @staticmethod def getstatusoutput_by_fast_popen(cmd): """ diff --git a/script/base_utils/os/file_util.py b/script/base_utils/os/file_util.py index 3272680..5b71293 100644 --- a/script/base_utils/os/file_util.py +++ b/script/base_utils/os/file_util.py @@ -24,6 +24,7 @@ import stat import subprocess import pwd import grp +import json from subprocess import PIPE from base_utils.common.constantsbase import ConstantsBase @@ -299,6 +300,27 @@ class FileUtil(object): lock.release() return True + @staticmethod + def write_update_file(file_path, content, authority, is_json=True): + """ + Write or update file, create if not exist. + """ + with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + authority), "w") as fp_write: + if is_json: + json.dump(content, fp_write) + else: + fp_write.write(content) + + @staticmethod + def write_add_file(file_path, content, authority): + """ + Write or add content in file, create if not exist. + """ + if not os.path.isfile(file_path): + FileUtil.createFileInSafeMode(file_path, mode=authority) + FileUtil.writeFile(file_path, [content]) + @staticmethod def withAsteriskPath(path): """ diff --git a/script/base_utils/security/security_checker.py b/script/base_utils/security/security_checker.py index b5563b1..d28f74e 100644 --- a/script/base_utils/security/security_checker.py +++ b/script/base_utils/security/security_checker.py @@ -6,14 +6,28 @@ # Date : 2021-06-30 # Description : security_checker.py check security conditions ############################################################################# - +import re from gspylib.common.ErrorCode import ErrorCode +class ValidationError(Exception): + """ + validation base error + """ + def __init__(self, error_info): + super().__init__(self) + self.error_info = error_info + + def __str__(self): + return self.error_info + + class SecurityChecker(object): """check security conditions""" INJECTION_CHAR_LIST = ["|", ";", "&", "$", "<", ">", "`", "\\", "'", "\"", "{", "}", "(", ")", "[", "]", "~", "*", "?", " ", "!", "\n"] + PWD_VALIDATION_PATTERN = r'^[A-Za-z0-9~!@#%^*\-_=+?,]+$' + IP_PATTERN = r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$' @staticmethod def check_injection_char(check_value): @@ -27,3 +41,83 @@ class SecurityChecker(object): if any(rac in check_value for rac in SecurityChecker.INJECTION_CHAR_LIST): raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % check_value + " There are illegal characters.") + + @staticmethod + def check_is_string(description, value): + """ + Check is string + """ + if not isinstance(value, str): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'string')) + + @staticmethod + def check_max_length(description, value, max_length): + """ + Check max length + """ + if len(value) > max_length: + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50023"] % (description, max_length)) + + @staticmethod + def check_db_injection(description, value): + """ + Check db injection + """ + for rac in SecurityChecker.INJECTION_CHAR_LIST: + if value.find(rac) > 0: + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50025'] % (rac, description)) + + 
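# ----------------------------------------------------------------------------
# [Editor's note] Illustrative, standalone sketch -- not part of this patch.
# It shows how the new SecurityChecker helpers above (check_is_string,
# check_max_length, check_db_injection, each raising ValidationError) chain
# together for one field, in the same order used by check_db_user() just below.
# All names here are hypothetical stand-ins; only the checking order and the
# injection character list come from the diff. One observation: the diff's
# check_db_injection tests `value.find(rac) > 0`, so an illegal character at
# index 0 is not flagged; a membership test (as in check_injection_char) would be.
INJECTION_CHARS = ["|", ";", "&", "$", "<", ">", "`", "\\", "'", "\"",
                   "{", "}", "(", ")", "[", "]", "~", "*", "?", " ", "!", "\n"]


class SketchValidationError(Exception):
    """Stand-in for the ValidationError class added in security_checker.py."""


def sketch_check_db_user(description, value, max_length=256):
    """Type check, then length check, then injection-character check."""
    if not isinstance(value, str):
        raise SketchValidationError("%s must be a string" % description)
    if len(value) > max_length:
        raise SketchValidationError("%s exceeds %d characters" % (description, max_length))
    for char in INJECTION_CHARS:
        if char in value:
            raise SketchValidationError("illegal character %r in %s" % (char, description))


# sketch_check_db_user("hadr user name", "hadr_repl_user")   # passes silently
# sketch_check_db_user("hadr user name", "drop;table")       # raises SketchValidationError
# ----------------------------------------------------------------------------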
@staticmethod + def check_password(description, value): + if not re.match(SecurityChecker.PWD_VALIDATION_PATTERN, value): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50024"] % description) + + @staticmethod + def check_db_user(description, value): + SecurityChecker.check_is_string(description, value) + SecurityChecker.check_max_length(description, value, 256) + SecurityChecker.check_db_injection(description, value) + + @staticmethod + def check_db_password(description, value): + SecurityChecker.check_is_string(description, value) + SecurityChecker.check_max_length(description, value, 256) + SecurityChecker.check_password(description, value) + + @staticmethod + def check_is_digit(description, value): + if isinstance(value, int): + return + elif isinstance(value, str): + if not value.isdigit(): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] + % (description, 'integer')) + else: + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] + % (description, 'int or string')) + + @staticmethod + def check_is_list(description, value): + if not isinstance(value, list): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'list')) + + @staticmethod + def check_is_dict(description, value): + if not isinstance(value, dict): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] % (description, 'dict')) + + @staticmethod + def check_ip_valid(description, value): + SecurityChecker.check_is_string(description, value) + if not re.match(SecurityChecker.IP_PATTERN, value): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50024"] % description) + + @staticmethod + def check_port_valid(description, value): + SecurityChecker.check_is_digit(description, value) + value = int(value) if not isinstance(value, int) else value + if value > 65535 or value < 0: + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50022'] + % (description, 'between 0 and 65535')) + + diff --git a/script/gs_checkperf b/script/gs_checkperf index c451293..ea82f83 100644 --- a/script/gs_checkperf +++ b/script/gs_checkperf @@ -87,7 +87,7 @@ class Checkperf(): def usage(self): """ -gs_checkperf is a utility to check the cluster performance and SSD performance. +gs_checkperf is a utility to check the cluster performance and SSD performance, streaming disaster cluster does not yet support. Usage: gs_checkperf -? | --help @@ -253,6 +253,11 @@ General options: binPath = os.path.join(self.clusterInfo.appPath, "bin") g_opts.databaseSizeFile = os.path.join(binPath, DefaultValue.DB_SIZE_FILE) + is_disaster_cluster = \ + DefaultValue.cm_exist_and_is_disaster_cluster(self.clusterInfo, g_logger) + if is_disaster_cluster: + GaussLog.exitWithError( + ErrorCode.GAUSS_512["GAUSS_51244"] % " Disaster cluster") except Exception as e: g_logger.logExit(str(e)) diff --git a/script/gs_dropnode b/script/gs_dropnode index 49fc3fc..befb1aa 100644 --- a/script/gs_dropnode +++ b/script/gs_dropnode @@ -86,7 +86,7 @@ class Dropnode(ParallelBaseOM): def usage(self): """ -gs_dropnode is a utility to delete the standby node from a cluster. +gs_dropnode is a utility to delete the standby node from a cluster, streaming cluster does not yet support. Usage: gs_dropnode -? 
| --help @@ -335,6 +335,7 @@ if __name__ == "__main__": dropNode = Dropnode() dropNode.parseCommandLine() dropNode.initLogs() + DefaultValue.check_is_streaming_dr_cluster() dropNode.check_repeat_process() dropNode.checkParameters() dropNode.checkConnection(list(dropNode.backIpNameMap.keys()), diff --git a/script/gs_expansion b/script/gs_expansion index 6f5444b..1026ce5 100644 --- a/script/gs_expansion +++ b/script/gs_expansion @@ -87,7 +87,7 @@ class Expansion(ParallelBaseOM): def usage(self): """ -gs_expansion is a utility to expansion standby node for a cluster. +gs_expansion is a utility to expansion standby node for a cluster, streaming cluster does not yet support. Usage: gs_expansion -? | --help diff --git a/script/gs_sdr b/script/gs_sdr new file mode 100644 index 0000000..6335689 --- /dev/null +++ b/script/gs_sdr @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : gs_sdr is a utility for streaming +# disaster recovery fully options. +############################################################################# + +import os +import uuid + +from gspylib.common.Common import DefaultValue +from gspylib.common.ErrorCode import ErrorCode +from gspylib.common.GaussLog import GaussLog +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants +from base_utils.os.user_util import UserUtil +from domain_utils.cluster_file.cluster_log import ClusterLog +from impl.streaming_disaster_recovery.params_handler import ParamsHandler +from impl.streaming_disaster_recovery.streaming_modules.\ + streaming_diaster_recovery_start import StreamingStartHandler +from impl.streaming_disaster_recovery.streaming_modules.\ + streaming_disaster_recovery_stop import StreamingStopHandler +from impl.streaming_disaster_recovery.streaming_modules.\ + streaming_disaster_recovery_failover import StreamingFailoverHandler +from impl.streaming_disaster_recovery.streaming_modules.\ + streaming_disaster_recovery_switchover import StreamingSwitchoverHandler +from impl.streaming_disaster_recovery.streaming_modules.\ + streaming_disaster_recovery_query import StreamingQueryHandler + +HANDLER_MAPPING = { + "start": StreamingStartHandler, + "stop": StreamingStopHandler, + "switchover": StreamingSwitchoverHandler, + "failover": StreamingFailoverHandler, + "query": StreamingQueryHandler +} + + +class StreamingDisasterRecoveryBase(object): + def __init__(self): + self.params = None + self.user = None + self.log_file = None + self.logger = None + self.trace_id = uuid.uuid1().hex + StreamingDisasterRecoveryBase.mock_process_user_sensitive_info() + self.__init_globals() + + @staticmethod + def mock_process_user_sensitive_info(): + """mock_process_user_sensitive_info""" + cmdline = DefaultValue.get_proc_title("-W") + DefaultValue.set_proc_title(cmdline) + + def __init_globals(self): + self.user = 
UserUtil.getUserInfo()['name'] + tmp_logger_file = ClusterLog.getOMLogPath(StreamingConstants.STREAMING_LOG_FILE, self.user) + tmp_logger = GaussLog(tmp_logger_file, 'parse_and_validate_params', trace_id=self.trace_id) + self.params = ParamsHandler(tmp_logger, self.trace_id).get_valid_params() + self.log_file = self.params.logFile if self.params.logFile else \ + ClusterLog.getOMLogPath(StreamingConstants.STREAMING_LOG_FILE, self.user) + self.logger = GaussLog(self.log_file, self.params.task, trace_id=self.trace_id) + + +if __name__ == '__main__': + if os.getuid() == 0: + GaussLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50105"]) + + base = StreamingDisasterRecoveryBase() + handler = HANDLER_MAPPING[base.params.task](base.params, base.user, + base.logger, base.trace_id, base.log_file) + handler.handle_lock_file(handler.trace_id, 'create') + try: + if base.params.task in StreamingConstants.TASK_EXIST_CHECK: + handler.check_streaming_process_is_running() + handler.run() + except Exception as error: + handler.logger.error(error) + raise Exception(str(error)) + finally: + handler.handle_lock_file(handler.trace_id, 'remove') diff --git a/script/gs_upgradectl b/script/gs_upgradectl index 202b192..d5c0815 100644 --- a/script/gs_upgradectl +++ b/script/gs_upgradectl @@ -45,6 +45,7 @@ import pwd import grp import copy import re +import json from gspylib.common.Common import DefaultValue from gspylib.common.GaussLog import GaussLog @@ -60,6 +61,23 @@ from base_utils.os.net_util import NetUtil from domain_utils.domain_common.cluster_constants import ClusterConstants +class DualUpgradeShareInfo: + """ + Used to record the upgrade status information of the primary and standby clusters + + """ + + def __init__(self, jsonInfo=None): + # If the Json string is passed in, the Json information is used to initialize the class + if jsonInfo: + self.__dict__ = jsonInfo + else: + self.masterVersion = "" + self.masterUpgradeStatus = 0 + self.standbyVersion = "" + self.standbyUpgradeStatus = 0 + + class Upgrade(ParallelBaseOM): """ The class about upgrade @@ -90,6 +108,14 @@ class Upgrade(ParallelBaseOM): self.oldClusterNumber = None self.forceRollback = False self.upgrade_remain = False + # Record the upgrade status information under dual clusters + self.dualUpgradeShareInfo = None + # Record the primary cluster or the standby cluster, dual-primary or dual-standby + self.clusterType = "" + # Whether it is a standby cluster in a dual cluster. 
Convenient to judge + self.standbyCluster = False + # The path to record the information of each cluster upgrade stage in the dual cluster + self.upgradePhaseInfoPath = "" def usage(self): """ @@ -153,6 +179,10 @@ Option for grey upgrade self.upgrade_remain = True if "force" in ParaDict.keys(): self.forceRollback = True + self.tmpDir = EnvUtil.getTmpDirFromEnv() + if self.tmpDir == "": + raise Exception(ErrorCode.GAUSS_518["GAUSS_51800"] % "$PGHOST") + self.upgradePhaseInfoPath = os.path.join(self.tmpDir, Const.UPGRADE_PHASE_INFO) def checkUser(self): """ @@ -299,6 +329,69 @@ Option for grey upgrade raise Exception(ErrorCode.GAUSS_516["GAUSS_51619"] % nodeName) self.logger.debug("Successfully init global infos") + # If it is a dual-cluster, initialize the related information of the dual-cluster + self.initDualUpgradeInfo() + + def initDualUpgradeInfo(self): + """ + initialize dual cluster upgrade status information + If it is not a dual cluster, do not initialize + :return: + """ + if os.path.exists(self.upgradePhaseInfoPath): + if self.is_inplace_upgrade and self.action not in \ + ["commit-upgrade", "auto-rollback", "chose-strategy"]: + raise Exception("Dual cluster does not support in-place upgrade") + self.dualUpgradeShareInfo = self.getDualUpgradeInfo(self.upgradePhaseInfoPath, + startPost=0) + if not self.dualUpgradeShareInfo: + self.dualUpgradeShareInfo = DualUpgradeShareInfo() + + @staticmethod + def getDualUpgradeInfo(filePath, startPost): + """ + Obtain the dual-cluster upgrade status information from the file, + and return None if there is no record + :return: + """ + if os.path.exists(filePath): + lenInfo = 0 + with open(filePath, 'r') as shareInfo: + shareInfo.seek(startPost) + length = shareInfo.read(4) + if length > '': + try: + lenInfo = int(length) + except Exception as _: + lenInfo = 0 + if lenInfo > 0: + shareInfo.seek(startPost + 4) + return json.loads(shareInfo.read(lenInfo), object_hook=DualUpgradeShareInfo) + return None + + def updateDualUpgradeInfo(self, dualUpgradeShareInfo, filePath, startPost): + """ + Update the upgrade information of the cluster to the dual-cluster + shared file /dev/my_disk_sync_disk file + :return: + """ + if os.path.exists(filePath): + with os.fdopen(os.open(filePath, os.O_WRONLY, 0o600), "w") as shareInfo: + shareInfo.seek(startPost + Const.LENGTH_STORAGE_INFO_LEN) + shareInfo.write(json.dumps(dualUpgradeShareInfo, default=lambda obj: obj.__dict__)) + length = shareInfo.tell() - (startPost + Const.LENGTH_STORAGE_INFO_LEN) + shareInfo.seek(startPost, 0) + shareInfo.write("{0:04d}".format(length)) + + # After the status file is updated, the standby cluster + # distributes the updated status file to the data directory of the DN. 
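# ----------------------------------------------------------------------------
# [Editor's note] Illustrative, standalone sketch -- not part of this patch.
# It mimics the record layout used by getDualUpgradeInfo()/updateDualUpgradeInfo()
# above: a 4-character, zero-padded decimal length field at `start_post`,
# followed by the JSON dump of DualUpgradeShareInfo. This assumes
# Const.LENGTH_STORAGE_INFO_LEN == 4, which the reader's `startPost + 4` implies;
# a plain dict stands in for the DualUpgradeShareInfo object.
import json
import os


def sketch_write_record(path, info_dict, start_post=0):
    # write the JSON payload first, then go back and patch in its length
    with open(path, "r+" if os.path.exists(path) else "w+") as share:
        share.seek(start_post + 4)
        share.write(json.dumps(info_dict))
        length = share.tell() - (start_post + 4)
        share.seek(start_post)
        share.write("{0:04d}".format(length))


def sketch_read_record(path, start_post=0):
    # read the 4-character length field, then that many characters of JSON
    with open(path, "r") as share:
        share.seek(start_post)
        length = int(share.read(4) or 0)
        share.seek(start_post + 4)
        return json.loads(share.read(length)) if length else None


# sketch_write_record("/tmp/upgrade_phase_info", {"masterVersion": "92.501",
#                                                 "masterUpgradeStatus": 1})
# sketch_read_record("/tmp/upgrade_phase_info")
# ----------------------------------------------------------------------------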
+ for dbNode in self.clusterInfo.dbNodes: + for dnInst in dbNode.datanodes: + self.sshTool.scpFiles(filePath, dnInst.datadir, + hostList=[dnInst.hostname]) + else: + raise Exception("{0} file does not exist and cannot be updated".format(filePath)) + def distributeFileToSpecialNode(self, file, destDir, hostList): """ distribute file to special node diff --git a/script/gspylib/common/Common.py b/script/gspylib/common/Common.py index 796073c..0e9da07 100644 --- a/script/gspylib/common/Common.py +++ b/script/gspylib/common/Common.py @@ -17,6 +17,7 @@ # ---------------------------------------------------------------------------- # Description : Common is a utility with a lot of common functions ############################################################################# +import ctypes import sys import subprocess import os @@ -28,6 +29,7 @@ import time import multiprocessing import _thread as thread import pwd +import json import base64 import secrets import string @@ -35,6 +37,7 @@ import stat import csv import copy from subprocess import PIPE +from subprocess import Popen # The installation starts, but the package is not decompressed completely. # The lib64/libz.so.1 file is incomplete, and the hashlib depends on the @@ -106,6 +109,7 @@ from base_utils.os.cmd_util import CmdUtil from base_utils.os.env_util import EnvUtil from base_utils.os.file_util import FileUtil from domain_utils.cluster_file.version_info import VersionInfo +from domain_utils.cluster_file.cluster_dir import ClusterDir from domain_utils.security.random_value import RandomValue from base_utils.os.process_util import ProcessUtil from domain_utils.sql_handler.sql_executor import SqlExecutor @@ -199,6 +203,7 @@ class DefaultValue(): FILE_MODE = 640 FILE_MODE_PERMISSION = 0o640 KEY_FILE_MODE = 600 + KEY_FILE_MODE_IN_OS = 0o600 MIN_FILE_MODE = 400 SPE_FILE_MODE = 500 KEY_DIRECTORY_MODE = 700 @@ -318,6 +323,9 @@ class DefaultValue(): # FI_ELK_KRB_XML is used in elk FI_ELK_KRB_XML = "auth_config/elk-krb-site.xml" FI_KRB_CONF = "krb5.conf" + # cluster status + CLUSTER_STATUS_NORMAL = "Normal" + CLUSTER_STATUS_DEGRADED = "Degraded" ########################### # instance role ########################### @@ -615,6 +623,60 @@ class DefaultValue(): return NetWorkConfFile + @staticmethod + def get_remote_ips(host, mpp_file): + """ + Get ips from remote host + """ + cmd = "source %s && pssh -s -t 30 -H %s \"hostname -I\"" % (mpp_file, host) + status, output = subprocess.getstatusoutput(cmd) + if status == 0 and output != "": + ips = output.strip().split() + return ips + else: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check remote ips for node:%s, Error:%s." % (host, output)) + + @staticmethod + def obtain_file_content(dest_file, deduplicate=True, is_list=True): + """ + function:obtains the content of each line in the file. + input: file dir + :return: file context lines list + """ + result = [] if is_list else None + if not os.path.isfile(dest_file): + return result + with open(dest_file, "r") as fp_read: + if is_list: + for line in fp_read: + result.append(line.strip('\n')) + else: + result = fp_read.read().strip() + if deduplicate and is_list: + result = list(set(result)) + return result + + @staticmethod + def get_all_dn_num_for_dr(file_path, dn_inst, cluster_info, logger): + """get_all_dn_num_for_dr_cluster""" + # DN inst supports a maximum of replicaNum=8 in postgresql.conf. 
+ default_num = 8 + content = DefaultValue.obtain_file_content(file_path, is_list=False) + if content: + default_num = 0 + shards = json.loads(content)['remoteClusterConf']["shards"] + logger.debug("Stream cluster json shards:%s" % shards) + if cluster_info.isSingleInstCluster(): + for shard in shards: + default_num += len(shard) + else: + default_num += len(shards[0]) + peer_insts = cluster_info.getPeerInstance(dn_inst) + default_num += len(peer_insts) + logger.debug("Get config replconninfo dn num:%s" % default_num) + return default_num + @staticmethod def getIpByHostName(): ''' @@ -1616,6 +1678,45 @@ class DefaultValue(): noPassIPs.append(ip) g_lock.release() + @staticmethod + def fast_ping(node_ip): + """ + ping node with short timeout + """ + cmd = "ping %s -c 1 -w 4" % node_ip + proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE, preexec_fn=os.setsid, close_fds=True) + proc.communicate() + status = proc.returncode + result = (node_ip, True) if status == 0 else (node_ip, False) + return result + + @staticmethod + def fast_ping_on_node(on_node, from_ip, to_ip, logger): + """ + Ping on remote node with -I + """ + cmd = "ping %s -c 1 -w 4" % on_node + proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE, + preexec_fn=os.setsid, close_fds=True) + proc.communicate() + status = proc.returncode + if status != 0: + logger.debug("Node:%s ping failed, can not execute remote check." % on_node) + return on_node, False + if on_node == NetUtil.GetHostIpOrName(): + cmd_remote = "ping %s -I %s -c 1 -w 4" % (to_ip, from_ip) + else: + cmd_remote = "source %s && pssh -s -H %s 'ping %s -I %s -c 1 -w 4'" \ + % (EnvUtil.getMpprcFile(), on_node, to_ip, from_ip) + proc = FastPopen(cmd_remote, stdout=PIPE, stderr=PIPE, + preexec_fn=os.setsid, close_fds=True) + proc.communicate() + status = proc.returncode + result = (to_ip, True) if status == 0 else (to_ip, False) + logger.debug("Remote ping result on node:%s, from ip:%s, to ip:%s, result:%s." + % (on_node, from_ip, to_ip, result)) + return result + @staticmethod def checkIsPing(ips): """ @@ -2259,7 +2360,7 @@ class DefaultValue(): "Command:%s. Error:\n%s" % (cmd, output)) targetString = output.split("Datanode")[1] dnPrimary = [x for x in re.split(r"[|\n]", targetString) - if flagStr in x] + if flagStr in x or "Main" in x] primaryList = [] for dn in dnPrimary: primaryList.append(list(filter(None, dn.split(" ")))[1]) @@ -2866,6 +2967,283 @@ class DefaultValue(): "on node [{0}] successfully.".format(node.name)) logger.log("Remove dynamic_config_file and CM metadata directory on all nodes.") + @staticmethod + def distribute_file_to_node(params): + """ + Distribute file to dest node with path + """ + dest_ip, from_path, to_path, timeout = params + pscp_cmd = "source %s ; pscp -t %s -H %s %s %s" % ( + EnvUtil.getMpprcFile(), timeout, dest_ip, from_path, to_path) + status, output = CmdUtil.getstatusoutput_by_fast_popen(pscp_cmd) + return status, output, dest_ip + + @staticmethod + def check_is_cm_cluster(logger): + """ + Check cm_ctl is exist. + """ + cmd = "source %s; cm_ctl view | grep cmDataPath" % EnvUtil.getMpprcFile() + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + logger.debug("Check cm_ctl is failed msg: %s." 
% output) + return False + logger.debug("Successfully check cm_ctl is available.") + return True + + @staticmethod + def is_disaster_cluster(clusterinfo): + """ + function: determine cluster status normal or disaster + input: NA + output: NA + """ + cmd = "source %s; cm_ctl view | grep cmDataPath | awk -F [:] '{print $2}' | head -n 1" % \ + EnvUtil.getMpprcFile() + proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = proc.communicate() + if proc.returncode != 0: + raise Exception(ErrorCode.GAUSS_514['GAUSS_51400'] % cmd + "Error:\n%s" % stderr) + cm_agent_conf_file = stdout.strip() + "/cm_agent/cm_agent.conf" + if not os.path.isfile(cm_agent_conf_file): + host_list = clusterinfo.getClusterNodeNames() + cm_agent_conf_temp_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "cm_agent_tmp.conf") + for host_ip in host_list: + get_file_cmd = g_file.SHELL_CMD_DICT["scpFileFromRemote"] % \ + (host_ip, NetUtil.GetHostIpOrName(), cm_agent_conf_file, cm_agent_conf_temp_file) + proc = FastPopen(get_file_cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = proc.communicate() + if not os.path.isfile(cm_agent_conf_temp_file): + continue + else: + break + if os.path.isfile(cm_agent_conf_temp_file): + with open(cm_agent_conf_temp_file, "r") as cma_conf_file: + content = cma_conf_file.read() + ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) + g_file.removeFile(cm_agent_conf_temp_file) + if ret: + return True + else: + return False + else: + raise Exception(ErrorCode.GAUSS_502['GAUSS_50201'] % cm_agent_conf_file) + with open(cm_agent_conf_file, "r") as cma_conf_file: + content = cma_conf_file.read() + ret = re.findall(r'agent_backup_open *= *1|agent_backup_open *= *2', content) + if ret: + return True + else: + return False + + @staticmethod + def cm_exist_and_is_disaster_cluster(clusterinfo, logger): + """ + check current cluster cm exist and is disaster cluster. + """ + cm_exist = DefaultValue.check_is_cm_cluster(logger) + if not cm_exist: + return False + is_disaster = DefaultValue.is_disaster_cluster(clusterinfo) + if not is_disaster: + return False + return True + + @staticmethod + def write_content_on_file(dest_file, content, authority=None): + """ + Write content on file + """ + authority = authority if authority else DefaultValue.KEY_FILE_MODE_IN_OS + with os.fdopen(os.open(dest_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + authority), "w") as fp_write: + fp_write.write(str(content)) + + @staticmethod + def get_data_ip_info(instance, logger): + """ + Obtain data ip from file or cluster instance. + """ + cluster_conf_record = os.path.join(EnvUtil.getEnv("PGHOST"), + "streaming_cabin/cluster_conf_record") + if not os.path.isfile(cluster_conf_record): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % cluster_conf_record) + with open(cluster_conf_record, 'r') as read_fp: + conf_dict = json.load(read_fp) + if not conf_dict or len(conf_dict) != 2: + logger.debug("Failed obtain data ip list.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check data ip file") + inst_data_ip = "" + local_shards_list = conf_dict["localClusterConf"]["shards"] + for shard_list in local_shards_list: + for shard in shard_list: + if shard["ip"] not in instance.listenIps: + continue + inst_data_ip = shard["dataIp"] + logger.debug("File record:%s, \nGot data ip:%s for instanceId:%s." 
% + (conf_dict, inst_data_ip, instance.instanceId)) + if not inst_data_ip: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain local data ip") + return inst_data_ip + + @staticmethod + def obtain_hadr_user_encrypt_str(cluster_info, db_user, logger, mode, ignore_res=False): + """ + Obtain hadr user encrypted string + """ + sql = "select value from gs_global_config where name='hadr_user_info';" + instances = [] + for node in cluster_info.dbNodes: + if cluster_info.isSingleInstCluster(): + for inst in node.datanodes: + instances.append(inst) + for inst in instances: + logger.debug("Obtain hadr user info string on node:%s with port:%s." + % (inst.hostname, inst.port)) + status, output = ClusterCommand.remoteSQLCommand(sql, db_user, inst.hostname, + inst.port, maintenance_mode=mode) + if status == 0 and output: + logger.debug("Successfully obtain hadr user info string.") + return output + if ignore_res: + return + logger.debug("Failed obtain hadr user info string.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain hadr user info") + + @staticmethod + def getstatusoutput_hide_pass(joint_cmd): + """ + Hide password of process + """ + proc = Popen(["sh", "-"], stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) + stdout, stderr = proc.communicate(joint_cmd) + text = stderr or stdout + sts = proc.returncode + if sts is None: + sts = 0 + if text and text[-1:] == '\n': + text = text[:-1] + return sts, text + + @staticmethod + def decrypt_hadr_user_info(params): + """ + Decrypt hadr user info + """ + if len(params) != 6: + raise Exception(ErrorCode.GAUSS_500["GAUSS_50000"] % "decrypt hadr user info") + rand_pwd, hadr_str, cluster_info, db_user, logger, mode = params + sql = "select pg_catalog.gs_decrypt_aes128('%s', '%s');" % (hadr_str, rand_pwd) + instances = [] + for node in cluster_info.dbNodes: + if cluster_info.isSingleInstCluster(): + for inst in node.datanodes: + instances.append(inst) + else: + for inst in node.coordinators: + instances.append(inst) + for inst in instances: + logger.debug("Decrypt hadr user info on node:%s with port:%s." 
+ % (inst.hostname, inst.port)) + status, output = ClusterCommand.remoteSQLCommand(sql, db_user, inst.hostname, + inst.port, maintenance_mode=mode) + if status == 0 and output and "|" in output and len(output.split("|")) == 2: + logger.debug("Successfully decrypt hadr user info string.") + hadr_user, hadr_pwd = output.strip().split("|")[0], output.strip().split("|")[1] + return hadr_user, hadr_pwd + logger.debug("Failed decrypt hadr user info string.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "decrypt hadr user info") + + @staticmethod + def decrypt_hadr_rand_pwd(logger): + """ + Decrypt hadr rand pwd + """ + db_user = pwd.getpwuid(os.getuid()).pw_name + gauss_home = ClusterDir.getInstallDir(db_user) + bin_path = os.path.join(os.path.realpath(gauss_home), "bin") + if not bin_path: + logger.debug("Failed obtain bin path.") + raise Exception(ErrorCode.GAUSS_518["GAUSS_51802"] % "bin path") + cipher_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade/hadr.key.cipher") + rand_file = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade/hadr.key.rand") + if os.path.isfile(cipher_file) and os.path.isfile(rand_file): + bin_path = os.path.join(EnvUtil.getTmpDirFromEnv(), "binary_upgrade") + rand_pwd = AesCbcUtil.aes_cbc_decrypt_with_path(bin_path, bin_path, key_name="hadr") + if rand_pwd: + logger.debug("Successfully decrypt rand pwd.") + return rand_pwd + + @staticmethod + def get_proc_title(pwd_para_name): + """ + Obtain the process name after sensitive information is hidden. + """ + cmd = "cat /proc/%s/cmdline" % os.getpid() + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output: + raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % "proc title" + " Cmd is:%s." % cmd) + title_str_list = [] + for title_str in output.split("\0"): + if "=" in title_str: + title_str_list.extend(title_str.split("=")) + else: + title_str_list.extend(title_str.split(" ")) + if pwd_para_name in title_str_list: + w_index = title_str_list.index(pwd_para_name) + title_str_list[w_index], title_str_list[w_index + 1] = "", "" + title_name = " ".join(title_str_list).strip() + return title_name + + @staticmethod + def set_proc_title(name): + """ + set proc title to new name + """ + new_name = name.encode('ascii', 'replace') + try: + libc = ctypes.CDLL('libc.so.6') + proc_name = ctypes.c_char_p.in_dll(libc, '__progname_full') + with open('/proc/self/cmdline') as fp: + old_progname_len = len(fp.readline()) + if old_progname_len > len(new_name): + # padding blank chars + new_name += b' ' * (old_progname_len - len(new_name)) + # Environment variables are already copied to Python app zone. + # We can get environment variables by `os.environ` module, + # so we can ignore the destroying from the following action. 
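+ # libc.strcpy overwrites the argv storage in place (hence new_name is
+ # padded above to the old /proc/self/cmdline length so no stale bytes
+ # remain), which is what ps and /proc/<pid>/cmdline readers display;
+ # prctl option 15 is PR_SET_NAME and additionally sets the kernel
+ # thread name, truncated to 15 characters.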
+ libc.strcpy(proc_name, ctypes.c_char_p(new_name)) + buff = ctypes.create_string_buffer(len(new_name) + 1) + buff.value = new_name + libc.prctl(15, ctypes.byref(buff), 0, 0, 0) + except Exception as err_msg: + raise Exception(ErrorCode.GAUSS_505["GAUSS_50503"] + str(err_msg)) + + @staticmethod + def check_is_streaming_dr_cluster(): + """check_is_steaming_cluster_cluster""" + stream_file = os.path.realpath(os.path.join(EnvUtil.getEnv("PGHOST"), "streaming_cabin")) + if os.path.exists(stream_file): + sys.exit(ErrorCode.GAUSS_512["GAUSS_51244"] % "current operate on dr cluster") + + @staticmethod + def get_primary_dn_instance_id(inst_status="Primary", ignore=False): + """ + function: get Primary/Standby dn instance id for centralized/distribute cluster + :param: inst_status Primary/Standby + return; instance id + """ + cmd = r"source %s; cm_ctl query -v | grep -E 'instance_state\ *:\ %s' " \ + r"-B 4 | grep -E 'type\ *:\ Datanode' -B 5 | grep instance_id | awk " \ + r"'{print $NF}'" % (EnvUtil.getMpprcFile(), inst_status) + (status, output) = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output: + if ignore is True: + return [] + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % + cmd + " Error: \n%s" % output) + return output.strip().split('\n') + @staticmethod def isgreyUpgradeNodeSpecify(user, step=-1, nodes=None, logger=None): """ @@ -2988,6 +3366,29 @@ class ClusterCommand(): # rollback to flag of start cluster INSTALL_STEP_START = "Start cluster" + @staticmethod + def getStartCmd(nodeId=0, timeout=DefaultValue.TIMEOUT_CLUSTER_START, datadir="", azName = ""): + """ + function : Start all cluster or a node + input : String,int,String,String + output : String + """ + user_profile = EnvUtil.getMpprcFile() + cmd = "%s %s ; cm_ctl start" % (CmdUtil.SOURCE_CMD, user_profile) + # check node id + if nodeId > 0: + cmd += " -n %d" % nodeId + # check data directory + if datadir != "": + cmd += " -D %s" % datadir + # check timeout + if timeout > 0: + cmd += " -t %d" % timeout + # azName + if azName != "": + cmd += " -z%s" % azName + + return cmd @staticmethod def getStopCmd(nodeId=0, stopMode="", timeout=0, datadir="", azName = ""): @@ -3152,7 +3553,8 @@ class ClusterCommand(): @staticmethod def remoteSQLCommand(sql, user, host, port, ignoreError=True, database="postgres", useTid=False, - IsInplaceUpgrade=False): + IsInplaceUpgrade=False, maintenance_mode=False, + user_name="", user_pwd=""): """ function : Execute sql command on remote host input : String,String,String,int @@ -3220,7 +3622,10 @@ class ClusterCommand(): gsql_cmd = SqlCommands.getSQLCommandForInplaceUpgradeBackup( port, database) else: - gsql_cmd = SqlCommands.getSQLCommand(port, database) + gsql_cmd = SqlCommands.getSQLCommand(port, database, user_name=user_name, + user_pwd=user_pwd) + if maintenance_mode: + gsql_cmd += " -m " if str(localHost) != str(host): sshCmd = CmdUtil.getSshCmd(host) if os.getuid() == 0 and user != "": @@ -3233,16 +3638,24 @@ class ClusterCommand(): if ignoreError: cmd += " 2>/dev/null" else: - cmd = "%s '" % sshCmd + cmd = "" if mpprcFile != "" and mpprcFile is not None: cmd += "source %s;" % mpprcFile - cmd += "%s -f %s --output %s -t -A -X '" % (gsql_cmd, + cmd += "%s -f %s --output %s -t -A -X " % (gsql_cmd, sqlFile, queryResultFile) + if user_pwd: + cmd = "echo \"%s\" | %s" % (cmd, sshCmd) + else: + cmd = "%s '%s'" % (sshCmd, cmd) if ignoreError: cmd += " 2>/dev/null" for i in range(RE_TIMES): - (status1, output1) = subprocess.getstatusoutput(cmd) + proc = FastPopen(cmd, stdout=PIPE, 
stderr=PIPE, + preexec_fn=os.setsid, close_fds=True) + stdout, stderr = proc.communicate() + output1 = stdout + stderr + status1 = proc.returncode if SqlFile.findErrorInSqlFile(sqlFile, output1): if SqlFile.findTupleErrorInSqlFile(output1): time.sleep(1) # find tuple error --> retry @@ -3278,7 +3691,11 @@ class ClusterCommand(): if (ignoreError): cmd += " 2>/dev/null" for i in range(RE_TIMES): - (status1, output1) = subprocess.getstatusoutput(cmd) + proc = FastPopen(cmd, stdout=PIPE, stderr=PIPE, + preexec_fn=os.setsid, close_fds=True) + stdout, stderr = proc.communicate() + output1 = stdout + stderr + status1 = proc.returncode if SqlFile.findErrorInSqlFile(sqlFile, output1): if SqlFile.findTupleErrorInSqlFile(output1): time.sleep(1) # find tuple error --> retry @@ -3778,6 +4195,83 @@ class ClusterInstanceConfig(): return connInfo1, nodename + @staticmethod + def get_data_from_dcc(cluster_info, logger, user, paralist): + """ + function: get value from dcc + :param cluster_info: cluster info + :param logger: logger obj + :param user: cluster user + :param paralist: paralist + :return: key-value map dict + """ + gausshome = ClusterDir.getInstallDir(user) + cm_ctl = os.path.realpath(os.path.join(gausshome, "bin/cm_ctl")) + if not os.path.isfile(cm_ctl): + raise Exception(ErrorCode.GAUSS_502["GAUSS-50201"] % "file cm_ctl") + cms_count = 0 + etcd_count = 0 + for dbnode in cluster_info.dbNodes: + for _ in dbnode.cmservers: + cms_count += 1 + for _ in dbnode.etcds: + etcd_count += 1 + if cms_count == 0 or etcd_count > 1: + raise Exception(ErrorCode.GAUSS_500["GAUSS-50011"] % paralist) + para_value_map = {} + for para_key in paralist: + cmd = "source %s; %s ddb --get '%s'" % (EnvUtil.getMpprcFile(), cm_ctl, para_key) + logger.debug("Get dcc value cmd:%s." % cmd) + (status, output) = subprocess.getstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd, "Error:%s" % output) + logger.debug("Get dcc value:%s." % output) + res = output.strip("\n").split("\n") + if len(res) != 2: + raise Exception(ErrorCode.GAUSS_500["GAUSS-50019"] % res) + if res[-1].find("Key not found") > -1: + para_value_map[para_key] = "" + continue + para_value_map[para_key] = res[-1].split(":")[-1].strip() + logger.debug("Get all values from dcc component res:%s." % para_value_map) + return para_value_map + + @staticmethod + def set_data_on_dcc(cluster_info, logger, user, paradict): + """ + function: set data on dcc + :param cluster_info: cluster info + :param logger: logger obj + :param user: cluster user + :param paradict: paradict + :return: NA + """ + gausshome = ClusterDir.getInstallDir(user) + cm_ctl = os.path.realpath(os.path.join(gausshome, "bin/cm_ctl")) + if not os.path.isfile(cm_ctl): + raise Exception(ErrorCode.GAUSS_502["GAUSS-50201"] % "file cm_ctl") + cms_count = 0 + etcd_count = 0 + for dbnode in cluster_info.dbNodes: + for _ in dbnode.cmservers: + cms_count += 1 + for _ in dbnode.etcds: + etcd_count += 1 + if cms_count == 0 or etcd_count > 1: + raise Exception(ErrorCode.GAUSS_500["GAUSS-50011"] % paradict) + for para_key in list(paradict.keys()): + cmd = "source %s; %s ddb --put '%s' '%s'" % \ + (EnvUtil.getMpprcFile(), cm_ctl, para_key, paradict[para_key]) + logger.debug("Set dcc value cmd:%s." % cmd) + (status, output) = subprocess.getstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd, "Error:%s" % output) + logger.debug("Set dcc data:%s." 
% output) + res = output.strip("\n").split("\n") + if len(res) != 2: + raise Exception(ErrorCode.GAUSS_500["GAUSS-50019"] % res) + logger.debug("Successfully set the dcc data information.") + class TempfileManagement(): """ diff --git a/script/gspylib/common/DbClusterInfo.py b/script/gspylib/common/DbClusterInfo.py index 5797868..0a5937a 100644 --- a/script/gspylib/common/DbClusterInfo.py +++ b/script/gspylib/common/DbClusterInfo.py @@ -37,6 +37,7 @@ from domain_utils.cluster_file.version_info import VersionInfo from domain_utils.domain_common.cluster_constants import ClusterConstants from base_utils.common.constantsbase import ConstantsBase from base_utils.os.env_util import EnvUtil +from base_utils.security.security_checker import SecurityChecker ########################### # instance role @@ -958,6 +959,10 @@ class dbClusterInfo(): # add for dcf self.enable_dcf = "" self.dcf_config = "" + self.local_stream_ip_map = [] + self.remote_stream_ip_map = [] + self.remote_dn_base_port = 0 + self.local_dn_base_port = 0 def __str__(self): """ @@ -1314,7 +1319,7 @@ class dbClusterInfo(): maxAzNameLen = maxAzNameLen if maxAzNameLen > azNameLen \ else azNameLen dnNodeCount += 1 - if roleStatus == "Primary": + if roleStatus in ["Primary", "Main"]: primaryDbNum += 1 primaryDbState = dbState else: @@ -3395,6 +3400,7 @@ class dbClusterInfo(): if self.enable_dcf == "": i = 0 ssdInfoList[i].extend(ssddirList) + self.parse_stream_cluster_info(masterNode, i) # dataNode syncNum key = "dataNode%d_syncNum" % (i + 1) @@ -3620,6 +3626,48 @@ class dbClusterInfo(): for inst in masterNode.datanodes: inst.azName = masterNode.azName + def parse_stream_cluster_info(self, masternode, i): + """parse_stream_cluster_info""" + i = i + 1 + local_ip_map = self.__readNodeStrValue(masternode.name, + "localStreamIpmap%s" % i, True) + if not local_ip_map: + return + remote_ip_map = self.__readNodeStrValue(masternode.name, + "remoteStreamIpmap%s" % i, True) + remote_dn_port = self.__readNodeStrValue(masternode.name, + "remotedataPortBase", True) + local_dn_port = self.__readNodeStrValue(masternode.name, + "dataPortBase", True, MASTER_BASEPORT_DATA) + if not all([local_ip_map, remote_ip_map, remote_dn_port]): + raise Exception( + ErrorCode.GAUSS_512["GAUSS_51236"] + " check streamInfo config is correct") + self.local_stream_ip_map.append(dbClusterInfo.append_map_ip_into_global(local_ip_map)) + self.remote_stream_ip_map.append(dbClusterInfo.append_map_ip_into_global(remote_ip_map)) + if not remote_dn_port.isdigit() or not local_dn_port.isdigit(): + raise Exception( + ErrorCode.GAUSS_512["GAUSS_51236"] + " check streamInfo config is correct") + self.remote_dn_base_port = int(remote_dn_port) + self.local_dn_base_port = int(local_dn_port) + + @staticmethod + def append_map_ip_into_global(strem_ip_map): + """append_map_ip_into_global""" + shard_map = [] + ip_map_list = [i.strip().strip("),").strip(",(") for i in strem_ip_map.split("(") if i] + for ip_map in ip_map_list: + peer_ip_map = ip_map.split(",") + temp_dict = dict() + if len(peer_ip_map) != 2: + raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] + + " check localStreamIpmap is correct") + temp_dict["ip"] = peer_ip_map[0].strip() + SecurityChecker.check_ip_valid(temp_dict["ip"], temp_dict["ip"]) + temp_dict["dataIp"] = peer_ip_map[1].strip() + SecurityChecker.check_ip_valid(temp_dict["dataIp"], temp_dict["dataIp"]) + shard_map.append(temp_dict) + return shard_map + def __readCmaConfig(self, dbNode): """ function : Read cm agent config on node. 
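The ip-map parsing added above (parse_stream_cluster_info and append_map_ip_into_global) expects an XML value made of comma-separated "(listenIp,dataIp)" pairs; the standalone sketch below, with purely illustrative addresses, mirrors that parsing logic.

    def parse_ip_map(value):
        """Turn "(ip1,dataIp1),(ip2,dataIp2)" into [{"ip": ..., "dataIp": ...}, ...]."""
        shard_map = []
        for pair in (p.strip().strip("),").strip(",(") for p in value.split("(") if p):
            listen_ip, data_ip = (part.strip() for part in pair.split(","))
            shard_map.append({"ip": listen_ip, "dataIp": data_ip})
        return shard_map

    # Example (hypothetical addresses):
    # parse_ip_map("(10.10.0.1,192.168.0.1),(10.10.0.2,192.168.0.2)")
    # -> [{'ip': '10.10.0.1', 'dataIp': '192.168.0.1'},
    #     {'ip': '10.10.0.2', 'dataIp': '192.168.0.2'}]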
@@ -4689,3 +4737,14 @@ class dbClusterInfo(): :return:True or False """ return self.cmscount < 1 + + def getDbNodeByID(self, inputid): + """ + function : Get node by id. + input : nodename + output : [] + """ + for dbNode in self.dbNodes: + if dbNode.id == inputid: + return dbNode + return None diff --git a/script/gspylib/common/DbClusterStatus.py b/script/gspylib/common/DbClusterStatus.py index d7b53f6..e362a2a 100644 --- a/script/gspylib/common/DbClusterStatus.py +++ b/script/gspylib/common/DbClusterStatus.py @@ -21,11 +21,13 @@ import os import sys + sys.path.append(sys.path[0] + "/../../") from gspylib.common.Common import DefaultValue, ClusterInstanceConfig from gspylib.common.DbClusterInfo import dbClusterInfo from gspylib.common.ErrorCode import ErrorCode from domain_utils.cluster_os.cluster_user import ClusterUser +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants ########################### # instance type. only for CN/DN @@ -64,6 +66,7 @@ class StatusReport(): self.dnPrimary = 0 self.dnStandby = 0 self.dn_cascade_standby = 0 + self.dn_main_standby = 0 self.dnDummy = 0 self.dnBuild = 0 self.dnAbnormal = 0 @@ -124,6 +127,8 @@ class DbInstanceStatus(): elif self.status == DbClusterStatus.INSTANCE_STATUS_CASCADE_STANDBY: if self.haStatus != DbClusterStatus.HA_STATUS_NORMAL: return False + elif self.status == DbClusterStatus.INSTANCE_STATUS_MAIN_STANDBY: + return True else: return False @@ -231,6 +236,8 @@ class DbNodeStatus(): report.dnDummy += 1 elif inst.status == DbClusterStatus.INSTANCE_STATUS_CASCADE_STANDBY: report.dn_cascade_standby += 1 + elif inst.status == DbClusterStatus.INSTANCE_STATUS_MAIN_STANDBY: + report.dn_main_standby += 1 else: report.dnAbnormal += 1 @@ -400,7 +407,23 @@ class DbClusterStatus(): "Degraded": "Degraded", "Unknown": "Abnormal" } - + INSTANCE_STATUS_MAP_CHECK_STATUS = { + "Normal": "Primary", + "Unnormal": "Abnormal", + "Primary": "Primary", + "Standby": "Standby", + "Secondary": "Secondary", + "Pending": "Abnormal", + "Down": "Down", + "Unknown": "Abnormal", + "Offline": "Offline", + "Main Standby": "Standby", + "Cascade Standby": "Standby" + } + INSTANCE_STATUS_MAP_CHECK_FAILOVER = { + "Need repair(Disconnected)": "Normal", + "Need repair": "Normal" + } ################################################################### # instance role ################################################################### @@ -418,6 +441,7 @@ class DbClusterStatus(): INSTANCE_STATUS_PRIMARY = "Primary" INSTANCE_STATUS_STANDBY = "Standby" INSTANCE_STATUS_CASCADE_STANDBY = "Cascade Standby" + INSTANCE_STATUS_MAIN_STANDBY = "Main Standby" INSTANCE_STATUS_ABNORMAL = "Abnormal" INSTANCE_STATUS_DOWN = "Down" INSTANCE_STATUS_DUMMY = "Secondary" @@ -432,6 +456,7 @@ class DbClusterStatus(): "Standby": "Standby", "Secondary": "Secondary", "Cascade Standby": "Cascade Standby", + "Main Standby": "Main Standby", "Pending": "Abnormal", "Down": "Down", "Unknown": "Abnormal" @@ -611,7 +636,29 @@ class DbClusterStatus(): DbClusterStatus.OM_NODE_STATUS_ABNORMAL) return statusInfo - def initFromFile(self, filePath, isExpandScene=False): + def init_from_content(self, content, is_expand_scene=False, check_action=None, logger=None): + """ + Init from content + """ + content_list = content.split('\n') + try: + for line in content_list: + line = line.strip() + if line == "": + continue + str_list = line.split(":") + if len(str_list) != 2: + continue + self.__fillField(str_list[0].strip(), str_list[1].strip(), + is_expand_scene, check_action=check_action) + 
except Exception as error: + if logger: + logger.debug("Failed to parse cluster status, error:%s, " + "status content:%s" % (error, content)) + raise Exception( + ErrorCode.GAUSS_502["GAUSS_50204"] % "status content" + " Error: \n%s" % str(error)) + + def initFromFile(self, filePath, isExpandScene=False, check_action=None): """ function : Init from status file input : filePath @@ -637,12 +684,12 @@ class DbClusterStatus(): continue self.__fillField(strList[0].strip(), strList[1].strip(), - isExpandScene) + isExpandScene, check_action=check_action) except Exception as e: raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % "status file" + " Error: \n%s" % str(e)) - def __fillField(self, field, value, isExpandScene): + def __fillField(self, field, value, isExpandScene, check_action=None): """ function : Fill field input : field, value @@ -690,7 +737,10 @@ class DbClusterStatus(): elif value == DbClusterStatus.INSTANCE_TYPE_ETCD: self.__curNode.etcds.append(self.__curInstance) elif field == "instance_state": - status = DbClusterStatus.INSTANCE_STATUS_MAP.get(value) + if check_action == DefaultValue.TASK_QUERY_STATUS: + status = DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_STATUS.get(value) + else: + status = DbClusterStatus.INSTANCE_STATUS_MAP.get(value) self.__curInstance.status = \ DbClusterStatus.INSTANCE_STATUS_ABNORMAL \ if status is None else status @@ -715,6 +765,11 @@ class DbClusterStatus(): self.__curInstance.status = \ DbClusterStatus.INSTANCE_STATUS_ABNORMAL self.__curInstance.detail_status = value + if check_action == StreamingConstants.STREAM_DISTRIBUTE_ACTION: + self.__curInstance.status = \ + DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get(value, value) + self.__curInstance.detail_status = \ + DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get(value, value) elif field == "HA_state": haStatus = DbClusterStatus.HA_STATUS_MAP.get(value) detail_ha = value @@ -742,5 +797,9 @@ class DbClusterStatus(): if dataStatus is None else dataStatus elif field == "reason": self.__curInstance.reason = value + if check_action == StreamingConstants.STREAM_DISTRIBUTE_ACTION and \ + hasattr(self.__curInstance, "detail_ha") and value == "Disconnected": + self.__curInstance.detail_ha = \ + DbClusterStatus.INSTANCE_STATUS_MAP_CHECK_FAILOVER.get("Need repair", value) diff --git a/script/gspylib/common/ErrorCode.py b/script/gspylib/common/ErrorCode.py index 7fc23a6..ce12814 100644 --- a/script/gspylib/common/ErrorCode.py +++ b/script/gspylib/common/ErrorCode.py @@ -101,7 +101,12 @@ class ErrorCode(): 'GAUSS_50018': "[GAUSS-50018] : The parameter value of %s is Null.", 'GAUSS_50019': "[GAUSS-50019] : The value of %s is error.", 'GAUSS_50020': "[GAUSS-50020] : The value of %s must be a digit.", - 'GAUSS_50021': "[GAUSS-50021] : Failed to query %s parameter." + 'GAUSS_50021': "[GAUSS-50021] : Failed to query %s parameter.", + 'GAUSS_50022': "[GAUSS-50022] : The parameter '%s' should be %s.", + 'GAUSS_50023': "[GAUSS-50023] : The parameter '%s' exceeds the maximum length %s.", + 'GAUSS_50024': "[GAUSS-50024] : The parameter '%s' is invalid.", + 'GAUSS_50025': "[GAUSS-50025] : There is an illegal character '%s' in parameter %s.", + 'GAUSS_50026': "[GAUSS-50026] : Failed to check %s parameters in the XML file."
} diff --git a/script/gspylib/common/GaussLog.py b/script/gspylib/common/GaussLog.py index d128447..eeddee8 100644 --- a/script/gspylib/common/GaussLog.py +++ b/script/gspylib/common/GaussLog.py @@ -85,7 +85,7 @@ class GaussLog: Class to handle log file """ - def __init__(self, logFile, module="", expectLevel=LOG_DEBUG): + def __init__(self, logFile, module="", expectLevel=LOG_DEBUG, trace_id=None): """ function: Constructor input : NA @@ -104,6 +104,7 @@ class GaussLog: self.lock = thread.allocate_lock() self.tmpFile = None self.ignoreErr = False + self.trace_id = trace_id logFileList = "" try: @@ -419,9 +420,14 @@ class GaussLog: strTime = datetime.datetime.now() file_line = self.get_log_file_line() if (stepFlag == ""): - print("[%s][%d][%s][%s][%s]:%s" % ( - strTime, self.pid, file_line, self.moduleName, level, msg), - file=self.fp) + if self.trace_id: + print("[%s][%s][%d][%s][%s]:%s" + % (self.trace_id, strTime, self.pid, self.moduleName, + level, msg), file=self.fp) + else: + print("[%s][%d][%s][%s]:%s" % ( + strTime, self.pid, self.moduleName, level, msg), + file=self.fp) else: stepnum = self.Step(stepFlag) print("[%s][%d][%s][%s][%s][Step%d]:%s" % ( diff --git a/script/gspylib/component/Kernel/DN_OLAP/DN_OLAP.py b/script/gspylib/component/Kernel/DN_OLAP/DN_OLAP.py index 0839acd..c2ade3a 100644 --- a/script/gspylib/component/Kernel/DN_OLAP/DN_OLAP.py +++ b/script/gspylib/component/Kernel/DN_OLAP/DN_OLAP.py @@ -407,7 +407,7 @@ class DN_OLAP(Kernel): self.modifyDummpyStandbyConfigItem() - def setPghbaConfig(self, clusterAllIpList): + def setPghbaConfig(self, clusterAllIpList, try_reload=False): """ """ principal = None @@ -446,12 +446,22 @@ class DN_OLAP(Kernel): GUCParasStrList.append(GUCParasStr) i = 0 GUCParasStr = "" + # Used only streaming disaster cluster + streaming_dn_ips = self.get_streaming_relate_dn_ips(self.instInfo) + if streaming_dn_ips: + for dn_ip in streaming_dn_ips: + GUCParasStr += "-h \"host all %s %s/32 %s\" " \ + % (pg_user, dn_ip, METHOD_TRUST) + GUCParasStr += "-h \"host all all %s/32 %s\" " \ + % (dn_ip, METHOD_SHA) + ip_segment = '.'.join(dn_ip.split('.')[:2]) + ".0.0/16" + GUCParasStr += "-h \"host replication all %s sha256\" " % ip_segment if (GUCParasStr != ""): GUCParasStrList.append(GUCParasStr) for parasStr in GUCParasStrList: - self.doGUCConfig("set", parasStr, True) + self.doGUCConfig("set", parasStr, True, try_reload=try_reload) """ Desc: diff --git a/script/gspylib/component/Kernel/Kernel.py b/script/gspylib/component/Kernel/Kernel.py index bfad63b..22428ef 100644 --- a/script/gspylib/component/Kernel/Kernel.py +++ b/script/gspylib/component/Kernel/Kernel.py @@ -19,6 +19,8 @@ import sys import os import subprocess import re +import pwd +import json sys.path.append(sys.path[0] + "/../../../") from gspylib.common.ErrorCode import ErrorCode @@ -28,6 +30,7 @@ from gspylib.common.Common import DefaultValue from base_utils.os.cmd_util import CmdUtil from base_utils.os.env_util import EnvUtil from base_utils.os.file_util import FileUtil +from base_utils.security.security_checker import SecurityChecker from domain_utils.cluster_os.cluster_user import ClusterUser MAX_PARA_NUMBER = 1000 @@ -403,7 +406,7 @@ class Kernel(BaseComponent): return tempCommonDict - def doGUCConfig(self, action, GUCParasStr, isHab=False): + def doGUCConfig(self, action, GUCParasStr, isHab=False, try_reload=False): """ """ # check instance data directory @@ -424,6 +427,16 @@ class Kernel(BaseComponent): if (not os.path.exists(configFile)): raise 
Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % configFile) + if try_reload: + cmd_reload = "%s/gs_guc %s -D %s %s " % (self.binPath, 'reload', + self.instInfo.datadir, GUCParasStr) + status, output = CmdUtil.retryGetstatusoutput(cmd_reload, 3, 3) + if status != 0: + self.logger.log("Failed to reload guc params with commander:[%s]" % cmd_reload) + else: + self.logger.log("Successfully to reload guc params with commander:[%s]" + % cmd_reload) + return cmd = "%s/gs_guc %s -D %s %s " % (self.binPath, action, self.instInfo.datadir, GUCParasStr) self.logger.debug("gs_guc command is: {0}".format(cmd)) @@ -456,6 +469,39 @@ class Kernel(BaseComponent): for parasStr in guc_paras_str_list: self.doGUCConfig(setMode, parasStr, False) + def get_streaming_relate_dn_ips(self, instance): + """ + function: Streaming disaster cluster, obtain the IP address of the DN + with the same shards. + input: NA + :return: Cn ip + """ + self.logger.debug("Start parse cluster_conf_record.") + pg_host = EnvUtil.getEnv("PGHOST") + config_param_file = os.path.realpath( + os.path.join(pg_host, "streaming_cabin", "cluster_conf_record")) + if not os.path.isfile(config_param_file): + self.logger.debug("Not found streaming cluster config file.") + return [] + + with open(config_param_file, "r") as fp_read: + param_dict = json.load(fp_read) + dn_ip_list = [] + remote_cluster_conf = param_dict.get("remoteClusterConf") + shards = remote_cluster_conf.get('shards') + for shard in shards: + for node_info in shard: + shard_num = node_info.get("shardNum", '1') + node_ip = node_info.get("dataIp") + SecurityChecker.check_ip_valid("check ip from cluster_conf_record", node_ip) + if not all([shard_num, node_ip]): + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain remote conf from cluster_conf_record") + if str(shard_num) == str(instance.mirrorId): + dn_ip_list.append(node_ip) + self.logger.debug("Got streaming cluster pg_hba ips %s." 
% dn_ip_list) + return dn_ip_list + def removeIpInfoOnPghbaConfig(self, ipAddressList): """ """ diff --git a/script/impl/checkperf/OLAP/CheckperfImplOLAP.py b/script/impl/checkperf/OLAP/CheckperfImplOLAP.py index 2564b0e..2b9b85e 100644 --- a/script/impl/checkperf/OLAP/CheckperfImplOLAP.py +++ b/script/impl/checkperf/OLAP/CheckperfImplOLAP.py @@ -76,7 +76,7 @@ class CheckperfImplOLAP(CheckperfImpl): dnInst.instanceId) if (instStatus is not None and instStatus.isInstanceHealthy() and - instStatus.status == "Primary"): + instStatus.status in ["Primary"]): normalDNList.append(dnInst) if (len(normalDNList) == 0): @@ -1791,7 +1791,6 @@ class CheckperfImplOLAP(CheckperfImpl): pmk_last_collect_start_time, last_snapshot_id) = \ self.getMetaData(hostname, port) self.deleteExpiredSnapShots(hostname, port) - # collect pmk stat self.collectPMKData(pmk_curr_collect_start_time, pmk_last_collect_start_time, @@ -1825,8 +1824,8 @@ class CheckperfImplOLAP(CheckperfImpl): self.handleNodeStat() # insert the node stat of all hosts into the cluster self.insertNodeStat(hostname, port, - pmk_curr_collect_start_time, - pmk_last_collect_start_time, last_snapshot_id) + pmk_curr_collect_start_time, + pmk_last_collect_start_time, last_snapshot_id) # display pmk stat showDetail = "" diff --git a/script/impl/om/OLAP/OmImplOLAP.py b/script/impl/om/OLAP/OmImplOLAP.py index 51f5525..800ce53 100644 --- a/script/impl/om/OLAP/OmImplOLAP.py +++ b/script/impl/om/OLAP/OmImplOLAP.py @@ -370,6 +370,10 @@ class OmImplOLAP(OmImpl): self.logger.log( "No need to generate dynamic configuration file for one node.") return + if DefaultValue.cm_exist_and_is_disaster_cluster(self.context.clusterInfo, self.logger): + self.logger.log( + "Streaming disaster cluster do not need to generate dynamic configuration.") + return self.logger.log("Generating dynamic configuration file for all nodes.") hostname = NetUtil.GetHostIpOrName() sshtool = SshTool(self.context.clusterInfo.getClusterNodeNames()) diff --git a/script/impl/streaming_disaster_recovery/__init__.py b/script/impl/streaming_disaster_recovery/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/script/impl/streaming_disaster_recovery/params_handler.py b/script/impl/streaming_disaster_recovery/params_handler.py new file mode 100644 index 0000000..0ab963d --- /dev/null +++ b/script/impl/streaming_disaster_recovery/params_handler.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : params_handler.py is a utility for parsing and verifying streaming +# disaster recovery params. 
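+# The --json parameter file accepted by several tasks mirrors the stream
+# settings from the XML; an illustrative layout (all values below are
+# placeholders, and the exact keys required per task are defined in
+# StreamingConstants.STREAMING_JSON_PARAMS) is:
+#   {
+#     "localClusterConf":  {"port": 26000, "shards": [[{"ip": "10.10.0.1", "dataIp": "192.168.0.1"}]]},
+#     "remoteClusterConf": {"port": 26000, "shards": [[{"ip": "10.10.1.1", "dataIp": "192.168.1.1"}]]}
+#   }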
+############################################################################# + +import os +import sys +import json +import optparse +import getpass + +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants +from gspylib.common.DbClusterInfo import dbClusterInfo +from gspylib.common.ErrorCode import ErrorCode +from base_utils.security.security_checker import SecurityChecker, ValidationError +from domain_utils.cluster_file.version_info import VersionInfo + + +def check_streaming_start_mode(mode): + """ + Check start mode + """ + if mode not in ["primary", "disaster_standby"]: + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50011"] % ('-m', mode)) + + +def check_xml_file(file): + """ + Check xml file param + """ + if not file: + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50001'] % 'X') + SecurityChecker.check_is_string('xml file path', file) + if not os.path.isfile(file): + raise ValidationError(ErrorCode.GAUSS_502["GAUSS_50201"] % file) + + +def check_hadr_user(value): + """ + Check disaster user + """ + description = "disaster username" + SecurityChecker.check_db_user(description, value) + + +def check_hadr_pwd(value): + """ + Check disaster user password + """ + description = "disaster user password" + # check_db_password will be used in cloud scene + SecurityChecker.check_db_user(description, value) + + +def check_wait_timeout(value): + """ + Check wait timeout + """ + description = "wait timeout" + SecurityChecker.check_is_digit(description, value) + + +def check_local_cluster_conf(value): + """ + Check local cluster conf + """ + SecurityChecker.check_is_dict("localClusterConf", value) + port = value.get('port') + SecurityChecker.check_port_valid('port of localClusterConf', port) + shards = value.get('shards') + SecurityChecker.check_is_list('shards of localClusterConf', shards) + for shard in shards: + for node in shard: + ip = node.get('ip') + data_ip = node.get('dataIp') + SecurityChecker.check_ip_valid('ip of localClusterConf', ip) + SecurityChecker.check_ip_valid('dataIp of localClusterConf', data_ip) + + +def check_remote_cluster_conf(value): + """ + Check local cluster conf + """ + SecurityChecker.check_is_dict("remoteClusterConf", value) + port = value.get('port') + SecurityChecker.check_port_valid('port of remoteClusterConf', port) + shards = value.get('shards') + SecurityChecker.check_is_list('shards of remoteClusterConf', shards) + for shard in shards: + for node in shard: + ip = node.get('ip') + data_ip = node.get('dataIp') + SecurityChecker.check_ip_valid('ip of remoteClusterConf', ip) + SecurityChecker.check_ip_valid('dataIp of remoteClusterConf', data_ip) + + +STREAMING_PARAMS_FOR_MODULE = { + "start": { + "mode": check_streaming_start_mode, + "xml_path": check_xml_file, + "hadrUserName": check_hadr_user, + "hadrUserPassword": check_hadr_pwd, + "waitingTimeout": check_wait_timeout, + "localClusterConf": check_local_cluster_conf, + "remoteClusterConf": check_remote_cluster_conf + }, + "stop": { + "xml_path": check_xml_file, + "waitingTimeout": check_wait_timeout, + "localClusterConf": check_local_cluster_conf, + "remoteClusterConf": check_remote_cluster_conf + }, + "switchover": { + "mode": check_streaming_start_mode, + "waitingTimeout": check_wait_timeout + }, + "failover": { + "waitingTimeout": check_wait_timeout, + }, + "query": {} +} + +HELP_MSG = """ +gs_sdr is a utility for streaming disaster recovery fully options. + +Usage: + gs_sdr -? 
| --help + gs_sdr -V | --version + gs_sdr -t start -m [primary|disaster_standby] -X XMLFILE [-U DR_USERNAME] [-W DR_PASSWORD] [--json JSONFILE] [--time-out=SECS] [-l LOGFILE] + gs_sdr -t stop -X XMLFILE|--json JSONFILE [-l LOGFILE] + gs_sdr -t switchover -m [primary|disaster_standby] [--time-out=SECS] [-l LOGFILE] + gs_sdr -t failover [-l LOGFILE] + gs_sdr -t query [-l LOGFILE] +General options: + -?, --help Show help information for this utility, + and exit the command line mode. + -V, --version Show version information. + -t Task name, it could be: + "start", "stop", "switchover", "failover", "query". + -m Option mode, it could be: + "primary", "disaster_standby". + -U Disaster recovery user name. + -W Disaster recovery user password. + -X Path of the XML configuration file. + -l Path of log file. + --json Path of params file for streaming options. + --time-out=SECS Maximum waiting time for the Main Standby to connect to the primary DN, + default value is 1200s. +""" + + +class ParamsHandler(object): + """ + Parse and check params. + """ + def __init__(self, logger, trace_id): + self.params = None + self.logger = logger + self.trace_id = trace_id + + @staticmethod + def option_parser(): + """ + Parse command line parameters + :return: param obj + """ + parser = optparse.OptionParser(conflict_handler='resolve') + parser.disable_interspersed_args() + parser.epilog = "Example: gs_sdr -t " \ + "start -m primary -X clusterConfig.xml " \ + "--time-out=1200." + parser.add_option('-V', "--version", dest='version_info', action='store_true', + help='-V|--version show version info.') + parser.add_option('-?', "--help", dest='help_info', action='store_true', + help='-?|--help show help message and exit.') + parser.add_option('-t', dest='task', type='string', + help='Task name. It could be "start", "stop", ' + '"switchover", "failover", "query"') + parser.add_option('-m', dest='mode', type='string', + help='Cluster run mode.
It could be ["primary", "disaster_standby"].') + parser.add_option('-U', dest='hadrusername', type='string', + help='hadr user name.') + parser.add_option('-W', dest='hadruserpasswd', type='string', + help='hadr user password.') + parser.add_option('-X', dest='xml_path', type='string', + help='Cluster config xml path.') + parser.add_option('--json', dest='json_path', type='string', + help='Config json file of streaming options') + parser.add_option('--time-out=', dest='timeout', default="1200", type='string', + help='time out.') + parser.add_option("-l", dest='logFile', type='string', + help='Path of log file.') + return parser + + def __print_usage(self): + """ + Print help message + """ + if self.params.help_info: + print(HELP_MSG) + sys.exit(0) + + def __print_version_info(self): + """ + Print version info + """ + if self.params.version_info: + print("%s %s" % (sys.argv[0].split("/")[-1], + VersionInfo.COMMON_VERSION)) + sys.exit(0) + + def __cluster_conf_parser(self, file_path): + """ + Parse params in json file + """ + if self.params.json_path: + if not os.path.isfile(file_path): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010'] + % '--json' + " Json file is not exist.") + with open(file_path, 'r') as read_fp: + param_dict = json.load(read_fp) + for key, value in param_dict.items(): + if key not in StreamingConstants.STREAMING_JSON_PARAMS[self.params.task]: + continue + setattr(self.params, key, value) + return + cluster_info = dbClusterInfo() + if not self.params.xml_path or not os.path.isfile(self.params.xml_path): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50010'] + % '-X' + " XML file and json file are all not exist.") + cluster_info.initFromXml(self.params.xml_path) + remote_cluster_conf = dict() + remote_cluster_conf.setdefault("port", cluster_info.remote_dn_base_port) + remote_cluster_conf.setdefault("shards", cluster_info.remote_stream_ip_map) + setattr(self.params, "remoteClusterConf", remote_cluster_conf) + self.logger.debug("Remote stream cluster conf: %s." % str(remote_cluster_conf)) + + local_cluster_conf = dict() + local_cluster_conf.setdefault("port", cluster_info.local_dn_base_port) + local_cluster_conf.setdefault("shards", cluster_info.local_stream_ip_map) + setattr(self.params, "localClusterConf", local_cluster_conf) + self.logger.debug("Local stream cluster conf: %s." 
% str(local_cluster_conf)) + if not remote_cluster_conf["shards"] or len(remote_cluster_conf["shards"])\ + != len(local_cluster_conf["shards"]): + raise ValidationError(ErrorCode.GAUSS_500['GAUSS_50026'] % "streaming DR") + + def __init_default_params(self): + """ + Init params if need default value + """ + if not self.params.timeout.isdigit(): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % "--time-out") + self.params.waitingTimeout = int(self.params.timeout) + + def __parse_args(self): + """ + Parse arguments + """ + parser = ParamsHandler.option_parser() + self.params, _ = parser.parse_args() + self.__print_usage() + self.__print_version_info() + if not hasattr(self.params, 'task') or not self.params.task: + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50001"] % 't' + ".") + if self.params.task not in StreamingConstants.STREAMING_JSON_PARAMS.keys(): + raise ValidationError(ErrorCode.GAUSS_500["GAUSS_50004"] % 't') + # parse arguments in json/xml file + if StreamingConstants.STREAMING_JSON_PARAMS[self.params.task]: + self.__cluster_conf_parser(self.params.json_path) + + def __reload_hadr_user_info(self): + """ + Input hadr user info + """ + if self.params.task not in ["start"]: + return + if self.params.hadrusername and self.params.hadruserpasswd: + self.params.hadrUserName = self.params.hadrusername + self.params.hadrUserPassword = self.params.hadruserpasswd + del self.params.hadruserpasswd + return + user_name = "" + if not self.params.hadrusername: + user_name = input("Please enter disaster user name:") + self.params.hadrUserName = user_name if user_name else self.params.hadrusername + if self.params.hadruserpasswd: + self.params.hadrUserPassword = self.params.hadruserpasswd + del self.params.hadruserpasswd + return + for i in range(3): + user_passwd = getpass.getpass("Please enter password for [%s]:" % + self.params.hadrUserName) + user_passwd_check = getpass.getpass("Please repeat enter for password for [%s]:" + % self.params.hadrUserName) + if user_passwd == user_passwd_check: + break + if i == 2: + self.logger.logExit("The two passwords entered for too many " + "times are inconsistent. 
Authentication failed.") + self.logger.error( + ErrorCode.GAUSS_503["GAUSS_50306"] % user_name + + "The two passwords are different, please enter password again.") + self.params.hadrUserPassword = user_passwd + del user_passwd + del user_passwd_check + self.logger.debug("The hadr user information is successfully loaded.") + + def get_valid_params(self): + """ + Check params + """ + try: + self.__parse_args() + self.logger.log(StreamingConstants.LOG_REMARK) + self.logger.log('Streaming disaster recovery ' + self.params.task + ' ' + self.trace_id) + self.logger.log(StreamingConstants.LOG_REMARK) + self.__init_default_params() + self.__reload_hadr_user_info() + for param_name, validate in STREAMING_PARAMS_FOR_MODULE[self.params.task].items(): + check_value = getattr(self.params, param_name) + if self.params.task == "stop": + if param_name == "xml_path" and not check_value: + check_value = getattr(self.params, 'json_path') + validate(check_value) + except ValidationError as error: + self.logger.logExit(str(error)) + return self.params diff --git a/script/impl/streaming_disaster_recovery/streaming_base.py b/script/impl/streaming_disaster_recovery/streaming_base.py new file mode 100644 index 0000000..acbc1a5 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_base.py @@ -0,0 +1,2484 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_base.py is a base module for streaming disaster recovery. 
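+# StreamingBase below holds the state and helpers shared by the streaming
+# disaster recovery tasks: step bookkeeping (query_streaming_step /
+# write_streaming_step; a recorded step such as "2_xxx", a hypothetical
+# value, is read back via int(line.split("_")[0])), per-process lock files
+# (handle_lock_file / check_streaming_process_is_running), distribution of
+# the gs_secure_files directory to every DN data directory
+# (prepare_gs_secure_files), and cluster status collection
+# (init_cluster_status / query_cluster_info).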
+############################################################################# +import json +import os +import re +import time +from datetime import datetime +from datetime import timedelta + +from domain_utils.cluster_file.version_info import VersionInfo +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants +from impl.streaming_disaster_recovery.params_handler import check_local_cluster_conf +from impl.streaming_disaster_recovery.params_handler import check_remote_cluster_conf +from gspylib.common.DbClusterInfo import dbClusterInfo +from gspylib.common.Common import DefaultValue, ClusterInstanceConfig +from gspylib.common.ErrorCode import ErrorCode +from gspylib.common.Common import ClusterCommand +from gspylib.common.OMCommand import OMCommand +from gspylib.common.DbClusterStatus import DbClusterStatus +from gspylib.threads.SshTool import SshTool +from gspylib.threads.parallelTool import parallelTool +from gspylib.os.gsfile import g_file +from base_utils.os.cmd_util import CmdUtil +from base_utils.os.env_util import EnvUtil +from base_utils.os.net_util import NetUtil +from base_utils.os.file_util import FileUtil +from base_utils.os.user_util import UserUtil +from base_utils.security.sensitive_mask import SensitiveMask +from base_utils.common.constantsbase import ConstantsBase + + +class StreamingBase(object): + def __init__(self, params, user, logger, trace_id, log_file=None): + self.user = user + self.params = params + self.logger = logger + self.trace_id = trace_id + self.log_file = log_file + self.cluster_info = None + self.gp_home = None + self.pg_host = None + self.gauss_home = None + self.bin_path = None + self.local_host = None + self.local_ip = None + self.is_single_inst = None + self.streaming_file_dir = None + self.streaming_xml = None + self.cluster_node_names = None + self.normal_cm_ips = [] + self.normal_node_list = [] + self.ssh_tool = None + self.mpp_file = None + self.status_info = None + self.step_file_path = "" + self.cluster_status = '' + self.normal_dn_ids = [] + self.normal_cn_ids = [] + self.normal_etcd_ids = [] + self.normal_gtm_ids = [] + self.normal_cm_ids = [] + self.normal_instances = [] + self.primary_dn_ids = [] + self.main_standby_ids = [] + self.cascade_standby_ids = [] + self.connected_nodes = [] + self.__init_globals() + self.backup_open_key = StreamingConstants.BACKUP_OPEN % user + + def __init_globals(self): + self.cluster_info = dbClusterInfo() + self.cluster_info.initFromStaticConfig(self.user) + self.gp_home = EnvUtil.getEnvironmentParameterValue("GPHOME", self.user) + self.pg_host = EnvUtil.getEnvironmentParameterValue("PGHOST", self.user) + self.gauss_home = EnvUtil.getEnvironmentParameterValue("GAUSSHOME", self.user) + self.bin_path = os.path.join(os.path.realpath(self.gauss_home), 'bin') + self.local_host = NetUtil.GetHostIpOrName() + self.local_ip = DefaultValue.getIpByHostName() + self.is_single_inst = True if self.cluster_info.isSingleInstCluster() else None + self.cluster_node_names = self.cluster_info.getClusterNodeNames() + self.streaming_file_dir = os.path.join(self.pg_host, StreamingConstants.STREAMING_FILES_DIR) + self.streaming_xml = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CONFIG_XML) + self.ssh_tool = SshTool(self.cluster_node_names, self.log_file) + self.mpp_file = EnvUtil.getMpprcFile() + self._init_step_file_path() + + def init_cluster_conf(self): + """ + Init cluster conf from file + """ + if (not hasattr(self.params, "localClusterConf")) \ + or (not hasattr(self.params, 
"remoteClusterConf")): + self.logger.log("Parse cluster conf from file.") + local_conf, remote_conf = self.read_cluster_conf_record() + self.logger.debug("Start validte cluster conf info.") + check_local_cluster_conf(local_conf) + check_remote_cluster_conf(remote_conf) + setattr(self.params, "localClusterConf", local_conf) + setattr(self.params, "remoteClusterConf", remote_conf) + self.logger.log("Successfully parse cluster conf from file.") + + def _init_step_file_path(self): + """ + Init step file path + """ + if self.params.task == StreamingConstants.ACTION_START: + if self.params.mode == "primary": + step_file_name = StreamingConstants.STREAMING_STEP_FILES["start_primary"] + elif self.params.mode == "disaster_standby": + step_file_name = StreamingConstants.STREAMING_STEP_FILES["start_standby"] + else: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "init step file path") + elif self.params.task == StreamingConstants.ACTION_SWITCHOVER: + if self.params.mode == "primary": + step_file_name = StreamingConstants.STREAMING_STEP_FILES["switchover_primary"] + elif self.params.mode == "disaster_standby": + step_file_name = StreamingConstants.STREAMING_STEP_FILES["switchover_standby"] + else: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "init step file path") + else: + step_file_name = StreamingConstants.STREAMING_STEP_FILES[self.params.task] + self.step_file_path = os.path.join(self.streaming_file_dir, step_file_name) + self.logger.debug("Init step file:%s." % self.step_file_path) + + def read_cluster_conf_record(self, check_file_exist=True): + """ + Read cluster conf from file + """ + cluster_conf_record = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_CONF_RECORD) + if not os.path.isfile(cluster_conf_record): + if check_file_exist: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check cluster conf, cluster_conf_record is lost") + else: + self.logger.log("Not found file cluster_conf_record.") + return '', '' + content = DefaultValue.obtain_file_content(cluster_conf_record, is_list=False) + json_content = json.loads(content) + local_conf = json_content["localClusterConf"] + remote_conf = json_content["remoteClusterConf"] + return local_conf, remote_conf + + def handle_lock_file(self, trace_id, action): + """ + Create lock file for other streaming process. + """ + if self.params.task not in StreamingConstants.TASK_EXIST_CHECK: + return + file_name = StreamingConstants.PROCESS_LOCK_FILE + trace_id + file_path = os.path.join(self.pg_host, file_name) + self.logger.debug("Start %s lock file:%s." % (action, file_path)) + if action == 'create': + FileUtil.createFile(file_path, DefaultValue.KEY_FILE_MODE) + elif action == 'remove': + if os.path.isfile(file_path): + FileUtil.removeFile(file_path, DefaultValue.KEY_FILE_MODE) + else: + self.logger.warn("Not found:%s." % file_path) + self.logger.debug("Successfully %s lock file:%s." 
% (action, file_path)) + + def check_streaming_process_is_running(self): + """ + Check streaming process is running + """ + hostnames = ' -H '.join(self.cluster_node_names) + file_path = os.path.join(self.pg_host, StreamingConstants.PROCESS_LOCK_FILE) + cmd = 'source %s && pssh -t 10 -H %s "ls %s*"' % (self.mpp_file, hostnames, file_path) + # waiting for check + time.sleep(StreamingConstants.CHECK_PROCESS_WAIT_TIME) + _, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + host_file_str_list = re.findall(r'.* ?: *%s[^\*^\s]+' % file_path, output) + process_list = [] + for item in host_file_str_list: + hostname = item.split(':')[0].strip() + file_name = item.split(':')[1].strip() + uuid = os.path.basename(file_name).split('_')[-1] + if uuid != self.trace_id: + process_list.append([hostname, file_name]) + if process_list: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % 'check streaming process, please execute streaming options after other ' \ + 'process exited, if you ensure no other process is running, ' \ + 'remove the lock file [%s] on node [%s], and try again' \ + % (process_list[0][-1], process_list[0][0]) + self.logger.error(msg) + raise Exception(msg) + + def create_streaming_dir(self, dir_path): + """ + Create streaming files dir + """ + cmd = g_file.SHELL_CMD_DICT["createDir"] % ( + dir_path, dir_path, DefaultValue.MAX_DIRECTORY_MODE) + self.ssh_tool.executeCommand(cmd) + self.logger.debug("Successfully create dir [%s] on all nodes." % dir_path) + + def check_hadr_pwd(self, only_mode=None): + """ + Check hadr pwd is correct or not + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Checking hadr user is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start checking disaster user password.") + sql = "select 1;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain primary dn when check disaster user") + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, primary_dns[0].hostname, primary_dns[0].port, False, + user_name=self.params.hadrUserName, user_pwd=self.params.hadrUserPassword) + if status != 0: + if "Invalid username/password" in output: + self.logger.debug("Logging denied, please check your password.") + self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632'] + % "check disaster user password") + self.logger.debug("Successfully check disaster user password.") + + def check_hadr_user(self, only_mode=None): + """ + Check hadr user is exist + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Checking hadr user is not for mode:%s." 
% self.params.mode) + return + self.logger.log("Start checking disaster recovery user.") + sql = "select usename, userepl from pg_user;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain primary dn when check disaster user") + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, primary_dns[0].hostname, primary_dns[0].port, True) + if status != 0: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "execute sql for checking disaster user.") + user_dict = {user_info.split('|')[0].strip(): user_info.split('|')[-1].strip() + for user_info in output.strip().split('\n')} + for user_name, repl in user_dict.items(): + if user_name == self.params.hadrUserName and repl == 't': + self.logger.log("Successfully check disaster recovery user.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % 'checking disaster user, please confirm ' \ + 'disaster user is exist and with ' \ + 'replication role' + self.logger.logExit(msg + "Users:%s" % user_dict) + + def __copy_hadr_user_key(self, secure_dir_path, update=False): + """ + Copy hadr.key.cipher and hadr.key.rand + """ + self.logger.log("Start copy hadr user key files.") + hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher") + hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand") + secure_cipher_path = os.path.join(secure_dir_path, "hadr.key.cipher") + secure_rand_path = os.path.join(secure_dir_path, "hadr.key.rand") + if not update: + if (not os.path.isfile(hadr_cipher_path)) or (not os.path.isfile(hadr_rand_path)): + self.logger.debug("Not found hadr user key, no need to copy.") + return + FileUtil.cpFile(hadr_cipher_path, secure_cipher_path, cmd_type="shell") + FileUtil.cpFile(hadr_rand_path, secure_rand_path, cmd_type="shell") + self.logger.debug("Successfully copy hadr key files into temp secure dir.") + else: + if (not os.path.isfile(secure_cipher_path)) or (not os.path.isfile(secure_rand_path)): + self.logger.debug("Not found hadr user key, no need to update.") + return + host_names = self.get_all_connection_node_name("update_hadr_key") + self.ssh_tool.scpFiles(secure_cipher_path, self.bin_path, hostList=host_names) + self.ssh_tool.scpFiles(secure_rand_path, self.bin_path, hostList=host_names) + FileUtil.removeFile(secure_cipher_path) + FileUtil.removeFile(secure_rand_path) + self.logger.debug("Finished copy hadr key files to nodes:%s." 
% host_names) + + def remove_secure_dir(self, dir_path, host_name): + """ + Remove gs_secure_files dir in PGDATA + """ + secure_dir_path = os.path.join(dir_path, StreamingConstants.GS_SECURE_FILES) + cmd = "echo \"if [ -d '%s' ];then rm -rf '%s';fi\" | pssh -s -H %s" % \ + (secure_dir_path, secure_dir_path, host_name) + status, output = CmdUtil.retryGetstatusoutput(cmd) + self.logger.debug("Remove gs_secure_files cmd:%s" % cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + " Error: \n%s " % output) + + def __stream_copy_file_to_all_dn(self, temp_secure_dir_path): + """ + copy key file dir to all dn dir + """ + dn_infos = DefaultValue.get_dn_info(self.cluster_info) + self.logger.debug("Got dns:%s" % dn_infos) + copy_succeed = 0 + host_names = self.get_all_connection_node_name("copy gs_secure_files to dns") + for dn_info in dn_infos: + if dn_info["host_name"] not in host_names: + continue + self.logger.debug("Copy disaster recovery secure files to inst[%s][%s][%s]." % + (dn_info['id'], dn_info['data_dir'], dn_info['host_name'])) + try: + self.remove_secure_dir(dn_info['data_dir'], dn_info['host_name']) + self.ssh_tool.scpFiles( + temp_secure_dir_path, dn_info['data_dir'], [dn_info['host_name']]) + copy_succeed += 1 + except Exception as error: + self.logger.debug("Failed copy secure files to inst[%s][%s][%s],error:%s." % + (dn_info['id'], dn_info['data_dir'], dn_info['host_name'], + str(error))) + if copy_succeed == 0: + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure dir to all dn data dir") + self.logger.log("Successfully copy secure files.") + + def __prepare_cluster_user_record(self, temp_secure_dir_path): + """ + Save cluster user record + """ + cluster_user_record = os.path.join(temp_secure_dir_path, + StreamingConstants.CLUSTER_USER_RECORD) + DefaultValue.write_content_on_file(cluster_user_record, self.user) + self.logger.debug("Record current cluster user:%s." % self.user) + + def prepare_gs_secure_files(self, only_mode=None): + """ + Prepare gs_secure_files on primary cluster + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Prepare gs_secure_files is not for mode:%s." % self.params.mode) + return + self.logger.log("Start prepare secure files.") + secure_dir_name = StreamingConstants.GS_SECURE_FILES + temp_secure_dir_path = os.path.realpath( + os.path.join(self.streaming_file_dir, secure_dir_name)) + if os.path.isdir(temp_secure_dir_path): + self.logger.debug("Secure file dir exist, cleaning...") + FileUtil.removeDirectory(temp_secure_dir_path) + FileUtil.createDirectory(temp_secure_dir_path, True, DefaultValue.KEY_DIRECTORY_MODE) + if os.path.isdir(temp_secure_dir_path): + self.logger.debug("Successfully create secure file dir.") + version_file_path = os.path.realpath(os.path.join(self.gp_home, "version.cfg")) + FileUtil.cpFile(version_file_path, temp_secure_dir_path) + self.__prepare_cluster_user_record(temp_secure_dir_path) + self.__copy_hadr_user_key(temp_secure_dir_path, update=False) + self.__stream_copy_file_to_all_dn(temp_secure_dir_path) + FileUtil.removeDirectory(temp_secure_dir_path) + + def stream_clean_gs_secure(self, params): + """ + clean gs secure dir + """ + inst, file_path = params + self.logger.debug("Starting clean instance %s gs secure dir." 
% inst.instanceId) + cmd = "source %s && pssh -s -H %s 'if [ -d %s ]; then rm -rf %s; fi'" \ + % (self.mpp_file, inst.hostname, file_path, file_path) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.debug("Clean gs secure dir for instance [%s] result:%s." % + (inst.instanceId, output)) + self.logger.debug("Successfully clean instance %s gs secure dir." % inst.instanceId) + + def clean_gs_secure_dir(self, only_mode=None): + """ + Clean gs secure dir if exist + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Clean gs_secure_files is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start clean gs secure dir.") + params = [] + for node in self.cluster_info.dbNodes: + for inst in node.datanodes: + if inst.hostname not in self.connected_nodes: + continue + file_path = os.path.realpath(os.path.join( + inst.datadir, StreamingConstants.GS_SECURE_FILES)) + params.append((inst, file_path)) + if params: + parallelTool.parallelExecute(self.stream_clean_gs_secure, params) + self.logger.debug("Finished clean gs secure dir.") + + def remove_streaming_dir(self, dir_path): + """ + Remove streaming files dir + """ + cmd = "if [ -d %s ]; then rm %s -rf;fi" % (dir_path, self.streaming_file_dir) + self.ssh_tool.executeCommand(cmd) + self.logger.debug("Successfully remove dir [%s] on all nodes." % dir_path) + + def query_streaming_step(self): + """ + Streaming step + """ + step = -1 + if os.path.isfile(self.step_file_path): + step_list = FileUtil.readFile(self.step_file_path) + if step_list: + step = int(step_list[0].split("_")[0]) + if step == -1: + self.logger.log("Got the step for action:[%s]." % self.params.task) + else: + self.logger.log("Got the continue step:[%s] for action:[%s]." % + (step, self.params.task)) + return step + + def write_streaming_step(self, step): + """ + write streaming step + :return: NA + """ + self.logger.debug("Streaming action:[%s] record current step:[%s]" + % (self.params.task, step)) + with os.fdopen(os.open(self.step_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(step) + + def init_cluster_status(self): + """ + Generate cluster status file + """ + tmp_file = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_STATUS_TMP_FILE) + cmd = ClusterCommand.getQueryStatusCmd("", tmp_file) + self.logger.debug("Command for checking cluster state: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check cluster state, status:%s, output:%s" % (status, output) + self.logger.debug(msg) + raise Exception(msg) + self.logger.debug("Successfully init cluster status.") + + def query_cluster_info(self, cm_check=False): + """ + Query cluster info + """ + cmd = ClusterCommand.getQueryStatusCmd() + if cm_check: + cmd = "source %s; cm_ctl query -Cv" % self.mpp_file + self.logger.debug("Command for checking cluster state: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output.strip(): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check cluster state, status:%s, output:%s" % (status, output) + self.logger.debug(msg) + return "" + return output.strip() + + def __clean_cluster_status(self): + """ + Clean status + """ + self.normal_cm_ids = [] + self.normal_gtm_ids = [] + self.normal_cn_ids = [] + self.primary_dn_ids = [] + self.main_standby_ids = [] + self.cascade_standby_ids = [] + self.normal_dn_ids = 
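# --- Illustrative sketch (annotation, not part of the patch) ---
# query_streaming_step()/write_streaming_step() make every task resumable:
# the current step is written to a file with a restrictive mode and read back
# as the integer before the first "_".  Minimal standalone equivalent
# (file name and suffix format are assumptions):
import os

STEP_FILE = "/tmp/.streaming_example.step"

def read_step(path=STEP_FILE):
    """Return the recorded step, or -1 if nothing was recorded yet."""
    if not os.path.isfile(path):
        return -1
    with open(path) as fp:
        first = fp.readline().strip()
    return int(first.split("_")[0]) if first else -1

def write_step(step, path=STEP_FILE):
    """Persist the step with owner-only permissions (0o600)."""
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as fp:
        fp.write("%s_step" % step)

# write_step(3); assert read_step() == 3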
[] + self.normal_etcd_ids = [] + self.normal_instances = [] + + def __parse_instance_status(self): + """ + Parse instance status + """ + abnormal_insts = [] + for db_node in self.status_info.dbNodes: + for cms_inst in db_node.cmservers: + if cms_inst.status in ["Primary", "Standby"]: + self.normal_cm_ids.append(cms_inst.instanceId) + self.normal_instances.append(cms_inst) + else: + abnormal_insts.append({cms_inst.instanceId: cms_inst.status}) + for gtm_inst in db_node.gtms: + if gtm_inst.status in ["Primary", "Standby"] and gtm_inst.isInstanceHealthy(): + self.normal_gtm_ids.append(gtm_inst.instanceId) + self.normal_instances.append(gtm_inst) + else: + abnormal_insts.append({gtm_inst.instanceId: gtm_inst.status}) + for coo_inst in db_node.coordinators: + if coo_inst.status == "Normal": + self.normal_cn_ids.append(coo_inst.instanceId) + self.normal_instances.append(coo_inst) + else: + abnormal_insts.append({coo_inst.instanceId: coo_inst.status}) + for data_inst in db_node.datanodes: + if data_inst.status in ["Primary"]: + self.primary_dn_ids.append(data_inst.instanceId) + if data_inst.status in ["Main Standby"]: + self.main_standby_ids.append(data_inst.instanceId) + if data_inst.status in ["Cascade Standby"]: + self.cascade_standby_ids.append(data_inst.instanceId) + if data_inst.status in ["Primary", "Standby", "Cascade Standby", "Main Standby" + ] and data_inst.isInstanceHealthy(): + self.normal_dn_ids.append(data_inst.instanceId) + self.normal_instances.append(data_inst) + else: + abnormal_insts.append({data_inst.instanceId: data_inst.status}) + for etcd_inst in db_node.etcds: + if etcd_inst.status in ["StateLeader", "StateFollower"] \ + and etcd_inst.isInstanceHealthy(): + self.normal_etcd_ids.append(etcd_inst.instanceId) + self.normal_instances.append(etcd_inst) + else: + abnormal_insts.append({etcd_inst.instanceId: etcd_inst.status}) + return abnormal_insts + + def parse_cluster_status(self, current_status=None): + """ + Parse cluster status + """ + tmp_file = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_STATUS_TMP_FILE) + if (not os.path.isfile(tmp_file)) and (not current_status): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] + % "cluster status file:%s" % tmp_file) + self.status_info = DbClusterStatus() + self.__clean_cluster_status() + if current_status: + self.status_info.init_from_content(current_status) + else: + self.status_info.initFromFile(tmp_file) + self.cluster_status = self.status_info.clusterStatus + self.logger.debug("Current cluster status is:%s." 
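# --- Illustrative sketch (annotation, not part of the patch) ---
# __parse_instance_status() buckets datanodes by their reported status so
# later steps can tell a primary cluster (Primary dns) from a standby cluster
# (Main Standby / Cascade Standby dns).  Simplified classification over
# (instance_id, status) pairs:
def classify_datanodes(dn_status_pairs):
    """Split datanodes into primary, main-standby and cascade-standby ids."""
    primary, main_standby, cascade = [], [], []
    for inst_id, status in dn_status_pairs:
        if status == "Primary":
            primary.append(inst_id)
        elif status == "Main Standby":
            main_standby.append(inst_id)
        elif status == "Cascade Standby":
            cascade.append(inst_id)
    return primary, main_standby, cascade

# classify_datanodes([(6001, "Primary"), (6002, "Standby")])
# -> ([6001], [], [])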
% self.cluster_status) + # Parse instance status + abnormal_insts = self.__parse_instance_status() + # Get node names of normal nodes with nodeId + for instance in self.normal_instances: + self.normal_node_list.append(self.cluster_info.getDbNodeByID(int(instance.nodeId)).name) + self.normal_node_list = list(set(self.normal_node_list)) + for node_id in list(set(self.normal_cm_ids)): + self.normal_cm_ips.append(self.cluster_info.getDbNodeByID(int(node_id)).name) + self.logger.debug("Parsed primary dns:%s" % self.primary_dn_ids) + self.logger.debug("Parsed Main standby dns:%s" % self.main_standby_ids) + if abnormal_insts: + self.logger.debug("Abnormal instances:%s" % abnormal_insts) + else: + self.logger.debug("Checked all instances is normal:%s" + % set([inst.instanceId for inst in self.normal_instances])) + + def check_cluster_status(self, status_allowed, only_check=False, + check_current=False, is_log=True): + """ + Stream disaster cluster switch to check cluster status + """ + cluster_status = self.cluster_status + if check_current: + self.logger.debug("Starting check CLuster status") + check_cmd = "source %s && cm_ctl query | grep cluster_state | awk '{print $NF}'"\ + % self.mpp_file + status, output = CmdUtil.retryGetstatusoutput(check_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + cluster_status = output.strip() + self.logger.debug("Checked cluster status is:%s" % cluster_status) + if cluster_status not in status_allowed: + if only_check is True: + self.logger.debug("Current cluster status is %s" % cluster_status) + return False + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check cluster status") + if is_log: + self.logger.log("Successfully check cluster status is: %s." % cluster_status) + else: + self.logger.debug("Checked cluster status is: %s." 
% cluster_status) + return True + + def check_is_under_upgrade(self): + """ + Check is cluster is not doing upgrade + """ + if DefaultValue.isUnderUpgrade(self.user): + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check upgrade binary file, please ensure upgrade " + "is finished and upgrade files has been cleaned") + self.logger.debug("Successfully check cluster is not under upgrade opts.") + + def check_cluster_is_common(self): + """ + Check no main standby and cascade standby + """ + if self.main_standby_ids or self.cascade_standby_ids: + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check instance status, there are already main standby " + "or cascade standby, main standby:%s, cascade stadnby:%s" + % (self.main_standby_ids, self.cascade_standby_ids)) + self.logger.log("Successfully check instance status.") + + def check_dn_instance_params(self): + """set_dn_instance_params""" + check_dick = {"enable_dcf": "off", "synchronous_commit": "on"} + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if len(dn_insts) <= 2: + self.logger.debug("Need set most available for current cluster.") + check_dick.update({"most_available_sync": "on"}) + primary_dn_insts = [inst for inst in dn_insts if inst.instanceId in self.primary_dn_ids] + if not primary_dn_insts: + self.logger.debug("The primary dn not exist, do not need check dn inst params.") + return + execute_dn = primary_dn_insts[0] + param_list = [] + guc_backup_file = os.path.join(self.streaming_file_dir, StreamingConstants.GUC_BACKUP_FILE) + if not os.path.isfile(guc_backup_file): + FileUtil.createFileInSafeMode(guc_backup_file, DefaultValue.KEY_FILE_MODE_IN_OS) + for peer_check, idx in list(check_dick.items()): + param_list.append((execute_dn, {peer_check: idx})) + ret = parallelTool.parallelExecute(self._check_dn_inst_param, param_list) + self.ssh_tool.scpFiles(guc_backup_file, self.streaming_file_dir, self.cluster_node_names) + if any(ret): + self.logger.logExit('\n'.join(filter(bool, ret))) + self.logger.debug("Successfully check dn inst default value.") + + def _check_dn_inst_param(self, param): + """check_dn_inst_param""" + self.logger.debug("Check dn inst params: %s." % param[1]) + if len(param) != 2: + error_msg = ErrorCode.GAUSS_521["GAUSS_52102"] % param + return error_msg + guc_backup_file = os.path.join(self.streaming_file_dir, StreamingConstants.GUC_BACKUP_FILE) + for sql_key, value in list(param[1].items()): + sql = "show %s;" % sql_key + (status, output) = ClusterCommand.remoteSQLCommand(sql, + self.user, param[0].hostname, + str(param[0].port)) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % sql, "\nError:%s" % output) + if output.strip() != value: + if sql_key in StreamingConstants.GUC_CHANGE_MAP.keys(): + content = "%s,%s,%s" % (sql_key, output.strip(), self.trace_id) + FileUtil.write_add_file(guc_backup_file, content, + DefaultValue.KEY_FILE_MODE_IN_OS) + self.__set_guc_param(sql_key, StreamingConstants.GUC_CHANGE_MAP[sql_key], + mode="reload", inst_type="dn", raise_error=True) + return + error_msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "check [%s], Actual value: [%s], expect value: [%s]" \ + % (sql, output, value) + return error_msg + self.logger.debug("Successfully check and rectify dn inst value:%s." 
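# --- Illustrative sketch (annotation, not part of the patch) ---
# _check_dn_inst_param() records the original GUC value as a
# "key,old_value,trace_id" line before reloading the required value, so that
# restore_guc_params() can later read the records (newest first) and roll the
# settings back.  Standalone sketch of the record step (the backup file path
# in the usage comment is an assumption):
def backup_guc_value(backup_file, key, old_value, trace_id):
    """Append one 'key,old_value,trace_id' record to the backup file."""
    with open(backup_file, "a") as fp:
        fp.write("%s,%s,%s\n" % (key, old_value, trace_id))

# backup_guc_value("/tmp/.streaming_guc_backup", "synchronous_commit",
#                  "off", "trace-123")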
% param[1]) + + def restore_guc_params(self): + """ + Restore guc params in .streaming_guc_backup + """ + self.logger.debug("Start restore guc params.") + guc_backup_file = os.path.join(self.streaming_file_dir, StreamingConstants.GUC_BACKUP_FILE) + if not os.path.isfile(guc_backup_file): + self.logger.debug("Not found guc backup file, no need restore guc params.") + params_record = DefaultValue.obtain_file_content(guc_backup_file) + params_record.reverse() + restored_keys = [] + for param in params_record: + guc_key, guc_value, trace_id = param.split(",") + self.logger.debug("Got guc param:%s, value:%s, trace id:%s in guc backup file." + % (guc_key, guc_value, trace_id)) + if guc_key not in StreamingConstants.GUC_CHANGE_MAP.keys(): + continue + # When the number of dns <=2, ensure that the maximum available mode is always on. + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if guc_key in restored_keys or len(dn_insts) <= 2 \ + and guc_key in ["most_available_sync"]: + continue + guc_value = "off" if guc_value not in ["on", "off"] else guc_value + self.__set_guc_param(guc_key, guc_value, mode="reload", + inst_type="dn", raise_error=False) + restored_keys.append(guc_key) + + def set_most_available(self, mode='set', inst_type='dn', raise_error=True): + dn_insts = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if int(dn_inst.mirrorId) == 1] + if len(dn_insts) > 2: + self.logger.debug("No need set most available for current cluster.") + return + self.__set_guc_param("most_available_sync", "on", mode=mode, + inst_type=inst_type, raise_error=raise_error) + + self.__set_guc_param("synchronous_commit", "on", mode=mode, + inst_type=inst_type, raise_error=raise_error) + + def __set_guc_param(self, key, value, mode='set', inst_type='dn', raise_error=True): + """ + Set guc param + """ + if inst_type == 'dn': + instance = '-Z datanode' + elif inst_type == 'cn': + instance = '-Z coordinator' + else: + instance = "-Z datanode -Z coordinator" + cmd = "source %s && gs_guc %s %s -N all -I all " \ + "-c \"%s=%s\"" \ + % (self.mpp_file, mode, instance, key, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + if raise_error: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) + else: + self.logger.debug(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + "Error:%s" % output) + else: + self.logger.debug("Successfully change %s %s with mode %s." % (key, value, mode)) + + def distribute_cluster_conf(self): + """ + Record cluster conf in files + """ + data = {"remoteClusterConf": self.params.remoteClusterConf, + "localClusterConf": self.params.localClusterConf} + file_path = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_CONF_RECORD) + FileUtil.write_update_file(file_path, data, DefaultValue.KEY_FILE_MODE_IN_OS) + self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, self.cluster_node_names) + + def __record_wal_keep_segments(self, param_list): + """ + record wal_keep_segments value to .wal_keep_segments_record + """ + dn_inst, sql_check, wal_keep_segments = param_list + self.logger.debug("Starting record wal_keep_segments default " + "value for isntance:%s." % dn_inst.instanceId) + (status, output) = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port, True) + self.logger.debug("Got %s wal_keep_segments, status=%d, output: %s." 
% + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + value = output.strip() + FileUtil.createFile(wal_keep_segments, True, DefaultValue.KEY_FILE_MODE) + FileUtil.writeFile(wal_keep_segments, [str(dn_inst.instanceId) + ":" + str(value)]) + self.logger.debug("Successfully record %s wal_keep_segments default value:%s" % + (dn_inst.hostname, value)) + else: + raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] + % "wal_keep_segments default value of %s" % dn_inst.instanceId) + + def get_default_wal_keep_segments(self, only_mode=None): + """ + get wal_keep_segments default value in primary dn + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Wal keep segment opts not for mode:%s." % self.params.mode) + return + self.logger.debug("Starting get wal_keep_segments default value.") + wal_keep_segments = os.path.join( + self.streaming_file_dir, StreamingConstants.WAL_KEEP_SEGMENTS) + sql_check = "show wal_keep_segments;" + param_list = [(dn_inst, sql_check, wal_keep_segments) for db_node in + self.cluster_info.dbNodes for dn_inst in db_node.datanodes + if dn_inst.instanceId in self.primary_dn_ids] + if not param_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for get wal_keep_segments") + parallelTool.parallelExecute(self.__record_wal_keep_segments, param_list) + self.logger.debug("Successfully get wal_keep_segments default value.") + + def __set_wal_keep_segments_each_inst(self, params_list): + """ + Set wal_keep_segments value in primary dn + """ + (inst, opt_type, value, mpprc_file) = params_list + self.logger.debug("Start [%s] shardNum [%s] node [%s] wal_keep_segments value [%s]." + % (opt_type, inst.mirrorId, inst.hostname, value)) + cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"wal_keep_segments = '%s'\\\"\"" % \ + (mpprc_file, inst.hostname, mpprc_file, opt_type, inst.datadir, value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + "Options:%s, Error: \n%s " + % ("set wal_keep_segments for inst:%s" % inst.instanceId, str(output))) + self.logger.debug("Successfully [%s] shardNum [%s] node [%s] wal_keep_segments " + "value [%s]." % (opt_type, inst.mirrorId, inst.hostname, value)) + + def set_wal_keep_segments(self, opt_type, value, restore_flag=False, only_mode=None): + """ + guc set wal_keep_segments value in primary dn + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set wal_keep_segments opts not for mode:%s." % self.params.mode) + return + self.logger.log("Starting %s wal_keep_segments value: %s." % (opt_type, value)) + if restore_flag and isinstance(value, dict): + params_list = [(inst, opt_type, value.get(inst.instanceId, 128), self.mpp_file) for + node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + else: + params_list = [(inst, opt_type, value, self.mpp_file) for node in + self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + if not params_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for set wal_keep_segments") + parallelTool.parallelExecute(self.__set_wal_keep_segments_each_inst, params_list) + self.logger.log("Successfully %s wal_keep_segments value: %s." 
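# --- Illustrative sketch (annotation, not part of the patch) ---
# The wal_keep_segments handling stores one "instance_id:value" line per
# primary dn and later feeds the parsed dict back into a guc reload when the
# default values are restored.  Standalone helper for that record format:
def parse_wal_keep_segments_records(lines):
    """Turn ['6001:128', '6002:64'] into {'6001': '128', '6002': '64'}."""
    return {line.split(":")[0].strip(): line.split(":")[1].strip()
            for line in lines if ":" in line}

# parse_wal_keep_segments_records(["6001:128"]) -> {"6001": "128"}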
% (opt_type, value)) + + def __stop_one_node(self, node_id): + """ + Stop one node by node id + """ + self.logger.debug("Start stop node:%s" % node_id) + cmd = ClusterCommand.getStopCmd(int(node_id), "i", 1800) + self.logger.debug("Streaming disaster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.debug("Failed stop node:%s, error:%s" % (node_id, output)) + else: + self.logger.debug("Successfully stop node:%s" % node_id) + + def stop_cluster_by_node(self, only_mode=None): + """ + stop the cluster by node + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Stop cluster by node not for mode:%s." % self.params.mode) + return + self.logger.log("Stopping the cluster by node.") + static_config = "%s/cluster_static_config" % self.bin_path + cm_ctl_file = "%s/cm_ctl" % self.bin_path + if not os.path.isfile(static_config) or not os.path.isfile(cm_ctl_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] + % (static_config + " or " + cm_ctl_file)) + node_id_list = list(set([instance.nodeId for instance in self.normal_instances])) + parallelTool.parallelExecute(self.__stop_one_node, node_id_list) + self.logger.log("Successfully stopped the cluster by node for streaming cluster.") + + def get_all_connection_node_name(self, action_flag="", no_update=True): + """ + get all connection node name + """ + if self.connected_nodes and no_update: + self.logger.debug("Got connected nodes:%s for action:%s" + % (self.connected_nodes, action_flag)) + return self.connected_nodes + rets = parallelTool.parallelExecute(DefaultValue.fast_ping, self.cluster_node_names) + self.logger.debug("Check connect for action:%s, result:%s" % (action_flag, str(rets))) + connected_hosts = [ret[0] for ret in rets if ret[-1]] + self.connected_nodes = connected_hosts + return self.connected_nodes + + def update_streaming_pg_hba(self): + """ + update pg_hba.conf, read config_param.json file and set other cluster ip + :return:NA + """ + self.logger.log("Start update pg_hba config.") + FileUtil.cpFile(self.params.xml_path, self.streaming_xml) + cmd = "source %s; %s -U %s -X '%s' --try-reload" % ( + self.mpp_file, OMCommand.getLocalScript( + "Local_Config_Hba"), self.user, self.streaming_xml) + self.logger.debug("Command for changing instance pg_hba.conf file: %s" % cmd) + self.get_all_connection_node_name("update_streaming_pg_hba") + try: + self.ssh_tool.scpFiles(self.streaming_xml, self.streaming_file_dir) + self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) + except Exception as error: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "update streaming pg_hba with error:%s" % error + self.logger.debug(msg) + raise Exception(msg) + self.logger.log("Successfully update pg_hba config.") + + def __get_repl_info_cmd(self, node_name, ret, dn_inst, opt_mode, idx): + """ + get_repl_info_cmd + """ + if node_name != self.local_host: + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c " \ + "\\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s iscascade=%s iscrossregion=%s'\\\"\"" + set_cmd = set_cmd % (self.mpp_file, node_name, + self.mpp_file, opt_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8), "true", "false") + else: + set_cmd = "source %s ; gs_guc %s -Z datanode 
-D %s -c " \ + "\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s iscascade=%s iscrossregion=%s'\"" + set_cmd = set_cmd % (self.mpp_file, opt_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8), "true", "false") + return set_cmd + + def __set_original_repl_info(self, dn_inst, node_name, opt_mode="set"): + """ + Rectify original replconninfos + """ + orignal_ports = None + if not all([dn_inst, node_name]): + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain dn infos") + for idx in range(1, StreamingConstants.MAX_REPLICATION_NUMS + 1): + if node_name == self.local_host: + cmd = "source %s; gs_guc check -Z datanode -D %s " \ + "-c 'replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) + else: + cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ + "-Z datanode -D %s -c \"replconninfo%s\"'" \ + % (self.mpp_file, node_name, self.mpp_file, dn_inst.datadir, idx) + self.logger.debug("Check original repl infos with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + if output.count("=NULL") > 2 or "iscrossregion=true" in output.lower(): + self.logger.debug("InstanceID:%s, Index:%s" % (dn_inst.instanceId, idx)) + return idx, orignal_ports + ret = re.search( + r"replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + r" localport=(\d{4,5}) localheartbeatport=(\d{4,5}) " + r"localservice=(\d{4,5}) " + r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " + r"remoteport=(\d{4,5}) remoteheartbeatport=(\d{4,5}) " + r"remoteservice=(\d{4,5})" % idx, output) + if not ret: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") + set_cmd = self.__get_repl_info_cmd(node_name, ret, dn_inst, opt_mode, idx) + self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + orignal_ports = (ret.group(2), ret.group(3), ret.group(4)) + self.logger.debug("Successfully rectify original repl infos for instance:%s." + % dn_inst.instanceId) + + def __get_local_data_ip(self, inst_host): + """ + Get local data ip + """ + local_cluster_info = self.params.localClusterConf + shards = local_cluster_info["shards"] + inst_ips = DefaultValue.get_remote_ips(inst_host, self.mpp_file) + for shard in shards: + for node in shard: + ip = node["ip"] + data_ip = node["dataIp"] + if ip in inst_ips: + self.logger.debug("Got ip[%s], dataIp[%s]." % (ip, data_ip)) + return data_ip + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "obtain shards from local cluster info") + + def __config_one_dn_instance(self, params): + """ + Config replconninfo for one dn instance + """ + inst, opt_mode, remote_cluster_info = params + local_data_ip = self.__get_local_data_ip(inst.hostname) + base_dn_port = self.params.remoteClusterConf['port'] + self.logger.debug("Start config instance:[%s], got dataIp:[%s], port:[%s]." 
+ % (inst.instanceId, local_data_ip, base_dn_port)) + if not all([local_data_ip, base_dn_port]): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] + % "dn port or dataIp for config instance") + inst_index, original_ports = self.__set_original_repl_info( + inst, inst.hostname, opt_mode=opt_mode) + repl_params = [] + shards = remote_cluster_info.get("shards") + for shard in shards: + for node_info in shard: + data_ip = node_info.get("dataIp") + shard_num = node_info.get("shardNum", '1') + if str(inst.mirrorId) == str(shard_num): + repl_params.append(( + shard_num, inst.hostname, local_data_ip, + inst.datadir, data_ip, inst_index, + original_ports, base_dn_port, opt_mode)) + inst_index += 1 + return repl_params + + def __do_config_dn_repl_info(self, params): + """ + function:config postgres conf + :return:NA + """ + shard_num, host, local_data_ip, data_dir, data_ip, index, \ + original_ports, base_port, opt_mode = params + local_port, local_heartbeat, local_service = original_ports + remote_base = int(base_port) + self.logger.debug("shard num %s base port is %s" % (shard_num, remote_base)) + remote_port = remote_base + 1 + remote_heartbeat = remote_base + 5 + remote_service = remote_base + 4 + is_cascade = "false" + if self.local_host == host: + guc_cmd = "source %s ; gs_guc %s -Z datanode -D %s " \ + "-c \"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ + "iscascade=%s iscrossregion=true'\"" \ + % (self.mpp_file, opt_mode, data_dir, index, local_data_ip, local_port, + local_heartbeat, local_service, data_ip, remote_port, + remote_heartbeat, remote_service, is_cascade) + self.logger.debug("Set datanode postgres file for streaming " + "disaster cluster with cmd:%s" % guc_cmd) + else: + guc_cmd = "source %s; pssh -s -H %s \"source %s ; gs_guc %s -Z datanode -D %s " \ + "-c \\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s remoteservice=%s " \ + "iscascade=%s iscrossregion=true'\\\"\"" \ + % (self.mpp_file, host, + self.mpp_file, opt_mode, data_dir, index, + local_data_ip, local_port, local_heartbeat, + local_service, data_ip, remote_port, + remote_heartbeat, remote_service, is_cascade) + self.logger.debug("Set datanode postgres file for streaming " + "disaster cluster with cmd:%s" % guc_cmd) + status, output = CmdUtil.retryGetstatusoutput(guc_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % guc_cmd + + " Error: \n%s " % output) + + def config_streaming_repl_info(self): + """ + update postgresql.conf for replconninfo + """ + self.logger.debug("set all datanode guc param in postgres conf for streaming cluster.") + repl_params = [] + opt_mode = "reload" if self.params.mode == "primary" else "set" + config_repl_params = [] + datanode_instance = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes] + + for inst in datanode_instance: + config_repl_params.append((inst, opt_mode, self.params.remoteClusterConf)) + rets = parallelTool.parallelExecute(self.__config_one_dn_instance, config_repl_params) + for param in rets: + repl_params += param + self.logger.debug("Got repl params:%s" % str(repl_params)) + parallelTool.parallelExecute(self.__do_config_dn_repl_info, repl_params) + self.logger.debug( + "Successfully set all datanode guc param in postgres conf for streaming cluster.") + + def set_cmserver_guc(self, guc_parameter, guc_value, 
guc_type, only_mode=None): + """ + set cmserver guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set cms guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z cmserver -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set cm server guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) + + def set_cmagent_guc(self, guc_parameter, guc_value, guc_type, only_mode=None): + """ + set cmagent guc param + :return: NA + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Set cma guc [%s] to [%s] not for mode:%s." + % (guc_parameter, guc_value, self.params.mode)) + return + cmd = "gs_guc %s -Z cmagent -N all -I all -c \"%s=%s\" " % \ + (guc_type, guc_parameter, guc_value) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + msg = ErrorCode.GAUSS_516['GAUSS_51632'] \ + % "set cm agent guc [%s] to [%s], output:%s" \ + % (guc_parameter, guc_value, output) + self.logger.debug(msg) + + def __check_datanode_data_ip_connection(self, inst): + """ + Check remote data ip can connect or not + """ + any_connected = False + node_infos = [node_info for shard in self.params.remoteClusterConf.get("shards", []) + for node_info in shard] + local_data_ip = self.__get_local_data_ip(inst.hostname) + for node_info in node_infos: + data_ip = node_info.get("dataIp") + shard_num = node_info.get("shardNum", '1') + if str(shard_num) != str(inst.mirrorId): + continue + _, ret = DefaultValue.fast_ping_on_node(inst.hostname, local_data_ip, + data_ip, self.logger) + if ret: + any_connected = True + break + if not any_connected: + self.logger.error("Failed check data ip connection for inst:%s." % inst.instanceId) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "check data ip connection") + self.logger.debug("Successfully check main standby data ip connection.") + + def __pghba_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): + """ + Backup or restore pg_hba file. + """ + file_path = os.path.join(dir_name, "pg_hba.conf") + old_file_path = os.path.join(dir_name, "pg_hba.conf.old") + dest_file = os.path.join(self.streaming_file_dir, "%s_pg_hba.conf" % inst_id) + if self.local_host == node_name: + if mode == "backup" and not os.path.isfile(dest_file): + if os.path.isfile(file_path): + self.logger.debug("Backup file from[%s] to[%s]." % ( + file_path, dest_file)) + FileUtil.cpFile(file_path, dest_file) + else: + self.logger.debug("Backup file from[%s] to[%s]." % ( + old_file_path, dest_file)) + FileUtil.cpFile(old_file_path, dest_file) + if mode == "restore": + self.logger.debug("Restore file from[%s] to[%s]." % ( + dest_file, file_path)) + FileUtil.cpFile(dest_file, file_path) + FileUtil.removeFile(dest_file) + else: + if mode == "backup": + cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ + "then cp '%s' '%s';else cp '%s' '%s';fi;fi\"" \ + % (self.mpp_file, node_name, dest_file, file_path, file_path, + dest_file, old_file_path, dest_file) + self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( + node_name, cmd)) + else: + cmd = "source %s; pssh -s -H %s \"cp %s %s && rm -f %s\"" % ( + self.mpp_file, node_name, dest_file, file_path, dest_file) + self.logger.debug("Restore file on node[%s] from[%s] to[%s]." 
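# --- Illustrative sketch (annotation, not part of the patch) ---
# __pghba_backup_handler() copies pg_hba.conf aside as
# "<instance_id>_pg_hba.conf" before a build and copies it back (removing the
# backup) afterwards; the backup step is skipped when a backup already exists,
# so a retried run does not overwrite the original copy.  Local-only sketch:
import os
import shutil

def backup_config(src, backup):
    """Copy src to backup unless a backup is already present."""
    if not os.path.isfile(backup) and os.path.isfile(src):
        shutil.copy2(src, backup)

def restore_config(src, backup):
    """Put the backup back in place and drop it."""
    if os.path.isfile(backup):
        shutil.copy2(backup, src)
        os.remove(backup)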
% ( + node_name, file_path, dest_file)) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + + def __pg_ident_backup_handler(self, node_name, dir_name, inst_id, mode="backup"): + """ + Backup or restore pg_ident file. + """ + file_path = os.path.join(dir_name, "pg_ident.conf") + dest_file = os.path.join(self.streaming_file_dir, "%s_pg_ident.conf" % inst_id) + if self.local_host == node_name: + if mode == "backup" and not os.path.isfile(dest_file): + if os.path.isfile(file_path): + self.logger.debug("Backup file from[%s] to[%s]." % ( + file_path, dest_file)) + FileUtil.cpFile(file_path, dest_file) + + if mode == "restore" and os.path.isfile(dest_file): + self.logger.debug("Restore file from[%s] to[%s]." % ( + dest_file, file_path)) + FileUtil.cpFile(dest_file, file_path) + FileUtil.removeFile(dest_file) + else: + if mode == "backup": + cmd = "source %s; pssh -s -H %s \"if [ ! -f '%s' ];then if [ -f '%s' ];" \ + "then cp '%s' '%s';fi;fi\"" \ + % (self.mpp_file, node_name, dest_file, file_path, file_path, dest_file) + self.logger.debug("Backup file on node[%s] with cmd [%s]." % ( + node_name, cmd)) + else: + cmd = "source %s; pssh -s -H %s \"if [ -f '%s' ];then cp '%s' '%s' && " \ + "rm -f '%s';fi\"" % (self.mpp_file, node_name, dest_file, dest_file, + file_path, dest_file) + self.logger.debug("Restore file on node[%s] from[%s] to[%s]." % ( + node_name, file_path, dest_file)) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + + def __start_main_standby_dn(self, start_params): + """ + Start single main standby dn + """ + local_ip, inst, bin_path, distribute_arg, build_timeout = start_params + self.logger.debug("Starting start dn:%s" % inst.instanceId) + if local_ip == inst.hostname: + cmd_start = "source %s; %s/gs_ctl start -D %s -M hadr_main_standby%s" % ( + self.mpp_file, bin_path, inst.datadir, distribute_arg) + else: + cmd_start = "source %s; pssh -s -t %s -H %s \"source %s; %s/gs_ctl start -D %s " \ + "-M hadr_main_standby%s\"" \ + % (self.mpp_file, StreamingConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname, + self.mpp_file, bin_path, inst.datadir, distribute_arg) + self.logger.debug("Start dn with cmd:%s." 
% cmd_start) + status, output = CmdUtil.retry_util_timeout(cmd_start, build_timeout) + if status != 0: + raise Exception( + ErrorCode.GAUSS_514[ + "GAUSS_51400"] % cmd_start + " Error: \n%s " % output) + self.logger.debug("Successfully start dn:%s" % inst.instanceId) + + def __build_main_standby_dn(self, params): + """ + Build single main standby dn + """ + inst, build_timeout, local_ip, bin_path, distribute_arg, rds_backup, backup_pwd = params + self.logger.debug("Start build main standby dn:%s" % inst.instanceId) + self.__check_datanode_data_ip_connection(inst) + self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") + self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="backup") + # -t 1209600 means default value 14 days + if local_ip == inst.hostname: + cmd = "source %s; %s/gs_ctl build -D %s -M hadr_main_standby -r 7200 -q%s -Q " \ + "force_copy_from_local -U %s -P '%s' -t %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, rds_backup, backup_pwd, + StreamingConstants.MAX_BUILD_TIMEOUT) + else: + cmd = "echo \"source %s; %s/gs_ctl build -D %s -M hadr_main_standby -r 7200 -q%s " \ + "-Q force_copy_from_local -U %s -P '%s' -t %s\" | pssh -s -t %s -H %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, rds_backup, + backup_pwd, StreamingConstants.MAX_BUILD_TIMEOUT, + StreamingConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) + cmd_log = cmd.replace(backup_pwd, '***') + self.logger.debug("Building with cmd:%s." % cmd_log) + status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) + if status != 0: + error_detail = "Error: Failed to do build because of pssh timeout." \ + if "was killed or timeout" in output else \ + "Error: Failed to do build because of retry timeout in %s s." \ + % build_timeout + self.logger.debug("Failed to do gs_ctl build. " + error_detail) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "full build from remote cluster" + error_detail) + self.logger.debug("Successfully build main standby dn:%s" % inst.instanceId) + self.__pghba_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") + self.__pg_ident_backup_handler(inst.hostname, inst.datadir, inst.instanceId, mode="restore") + start_params = (local_ip, inst, bin_path, distribute_arg, build_timeout) + self.__start_main_standby_dn(start_params) + + def __build_cascade_standby_dn(self, params): + """ + Build single main standby dn + """ + inst, build_timeout, local_ip, bin_path, distribute_arg = params + self.logger.debug("Start build cascade standby dn:%s" % inst.instanceId) + # -t 1209600 means default value 14 days + if local_ip == inst.hostname: + cmd = "source %s; %s/gs_ctl build -D %s -M cascade_standby " \ + "-b standby_full -r 7200%s -t %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, + StreamingConstants.MAX_BUILD_TIMEOUT) + else: + cmd = "echo \"source %s; %s/gs_ctl build -D %s -M cascade_standby -b standby_full " \ + "-r 7200%s -t %s\" | pssh -s -t %s -H %s" \ + % (self.mpp_file, bin_path, inst.datadir, distribute_arg, + StreamingConstants.MAX_BUILD_TIMEOUT, + StreamingConstants.MAX_BUILD_TIMEOUT + 10, inst.hostname) + self.logger.debug("Building with cmd:%s." % cmd) + status, output = CmdUtil.retry_util_timeout(cmd, build_timeout) + if status != 0: + error_detail = "Error: Failed to do build because of pssh timeout." \ + if "was killed or timeout" in output else \ + "Error: Failed to do build because of retry timeout in %s s." 
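# --- Illustrative sketch (annotation, not part of the patch) ---
# __build_main_standby_dn() never logs the gs_ctl build command verbatim: the
# -P password is replaced with '***' first (cmd.replace(backup_pwd, '***')).
# Tiny standalone helper for the same idea:
def mask_secret(cmd, secret, placeholder="***"):
    """Return cmd with every occurrence of secret replaced, for safe logging."""
    return cmd.replace(secret, placeholder) if secret else cmd

# mask_secret("gs_ctl build -U hadr -P 'p@ss'", "p@ss")
# -> "gs_ctl build -U hadr -P '***'"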
\ + % build_timeout + self.logger.debug("Failed to do gs_ctl build. " + error_detail) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "full build from remote cluster" + error_detail) + self.logger.debug("Successfully build cascade standby dn:%s" % inst.instanceId) + + def build_dn_instance(self, only_mode=None): + """ + Build dn instance + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Build dn step is not for mode:%s." % self.params.mode) + return + self.logger.debug("Start building process.") + distribute_arg = "" if self.cluster_info.isSingleInstCluster() else " -Z datanode" + main_params = [] + cascade_params = [] + datanode_instance = [inst for node in self.cluster_info.dbNodes + for inst in node.datanodes] + for inst in datanode_instance: + if inst.instanceId in self.main_standby_ids + self.primary_dn_ids: + main_params.append((inst, self.params.waitingTimeout, self.local_host, + self.bin_path, distribute_arg, self.params.hadrUserName, + self.params.hadrUserPassword)) + else: + cascade_params.append((inst, self.params.waitingTimeout, self.local_host, + self.bin_path, distribute_arg)) + if main_params: + parallelTool.parallelExecute(self.__build_main_standby_dn, main_params) + self.logger.debug("Finished build main standby dns.") + if cascade_params: + parallelTool.parallelExecute(self.__build_cascade_standby_dn, cascade_params) + self.logger.debug("Finished build cascade standby dns.") + del self.params.hadrUserPassword + + def query_cluster(self): + """ + query cluster + :return: output + """ + cmd = "source %s; cm_ctl query -v -C -s -i -d" % self.mpp_file + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + self.logger.error(ErrorCode.GAUSS_516["GAUSS_51600"] + + "status(%d), output(%s)" % (status, output)) + return output + + def start_cluster(self, cm_timeout=None, only_mode=None): + """ + start the cluster + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Start cluster is not for mode:%s." % self.params.mode) + return + self.logger.log("Starting the cluster.") + cm_timeout = cm_timeout or 300 + user, group = UserUtil.getPathOwner(self.gp_home) + if user == "" or group == "": + raise Exception("Failed to obtain the owner of application.") + end_time = datetime.now() + timedelta(seconds=cm_timeout) + cmd = ClusterCommand.getStartCmd(0, cm_timeout) + self.logger.debug("Calling cm_ctl to start cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + error_str = ErrorCode.GAUSS_516["GAUSS_51607"] % "the cluster" + \ + " Error:\n%s." % output + self.logger.debug(error_str) + self.logger.log("Warning: the cluster is not normal, please check cluster status!") + else: + self.logger.log("Successfully started primary instance. " + "Please wait for standby instances.") + + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + while True: + time.sleep(5) + self.logger.log('Waiting cluster normal.') + check_ret = self.check_cluster_status(cluster_normal_status, only_check=True, + check_current=True, is_log=False) + if check_ret: + self.logger.log("Successfully started standby instances.") + break + if datetime.now() >= end_time: + query_result = self.query_cluster() + self.logger.log("Timeout. Failed to start the cluster in (%s)s." % cm_timeout) + self.logger.log("Current cluster status (%s)." 
% query_result) + self.logger.log("It will continue to start in the background.") + break + + def __check_one_main_standby_connection(self, param_list): + """ + concurrent check main standby is connected primary dn + """ + (dn_inst, sql_check) = param_list + self.logger.debug("Node %s primary dn instanceId [%s] Check main standby is connected " + "with cmd:%s." % (dn_inst.hostname, dn_inst.instanceId, sql_check)) + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port) + if status == 0 and output.strip(): + self.logger.debug("Successfully check main standby connected " + "primary dn on inst:[%s]." % dn_inst.instanceId) + return True + self.logger.debug("Retry check main standby connected on inst:[%s]." % dn_inst.instanceId) + + def check_main_standby_connection_primary_dn(self, p_inst_list): + """ + check connection main_standby connected primary dn + """ + if not p_inst_list: + self.logger.debug("The primary dn does not exist on current cluster.") + return + self.primary_dn_ids = p_inst_list + sql_check = "select 1 from pg_catalog.gs_hadr_local_rto_and_rpo_stat();" + sql_check_2 = "select 1 from pg_catalog.pg_stat_get_wal_senders() where " \ + "sync_state='Async' and peer_role='Standby' and peer_state='Normal';" + param_list = [(dn_inst, sql_check) for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes + if dn_inst.instanceId in self.primary_dn_ids] + param_list_2 = [(dn_inst, sql_check_2) for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId + in self.primary_dn_ids] + if not param_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain param list for check main standby connection on primary dn") + self.logger.debug("Start check main standby connection with sql:%s." % sql_check) + results = parallelTool.parallelExecute(self.__check_one_main_standby_connection, + param_list) + self.logger.debug("Start check main standby connection with sql:%s." % sql_check_2) + results_2 = parallelTool.parallelExecute(self.__check_one_main_standby_connection, + param_list_2) + + return all(results+results_2) + + def wait_main_standby_connection(self, only_mode=None): + if only_mode and self.params.mode != only_mode: + self.logger.debug("Start cluster is not for mode:%s." % self.params.mode) + return + self.logger.log("Waiting for the main standby connection.") + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + while True: + p_inst_list = [int(i) for i in DefaultValue.get_primary_dn_instance_id("Primary", + ignore=True)] + if self.check_main_standby_connection_primary_dn(p_inst_list): + break + if datetime.now() >= end_time: + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "check main standby connection" + + " Because Waiting timeout: %ss" % str(self.params.waitingTimeout)) + time.sleep(5) + self.logger.log("Main standby already connected.") + + def hadr_key_generator(self, key_name): + """ + Generate key_name.key.cipher & key_name.key.rand + """ + self.logger.log("Start generate hadr key files.") + if not os.path.exists(self.bin_path): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain bin path." 
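# --- Illustrative sketch (annotation, not part of the patch) ---
# start_cluster() and wait_main_standby_connection() share the same pattern:
# compute a deadline with datetime + timedelta, poll a check every few
# seconds, and give up (or raise) once the deadline passes.  Generic
# standalone version of that loop:
import time
from datetime import datetime, timedelta

def wait_until(check, timeout_seconds, interval=5):
    """Poll check() until it returns True or timeout_seconds elapse."""
    deadline = datetime.now() + timedelta(seconds=timeout_seconds)
    while datetime.now() < deadline:
        if check():
            return True
        time.sleep(interval)
    return False

# e.g. wait_until(lambda: cluster_is_ready(), 300)
# (cluster_is_ready is a hypothetical caller-supplied check)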
+ self.logger.debug(msg) + raise Exception(msg) + if not os.path.exists(self.gp_home): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain env GPHOME" + self.logger.debug(msg) + raise Exception(msg) + key_cipher = os.path.join(self.bin_path, "%s.key.cipher" % key_name) + key_rand = os.path.join(self.bin_path, "%s.key.rand" % key_name) + cmd = "export LD_LIBRARY_PATH=%s/script/gspylib/clib && source %s " \ + "&& gs_guc generate -S default -o %s -D '%s' && %s && %s" \ + % (self.gp_home, self.mpp_file, key_name, self.bin_path, + CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_cipher), + CmdUtil.getChmodCmd(str(ConstantsBase.KEY_FILE_MODE), key_rand)) + if (not os.path.isfile(key_cipher)) or (not os.path.isfile(key_rand)): + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or (not os.path.isfile(key_cipher)) \ + or (not os.path.isfile(key_rand)): + msg = ErrorCode.GAUSS_516["GAUSS_51632"] \ + % "generate hadr key files" + "Error:%s" % output + self.logger.error(msg) + raise Exception(msg) + else: + self.logger.log("Streaming key files already exist.") + + self.ssh_tool.scpFiles(key_cipher, self.bin_path) + self.ssh_tool.scpFiles(key_rand, self.bin_path) + self.logger.log("Finished generate and distribute hadr key files.") + + def encrypt_hadr_user_info(self, key_name, hadr_user, hadr_pwd): + """ + Encrypt hadr user info. + """ + self.logger.log("Start encrypt hadr user info.") + cmd = "source %s && gs_encrypt -f %s \"%s|%s\"" \ + % (self.mpp_file, key_name, hadr_user, hadr_pwd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0 or not output: + msg = ErrorCode.GAUSS_516["GAUSS_51632"] % "encrypt hadr user info" + self.logger.error(msg) + raise Exception(msg) + self.logger.log("Successfully encrypt hadr user info.") + return output + + def keep_hadr_user_info(self, info_str, retry=5): + """ + Keep hadr user info into GLOBAL CONFIGURATION + """ + self.logger.log("Start save hadr user info into database.") + sql = "ALTER GLOBAL CONFIGURATION with(hadr_user_info ='%s');" % info_str + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + primary_dns = primary_dns * retry + output = "None" + for dn_inst in primary_dns: + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, True) + if status == 0: + self.logger.log("Successfully save hadr user info into database.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "save hadr user info into database" + self.logger.error(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) + raise Exception(msg) + + def restore_wal_keep_segments(self, only_mode=None): + """ + restore wal_keep_segments default value + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Restore wal_keep_segments not for mode:%s." 
% self.params.mode) + return + self.logger.debug("Starting restore wal_keep_segments default value.") + default_value_dict = {} + wal_keep_segments = os.path.join(self.streaming_file_dir, + StreamingConstants.WAL_KEEP_SEGMENTS) + if not os.path.isfile(wal_keep_segments): + self.logger.debug("Not found wal keep segments record file, no need restore.") + return + wal_keep_segments_list = FileUtil.readFile(wal_keep_segments) + if not wal_keep_segments_list: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "obtain record wal_keep_segments") + for each_dn in wal_keep_segments_list: + DefaultValue.checkGuc(each_dn.split(":")[1].strip()) + default_value_dict[each_dn.split(":")[0].strip()] = each_dn.split(":")[1].strip() + self.set_wal_keep_segments("reload", default_value_dict, True) + self.logger.debug("Successfully restore wal_keep_segments default value.") + + def __clean_streaming_files_on_local_node(self, file_name_list): + file_name_list = [file_name_list] \ + if not isinstance(file_name_list, list) else file_name_list + for file_name in file_name_list: + file_path = os.path.join(self.streaming_file_dir, file_name) + if os.path.isfile(file_path): + FileUtil.removeFile(file_path) + self.logger.debug("Successfully removed file:[%s]" % file_path) + + def clean_step_file(self): + """ + Clean step file for each action + """ + step_file = os.path.basename(self.step_file_path) + self.__clean_streaming_files_on_local_node(step_file) + self.logger.log("Successfully removed step file.") + + def check_action_and_mode(self): + """ + Check action and mode if step file exist. + if any streaming options not finished(step file exist), + not allowed doing any other streaming options except query. + """ + self.logger.debug("Checking action and mode.") + exist_step_file_names = [] + for file_name in StreamingConstants.STREAMING_STEP_FILES.values(): + step_file_path = os.path.join(self.streaming_file_dir, file_name) + if os.path.isfile(step_file_path) and file_name != ".streaming_query.step": + exist_step_file_names.append(file_name) + if exist_step_file_names and set(exist_step_file_names) ^ {os.path.basename( + self.step_file_path)}: + exist_action = [key for key, value in StreamingConstants.STREAMING_STEP_FILES.items() + if value in exist_step_file_names] + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check action and mode, the step files %s already exist, " + "please ensure the action %s is finished before " + "doing current options" % (exist_step_file_names, exist_action)) + self.logger.debug("Successfully checked action and mode.") + + def clean_streaming_dir(self): + """ + Clean streaming dir when stop or failover + """ + self.logger.debug("Start clean streaming dir:%s." 
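# --- Illustrative sketch (annotation, not part of the patch) ---
# check_action_and_mode() refuses to start when a step file left by a
# *different* unfinished action still exists (the query step file is ignored),
# which keeps start/stop/switchover/failover mutually exclusive.  Simplified
# check over a directory of step files (directory and names are assumptions):
import os

def find_blocking_steps(step_dir, my_step_file,
                        ignored=(".streaming_query.step",)):
    """Return step files that belong to other, unfinished actions."""
    existing = {name for name in os.listdir(step_dir)
                if name.endswith(".step") and name not in ignored}
    return sorted(existing - {my_step_file})

# if find_blocking_steps("/tmp/streaming", ".streaming_failover.step"):
#     raise Exception("another streaming action is still in progress")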
% self.streaming_file_dir) + cmd = g_file.SHELL_CMD_DICT["deleteDir"] % (self.streaming_file_dir, + self.streaming_file_dir) + try: + self.ssh_tool.executeCommand(cmd, hostList=self.cluster_info.getClusterNodeNames()) + except Exception as error: + self.logger.debug( + "Failed to remove streaming dir with error:%s" % error) + self.logger.log("Finished remove streaming dir.") + + def clean_global_config(self): + """ + Clean global config + """ + self.logger.log("Clean hadr user info.") + sql = "DROP GLOBAL CONFIGURATION hadr_user_info;" + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes for dn_inst in + db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + output = "None" + for dn_inst in primary_dns: + status, output = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, True) + if status == 0: + self.logger.log("Successfully clean hadr user info from database.") + return + msg = ErrorCode.GAUSS_516['GAUSS_51632'] % "clean hadr user info from database" + self.logger.debug(msg + "Error:%s" % SensitiveMask.mask_pwd(output)) + + def get_build_info(self): + """ + Assemble build infos + """ + # 1. Get local primary dn inst dir, host + self.logger.debug("Start assemble build info") + dn_inst_info = [] + dn_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if int(dn_inst.mirrorId) == 1] + for dn_inst in dn_instances: + dn_info = dict() + dn_info["port"] = dn_inst.port + 1 + dn_info["data_dir"] = dn_inst.datadir + dn_info["host_name"] = dn_inst.hostname + dn_info["listen_ip"] = self.__get_local_data_ip(dn_inst.hostname) + self.logger.debug("Got build listen ips:%s, ip:%s selected." + % (str(dn_inst.listenIps), dn_info["listen_ip"])) + dn_inst_info.append(dn_info) + + # 2. Get remote dn ip and port + remote_ip_port = [] + shards = self.params.remoteClusterConf["shards"] + remote_port = int(self.params.remoteClusterConf["port"]) + 1 + shard_info = [info for shard in shards for info in shard + if info.get("shardNum", "1") == "1"] + for node_info in shard_info: + remote_ip = node_info.get("dataIp") + remote_ip_port.append((remote_ip, remote_port)) + if (not dn_inst_info) or (not remote_ip_port): + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "obtain dn info") + self.logger.debug("Successfully get remote dn info:%s." % remote_ip_port) + return dn_inst_info, remote_ip_port + + def build_file_from_remote(self): + """ + Build files from remote cluster + """ + local_dn_info, remote_ip_port = self.get_build_info() + cmd_local = 'source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z datanode' \ + ' -U %s -P "%s" -C "localhost=%s localport=%s remotehost=%s remoteport=%s"' + cmd_remote = "echo \"source %s; %s/gs_ctl build -D %s -M standby -b copy_secure_files -Z " \ + "datanode -U %s -P '%s' -C 'localhost=%s localport=%s " \ + "remotehost=%s remoteport=%s'\"" \ + " | pssh -s -H %s" + + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + self.logger.debug("Retry Building with timeout:%ss." 
% self.params.waitingTimeout) + succeed = False + while datetime.now() < end_time: + for local_primary in local_dn_info: + for remote_ip, remote_port in remote_ip_port: + if local_primary["host_name"] == NetUtil.GetHostIpOrName(): + cmd = cmd_local % (self.mpp_file, "%s/bin" % self.gauss_home, + local_primary["data_dir"], + self.params.hadrUserName, self.params.hadrUserPassword, + local_primary["listen_ip"], local_primary["port"], + remote_ip, remote_port) + else: + cmd = cmd_remote % (self.mpp_file, "%s/bin" % self.gauss_home, + local_primary["data_dir"], + self.params.hadrUserName, self.params.hadrUserPassword, + local_primary["listen_ip"], local_primary["port"], + remote_ip, remote_port, local_primary["host_name"]) + result = DefaultValue.fast_ping_on_node(local_primary["host_name"], + local_primary["listen_ip"], + remote_ip, self.logger) + if not result[-1]: + self.logger.debug("Ignore build from %s, ping result:%s" + % (remote_ip, result[-1])) + continue + if self.cluster_info.isSingleInstCluster(): + cmd = cmd.replace(" -Z datanode", "") + self.logger.debug("Building with cmd:%s." + % cmd.replace(self.params.hadrUserPassword, "***")) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) + if status == 0: + succeed = True + self.logger.debug("Successfully Building with cmd:%s." + % cmd.replace(self.params.hadrUserPassword, "***")) + return succeed + else: + self.logger.debug("Building result:%s." % SensitiveMask.mask_pwd(output)) + time.sleep(1) + return succeed + + def __copy_secure_dir_from_dn_dir(self): + """ + Find and copy key file dir from all dn dir + """ + local_temp_secure_path = os.path.join( + self.streaming_file_dir, StreamingConstants.GS_SECURE_FILES) + if os.path.isdir(local_temp_secure_path): + FileUtil.removeDirectory(local_temp_secure_path) + rand_path = os.path.join(local_temp_secure_path, StreamingConstants.HADR_KEY_RAND) + cipher_path = os.path.join(local_temp_secure_path, StreamingConstants.HADR_KEY_CIPHER) + cmd_tep = "echo \"if [ -d '%s' ];then source %s && pscp --trace-id %s -H %s '%s' '%s' " \ + "&& rm -rf '%s';fi\" | pssh -s -H %s" + succeed = False + for db_node in self.cluster_info.dbNodes: + for dn_inst in db_node.datanodes: + if int(dn_inst.mirrorId) == 1: + key_file_path = os.path.realpath(os.path.join( + dn_inst.datadir, StreamingConstants.GS_SECURE_FILES)) + cmd_copy_dir = cmd_tep % (key_file_path, self.mpp_file, self.trace_id, + self.local_host, key_file_path, + self.streaming_file_dir, + key_file_path, dn_inst.hostname) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd_copy_dir) + self.logger.debug("Copy cmd:%s" % cmd_copy_dir) + if status != 0: + self.logger.debug("Try copy secure dir from:[%s][%s], error:%s" % ( + dn_inst.hostname, key_file_path, output)) + if os.path.isdir(local_temp_secure_path) and os.path.isfile(rand_path) \ + and os.path.isfile(cipher_path): + succeed = True + if not succeed: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "copy secure file dir") + self.logger.debug("Successfully copy secure dir, file list:%s." % + os.listdir(local_temp_secure_path)) + + def build_and_distribute_key_files(self, only_mode=None): + """ + Distribute key files + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Wal keep segment opts not for mode:%s." 
% self.params.mode)
+            return
+        self.logger.log("Start build key files from remote cluster.")
+        # build file
+        if not self.build_file_from_remote():
+            raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % "build files from cluster")
+        # copy file from data dir to streaming dir
+        self.__copy_secure_dir_from_dn_dir()
+        # check version consistency
+        self.__check_version_file()
+        # check cluster user consistency
+        self.__check_cluster_user()
+        # distribute key files to all node
+        secure_dir_path = os.path.join(self.streaming_file_dir, StreamingConstants.GS_SECURE_FILES)
+        self.__copy_hadr_user_key(secure_dir_path, update=True)
+        FileUtil.removeDirectory(secure_dir_path)
+        self.logger.log("Successfully build and distribute key files to all nodes.")
+
+    def __check_version_file(self):
+        """
+        function: Check whether the version numbers of the host
+        cluster and the disaster recovery cluster are the same
+        """
+        gs_secure_version = os.path.realpath(os.path.join(self.streaming_file_dir,
+                                                          "gs_secure_files/version.cfg"))
+        master_commit_id = VersionInfo.get_version_info(gs_secure_version)[-1]
+        local_version_file = VersionInfo.get_version_file()
+        local_commit_id = VersionInfo.get_version_info(local_version_file)[-1]
+        self.logger.debug("The commit id of the host cluster is %s, "
+                          "and the commit id of the disaster recovery cluster is %s" %
+                          (master_commit_id, local_commit_id))
+        if local_commit_id != master_commit_id:
+            raise ValueError(ErrorCode.GAUSS_516["GAUSS_51632"] %
+                             "check version. Different version of cluster and disaster recovery")
+
+    def __check_cluster_user(self):
+        """
+        function: Check whether the cluster user of the host
+        cluster and the disaster recovery cluster are the same
+        """
+        user_file = os.path.realpath(os.path.join(self.streaming_file_dir,
+                                                  StreamingConstants.GS_SECURE_FILES,
+                                                  StreamingConstants.CLUSTER_USER_RECORD))
+        remote_user = DefaultValue.obtain_file_content(user_file, is_list=False)
+        if remote_user.strip() != self.user:
+            self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"]
+                                % "check cluster user consistency, remote:%s, local:%s"
+                                % (remote_user, self.user))
+        self.logger.debug("Successfully checked cluster user consistency.")
+
+    def check_cluster_type(self, allowed_type):
+        """
+        Check whether the cluster type is the allowed type
+        """
+        if allowed_type == 'primary' and self.main_standby_ids:
+            self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632']
+                                % "check cluster type, standby cluster is not supported for %s"
+                                % self.params.task)
+        elif allowed_type == 'standby' and self.primary_dn_ids:
+            self.logger.logExit(ErrorCode.GAUSS_516['GAUSS_51632']
+                                % "check cluster type, primary cluster is not supported for %s"
+                                % self.params.task)
+        else:
+            self.logger.log("Check cluster type succeeded.")
+
+    def __remove_streaming_repl_info(self, params):
+        """
+        Remove streaming repl info from single dn instances.
+ """ + dn_inst, guc_mode, dn_num = params + self.logger.debug("Start remove replconninfo for instance:%s" % dn_inst.instanceId) + for idx in range(1, dn_num + 1): + if dn_inst.hostname == self.local_host: + cmd = "source %s; gs_guc check -Z datanode -D %s " \ + "-c 'replconninfo%s'" % (self.mpp_file, dn_inst.datadir, idx) + else: + cmd = "source %s; pssh -H %s 'source %s; gs_guc check " \ + "-Z datanode -D %s -c \"replconninfo%s\"'" \ + % (self.mpp_file, dn_inst.hostname, self.mpp_file, dn_inst.datadir, idx) + self.logger.debug("Check original repl infos with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd + + " Error: \n%s " % output) + if output.count("=NULL") > 2: + continue + elif "iscrossregion=false" in output.lower(): + ret = re.search( + r"replconninfo%s='localhost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + r" localport=(\d{4,5}) localheartbeatport=(\d{4,5}) " + r"localservice=(\d{4,5}) " + r"remotehost=(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}) " + r"remoteport=(\d{4,5}) remoteheartbeatport=(\d{4,5}) " + r"remoteservice=(\d{4,5})" % idx, output) + if not ret: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "search repl infos") + if dn_inst.hostname != NetUtil.GetHostIpOrName(): + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c " \ + "\\\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s'\\\"\"" + set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, + self.mpp_file, guc_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8)) + else: + set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ + "\"replconninfo%s = 'localhost=%s localport=%s " \ + "localheartbeatport=%s localservice=%s remotehost=%s " \ + "remoteport=%s remoteheartbeatport=%s " \ + "remoteservice=%s'\"" + set_cmd = set_cmd % (self.mpp_file, guc_mode, + dn_inst.datadir, idx, ret.group(1), + ret.group(2), ret.group(3), ret.group(4), + ret.group(5), ret.group(6), ret.group(7), + ret.group(8)) + self.logger.debug("Set original repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully remove original repl infos with cmd:%s." + % set_cmd) + elif "iscrossregion=true" in output.lower(): + if dn_inst.hostname != self.local_host: + set_cmd = "source %s; pssh -H %s \"source %s ; gs_guc %s " \ + "-Z datanode -D %s -c \\\"replconninfo%s\\\"\"" + set_cmd = set_cmd % (self.mpp_file, dn_inst.hostname, + self.mpp_file, guc_mode, + dn_inst.datadir, idx) + else: + set_cmd = "source %s ; gs_guc %s -Z datanode -D %s -c " \ + "\"replconninfo%s\"" + set_cmd = set_cmd % (self.mpp_file, guc_mode, + dn_inst.datadir, idx) + self.logger.debug("Remove stream repl infos with cmd:%s" % set_cmd) + status, output = CmdUtil.retryGetstatusoutput(set_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % set_cmd + + " Error: \n%s " % output) + self.logger.debug("Successfully remove stream repl infos with cmd:%s." 
+ % set_cmd) + self.logger.debug("Successfully removed replconninfo for instance:%s" % dn_inst.instanceId) + + def remove_all_stream_repl_infos(self, guc_mode="set"): + """ + Remove retreaming disaster repl infos from all instances + """ + params = [] + dn_instances = [inst for node in self.cluster_info.dbNodes + for inst in node.datanodes] + cluster_conf = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_CONF_RECORD) + dn_num = DefaultValue.get_all_dn_num_for_dr(cluster_conf, dn_instances[0], + self.cluster_info, self.logger) + for inst in dn_instances: + if inst.instanceId not in self.normal_dn_ids: + self.logger.error("Ignore rectify repl info of dn:%s" % inst.instanceId) + continue + params.append((inst, guc_mode, dn_num)) + if params: + self.logger.log("Starting remove all node dn instances repl infos.") + parallelTool.parallelExecute(self.__remove_streaming_repl_info, params) + self.logger.log("Successfully remove all node dn instances repl infos.") + + def remove_streaming_cluster_file(self): + """ + function: remove the parameter file for config pg_hba + :return: NA + """ + self.logger.log("Start remove cluster file.") + cluster_info_file = os.path.join(self.streaming_file_dir, + StreamingConstants.STREAMING_CLUSTER_CONF_RECORD) + cmd = g_file.SHELL_CMD_DICT["deleteFile"] % (cluster_info_file, cluster_info_file) + try: + self.ssh_tool.executeCommand(cmd, hostList=self.cluster_info.getClusterNodeNames()) + except Exception as error: + self.logger.debug( + "Failed to remove cluster file with error:%s" % error) + self.logger.log("Finished remove cluster file.") + + def remove_streaming_pg_hba(self, ignore_error=False): + """ + Remove remote ips from pg hba of streaming disaster + """ + self.logger.log("Start remove pg_hba config.") + remove_ips = [] + shards = self.params.remoteClusterConf["shards"] + for shard in shards: + for node_info in shard: + data_ip = node_info.get("dataIp") + remove_ips.append(data_ip) + remove_ips = list(set(remove_ips)) + host_names = self.get_all_connection_node_name("remove_streaming_pg_hba") + self.logger.debug("Remove ips:%s from pg_hba on nodes:%s" % ( + str(remove_ips), str(host_names))) + cmd = "%s -U '%s' -l '%s'" % (OMCommand.getLocalScript("Local_Config_Hba"), + self.user, self.log_file) + remove_ips_str = "" + for node_ip in remove_ips: + remove_ips_str += " --remove-ip %s" % node_ip + cmd += remove_ips_str + self.logger.debug("Command for updating pg_hba:%s." % cmd) + try: + self.ssh_tool.executeCommand(cmd, DefaultValue.SUCCESS, host_names) + except Exception as error: + self.logger.debug("Failed updating pg_hba with error:%s." % error) + if not ignore_error: + raise error + self.logger.log("Finished remove pg_hba config.") + + def streaming_drop_replication_slot(self, dn_inst, drop_slots): + """ + Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance + relationship is lifted + """ + if not drop_slots: + self.logger.debug("WARNING:Not found dn_xxx_hadr on %s node, No need to " + "delete." % dn_inst.instanceId) + else: + for slot in drop_slots: + self.logger.debug("starting drop inst %s %s" % (dn_inst.instanceId, slot.strip())) + sql = "select * from pg_catalog.pg_drop_replication_slot('%s');" % slot.strip() + status_dr, output_dr = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("get %s need drop replication_slots, status=%d, " + "output: %s." 
% (dn_inst.hostname, status_dr, + SensitiveMask.mask_pwd(output_dr))) + if status_dr != 0: + self.logger.debug("Failed to remove inst %s %s with error: %s" % ( + dn_inst.instanceId, slot.strip(), output_dr)) + self.logger.debug( + "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) + + def concurrent_drop_slot(self, dn_inst): + """ + concurrent drop all dn replication slots + """ + sql_check = "select * from pg_catalog.pg_get_replication_slots();" + self.logger.debug("Starting concurrent drop node %s instance [%s] replication slots" % + (dn_inst.hostname, dn_inst.instanceId)) + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("get %s all replication slots, status=%d, output: %s." % + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + drop_slots = [] + if str(dn_inst.instanceId).startswith("6"): + drop_slots = re.findall(r"dn_\d+_hadr", output.strip()) + if str(dn_inst.instanceId).startswith("5"): + drop_slots = re.findall(r"cn_\d+_\d+\.\d+\.\d+\.\d+_\d+", output.strip()) + self.logger.debug("Waiting to delete instance [%s] replication slots is: %s" % + (dn_inst.instanceId, drop_slots)) + self.streaming_drop_replication_slot(dn_inst, drop_slots) + else: + self.logger.debug("Obtain all replication slot results:%s." % output) + + def streaming_clean_replication_slot(self): + """ + Delete dn_xxx_hadr on all dn nodes if dn_xxx_hadr exists when the disaster tolerance + relationship is lifted + """ + self.logger.log("Starting drop all node replication slots") + params = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId in self.normal_dn_ids] + self.logger.debug("need drop all node replication slots: %s" % + [inst.instanceId for inst in params]) + parallelTool.parallelExecute(self.concurrent_drop_slot, params) + self.logger.log("Finished drop all node replication slots") + + def update_streaming_info(self, key, value, only_mode=None): + """ + Update info for streaming status + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Update query status [%s] to [%s] " + "not for mode:%s." % (key, value, self.params.mode)) + return + self.logger.debug("Update query [%s] to [%s]." % (key, value)) + try: + if key == "cluster": + key_stat = StreamingConstants.HADR_CLUSTER_STAT + elif key == StreamingConstants.ACTION_FAILOVER: + key_stat = StreamingConstants.HADR_FAILOVER_STAT + elif key == StreamingConstants.ACTION_SWITCHOVER: + key_stat = StreamingConstants.HADR_SWICHOVER_STAT + elif key == StreamingConstants.ACTION_ESTABLISH: + key_stat = StreamingConstants.HADR_ESTABLISH_STAT + else: + self.logger.debug("key error.") + return + file_path = os.path.realpath(os.path.join(self.streaming_file_dir, key_stat)) + with os.fdopen(os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(value) + host_names = self.get_all_connection_node_name( + action_flag="update_streaming_info", no_update=True) + self.ssh_tool.scpFiles(file_path, self.streaming_file_dir, host_names) + except Exception as error: + self.logger.debug("Failed write info, key:%s, value:%s, " + "error:%s." 
% (key, value, error)) + + def create_cluster_maintance_file(self, value): + """ + add cluster_maintance file for streaming failover and switchover disaster_standby + """ + self.logger.debug("Start create cluster_maintance file.") + try: + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + with os.fdopen(os.open(cluster_maintance_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + DefaultValue.KEY_FILE_MODE_IN_OS), "w") as fp_write: + fp_write.write(value) + host_names = self.get_all_connection_node_name("create_cluster_maintance_file") + self.ssh_tool.scpFiles(cluster_maintance_file, + os.path.join(self.gauss_home, "bin"), host_names) + except Exception as error: + self.logger.debug("WARNING: Failed create cluster_maintance file, value:%s, " + "error:%s." % (value, str(error))) + self.logger.debug("Successfully create cluster_maintance file.") + + def streaming_failover_single_inst(self, stream_disaster_step, action_flag=None): + """ + streaming disaster recovery failover for single_inst cluster + """ + self.create_cluster_maintance_file("streaming failover") + if action_flag != StreamingConstants.ACTION_SWITCHOVER: + self.update_streaming_info("cluster", "promote") + # 0. check cluster status and get normal instance list + if stream_disaster_step < 0: + if action_flag == StreamingConstants.ACTION_SWITCHOVER: + self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "10%") + else: + self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "10%") + self.init_cluster_status() + self.parse_cluster_status() + self.write_streaming_step("0_check_cluster_status_done_for_failover") + # 1.Specify max xid and max ter to start etcd + max_term_record = os.path.join(self.streaming_file_dir, ".max_term_record") + if stream_disaster_step < 1: + max_term = self.get_term_info() + term_key = "/%s/CMServer/status_key/term" % self.user + para_dict = {term_key: max_term, self.backup_open_key: "0"} + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, para_dict) + DefaultValue.write_content_on_file(max_term_record, max_term) + self.write_streaming_step("1_start_etcd_done_for_failover") + self._failover_config_step(stream_disaster_step, action_flag) + self._failover_start_step(stream_disaster_step, action_flag, max_term_record) + + def _failover_start_step(self, stream_disaster_step, action_flag, max_term_record): + """ + Failover step 5 & 6 + """ + if stream_disaster_step < 5: + if action_flag == StreamingConstants.ACTION_SWITCHOVER: + self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "80%") + else: + self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "80%") + if not os.path.isfile(max_term_record): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % max_term_record) + _, dn_infos = self.get_specified_dn_infos() + max_term_list = DefaultValue.obtain_file_content(max_term_record) + if not max_term_list: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "read max term") + params = [(dn_info, max_term_list[0]) for dn_info in dn_infos] + if params: + parallelTool.parallelExecute(self.start_primary_dn, params) + self.write_streaming_step("5_start_primary_dn_done") + if stream_disaster_step < 6: + self.start_cluster() + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + self.check_cluster_status(cluster_normal_status, check_current=True) + cluster_info = self.query_cluster_info() + self.parse_cluster_status(current_status=cluster_info) + if 
action_flag != StreamingConstants.ACTION_SWITCHOVER: + self.clean_global_config() + self.restore_guc_params() + self.streaming_clean_archive_slot() + if action_flag != StreamingConstants.ACTION_SWITCHOVER: + self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "100%") + self.update_streaming_info("cluster", "normal") + else: + self.update_streaming_info("cluster", "archive") + + def streaming_clean_archive_slot(self): + """ + drop lot_type is physical and slot_name not contain (gs_roach_full,gs_roach_inc, + cn_xxx,dn_xxx, dn_xxx_hadr) on all cn node and all primary dn node if the + slot_name exists when the disaster cluster become primary cluster + """ + self.logger.debug("Starting drop archive slots") + params = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if dn_inst.instanceId in self.primary_dn_ids] + self.logger.debug("need drop all node archive slots: %s" % + [inst.instanceId for inst in params]) + parallelTool.parallelExecute(self.parallel_drop_archive_slot, params) + self.logger.debug("Successfully drop all node archive slots") + + def parallel_drop_archive_slot(self, dn_inst): + """ + concurrent drop all primary dn and all cn archive slots + """ + sql_check = "select slot_name from pg_catalog.pg_get_replication_slots() " \ + "where slot_type='physical' and slot_name not in " \ + "('gs_roach_full', 'gs_roach_inc') and slot_name not like 'cn_%' and " \ + "slot_name not like 'dn_%';" + self.logger.debug("Starting concurrent drop node %s instance [%s] archive slots" % + (dn_inst.hostname, dn_inst.instanceId)) + (status, output) = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port) + self.logger.debug("get %s all archive slots, status=%d, output: %s." % + (dn_inst.instanceId, status, output)) + if status == 0 and output.strip(): + archive_slots = output.strip().split('\n') + self.logger.debug("Waiting to delete instance [%s] archive slots is: %s" % + (dn_inst.instanceId, archive_slots)) + self.streaming_drop_replication_slot(dn_inst, archive_slots) + + def get_specified_dn_infos(self, update=False, dn_status="Primary"): + + """ + Get specified dn infos + """ + tmp_file = os.path.join(self.streaming_file_dir, "cluster_state_tmp") + if not os.path.isfile(tmp_file) or update: + cmd = ClusterCommand.getQueryStatusCmd(self.user, 0, tmp_file) + self.logger.debug("Update cluster state with cmd: %s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain primary dn infos" + "Error:%s" % output) + cluster_info = DbClusterStatus() + cluster_info.initFromFile(tmp_file) + dn_infos = [] + dn_ids = [] + dn_instances = [(inst, db_node.name) for db_node in cluster_info.dbNodes + for inst in db_node.datanodes] + for data_inst, db_node_name in dn_instances: + if data_inst.status == dn_status: + one_dn_info = dict() + one_dn_info["node_ip"] = db_node_name + one_dn_info["instance_id"] = data_inst.instanceId + one_dn_info["data_dir"] = data_inst.datadir + dn_ids.append(data_inst.instanceId) + dn_infos.append(one_dn_info) + self.logger.debug("Got primary dn infos: %s:%s" % (dn_ids, dn_infos)) + return dn_ids, dn_infos + + def start_primary_dn(self, params): + """ + Start main standby as primary dn in streaming failover. + """ + dn_info, max_term = params + opt_type = " -Z datanode" if not self.cluster_info.isSingleInstCluster() else "" + self.logger.debug("Starting primary dn %s, max term:%s." 
%
+                          (dn_info["instance_id"], max_term))
+        bin_path = "%s/bin" % self.cluster_info.appPath
+        instance_id = dn_info["instance_id"]
+        hostname = dn_info["node_ip"]
+        data_dir = dn_info["data_dir"]
+        if self.local_ip == hostname:
+            cmd_start = "source %s; %s/gs_ctl start%s -D %s -M pending -t 600" % \
+                        (self.mpp_file, bin_path, opt_type, data_dir)
+        else:
+            cmd_start = "source %s; pssh -s -t 900 -H %s \"source %s; " \
+                        "%s/gs_ctl start%s -D %s -M pending" \
+                        " -t 600\"" % (self.mpp_file, hostname, self.mpp_file,
+                                       bin_path, opt_type, data_dir)
+        self.logger.debug("Start primary dn with cmd:%s" % cmd_start)
+        status, output = CmdUtil.retryGetstatusoutput(cmd_start)
+        if status != 0:
+            raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
+                            % "start primary dn %s with error:%s"
+                            % (instance_id, output))
+        self.logger.debug("Successfully start primary dn %s" % instance_id)
+        if self.local_ip == hostname:
+            cmd_config = "source %s; %s/gs_ctl notify%s -D %s -M primary -T %s -t 600" \
+                         % (self.mpp_file, bin_path, opt_type, data_dir, max_term)
+        else:
+            # keep the argument order consistent with the format string:
+            # -H takes the host name, the inner "source" takes the env file
+            cmd_config = "source %s; pssh -s -t 900 -H %s \"source %s; %s/gs_ctl notify%s -D %s " \
+                         "-M primary -T %s -t 600\"" % (self.mpp_file, hostname, self.mpp_file,
+                                                        bin_path, opt_type, data_dir, max_term)
+        self.logger.debug("Config primary dn with cmd:%s" % cmd_config)
+        status, output = CmdUtil.retryGetstatusoutput(cmd_config)
+        if status != 0:
+            raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"]
+                            % "config primary dn %s with error:%s"
+                            % (instance_id, output))
+        self.logger.debug("Successfully start and config primary dn:%s" % instance_id)
+
+    def stream_disaster_set_cmserver_guc(self, guc_parameter, guc_value, guc_type):
+        """
+        set cmserver guc param
+        :param guc_parameter: guc param
+        :param guc_value: value
+        :param guc_type: init type
+        :return: NA
+        """
+        self.logger.debug("Starting set cm server for streaming disaster.")
+        cmd = "source %s && gs_guc %s -Z cmserver -D 'cm_instance_data_path' -c \"%s=%s\" " \
+              % (self.mpp_file, guc_type, guc_parameter, guc_value)
+        self.logger.debug("streaming disaster calling set cms, cmd=[%s]" % cmd)
+        self.ssh_tool.executeCommand(cmd, hostList=self.normal_cm_ips)
+        self.logger.debug("Successfully set cm server for streaming disaster.")
+
+    def stream_disaster_set_cmagent_guc(self, guc_parameter, guc_value, guc_type):
+        """
+        set cmagent guc param
+        :param guc_parameter: guc param
+        :param guc_value: value
+        :param guc_type: init type
+        :return: NA
+        """
+        self.logger.debug("Starting set cm agent for streaming disaster.")
+        cmd = "source %s && gs_guc %s -Z cmagent -D 'cm_instance_data_path' -c \"%s=%s\" " \
+              % (self.mpp_file, guc_type, guc_parameter, guc_value)
+        self.logger.debug("streaming disaster calling set cma, cmd=[%s]" % cmd)
+        self.ssh_tool.executeCommand(cmd, hostList=self.normal_node_list)
+        self.logger.debug("Successfully set cm agent for streaming disaster.")
+
+    def _failover_config_step(self, stream_disaster_step, action_flag):
+        """
+        Failover step 2 - 4
+        """
+        # 2. Stop the cluster by node
+        if stream_disaster_step < 2:
+            if action_flag != StreamingConstants.ACTION_SWITCHOVER:
+                self.streaming_clean_replication_slot()
+                self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "30%")
+            self.stop_cluster_by_node()
+            self.write_streaming_step("2_stop_cluster_done_for_failover")
+        # 3. Start the cluster in the primary cluster mode
+        if stream_disaster_step < 3:
+            self.set_cmserver_guc("backup_open", "0", "set")
+            self.stream_disaster_set_cmagent_guc("agent_backup_open", "0", "set")
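+            # NOTE: backup_open=0 / agent_backup_open=0 switch the CM server and CM agent
+            # back to primary-cluster mode (the value 2 marks a disaster-standby cluster),
+            # so the restart that follows brings the DNs up as a primary cluster; the step
+            # file written below lets an interrupted failover resume from this point.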
self.write_streaming_step("3_set_backup_open_for_failover") + # 4.Delete the relevant guc parameters and remove the disaster tolerance relationship + # based on streaming disaster recovery cluster, No need to delete for switchover. + if not action_flag: + if stream_disaster_step < 4: + self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "50%") + self.remove_all_stream_repl_infos() + self.remove_streaming_pg_hba(True) + self.update_streaming_info(StreamingConstants.ACTION_FAILOVER, "70%") + self.write_streaming_step("4_remove_hba_repl_done_for_failover") + + def get_term_info(self): + """get_term_info""" + # get max term from dns + return self.get_term() + + def get_term(self, normal_dn=True): + """ + get etcd term + """ + max_term = 0 + sql_cmd = "select term from pg_last_xlog_replay_location();" + params_list = [(inst, sql_cmd, max_term, normal_dn) for db_node in + self.cluster_info.dbNodes for inst in db_node.datanodes] + if params_list: + term_list = parallelTool.parallelExecute(self.get_max_term_by_compare, params_list) + self.logger.debug("Get term list: %s." % term_list) + if not term_list: + max_term = 0 + else: + max_term = int(max(term_list)) + if int(max_term) == 0: + raise Exception("Failed get term") + max_term = int(max_term) + 100 + self.logger.debug("Get max term %s in dns" % max_term) + return max_term + + def streaming_switchover_roll_back_condition(self): + """ + check need rollback or not by Main Standby dn status + output: return True means need rollback + """ + self.logger.debug("Starting check switchover rollback condition.") + cluster_status = self.query_cluster_info(cm_check=True) + if not cluster_status: + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] + % "query cluster status when check rollback condition") + + rollback_check_list = ["Main Standby Need repair(Disconnected)", + "Main Standby Need repair(Connecting)"] + need_rollback = False + for check_status in rollback_check_list: + if check_status in cluster_status: + need_rollback = True + self.logger.debug("Successfully check rollback condition: %s." % need_rollback) + self.logger.debug("Cluster status: %s." 
% cluster_status) + return need_rollback + + def get_max_term_by_compare(self, params): + """ + get max term by compare + """ + instance, sql_cmd, max_term, normal_dn = params + if (normal_dn is True and instance.instanceId in self.normal_dn_ids) or \ + (normal_dn is False and instance.instanceType == DefaultValue.MASTER_INSTANCE): + (status, output) = ClusterCommand.remoteSQLCommand( + sql_cmd, self.user, instance.hostname, instance.port, maintenance_mode=True) + if status != 0 or self.find_error(output): + raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % + sql_cmd + "\nError: %s" % output) + self.logger.debug("TERM %s, Instance %s" % (output, instance.instanceId)) + term = output.strip() + if int(term) > int(max_term): + max_term = term + return int(max_term) + + def remove_cluster_maintance_file(self): + """ + function: remove the cluster_maintance file + :return: NA + """ + self.logger.debug("Start remove cluster_maintance file.") + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + cmd = g_file.SHELL_CMD_DICT["deleteFile"] % (cluster_maintance_file, cluster_maintance_file) + host_names = self.get_all_connection_node_name("remove_cluster_maintance_file") + try: + self.ssh_tool.executeCommand(cmd, hostList=host_names) + except Exception as error: + self.logger.debug( + "Failed to remove cluster_maintance file with error: %s" % str(error)) + self.logger.debug("Successfully remove %s cluster_maintance file." % host_names) + + def get_node_sship_from_nodeid(self, node_id): + """ + get node sship from nodeid + :param node_id: node id + :return: + """ + for nodename in self.cluster_info.dbNodes: + if int(node_id) == int(nodename.id): + return nodename.sshIps[0] + + def delivery_file_to_other_node(self, path_name, file_name, node_list=None): + """delivery_file_to_other_node""" + send_file = "%s/%s" % (path_name, file_name) + send_file_bak = "%s/%s_bak" % (path_name, file_name) + if not os.path.isfile(send_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % send_file) + + if node_list: + p_node_list = " -H ".join(node_list) + elif self.cluster_info.getClusterNodeNames(): + p_node_list = " -H ".join(self.cluster_info.getClusterNodeNames()) + else: + raise Exception("Failed to delivery file: %s, node information does not exits" + % file_name) + pscp_cmd = "cp %s %s && source %s && pscp -t 60 -H %s %s %s && rm -f %s" % \ + (send_file, send_file_bak, self.mpp_file, p_node_list, + send_file_bak, send_file, send_file_bak) + status, output = CmdUtil.retryGetstatusoutput(pscp_cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % pscp_cmd + " Error:\n%s" % output) + else: + self.logger.debug("Successfully send %s to all nodes" % send_file) + + @staticmethod + def find_error(output): + """ + error rule + :param output: error info + :return:bool + """ + error_msg_flag = "(ERROR|FATAL|PANIC)" + error_pattern = "^%s:.*" % error_msg_flag + pattern = re.compile(error_pattern) + for line in output.split("\n"): + line = line.strip() + result = pattern.match(line) + if result is not None: + return True + return False + + def set_stream_cluster_run_mode_guc(self, guc_mode, fail_over=False): + """ + function: set cluster run mode guc + :return: + """ + cluster_run_mode = "cluster_primary" if self.params.mode == "primary" \ + else "cluster_standby" + if fail_over: + cluster_run_mode = "cluster_primary" + guc_cmd = "source %s && gs_guc %s -Z datanode -N all -I all -c " \ + "\"stream_cluster_run_mode = '%s'\"" % \ + 
(self.mpp_file, guc_mode, cluster_run_mode) + host_names = self.cluster_info.getClusterNodeNames() + ignore_node = [node for node in host_names if node not in self.normal_node_list] + if ignore_node: + self.logger.debug( + "WARNING: cluster_run_mode for datanode ignore nodes:%s" % ignore_node) + nodes = ",".join(ignore_node) + guc_cmd = guc_cmd + " --ignore-node %s" % nodes + self.logger.debug("Set dn stream_cluster_run_mode with cmd:%s" % guc_cmd) + (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd) + if status != 0: + self.logger.debug("Warning: Failed %s dn stream_cluster_run_mode=%s, output: %s" % + (guc_mode, cluster_run_mode, str(output))) + else: + self.logger.debug("Successfully %s streaming cluster run mode for " + "datanode param %s" % (guc_mode, cluster_run_mode)) + + guc_cmd_cn = "source %s && gs_guc %s -Z coordinator -N all -I all -c " \ + "\"stream_cluster_run_mode = '%s'\"" % \ + (self.mpp_file, guc_mode, cluster_run_mode) + if ignore_node: + self.logger.debug( + "WARNING: cluster_run_mode for coordinator ignore nodes:%s" % ignore_node) + nodes = ",".join(ignore_node) + guc_cmd_cn = guc_cmd_cn + " --ignore-node %s" % nodes + self.logger.debug("Set cn stream_cluster_run_mode with cmd:%s" % guc_cmd_cn) + (status, output) = CmdUtil.retryGetstatusoutput(guc_cmd_cn) + if status != 0: + self.logger.debug("Warning: Failed %s cn stream_cluster_run_mode=%s, output: %s" % + (guc_mode, cluster_run_mode, str(output))) + else: + self.logger.debug("Successfully %s streaming cluster run mode for " + "coordinator param %s" % (guc_mode, cluster_run_mode)) + + def set_data_in_dcc(self, key, value, only_mode=None): + """ + Set data in dcc + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("set [%s][%s] not for mode:%s." % (key, value, self.params.mode)) + return + self.logger.debug("Start set data: [%s][%s] in dcc." % (key, value)) + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {key: value}) + self.logger.log("Successfully set [%s][%s]." % (key, value)) + + def stop_cluster(self, action=None): + """ + stop the cluster + """ + self.logger.log("Stopping the cluster.") + static_config = "%s/bin/cluster_static_config" % self.cluster_info.appPath + cm_ctl_file = "%s/bin/cm_ctl" % self.cluster_info.appPath + if not os.path.isfile(static_config) or not os.path.isfile(cm_ctl_file): + raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % + (static_config + " or " + cm_ctl_file)) + cmd = ClusterCommand.getStopCmd(0, "i", 1800) + if action: + cmd = ClusterCommand.getStopCmd(0, timeout=1800) + self.logger.debug("disaster cluster calling cm_ctl to stop cluster, cmd=[%s]" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51610"] % + ("the cluster" + " Error:\n%s." % output)) + self.logger.log("Successfully stopped the cluster.") diff --git a/script/impl/streaming_disaster_recovery/streaming_constants.py b/script/impl/streaming_disaster_recovery/streaming_constants.py new file mode 100644 index 0000000..0a1d312 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_constants.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. 
+# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_constants.py is utility for defining constants +# of streaming disaster recovery. +############################################################################# + + +class StreamingConstants: + + # streaming files + STREAMING_LOG_FILE = "gs_sdr.log" + STREAMING_FILES_DIR = 'streaming_cabin' + STREAMING_CLUSTER_STATUS_TMP_FILE = "cluster_state_tmp" + WAL_KEEP_SEGMENTS = ".wal_keep_segments_record" + STREAMING_CLUSTER_CONF_RECORD = "cluster_conf_record" + GS_SECURE_FILES = "gs_secure_files" + HADR_KEY_CIPHER = "hadr.key.cipher" + HADR_KEY_RAND = "hadr.key.rand" + STREAM_SWITCHOVER_STATE = ".switchover_cluster_state" + MAX_TERM_RECORD = ".max_term_record" + PROCESS_LOCK_FILE = 'streaming_lock_' + STREAMING_CONFIG_XML = "streaming_config.xml" + GUC_BACKUP_FILE = ".streaming_guc_backup" + CLUSTER_USER_RECORD = ".cluster_user_record" + + ACTION_START = "start" + ACTION_SWITCHOVER = "switchover" + ACTION_FAILOVER = "failover" + + ACTION_ESTABLISH = "establish" + + # streaming query temp file + HADR_CLUSTER_STAT = ".hadr_cluster_stat" + HADR_FAILOVER_STAT = ".hadr_failover_stat" + HADR_SWICHOVER_STAT = ".hadr_switchover_stat" + HADR_ESTABLISH_STAT = ".hadr_establish_stat" + + STREAM_DISTRIBUTE_ACTION = "distribute_stream_failover" + + # GUC CHANGE MAP + GUC_CHANGE_MAP = {"most_available_sync": "on", "synchronous_commit": "on"} + + # params in json file for each module + STREAMING_JSON_PARAMS = { + "start": ["localClusterConf", "remoteClusterConf"], + "stop": ["localClusterConf", "remoteClusterConf"], + "switchover": [], + "failover": [], + "query": [] + } + + # step file of each module + STREAMING_STEP_FILES = { + "start_primary": ".streaming_start_primary.step", + "start_standby": ".streaming_start_standby.step", + "stop": ".streaming_stop.step", + "switchover_primary": ".streaming_switchover_primary.step", + "switchover_standby": ".streaming_switchover_standby.step", + "failover": ".streaming_failover.step", + "query": ".streaming_query.step", + } + # task need check process is exist + TASK_EXIST_CHECK = ["start", "stop", "switchover", "failover"] + + # default values + MAX_WAL_KEEP_SEGMENTS = 16384 + MAX_REPLICATION_NUMS = 8 + MAX_BUILD_TIMEOUT = 1209600 + STANDBY_START_TIMEOUT = 3600 * 24 * 7 + CHECK_PROCESS_WAIT_TIME = 3 + + # backup open key + BACKUP_OPEN = "/%s/CMServer/backup_open" + + # log remark + LOG_REMARK = "-" * 80 diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/__init__.py b/script/impl/streaming_disaster_recovery/streaming_modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/streaming_diaster_recovery_start.py b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_diaster_recovery_start.py new file mode 100644 index 0000000..39cc3e9 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_diaster_recovery_start.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei 
Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_start.py is utility for creating +# relationship between primary cluster and standby cluster. + +import os + +from base_utils.security.sensitive_mask import SensitiveMask +from gspylib.common.ErrorCode import ErrorCode +from gspylib.common.Common import DefaultValue, ClusterCommand +from impl.streaming_disaster_recovery.streaming_base import StreamingBase +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants + + +class StreamingStartHandler(StreamingBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _first_step_for_streaming_start(self, step): + """ + First step for streaming start + """ + if step >= 2: + return + self.logger.debug("Start first step of streaming start.") + self.create_streaming_dir(self.streaming_file_dir) + self.check_action_and_mode() + self.init_cluster_status() + + def _second_step_for_streaming_start(self, step): + """ + Second step for streaming start + """ + if step >= 2: + return + self.logger.debug("Start second step of streaming start.") + self.check_cluster_status(status_allowed=['Normal']) + self.check_cluster_is_common() + cm_exist = DefaultValue.check_is_cm_cluster(self.logger) + if not cm_exist: + self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check cm_ctl is available for current cluster") + self.check_is_under_upgrade() + self.check_dn_instance_params() + self.write_streaming_step("2_check_cluster_step") + + def _third_step_for_streaming_start(self, step): + """ + Third step for streaming start + """ + if step >= 3: + return + self.logger.debug("Start third step of streaming start.") + self.drop_replication_slot_on_dr_cluster(only_mode="disaster_standby") + self.prepare_gs_secure_files(only_mode='primary') + self.build_and_distribute_key_files(only_mode='disaster_standby') + self.get_default_wal_keep_segments(only_mode='primary') + self.write_streaming_step("3_set_wal_segments_step") + + def drop_replication_slot_on_dr_cluster(self, only_mode=None): + """ + Drop replication slot on dr cluster + """ + if only_mode and self.params.mode != only_mode: + self.logger.debug("Drop replication slot opts not for mode:%s." % self.params.mode) + return + sql_check = "select slot_name from pg_get_replication_slots() where slot_type='logical'" + primary_dns = DefaultValue.get_primary_dn_instance_id("Primary", ignore=True) + if not primary_dns: + return + primary_insts = [inst for node in self.cluster_info.dbNodes + for inst in node.datanodes if str(inst.instanceId) in primary_dns] + dn_inst = primary_insts[0] + self.logger.debug("Start drop node %s [%s] slots" % (dn_inst.hostname, dn_inst.instanceId)) + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_inst.hostname, dn_inst.port) + self.logger.debug("Get %s all replication slots, status=%d, output: %s." 
% + (dn_inst.instanceId, status, SensitiveMask.mask_pwd(output))) + if status == 0 and output.strip(): + drop_slots = output.strip().split('\n') + for slot in drop_slots: + self.logger.debug("Starting drop node %s %s" % (dn_inst.instanceId, slot.strip())) + sql = "select * from pg_drop_replication_slot('%s');" % slot.strip() + status_dr, output_dr = ClusterCommand.remoteSQLCommand( + sql, self.user, dn_inst.hostname, dn_inst.port) + if status_dr != 0: + self.logger.debug("Failed to remove node %s %s with error: %s" % ( + dn_inst.hostname, slot.strip(), SensitiveMask.mask_pwd(output_dr))) + self.logger.debug( + "Successfully drop node %s %s" % (dn_inst.instanceId, slot.strip())) + + def _fourth_step_for_streaming_start(self, step): + """ + Fourth step for streaming start + """ + if step >= 4: + return + self.logger.debug("Start fourth step of streaming start.") + self.set_wal_keep_segments( + "reload", StreamingConstants.MAX_WAL_KEEP_SEGMENTS, only_mode='primary') + self.write_streaming_step("4_set_wal_segments_step") + + def _fifth_step_for_streaming_start(self, step): + """ + Fifth step for streaming start + """ + if step >= 5: + return + self.logger.debug("Start fifth step of streaming start.") + self.set_data_in_dcc(self.backup_open_key, "0", only_mode='primary') + self.set_data_in_dcc(self.backup_open_key, "2", only_mode='disaster_standby') + self.stop_cluster_by_node(only_mode='disaster_standby') + self.write_streaming_step("5_set_wal_segments_step") + + def common_step_for_streaming_start(self): + """ + Common step for streaming start between step 1 and 2 + """ + self.logger.debug("Start common config step of streaming start.") + self.distribute_cluster_conf() + self.update_streaming_pg_hba() + self.config_streaming_repl_info() + + def _sixth_step_for_streaming_start(self, step): + """ + Sixth step for streaming start + """ + if step >= 6: + return + self.logger.debug("Start sixth step of streaming start.") + self.set_cmserver_guc("backup_open", "2", "set", only_mode='disaster_standby') + self.set_cmagent_guc("agent_backup_open", "2", "set", only_mode='disaster_standby') + self.write_streaming_step("6_set_guc_step") + + def _seventh_step_for_streaming_start(self, step): + """ + Seventh step for streaming start + """ + if step >= 7: + return + self.logger.debug("Start seventh step of streaming start.") + self.update_streaming_info("cluster", "restore", only_mode='disaster_standby') + try: + self.build_dn_instance(only_mode='disaster_standby') + except Exception as error: + self.update_streaming_info("cluster", "restore_fail", only_mode='disaster_standby') + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % "build dns" + "Error:%s" % error) + self.write_streaming_step("7_build_dn_instance_step") + + def _eighth_step_for_streaming_start(self, step): + """ + Eighth step for streaming start + """ + if step >= 8: + return + self.logger.debug("Start eighth step of streaming start.") + self.start_cluster(cm_timeout=StreamingConstants.STANDBY_START_TIMEOUT, + only_mode='disaster_standby') + self.update_streaming_info("cluster", "full_backup", only_mode='primary') + try: + self.wait_main_standby_connection(only_mode='primary') + except Exception as error: + self.update_streaming_info("cluster", "backup_fail", only_mode='primary') + raise Exception(str(error)) + ret = self.check_cluster_status(status_allowed=['Normal'], + only_check=True, check_current=True) + query_status = "recovery" if ret else "recovery_fail" + self.update_streaming_info("cluster", query_status, 
only_mode='disaster_standby') + self.update_streaming_info("cluster", "archive", only_mode='primary') + self.write_streaming_step("8_start_cluster_step") + + def _ninth_step_for_streaming_start(self, step): + """ + ninth step for streaming start + """ + if step >= 9: + return + self.logger.debug("Start ninth step of streaming start.") + self.restore_wal_keep_segments(only_mode='primary') + self.clean_gs_secure_dir() + self.clean_step_file() + + def _check_and_refresh_disaster_user_permission(self): + """check and refresh disaster user permission""" + if self.params.mode != "primary": + return + self.check_hadr_user(only_mode='primary') + self.check_hadr_pwd(only_mode='primary') + self.logger.debug("Encrypt hadr user info to database not " + "for mode:%s." % self.params.mode) + hadr_cipher_path = os.path.join(self.bin_path, "hadr.key.cipher") + hadr_rand_path = os.path.join(self.bin_path, "hadr.key.rand") + if not os.path.isfile(hadr_cipher_path) or not os.path.isfile(hadr_rand_path): + self.hadr_key_generator('hadr') + user_info = DefaultValue.obtain_hadr_user_encrypt_str(self.cluster_info, self.user, + self.logger, False, True) + if user_info: + self.clean_global_config() + pass_str = self.encrypt_hadr_user_info( + 'hadr', self.params.hadrUserName, self.params.hadrUserPassword) + self.keep_hadr_user_info(pass_str) + + def run(self): + self.logger.log("Start create streaming disaster relationship.") + step = self.query_streaming_step() + self._first_step_for_streaming_start(step) + self.parse_cluster_status() + self._check_and_refresh_disaster_user_permission() + self._second_step_for_streaming_start(step) + self.common_step_for_streaming_start() + self._third_step_for_streaming_start(step) + self._fourth_step_for_streaming_start(step) + self._fifth_step_for_streaming_start(step) + self._sixth_step_for_streaming_start(step) + self._seventh_step_for_streaming_start(step) + self._eighth_step_for_streaming_start(step) + self._ninth_step_for_streaming_start(step) + self.logger.log("Successfully do streaming disaster recovery start.") diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_failover.py b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_failover.py new file mode 100644 index 0000000..ff1fdc5 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_failover.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_failover.py is utility for +# standby cluster failover to primary cluster. 
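+#
+# Flow implemented by StreamingFailoverHandler.run() below:
+#   1. check_action_and_mode / check_streaming_failover_workable: refuse to run on a
+#      cluster that is not a disaster-standby cluster, since failover only promotes
+#      the standby side;
+#   2. streaming_failover_single_inst: promote the main-standby DNs step by step,
+#      resuming from the recorded step file if a previous attempt was interrupted;
+#   3. on success set the query status to "normal" and clean the step file, on failure
+#      set it to "promote_fail"; the cluster_maintance file and the streaming directory
+#      are removed in both cases.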
+ + +from gspylib.common.Common import DefaultValue +from gspylib.common.ErrorCode import ErrorCode +from impl.streaming_disaster_recovery.streaming_base import StreamingBase + + +class StreamingFailoverHandler(StreamingBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run(self): + self.logger.log("Start streaming disaster failover.") + self.check_action_and_mode() + step = self.check_streaming_failover_workable(check_type_step=3, check_status_step=0) + self.check_is_under_upgrade() + self.init_cluster_conf() + try: + self.streaming_failover_single_inst(step) + self.update_streaming_info("cluster", "normal") + self.clean_step_file() + except Exception as error: + self.update_streaming_info("cluster", "promote_fail") + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "centralize failover" + "Error:%s" % error) + finally: + self.remove_cluster_maintance_file() + self.clean_streaming_dir() + self.logger.log("Successfully do streaming disaster recovery failover.") + + def check_streaming_failover_workable(self, check_type_step=0, check_status_step=0): + """ + Check streaming failover is workable. + """ + self.logger.debug("Streaming disaster distribute cluster failover...") + stream_disaster_step = self.query_streaming_step() + if not DefaultValue.is_disaster_cluster(self.cluster_info) \ + and stream_disaster_step < check_type_step: + self.logger.debug("The primary dn exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster failover, Because the primary cluster " + "does not support failover") + cluster_normal_status = [DefaultValue.CLUSTER_STATUS_NORMAL, + DefaultValue.CLUSTER_STATUS_DEGRADED] + if stream_disaster_step < check_status_step: + self.init_cluster_status() + self.parse_cluster_status() + if stream_disaster_step < check_status_step: + self.check_cluster_status(cluster_normal_status) + return stream_disaster_step diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_query.py b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_query.py new file mode 100644 index 0000000..7724173 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_query.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_query.py is utility for +# query streaming disaster recovery condition. 
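+#
+# Fields reported by StreamingQueryHandler.run() below:
+#   * hadr_cluster_stat: read from the .hadr_cluster_stat record and refined by
+#     check_archive()/check_recovery() against the current cluster status;
+#   * hadr_failover_stat / hadr_switchover_stat: progress records, only reported while
+#     the cluster state is "promote" or "switchover" respectively;
+#   * RPO / RTO: maximum current_rpo/current_rto taken from
+#     dbe_perf.global_streaming_hadr_rto_and_rpo_stat on a primary DN (empty for a
+#     disaster-standby cluster with no primary DNs).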
+ +import os + +from base_utils.security.sensitive_mask import SensitiveMask +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants +from gspylib.common.Common import ClusterCommand +from impl.streaming_disaster_recovery.streaming_base import StreamingBase + + +class StreamingQueryHandler(StreamingBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_streaming_cluster_query_value(self, file_name): + """ + Query infos from files. + """ + file_path = os.path.realpath(os.path.join(self.streaming_file_dir, file_name)) + if not os.path.isfile(file_path) and file_name in [StreamingConstants.HADR_CLUSTER_STAT]: + return "normal" + if not os.path.isfile(file_path): + return "0%" + with open(file_path, 'r') as read_file: + value = read_file.read().strip() + return value + + def check_archive(self, former_status, cluster_status): + """ + Check for archive. + """ + self.logger.log("Start check archive.") + if former_status.strip() not in ["archive", "archive_fail"]: + self.logger.debug("Ignore for status:%s" % former_status) + return + archive_status = "archive_fail" + if cluster_status.lower() not in ["normal", "degraded"]: + self.logger.debug("Cluster status:%s,archive fail." % cluster_status) + return archive_status + if self.main_standby_ids or (not self.primary_dn_ids): + self.logger.debug("Ignore update archive for disaster_standby cluster.") + return archive_status + sql_check = "select 1 from pg_catalog.pg_stat_get_wal_senders() where sync_state" \ + "='Async' and peer_role='Standby' and peer_state='Normal';" + dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + self.logger.debug("Check archive with cmd:%s." % sql_check) + if dn_instances: + status, output = ClusterCommand.remoteSQLCommand( + sql_check, self.user, dn_instances[0].hostname, + dn_instances[0].port) + if status == 0 and output and output.strip(): + archive_status = "archive" + self.logger.debug("Successfully check archive, results:%s." % + SensitiveMask.mask_pwd(output)) + return archive_status + elif status == 0 and not output.strip(): + self.logger.debug("Check archive fail.") + return archive_status + else: + self.logger.debug("Check archive status:%s, output:%s." + % (status, output)) + self.logger.debug("Check archive result:%s." % archive_status) + return archive_status + + def check_recovery(self, former_status, cluster_status="normal"): + """ + Check for recovery. + """ + self.logger.log("Start check recovery.") + if former_status.strip() not in ["recovery", "recovery_fail"]: + self.logger.debug("Ignore for check recovery status:%s" % former_status) + return + recovery_status = "recovery_fail" + if cluster_status.lower() not in ["normal", "degraded"]: + self.logger.debug("Cluster status:%s,recovery fail." % cluster_status) + return recovery_status + if self.primary_dn_ids or (not self.main_standby_ids): + self.logger.debug("Ignore update recovery for primary cluster.") + return recovery_status + return "recovery" + + def get_max_rpo_rto(self): + """ + Get max rpo and rto. + """ + self.logger.log("Start check RPO & RTO.") + rpo_sql = "SELECT current_rpo FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" + rto_sql = "SELECT current_rto FROM dbe_perf.global_streaming_hadr_rto_and_rpo_stat;" + rto_rpo_sql = rpo_sql + rto_sql + if not self.primary_dn_ids: + self.logger.debug("Not found primary dn in cluster, cluster status:%s, " + "main standby:%s." 
% (self.cluster_status, self.main_standby_ids)) + return "", "" + log_info = "Execute sql [%s] on node [%s: %s] with result:%s" + dn_instances = [inst for node in self.cluster_info.dbNodes for inst in node.datanodes + if inst.instanceId in self.primary_dn_ids] + if dn_instances: + status, output = ClusterCommand.remoteSQLCommand( + rto_rpo_sql, self.user, dn_instances[0].hostname, dn_instances[0].port) + if status == 0 and output: + try: + rets = output.strip().split('\n') + length = len(rets) // 2 + rpo_list = [int(i) for i in rets[:length]] + rto_list = [int(j) for j in rets[length:]] + max_rpo, max_rto = str(max(rpo_list)), str(max(rto_list)) + except ValueError: + return "", "" + self.logger.debug("Successfully get max rpo:%s, rto:%s, output:%s" + % (max_rpo, max_rto, ','.join(output.split('\n')))) + return max_rpo, max_rto + else: + self.logger.debug(log_info % (rto_rpo_sql, dn_instances[0].hostname, + dn_instances[0].port, ','.join(output.split('\n')))) + return "", "" + + def run(self): + self.logger.log("Start streaming disaster query.") + cluster_info = self.query_cluster_info() + if cluster_info: + self.parse_cluster_status(current_status=cluster_info) + self.check_is_under_upgrade() + check_cluster_stat = self.get_streaming_cluster_query_value( + StreamingConstants.HADR_CLUSTER_STAT) + archive_status = self.check_archive(check_cluster_stat, self.cluster_status) + recovery_status = self.check_recovery(check_cluster_stat, self.cluster_status) + hadr_cluster_stat = archive_status or recovery_status or check_cluster_stat + + hadr_failover_stat = self.get_streaming_cluster_query_value( + StreamingConstants.HADR_FAILOVER_STAT) + hadr_switchover_stat = self.get_streaming_cluster_query_value( + StreamingConstants.HADR_SWICHOVER_STAT) + if hadr_cluster_stat != "promote": + hadr_failover_stat = "" + if hadr_cluster_stat != "switchover": + hadr_switchover_stat = "" + + self.logger.debug("Start check max rpo and rto.") + max_rpo, max_rto = self.get_max_rpo_rto() + self.logger.debug("Finished check max rpo and rto.") + values = dict() + values["hadr_cluster_stat"] = hadr_cluster_stat + values["hadr_failover_stat"] = hadr_failover_stat + values["hadr_switchover_stat"] = hadr_switchover_stat + values["RPO"] = max_rpo + values["RTO"] = max_rto + self.logger.log("Successfully executed streaming disaster " + "recovery query, result:\n%s" % values) diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_stop.py b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_stop.py new file mode 100644 index 0000000..0a41368 --- /dev/null +++ b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_stop.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################################# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms +# and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+# ----------------------------------------------------------------------------
+# Description : streaming_disaster_recovery_stop.py is a utility for stopping
+#               streaming disaster recovery on primary cluster.
+
+from impl.streaming_disaster_recovery.streaming_base import StreamingBase
+
+
+class StreamingStopHandler(StreamingBase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def _first_step_for_streaming_stop(self, step):
+        """
+        First step for streaming stop
+        """
+        if step >= 2:
+            return
+        self.logger.debug("Start first step of streaming stop.")
+        self.init_cluster_status()
+        self.check_action_and_mode()
+
+    def _second_step_for_streaming_stop(self, step):
+        """
+        Second step for streaming stop
+        """
+        if step >= 2:
+            return
+        self.logger.debug("Start second step of streaming stop.")
+        self.check_cluster_status(status_allowed=['Normal'])
+        self.check_cluster_type(allowed_type='primary')
+        self.check_is_under_upgrade()
+        self.write_streaming_step("2_check_cluster_step")
+
+    def _third_step_for_streaming_stop(self, step):
+        """
+        Third step for streaming stop
+        """
+        if step >= 3:
+            return
+        self.logger.debug("Start third step of streaming stop.")
+        self.remove_all_stream_repl_infos(guc_mode="reload")
+        self.remove_streaming_cluster_file()
+        self.write_streaming_step("3_remove_config_step")
+
+    def _fourth_step_for_streaming_stop(self, step):
+        """
+        Fourth step for streaming stop
+        """
+        if step >= 4:
+            return
+        self.logger.debug("Start fourth step of streaming stop.")
+        self.remove_streaming_pg_hba()
+        self.restore_guc_params()
+        self.write_streaming_step("4_remove_pg_hba_step")
+
+    def _fifth_step_for_streaming_stop(self, step):
+        """
+        Fifth step for streaming stop
+        """
+        if step >= 5:
+            return
+        self.logger.debug("Start fifth step of streaming stop.")
+        self.streaming_clean_replication_slot()
+        self.write_streaming_step("5_update_config_step")
+
+    def _sixth_step_for_streaming_stop(self, step):
+        """
+        Sixth step for streaming stop
+        """
+        if step >= 6:
+            return
+        self.logger.debug("Start sixth step of streaming stop.")
+        self.check_cluster_status(['Normal'])
+        self.clean_global_config()
+        self.update_streaming_info("cluster", "normal")
+        self.clean_streaming_dir()
+
+    def run(self):
+        self.logger.log("Start remove streaming disaster relationship.")
+        step = self.query_streaming_step()
+        self._first_step_for_streaming_stop(step)
+        self.parse_cluster_status()
+        self._second_step_for_streaming_stop(step)
+        self._third_step_for_streaming_stop(step)
+        self._fourth_step_for_streaming_stop(step)
+        self._fifth_step_for_streaming_stop(step)
+        self._sixth_step_for_streaming_stop(step)
+        self.logger.log("Successfully do streaming disaster recovery stop.")
diff --git a/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_switchover.py b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_switchover.py
new file mode 100644
index 0000000..5318eff
--- /dev/null
+++ b/script/impl/streaming_disaster_recovery/streaming_modules/streaming_disaster_recovery_switchover.py
@@ -0,0 +1,476 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+#############################################################################
+# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+#
+# openGauss is licensed under Mulan PSL v2.
+# You can use this software according to the terms
+# and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ---------------------------------------------------------------------------- +# Description : streaming_disaster_recovery_switchover.py is a utility for +# changing role between primary cluster and standby cluster. + +import os +import time +from datetime import datetime, timedelta + +from base_utils.os.cmd_util import CmdUtil +from base_utils.os.env_util import EnvUtil +from gspylib.common.Common import DefaultValue, ClusterCommand, ClusterInstanceConfig +from gspylib.common.DbClusterStatus import DbClusterStatus +from gspylib.common.ErrorCode import ErrorCode +from gspylib.threads.parallelTool import parallelTool +from impl.streaming_disaster_recovery.streaming_base import StreamingBase +from impl.streaming_disaster_recovery.streaming_constants import StreamingConstants + + +class StreamingSwitchoverHandler(StreamingBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run(self): + """ + streaming disaster recovery switchover + """ + self.logger.log("Start streaming disaster switchover.") + self.check_action_and_mode() + self.check_switchover_workable() + self.init_cluster_conf() + self.check_dn_instance_params() + self.check_is_under_upgrade() + try: + self.streaming_switchover_single_inst() + self.clean_step_file() + except Exception as error: + if self.params.mode == "primary": + self.update_streaming_info("cluster", "promote_fail") + raise Exception( + ErrorCode.GAUSS_516["GAUSS_51632"] % "switchover" + "Error:%s" % str(error)) + finally: + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + self.logger.log("Successfully do streaming disaster recovery switchover.") + + def streaming_switchover_single_inst(self): + """ + streaming disaster recovery switchover for single_inst cluster + disaster_standby: expect primary cluster becomes standby + primary: expect standby cluster becomes primary + """ + self.create_cluster_maintance_file("streaming switchover") + self.update_streaming_info("cluster", StreamingConstants.ACTION_SWITCHOVER) + stream_disaster_step = self.query_streaming_step() + if self.params.mode == "primary": + end_time = datetime.now() + timedelta(seconds=self.params.waitingTimeout) + self.logger.log("Waiting for switchover barrier.") + while True: + switchover_barrier_list = self.check_streaming_disaster_switchover_barrier() + if len(switchover_barrier_list) == len(self.normal_dn_ids): + break + if datetime.now() >= end_time: + self.restart_cluster() + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check switchover_barrier on all main standby dn" + + " Because check timeout: %ss" % + str(self.params.waitingTimeout)) + time.sleep(5) + self.streaming_failover_single_inst(stream_disaster_step, + StreamingConstants.ACTION_SWITCHOVER) + else: + self.add_cluster_maintance_file_for_switchover() + try: + if stream_disaster_step < 1: + self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "10%") + self.stop_cluster() + self.start_cluster() + self.streaming_disaster_set_master_cluster_in_switchover() + self.write_streaming_step("1_streaming_disaster_set_master_in_switchover") + if stream_disaster_step < 2: + 
self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "30%") + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {self.backup_open_key: "2"}) + self.stop_cluster() + self.write_streaming_step("2_stop_cluster_for_switchover") + if stream_disaster_step < 3: + self.set_cmserver_guc("backup_open", "2", "set") + self.set_cmagent_guc("agent_backup_open", "2", "set") + self.write_streaming_step("3_set_backup_open_2_done") + if stream_disaster_step < 4: + self.update_streaming_info(StreamingConstants.ACTION_SWITCHOVER, "50%") + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + self.start_cluster() + self.write_streaming_step("4_start_cluster_done") + if stream_disaster_step < 5: + self.wait_for_normal(timeout=self.params.waitingTimeout, + streaming_switchover="streaming_switchover") + self.streaming_clean_replication_slot() + self.update_streaming_info("cluster", "recovery") + except Exception as error: + self.logger.error("Failed to do streaming disaster cluster switchover, Error:" + " \n%s" % str(error)) + rollback_step = self.query_streaming_step() + self.logger.debug("Roll back switchover step:%s" % rollback_step) + self.remove_cluster_maintance_file_for_switchover() + self.remove_cluster_maintance_file() + if rollback_step < 4 or (rollback_step >= 4 and + self.streaming_switchover_roll_back_condition()): + self.streaming_switchover_roll_back(update_query=True) + self.clean_step_file() + raise Exception(error) + self.remove_hadr_switchover_process_file() + + def remove_hadr_switchover_process_file(self): + self.logger.debug("Remove hadr switchover process file for switchover.") + process_file = os.path.realpath(os.path.join(self.streaming_file_dir, + ".hadr_switchover_stat")) + cmd = "if [ -f {0} ]; then rm -rf {0}; fi".format(process_file) + self.ssh_tool.executeCommand(cmd, hostList=self.connected_nodes) + self.logger.debug("Successfully remove switchover process on all connected nodes.") + + @staticmethod + def clean_file_on_node(params): + """ + clean file on dest node with path + """ + dest_ip, dest_path, timeout = params + cmd = "source %s && pssh -s -t %s -H %s 'if [ -f %s ]; then rm -f %s; fi'" % ( + EnvUtil.getMpprcFile(), timeout, dest_ip, dest_path, dest_path) + status, output = CmdUtil.getstatusoutput_by_fast_popen(cmd) + return status, output, dest_ip + + def restart_cluster(self, restart_timeout=DefaultValue.TIMEOUT_CLUSTER_START): + """ + Restart cluster + """ + self.logger.log("Restart cluster.") + static_config = "%s/bin/cluster_static_config" % self.bin_path + cm_ctl_file = "%s/bin/cm_ctl" % self.bin_path + if not os.path.isfile(static_config): + self.logger.debug("Checked file %s lost." % static_config) + if not os.path.isfile(cm_ctl_file): + self.logger.debug("Checked file %s lost." % cm_ctl_file) + stop_cmd = ClusterCommand.getStopCmd(0, timeout=restart_timeout) + status, output = CmdUtil.retryGetstatusoutput(stop_cmd, retry_time=0) + self.logger.debug("Stop cluster result:[%s][%s]." % (status, output)) + start_cmd = ClusterCommand.getStartCmd(0, timeout=restart_timeout) + status, output = CmdUtil.retryGetstatusoutput(start_cmd, retry_time=0) + self.logger.debug("Start cluster result:[%s][%s]." 
% (status, output)) + + def remove_cluster_maintance_file_for_switchover(self): + """ + function: remove the cluster_maintance file + :return: NA + """ + self.logger.debug("Remove cluster_maintance file for switchover.") + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + host_names = \ + self.get_all_connection_node_name("remove_cluster_maintance_file_for_switchover") + try: + pscp_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if not self.cluster_info.isSingleInstCluster(): + all_instances.extend([dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.coordinators]) + for dn_inst in all_instances: + if dn_inst.hostname in host_names: + pscp_params.append([dn_inst.hostname, os.path.join( + dn_inst.datadir, os.path.basename(cluster_maintance_file)), 10]) + if len(pscp_params) > 0: + results = parallelTool.parallelExecute(self.clean_file_on_node, pscp_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("clean maintance file to node[%s] with status[%s], " + "output[%s]" % (ret[-1], ret[0], ret[1])) + except Exception as error: + self.logger.debug( + "Failed to remove cluster_maintance file for switchover with error: %s" + % str(error)) + self.logger.debug("Successfully remove %s cluster_maintance file for switchover." + % host_names) + + def add_cluster_maintance_file_for_switchover(self): + """ + add cluster_maintance file for streaming disaster switchover to disaster_standby + """ + self.logger.debug("Start add cluster_maintance file for switchover.") + try: + cluster_maintance_file = os.path.realpath(os.path.join(self.gauss_home, + "bin/cluster_maintance")) + host_names = \ + self.get_all_connection_node_name("add_cluster_maintance_file_for_switchover", True) + pscp_params = [] + all_instances = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + for dn_inst in all_instances: + if dn_inst.hostname in host_names: + pscp_params.append([dn_inst.hostname, cluster_maintance_file, + os.path.join(dn_inst.datadir, "cluster_maintance"), 10]) + if len(pscp_params) > 0: + results = parallelTool.parallelExecute( + DefaultValue.distribute_file_to_node, pscp_params) + for ret in results: + if ret[0] != 0: + self.logger.debug("Distribute maintance file for switchover to node[%s] " + "with status[%s], output[%s]" % (ret[-1], ret[0], ret[1])) + except Exception as error: + self.logger.debug("WARNING: Failed add cluster_maintance file for switchover, " + "error:%s." 
% (str(error))) + self.logger.debug("Successfully add cluster_maintance file for switchover.") + + def streaming_disaster_set_master_cluster_in_switchover(self): + """ + streaming disaster set master cluster in switchover + """ + self.logger.debug("Starting set streaming master cluster in switchover.") + primary_dns = [dn_inst for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes if + dn_inst.instanceId in self.primary_dn_ids] + if not primary_dns: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "obtain primary dns for switchover") + if self.streaming_dr_in_switchover(primary_dns): + if self.streaming_dr_service_truncation_check(primary_dns): + self.logger.debug("Successfully set streaming master cluster in switchover.") + + def streaming_dr_service_truncation_check(self, primary_dns_list): + """ + streaming dr service truncation check + """ + self.logger.log("Waiting for truncation.") + results = parallelTool.parallelExecute(self.concurrent_check_dr_service_truncation, + primary_dns_list) + return all(results) + + def concurrent_check_dr_service_truncation(self, dn_inst): + """ + Wait for the log playback to complete. + """ + self.logger.debug("Starting check node %s shardNum %s instance %s streaming service " + "truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId)) + sql_check = "select * from gs_streaming_dr_service_truncation_check();" + end_time = datetime.now() + timedelta(seconds=1200) + succeed = False + while datetime.now() < end_time: + status, output = ClusterCommand.remoteSQLCommand(sql_check, self.user, dn_inst.hostname, + dn_inst.port) + if status == 0 and output and output.strip() == "t": + succeed = True + break + time.sleep(5) + self.logger.debug("Retry truncation check shardNum %s in node %s instance %s." % + (dn_inst.mirrorId, dn_inst.hostname, dn_inst.instanceId)) + if not succeed: + self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_check, output)) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "check truncate service before switchover") + self.logger.debug("Successfully check node %s shardNum %s instance %s streaming service " + "truncation." % (dn_inst.hostname, dn_inst.mirrorId, dn_inst.instanceId)) + return True + + def streaming_dr_in_switchover(self, primary_dns_list): + """ + set steaming dr in switchover + """ + results = parallelTool.parallelExecute(self.concurrent_set_dr_in_switchover, + primary_dns_list) + return all(results) + + def concurrent_set_dr_in_switchover(self, dn_inst): + """ + Switchover requires log truncation first + """ + self.logger.debug("Starting set shardNum %s node %s streaming dr in switchover." % + (dn_inst.mirrorId, dn_inst.hostname)) + sql_cmd = "select * from gs_streaming_dr_in_switchover();" + # We need to use the normal port to transmit service truncation, + # not the OM port. + port = int(dn_inst.port) - 1 + (status, output) = ClusterCommand.remoteSQLCommand(sql_cmd, + self.user, dn_inst.hostname, str(port)) + self.logger.debug("check streaming in switchover, status=%d, output: %s." + % (status, output)) + if status != 0 or self.find_error(output) or output.strip() != "t": + self.logger.error("Failed to execute the command: %s, Error:\n%s" % (sql_cmd, output)) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "generate switchover barrier before switchover") + self.logger.debug("Successfully set shardNum %s node %s streaming dr in switchover." 
% + (dn_inst.mirrorId, dn_inst.hostname)) + return True + + def wait_for_normal(self, timeout=DefaultValue.TIMEOUT_CLUSTER_START, + streaming_switchover=None): + """ + function:Wait the cluster become Normal or Degraded + input:NA + output:NA + """ + self.logger.debug("Waiting for cluster status being satisfied.") + end_time = None if timeout <= 0 else datetime.now() + timedelta(seconds=timeout) + + check_status = 0 + while True: + time.sleep(10) + if end_time is not None and datetime.now() >= end_time: + check_status = 1 + self.logger.debug("Timeout. The cluster is not available.") + break + # View the cluster status + status_file = "/home/%s/gauss_check_status_%d.dat" % (self.user, os.getpid()) + cmd = ClusterCommand.getQueryStatusCmd(outFile=status_file) + (status, output) = CmdUtil.retryGetstatusoutput(cmd, retry_time=0) + if status != 0: + if os.path.exists(status_file): + os.remove(status_file) + self.logger.debug("Failed to obtain the cluster status. Error: \n%s" % output) + continue + # Determine whether the cluster status is normal or degraded + cluster_status = DbClusterStatus() + cluster_status.initFromFile(status_file) + if os.path.exists(status_file): + os.remove(status_file) + if cluster_status.clusterStatus == "Normal": + self.logger.log("The cluster status is Normal.") + break + else: + self.logger.debug("Cluster status is %s(%s)." % ( + cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) + + if check_status != 0: + if streaming_switchover == "streaming_switchover": + raise Exception( + ErrorCode.GAUSS_528["GAUSS_52800"] % (cluster_status.clusterStatus, + cluster_status.clusterStatusDetail)) + self.logger.logExit(ErrorCode.GAUSS_528["GAUSS_52800"] % ( + cluster_status.clusterStatus, cluster_status.clusterStatusDetail)) + self.logger.debug("Successfully wait for cluster status become Normal.", "constant") + + def set_auto_csn_barrier_guc(self, guc_mode, action_flag=False, roll_back=False): + """ + auto_csn_barrier : 0 / 1 + """ + guc_value = 1 if self.params.mode == "primary" else 0 + if action_flag: + guc_value = 0 + if roll_back: + guc_value = 1 + self.logger.debug("Starting %s auto_csn_barrier is %s." % (guc_mode, guc_value)) + cmd = 'source %s && gs_guc %s -Z coordinator -N all -I all ' \ + '-c "auto_csn_barrier=%s"' % (self.mpp_file, guc_mode, guc_value) + host_names = self.cluster_info.getClusterNodeNames() + ignore_node = [node for node in host_names if node not in self.normal_node_list] + if ignore_node: + self.logger.debug( + "WARNING: auto_csn_barrier need ignore host name is %s" % ignore_node) + nodes = ",".join(ignore_node) + cmd = cmd + " --ignore-node %s" % nodes + self.logger.debug("Set auto_csn_barrier with cmd:%s" % cmd) + status, output = CmdUtil.retryGetstatusoutput(cmd) + if status != 0: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "set auto_csn_barrier" + "Error:%s" % output) + self.logger.debug("Successfully %s auto_csn_barrier is %s." 
% (guc_mode, guc_value)) + + def streaming_switchover_roll_back(self, update_query=False): + """ + streaming disaster cluster roll back in switchover + """ + self.logger.log("Roll back streaming disaster cluster switchover...") + ClusterInstanceConfig.set_data_on_dcc(self.cluster_info, + self.logger, self.user, + {self.backup_open_key: "0"}) + self.stop_cluster() + self.set_cmserver_guc("backup_open", "0", "set") + self.set_cmagent_guc("agent_backup_open", "0", "set") + self.logger.log("Successfully modify cma and cms parameters to start according to primary " + "cluster mode") + if update_query: + self.update_streaming_info("cluster", "archive") + self.start_cluster() + self.logger.log("Successfully Roll back streaming disaster cluster switchover.") + + def check_streaming_disaster_switchover_barrier(self): + """ + check whether get switchover_barrier on all dn + """ + self.logger.debug("check streaming disaster switchover barrier...") + sql_cmd = "select * from gs_streaming_dr_get_switchover_barrier();" + switchover_barrier_list = [] + for db_node in self.cluster_info.dbNodes: + for dn_inst in db_node.datanodes: + if dn_inst.instanceId not in self.normal_dn_ids: + self.logger.debug("Warning: Not check for abnormal instance %s %s" % ( + dn_inst.instanceType, dn_inst.instanceId)) + continue + (status, output) = ClusterCommand.remoteSQLCommand( + sql_cmd, self.user, dn_inst.hostname, dn_inst.port, maintenance_mode=True) + self.logger.debug("Check inst has switchover barrier, status=%d, " + "output: %s." % (status, output)) + if status == 0 and output.strip() == "t": + self.logger.debug("Successfully check instance %s %s has switchover " + "barrier." % (dn_inst.instanceType, dn_inst.instanceId)) + switchover_barrier_list.append(dn_inst.instanceId) + return switchover_barrier_list + + def check_switchover_workable(self): + """ + Check switchover is workable + """ + if not DefaultValue.is_disaster_cluster(self.cluster_info) \ + and self.params.mode == "primary": + self.logger.debug("The primary dn exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster switchover, Because the primary cluster " + "[drClusterMode] parameter must be disaster_standby") + if DefaultValue.is_disaster_cluster(self.cluster_info) and \ + self.params.mode == "disaster_standby": + self.logger.debug("The primary dn not exist, do nothing except record the result file.") + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] % + "streaming disaster cluster switchover, Because the disaster_standby " + "cluster [drClusterMode] parameter must be primary") + self.logger.log("Waiting for cluster and all instances normal.") + if self.params.mode == "primary": + end_time = datetime.now() + timedelta(seconds=600) + while True: + self.init_cluster_status() + self.parse_cluster_status() + if self.check_cluster_status(status_allowed=['Normal'], only_check=True, + is_log=False) and self.check_instances_ready_for_switchover(): + break + if datetime.now() >= end_time: + raise Exception(ErrorCode.GAUSS_516["GAUSS_51632"] + % "check cluster and instances status" + " with timeout: %ss" % str(600)) + time.sleep(5) + self.logger.debug("Retry check stream disaster standby cluster status...") + else: + self.init_cluster_status() + self.parse_cluster_status() + if (not self.check_cluster_status(status_allowed=['Normal'], only_check=True, + is_log=False)) \ + or (not self.check_instances_ready_for_switchover()): + raise Exception(ErrorCode.GAUSS_516['GAUSS_51632'] % 
"check cluster status") + + def check_instances_ready_for_switchover(self): + """ + Check cns and dns is ready for switchover + """ + dn_instances = [dn_inst.instanceId for db_node in self.cluster_info.dbNodes + for dn_inst in db_node.datanodes] + if len(dn_instances) != len(self.normal_dn_ids): + self.logger.debug("Not all dn instances is normal.") + return False + self.logger.debug("Successfully check cn and dn instances are normal.") + return True diff --git a/script/impl/upgrade/UpgradeConst.py b/script/impl/upgrade/UpgradeConst.py index b51fdae..77a018e 100644 --- a/script/impl/upgrade/UpgradeConst.py +++ b/script/impl/upgrade/UpgradeConst.py @@ -107,6 +107,24 @@ BINARY_UPGRADE_STEP_START_NODE = 5 BINARY_UPGRADE_STEP_PRE_COMMIT = 6 +# dual cluster stage +class DualClusterStage: + """ + Dual cluster stage upgrade marking + """ + def __init__(self): + pass + + (STEP_UPGRADE_END, + STEP_UPGRADE_UNFINISHED, + STEP_UPGRADE_FINISH, + STEP_UPGRADE_COMMIT, + ) = list(range(0, 4)) + + def __str__(self): + pass + + # grey upgrade class GreyUpgradeStep: def __init__(self): @@ -174,3 +192,17 @@ UPGRADE_VERSION_64bit_xid = 91.208 ENABLE_STREAM_REPLICATION_VERSION = "92.149" ENABLE_STREAM_REPLICATION_NAME = "enable_stream_replication" RELMAP_4K_VERSION = "92.420" + +# streaming cluster +GS_SECURE_FILES = "gs_secure_files" +UPGRADE_PHASE_INFO = "upgrade_phase_info" +HARD_KEY_CIPHER = "hadr.key.cipher" +HARD_KEY_RAND = "hadr.key.rand" +DISASTER_RECOVERY_GUC = "backup_open" +INSTALL_TYPE_GUC = "install_type" +REMOTE_INFO_GUC = { + "dual-standby-streamDR": "replconninfo", + "dual-primary-streamDR": "replconninfo" +} +LENGTH_STORAGE_INFO_LEN = 4 +ACTION_CLEAN_GS_SECURE_FILES = "clean_gs_secure_files" diff --git a/script/impl/upgrade/UpgradeImpl.py b/script/impl/upgrade/UpgradeImpl.py index 8160f0d..486caea 100644 --- a/script/impl/upgrade/UpgradeImpl.py +++ b/script/impl/upgrade/UpgradeImpl.py @@ -23,6 +23,7 @@ import json import csv import traceback import copy +import re from datetime import datetime, timedelta @@ -38,6 +39,7 @@ from gspylib.os.gsfile import g_file from gspylib.inspection.common import SharedFuncs from gspylib.component.CM.CM_OLAP.CM_OLAP import CM_OLAP from impl.upgrade.UpgradeConst import GreyUpgradeStep +from impl.upgrade.UpgradeConst import DualClusterStage import impl.upgrade.UpgradeConst as const from base_utils.executor.cmd_executor import CmdExecutor from base_utils.executor.local_remote_cmd import LocalRemoteCmd @@ -82,6 +84,7 @@ class UpgradeImpl: self.isLargeInplaceUpgrade = False self.__upgrade_across_64bit_xid = False self.action = upgrade.action + self.primaryDn = None def exitWithRetCode(self, action, succeed=True, msg=""): """ @@ -145,9 +148,56 @@ class UpgradeImpl: self.context.sshTool = SshTool( self.context.clusterNodes, self.context.localLog, DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE) + self.initVersionInfo() self.initClusterConfig() + self.initClusterType() self.context.logger.debug("Successfully init global infos", "constant") + def initVersionInfo(self): + """ + Initialize the old and new version information + + :return: + """ + newVersionFile = VersionInfo.get_version_file() + newClusterVersion, newClusterNumber, newCommitId = VersionInfo.get_version_info( + newVersionFile) + gaussHome = ClusterDir.getInstallDir(self.context.user) + + newPath = gaussHome + "_%s" % newCommitId + oldPath = self.getClusterAppPath(const.OLD) + + if oldPath == "": + oldPath = os.path.realpath(gaussHome) + oldVersionFile = "%s/bin/upgrade_version" % oldPath + try: + + 
(oldClusterVersion, oldClusterNumber, oldCommitId) = VersionInfo.get_version_info( + oldVersionFile) + self.context.logger.debug("Successfully obtained version information of " + "old clusters by %s." % oldVersionFile) + except Exception as er: + if os.path.exists(self.context.upgradeBackupPath): + # if upgradeBackupPath exist, it means that we do rollback first. + # and we get cluster version from the backup file + possibOldVersionFile = "%s/old_upgrade_version" % self.context.upgradeBackupPath + self.context.logger.debug(str(er)) + self.context.logger.debug("Try to get the version information " + "from %s." % possibOldVersionFile) + (oldClusterVersion, oldClusterNumber, oldCommitId) = VersionInfo.get_version_info( + possibOldVersionFile) + else: + raise Exception(str(er)) + + self.context.newClusterVersion = newClusterVersion + self.context.newClusterNumber = newClusterNumber + self.context.oldClusterVersion = oldClusterVersion + self.context.oldClusterNumber = oldClusterNumber + self.context.newClusterAppPath = newPath + self.context.oldClusterAppPath = oldPath + self.newCommitId = newCommitId + self.oldCommitId = oldCommitId + def setClusterDetailInfo(self): """ function: set cluster detail info @@ -268,6 +318,8 @@ class UpgradeImpl: DefaultValue.TIMEOUT_PSSH_BINARY_UPGRADE) if action == const.ACTION_AUTO_ROLLBACK and \ self.checkBakPathNotExists(): + if os.path.isfile(self.context.upgradePhaseInfoPath): + self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END) self.context.logger.log("No need to rollback.") self.exitWithRetCode(action, True) else: @@ -290,6 +342,11 @@ class UpgradeImpl: grey upgrade rollback if not in read only, then record the value of enable_transaction_read_only and set it to off """ + # no need to check read only mode and close enable_transaction_read_only + if self.context.standbyCluster: + self.context.logger.debug("no need to check read only in force or" + " standby cluster mode upgrade") + return try: self.context.logger.debug("Check if in read only mode.") greyUpgradeFlagFile = os.path.join(self.context.upgradeBackupPath, @@ -481,16 +538,9 @@ class UpgradeImpl: % newClusterNumber) self.context.logger.debug("The matched upgrade strategy is: %s." 
% upgradeAction) - self.context.newClusterVersion = newClusterVersion - self.context.newClusterNumber = newClusterNumber - self.context.oldClusterVersion = oldClusterVersion - self.context.oldClusterNumber = oldClusterNumber - self.context.newClusterAppPath = newPath - self.context.oldClusterAppPath = oldPath - self.newCommitId = newCommitId - self.oldCommitId = oldCommitId return upgradeAction except Exception as e: + self.clean_gs_secure_files() raise Exception(ErrorCode.GAUSS_529["GAUSS_52900"] % str(e) + " Do nothing this time.") @@ -665,6 +715,10 @@ class UpgradeImpl: """ try: self.context.logger.debug("Setting up the cluster read-only mode.") + if self.context.standbyCluster: + self.context.logger.debug("no need to set cluster " + "read only mode under force or standby cluster upgrade") + return 0 self.setGUCValue("default_transaction_read_only", "true") self.context.logger.debug("successfully set the cluster read-only mode.") return 0 @@ -682,6 +736,10 @@ class UpgradeImpl: """ try: self.context.logger.debug("Canceling the cluster read-only mode.") + if self.context.standbyCluster: + self.context.logger.debug("no need to unset cluster " + "read only mode under force or standby cluster upgrade") + return 0 self.setGUCValue("default_transaction_read_only", "false") self.context.logger.debug("Successfully cancelled the cluster read-only mode.") return 0 @@ -887,6 +945,8 @@ class UpgradeImpl: Input : gucStr the guc key:value string output : NA """ + if "dual-standby" in self.context.clusterType: + return self.context.logger.debug("Start to check GUC value %s." % gucStr) try: # send cmd to that node and exec @@ -910,6 +970,28 @@ class UpgradeImpl: except Exception as e: raise Exception(str(e)) + def backup_disaster_user_file(self): + """backup_disaster_user_file""" + bin_path = os.path.join(EnvUtil.getEnv("GAUSSHOME"), "bin") + cipher_file = os.path.join(bin_path, "hadr.key.cipher") + if os.path.isfile(cipher_file): + FileUtil.cpFile(cipher_file, "%s/" % self.context.tmpDir) + rand_file = os.path.join(bin_path, "hadr.key.rand") + if os.path.isfile(rand_file): + FileUtil.cpFile(rand_file, "%s/" % self.context.tmpDir) + self.context.logger.debug("Back up rand and cipher file to temp dir.") + + def restore_origin_disaster_user_file(self): + """restore_origin_disaster_user_file""" + bin_path = os.path.join(self.context.newClusterAppPath, "bin") + cipher_file = os.path.join(self.context.tmpDir, "hadr.key.cipher") + if os.path.isfile(cipher_file): + self.context.sshTool.scpFiles(cipher_file, bin_path) + rand_file = os.path.join(self.context.tmpDir, "hadr.key.rand") + if os.path.isfile(rand_file): + self.context.sshTool.scpFiles(rand_file, bin_path) + self.context.logger.debug("Restore rand and cipher file to gausshome.") + def floatMoreThan(self, numOne, numTwo): """ function: float more than @@ -968,8 +1050,10 @@ class UpgradeImpl: self.distributeXml() # 2. check if the app path is ready and sha256 is right and others self.checkUpgrade() - # 4. check the cluster pressure - self.HASyncReplayCheck() + if self.context.action == const.ACTION_LARGE_UPGRADE and \ + "dual-standby" not in self.context.clusterType: + # 4. check the cluster pressure + self.HASyncReplayCheck() # 5. before do grey binary upgrade, we must make sure the # cluster is Normal and the database could be # connected, if not, exit. 
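The backup_disaster_user_file and restore_origin_disaster_user_file helpers above copy hadr.key.cipher and hadr.key.rand only when the files exist, first from the install bin directory into the upgrade temp directory and, after the binaries are switched, back into the new application path. A minimal standalone sketch of the same copy-if-present round trip, assuming hypothetical local directory arguments and plain os/shutil calls in place of the project's FileUtil and sshTool.scpFiles wrappers:

import os
import shutil

HADR_KEY_FILES = ("hadr.key.cipher", "hadr.key.rand")


def backup_hadr_key_files(old_bin_dir, tmp_dir):
    # Save any hadr key files that exist into the upgrade temp directory.
    saved = []
    for name in HADR_KEY_FILES:
        src = os.path.join(old_bin_dir, name)
        if os.path.isfile(src):
            shutil.copy2(src, os.path.join(tmp_dir, name))
            saved.append(name)
    return saved


def restore_hadr_key_files(tmp_dir, new_bin_dir):
    # Copy previously saved key files into the new install's bin directory.
    for name in HADR_KEY_FILES:
        src = os.path.join(tmp_dir, name)
        if os.path.isfile(src):
            shutil.copy2(src, os.path.join(new_bin_dir, name))

In the patch itself the restore side distributes the files with sshTool.scpFiles so they reach every node, not just the local one.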
@@ -983,6 +1067,12 @@ class UpgradeImpl: # check if it satisfy upgrade again, if it is the second loop to # upgrade, it can go go upgrade again branch upgradeAgain = self.canUpgradeAgain() + if not upgradeAgain: + self.recordDualClusterStage(self.oldCommitId, + DualClusterStage.STEP_UPGRADE_UNFINISHED) + self.context.logger.log("NOTICE: The directory %s will be deleted after " + "commit-upgrade, please make sure there is no personal " + "data." % self.context.oldClusterAppPath) except Exception as e: # before this step, the upgrade process do nothing to the cluster, # this time has no remaining @@ -998,6 +1088,8 @@ class UpgradeImpl: if not self.doGreyBinaryRollback(): self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False) self.removeOmRollbackProgressFile() + self.recordDualClusterStage(self.oldCommitId, + DualClusterStage.STEP_UPGRADE_UNFINISHED) self.context.logger.log( "The directory %s will be deleted after commit-upgrade, " "please make sure there is no personal data." % @@ -1021,8 +1113,14 @@ class UpgradeImpl: # we can not recognize if it really cannot # find the column, or just because the old version. So we # will update the catalog in the old version - if self.context.action == const.ACTION_LARGE_UPGRADE: + if self.context.action == const.ACTION_LARGE_UPGRADE and \ + "dual-standby" not in self.context.clusterType: self.updateCatalog() + elif self.context.action == const.ACTION_LARGE_UPGRADE and \ + "dual-standby" in self.context.clusterType: + self.setUpgradeFromParam(self.context.oldClusterNumber) + self.reloadCmAgent() + self.reload_cmserver() self.recordNodeStep(GreyUpgradeStep.STEP_SWITCH_NEW_BIN) self.CopyCerts() self.upgradeAgain() @@ -1061,6 +1159,7 @@ class UpgradeImpl: # 11. switch the cluster version to new version self.getOneDNInst(checkNormal=True) self.switchBin(const.NEW) + self.restore_origin_disaster_user_file() # create CA for CM self.create_ca_for_cm() self.setNewVersionGuc() @@ -1093,14 +1192,16 @@ class UpgradeImpl: self.waitClusterForNormal() # backup global relmap file before doing upgrade-post self.backupGlobalRelmapFile() - self.prepareSql("rollback-post") - self.execRollbackUpgradedCatalog(scriptType="rollback-post") - self.prepareSql("upgrade-post") - self.execRollbackUpgradedCatalog(scriptType="upgrade-post") - self.getLsnInfo() + if "dual-standby" not in self.context.clusterType: + self.prepareSql("rollback-post") + self.execRollbackUpgradedCatalog(scriptType="rollback-post") + self.prepareSql("upgrade-post") + self.execRollbackUpgradedCatalog(scriptType="upgrade-post") + self.getLsnInfo() hosts = copy.deepcopy(self.context.clusterNodes) self.recordNodeStep( GreyUpgradeStep.STEP_PRE_COMMIT, nodes=hosts) + self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_FINISH) self.printPrecommitBanner() except Exception as e: hintInfo = "Nodes are new version. " \ @@ -1250,6 +1351,9 @@ class UpgradeImpl: try: self.context.logger.log("Create checkpoint before switching.") start_time = timeit.default_timer() + if self.context.forceRollback or self.context.standbyCluster: + self.context.logger.debug("No need to do checkpoint.") + return # create checkpoint sql = "CHECKPOINT;" for i in range(10): @@ -1703,6 +1807,10 @@ class UpgradeImpl: if not self.doInplaceBinaryRollback(): self.exitWithRetCode(const.ACTION_AUTO_ROLLBACK, False) try: + if self.context.action == const.ACTION_LARGE_UPGRADE and \ + "dual-standby" not in self.context.clusterType: + # check the cluster pressure + self.HASyncReplayCheck() self.checkUpgrade() # 3. 
before do binary upgrade, we must make sure the cluster is @@ -2278,6 +2386,9 @@ class UpgradeImpl: output: NA """ self.context.logger.debug("Preparing upgrade sql folder.") + if self.context.standbyCluster: + self.context.logger.debug("no need prepare upgrade sql folder under force upgrade") + return hosts = self.context.clusterNodes cmd = "%s -t %s -U %s --upgrade_bak_path=%s -X %s -l %s" % \ (OMCommand.getLocalScript("Local_Upgrade_Utility"), @@ -2309,6 +2420,10 @@ class UpgradeImpl: self.context.logger.debug("Start to wait and check if all the standby" " instances have replayed all xlogs, host: %s" % \ host.hostname) + if self.context.standbyCluster or self.context.forceRollback: + self.context.logger.debug("no need to do HA sync replay check " + "under force upgrade/rollback and standby cluster mode") + return self.doReplay(catchupFailedOk, host) self.context.logger.debug("Successfully performed the replay check " "of the standby instance.") @@ -2754,10 +2869,11 @@ class UpgradeImpl: """ self.context.logger.debug("Get database list in cluster.") sql = "select datname from pg_database;" + mode = True if "dual-standby" in self.context.clusterType else False (status, output) = ClusterCommand.remoteSQLCommand( sql, self.context.user, self.dnInst.hostname, self.dnInst.port, False, - DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True) + DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True, maintenance_mode=mode) if status != 0: raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql + " Error: \n%s" % str(output)) @@ -2777,12 +2893,13 @@ class UpgradeImpl: make checkpoint :return: """ + mode = True if "dual-standby" in self.context.clusterType else False sql = 'CHECKPOINT;' for eachdb in database_list: (status, output) = ClusterCommand.remoteSQLCommand( sql, self.context.user, self.dnInst.hostname, self.dnInst.port, False, - eachdb, IsInplaceUpgrade=True) + eachdb, IsInplaceUpgrade=True, maintenance_mode=mode) if status != 0: raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql + " Error: \n%s" % str(output)) @@ -3335,6 +3452,7 @@ class UpgradeImpl: input : NA output: NA """ + self.checkDualClusterCommit() try: (status, output) = self.doHealthCheck(const.OPTION_POSTCHECK) if status != 0: @@ -3351,22 +3469,27 @@ class UpgradeImpl: # for the reenter commit, the schema may have been deleted if self.existTable(const.RECORD_NODE_STEP): self.recordNodeStep(GreyUpgradeStep.STEP_BEGIN_COMMIT) + self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_COMMIT) + self.setActionFile() if self.context.action == const.ACTION_LARGE_UPGRADE: if DefaultValue.get_cm_server_num_from_static(self.context.clusterInfo) > 0: self.setUpgradeFromParam(const.UPGRADE_UNSET_NUM) self.reloadCmAgent() self.reload_cmserver(is_final=True) - self.setUpgradeMode(0) + if "dual-standby" not in self.context.clusterType: + self.setUpgradeMode(0) time.sleep(10) if self.dropPMKSchema() != 0: raise Exception(ErrorCode.GAUSS_529["GAUSS_52917"]) self.clearOtherToolPackage() self.cleanInstallPath(const.OLD) - self.dropSupportSchema() - self.cleanBinaryUpgradeBakFiles() + if "dual-standby" not in self.context.clusterType: + self.dropSupportSchema() self.cleanConfBakOld() + self.recordDualClusterStage(self.newCommitId, DualClusterStage.STEP_UPGRADE_END) + self.cleanBinaryUpgradeBakFiles() # remove tmp global relmap file self.cleanTmpGlobalRelampFile() self.context.logger.log("Commit upgrade succeeded.") @@ -3383,6 +3506,9 @@ class UpgradeImpl: """ try: self.context.logger.debug("Start to drop schema PMK.") + if 
self.context.standbyCluster: + self.context.logger.debug("no need to delete schema PMK in standby cluster mode.") + return 0 # execute drop commands by the CN instance sql = "DROP SCHEMA IF EXISTS pmk CASCADE; " retry_times = 0 @@ -3448,7 +3574,10 @@ class UpgradeImpl: try: self.distributeXml() if action == const.ACTION_AUTO_ROLLBACK: + self.checkDualClusterRollback() self.clearOtherToolPackage(action) + self.recordDualClusterStage(self.oldCommitId, + DualClusterStage.STEP_UPGRADE_UNFINISHED) try: self.getOneDNInst(True) except Exception as e: @@ -3475,12 +3604,14 @@ class UpgradeImpl: # consider if need to sync them, not important # under force upgrade, only read step from file maxStep = self.getNodeStep() + self.checkDualClusterRollback() # if -2, it means there is no need to exec rollback # if under upgrade continue mode, it will do upgrade not rollback, # it can enter the upgrade process # when the binary_upgrade bak dir has some files if maxStep == const.BINARY_UPGRADE_NO_NEED_ROLLBACK: self.cleanBinaryUpgradeBakFiles(True) + self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END) self.context.logger.log("No need to rollback.") return True @@ -3498,6 +3629,7 @@ class UpgradeImpl: self.recordNodeStep( GreyUpgradeStep.STEP_UPDATE_POST_CATALOG, nodes) maxStep = self.getNodeStep() + self.checkDualClusterRollback() if maxStep == GreyUpgradeStep.STEP_UPDATE_POST_CATALOG: self.context.logger.debug( "Record the step %d to mark it has leaved pre-commit" @@ -3506,7 +3638,8 @@ class UpgradeImpl: if self.context.action == const.ACTION_LARGE_UPGRADE\ and \ self.isNodeSpecifyStep( - GreyUpgradeStep.STEP_UPDATE_POST_CATALOG): + GreyUpgradeStep.STEP_UPDATE_POST_CATALOG)\ + and "dual-standby" not in self.context.clusterType: self.prepareUpgradeSqlFolder() self.prepareSql("rollback-post") self.setUpgradeMode(2) @@ -3538,7 +3671,8 @@ class UpgradeImpl: self.recordNodeStep(GreyUpgradeStep.STEP_UPDATE_CATALOG) if maxStep >= GreyUpgradeStep.STEP_UPDATE_CATALOG and\ self.context.action == const.ACTION_LARGE_UPGRADE: - self.rollbackCatalog() + if "dual-standby" not in self.context.clusterType: + self.rollbackCatalog() self.recordNodeStep(GreyUpgradeStep.STEP_INIT_STATUS) if maxStep >= GreyUpgradeStep.STEP_INIT_STATUS: @@ -3546,8 +3680,10 @@ class UpgradeImpl: # dir will create in every node self.cleanInstallPath(const.NEW) self.getOneDNInst() - self.dropSupportSchema() + if "dual-standby" not in self.context.clusterType: + self.dropSupportSchema() self.initOmRollbackProgressFile() + self.recordDualClusterStage(self.oldCommitId, DualClusterStage.STEP_UPGRADE_END) self.cleanBinaryUpgradeBakFiles(True) self.cleanTmpGlobalRelampFile() except Exception as e: @@ -3621,28 +3757,6 @@ class UpgradeImpl: """ self.checkActionInFile() - def execSqlCommandInPrimaryDN(self, sql, retryTime=3): - self.context.logger.debug("Start to exec sql {0}.".format(sql)) - count = 0 - status, output = 1, "" - while count < retryTime: - - self.context.logger.debug( - "Exec sql in dn node {0}".format(self.dnInst.hostname)) - (status, output) = ClusterCommand.remoteSQLCommand( - sql, self.context.user, - self.dnInst.hostname, self.dnInst.port, False, - DefaultValue.DEFAULT_DB_NAME, IsInplaceUpgrade=True) - self.context.logger.debug( - "Exec sql result is, status:{0}, output is {1}".format( - status, output)) - if status != 0 or SqlResult.findErrorInSql(output): - count += 1 - continue - else: - break - return status, output - def checkActionInFile(self): """ function: check whether current action is same @@ 
-3884,11 +3998,12 @@ class UpgradeImpl: check a table exist :return: """ + mode = True if "dual-standby" in self.context.clusterType else False sql = "select count(*) from pg_class where relname = '%s';" % name (status, output) = ClusterCommand.remoteSQLCommand( sql, self.context.user, self.dnInst.hostname, self.dnInst.port, False, - eachdb, IsInplaceUpgrade=True) + eachdb, IsInplaceUpgrade=True, maintenance_mode=mode) if status != 0 or SqlResult.findErrorInSql(output): raise Exception(ErrorCode.GAUSS_513["GAUSS_51300"] % sql + " Error: \n%s" % str(output)) @@ -4832,7 +4947,7 @@ class UpgradeImpl: self.context.logger.log( "Failed to check upgrade environment.", "constant") raise Exception(str(e)) - + self.checkDualClusterUpgrade() self.context.logger.log( "Successfully checked upgrade environment.", "constant") @@ -5618,6 +5733,9 @@ class UpgradeImpl: try: # clean backup files self.cleanBackupFiles() + # clean gs_secure_files folder + if self.context.rollback or self.action == "commit-upgrade": + self.clean_gs_secure_files() except Exception as e: raise Exception(str(e)) if (isRollBack): @@ -5838,6 +5956,7 @@ class UpgradeImpl: 1 failed """ self.context.logger.debug("Start to check database connection.") + mode = True if "dual-standby" in self.context.clusterType else False for dbNode in self.context.clusterInfo.dbNodes: if len(dbNode.datanodes) == 0 or dbNode.name: continue @@ -5848,7 +5967,7 @@ class UpgradeImpl: ClusterCommand.remoteSQLCommand( sql, self.context.user, dnInst.hostname, dnInst.port, False, DefaultValue.DEFAULT_DB_NAME, - IsInplaceUpgrade=True) + IsInplaceUpgrade=True, maintenance_mode=mode) if status != 0 or not output.isdigit(): self.context.logger.debug( "Failed to execute SQL on [%s]: %s. Error: \n%s" % @@ -6272,3 +6391,501 @@ class UpgradeImpl: packFilePath) except Exception as e: raise Exception(str(e)) + + def getPrimaryDN(self, checkNormal): + """ + find primary dn in centralized cluster, which we can execute SQL commands + """ + try: + self.context.logger.debug("start to get primary dn. \n" + "checkNormal is {0}.".format(checkNormal)) + if self.context.standbyCluster or self.context.forceRollback: + checkNormal = False + primaryDn = None + if not checkNormal: + clusterNodes = self.context.oldClusterInfo.dbNodes + for dbNode in clusterNodes: + if len(dbNode.datanodes) == 0: + continue + primaryDn = dbNode.datanodes[0] + break + self.primaryDn = primaryDn + else: + primaryList, _ = DefaultValue.getPrimaryNode(self.context.userProfile, self.context.logger) + if primaryList: + primaryDn = primaryList[0] + if not primaryDn: + raise Exception(ErrorCode.GAUSS_526["GAUSS_52635"]) + for dbNode in self.context.clusterInfo.dbNodes: + for dn in dbNode.datanodes: + if dn.hostname == primaryDn: + self.primaryDn = dn + self.context.logger.debug("Successfully get primary DN from " + "{0}.".format(self.primaryDn.hostname)) + except Exception as er: + self.context.logger.debug("Failed to get Primary dn. 
Error: %s" % str(er)) + raise Exception(ErrorCode.GAUSS_516["GAUSS_51601"] % "primary dn") + + def getPrimaryNode(self, instanceType): + """ + + :param instanceType: + :return: + """ + try: + self.waitClusterNormalDegrade(waitTimeOut=120) + self.context.logger.debug("Start to get primary node.") + postSplit = "" + primaryFlag = "Primary" + count = 0 + cmd, status, output = "", 0, "" + while count < 60: + cmd = "source {0} && cm_ctl query -Cv".format(self.context.userProfile) + (status, output) = CmdUtil.retryGetstatusoutput(cmd, 3, 5) + # no need to retry under force upgrade + if status == 0: + break + time.sleep(2) + count += 1 + if status != 0: + raise Exception( + ErrorCode.GAUSS_514["GAUSS_51400"] % "%s. Error:\n%s" % (cmd, output)) + self.context.logger.debug("the result of query is {0}, " + "instanceType is {1}.".format(output, instanceType)) + targetString = output.split(instanceType)[1] + if instanceType == "Datanode": + dnPrimary = [x for x in re.split(r"[|\n]", targetString) if primaryFlag in x + or "Main" in x] + primaryList = [] + for dn in dnPrimary: + primaryList.append(list(filter(None, dn.split(" ")))[1]) + return primaryList + if instanceType == "ETCD": + postSplit = "Cluster" + primaryFlag = "StateLeader" + elif instanceType == "CMServer": + postSplit = "ETCD" + elif instanceType == "GTM": + postSplit = "Datanode" + elif instanceType == "Coordinator": + return "" + if postSplit not in targetString: + return "" + primaryInfo = [x for x in re.split(r"[|\n]", targetString.split(postSplit)[0]) if + primaryFlag in x] + if primaryInfo == "" or primaryInfo == []: + return "" + primary = list(filter(None, primaryInfo[0].split(" ")))[1] + self.context.logger.debug("get node {0}".format(primary)) + return primary + except Exception as er: + self.context.logger.debug("Failed to get primary node." + str(er)) + raise Exception(str(er)) + + def isGucContainDesignatedVal(self, gucName, result): + """ + The guc value contains the designated string. 
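+        Runs "show <gucName>;" through execSqlCommandInPrimaryDN (in standby
+        mode when the cluster is detected as a disaster cluster) and returns
+        True if <result> appears in the query output.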
+ :return: + """ + sql = "show {0};".format(gucName) + self.getPrimaryDN(True) + mode = "primary" + is_disaster = DefaultValue.cm_exist_and_is_disaster_cluster(self.context.clusterInfo, + self.context.logger) + if is_disaster: + mode = "standby" + (_, output) = self.execSqlCommandInPrimaryDN(sql, mode=mode) + if result in output: + return True + else: + return False + + def execSqlCommandInPrimaryDN(self, sql, retryTime=3, execHost=None, mode="primary"): + """ + execute sql on primary dn + :return: + """ + self.context.logger.debug("Start to exec sql {0}.".format(sql)) + count = 0 + status, output = 1, "" + mode = True if "dual-standby" in self.context.clusterType or mode == "standby" else False + while count < retryTime: + if not execHost: + self.getPrimaryDN(checkNormal=True) + execHost = self.primaryDn + self.context.logger.debug("Exec sql in dn node {0}".format(execHost.hostname)) + (status, output) = ClusterCommand.remoteSQLCommand(sql, self.context.user, + execHost.hostname, execHost.port, + False, + DefaultValue.DEFAULT_DB_NAME, + IsInplaceUpgrade=True, + maintenance_mode=mode) + self.context.logger.debug("Exec sql result " + "is, status:{0}, output is {1}" + "".format(status, output).replace("ERROR", "Log")) + if status != 0 or SqlResult.findErrorInSql(output): + count += 1 + continue + else: + break + return status, output + + def initClusterType(self): + """ + If it is a dual cluster, initialize whether the current cluster + is the primary cluster or the standby cluster + + :return: + """ + # The value of replconninfo1 must contain 'iscascade' in the DR cluster. + isStrDRCluster = self.isGucContainDesignatedVal("replconninfo1", "iscascade") + if isStrDRCluster: + suffix = "-streamDR" + else: + self.context.logger.debug("Current cluster is not dual cluster.") + return + + if self.context.is_inplace_upgrade and self.context.action \ + not in ["commit-upgrade", "auto-rollback", "chose-strategy"]: + raise Exception("Dual cluster does not support in-place upgrade") + + if self.checkGucValIsInValGiven(const.DISASTER_RECOVERY_GUC, ["2"], fromFile=True): + self.context.standbyCluster = True + self.context.clusterType = "dual-standby" + suffix + + elif self.checkGucValIsInValGiven(const.DISASTER_RECOVERY_GUC, ["0"], fromFile=True): + self.context.clusterType = "dual-primary" + suffix + + self.context.logger.log("NOTICE: the clusterType is {0}".format(self.context.clusterType)) + + if not self.context.is_inplace_upgrade: + self.backup_disaster_user_file() + + if self.context.forceRollback: + return + self.copyStandbyClusterUpgradeFile() + + upgradeInfoTmp = self.context.getDualUpgradeInfo(self.context.upgradePhaseInfoPath, 0) + if upgradeInfoTmp is not None: + if "dual-standby" in self.context.clusterType: + self.context.dualUpgradeShareInfo.masterVersion = upgradeInfoTmp.masterVersion + self.context.dualUpgradeShareInfo.masterUpgradeStatus = \ + upgradeInfoTmp.masterUpgradeStatus + else: + self.context.dualUpgradeShareInfo.standbyVersion = upgradeInfoTmp.standbyVersion + self.context.dualUpgradeShareInfo.standbyUpgradeStatus = \ + upgradeInfoTmp.standbyUpgradeStatus + + self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo, + filePath=self.context.upgradePhaseInfoPath, + startPost=0) + + def checkGucValIsInValGiven(self, gucName, valList, fromFile=False): + """ + Checks whether a given parameter is a given value list in a given instance list. 
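+        Builds a "<gucName>:<val1>,<val2>" string for checkParam and returns
+        True when the check passes, False otherwise.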
+ """ + self.context.logger.debug("checks whether the parameter:{0} is " + "the value:{1}.".format(gucName, valList)) + gucStr = "{0}:{1}".format(gucName, ",".join(valList)) + try: + self.checkParam(gucStr, fromFile) + self.context.logger.debug("Success to check the parameter:{0} value is " + "in the value:{1}.".format(gucName, valList)) + return True + except Exception as _: + return False + + def copyStandbyClusterUpgradeFile(self): + """ + From the data directory of the standby cluster, copy the upgrade_phase_info file + to the designated instance directory of the primary cluster, and distribute it + to the upgrade backup directory of all nodes + """ + hardUser, hardUserPwd = self.getDisasterRecoveryUser() + if hardUser is None or hardUser == "" or hardUserPwd is None or hardUserPwd == "": + raise Exception("Failed to obtain the streaming disaster build user") + dnInstance = None + for x in range(1, 9): + localRemoteInfo = self.getLocalRemoteHostIpAndPort("{0}{1}".format( + const.REMOTE_INFO_GUC[self.context.clusterType], x)) + for dbNode in self.context.clusterInfo.dbNodes: + for dnInst in dbNode.datanodes: + self.context.logger.debug("The instance is {0}".format(dnInst.__dict__)) + if "-streamDR" in self.context.clusterType: + dataIp = DefaultValue.get_data_ip_info(dnInst, self.context.logger) + if localRemoteInfo.get("localhost") in dataIp and \ + localRemoteInfo.get("localport") == str(dnInst.haPort).strip(): + dnInstance = copy.deepcopy(dnInst) + break + if dnInstance is not None: + try: + self.copyAndDistributeUpgradeFile(dnInstance, localRemoteInfo) + except Exception as err: + self.context.logger.error("Cope file failed msg:%s." % err) + dnInstance = None + continue + break + if dnInstance is None: + raise Exception("Unable to find a DN to connect to the standby cluster node") + + def checkDualClusterUpgrade(self): + """ + Double cluster check whether it can be upgrade + + :return: + """ + if "dual-standby-streamDR" not in self.context.clusterType or \ + self.context.action == const.ACTION_SMALL_UPGRADE: + return + self.context.logger.debug("The status of the dual-cluster standby status is {0}, version " + "is {1}. The status of the dual-cluster master status is {2}, " + "version is {3}".format( + self.context.dualUpgradeShareInfo.standbyUpgradeStatus, + self.context.dualUpgradeShareInfo.standbyVersion, + self.context.dualUpgradeShareInfo.masterUpgradeStatus, + self.context.dualUpgradeShareInfo.masterVersion)) + + if self.context.dualUpgradeShareInfo.masterUpgradeStatus < 2 or \ + self.context.dualUpgradeShareInfo.masterVersion != self.newCommitId: + raise Exception("The status of the dual-cluster master is {0}. " + "the standby cluster cannot be upgrade." 
+ .format(self.context.dualUpgradeShareInfo.masterUpgradeStatus)) + + def recordDualClusterStage(self, commitVersion, upgradeStage): + """ + Record the upgrade information of the dual cluster + + :param commitVersion: + :param upgradeStage: + :return: + """ + if "dual-primary" in self.context.clusterType: + self.context.dualUpgradeShareInfo.masterVersion = commitVersion + self.context.dualUpgradeShareInfo.masterUpgradeStatus = upgradeStage + elif "dual-standby" in self.context.clusterType: + self.context.dualUpgradeShareInfo.standbyVersion = commitVersion + self.context.dualUpgradeShareInfo.standbyUpgradeStatus = upgradeStage + else: + return + self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo, + filePath=self.context.upgradePhaseInfoPath, startPost=0) + + def checkDualClusterRollback(self): + """ + Double cluster check whether it can be rollback + + :return: + """ + if "dual-standby" in self.context.clusterType or \ + "dual-" not in self.context.clusterType: + return + self.context.logger.debug("The status of the dual-cluster standby status is {0}, version " + "is {1}. The status of the dual-cluster master status is {2}, " + "version is {3}".format( + self.context.dualUpgradeShareInfo.standbyUpgradeStatus, + self.context.dualUpgradeShareInfo.standbyVersion, + self.context.dualUpgradeShareInfo.masterUpgradeStatus, + self.context.dualUpgradeShareInfo.masterVersion)) + if not self.context.rollback or \ + "dual-primary" in self.context.clusterType or \ + self.context.action == const.ACTION_SMALL_UPGRADE or self.context.forceRollback: + return + # master cluster + if "dual-primary" in self.context.clusterType: + if (self.context.dualUpgradeShareInfo.standbyUpgradeStatus > 2 or + self.context.dualUpgradeShareInfo.standbyUpgradeStatus == 0) and \ + self.context.dualUpgradeShareInfo.standbyVersion == self.newCommitId: + raise Exception("The status of the dual-cluster standby is {0}. " + "the master cluster cannot be rolled back." + .format(self.context.dualUpgradeShareInfo.standbyUpgradeStatus)) + + def checkDualClusterCommit(self): + """ + Double cluster check whether it can be submitted + + :return: + """ + if "dual-" not in self.context.clusterType: + return + if self.context.action == const.ACTION_SMALL_UPGRADE: + return + self.context.logger.debug("The status of the dual-cluster standby status is {0}, version " + "is {1}. The status of the dual-cluster master status is {2}, " + "version is {3}".format( + self.context.dualUpgradeShareInfo.standbyUpgradeStatus, + self.context.dualUpgradeShareInfo.standbyVersion, + self.context.dualUpgradeShareInfo.masterUpgradeStatus, + self.context.dualUpgradeShareInfo.masterVersion)) + # master cluster + if "dual-primary" in self.context.clusterType: + if self.context.dualUpgradeShareInfo.standbyUpgradeStatus != 0 or \ + self.context.dualUpgradeShareInfo.standbyVersion != self.newCommitId: + raise Exception("The status of the dual-cluster standby status is {0}, " + "version is {1}. the master cluster cannot be commit." + .format(self.context.dualUpgradeShareInfo.standbyUpgradeStatus, + self.context.dualUpgradeShareInfo.standbyVersion)) + if "dual-standby" in self.context.clusterType: + if self.context.dualUpgradeShareInfo.masterUpgradeStatus != 2 or \ + self.context.dualUpgradeShareInfo.masterVersion != self.newCommitId: + raise Exception("The status of the dual-cluster master status is {0}, " + "version is {1}. The standby cluster cannot be commit." 
+                                .format(self.context.dualUpgradeShareInfo.masterUpgradeStatus,
+                                        self.context.dualUpgradeShareInfo.masterVersion))
+
+    def copyDirFromRemoteNode(self, remoteHost, remoteDir, targetHost, targetDir):
+        """
+        SSH to the remote node and copy a directory from it to the specified node
+
+        :param remoteHost:
+        :param remoteDir:
+        :param targetHost:
+        :param targetDir:
+        :return:
+        """
+        scpcmd = "pssh -s -H {0} 'source {5}; if [ -d '{1}' ];" \
+                 "then pscp -r -H {2} {3} {4}; fi' ".format(remoteHost, remoteDir, targetHost,
+                                                            remoteDir, targetDir,
+                                                            self.context.userProfile)
+        (status, output) = CmdUtil.retryGetstatusoutput(scpcmd, 2, 5)
+        if status != 0:
+            raise Exception("File copy failed. Output: {0}".format(output))
+
+    def getLocalRemoteHostIpAndPort(self, gucName):
+        """
+        Get the DN instance and the corresponding standby cluster host and port through the
+        cross_cluster_replconninfo parameter
+        :param gucName: cross_cluster_replconninfo parameter name
+        :return: {"localhost":"", "localport":"", "remotehost":"", "remoteport":""}
+        """
+        isLocal = False
+        localRemoteInfo = dict()
+        sql = "show {0};".format(gucName)
+        self.getPrimaryDN(False)
+        (status, output) = self.execSqlCommandInPrimaryDN(sql)
+        if status != 0 or output == "":
+            raise Exception("Failed to get the value of GUC parameter {0}. Output: {1}".format(gucName,
+                                                                                               output))
+        localIp = output.split("localhost=")[1].split("localport=")[0].strip()
+        remoteIp = output.split("remotehost=")[1].split("remoteport=")[0].strip()
+
+        self.context.logger.debug("Successfully got the output: {0}".format(output))
+
+        if "-streamDR" in self.context.clusterType:
+            localPort = output.split("localport=")[1].split("localheartbeatport=")[0].strip()
+            remotePort = output.split("remoteport=")[1].split("remoteheartbeatport=")[0].strip()
+
+        for dbNode in self.context.clusterInfo.dbNodes:
+            if isLocal:
+                break
+            for dnInst in dbNode.datanodes:
+                if remoteIp in dnInst.listenIps or remoteIp in dnInst.hostname:
+                    isLocal = True
+                    break
+        self.context.logger.debug("The local flag is {0}".format(isLocal))
+
+        if isLocal:
+            localRemoteInfo.setdefault("localhost", "no find remote host")
+        else:
+            localRemoteInfo.setdefault("localhost", localIp)
+
+        localRemoteInfo.setdefault("localport", localPort)
+        localRemoteInfo.setdefault("remotehost", remoteIp)
+        localRemoteInfo.setdefault("remoteport", remotePort)
+        return localRemoteInfo
+
+    def copyAndDistributeUpgradeFile(self, dnInstance, localRemoteInfo):
+        """
+        Copy the upgrade file from the standby cluster and distribute it
+        :return:
+        """
+        hardUser, hardUserPwd = self.getDisasterRecoveryUser()
+        cmd_remote = 'pssh -s -H {0} \'source {8}; gs_ctl build -D {1} -b copy_upgrade_file ' \
+                     '-Z datanode -U {2} -P "{3}" -C "localhost={4} localport={5} remotehost={6} ' \
+                     'remoteport={7}"\''.format(dnInstance.hostname,
+                                                dnInstance.datadir,
+                                                hardUser,
+                                                hardUserPwd,
+                                                localRemoteInfo.get("localhost"),
+                                                localRemoteInfo.get("localport"),
+                                                localRemoteInfo.get("remotehost"),
+                                                localRemoteInfo.get("remoteport"),
+                                                self.context.userProfile)
+
+        cmd_remote = cmd_remote.replace(" -Z datanode", "")
+
+        self.context.logger.debug("Copy upgrade file with cmd: {0}.".
+                                  format(cmd_remote.replace(hardUserPwd, "***")))
+        status, output = DefaultValue.getstatusoutput_hide_pass(cmd_remote)
+        if status == 0:
+            self.context.logger.debug("Successfully copied upgrade file")
+        else:
+            raise Exception("Failed to copy files from the standby cluster. "
+                            "Ensure that the standby cluster version supports this function. "
" + "Output: {0}".format(output)) + + remoteUpgradeInfoPath = os.path.join(dnInstance.datadir, const.UPGRADE_PHASE_INFO) + self.copyFileFromRemoteNode(dnInstance.hostname, remoteUpgradeInfoPath, + NetUtil.GetHostIpOrName(), + self.context.upgradePhaseInfoPath) + if not os.path.exists(self.context.upgradePhaseInfoPath): + FileUtil.createFile(self.context.upgradePhaseInfoPath, + mode=DefaultValue.KEY_FILE_MODE) + self.context.updateDualUpgradeInfo(self.context.dualUpgradeShareInfo, + filePath=self.context.upgradePhaseInfoPath, + startPost=0) + + self.context.sshTool.scpFiles(self.context.upgradePhaseInfoPath, + self.context.tmpDir, + hostList=self.context.clusterNodes) + + def getDisasterRecoveryUser(self): + """ + Obtain special users of the streaming disaster recovery cluster for building + :return: user name + """ + mode = True if "dual-standby" in self.context.clusterType else False + user_str = DefaultValue.obtain_hadr_user_encrypt_str( + self.context.clusterInfo, self.context.user, self.context.logger, mode) + rand_pwd = DefaultValue.decrypt_hadr_rand_pwd(self.context.logger) + params = rand_pwd, user_str, self.context.clusterInfo, self.context.user, \ + self.context.logger, mode + hardUser, hardUserPwd = DefaultValue.decrypt_hadr_user_info(params) + return hardUser, hardUserPwd + + def copyFileFromRemoteNode(self, remoteHost, remoteFile, targetHost, targetFile): + """ + SSH to the remote node, copy files from the remote node to the specified node + + :param remoteHost: + :param remoteFile: + :param targetHost: + :param targetFile: + :return: + """ + scpcmd = "pssh -s -H {0} 'source {5}; if [ -f '{1}' ];" \ + "then pscp -H {2} {3} {4}; fi' ".format(remoteHost, remoteFile, targetHost, + remoteFile, targetFile, + self.context.userProfile) + (status, output) = CmdUtil.retryGetstatusoutput(scpcmd, 2, 5) + if status != 0: + raise Exception("File copy failed. Output: {0}".format(output)) + + def clean_gs_secure_files(self): + """ + delete gs_secure_files during rollback or commit + """ + try: + self.context.logger.debug( + "Starting to clean gs_secure_files folder in the dn data catalog.") + cmd = "%s -t %s -U %s -l %s" % \ + (OMCommand.getLocalScript("Local_Upgrade_Utility"), + const.ACTION_CLEAN_GS_SECURE_FILES, + self.context.user, + self.context.localLog) + self.context.logger.debug("clean gs_secure_files folder:{0}".format(cmd)) + host_list = copy.deepcopy(self.context.clusterNodes) + self.context.execCommandInSpecialNode(cmd, host_list) + except Exception as er: + raise Exception(str(er)) + self.context.logger.debug( + "Successfully to clean gs_secure_files folder in the dn data catalog.") diff --git a/script/local/ConfigHba.py b/script/local/ConfigHba.py index 3f21c9e..2dce53d 100644 --- a/script/local/ConfigHba.py +++ b/script/local/ConfigHba.py @@ -56,6 +56,7 @@ class CmdOptions(): self.removeIps = [] self.addIps = [] self.dws_mode = False + self.try_reload = False def usage(): @@ -75,6 +76,7 @@ General options: -r the signal about ignorepgHbaMiss --remove-ip Remove ip address from pg_hba.conf --add-ip Add ip address to pg_hba.conf + --try-reload Try reload guc params if can not set --help Show help information for this utility, and exit the command line mode. 
""" @@ -88,7 +90,7 @@ def parseCommandLine(): try: opts, args = getopt.getopt(sys.argv[1:], "U:X:l:r", ["remove-ip=", "help", "dws-mode", - "add-ip="]) + "add-ip=", "try-reload"]) except Exception as e: usage() GaussLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50000"] % str(e)) @@ -116,6 +118,8 @@ def parseCommandLine(): g_opts.removeIps.append(value) elif (key == "--dws-mode"): g_opts.dws_mode = True + elif key == "--try-reload": + g_opts.try_reload = True elif (key == "--add-ip"): g_opts.addIps = value.split(',') Parameter.checkParaVaild(key, value) @@ -154,7 +158,7 @@ class ConfigHba(LocalBaseOM): """ def __init__(self, logFile, user, clusterConf, dwsMode=False, - ignorepgHbaMiss=False, removeIps=None): + ignorepgHbaMiss=False, removeIps=None, try_reload=False): """ function: configure all instance on local node """ @@ -178,6 +182,7 @@ class ConfigHba(LocalBaseOM): if removeIps is None: removeIps = [] self.removeIps = removeIps + self.try_reload = try_reload def getAllIps(self): """ @@ -220,6 +225,16 @@ class ConfigHba(LocalBaseOM): except Exception as e: raise Exception(str(e)) + def remove_streaming_config(self, component): + """ + remove dn & cn pg_hba for streaming stop + """ + ip_segment_list = list(set(['.'.join( + remove_ip.split('.')[:2]) + ".0.0/16" for remove_ip in self.removeIps])) + for ip_segment in ip_segment_list: + ip_remove_str = "-h \"host replication all %s\" " % ip_segment + component.doGUCConfig("set", ip_remove_str, True) + def __configAnInstance(self, component): """ function: set hba config for single component @@ -245,9 +260,10 @@ class ConfigHba(LocalBaseOM): self.logger.debug("The %s does not exist." % hbaFile) return - component.setPghbaConfig(self.allIps) + component.setPghbaConfig(self.allIps, try_reload=self.try_reload) if len(self.removeIps) != 0: component.removeIpInfoOnPghbaConfig(self.removeIps) + self.remove_streaming_config(component) if __name__ == '__main__': @@ -266,7 +282,7 @@ if __name__ == '__main__': # modify Instance configer = ConfigHba(g_opts.logFile, g_opts.clusterUser, g_opts.clusterConf, g_opts.dws_mode, - g_opts.ignorepgHbaMiss, g_opts.removeIps) + g_opts.ignorepgHbaMiss, g_opts.removeIps, g_opts.try_reload) configer.configHba() except Exception as e: diff --git a/script/local/UpgradeUtility.py b/script/local/UpgradeUtility.py index 734e22e..01e5e89 100644 --- a/script/local/UpgradeUtility.py +++ b/script/local/UpgradeUtility.py @@ -2152,6 +2152,35 @@ def backupHotpatch(): for dbInstance in g_dbNode.gtms: backupInstanceHotpatchConfig(dbInstance.datadir) +def clean_gs_secure_files(): + """ + clean gs_secure_files folder + """ + pool = ThreadPool(DefaultValue.getCpuSet()) + pool.map(clean_stream_gs_secure, g_dbNode.datanodes) + pool.close() + pool.join() + + +def clean_stream_gs_secure(dn_inst): + """ + clean gs secure dir + """ + temp_dir = EnvUtil.getTmpDirFromEnv() + file_path = os.path.join(dn_inst.datadir, "gs_secure_files") + cmd = "(if [ -d '%s' ]; then rm -rf '%s'; fi) && " % (file_path, file_path) + cmd += "(if [ -f '%s/upgrade_phase_info' ]; then rm -f '%s/upgrade_phase_info'; " \ + "fi) &&" % (temp_dir, temp_dir) + cmd += "(if [ -f '%s/hadr.key.cipher' ]; then rm -f '%s/hadr.key.cipher'; " \ + "fi) &&" % (temp_dir, temp_dir) + cmd += "(if [ -f '%s/hadr.key.rand' ]; then rm -f '%s/hadr.key.rand'; " \ + "fi) &&" % (temp_dir, temp_dir) + cmd += "(if [ -d '%s/gs_secure_files' ]; then rm -f '%s/gs_secure_files'; " \ + "fi)" % (temp_dir, temp_dir) + g_logger.debug("Starting clean instance %s gs secure dir, cmd:%s." 
+                   % (dn_inst.instanceId, cmd))
+    CmdExecutor.execCommandLocally(cmd)
+    g_logger.debug("Successfully cleaned instance %s gs secure dir." % dn_inst.instanceId)
+
 
 def rollbackInstanceHotpatchConfig(instanceDataDir):
     """
@@ -4720,6 +4749,7 @@ def main():
         const.ACTION_GREY_SYNC_GUC: greySyncGuc,
         const.ACTION_GREY_UPGRADE_CONFIG_SYNC: greyUpgradeSyncConfig,
         const.ACTION_SWITCH_DN: switchDnNodeProcess,
+        const.ACTION_CLEAN_GS_SECURE_FILES: clean_gs_secure_files,
         const.ACTION_GET_LSN_INFO: getLsnInfo,
         const.ACTION_GREY_RESTORE_CONFIG: greyRestoreConfig,
         const.ACTION_GREY_RESTORE_GUC: greyRestoreGuc,