From 28127d61e0277ea88bdc2881fddb3c06232fc67c Mon Sep 17 00:00:00 2001 From: l30049514 Date: Mon, 29 Jan 2024 12:53:30 +0800 Subject: [PATCH] =?UTF-8?q?cm=E6=9A=82=E5=81=9C=E9=80=82=E9=85=8D=E8=B5=84?= =?UTF-8?q?=E6=BA=90=E6=B1=A0=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/cm_agent/cma_global_params.cpp | 2 + src/cm_agent/cma_instance_management.cpp | 2 +- src/cm_agent/cma_instance_management_res.cpp | 2 +- src/cm_agent/cma_main.cpp | 11 +++ src/cm_client/cm_client.cpp | 22 ++++++ src/cm_client/cm_client_api.cpp | 4 ++ src/cm_ctl/cm_ctl.cpp | 6 ++ src/cm_ctl/ctl_restart.cpp | 1 + src/cm_ctl/ctl_start.cpp | 70 ++++++++++++++++++-- src/cm_server/cms_rhb.cpp | 4 +- src/include/cm/cm_agent/cma_global_params.h | 2 + src/include/cm/cm_agent/cma_main.h | 1 + src/include/cm/cm_client/cm_client.h | 3 + src/include/cm/cm_ctl/ctl_common.h | 1 + 14 files changed, 123 insertions(+), 8 deletions(-) diff --git a/src/cm_agent/cma_global_params.cpp b/src/cm_agent/cma_global_params.cpp index 990f26b..3c9ac30 100644 --- a/src/cm_agent/cma_global_params.cpp +++ b/src/cm_agent/cma_global_params.cpp @@ -232,6 +232,8 @@ char g_enableFenceDn[10] = {0}; #endif bool g_isPauseArbitration = false; char g_cmManualPausePath[MAX_PATH_LEN] = {0}; +bool g_isStarting = false; +char g_cmManualStartingPath[MAX_PATH_LEN] = {0}; bool &GetIsSharedStorageMode() { diff --git a/src/cm_agent/cma_instance_management.cpp b/src/cm_agent/cma_instance_management.cpp index 002e4fd..6942ac2 100644 --- a/src/cm_agent/cma_instance_management.cpp +++ b/src/cm_agent/cma_instance_management.cpp @@ -398,7 +398,7 @@ void start_instance_check(void) StartInstanceAndCheck(CheckAgentNicDown, "[CheckAgentNicDown]"); - if (IsCusResExistLocal()) { + if (IsCusResExistLocal() && !(g_isPauseArbitration && !g_isStarting)) { StartInstanceAndCheck(StartResourceCheck, "[StartResourceCheck]"); } diff --git a/src/cm_agent/cma_instance_management_res.cpp 
b/src/cm_agent/cma_instance_management_res.cpp index 973c795..7792f26 100644 --- a/src/cm_agent/cma_instance_management_res.cpp +++ b/src/cm_agent/cma_instance_management_res.cpp @@ -488,7 +488,7 @@ void StopResourceCheck() if (IsInstManualStopped(g_resConf[i].cmInstanceId)) { OneResInstShutdown(&g_resConf[i]); } - if (CmFileExist(g_cmManualStartPath) || !IsOneResInstWork(g_resConf[i].resName, g_resConf[i].cmInstanceId)) { + if ((!IsOneResInstWork(g_resConf[i].resName, g_resConf[i].cmInstanceId) && !g_isPauseArbitration) || CmFileExist(g_cmManualStartPath)) { OneResInstClean(&g_resConf[i]); } } diff --git a/src/cm_agent/cma_main.cpp b/src/cm_agent/cma_main.cpp index 055c18b..2371eb6 100644 --- a/src/cm_agent/cma_main.cpp +++ b/src/cm_agent/cma_main.cpp @@ -381,6 +381,8 @@ int get_prog_path() securec_check_errno(rc, (void)rc); rc = memset_s(g_cmManualPausePath, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); + rc = memset_s(g_cmManualStartingPath, MAX_PATH_LEN, 0, MAX_PATH_LEN); + securec_check_errno(rc, (void)rc); if (GetHomePath(exec_path, sizeof(exec_path)) != 0) { (void)fprintf(stderr, "Get GAUSSHOME failed, please check.\n"); return -1; @@ -433,6 +435,9 @@ int get_prog_path() rcs = snprintf_s( g_cmManualPausePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CM_CLUSTER_MANUAL_PAUSE); securec_check_intval(rcs, (void)rcs); + rcs = snprintf_s( + g_cmManualStartingPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CM_CLUSTER_MANUAL_STARTING); + securec_check_intval(rcs, (void)rcs); InitClientCrt(exec_path); } @@ -1049,6 +1054,12 @@ void server_loop(void) pauseLogTimes = 0; } + if (access(g_cmManualStartingPath, F_OK) == 0) { + g_isStarting = true; + } else { + g_isStarting = false; + } + (void)clock_gettime(CLOCK_MONOTONIC, &endTime); if (g_isStart) { g_suppressAlarm = true; diff --git a/src/cm_client/cm_client.cpp b/src/cm_client/cm_client.cpp index fac3b2c..ac9539e 100644 --- a/src/cm_client/cm_client.cpp +++ 
b/src/cm_client/cm_client.cpp @@ -41,6 +41,8 @@ static bool g_shutdownClient = false; static SendMsgQueue *g_sendMsg = NULL; static OneResStatList *g_clientStatusList = NULL; static volatile bool g_needReconnect = false; +#define CLUSTER_MANUAL_PAUSE "cluster_manual_pause" +char g_manualPausePath[MAX_PATH_LEN]; static pthread_t *g_conThreadId = NULL; static pthread_t *g_sendThreadId = NULL; @@ -608,6 +610,7 @@ static void InitGlobalVariable(const char *resName) status_t PreInit(uint32 instanceId, const char *resName, CmNotifyFunc func, bool *isFirstInit) { + get_pause_path(); if (isFirstInit) { InitGlobalVariable(resName); CM_RETURN_IFERR(InitLogFile()); @@ -695,3 +698,22 @@ ClientLockResult SendLockMsgAndWaitResult(char *msgPtr, uint32 msgLen) return result; } + +void get_pause_path() +{ + char exec_path[MAX_PATH_LEN] = {0}; + errno_t rc; + int rcs; + + rc = memset_s(g_manualPausePath, MAX_PATH_LEN, 0, MAX_PATH_LEN); + securec_check_errno(rc, (void)rc); + if (GetHomePath(exec_path, sizeof(exec_path)) != 0) { + (void)fprintf(stderr, "Get GAUSSHOME failed, please check.\n"); + return; + } else { + check_input_for_security(exec_path); + rcs = snprintf_s( + g_manualPausePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CLUSTER_MANUAL_PAUSE); + securec_check_intval(rcs, (void)rcs); + } +} diff --git a/src/cm_client/cm_client_api.cpp b/src/cm_client/cm_client_api.cpp index a29100e..f95e8e2 100644 --- a/src/cm_client/cm_client_api.cpp +++ b/src/cm_client/cm_client_api.cpp @@ -73,6 +73,10 @@ ClientCmLockMsg *GetLockSendMsg(const char *lockName, LockOption opt) int ResLockCore(const char *lockName) { + if (access(g_manualPausePath, F_OK) == 0 && strcmp(lockName, "dms_reformer_lock") == 0) { + write_runlog(LOG, "cm is pause, don't lock(%s).\n", lockName); + return 1; + } ClientCmLockMsg *sendMsg = GetLockSendMsg(lockName, CM_RES_LOCK); if (sendMsg == NULL) { write_runlog(ERROR, "generate (%s)lock msg failed.\n", lockName); diff --git
a/src/cm_ctl/cm_ctl.cpp b/src/cm_ctl/cm_ctl.cpp index 82d19e5..c7f1d5c 100644 --- a/src/cm_ctl/cm_ctl.cpp +++ b/src/cm_ctl/cm_ctl.cpp @@ -55,6 +55,7 @@ #define MINORITY_AZ_ARBITRATE "minority_az_arbitrate_hist" #define RESUMING_CN_STOP "resuming_cn_stop" #define CLUSTER_MANUAL_PAUSE "cluster_manual_pause" +#define CLUSTER_MANUAL_STARTING "cluster_manual_starting" char* g_bin_name = NULL; char* g_bin_path = NULL; @@ -121,6 +122,7 @@ static char* hotpatch_path = NULL; char manual_start_file[MAXPGPATH]; char instance_manual_start_file[MAXPGPATH]; +char cluster_manual_starting_file[MAXPGPATH]; char etcd_manual_start_file[MAXPGPATH]; static bool coordinator_dynamic_view = false; #ifndef ENABLE_MULTIPLE_NODES @@ -865,6 +867,9 @@ static void init_ctl_global_variable() ret = snprintf_s( instance_manual_start_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, INSTANCE_MANUAL_START); securec_check_intval(ret, (void)ret); + ret = snprintf_s( + cluster_manual_starting_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, CLUSTER_MANUAL_STARTING); + securec_check_intval(ret, (void)ret); ret = snprintf_s(etcd_manual_start_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, ETCD_MANUAL_START); securec_check_intval(ret, (void)ret); #ifndef ENABLE_MULTIPLE_NODES @@ -2249,6 +2254,7 @@ static void CtlCommandProcessCore(int *status, CtlOption *ctlCtx) #endif case START_COMMAND: *status = do_start(); + removeStartingFile(); break; case CM_SWITCHOVER_COMMAND: *status = DoSwitchover(ctlCtx); diff --git a/src/cm_ctl/ctl_restart.cpp b/src/cm_ctl/ctl_restart.cpp index 0d3aab8..d219079 100644 --- a/src/cm_ctl/ctl_restart.cpp +++ b/src/cm_ctl/ctl_restart.cpp @@ -52,6 +52,7 @@ extern char result_path[MAXPGPATH]; extern char* g_command_operation_lcName; extern uint32 g_nodeId; extern char instance_manual_start_file[MAXPGPATH]; +extern char cluster_manual_starting_file[MAXPGPATH]; extern bool switchover_all_quick; #ifdef ENABLE_MULTIPLE_NODES diff --git a/src/cm_ctl/ctl_start.cpp 
b/src/cm_ctl/ctl_start.cpp index fe92d4e..b673d91 100644 --- a/src/cm_ctl/ctl_start.cpp +++ b/src/cm_ctl/ctl_start.cpp @@ -90,6 +90,7 @@ static struct timespec g_endTime; extern char g_cmData[CM_PATH_LENGTH]; extern char manual_start_file[MAXPGPATH]; extern char instance_manual_start_file[MAXPGPATH]; +extern char cluster_manual_starting_file[MAXPGPATH]; extern char etcd_manual_start_file[MAXPGPATH]; extern char minority_az_start_file[MAX_PATH_LEN]; extern char g_minorityAzArbitrateFile[MAX_PATH_LEN]; @@ -388,6 +389,7 @@ status_t do_start(void) } else { if (CheckOfflineInstance(g_commandOperationNodeId)) { write_runlog(LOG, "the instance(node:%u) is Offline, no need to start.\n", g_commandOperationNodeId); + removeStartingFile(); exit(0); } write_runlog(LOG, "start the node:%u,datapath:%s. \n", g_commandOperationNodeId, g_cmData); @@ -489,12 +491,13 @@ static void start_cluster(void) ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, - SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s %s_*\" > %s; " + SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s %s_*; touch %s\" > %s; " "if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE, PSSH_TIMEOUT_OPTION, hosts_path, manual_start_file, instance_manual_start_file, + cluster_manual_starting_file, pssh_out_path, pssh_out_path, pssh_out_path); @@ -502,13 +505,14 @@ static void start_cluster(void) ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, - SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s %s_*\" > %s; " + SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s %s_*; touch %s\" > %s; " "if [ $? 
-ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE, PSSH_TIMEOUT_OPTION, hosts_path, mpp_env_separate_file, manual_start_file, instance_manual_start_file, + cluster_manual_starting_file, pssh_out_path, pssh_out_path, pssh_out_path); @@ -1320,8 +1324,10 @@ static void start_node(uint32 nodeid) char command[MAXPGPATH]; uint32 ii; errno_t rc; - rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, SYSTEMQUOTE "rm -f %s %s %s_* < \"%s\" 2>&1 &" SYSTEMQUOTE, - manual_start_file, etcd_manual_start_file, instance_manual_start_file, DEVNULL); + rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, + SYSTEMQUOTE "rm -f %s %s %s_*; touch %s < \"%s\" 2>&1 &" SYSTEMQUOTE, + manual_start_file, etcd_manual_start_file, + instance_manual_start_file, cluster_manual_starting_file, DEVNULL); securec_check_intval(rc, (void)rc); if (nodeid == g_currentNode->node) { @@ -1415,6 +1421,52 @@ void start_instance(uint32 nodeid, const char* datapath) } } +void removeStartingFile() +{ + int ret; + char command[MAX_COMMAND_LEN] = {0}; + + init_hosts(); + if (mpp_env_separate_file[0] == '\0') { + ret = snprintf_s(command, + MAX_COMMAND_LEN, + MAX_COMMAND_LEN - 1, + SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s\" > %s; " + "if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE, + PSSH_TIMEOUT_OPTION, + hosts_path, + cluster_manual_starting_file, + pssh_out_path, + pssh_out_path, + pssh_out_path); + } else { + ret = snprintf_s(command, + MAX_COMMAND_LEN, + MAX_COMMAND_LEN - 1, + SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s\" > %s; " + "if [ $? 
-ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE, + PSSH_TIMEOUT_OPTION, + hosts_path, + mpp_env_separate_file, + cluster_manual_starting_file, + pssh_out_path, + pssh_out_path, + pssh_out_path); + } + securec_check_intval(ret, (void)ret); + ret = system(command); + if (ret != 0) { + write_runlog(DEBUG1, + "Failed to delete the startingFile with executing the command: command=\"%s\"," + " nodeId=%u, systemReturn=%d, shellReturn=%d, errno=%d.\n", + command, + g_currentNode->node, + ret, + SHELL_RETURN_CODE(ret), + errno); + } +} + static void* check_cluster_start_status(void* arg) { int count = 0; @@ -1428,9 +1480,11 @@ static void* check_cluster_start_status(void* arg) while (startingTime < g_waitSeconds) { if (g_cluster_start_status == CM_STATUS_NORMAL) { write_runlog(LOG, "start cluster successfully.\n"); + removeStartingFile(); exit(0); } else if (g_cluster_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) { write_runlog(LOG, "start cluster successfully. There is a coordinator that has been deleted. \n"); + removeStartingFile(); exit(0); } else if (g_az_start_status == CM_STATUS_NORMAL) { for (uint32 ii = 0; ii < g_node_num; ii++) { @@ -1440,6 +1494,7 @@ static void* check_cluster_start_status(void* arg) } write_runlog(LOG, "start availability zone successfully.\n"); + removeStartingFile(); exit(0); } else if (g_az_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) { for (uint32 ii = 0; ii < g_node_num; ii++) { @@ -1449,12 +1504,15 @@ static void* check_cluster_start_status(void* arg) } write_runlog(LOG, "start availability zone successfully. There is a coordinator that has been deleted. \n"); + removeStartingFile(); exit(0); } else if (g_node_start_status == CM_STATUS_NORMAL) { write_runlog(LOG, "start node successfully.\n"); + removeStartingFile(); exit(0); } else if (g_node_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) { write_runlog(LOG, "start node successfully. There is a coordinator that has been deleted. 
\n"); + removeStartingFile(); exit(0); } else if (g_instance_start_status == CM_STATUS_NORMAL) { /* @@ -1464,16 +1522,19 @@ static void* check_cluster_start_status(void* arg) count++; if (count > INSTANCE_START_CONFIRM_TIME) { write_runlog(LOG, "start instance successfully.\n"); + removeStartingFile(); exit(0); } } else if (g_dn_relation_start_status == CM_STATUS_NORMAL) { /* check whether the relation datanodes have been started successfully */ write_runlog(LOG, "start relation datanodes successfully(node:%u, path:%s).\n", g_commandOperationNodeId, g_cmData); + removeStartingFile(); exit(0); } else if (g_resStartStatus == CM_STATUS_NORMAL) { write_runlog(LOG, "start resource instance successfully(nodeId:%u, instId:%u).\n", g_commandOperationNodeId, g_commandOperationInstanceId); + removeStartingFile(); exit(0); } else { count = 0; @@ -1535,6 +1596,7 @@ static void* check_cluster_start_status(void* arg) g_waitSeconds); } + removeStartingFile(); exit(-1); } diff --git a/src/cm_server/cms_rhb.cpp b/src/cm_server/cms_rhb.cpp index 88a55eb..697bf17 100644 --- a/src/cm_server/cms_rhb.cpp +++ b/src/cm_server/cms_rhb.cpp @@ -47,8 +47,8 @@ void InitDbListsByStaticConfig() return; } g_dbResNodeIdxInfo.idxLists[g_dbResNodeIdxInfo.hwl] = i; - rcs = - snprintf_s(info, maxInfoLen, maxInfoLen - 1, " %u:[%u-%u]", g_dbResNodeIdxInfo.hwl, i, g_node[i].node); + rcs = snprintf_s(info, maxInfoLen, maxInfoLen - 1, " %u:[%u-%u]", + g_dbResNodeIdxInfo.hwl, i, g_node[i].node); securec_check_intval(rcs, (void)rcs); rcs = strncat_s(buf, MAX_LOG_BUFF_LEN, info, strlen(info)); securec_check_errno(rcs, (void)rcs); diff --git a/src/include/cm/cm_agent/cma_global_params.h b/src/include/cm/cm_agent/cma_global_params.h index d53f207..fbcf08c 100644 --- a/src/include/cm/cm_agent/cma_global_params.h +++ b/src/include/cm/cm_agent/cma_global_params.h @@ -312,6 +312,8 @@ extern bool g_dn_report_msg_ok; extern bool g_isPauseArbitration; extern char g_cmManualPausePath[MAX_PATH_LEN]; +extern bool 
g_isStarting; +extern char g_cmManualStartingPath[MAX_PATH_LEN]; #endif diff --git a/src/include/cm/cm_agent/cma_main.h b/src/include/cm/cm_agent/cma_main.h index 05dbcb9..ad31966 100644 --- a/src/include/cm/cm_agent/cma_main.h +++ b/src/include/cm/cm_agent/cma_main.h @@ -47,6 +47,7 @@ #define SYSTEM_CALL_LOG "system_call" #define MAX_LOGFILE_TIMESTAMP "99991231235959" #define CM_CLUSTER_MANUAL_PAUSE "cluster_manual_pause" +#define CM_CLUSTER_MANUAL_STARTING "cluster_manual_starting" #define CONN_FAIL_TIMES 3 /* time style length */ diff --git a/src/include/cm/cm_client/cm_client.h b/src/include/cm/cm_client/cm_client.h index f03e671..6ca2221 100644 --- a/src/include/cm/cm_client/cm_client.h +++ b/src/include/cm/cm_client/cm_client.h @@ -120,6 +120,9 @@ OneResStatList *GetClientStatusList(); status_t SendInitMsg(uint32 instanceId, const char *resName); bool SendInitMsgAndGetResult(const char *resName, uint32 instId); ClientLockResult SendLockMsgAndWaitResult(char *msgPtr, uint32 msgLen); +void get_pause_path(); + +extern char g_manualPausePath[MAX_PATH_LEN]; #endif #endif // CM_CLIENT_H diff --git a/src/include/cm/cm_ctl/ctl_common.h b/src/include/cm/cm_ctl/ctl_common.h index eb88cb1..b69cfb4 100644 --- a/src/include/cm/cm_ctl/ctl_common.h +++ b/src/include/cm/cm_ctl/ctl_common.h @@ -312,5 +312,6 @@ int DoRhbPrint(); int DoPause(); int DoResume(); bool CheckTrustAndNet(); +void removeStartingFile(); #endif