!177 cm暂停适配资源池化

Merge pull request !177 from 刘展峰/cm_pause
This commit is contained in:
opengauss_bot
2024-02-01 10:11:14 +00:00
committed by Gitee
14 changed files with 123 additions and 8 deletions

View File

@ -232,6 +232,8 @@ char g_enableFenceDn[10] = {0};
#endif
/* Pause flag consulted by start_instance_check() and StopResourceCheck(); where it is set is not visible here. */
bool g_isPauseArbitration = false;
/* Path of the pause marker file: "<home>/bin/" CM_CLUSTER_MANUAL_PAUSE, filled in by get_prog_path(). */
char g_cmManualPausePath[MAX_PATH_LEN] = {0};
/* Refreshed in server_loop(): true while the file at g_cmManualStartingPath exists, false otherwise. */
bool g_isStarting = false;
/* Path of the starting marker file: "<home>/bin/" CM_CLUSTER_MANUAL_STARTING, filled in by get_prog_path(). */
char g_cmManualStartingPath[MAX_PATH_LEN] = {0};
bool &GetIsSharedStorageMode()
{

View File

@ -398,7 +398,7 @@ void start_instance_check(void)
StartInstanceAndCheck(CheckAgentNicDown, "[CheckAgentNicDown]");
if (IsCusResExistLocal()) {
if (IsCusResExistLocal() && !(g_isPauseArbitration && !g_isStarting)) {
StartInstanceAndCheck(StartResourceCheck, "[StartResourceCheck]");
}

View File

@ -488,7 +488,7 @@ void StopResourceCheck()
if (IsInstManualStopped(g_resConf[i].cmInstanceId)) {
OneResInstShutdown(&g_resConf[i]);
}
if (CmFileExist(g_cmManualStartPath) || !IsOneResInstWork(g_resConf[i].resName, g_resConf[i].cmInstanceId)) {
if ((!IsOneResInstWork(g_resConf[i].resName, g_resConf[i].cmInstanceId) && !g_isPauseArbitration) || CmFileExist(g_cmManualStartPath)) {
OneResInstClean(&g_resConf[i]);
}
}

View File

@ -381,6 +381,8 @@ int get_prog_path()
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmManualPausePath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmManualStartingPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
(void)fprintf(stderr, "Get GAUSSHOME failed, please check.\n");
return -1;
@ -433,6 +435,9 @@ int get_prog_path()
rcs = snprintf_s(
g_cmManualPausePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CM_CLUSTER_MANUAL_PAUSE);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_cmManualStartingPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CM_CLUSTER_MANUAL_STARTING);
securec_check_intval(rcs, (void)rcs);
InitClientCrt(exec_path);
}
@ -1049,6 +1054,12 @@ void server_loop(void)
pauseLogTimes = 0;
}
if (access(g_cmManualStartingPath, F_OK) == 0) {
g_isStarting = true;
} else {
g_isStarting = false;
}
(void)clock_gettime(CLOCK_MONOTONIC, &endTime);
if (g_isStart) {
g_suppressAlarm = true;

View File

@ -41,6 +41,8 @@ static bool g_shutdownClient = false;
static SendMsgQueue *g_sendMsg = NULL;
static OneResStatList *g_clientStatusList = NULL;
static volatile bool g_needReconnect = false;
/* File name of the pause marker; get_pause_path() looks for it under "<home>/bin/". */
#define CLUSTER_MANUAL_PAUSE "cluster_manual_pause"
/* Full path of the pause marker file; built by get_pause_path() and probed with access() in ResLockCore(). */
char g_manualPausePath[MAX_PATH_LEN];
static pthread_t *g_conThreadId = NULL;
static pthread_t *g_sendThreadId = NULL;
@ -608,6 +610,7 @@ static void InitGlobalVariable(const char *resName)
status_t PreInit(uint32 instanceId, const char *resName, CmNotifyFunc func, bool *isFirstInit)
{
get_pause_path();
if (isFirstInit) {
InitGlobalVariable(resName);
CM_RETURN_IFERR(InitLogFile());
@ -695,3 +698,22 @@ ClientLockResult SendLockMsgAndWaitResult(char *msgPtr, uint32 msgLen)
return result;
}
void get_pause_path()
{
char exec_path[MAX_PATH_LEN] = {0};
errno_t rc;
int rcs;
rc = memset_s(g_manualPausePath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
(void)fprintf(stderr, "Get GAUSSHOME failed, please check.\n");
return;
} else {
check_input_for_security(exec_path);
rcs = snprintf_s(
g_manualPausePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, CLUSTER_MANUAL_PAUSE);
securec_check_intval(rcs, (void)rcs);
}
}

View File

@ -73,6 +73,10 @@ ClientCmLockMsg *GetLockSendMsg(const char *lockName, LockOption opt)
int ResLockCore(const char *lockName)
{
if (access(g_manualPausePath, F_OK) == 0 && strcmp(lockName, "dms_reformer_lock") == 0) {
write_runlog(LOG, "cm is pause, don't lock(%s).\n", lockName);
return 1;
}
ClientCmLockMsg *sendMsg = GetLockSendMsg(lockName, CM_RES_LOCK);
if (sendMsg == NULL) {
write_runlog(ERROR, "generate (%s)lock msg failed.\n", lockName);

View File

@ -55,6 +55,7 @@
#define MINORITY_AZ_ARBITRATE "minority_az_arbitrate_hist"
#define RESUMING_CN_STOP "resuming_cn_stop"
#define CLUSTER_MANUAL_PAUSE "cluster_manual_pause"
#define CLUSTER_MANUAL_STARTING "cluster_manual_starting"
char* g_bin_name = NULL;
char* g_bin_path = NULL;
@ -121,6 +122,7 @@ static char* hotpatch_path = NULL;
char manual_start_file[MAXPGPATH];
char instance_manual_start_file[MAXPGPATH];
char cluster_manual_starting_file[MAXPGPATH];
char etcd_manual_start_file[MAXPGPATH];
static bool coordinator_dynamic_view = false;
#ifndef ENABLE_MULTIPLE_NODES
@ -865,6 +867,9 @@ static void init_ctl_global_variable()
ret = snprintf_s(
instance_manual_start_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, INSTANCE_MANUAL_START);
securec_check_intval(ret, (void)ret);
ret = snprintf_s(
cluster_manual_starting_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, CLUSTER_MANUAL_STARTING);
securec_check_intval(ret, (void)ret);
ret = snprintf_s(etcd_manual_start_file, MAXPGPATH, MAXPGPATH - 1, "%s/bin/%s", g_appPath, ETCD_MANUAL_START);
securec_check_intval(ret, (void)ret);
#ifndef ENABLE_MULTIPLE_NODES
@ -2249,6 +2254,7 @@ static void CtlCommandProcessCore(int *status, CtlOption *ctlCtx)
#endif
case START_COMMAND:
*status = do_start();
removeStartingFile();
break;
case CM_SWITCHOVER_COMMAND:
*status = DoSwitchover(ctlCtx);

View File

@ -52,6 +52,7 @@ extern char result_path[MAXPGPATH];
extern char* g_command_operation_lcName;
extern uint32 g_nodeId;
extern char instance_manual_start_file[MAXPGPATH];
extern char cluster_manual_starting_file[MAXPGPATH];
extern bool switchover_all_quick;
#ifdef ENABLE_MULTIPLE_NODES

View File

@ -90,6 +90,7 @@ static struct timespec g_endTime;
extern char g_cmData[CM_PATH_LENGTH];
extern char manual_start_file[MAXPGPATH];
extern char instance_manual_start_file[MAXPGPATH];
extern char cluster_manual_starting_file[MAXPGPATH];
extern char etcd_manual_start_file[MAXPGPATH];
extern char minority_az_start_file[MAX_PATH_LEN];
extern char g_minorityAzArbitrateFile[MAX_PATH_LEN];
@ -388,6 +389,7 @@ status_t do_start(void)
} else {
if (CheckOfflineInstance(g_commandOperationNodeId)) {
write_runlog(LOG, "the instance(node:%u) is Offline, no need to start.\n", g_commandOperationNodeId);
removeStartingFile();
exit(0);
}
write_runlog(LOG, "start the node:%u,datapath:%s. \n", g_commandOperationNodeId, g_cmData);
@ -489,12 +491,13 @@ static void start_cluster(void)
ret = snprintf_s(command,
MAXPGPATH,
MAXPGPATH - 1,
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s %s_*\" > %s; "
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s %s_*; touch %s\" > %s; "
"if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE,
PSSH_TIMEOUT_OPTION,
hosts_path,
manual_start_file,
instance_manual_start_file,
cluster_manual_starting_file,
pssh_out_path,
pssh_out_path,
pssh_out_path);
@ -502,13 +505,14 @@ static void start_cluster(void)
ret = snprintf_s(command,
MAXPGPATH,
MAXPGPATH - 1,
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s %s_*\" > %s; "
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s %s_*; touch %s\" > %s; "
"if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE,
PSSH_TIMEOUT_OPTION,
hosts_path,
mpp_env_separate_file,
manual_start_file,
instance_manual_start_file,
cluster_manual_starting_file,
pssh_out_path,
pssh_out_path,
pssh_out_path);
@ -1320,8 +1324,10 @@ static void start_node(uint32 nodeid)
char command[MAXPGPATH];
uint32 ii;
errno_t rc;
rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, SYSTEMQUOTE "rm -f %s %s %s_* < \"%s\" 2>&1 &" SYSTEMQUOTE,
manual_start_file, etcd_manual_start_file, instance_manual_start_file, DEVNULL);
rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1,
SYSTEMQUOTE "rm -f %s %s %s_*; touch %s < \"%s\" 2>&1 &" SYSTEMQUOTE,
manual_start_file, etcd_manual_start_file,
instance_manual_start_file, cluster_manual_starting_file, DEVNULL);
securec_check_intval(rc, (void)rc);
if (nodeid == g_currentNode->node) {
@ -1415,6 +1421,52 @@ void start_instance(uint32 nodeid, const char* datapath)
}
}
void removeStartingFile()
{
int ret;
char command[MAX_COMMAND_LEN] = {0};
init_hosts();
if (mpp_env_separate_file[0] == '\0') {
ret = snprintf_s(command,
MAX_COMMAND_LEN,
MAX_COMMAND_LEN - 1,
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"rm -f %s\" > %s; "
"if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE,
PSSH_TIMEOUT_OPTION,
hosts_path,
cluster_manual_starting_file,
pssh_out_path,
pssh_out_path,
pssh_out_path);
} else {
ret = snprintf_s(command,
MAX_COMMAND_LEN,
MAX_COMMAND_LEN - 1,
SYSTEMQUOTE "source /etc/profile;pssh -i %s -h %s \"source %s;rm -f %s\" > %s; "
"if [ $? -ne 0 ]; then cat %s; fi; rm -f %s" SYSTEMQUOTE,
PSSH_TIMEOUT_OPTION,
hosts_path,
mpp_env_separate_file,
cluster_manual_starting_file,
pssh_out_path,
pssh_out_path,
pssh_out_path);
}
securec_check_intval(ret, (void)ret);
ret = system(command);
if (ret != 0) {
write_runlog(DEBUG1,
"Failed to delete the startingFile with executing the command: command=\"%s\","
" nodeId=%u, systemReturn=%d, shellReturn=%d, errno=%d.\n",
command,
g_currentNode->node,
ret,
SHELL_RETURN_CODE(ret),
errno);
}
}
static void* check_cluster_start_status(void* arg)
{
int count = 0;
@ -1428,9 +1480,11 @@ static void* check_cluster_start_status(void* arg)
while (startingTime < g_waitSeconds) {
if (g_cluster_start_status == CM_STATUS_NORMAL) {
write_runlog(LOG, "start cluster successfully.\n");
removeStartingFile();
exit(0);
} else if (g_cluster_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
write_runlog(LOG, "start cluster successfully. There is a coordinator that has been deleted. \n");
removeStartingFile();
exit(0);
} else if (g_az_start_status == CM_STATUS_NORMAL) {
for (uint32 ii = 0; ii < g_node_num; ii++) {
@ -1440,6 +1494,7 @@ static void* check_cluster_start_status(void* arg)
}
write_runlog(LOG, "start availability zone successfully.\n");
removeStartingFile();
exit(0);
} else if (g_az_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
for (uint32 ii = 0; ii < g_node_num; ii++) {
@ -1449,12 +1504,15 @@ static void* check_cluster_start_status(void* arg)
}
write_runlog(LOG, "start availability zone successfully. There is a coordinator that has been deleted. \n");
removeStartingFile();
exit(0);
} else if (g_node_start_status == CM_STATUS_NORMAL) {
write_runlog(LOG, "start node successfully.\n");
removeStartingFile();
exit(0);
} else if (g_node_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
write_runlog(LOG, "start node successfully. There is a coordinator that has been deleted. \n");
removeStartingFile();
exit(0);
} else if (g_instance_start_status == CM_STATUS_NORMAL) {
/*
@ -1464,16 +1522,19 @@ static void* check_cluster_start_status(void* arg)
count++;
if (count > INSTANCE_START_CONFIRM_TIME) {
write_runlog(LOG, "start instance successfully.\n");
removeStartingFile();
exit(0);
}
} else if (g_dn_relation_start_status == CM_STATUS_NORMAL) {
/* check whether the relation datanodes have been started successfully */
write_runlog(LOG, "start relation datanodes successfully(node:%u, path:%s).\n",
g_commandOperationNodeId, g_cmData);
removeStartingFile();
exit(0);
} else if (g_resStartStatus == CM_STATUS_NORMAL) {
write_runlog(LOG, "start resource instance successfully(nodeId:%u, instId:%u).\n",
g_commandOperationNodeId, g_commandOperationInstanceId);
removeStartingFile();
exit(0);
} else {
count = 0;
@ -1535,6 +1596,7 @@ static void* check_cluster_start_status(void* arg)
g_waitSeconds);
}
removeStartingFile();
exit(-1);
}

View File

@ -47,8 +47,8 @@ void InitDbListsByStaticConfig()
return;
}
g_dbResNodeIdxInfo.idxLists[g_dbResNodeIdxInfo.hwl] = i;
rcs =
snprintf_s(info, maxInfoLen, maxInfoLen - 1, " %u:[%u-%u]", g_dbResNodeIdxInfo.hwl, i, g_node[i].node);
rcs = snprintf_s(info, maxInfoLen, maxInfoLen - 1, " %u:[%u-%u]",
g_dbResNodeIdxInfo.hwl, i, g_node[i].node);
securec_check_intval(rcs, (void)rcs);
rcs = strncat_s(buf, MAX_LOG_BUFF_LEN, info, strlen(info));
securec_check_errno(rcs, (void)rcs);

View File

@ -312,6 +312,8 @@ extern bool g_dn_report_msg_ok;
extern bool g_isPauseArbitration;
extern char g_cmManualPausePath[MAX_PATH_LEN];
extern bool g_isStarting;
extern char g_cmManualStartingPath[MAX_PATH_LEN];
#endif

View File

@ -47,6 +47,7 @@
#define SYSTEM_CALL_LOG "system_call"
#define MAX_LOGFILE_TIMESTAMP "99991231235959"
#define CM_CLUSTER_MANUAL_PAUSE "cluster_manual_pause"
#define CM_CLUSTER_MANUAL_STARTING "cluster_manual_starting"
#define CONN_FAIL_TIMES 3
/* time style length */

View File

@ -120,6 +120,9 @@ OneResStatList *GetClientStatusList();
status_t SendInitMsg(uint32 instanceId, const char *resName);
bool SendInitMsgAndGetResult(const char *resName, uint32 instId);
ClientLockResult SendLockMsgAndWaitResult(char *msgPtr, uint32 msgLen);
void get_pause_path();
extern char g_manualPausePath[MAX_PATH_LEN];
#endif
#endif // CM_CLIENT_H

View File

@ -312,5 +312,6 @@ int DoRhbPrint();
int DoPause();
int DoResume();
bool CheckTrustAndNet();
void removeStartingFile();
#endif