diff --git a/build/cm.ver b/build/cm.ver index b1421a4..903677b 100644 --- a/build/cm.ver +++ b/build/cm.ver @@ -1,2 +1,2 @@ PRODUCT="openGauss" -VERSION="5.0.0" +VERSION="5.0.1" diff --git a/src/cm_adapter/cm_sharediskapi/cm_voting_disk.cpp b/src/cm_adapter/cm_sharediskapi/cm_voting_disk.cpp index d3f8545..cb85c97 100644 --- a/src/cm_adapter/cm_sharediskapi/cm_voting_disk.cpp +++ b/src/cm_adapter/cm_sharediskapi/cm_voting_disk.cpp @@ -112,14 +112,14 @@ status_t SetVotingDiskNodeData(char *data, uint32 dataLen) return CM_SUCCESS; } -status_t UpdateAllNodeHeartBeat() +status_t UpdateAllNodeHeartBeat(uint32 nodeNum) { - uint32 dataLen = VOTING_DISK_DATA_SIZE; + uint32 dataLen = nodeNum * VOTING_DISK_EACH_NODE_OFFSET; if (GetVotingDiskNodeData(g_nodeDataBuff, dataLen) != CM_SUCCESS) { write_runlog(ERROR, "[%s] get voting disk node data failed.\n", __FUNCTION__); return CM_ERROR; } - for (uint32 i = 0; i < VOTING_DISK_MAX_NODE_NUM; i++) { + for (uint32 i = 0; i < nodeNum; i++) { uint32 offset = i * VOTING_DISK_EACH_NODE_OFFSET; VotingDiskNodeInfo *nodeInfo = (VotingDiskNodeInfo*)(g_nodeDataBuff + offset); if (nodeInfo->nodeTime == 0) { diff --git a/src/cm_agent/cma_instance_management_res.cpp b/src/cm_agent/cma_instance_management_res.cpp index 7043b24..f7069ca 100644 --- a/src/cm_agent/cma_instance_management_res.cpp +++ b/src/cm_agent/cma_instance_management_res.cpp @@ -71,8 +71,14 @@ static int CusResCmdExecute(const char *scriptPath, const char *oper, uint32 tim status_t StartOneResInst(const CmResConfList *conf) { + int ret; char oper[MAX_OPTION_LEN] = {0}; - int ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s", conf->resInstanceId, conf->arg); + if (conf->resType == CUSTOM_RESOURCE_DN && undocumentedVersion > 0) { + ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s '-u %u'", conf->resInstanceId, + conf->arg, undocumentedVersion); + } else { + ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s", conf->resInstanceId, conf->arg); + } securec_check_intval(ret, (void)ret); ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, CM_FALSE); @@ -118,13 +124,13 @@ void OneResInstClean(const CmResConfList *oneResConf) } } -status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId) +status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId, bool8 needNohup) { char oper[MAX_OPTION_LEN] = {0}; int ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-reg %u %s", destInstId, conf->arg); securec_check_intval(ret, (void)ret); - ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, CM_TRUE); + ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, needNohup); if (ret != 0) { write_runlog(ERROR, "[%s]: cmd:(%s %s) execute failed, ret=%d.\n", __FUNCTION__, conf->script, oper, ret); return CM_ERROR; @@ -251,7 +257,7 @@ int CheckOneResInst(const CmResConfList *conf) return ret; } -static void ManualStopLocalResInst(CmResConfList *conf) +static status_t ManualStopOneLocalResInst(CmResConfList *conf) { char instanceStartFile[MAX_PATH_LEN] = {0}; int ret = snprintf_s(instanceStartFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, @@ -260,7 +266,7 @@ static void ManualStopLocalResInst(CmResConfList *conf) if (CmFileExist(instanceStartFile)) { write_runlog(LOG, "instanceStartFile(%s) is exist, can't create again.\n", instanceStartFile); - return; + return CM_SUCCESS; } char command[MAX_PATH_LEN] = {0}; @@ -272,8 +278,28 @@ static void ManualStopLocalResInst(CmResConfList *conf) ret = system(command); if (ret != 0) { write_runlog(ERROR, "manual stop res(%s) inst(%u) failed, ret=%d.\n", conf->resName, conf->resInstanceId, ret); - } else { - write_runlog(LOG, "manual stop res(%s) inst(%u) success.\n", conf->resName, conf->resInstanceId); + return CM_ERROR; + } + + write_runlog(LOG, "manual stop res(%s) inst(%u) success.\n", conf->resName, conf->resInstanceId); + return CM_SUCCESS; +} + +static status_t ManuallStopAllLocalResInst() +{ + status_t result = CM_SUCCESS; + for (uint32 i = 0; i < GetLocalResConfCount(); ++i) { + if (ManualStopOneLocalResInst(&g_resConf[i]) != CM_SUCCESS) { + result = CM_ERROR; + } + } + + return result; +} + +static void ManualStopLocalResInst(CmResConfList *conf) +{ + if (ManuallStopAllLocalResInst() == CM_SUCCESS) { CleanOneInstCheckCount(conf); } } @@ -290,21 +316,17 @@ bool IsInstManualStopped(uint32 instId) return false; } -static bool CanCusInstDoRestart(const CmResConfList *conf) +static inline void RestartOneResInst(CmResConfList *conf) { ResIsregStatus stat = IsregOneResInst(conf, conf->resInstanceId); - if ((stat == CM_RES_ISREG_REG) || (stat == CM_RES_ISREG_NOT_SUPPORT)) { - return true; + if ((stat != CM_RES_ISREG_REG) && (stat != CM_RES_ISREG_NOT_SUPPORT)) { + if (RegOneResInst(conf, conf->resInstanceId, CM_FALSE) != CM_SUCCESS) { + write_runlog(LOG, "cur inst(%u) isreg stat=(%u), and reg failed, restart failed.\n", + conf->cmInstanceId, (uint32)stat); + return; + } } - write_runlog(LOG, "cur inst(%u) isreg stat=(%u), can't do restart.\n", conf->cmInstanceId, (uint32)stat); - return false; -} - -static inline status_t RestartOneResInst(CmResConfList *conf) -{ - (void)CleanOneResInst(conf); - CM_RETURN_IFERR(StartOneResInst(conf)); - return CM_SUCCESS; + (void)StartOneResInst(conf); } static void ProcessOfflineInstance(CmResConfList *conf) @@ -312,9 +334,7 @@ static void ProcessOfflineInstance(CmResConfList *conf) long curTime = GetCurMonotonicTimeSec(); if (conf->checkInfo.restartTimes == -1) { - if (CanCusInstDoRestart(conf)) { - (void)RestartOneResInst(conf); - } + RestartOneResInst(conf); return; } if (conf->checkInfo.brokeTime == 0) { @@ -338,10 +358,7 @@ static void ProcessOfflineInstance(CmResConfList *conf) conf->resName, conf->resInstanceId, conf->checkInfo.startTime, conf->checkInfo.restartPeriod); return; } - if (!CanCusInstDoRestart(conf)) { - return; - } - CM_RETVOID_IFERR(RestartOneResInst(conf)); + RestartOneResInst(conf); conf->checkInfo.startCount++; conf->checkInfo.startTime = curTime; write_runlog(LOG, "res(%s) inst(%u) has been restart (%d) times, restart more than (%d) time will manually stop.\n", @@ -376,7 +393,7 @@ static void ProcessAbnormalInstance(CmResConfList *conf) write_runlog(LOG, "res(%s) inst(%u) has been abnormal (%d)s, >= timeout(%d)s, need restart.\n", conf->resName, conf->cmInstanceId, duration, conf->checkInfo.abnormalTimeout); - CM_RETVOID_IFERR(RestartOneResInst(conf)); + RestartOneResInst(conf); conf->checkInfo.startCount++; conf->checkInfo.startTime = curTime; @@ -625,10 +642,8 @@ static status_t InitLocalAllDnResInstConf(const CusResConfJson *resJson, CmResCo static status_t InitLocalOneResConf(const OneCusResConfJson *oneResJson) { - CmResConfList newLocalConf; - errno_t rc = memset_s(&newLocalConf, sizeof(CmResConfList), 0, sizeof(CmResConfList)); - securec_check_errno(rc, (void)rc); - + CmResConfList newLocalConf = {{0}}; + newLocalConf.resType = (int)oneResJson->resType; if (oneResJson->resType == CUSTOM_RESOURCE_APP) { CM_RETURN_IFERR(InitLocalCommConfOfDefRes(&oneResJson->appResConf, &newLocalConf)); CM_RETURN_IFERR(InitLocalAllAppResInstConf(&oneResJson->appResConf, &newLocalConf)); diff --git a/src/cm_agent/cma_process_messages_client.cpp b/src/cm_agent/cma_process_messages_client.cpp index 66ae3dd..6781e47 100644 --- a/src/cm_agent/cma_process_messages_client.cpp +++ b/src/cm_agent/cma_process_messages_client.cpp @@ -356,7 +356,7 @@ static void ProcessRegResInst(const CmsNotifyAgentRegMsg *recvMsg) } else if ((isreg == CM_RES_ISREG_UNREG) || (isreg == CM_RES_ISREG_PENDING) || (isreg == CM_RES_ISREG_UNKNOWN)) { write_runlog(LOG, "before reg res inst, need clean res inst first.\n"); if ((CheckOneResInst(local) == CUS_RES_CHECK_STAT_OFFLINE) || (CleanOneResInst(local) == CM_SUCCESS)) { - (void)RegOneResInst(local, recvMsg->resInstId); + (void)RegOneResInst(local, recvMsg->resInstId, CM_TRUE); } } else if (isreg == CM_RES_ISREG_NOT_SUPPORT) { write_runlog(LOG, "res inst[%s:%u] don't support reg, not need reg.\n", recvMsg->resName, recvMsg->resInstId); diff --git a/src/cm_ctl/ctl_common.cpp b/src/cm_ctl/ctl_common.cpp index b33c87b..7715fa0 100644 --- a/src/cm_ctl/ctl_common.cpp +++ b/src/cm_ctl/ctl_common.cpp @@ -181,16 +181,15 @@ int ssh_exec(const staticNodeConfig* node, const char* cmd, int32 logLevel) for (uint32 ii = 0; ii < node->sshCount; ii++) { if (mpp_env_separate_file[0] == '\0') { ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1, - "pssh %s -s -H %s \"( %s ) > %s 2>&1\" < %s > /dev/null 2>&1", - PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd, "/dev/null", "/dev/null"); - securec_check_intval(ret, (void)ret); + "pssh %s -s -H %s \"( %s ) > %s 2>&1\" > /dev/null 2>&1", + PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd, "/dev/null"); } else { ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1, - "pssh %s -s -H %s \"( source %s;%s ) > %s 2>&1\" < %s > /dev/null 2>&1", + "pssh %s -s -H %s \"( source %s;%s ) > %s 2>&1\" > /dev/null 2>&1", PSSH_TIMEOUT_OPTION, node->sshChannel[ii], mpp_env_separate_file, cmd, - "/dev/null", "/dev/null"); - securec_check_intval(ret, (void)ret); + "/dev/null"); } + securec_check_intval(ret, (void)ret); rc = system(actualCmd); if (rc != 0) { write_runlog(logLevel, "ssh failed at \"%s\".\n", node->sshChannel[ii]); diff --git a/src/cm_ctl/ctl_guc.cpp b/src/cm_ctl/ctl_guc.cpp index ad579ac..a6c5f64 100644 --- a/src/cm_ctl/ctl_guc.cpp +++ b/src/cm_ctl/ctl_guc.cpp @@ -43,6 +43,8 @@ static char g_confFile[CM_PATH_LENGTH]; extern char g_appPath[MAXPGPATH]; extern char mpp_env_separate_file[MAXPGPATH]; +static status_t CheckGucOption(const GucOption &gucCtx); + static inline void SkipSpace(char *&ptr) { while (isspace((unsigned char)*ptr)) { @@ -690,39 +692,6 @@ status_t ExeGucCommand(const GucOption *gucCtx) return result; } -status_t ProcessInLocalInstance(const GucOption *gucCtx) -{ - errno_t rc; - char cmDir[CM_PATH_LENGTH] = { 0 }; - char instanceDir[CM_PATH_LENGTH] = { 0 }; - - rc = memcpy_s(cmDir, sizeof(cmDir), g_currentNode->cmDataPath, sizeof(cmDir)); - securec_check_errno(rc, (void)rc); - - if (cmDir[0] == '\0') { - write_runlog(ERROR, "Failed to get cm base data path from static config file."); - return CM_ERROR; - } - - if (gucCtx->nodeType == NODE_TYPE_AGENT) { - rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_agent", cmDir); - securec_check_intval(rc, (void)rc); - } else { - if (g_currentNode->cmServerLevel != 1) { - write_runlog(LOG, "There is no cmserver instance on local node."); - return CM_ERROR; - } - rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_server", cmDir); - securec_check_intval(rc, (void)rc); - } - GetInstanceConfigfile(gucCtx->nodeType, instanceDir); - if (ExeGucCommand(gucCtx) != CM_SUCCESS) { - return CM_ERROR; - } - - return CM_SUCCESS; -} - static uint32 GetNodeIndex(uint32 nodeId) { for (uint32 i = 0; i < g_node_num; ++i) { @@ -784,18 +753,56 @@ static status_t ListRemoteConfMain(staticNodeConfig *node, const char *cmd) return CM_ERROR; } +status_t ProcessInLocalInstanceExec(const GucOption *gucCtx) +{ + errno_t rc; + char cmDir[CM_PATH_LENGTH] = { 0 }; + char instanceDir[CM_PATH_LENGTH] = { 0 }; + + rc = memcpy_s(cmDir, sizeof(cmDir), g_currentNode->cmDataPath, sizeof(cmDir)); + securec_check_errno(rc, (void)rc); + + if (cmDir[0] == '\0') { + write_runlog(ERROR, "Failed to get cm base data path from static config file."); + return CM_ERROR; + } + + if (gucCtx->nodeType == NODE_TYPE_AGENT) { + rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_agent", cmDir); + securec_check_intval(rc, (void)rc); + } else { + if (g_currentNode->cmServerLevel != 1) { + write_runlog(LOG, "There is no cmserver instance on local node."); + return CM_ERROR; + } + rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_server", cmDir); + securec_check_intval(rc, (void)rc); + } + GetInstanceConfigfile(gucCtx->nodeType, instanceDir); + if (ExeGucCommand(gucCtx) != CM_SUCCESS) { + return CM_ERROR; + } + + return CM_SUCCESS; +} + +status_t ProcessInLocalInstance(const CtlOption *ctx) +{ + if (CheckGucOption(ctx->guc) != CM_SUCCESS) { + return CM_ERROR; + } + + if (ctx->guc.gucCommand == SET_CONF_COMMAND && CheckGucOptionValidate(ctx->guc) != CM_SUCCESS) { + DoAdvice(); + return CM_ERROR; + } + + return ProcessInLocalInstanceExec(&ctx->guc); +} + static status_t ProcessInRemoteInstance(const CtlOption *ctx) { char remoteCmd[MAX_COMMAND_LEN] = {0}; - - if (ctx->comm.nodeId == g_currentNode->node) { - if (ProcessInLocalInstance(&ctx->guc) == CM_ERROR) { - write_runlog(DEBUG1, "cm_ctl fail to execute in local.\n"); - return CM_ERROR; - } - return CM_SUCCESS; - } - GetRemoteGucCommand(ctx, remoteCmd, sizeof(remoteCmd)); if (ctx->guc.gucCommand == LIST_CONF_COMMAND) { return ListRemoteConfMain(&g_node[GetNodeIndex(ctx->comm.nodeId)], remoteCmd); @@ -817,8 +824,10 @@ static status_t ProcessInAllNodesInstance(CtlOption *ctx) continue; } ctx->comm.nodeId = g_node[i].node; - if (ProcessInRemoteInstance(ctx) == CM_ERROR) { - result = CM_ERROR; + if (ctx->comm.nodeId == g_currentNode->node) { + result = ProcessInLocalInstance(ctx); + } else { + result = ProcessInRemoteInstance(ctx); } } @@ -827,15 +836,21 @@ static status_t ProcessInAllNodesInstance(CtlOption *ctx) status_t ProcessClusterGucOption(CtlOption *ctx) { - status_t result; - if (ctx->comm.nodeId == 0) { - result = ProcessInAllNodesInstance(ctx); - } else { - result = ProcessInRemoteInstance(ctx); + return ProcessInAllNodesInstance(ctx); } - return result; + if (ctx->comm.nodeId != g_currentNode->node) { + return ProcessInRemoteInstance(ctx); + } + + status_t res = ProcessInLocalInstance(ctx); + if (res == CM_ERROR) { + write_runlog(DEBUG1, "cm_ctl fail to execute in local.\n"); + } + + return res; + } static status_t CheckGucOption(const GucOption &gucCtx) @@ -855,22 +870,10 @@ static status_t CheckGucOption(const GucOption &gucCtx) // cm_ctl integration guc set reload and check capacity int DoGuc(CtlOption *ctx) { - if (CheckGucOption(ctx->guc) != CM_SUCCESS) { - return 1; - } + status_t res = ProcessClusterGucOption(ctx); + PrintResults(res == CM_SUCCESS, ctx); - if ((ctx->guc.gucCommand == SET_CONF_COMMAND) && (CheckGucOptionValidate(ctx->guc) != CM_SUCCESS)) { - DoAdvice(); - return 1; - } - - if (ProcessClusterGucOption(ctx) != CM_SUCCESS) { - PrintResults(false, ctx); - return 1; - } - PrintResults(true, ctx); - - return 0; + return (int)res; } static void MemsetPassword(char **password) diff --git a/src/cm_ctl/ctl_misc.cpp b/src/cm_ctl/ctl_misc.cpp index 9a29491..8c47934 100644 --- a/src/cm_ctl/ctl_misc.cpp +++ b/src/cm_ctl/ctl_misc.cpp @@ -1656,6 +1656,7 @@ char *DoConcatCmd(const CtlOption *ctx) int rc = memset_s(cmd, CM_PATH_LENGTH, 0, CM_PATH_LENGTH); securec_check_errno(rc, (void)rc); if (DoCheckRole(&ctx->dcfOption) == -1) { + free(cmd); return NULL; } diff --git a/src/cm_ctl/ctl_param_check.cpp b/src/cm_ctl/ctl_param_check.cpp index 30d63d0..a5acc7d 100644 --- a/src/cm_ctl/ctl_param_check.cpp +++ b/src/cm_ctl/ctl_param_check.cpp @@ -127,6 +127,9 @@ const char *g_cmsParamInfo[] = { "cms_enable_failover_on2nodes|bool|0,0|NULL|NULL|", "cms_enable_db_crash_recovery|bool|0,0|NULL|NULL|", "cms_network_isolation_timeout|int|10,2147483647|NULL|NULL|", +#ifndef ENABLE_PRIVATEGAUSS + "wait_static_primary_times|int|5,2147483647|NULL|NULL|", +#endif }; const char *g_valueTypeStr[] = { @@ -304,6 +307,7 @@ char *GetParamLineInfo(const char *paramName, const char * const *paramInfos, in if (paramInfos == NULL) { write_runlog(ERROR, "Fail to get param info.\n"); + free(info); return NULL; } diff --git a/src/cm_ctl/ctl_res_list.cpp b/src/cm_ctl/ctl_res_list.cpp index 2e820b2..d360e11 100644 --- a/src/cm_ctl/ctl_res_list.cpp +++ b/src/cm_ctl/ctl_res_list.cpp @@ -140,7 +140,7 @@ static status_t SetResBaseInfoInArray(ResBaseInfo *info, const cJSON *resArray, } resName = GetValueStrFromCJson(item, RES_NAME); if (resName == NULL) { - resName = NULL; + resName = PRINT_NULL; } else { isCanPrint = CM_TRUE; } @@ -188,7 +188,7 @@ static void PrintAllResInfoBody(const ResBaseInfo *info, const cJSON *resArray) } resName = GetValueStrFromCJson(item, RES_NAME); if (resName == NULL) { - resName = NULL; + resName = PRINT_NULL; } resType = GetValueStrFromCJson(item, RESOURCE_TYPE); if (resType == NULL) { diff --git a/src/cm_server/cm_server.centralized.conf.sample b/src/cm_server/cm_server.centralized.conf.sample index f71e94a..fddf8c4 100644 --- a/src/cm_server/cm_server.centralized.conf.sample +++ b/src/cm_server/cm_server.centralized.conf.sample @@ -87,4 +87,6 @@ cms_enable_db_crash_recovery = false # used in 2 nodes cluster. when network re cms_network_isolation_timeout = 20 # cms judges the network is isolated when it finds ddb cluster is not sync with each other nodes, # after cms_network_isolation_timeout times. # default 20 +wait_static_primary_times = 6 # Time to wait for the primary recovery after the primary stopped unexpectedly. + # default value is 6 ############### must leave a new line at the end ################### diff --git a/src/cm_server/cms_arbitrate_cluster.cpp b/src/cm_server/cms_arbitrate_cluster.cpp index b9bdc95..b3bc2d5 100644 --- a/src/cm_server/cms_arbitrate_cluster.cpp +++ b/src/cm_server/cms_arbitrate_cluster.cpp @@ -80,6 +80,13 @@ typedef enum MaxClusterStatEn { MAX_CLUSTER_EXCLUDE, } MaxClusterStat; +typedef struct CurCmRhbStatSt { + uint32 hwl; + time_t baseTime; + time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM]; +} CurCmRhbStat; + +static CurCmRhbStat g_curRhbStat = {0}; static const int32 CHECK_DELAY_IN_ROLE_CHANGING = 10; static MaxNodeCluster g_curCluster = {{0}}; @@ -320,6 +327,23 @@ static status_t InitMaxNodeCluster(MaxNodeCluster *maxNodeCluster) return CM_SUCCESS; } +static MaxClusterResStatus GetNodesConnStatByRhb(int idx1, int idx2, int timeout) +{ + if (timeout == 0) { + return MAX_CLUSTER_STATUS_AVAIL; + } + + if (g_curRhbStat.hbs[idx1][idx2] == 0 || g_curRhbStat.hbs[idx2][idx1] == 0) { + return MAX_CLUSTER_STATUS_INIT; + } + + if (IsRhbTimeout(g_curRhbStat.hbs[idx1][idx2], g_curRhbStat.baseTime, timeout) || + IsRhbTimeout(g_curRhbStat.hbs[idx2][idx1], g_curRhbStat.baseTime, timeout)) { + return MAX_CLUSTER_STATUS_UNAVAIL; + } + return MAX_CLUSTER_STATUS_AVAIL; +} + static bool CheckPoint2PointConn(int32 resIdx1, int32 resIdx2) { MaxClusterResStatus connStatus = GetNodesConnStatByRhb(resIdx1, resIdx2, (int)g_agentNetworkTimeout); @@ -454,6 +478,8 @@ static void FindMaxNodeCluster(MaxNodeCluster *maxCluster) { NodeCluster *nodeCluster = &(maxCluster->nodeCluster); nodeCluster->clusterNum = -1; + g_curRhbStat.baseTime = time(NULL); + GetRhbStat(g_curRhbStat.hbs, &g_curRhbStat.hwl); // assume that all meet the conditions. for (int32 i = nodeCluster->maxNodeNum - 1; i >= 0; --i) { if (!IsAllResAvailInNode(i)) { @@ -885,23 +911,35 @@ static bool IsNodeInCluster(int32 resIdx, const MaxNodeCluster *nodeCluster) return false; } -static void PrintRhbStatus() +static void PrintOneRhbLine(time_t *timeArr) { - uint32 hwl = 0; - time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM] = {{0}}; - GetRhbStat(hbs, &hwl); - char *rhbStr = GetRhbSimple((time_t *)hbs, MAX_RHB_NUM, hwl, time(NULL), g_agentNetworkTimeout); - CM_RETURN_IF_NULL(rhbStr); - size_t rhbLen = strlen(rhbStr); - if (rhbLen >= MAX_LOG_BUFF_LEN) { - write_runlog(LOG, "rhbStr len(%lu) is exceed max log buff len(%d), can't print network stat.\n", - rhbLen, MAX_LOG_BUFF_LEN); - FREE_AND_RESET(rhbStr); - return; + int ret; + errno_t rc; + char rhbStr[MAX_PATH_LEN] = {0}; + const uint32 maxInfoLen = TIME_STR_MAX_LEN + 1; + + for (uint32 i = 0; i < g_curRhbStat.hwl; ++i) { + char info[maxInfoLen] = {0}; + char timeBuf[TIME_STR_MAX_LEN] = {0}; + GetTimeStr(timeArr[i], timeBuf, TIME_STR_MAX_LEN); + ret = snprintf_s(info, maxInfoLen, maxInfoLen - 1, "%s|", timeBuf); + securec_check_intval(ret, (void)ret); + rc = strncat_s(rhbStr, MAX_PATH_LEN, info, strlen(info)); + securec_check_errno(rc, (void)rc); } + write_runlog(LOG, "[RHB] hb infos: |%s\n", rhbStr); +} + +static void PrintAllRhbStatus() +{ + char timeBuf[TIME_STR_MAX_LEN] = {0}; + GetTimeStr(g_curRhbStat.baseTime, timeBuf, TIME_STR_MAX_LEN); + write_runlog(LOG, "Network timeout:%u\n", g_agentNetworkTimeout); - write_runlog(LOG, "Network stat('Y' means connected, otherwise 'N'):\n%s\n", rhbStr); - FREE_AND_RESET(rhbStr); + write_runlog(LOG, "Network base_time:%s\n", timeBuf); + for (uint32 i = 0; i < g_curRhbStat.hwl; ++i) { + PrintOneRhbLine(&g_curRhbStat.hbs[i][0]); + } } static void PrintKickOutResult(int32 resIdx, const MaxNodeCluster *maxCluster) @@ -927,11 +965,10 @@ static void PrintKickOutResult(int32 resIdx, const MaxNodeCluster *maxCluster) if (!CheckPoint2PointConn(resIdx, maxCluster->nodeCluster.cluster[i])) { write_runlog(LOG, "kick out result: (index=%d,nodeId=%u) disconnect with (index=%d,nodeId=%u).\n", resIdx, GetNodeByPoint(resIdx), i, GetNodeByPoint(i)); - PrintHbsInfo(resIdx, GetNodeByPoint(resIdx), i, GetNodeByPoint(i), LOG); continue; } } - PrintRhbStatus(); + PrintAllRhbStatus(); } static void PrintArbitrateResult(const MaxNodeCluster *lastCluster, const MaxNodeCluster *curCluster) @@ -999,7 +1036,7 @@ static status_t CheckVotingDisk() const uint32 timeout = 6; uint32 time = timeout; while (time > 0) { - if (UpdateAllNodeHeartBeat() == CM_SUCCESS) { + if (UpdateAllNodeHeartBeat(g_node_num) == CM_SUCCESS) { return CM_SUCCESS; } time--; @@ -1028,6 +1065,8 @@ void *MaxNodeClusterArbitrateMain(void *arg) write_runlog(FATAL, "Alloc voting disk memory failed!\n"); exit(-1); } + g_curRhbStat.baseTime = time(NULL); + GetRhbStat(g_curRhbStat.hbs, &g_curRhbStat.hwl); for (;;) { if (got_stop) { g_threadProcessStatus = THREAD_PROCESS_STOP; diff --git a/src/cm_server/cms_arbitrate_datanode_pms.cpp b/src/cm_server/cms_arbitrate_datanode_pms.cpp index a9a31af..9f6463a 100644 --- a/src/cm_server/cms_arbitrate_datanode_pms.cpp +++ b/src/cm_server/cms_arbitrate_datanode_pms.cpp @@ -2248,7 +2248,7 @@ static void InitDnArbCond(DnArbCtx *ctx) ctx->cond.maxMemArbiTime = 0; ctx->cond.instMainta = IsMaintance(ctx->maintaMode); ctx->cond.switchoverIdx = INVALID_INDEX; - ctx->cond.arbitInterval = g_clusterStarting ? g_clusterStartingArbitDelay : DATANODE_ARBITE_DELAY; + ctx->cond.arbitInterval = g_clusterStarting ? g_clusterStartingArbitDelay : g_waitStaticPrimaryTimes; ctx->cond.arbitStaticInterval = 5; ctx->cond.setOffline = SetOfflineNode(); ctx->cond.snameAzDnCount = 0; diff --git a/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp b/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp index 40ad114..0171108 100644 --- a/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp +++ b/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp @@ -554,7 +554,7 @@ uint32 GetDnArbitateDelayTime(const DnArbCtx *ctx) /* if static primary has finished redo, not need to wait for 180s */ cm_local_replconninfo *status = &(ctx->dnReport[cond->staticPriIdx].local_status); if (status->local_role == INSTANCE_ROLE_STANDBY && status->disconn_mode == PROHIBIT_CONNECTION) { - return DATANODE_ARBITE_DELAY; + return g_waitStaticPrimaryTimes; } return cond->arbitInterval; } diff --git a/src/cm_server/cms_common.cpp b/src/cm_server/cms_common.cpp index 68260d7..9f19d27 100644 --- a/src/cm_server/cms_common.cpp +++ b/src/cm_server/cms_common.cpp @@ -615,6 +615,12 @@ void get_parameters_from_configfile() g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200); g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6); GetDnArbitrateMode(); +#ifndef ENABLE_PRIVATEGAUSS + g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6); + if (g_waitStaticPrimaryTimes < 5) { + g_waitStaticPrimaryTimes = 5; + } +#endif } void clean_init_cluster_state() diff --git a/src/cm_server/cms_global_params.cpp b/src/cm_server/cms_global_params.cpp index e3889e2..1e38d58 100644 --- a/src/cm_server/cms_global_params.cpp +++ b/src/cm_server/cms_global_params.cpp @@ -328,6 +328,7 @@ uint32 g_delayArbiTime = 0; int32 g_clusterArbiTime = 300; bool g_isPauseArbitration = false; char g_cmManualPausePath[MAX_PATH_LEN] = {0}; +uint32 g_waitStaticPrimaryTimes = 6; bool isLargerNode() { diff --git a/src/cm_server/cms_main.cpp b/src/cm_server/cms_main.cpp index 14f3c3a..486a1dc 100644 --- a/src/cm_server/cms_main.cpp +++ b/src/cm_server/cms_main.cpp @@ -1727,7 +1727,7 @@ static int cm_server_process_startup_packet(int epollfd, CM_Connection* con, CM_ if ((con->port->user_name != NULL) && strncmp(con->port->user_name, pw->pw_name, SP_USER - 1)) { write_runlog(WARNING, "invalid connection\n"); - if (CmsSendAndFlushMsg(con, 'E', "invalid connection", CM_SERVER_PACKET_ERROR_MSG) != 0) { + if (CmsSendAndFlushMsg(con, 'E', "invalid connection", sizeof("invalid connection")) != 0) { RemoveConnAfterSendMsgFailed(con); write_runlog(ERROR, "[%s][line:%d] CmsSendAndFlushMsg fail.\n", __FUNCTION__, __LINE__); } diff --git a/src/cm_server/cms_monitor_main.cpp b/src/cm_server/cms_monitor_main.cpp index f6610f5..ca8a851 100644 --- a/src/cm_server/cms_monitor_main.cpp +++ b/src/cm_server/cms_monitor_main.cpp @@ -343,11 +343,18 @@ static void ReloadParametersFromConfigfile() g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200); g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6); GetDnArbitrateMode(); +#ifndef ENABLE_PRIVATEGAUSS + g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6); + if (g_waitStaticPrimaryTimes < 5) { + g_waitStaticPrimaryTimes = 5; + } +#endif if (g_cm_server_num == CMS_ONE_PRIMARY_ONE_STANDBY) { GetTwoNodesArbitrateParams(); } + #ifdef ENABLE_MULTIPLE_NODES write_runlog(LOG, "reload cm_server parameters:\n" @@ -378,13 +385,13 @@ static void ReloadParametersFromConfigfile() "datastorage_threshold_check_interval=%d,\n" " max_datastorage_threshold_check=%d, enableSetReadOnly=%s, enableSetReadOnlyThreshold=%u, " "switch_rto=%d, force_promote=%d, cluster_starting_aribt_delay=%u, enable_e2e_rto=%u, " - "g_delayArbiTime=%u, g_clusterArbiTime=%d.\n", + "g_delayArbiTime=%u, g_clusterArbiTime=%d, wait_static_primary_times=%u.\n", log_min_messages, maxLogFileSize, sys_log_path, g_alarmComponentPath, g_alarmReportInterval, instance_heartbeat_timeout, g_ddbArbicfg.haHeartBeatTimeOut, cmserver_self_vote_timeout, g_ddbArbicfg.haStatusInterval, cmserver_ha_connect_timeout, instance_failover_delay_timeout, datastorage_threshold_check_interval, max_datastorage_threshold_check, g_enableSetReadOnly, g_readOnlyThreshold, switch_rto, force_promote, g_clusterStartingArbitDelay, - g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime); + g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime, g_waitStaticPrimaryTimes); #endif } diff --git a/src/cm_server/cms_rhb.cpp b/src/cm_server/cms_rhb.cpp index 6e14700..88a55eb 100644 --- a/src/cm_server/cms_rhb.cpp +++ b/src/cm_server/cms_rhb.cpp @@ -95,31 +95,13 @@ void GetRhbStat(time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM], unsigned int *hwl) securec_check_errno(rc, (void)rc); } -MaxClusterResStatus GetNodesConnStatByRhb(int resIdx1, int resIdx2, int timeout) -{ - if (timeout == 0) { - return MAX_CLUSTER_STATUS_AVAIL; - } - - if (g_hbs[resIdx1][resIdx2] == 0 || g_hbs[resIdx2][resIdx1] == 0) { - return MAX_CLUSTER_STATUS_INIT; - } - - time_t curTime = time(NULL); - if (IsRhbTimeout(g_hbs[resIdx1][resIdx2], curTime, timeout) || - IsRhbTimeout(g_hbs[resIdx2][resIdx1], curTime, timeout)) { - return MAX_CLUSTER_STATUS_UNAVAIL; - } - return MAX_CLUSTER_STATUS_AVAIL; -} - void ResetNodeConnStat() { errno_t rc = memset_s(g_hbs, sizeof(g_hbs), 0, sizeof(g_hbs)); securec_check_errno(rc, (void)rc); } -void PrintOneHbInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel) +static void PrintOneHbInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel) { struct tm result; GetLocalTime(&g_hbs[resIdx1][resIdx2], &result); @@ -135,3 +117,10 @@ void PrintHbsInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int PrintOneHbInfo(resIdx1, nodeId1, resIdx2, nodeId2, logLevel); PrintOneHbInfo(resIdx2, nodeId2, resIdx1, nodeId1, logLevel); } + +void GetTimeStr(time_t baseTime, char *timeStr, uint32 strLen) +{ + struct tm result; + GetLocalTime(&baseTime, &result); + (void)strftime(timeStr, strLen, "%Y-%m-%d %H:%M:%S", &result); +} diff --git a/src/include/cm/cm_adapter/cm_sharedisk/cm_voting_disk.h b/src/include/cm/cm_adapter/cm_sharedisk/cm_voting_disk.h index c32816c..eb2be21 100644 --- a/src/include/cm/cm_adapter/cm_sharedisk/cm_voting_disk.h +++ b/src/include/cm/cm_adapter/cm_sharedisk/cm_voting_disk.h @@ -42,7 +42,7 @@ status_t GetVotingDiskData(char *data, uint32 dataLen, uint32 offset); status_t GetVotingDiskSingleNodeInfo(VotingDiskNodeInfo *nodeInfo, uint32 nodeIndex); status_t InitVotingDiskHandler(const char *scsiDev, uint32 offset); status_t InitVotingDisk(const char *votingDiskPath); -status_t UpdateAllNodeHeartBeat(); +status_t UpdateAllNodeHeartBeat(uint32 nodeNum); void ResetVotingdiskHeartBeat(); VotingDiskStatus GetNodeHeartbeatStat(uint32 nodeIndex, uint32 diskTimeout, int logLevel); status_t AllocVotingDiskMem(); diff --git a/src/include/cm/cm_agent/cma_instance_management_res.h b/src/include/cm/cm_agent/cma_instance_management_res.h index 1df7cc4..ae4bc9c 100644 --- a/src/include/cm/cm_agent/cma_instance_management_res.h +++ b/src/include/cm/cm_agent/cma_instance_management_res.h @@ -30,7 +30,7 @@ status_t StartOneResInst(const CmResConfList *conf); void StopOneResInst(const CmResConfList *conf); void OneResInstShutdown(const CmResConfList *oneResConf); -status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId); +status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId, bool8 needNohup); status_t UnregOneResInst(const CmResConfList *conf, uint32 destInstId); ResIsregStatus IsregOneResInst(const CmResConfList *conf, uint32 destInstId); status_t CleanOneResInst(const CmResConfList *conf); diff --git a/src/include/cm/cm_misc.h b/src/include/cm/cm_misc.h index af06770..141b0f1 100644 --- a/src/include/cm/cm_misc.h +++ b/src/include/cm/cm_misc.h @@ -168,6 +168,7 @@ typedef struct CmResConfListSt { uint32 cmInstanceId; uint32 resInstanceId; ResStatusCheckInfo checkInfo; + int resType; } CmResConfList; // instance type before INST_TYPE_UNKNOWN shouldn't be change diff --git a/src/include/cm/cm_server/cms_arbitrate_datanode_pms_utils.h b/src/include/cm/cm_server/cms_arbitrate_datanode_pms_utils.h index 8fd16ad..f1f99c1 100644 --- a/src/include/cm/cm_server/cms_arbitrate_datanode_pms_utils.h +++ b/src/include/cm/cm_server/cms_arbitrate_datanode_pms_utils.h @@ -44,7 +44,6 @@ typedef struct DnArbitInfo_t { uint32 maxTerm; } DnArbitInfo; -const uint32 DATANODE_ARBITE_DELAY = 6; extern bool CheckPotentialTermRollback(); extern void GroupStatusShow(const char *str, const uint32 groupIndex, const uint32 instanceId, diff --git a/src/include/cm/cm_server/cms_global_params.h b/src/include/cm/cm_server/cms_global_params.h index 0d7c657..d6dfa2b 100644 --- a/src/include/cm/cm_server/cms_global_params.h +++ b/src/include/cm/cm_server/cms_global_params.h @@ -475,6 +475,7 @@ extern uint32 g_delayArbiTime; extern int32 g_clusterArbiTime; extern bool g_isPauseArbitration; extern char g_cmManualPausePath[MAX_PATH_LEN]; +extern uint32 g_waitStaticPrimaryTimes; extern void clean_init_cluster_state(); extern void instance_delay_arbitrate_time_out_direct_clean(uint32 group_index, int member_index, diff --git a/src/include/cm/cm_server/cms_rhb.h b/src/include/cm/cm_server/cms_rhb.h index e296c4f..ce80a38 100644 --- a/src/include/cm/cm_server/cms_rhb.h +++ b/src/include/cm/cm_server/cms_rhb.h @@ -26,13 +26,14 @@ #define CMS_RHB_H #include -#include "cms_arbitrate_cluster.h" + +#define TIME_STR_MAX_LEN 20 void InitDbListsByStaticConfig(); void RefreshNodeRhbInfo(unsigned int nodeId, const time_t *hbs, unsigned int hwl); -MaxClusterResStatus GetNodesConnStatByRhb(int resIdx1, int resIdx2, int timeout); void GetRhbStat(time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM], unsigned int *hwl); void ResetNodeConnStat(); void PrintHbsInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel); +void GetTimeStr(time_t baseTime, char *timeStr, uint32 strLen); #endif diff --git a/tool/cm_tool/cm_install b/tool/cm_tool/cm_install index 15b045c..4430af5 100755 --- a/tool/cm_tool/cm_install +++ b/tool/cm_tool/cm_install @@ -288,7 +288,7 @@ General options: isLocal = False if host == self.localhostName: isLocal = True - findPrimaryCmd = "source %s; gs_ctl query -D %s | grep 'local_role.*Primary' > /dev/null" % \ + findPrimaryCmd = "source %s; gs_ctl query -D %s | grep -i 'local_role.*Primary' > /dev/null" % \ (self.envFile, self.nodesInfo[host]["dataPath"]) notPrimary, output = executeCmdOnHost(host, findPrimaryCmd, isLocal) if notPrimary == 0: