!143 【5.0.1 补丁版本】 主干bugfix回合5.0.1版本

Merge pull request !143 from zhangxubo/5.0.1
This commit is contained in:
opengauss_bot 2023-08-12 02:36:11 +00:00 committed by Gitee
commit c52edcc563
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
25 changed files with 224 additions and 156 deletions

View File

@ -1,2 +1,2 @@
PRODUCT="openGauss"
VERSION="5.0.0"
VERSION="5.0.1"

View File

@ -112,14 +112,14 @@ status_t SetVotingDiskNodeData(char *data, uint32 dataLen)
return CM_SUCCESS;
}
status_t UpdateAllNodeHeartBeat()
status_t UpdateAllNodeHeartBeat(uint32 nodeNum)
{
uint32 dataLen = VOTING_DISK_DATA_SIZE;
uint32 dataLen = nodeNum * VOTING_DISK_EACH_NODE_OFFSET;
if (GetVotingDiskNodeData(g_nodeDataBuff, dataLen) != CM_SUCCESS) {
write_runlog(ERROR, "[%s] get voting disk node data failed.\n", __FUNCTION__);
return CM_ERROR;
}
for (uint32 i = 0; i < VOTING_DISK_MAX_NODE_NUM; i++) {
for (uint32 i = 0; i < nodeNum; i++) {
uint32 offset = i * VOTING_DISK_EACH_NODE_OFFSET;
VotingDiskNodeInfo *nodeInfo = (VotingDiskNodeInfo*)(g_nodeDataBuff + offset);
if (nodeInfo->nodeTime == 0) {

View File

@ -71,8 +71,14 @@ static int CusResCmdExecute(const char *scriptPath, const char *oper, uint32 tim
status_t StartOneResInst(const CmResConfList *conf)
{
int ret;
char oper[MAX_OPTION_LEN] = {0};
int ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s", conf->resInstanceId, conf->arg);
if (conf->resType == CUSTOM_RESOURCE_DN && undocumentedVersion > 0) {
ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s '-u %u'", conf->resInstanceId,
conf->arg, undocumentedVersion);
} else {
ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-start %u %s", conf->resInstanceId, conf->arg);
}
securec_check_intval(ret, (void)ret);
ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, CM_FALSE);
@ -118,13 +124,13 @@ void OneResInstClean(const CmResConfList *oneResConf)
}
}
status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId)
status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId, bool8 needNohup)
{
char oper[MAX_OPTION_LEN] = {0};
int ret = snprintf_s(oper, MAX_OPTION_LEN, MAX_OPTION_LEN - 1, "-reg %u %s", destInstId, conf->arg);
securec_check_intval(ret, (void)ret);
ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, CM_TRUE);
ret = CusResCmdExecute(conf->script, oper, (uint32)conf->checkInfo.timeOut, needNohup);
if (ret != 0) {
write_runlog(ERROR, "[%s]: cmd:(%s %s) execute failed, ret=%d.\n", __FUNCTION__, conf->script, oper, ret);
return CM_ERROR;
@ -251,7 +257,7 @@ int CheckOneResInst(const CmResConfList *conf)
return ret;
}
static void ManualStopLocalResInst(CmResConfList *conf)
static status_t ManualStopOneLocalResInst(CmResConfList *conf)
{
char instanceStartFile[MAX_PATH_LEN] = {0};
int ret = snprintf_s(instanceStartFile, MAX_PATH_LEN, MAX_PATH_LEN - 1,
@ -260,7 +266,7 @@ static void ManualStopLocalResInst(CmResConfList *conf)
if (CmFileExist(instanceStartFile)) {
write_runlog(LOG, "instanceStartFile(%s) is exist, can't create again.\n", instanceStartFile);
return;
return CM_SUCCESS;
}
char command[MAX_PATH_LEN] = {0};
@ -272,8 +278,28 @@ static void ManualStopLocalResInst(CmResConfList *conf)
ret = system(command);
if (ret != 0) {
write_runlog(ERROR, "manual stop res(%s) inst(%u) failed, ret=%d.\n", conf->resName, conf->resInstanceId, ret);
} else {
write_runlog(LOG, "manual stop res(%s) inst(%u) success.\n", conf->resName, conf->resInstanceId);
return CM_ERROR;
}
write_runlog(LOG, "manual stop res(%s) inst(%u) success.\n", conf->resName, conf->resInstanceId);
return CM_SUCCESS;
}
static status_t ManuallStopAllLocalResInst()
{
status_t result = CM_SUCCESS;
for (uint32 i = 0; i < GetLocalResConfCount(); ++i) {
if (ManualStopOneLocalResInst(&g_resConf[i]) != CM_SUCCESS) {
result = CM_ERROR;
}
}
return result;
}
static void ManualStopLocalResInst(CmResConfList *conf)
{
if (ManuallStopAllLocalResInst() == CM_SUCCESS) {
CleanOneInstCheckCount(conf);
}
}
@ -290,21 +316,17 @@ bool IsInstManualStopped(uint32 instId)
return false;
}
static bool CanCusInstDoRestart(const CmResConfList *conf)
static inline void RestartOneResInst(CmResConfList *conf)
{
ResIsregStatus stat = IsregOneResInst(conf, conf->resInstanceId);
if ((stat == CM_RES_ISREG_REG) || (stat == CM_RES_ISREG_NOT_SUPPORT)) {
return true;
if ((stat != CM_RES_ISREG_REG) && (stat != CM_RES_ISREG_NOT_SUPPORT)) {
if (RegOneResInst(conf, conf->resInstanceId, CM_FALSE) != CM_SUCCESS) {
write_runlog(LOG, "cur inst(%u) isreg stat=(%u), and reg failed, restart failed.\n",
conf->cmInstanceId, (uint32)stat);
return;
}
}
write_runlog(LOG, "cur inst(%u) isreg stat=(%u), can't do restart.\n", conf->cmInstanceId, (uint32)stat);
return false;
}
static inline status_t RestartOneResInst(CmResConfList *conf)
{
(void)CleanOneResInst(conf);
CM_RETURN_IFERR(StartOneResInst(conf));
return CM_SUCCESS;
(void)StartOneResInst(conf);
}
static void ProcessOfflineInstance(CmResConfList *conf)
@ -312,9 +334,7 @@ static void ProcessOfflineInstance(CmResConfList *conf)
long curTime = GetCurMonotonicTimeSec();
if (conf->checkInfo.restartTimes == -1) {
if (CanCusInstDoRestart(conf)) {
(void)RestartOneResInst(conf);
}
RestartOneResInst(conf);
return;
}
if (conf->checkInfo.brokeTime == 0) {
@ -338,10 +358,7 @@ static void ProcessOfflineInstance(CmResConfList *conf)
conf->resName, conf->resInstanceId, conf->checkInfo.startTime, conf->checkInfo.restartPeriod);
return;
}
if (!CanCusInstDoRestart(conf)) {
return;
}
CM_RETVOID_IFERR(RestartOneResInst(conf));
RestartOneResInst(conf);
conf->checkInfo.startCount++;
conf->checkInfo.startTime = curTime;
write_runlog(LOG, "res(%s) inst(%u) has been restart (%d) times, restart more than (%d) time will manually stop.\n",
@ -376,7 +393,7 @@ static void ProcessAbnormalInstance(CmResConfList *conf)
write_runlog(LOG, "res(%s) inst(%u) has been abnormal (%d)s, >= timeout(%d)s, need restart.\n",
conf->resName, conf->cmInstanceId, duration, conf->checkInfo.abnormalTimeout);
CM_RETVOID_IFERR(RestartOneResInst(conf));
RestartOneResInst(conf);
conf->checkInfo.startCount++;
conf->checkInfo.startTime = curTime;
@ -625,10 +642,8 @@ static status_t InitLocalAllDnResInstConf(const CusResConfJson *resJson, CmResCo
static status_t InitLocalOneResConf(const OneCusResConfJson *oneResJson)
{
CmResConfList newLocalConf;
errno_t rc = memset_s(&newLocalConf, sizeof(CmResConfList), 0, sizeof(CmResConfList));
securec_check_errno(rc, (void)rc);
CmResConfList newLocalConf = {{0}};
newLocalConf.resType = (int)oneResJson->resType;
if (oneResJson->resType == CUSTOM_RESOURCE_APP) {
CM_RETURN_IFERR(InitLocalCommConfOfDefRes(&oneResJson->appResConf, &newLocalConf));
CM_RETURN_IFERR(InitLocalAllAppResInstConf(&oneResJson->appResConf, &newLocalConf));

View File

@ -356,7 +356,7 @@ static void ProcessRegResInst(const CmsNotifyAgentRegMsg *recvMsg)
} else if ((isreg == CM_RES_ISREG_UNREG) || (isreg == CM_RES_ISREG_PENDING) || (isreg == CM_RES_ISREG_UNKNOWN)) {
write_runlog(LOG, "before reg res inst, need clean res inst first.\n");
if ((CheckOneResInst(local) == CUS_RES_CHECK_STAT_OFFLINE) || (CleanOneResInst(local) == CM_SUCCESS)) {
(void)RegOneResInst(local, recvMsg->resInstId);
(void)RegOneResInst(local, recvMsg->resInstId, CM_TRUE);
}
} else if (isreg == CM_RES_ISREG_NOT_SUPPORT) {
write_runlog(LOG, "res inst[%s:%u] don't support reg, not need reg.\n", recvMsg->resName, recvMsg->resInstId);

View File

@ -181,16 +181,15 @@ int ssh_exec(const staticNodeConfig* node, const char* cmd, int32 logLevel)
for (uint32 ii = 0; ii < node->sshCount; ii++) {
if (mpp_env_separate_file[0] == '\0') {
ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1,
"pssh %s -s -H %s \"( %s ) > %s 2>&1\" < %s > /dev/null 2>&1",
PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd, "/dev/null", "/dev/null");
securec_check_intval(ret, (void)ret);
"pssh %s -s -H %s \"( %s ) > %s 2>&1\" > /dev/null 2>&1",
PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd, "/dev/null");
} else {
ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1,
"pssh %s -s -H %s \"( source %s;%s ) > %s 2>&1\" < %s > /dev/null 2>&1",
"pssh %s -s -H %s \"( source %s;%s ) > %s 2>&1\" > /dev/null 2>&1",
PSSH_TIMEOUT_OPTION, node->sshChannel[ii], mpp_env_separate_file, cmd,
"/dev/null", "/dev/null");
securec_check_intval(ret, (void)ret);
"/dev/null");
}
securec_check_intval(ret, (void)ret);
rc = system(actualCmd);
if (rc != 0) {
write_runlog(logLevel, "ssh failed at \"%s\".\n", node->sshChannel[ii]);

View File

@ -43,6 +43,8 @@ static char g_confFile[CM_PATH_LENGTH];
extern char g_appPath[MAXPGPATH];
extern char mpp_env_separate_file[MAXPGPATH];
static status_t CheckGucOption(const GucOption &gucCtx);
static inline void SkipSpace(char *&ptr)
{
while (isspace((unsigned char)*ptr)) {
@ -690,39 +692,6 @@ status_t ExeGucCommand(const GucOption *gucCtx)
return result;
}
status_t ProcessInLocalInstance(const GucOption *gucCtx)
{
errno_t rc;
char cmDir[CM_PATH_LENGTH] = { 0 };
char instanceDir[CM_PATH_LENGTH] = { 0 };
rc = memcpy_s(cmDir, sizeof(cmDir), g_currentNode->cmDataPath, sizeof(cmDir));
securec_check_errno(rc, (void)rc);
if (cmDir[0] == '\0') {
write_runlog(ERROR, "Failed to get cm base data path from static config file.");
return CM_ERROR;
}
if (gucCtx->nodeType == NODE_TYPE_AGENT) {
rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_agent", cmDir);
securec_check_intval(rc, (void)rc);
} else {
if (g_currentNode->cmServerLevel != 1) {
write_runlog(LOG, "There is no cmserver instance on local node.");
return CM_ERROR;
}
rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_server", cmDir);
securec_check_intval(rc, (void)rc);
}
GetInstanceConfigfile(gucCtx->nodeType, instanceDir);
if (ExeGucCommand(gucCtx) != CM_SUCCESS) {
return CM_ERROR;
}
return CM_SUCCESS;
}
static uint32 GetNodeIndex(uint32 nodeId)
{
for (uint32 i = 0; i < g_node_num; ++i) {
@ -784,18 +753,56 @@ static status_t ListRemoteConfMain(staticNodeConfig *node, const char *cmd)
return CM_ERROR;
}
status_t ProcessInLocalInstanceExec(const GucOption *gucCtx)
{
errno_t rc;
char cmDir[CM_PATH_LENGTH] = { 0 };
char instanceDir[CM_PATH_LENGTH] = { 0 };
rc = memcpy_s(cmDir, sizeof(cmDir), g_currentNode->cmDataPath, sizeof(cmDir));
securec_check_errno(rc, (void)rc);
if (cmDir[0] == '\0') {
write_runlog(ERROR, "Failed to get cm base data path from static config file.");
return CM_ERROR;
}
if (gucCtx->nodeType == NODE_TYPE_AGENT) {
rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_agent", cmDir);
securec_check_intval(rc, (void)rc);
} else {
if (g_currentNode->cmServerLevel != 1) {
write_runlog(LOG, "There is no cmserver instance on local node.");
return CM_ERROR;
}
rc = snprintf_s(instanceDir, sizeof(instanceDir), sizeof(instanceDir) - 1, "%s/cm_server", cmDir);
securec_check_intval(rc, (void)rc);
}
GetInstanceConfigfile(gucCtx->nodeType, instanceDir);
if (ExeGucCommand(gucCtx) != CM_SUCCESS) {
return CM_ERROR;
}
return CM_SUCCESS;
}
status_t ProcessInLocalInstance(const CtlOption *ctx)
{
if (CheckGucOption(ctx->guc) != CM_SUCCESS) {
return CM_ERROR;
}
if (ctx->guc.gucCommand == SET_CONF_COMMAND && CheckGucOptionValidate(ctx->guc) != CM_SUCCESS) {
DoAdvice();
return CM_ERROR;
}
return ProcessInLocalInstanceExec(&ctx->guc);
}
static status_t ProcessInRemoteInstance(const CtlOption *ctx)
{
char remoteCmd[MAX_COMMAND_LEN] = {0};
if (ctx->comm.nodeId == g_currentNode->node) {
if (ProcessInLocalInstance(&ctx->guc) == CM_ERROR) {
write_runlog(DEBUG1, "cm_ctl fail to execute in local.\n");
return CM_ERROR;
}
return CM_SUCCESS;
}
GetRemoteGucCommand(ctx, remoteCmd, sizeof(remoteCmd));
if (ctx->guc.gucCommand == LIST_CONF_COMMAND) {
return ListRemoteConfMain(&g_node[GetNodeIndex(ctx->comm.nodeId)], remoteCmd);
@ -817,8 +824,10 @@ static status_t ProcessInAllNodesInstance(CtlOption *ctx)
continue;
}
ctx->comm.nodeId = g_node[i].node;
if (ProcessInRemoteInstance(ctx) == CM_ERROR) {
result = CM_ERROR;
if (ctx->comm.nodeId == g_currentNode->node) {
result = ProcessInLocalInstance(ctx);
} else {
result = ProcessInRemoteInstance(ctx);
}
}
@ -827,15 +836,21 @@ static status_t ProcessInAllNodesInstance(CtlOption *ctx)
status_t ProcessClusterGucOption(CtlOption *ctx)
{
status_t result;
if (ctx->comm.nodeId == 0) {
result = ProcessInAllNodesInstance(ctx);
} else {
result = ProcessInRemoteInstance(ctx);
return ProcessInAllNodesInstance(ctx);
}
return result;
if (ctx->comm.nodeId != g_currentNode->node) {
return ProcessInRemoteInstance(ctx);
}
status_t res = ProcessInLocalInstance(ctx);
if (res == CM_ERROR) {
write_runlog(DEBUG1, "cm_ctl fail to execute in local.\n");
}
return res;
}
static status_t CheckGucOption(const GucOption &gucCtx)
@ -855,22 +870,10 @@ static status_t CheckGucOption(const GucOption &gucCtx)
// cm_ctl integration guc set reload and check capacity
int DoGuc(CtlOption *ctx)
{
if (CheckGucOption(ctx->guc) != CM_SUCCESS) {
return 1;
}
status_t res = ProcessClusterGucOption(ctx);
PrintResults(res == CM_SUCCESS, ctx);
if ((ctx->guc.gucCommand == SET_CONF_COMMAND) && (CheckGucOptionValidate(ctx->guc) != CM_SUCCESS)) {
DoAdvice();
return 1;
}
if (ProcessClusterGucOption(ctx) != CM_SUCCESS) {
PrintResults(false, ctx);
return 1;
}
PrintResults(true, ctx);
return 0;
return (int)res;
}
static void MemsetPassword(char **password)

View File

@ -1656,6 +1656,7 @@ char *DoConcatCmd(const CtlOption *ctx)
int rc = memset_s(cmd, CM_PATH_LENGTH, 0, CM_PATH_LENGTH);
securec_check_errno(rc, (void)rc);
if (DoCheckRole(&ctx->dcfOption) == -1) {
free(cmd);
return NULL;
}

View File

@ -127,6 +127,9 @@ const char *g_cmsParamInfo[] = {
"cms_enable_failover_on2nodes|bool|0,0|NULL|NULL|",
"cms_enable_db_crash_recovery|bool|0,0|NULL|NULL|",
"cms_network_isolation_timeout|int|10,2147483647|NULL|NULL|",
#ifndef ENABLE_PRIVATEGAUSS
"wait_static_primary_times|int|5,2147483647|NULL|NULL|",
#endif
};
const char *g_valueTypeStr[] = {
@ -304,6 +307,7 @@ char *GetParamLineInfo(const char *paramName, const char * const *paramInfos, in
if (paramInfos == NULL) {
write_runlog(ERROR, "Fail to get param info.\n");
free(info);
return NULL;
}

View File

@ -140,7 +140,7 @@ static status_t SetResBaseInfoInArray(ResBaseInfo *info, const cJSON *resArray,
}
resName = GetValueStrFromCJson(item, RES_NAME);
if (resName == NULL) {
resName = NULL;
resName = PRINT_NULL;
} else {
isCanPrint = CM_TRUE;
}
@ -188,7 +188,7 @@ static void PrintAllResInfoBody(const ResBaseInfo *info, const cJSON *resArray)
}
resName = GetValueStrFromCJson(item, RES_NAME);
if (resName == NULL) {
resName = NULL;
resName = PRINT_NULL;
}
resType = GetValueStrFromCJson(item, RESOURCE_TYPE);
if (resType == NULL) {

View File

@ -87,4 +87,6 @@ cms_enable_db_crash_recovery = false # used in 2 nodes cluster. when network re
cms_network_isolation_timeout = 20 # cms judges the network is isolated when it finds ddb cluster is not sync with each other nodes,
# after cms_network_isolation_timeout times.
# default 20
wait_static_primary_times = 6 # Time to wait for the primary recovery after the primary stopped unexpectedly.
# default value is 6
############### must leave a new line at the end ###################

View File

@ -80,6 +80,13 @@ typedef enum MaxClusterStatEn {
MAX_CLUSTER_EXCLUDE,
} MaxClusterStat;
typedef struct CurCmRhbStatSt {
uint32 hwl;
time_t baseTime;
time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM];
} CurCmRhbStat;
static CurCmRhbStat g_curRhbStat = {0};
static const int32 CHECK_DELAY_IN_ROLE_CHANGING = 10;
static MaxNodeCluster g_curCluster = {{0}};
@ -320,6 +327,23 @@ static status_t InitMaxNodeCluster(MaxNodeCluster *maxNodeCluster)
return CM_SUCCESS;
}
static MaxClusterResStatus GetNodesConnStatByRhb(int idx1, int idx2, int timeout)
{
if (timeout == 0) {
return MAX_CLUSTER_STATUS_AVAIL;
}
if (g_curRhbStat.hbs[idx1][idx2] == 0 || g_curRhbStat.hbs[idx2][idx1] == 0) {
return MAX_CLUSTER_STATUS_INIT;
}
if (IsRhbTimeout(g_curRhbStat.hbs[idx1][idx2], g_curRhbStat.baseTime, timeout) ||
IsRhbTimeout(g_curRhbStat.hbs[idx2][idx1], g_curRhbStat.baseTime, timeout)) {
return MAX_CLUSTER_STATUS_UNAVAIL;
}
return MAX_CLUSTER_STATUS_AVAIL;
}
static bool CheckPoint2PointConn(int32 resIdx1, int32 resIdx2)
{
MaxClusterResStatus connStatus = GetNodesConnStatByRhb(resIdx1, resIdx2, (int)g_agentNetworkTimeout);
@ -454,6 +478,8 @@ static void FindMaxNodeCluster(MaxNodeCluster *maxCluster)
{
NodeCluster *nodeCluster = &(maxCluster->nodeCluster);
nodeCluster->clusterNum = -1;
g_curRhbStat.baseTime = time(NULL);
GetRhbStat(g_curRhbStat.hbs, &g_curRhbStat.hwl);
// assume that all meet the conditions.
for (int32 i = nodeCluster->maxNodeNum - 1; i >= 0; --i) {
if (!IsAllResAvailInNode(i)) {
@ -885,23 +911,35 @@ static bool IsNodeInCluster(int32 resIdx, const MaxNodeCluster *nodeCluster)
return false;
}
static void PrintRhbStatus()
static void PrintOneRhbLine(time_t *timeArr)
{
uint32 hwl = 0;
time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM] = {{0}};
GetRhbStat(hbs, &hwl);
char *rhbStr = GetRhbSimple((time_t *)hbs, MAX_RHB_NUM, hwl, time(NULL), g_agentNetworkTimeout);
CM_RETURN_IF_NULL(rhbStr);
size_t rhbLen = strlen(rhbStr);
if (rhbLen >= MAX_LOG_BUFF_LEN) {
write_runlog(LOG, "rhbStr len(%lu) is exceed max log buff len(%d), can't print network stat.\n",
rhbLen, MAX_LOG_BUFF_LEN);
FREE_AND_RESET(rhbStr);
return;
int ret;
errno_t rc;
char rhbStr[MAX_PATH_LEN] = {0};
const uint32 maxInfoLen = TIME_STR_MAX_LEN + 1;
for (uint32 i = 0; i < g_curRhbStat.hwl; ++i) {
char info[maxInfoLen] = {0};
char timeBuf[TIME_STR_MAX_LEN] = {0};
GetTimeStr(timeArr[i], timeBuf, TIME_STR_MAX_LEN);
ret = snprintf_s(info, maxInfoLen, maxInfoLen - 1, "%s|", timeBuf);
securec_check_intval(ret, (void)ret);
rc = strncat_s(rhbStr, MAX_PATH_LEN, info, strlen(info));
securec_check_errno(rc, (void)rc);
}
write_runlog(LOG, "[RHB] hb infos: |%s\n", rhbStr);
}
static void PrintAllRhbStatus()
{
char timeBuf[TIME_STR_MAX_LEN] = {0};
GetTimeStr(g_curRhbStat.baseTime, timeBuf, TIME_STR_MAX_LEN);
write_runlog(LOG, "Network timeout:%u\n", g_agentNetworkTimeout);
write_runlog(LOG, "Network stat('Y' means connected, otherwise 'N'):\n%s\n", rhbStr);
FREE_AND_RESET(rhbStr);
write_runlog(LOG, "Network base_time:%s\n", timeBuf);
for (uint32 i = 0; i < g_curRhbStat.hwl; ++i) {
PrintOneRhbLine(&g_curRhbStat.hbs[i][0]);
}
}
static void PrintKickOutResult(int32 resIdx, const MaxNodeCluster *maxCluster)
@ -927,11 +965,10 @@ static void PrintKickOutResult(int32 resIdx, const MaxNodeCluster *maxCluster)
if (!CheckPoint2PointConn(resIdx, maxCluster->nodeCluster.cluster[i])) {
write_runlog(LOG, "kick out result: (index=%d,nodeId=%u) disconnect with (index=%d,nodeId=%u).\n",
resIdx, GetNodeByPoint(resIdx), i, GetNodeByPoint(i));
PrintHbsInfo(resIdx, GetNodeByPoint(resIdx), i, GetNodeByPoint(i), LOG);
continue;
}
}
PrintRhbStatus();
PrintAllRhbStatus();
}
static void PrintArbitrateResult(const MaxNodeCluster *lastCluster, const MaxNodeCluster *curCluster)
@ -999,7 +1036,7 @@ static status_t CheckVotingDisk()
const uint32 timeout = 6;
uint32 time = timeout;
while (time > 0) {
if (UpdateAllNodeHeartBeat() == CM_SUCCESS) {
if (UpdateAllNodeHeartBeat(g_node_num) == CM_SUCCESS) {
return CM_SUCCESS;
}
time--;
@ -1028,6 +1065,8 @@ void *MaxNodeClusterArbitrateMain(void *arg)
write_runlog(FATAL, "Alloc voting disk memory failed!\n");
exit(-1);
}
g_curRhbStat.baseTime = time(NULL);
GetRhbStat(g_curRhbStat.hbs, &g_curRhbStat.hwl);
for (;;) {
if (got_stop) {
g_threadProcessStatus = THREAD_PROCESS_STOP;

View File

@ -2248,7 +2248,7 @@ static void InitDnArbCond(DnArbCtx *ctx)
ctx->cond.maxMemArbiTime = 0;
ctx->cond.instMainta = IsMaintance(ctx->maintaMode);
ctx->cond.switchoverIdx = INVALID_INDEX;
ctx->cond.arbitInterval = g_clusterStarting ? g_clusterStartingArbitDelay : DATANODE_ARBITE_DELAY;
ctx->cond.arbitInterval = g_clusterStarting ? g_clusterStartingArbitDelay : g_waitStaticPrimaryTimes;
ctx->cond.arbitStaticInterval = 5;
ctx->cond.setOffline = SetOfflineNode();
ctx->cond.snameAzDnCount = 0;

View File

@ -554,7 +554,7 @@ uint32 GetDnArbitateDelayTime(const DnArbCtx *ctx)
/* if static primary has finished redo, not need to wait for 180s */
cm_local_replconninfo *status = &(ctx->dnReport[cond->staticPriIdx].local_status);
if (status->local_role == INSTANCE_ROLE_STANDBY && status->disconn_mode == PROHIBIT_CONNECTION) {
return DATANODE_ARBITE_DELAY;
return g_waitStaticPrimaryTimes;
}
return cond->arbitInterval;
}

View File

@ -615,6 +615,12 @@ void get_parameters_from_configfile()
g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200);
g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6);
GetDnArbitrateMode();
#ifndef ENABLE_PRIVATEGAUSS
g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6);
if (g_waitStaticPrimaryTimes < 5) {
g_waitStaticPrimaryTimes = 5;
}
#endif
}
void clean_init_cluster_state()

View File

@ -328,6 +328,7 @@ uint32 g_delayArbiTime = 0;
int32 g_clusterArbiTime = 300;
bool g_isPauseArbitration = false;
char g_cmManualPausePath[MAX_PATH_LEN] = {0};
uint32 g_waitStaticPrimaryTimes = 6;
bool isLargerNode()
{

View File

@ -1727,7 +1727,7 @@ static int cm_server_process_startup_packet(int epollfd, CM_Connection* con, CM_
if ((con->port->user_name != NULL) && strncmp(con->port->user_name, pw->pw_name, SP_USER - 1)) {
write_runlog(WARNING, "invalid connection\n");
if (CmsSendAndFlushMsg(con, 'E', "invalid connection", CM_SERVER_PACKET_ERROR_MSG) != 0) {
if (CmsSendAndFlushMsg(con, 'E', "invalid connection", sizeof("invalid connection")) != 0) {
RemoveConnAfterSendMsgFailed(con);
write_runlog(ERROR, "[%s][line:%d] CmsSendAndFlushMsg fail.\n", __FUNCTION__, __LINE__);
}

View File

@ -343,11 +343,18 @@ static void ReloadParametersFromConfigfile()
g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200);
g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6);
GetDnArbitrateMode();
#ifndef ENABLE_PRIVATEGAUSS
g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6);
if (g_waitStaticPrimaryTimes < 5) {
g_waitStaticPrimaryTimes = 5;
}
#endif
if (g_cm_server_num == CMS_ONE_PRIMARY_ONE_STANDBY) {
GetTwoNodesArbitrateParams();
}
#ifdef ENABLE_MULTIPLE_NODES
write_runlog(LOG,
"reload cm_server parameters:\n"
@ -378,13 +385,13 @@ static void ReloadParametersFromConfigfile()
"datastorage_threshold_check_interval=%d,\n"
" max_datastorage_threshold_check=%d, enableSetReadOnly=%s, enableSetReadOnlyThreshold=%u, "
"switch_rto=%d, force_promote=%d, cluster_starting_aribt_delay=%u, enable_e2e_rto=%u, "
"g_delayArbiTime=%u, g_clusterArbiTime=%d.\n",
"g_delayArbiTime=%u, g_clusterArbiTime=%d, wait_static_primary_times=%u.\n",
log_min_messages, maxLogFileSize, sys_log_path, g_alarmComponentPath, g_alarmReportInterval,
instance_heartbeat_timeout, g_ddbArbicfg.haHeartBeatTimeOut, cmserver_self_vote_timeout,
g_ddbArbicfg.haStatusInterval, cmserver_ha_connect_timeout, instance_failover_delay_timeout,
datastorage_threshold_check_interval, max_datastorage_threshold_check, g_enableSetReadOnly,
g_readOnlyThreshold, switch_rto, force_promote, g_clusterStartingArbitDelay,
g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime);
g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime, g_waitStaticPrimaryTimes);
#endif
}

View File

@ -95,31 +95,13 @@ void GetRhbStat(time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM], unsigned int *hwl)
securec_check_errno(rc, (void)rc);
}
MaxClusterResStatus GetNodesConnStatByRhb(int resIdx1, int resIdx2, int timeout)
{
if (timeout == 0) {
return MAX_CLUSTER_STATUS_AVAIL;
}
if (g_hbs[resIdx1][resIdx2] == 0 || g_hbs[resIdx2][resIdx1] == 0) {
return MAX_CLUSTER_STATUS_INIT;
}
time_t curTime = time(NULL);
if (IsRhbTimeout(g_hbs[resIdx1][resIdx2], curTime, timeout) ||
IsRhbTimeout(g_hbs[resIdx2][resIdx1], curTime, timeout)) {
return MAX_CLUSTER_STATUS_UNAVAIL;
}
return MAX_CLUSTER_STATUS_AVAIL;
}
void ResetNodeConnStat()
{
errno_t rc = memset_s(g_hbs, sizeof(g_hbs), 0, sizeof(g_hbs));
securec_check_errno(rc, (void)rc);
}
void PrintOneHbInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel)
static void PrintOneHbInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel)
{
struct tm result;
GetLocalTime(&g_hbs[resIdx1][resIdx2], &result);
@ -135,3 +117,10 @@ void PrintHbsInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int
PrintOneHbInfo(resIdx1, nodeId1, resIdx2, nodeId2, logLevel);
PrintOneHbInfo(resIdx2, nodeId2, resIdx1, nodeId1, logLevel);
}
void GetTimeStr(time_t baseTime, char *timeStr, uint32 strLen)
{
struct tm result;
GetLocalTime(&baseTime, &result);
(void)strftime(timeStr, strLen, "%Y-%m-%d %H:%M:%S", &result);
}

View File

@ -42,7 +42,7 @@ status_t GetVotingDiskData(char *data, uint32 dataLen, uint32 offset);
status_t GetVotingDiskSingleNodeInfo(VotingDiskNodeInfo *nodeInfo, uint32 nodeIndex);
status_t InitVotingDiskHandler(const char *scsiDev, uint32 offset);
status_t InitVotingDisk(const char *votingDiskPath);
status_t UpdateAllNodeHeartBeat();
status_t UpdateAllNodeHeartBeat(uint32 nodeNum);
void ResetVotingdiskHeartBeat();
VotingDiskStatus GetNodeHeartbeatStat(uint32 nodeIndex, uint32 diskTimeout, int logLevel);
status_t AllocVotingDiskMem();

View File

@ -30,7 +30,7 @@
status_t StartOneResInst(const CmResConfList *conf);
void StopOneResInst(const CmResConfList *conf);
void OneResInstShutdown(const CmResConfList *oneResConf);
status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId);
status_t RegOneResInst(const CmResConfList *conf, uint32 destInstId, bool8 needNohup);
status_t UnregOneResInst(const CmResConfList *conf, uint32 destInstId);
ResIsregStatus IsregOneResInst(const CmResConfList *conf, uint32 destInstId);
status_t CleanOneResInst(const CmResConfList *conf);

View File

@ -168,6 +168,7 @@ typedef struct CmResConfListSt {
uint32 cmInstanceId;
uint32 resInstanceId;
ResStatusCheckInfo checkInfo;
int resType;
} CmResConfList;
// instance type before INST_TYPE_UNKNOWN shouldn't be change

View File

@ -44,7 +44,6 @@ typedef struct DnArbitInfo_t {
uint32 maxTerm;
} DnArbitInfo;
const uint32 DATANODE_ARBITE_DELAY = 6;
extern bool CheckPotentialTermRollback();
extern void GroupStatusShow(const char *str, const uint32 groupIndex, const uint32 instanceId,

View File

@ -475,6 +475,7 @@ extern uint32 g_delayArbiTime;
extern int32 g_clusterArbiTime;
extern bool g_isPauseArbitration;
extern char g_cmManualPausePath[MAX_PATH_LEN];
extern uint32 g_waitStaticPrimaryTimes;
extern void clean_init_cluster_state();
extern void instance_delay_arbitrate_time_out_direct_clean(uint32 group_index, int member_index,

View File

@ -26,13 +26,14 @@
#define CMS_RHB_H
#include <time.h>
#include "cms_arbitrate_cluster.h"
#define TIME_STR_MAX_LEN 20
void InitDbListsByStaticConfig();
void RefreshNodeRhbInfo(unsigned int nodeId, const time_t *hbs, unsigned int hwl);
MaxClusterResStatus GetNodesConnStatByRhb(int resIdx1, int resIdx2, int timeout);
void GetRhbStat(time_t hbs[MAX_RHB_NUM][MAX_RHB_NUM], unsigned int *hwl);
void ResetNodeConnStat();
void PrintHbsInfo(int resIdx1, uint32 nodeId1, int resIdx2, uint32 nodeId2, int logLevel);
void GetTimeStr(time_t baseTime, char *timeStr, uint32 strLen);
#endif

View File

@ -288,7 +288,7 @@ General options:
isLocal = False
if host == self.localhostName:
isLocal = True
findPrimaryCmd = "source %s; gs_ctl query -D %s | grep 'local_role.*Primary' > /dev/null" % \
findPrimaryCmd = "source %s; gs_ctl query -D %s | grep -i 'local_role.*Primary' > /dev/null" % \
(self.envFile, self.nodesInfo[host]["dataPath"])
notPrimary, output = executeCmdOnHost(host, findPrimaryCmd, isLocal)
if notPrimary == 0: