diff --git a/src/cm_ctl/ctl_param_check.cpp b/src/cm_ctl/ctl_param_check.cpp index 2a1558f..22d6782 100644 --- a/src/cm_ctl/ctl_param_check.cpp +++ b/src/cm_ctl/ctl_param_check.cpp @@ -130,6 +130,7 @@ const char *g_cmsParamInfo[] = { #ifndef ENABLE_PRIVATEGAUSS "wait_static_primary_times|int|5,2147483647|NULL|NULL|", #endif + "ss_double_cluster_mode|int|0,2|NULL|NULL|", }; const char *g_valueTypeStr[] = { diff --git a/src/cm_ctl/ctl_switchover.cpp b/src/cm_ctl/ctl_switchover.cpp index f223747..2c9cd3a 100644 --- a/src/cm_ctl/ctl_switchover.cpp +++ b/src/cm_ctl/ctl_switchover.cpp @@ -28,6 +28,8 @@ #include "ctl_common.h" #include "cm/libpq-int.h" #include "cm/cm_agent/cma_main.h" +#include "cm_elog.h" +#include "cm_msg.h" /* If DN switch take long time and do not complete, it will timeout, pending_command will be clear in server_main.cpp CM_ThreadMonitorMain(), the default g_wait_seconds is 180s, we need to increase the g_wait_seconds to 1200s. */ @@ -59,6 +61,7 @@ extern bool wait_seconds_set; extern int g_waitSeconds; extern CM_Conn* CmServer_conn; extern char *g_command_operation_azName; +SSDoubleClusterMode g_ssDoubleClusterMode = SS_DOUBLE_NULL; static int QueryNeedQuickSwitchInstances(int* need_quick_switchover_instance, NeedQuickSwitchoverInstanceArray* needQuickSwitchoverInstance, bool* is_cluster_balance, @@ -69,11 +72,16 @@ static int GetDatapathByInstanceId(uint32 instanceId, int instanceType, char* da static int JudgeInstanceRole(int instanceType, int member_index, int instance_role, const CommonOption *commCtx); static int JudgeDatanodeStatus(uint32 node_id, const char *data_path, int db_state); static int JudgeGtmStatus(uint32 node_id, const char *data_path, int gtm_state); +static void GetClusterMode(); static void SetSwitchoverOper(SwitchoverOper *oper, int32 localRole, uint32 instanceId) { if (localRole == INSTANCE_ROLE_STANDBY) { - oper->localRole = INSTANCE_ROLE_PRIMARY; + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + oper->localRole = 
INSTANCE_ROLE_MAIN_STANDBY; + } else { + oper->localRole = INSTANCE_ROLE_PRIMARY; + } oper->peerRole = INSTANCE_ROLE_STANDBY; } else if (localRole == INSTANCE_ROLE_CASCADE_STANDBY) { oper->localRole = INSTANCE_ROLE_STANDBY; @@ -101,7 +109,12 @@ static int DoSwitchoverBase(const CtlOption *ctx) cm_to_ctl_command_ack *ackMsg = NULL; cm_to_ctl_instance_status *instStatusPtr = NULL; cm_switchover_incomplete_msg *switchoverIncompletePtr = NULL; - SwitchoverOper oper = {INSTANCE_ROLE_PRIMARY, INSTANCE_ROLE_STANDBY}; + SwitchoverOper oper; /* assigned just below according to g_ssDoubleClusterMode */ + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + oper = {INSTANCE_ROLE_MAIN_STANDBY, INSTANCE_ROLE_STANDBY}; + } else { + oper = {INSTANCE_ROLE_PRIMARY, INSTANCE_ROLE_STANDBY}; + } // return conn to cm_server do_conn_cmserver(false, 0); @@ -1489,6 +1502,7 @@ static int GetDatapathByInstanceId(uint32 instanceId, int instanceType, char* da int DoSwitchover(const CtlOption *ctx) { + GetClusterMode(); /* loads g_ssDoubleClusterMode before any switchover path below is chosen */ if (ctx->switchover.switchoverAll) { if (switchover_all_quick && g_clusterType != V3SingleInstCluster) { return DoSwitchoverAllQuick(); @@ -1510,3 +1524,24 @@ int DoSwitchover(const CtlOption *ctx) return DoSwitchoverBase(ctx); } +/* GetClusterMode: copies g_currentNode->cmDataPath, builds the path cmDataPath/cm_agent/cm_agent.conf from it, and stores the ss_double_cluster_mode value read from that file in g_ssDoubleClusterMode. Logs an error and calls exit(-1) when the copied data path is empty. Takes no arguments and returns nothing. */ +static void GetClusterMode() +{ + errno_t rc; + char cmDir[CM_PATH_LENGTH] = { 0 }; + char configDir[CM_PATH_LENGTH] = { 0 }; + + rc = memcpy_s(cmDir, sizeof(cmDir), g_currentNode->cmDataPath, sizeof(cmDir)); /* copies sizeof(cmDir) bytes; NOTE(review): assumes cmDataPath is at least CM_PATH_LENGTH bytes long - confirm against its declaration */ + securec_check_errno(rc, (void)rc); + + if (cmDir[0] == '\0') { /* an empty first byte is treated as a missing CM data path */ + write_runlog(ERROR, "Failed to get cm base data path from static config file."); + exit(-1); + } + + rc = snprintf_s(configDir, sizeof(configDir), sizeof(configDir) - 1, "%s/cm_agent/cm_agent.conf", cmDir); /* the value is read from the cm_agent config file under the CM data path */ + securec_check_intval(rc, (void)rc); + + g_ssDoubleClusterMode = + (SSDoubleClusterMode)get_uint32_value_from_config(configDir, "ss_double_cluster_mode", SS_DOUBLE_NULL); /* SS_DOUBLE_NULL is the third argument, presumably the fallback when the key is absent - TODO confirm; NOTE(review): the uint32 result is cast to SSDoubleClusterMode with no range check in this function */ +} \ No newline at end of file diff --git a/src/cm_server/cm_server.centralized.conf.sample b/src/cm_server/cm_server.centralized.conf.sample index fddf8c4..1115f2a 
100644 --- a/src/cm_server/cm_server.centralized.conf.sample +++ b/src/cm_server/cm_server.centralized.conf.sample @@ -89,4 +89,5 @@ cms_network_isolation_timeout = 20 # cms judges the network is isolated when i # default 20 wait_static_primary_times = 6 # Time to wait for the primary recovery after the primary stopped unexpectedly. # default value is 6 +ss_double_cluster_mode = 0 # Cluster run mode for the ss double cluster scenario. Valid values: 0-2 ############### must leave a new line at the end ################### diff --git a/src/cm_server/cm_server.centralized_new.conf.sample b/src/cm_server/cm_server.centralized_new.conf.sample index 003098f..adadf21 100644 --- a/src/cm_server/cm_server.centralized_new.conf.sample +++ b/src/cm_server/cm_server.centralized_new.conf.sample @@ -84,4 +84,5 @@ cms_enable_db_crash_recovery = false # used in 2 nodes cluster. when network re cms_network_isolation_timeout = 20 # cms judges the network is isolated when it finds ddb cluster is not sync with each other nodes, # after cms_network_isolation_timeout times. # default 20 +ss_double_cluster_mode = 0 # Cluster run mode for the ss double cluster scenario. Valid values: 0-2 ############### must leave a new line at the end ################### diff --git a/src/cm_server/cm_server.conf.sample b/src/cm_server/cm_server.conf.sample index 4419b43..161f60c 100644 --- a/src/cm_server/cm_server.conf.sample +++ b/src/cm_server/cm_server.conf.sample @@ -84,4 +84,5 @@ cms_enable_db_crash_recovery = false # used in 2 nodes cluster. when network re cms_network_isolation_timeout = 20 # cms judges the network is isolated when it finds ddb cluster is not sync with each other nodes, # after cms_network_isolation_timeout times. 
# default 20 +ss_double_cluster_mode = 0 # Cluster run mode for the ss double cluster scenario. Valid values: 0-2 ############### must leave a new line at the end ################### diff --git a/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp b/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp index 0171108..d4220f5 100644 --- a/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp +++ b/src/cm_server/cms_arbitrate_datanode_pms_utils.cpp @@ -590,7 +590,7 @@ static void DnWillChangeStaticRole(const DnArbCtx *ctx, const char *str) if (ctx->localRole->role != cmdSour) { return; } - if (cmdPur == INSTANCE_ROLE_PRIMARY) { + if (cmdPur == INSTANCE_ROLE_PRIMARY || cmdPur == INSTANCE_ROLE_MAIN_STANDBY) { ChangeStaticRoleAndNotifyCn(ctx->groupIdx, ctx->memIdx); } else { ChangeDnMemberIndex(str, ctx->groupIdx, ctx->memIdx, cmdPur, cmdSour); diff --git a/src/cm_server/cms_common.cpp b/src/cm_server/cms_common.cpp index 9f19d27..3f080b9 100644 --- a/src/cm_server/cms_common.cpp +++ b/src/cm_server/cms_common.cpp @@ -614,6 +614,8 @@ void get_parameters_from_configfile() SECONDS_PER_DAY); g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200); g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6); + g_ssDoubleClusterMode = + (SSDoubleClusterMode)get_uint32_value_from_config(configDir, "ss_double_cluster_mode", SS_DOUBLE_NULL); GetDnArbitrateMode(); #ifndef ENABLE_PRIVATEGAUSS g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6); diff --git a/src/cm_server/cms_global_params.cpp b/src/cm_server/cms_global_params.cpp index 1e38d58..3fe2a27 100644 --- a/src/cm_server/cms_global_params.cpp +++ b/src/cm_server/cms_global_params.cpp @@ -251,6 +251,7 @@ char g_cmStaticConfigurePath[MAX_PATH_LEN] = {0}; cm_fenced_UDF_report_status *g_fenced_UDF_report_status_ptr = NULL; int *cn_dn_disconnect_times = NULL; int *g_lastCnDnDisconnectTimes = NULL; +SSDoubleClusterMode g_ssDoubleClusterMode = 
SS_DOUBLE_NULL; volatile switchover_az_mode cm_switchover_az_mode = AUTOSWITCHOVER_AZ; volatile logic_cluster_restart_mode cm_logic_cluster_restart_mode = INITIAL_LOGIC_CLUSTER_RESTART; diff --git a/src/cm_server/cms_global_params_utils.cpp b/src/cm_server/cms_global_params_utils.cpp index 233addb..0488fb3 100644 --- a/src/cm_server/cms_global_params_utils.cpp +++ b/src/cm_server/cms_global_params_utils.cpp @@ -48,8 +48,8 @@ void ChangeDnMemberIndex(const char *str, uint32 groupIdx, int32 memIdx, int32 i datanode_role_int_to_string(instTypePur)); instMem[i].role = instTypePur; cmd[i].role_changed = INSTANCE_ROLE_CHANGED; - } else if ((instTypePur == INSTANCE_ROLE_PRIMARY || peerInstId == instMem[i].instanceId) && - (i != memIdx) && instMem[i].role == instTypePur) { + } else if (((instTypePur == INSTANCE_ROLE_PRIMARY || instTypePur == INSTANCE_ROLE_MAIN_STANDBY) + || peerInstId == instMem[i].instanceId) && (i != memIdx) && instMem[i].role == instTypePur) { write_runlog(LOG, "%s: %d: instance(%u) static role(%s) will change to be %s.\n", str, __LINE__, instMem[i].instanceId, datanode_role_int_to_string(instMem[i].role), datanode_role_int_to_string(instTypeSor)); @@ -63,8 +63,13 @@ void ChangeDnMemberIndex(const char *str, uint32 groupIdx, int32 memIdx, int32 i void ChangeDnPrimaryMemberIndex(uint32 group_index, int primary_member_index) { if (g_one_master_multi_slave) { - ChangeDnMemberIndex("[ChangeDnPrimaryMemberIndex]", - group_index, primary_member_index, INSTANCE_ROLE_PRIMARY, INSTANCE_ROLE_STANDBY); + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + ChangeDnMemberIndex("[ChangeDnPrimaryMemberIndex]", + group_index, primary_member_index, INSTANCE_ROLE_MAIN_STANDBY, INSTANCE_ROLE_STANDBY); + } else { + ChangeDnMemberIndex("[ChangeDnPrimaryMemberIndex]", + group_index, primary_member_index, INSTANCE_ROLE_PRIMARY, INSTANCE_ROLE_STANDBY); + } } else { change_primary_member_index(group_index, primary_member_index); } @@ -79,11 +84,17 @@ void 
change_primary_member_index(uint32 group_index, int primary_member_index) for (int i = 0; i < count; i++) { /* Does not change dummy standby member index, only change primary and standby member index */ - if (i == primary_member_index && instanceMember[i].role != INSTANCE_ROLE_PRIMARY) { - instanceMember[i].role = INSTANCE_ROLE_PRIMARY; + if (i == primary_member_index && + (instanceMember[i].role != INSTANCE_ROLE_PRIMARY && instanceMember[i].role != INSTANCE_ROLE_MAIN_STANDBY)) { + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + instanceMember[i].role = INSTANCE_ROLE_MAIN_STANDBY; + } else { + instanceMember[i].role = INSTANCE_ROLE_PRIMARY; + } status[i].role_changed = INSTANCE_ROLE_CHANGED; SetDynamicConfigChangeToDdb(group_index, i); - } else if (i != primary_member_index && instanceMember[i].role == INSTANCE_ROLE_PRIMARY) { + } else if (i != primary_member_index && + (instanceMember[i].role == INSTANCE_ROLE_PRIMARY || instanceMember[i].role == INSTANCE_ROLE_MAIN_STANDBY)) { instanceMember[i].role = INSTANCE_ROLE_STANDBY; status[i].role_changed = INSTANCE_ROLE_CHANGED; SetDynamicConfigChangeToDdb(group_index, i); @@ -386,7 +397,11 @@ void SetSwitchoverCmd(cm_instance_command_status *cmd, int32 localRole, uint32 i cmd->command_status = INSTANCE_COMMAND_WAIT_EXEC; cmd->pengding_command = (int)MSG_CM_AGENT_SWITCHOVER; if (localRole == INSTANCE_ROLE_STANDBY) { - cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + cmd->cmdPur = INSTANCE_ROLE_MAIN_STANDBY; + } else { + cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + } cmd->cmdSour = INSTANCE_ROLE_STANDBY; } else if (localRole == INSTANCE_ROLE_CASCADE_STANDBY) { cmd->cmdPur = INSTANCE_ROLE_STANDBY; diff --git a/src/cm_server/cms_monitor_main.cpp b/src/cm_server/cms_monitor_main.cpp index 55f2f4a..e36eded 100644 --- a/src/cm_server/cms_monitor_main.cpp +++ b/src/cm_server/cms_monitor_main.cpp @@ -343,6 +343,8 @@ static void ReloadParametersFromConfigfile() 
GetDelayArbitClusterTimeFromConf(); g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200); g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6); + g_ssDoubleClusterMode = + (SSDoubleClusterMode)get_uint32_value_from_config(configDir, "ss_double_cluster_mode", SS_DOUBLE_NULL); GetDnArbitrateMode(); #ifndef ENABLE_PRIVATEGAUSS g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6); @@ -389,13 +391,15 @@ static void ReloadParametersFromConfigfile() "datastorage_threshold_check_interval=%d,\n" " max_datastorage_threshold_check=%d, enableSetReadOnly=%s, enableSetReadOnlyThreshold=%u, " "switch_rto=%d, force_promote=%d, cluster_starting_aribt_delay=%u, enable_e2e_rto=%u, " - "g_delayArbiTime=%u, g_clusterArbiTime=%d, wait_static_primary_times=%u, backup_open=%d.\n", + "g_delayArbiTime=%u, g_clusterArbiTime=%d, wait_static_primary_times=%u, backup_open=%d, " + "g_ssDoubleClusterMode=%d.\n", log_min_messages, maxLogFileSize, sys_log_path, g_alarmComponentPath, g_alarmReportInterval, instance_heartbeat_timeout, g_ddbArbicfg.haHeartBeatTimeOut, cmserver_self_vote_timeout, g_ddbArbicfg.haStatusInterval, cmserver_ha_connect_timeout, instance_failover_delay_timeout, datastorage_threshold_check_interval, max_datastorage_threshold_check, g_enableSetReadOnly, g_readOnlyThreshold, switch_rto, force_promote, g_clusterStartingArbitDelay, - g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime, g_waitStaticPrimaryTimes, backup_open); + g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime, g_waitStaticPrimaryTimes, backup_open, + g_ssDoubleClusterMode); #endif } diff --git a/src/cm_server/cms_process_messages.cpp b/src/cm_server/cms_process_messages.cpp index 1d77ff2..a3c9a7f 100644 --- a/src/cm_server/cms_process_messages.cpp +++ b/src/cm_server/cms_process_messages.cpp @@ -1064,14 +1064,16 @@ int isNodeBalanced(uint32 *switchedInstance) logicClusterId = 
get_logicClusterId_by_dynamic_dataNodeId( g_instance_role_group_ptr[i].instanceMember[0].instanceId); if (g_single_node_cluster && dnStat->local_role == INSTANCE_ROLE_NORMAL && - g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_PRIMARY) { + (g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_PRIMARY || + g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_MAIN_STANDBY)) { break; } - if ((dnStat->local_role == INSTANCE_ROLE_PRIMARY && + if (((dnStat->local_role == INSTANCE_ROLE_PRIMARY || dnStat->local_role == INSTANCE_ROLE_MAIN_STANDBY) && g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_STANDBY) || - (dnStat->local_role != INSTANCE_ROLE_PRIMARY && - g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_PRIMARY)) { + ((dnStat->local_role != INSTANCE_ROLE_PRIMARY && dnStat->local_role != INSTANCE_ROLE_MAIN_STANDBY) && + (g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_PRIMARY || + g_instance_role_group_ptr[i].instanceMember[j].instanceRoleInit == INSTANCE_ROLE_MAIN_STANDBY))) { if (switchedInstance != NULL) { switchedInstance[switchedCount] = g_instance_role_group_ptr[i].instanceMember[j].instanceId; } @@ -1156,8 +1158,10 @@ int switchoverFullDone(void) case INSTANCE_TYPE_DATANODE: if (g_instance_group_report_status_ptr[group_index].instance_status.command_member[member_index] .pengding_command != (int32)MSG_CM_AGENT_SWITCHOVER && + (g_instance_group_report_status_ptr[group_index].instance_status.data_node_member[member_index] + .local_status.local_role != INSTANCE_ROLE_PRIMARY && g_instance_group_report_status_ptr[group_index].instance_status.data_node_member[member_index] - .local_status.local_role != INSTANCE_ROLE_PRIMARY) { + .local_status.local_role != INSTANCE_ROLE_MAIN_STANDBY)) { (void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[group_index].lk_lock)); 
write_runlog(LOG, "the instance(node = %u instanceid = %u) switchover fail\n", switchOverInstances[i].node, switchOverInstances[i].instanceId); @@ -1166,7 +1170,8 @@ int switchoverFullDone(void) if (g_instance_group_report_status_ptr[group_index].instance_status.command_member[member_index] .pengding_command == (int32)MSG_CM_AGENT_SWITCHOVER) { for (int ii = 0; ii < g_instance_role_group_ptr[group_index].count; ii++) { - if (g_instance_role_group_ptr[group_index].instanceMember[ii].role == INSTANCE_ROLE_PRIMARY && + if ((g_instance_role_group_ptr[group_index].instanceMember[ii].role == INSTANCE_ROLE_PRIMARY || + g_instance_role_group_ptr[group_index].instanceMember[ii].role == INSTANCE_ROLE_MAIN_STANDBY) && g_instance_group_report_status_ptr[group_index].instance_status.data_node_member[ii] .local_status.db_state != INSTANCE_HA_STATE_NORMAL) { (void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[group_index].lk_lock)); @@ -1180,7 +1185,9 @@ int switchoverFullDone(void) if ((g_instance_group_report_status_ptr[group_index].instance_status.command_member[member_index] .pengding_command == MSG_CM_AGENT_SWITCHOVER) || (g_instance_group_report_status_ptr[group_index].instance_status.data_node_member[member_index] - .local_status.local_role != INSTANCE_ROLE_PRIMARY)) { + .local_status.local_role != INSTANCE_ROLE_PRIMARY && + g_instance_group_report_status_ptr[group_index].instance_status.data_node_member[member_index] + .local_status.local_role != INSTANCE_ROLE_MAIN_STANDBY)) { (void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[group_index].lk_lock)); write_runlog(LOG, "the instance(node = %u instanceid = %u) is executing switchover.\n", switchOverInstances[i].node, switchOverInstances[i].instanceId); @@ -1224,7 +1231,11 @@ void SwitchOverSetting(int time_out, int instanceType, uint32 ptrIndex, int memb &(g_instance_group_report_status_ptr[ptrIndex].instance_status.command_member[memberIndex]); cmd->command_status = INSTANCE_COMMAND_WAIT_EXEC; 
cmd->pengding_command = (int)MSG_CM_AGENT_SWITCHOVER; - cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + cmd->cmdPur = INSTANCE_ROLE_MAIN_STANDBY; + } else { + cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + } cmd->cmdSour = INSTANCE_ROLE_STANDBY; cmd->time_out = time_out; cmd->peerInstId = GetPeerInstId(ptrIndex, memberIndex); diff --git a/src/cm_server/cms_process_messages_ctl.cpp b/src/cm_server/cms_process_messages_ctl.cpp index a16d9fb..1405cd7 100644 --- a/src/cm_server/cms_process_messages_ctl.cpp +++ b/src/cm_server/cms_process_messages_ctl.cpp @@ -110,6 +110,7 @@ void ProcessCtlToCmSwitchoverMsg(MsgRecvInfo* recvMsgInfo, const ctl_to_cm_switc // tell cm_ctl will switchover to primary or standby ackMsg.pengding_command = localRole; + write_runlog(LOG, "ackMsg.pengding_command: %d\n", localRole); (void)RespondMsg(recvMsgInfo, 'S', (char *)(&ackMsg), sizeof(ackMsg)); if (ackMsg.command_result == CM_INVALID_COMMAND) { return; @@ -435,7 +436,11 @@ static void process_single_instance_switchover_info(switchover_instance *instanc cm_instance_command_status *cmd = &(instReport->command_member[j]); cmd->command_status = INSTANCE_COMMAND_WAIT_EXEC; cmd->pengding_command = (int)MSG_CM_AGENT_SWITCHOVER; - cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + cmd->cmdPur = INSTANCE_ROLE_MAIN_STANDBY; + } else { + cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + } cmd->cmdSour = INSTANCE_ROLE_STANDBY; cmd->peerInstId = GetPeerInstId(i, j); cmd->time_out = ctl_to_cm_swithover_ptr->wait_seconds; @@ -701,7 +706,9 @@ void ProcessCtlToCmSwitchoverAzMsg(MsgRecvInfo* recvMsgInfo, ctl_to_cm_switchove break; } else if (g_instance_role_group_ptr[i].instanceMember[j].instanceType == INSTANCE_TYPE_DATANODE && ((g_instance_group_report_status_ptr[i].instance_status.data_node_member[j] - .local_status.local_role == INSTANCE_ROLE_PRIMARY && sameAz))) { + .local_status.local_role == INSTANCE_ROLE_PRIMARY || + 
g_instance_group_report_status_ptr[i].instance_status.data_node_member[j] + .local_status.local_role == INSTANCE_ROLE_MAIN_STANDBY) && sameAz)) { primaryInstanceInTargetAZ = true; noNeedDoDnNum++; checkSwitchoverInstance = true; @@ -849,7 +856,8 @@ static int SwitchoverDone(void) int dnLocalRole = g_instance_group_report_status_ptr[i].instance_status.data_node_member[j] .local_status.local_role; bool enCheck = (CheckInstInSyncList(i, j, str) == SYNCLIST_IS_FINISTH); - if (initRole == INSTANCE_ROLE_PRIMARY && dnLocalRole != INSTANCE_ROLE_PRIMARY && + if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && + (dnLocalRole != INSTANCE_ROLE_PRIMARY && dnLocalRole != INSTANCE_ROLE_MAIN_STANDBY) && *command != (int)MSG_CM_AGENT_SWITCHOVER && enCheck) { if (localStatus == INSTANCE_HA_STATE_NORMAL) { set_pending_command(i, j, MSG_CM_AGENT_SWITCHOVER, SWITCHOVER_DEFAULT_WAIT); @@ -864,23 +872,26 @@ static int SwitchoverDone(void) } } - if (initRole == INSTANCE_ROLE_PRIMARY && dnLocalRole == INSTANCE_ROLE_STANDBY && + if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && dnLocalRole == INSTANCE_ROLE_STANDBY && localStatus == INSTANCE_HA_STATE_PROMOTING && *command == MSG_CM_AGENT_SWITCHOVER) { anyInitPrimarySwitchover = true; } /* must keep three or in this if condition, otherwise will result to some problem. 
*/ if (*command == MSG_CM_AGENT_SWITCHOVER && - ((dnLocalRole != INSTANCE_ROLE_PRIMARY && initRole == INSTANCE_ROLE_PRIMARY) || - (g_instance_role_group_ptr[i].instanceMember[j].role == INSTANCE_ROLE_PRIMARY && + (((dnLocalRole != INSTANCE_ROLE_PRIMARY && dnLocalRole != INSTANCE_ROLE_MAIN_STANDBY) && + (initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY)) || + ((g_instance_role_group_ptr[i].instanceMember[j].role == INSTANCE_ROLE_PRIMARY || + g_instance_role_group_ptr[i].instanceMember[j].role == INSTANCE_ROLE_MAIN_STANDBY) && localStatus != INSTANCE_HA_STATE_NORMAL))) { (void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[i].lk_lock)); write_runlog(LOG, "%s: inst(%u) is doing switchover.\n", str, instanceId); return SWITCHOVER_EXECING; } - if (*command != MSG_CM_AGENT_SWITCHOVER && dnLocalRole != INSTANCE_ROLE_PRIMARY && - initRole == INSTANCE_ROLE_PRIMARY) { + if (*command != MSG_CM_AGENT_SWITCHOVER && + (dnLocalRole != INSTANCE_ROLE_PRIMARY && dnLocalRole != INSTANCE_ROLE_MAIN_STANDBY) && + (initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY)) { write_runlog(LOG, "line %d: instanceId(%u) has not do switchover.\n", __LINE__, instanceId); dnCount++; partlySwitchover = true; @@ -2197,38 +2208,44 @@ void ProcessCtlToCmSwitchoverAllMsg(MsgRecvInfo* recvMsgInfo, const ctl_to_cm_sw bool isCatchUp = IsInCatchUpState(i, j); bool isCheckSyncList = (CheckInstInSyncList(i, j, str) == SYNCLIST_IS_FINISTH); if ((dnLocalRole == INSTANCE_ROLE_STANDBY || dnLocalRole == INSTANCE_ROLE_CASCADE_STANDBY) && - initRole == INSTANCE_ROLE_PRIMARY && localStatus == INSTANCE_HA_STATE_NORMAL && !isInVoteAz && + (initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && + localStatus == INSTANCE_HA_STATE_NORMAL && !isInVoteAz && !isCatchUp && isCheckSyncList) { SetSwitchoverInSwitchoverProcess(i, j, switchoverMsg->wait_seconds); needDoDnNum++; - } else if (initRole == INSTANCE_ROLE_PRIMARY && localStatus != 
INSTANCE_HA_STATE_NORMAL) { + } else if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && + localStatus != INSTANCE_HA_STATE_NORMAL) { write_runlog(LOG, "dn instance=%u status=%s, will not switchover for status is unNormal.\n", instanceId, datanode_dbstate_int_to_string(localStatus)); msgBalanceResult.instances[imbalanceIndex++] = instanceId; noNeedDoDnNum++; - } else if (initRole == INSTANCE_ROLE_PRIMARY && dnLocalRole == INSTANCE_ROLE_PRIMARY) { + } else if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && + (dnLocalRole == INSTANCE_ROLE_PRIMARY || dnLocalRole == INSTANCE_ROLE_MAIN_STANDBY)) { write_runlog(LOG, "dn instance=%u status=%s, will not switchover for status is already primary.\n", instanceId, datanode_dbstate_int_to_string(localStatus)); noNeedDoDnNum++; - } else if (initRole == INSTANCE_ROLE_PRIMARY && isInVoteAz && isCheckSyncList) { + } else if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && + isInVoteAz && isCheckSyncList) { write_runlog(LOG, "dn instance=%u status=%s, will not switchover in vote AZ.\n", instanceId, datanode_dbstate_int_to_string(localStatus)); noNeedDoDnNum++; - } else if (initRole == INSTANCE_ROLE_PRIMARY && isCatchUp) { + } else if ((initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY) && isCatchUp) { write_runlog(LOG, "dn instance=%u status=%s, will not switchover for the xlog location gap" "between the primary and standby is too large.\n", instanceId, datanode_dbstate_int_to_string(localStatus)); - if (dnLocalRole == INSTANCE_ROLE_STANDBY && initRole == INSTANCE_ROLE_PRIMARY) { + if (dnLocalRole == INSTANCE_ROLE_STANDBY && + (initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY)) { msgBalanceResult.instances[imbalanceIndex++] = instanceId; } noNeedDoDnNum++; - } else if (initRole == INSTANCE_ROLE_PRIMARY && isCheckSyncList) { + } else if ((initRole == INSTANCE_ROLE_PRIMARY || initRole 
== INSTANCE_ROLE_MAIN_STANDBY) && isCheckSyncList) { write_runlog(LOG, "dn instance=%u status=%s, will not switchover for the inst not in synclist.\n", instanceId, datanode_dbstate_int_to_string(localStatus)); - if (dnLocalRole == INSTANCE_ROLE_STANDBY && initRole == INSTANCE_ROLE_PRIMARY) { + if (dnLocalRole == INSTANCE_ROLE_STANDBY && + (initRole == INSTANCE_ROLE_PRIMARY || initRole == INSTANCE_ROLE_MAIN_STANDBY)) { msgBalanceResult.instances[imbalanceIndex++] = instanceId; } noNeedDoDnNum++; diff --git a/src/cm_server/cms_process_messages_ctl_inter.cpp b/src/cm_server/cms_process_messages_ctl_inter.cpp index d128a68..3a63bf8 100644 --- a/src/cm_server/cms_process_messages_ctl_inter.cpp +++ b/src/cm_server/cms_process_messages_ctl_inter.cpp @@ -143,7 +143,11 @@ void SetSwitchoverPendingCmd(uint32 groupIdx, int32 memIdx, int32 waitSecond, co cmd->cmdSour = INSTANCE_ROLE_CASCADE_STANDBY; cmd->cmdRealPur = INSTANCE_ROLE_PRIMARY; } else { - cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + if (g_ssDoubleClusterMode == SS_DOUBLE_STANDBY) { + cmd->cmdPur = INSTANCE_ROLE_MAIN_STANDBY; + } else { + cmd->cmdPur = INSTANCE_ROLE_PRIMARY; + } cmd->cmdSour = INSTANCE_ROLE_STANDBY; cmd->cmdRealPur = INSTANCE_ROLE_INIT; if (isNeedDelay) { diff --git a/src/include/cm/cm_server/cms_global_params.h b/src/include/cm/cm_server/cms_global_params.h index d6dfa2b..d643229 100644 --- a/src/include/cm/cm_server/cms_global_params.h +++ b/src/include/cm/cm_server/cms_global_params.h @@ -476,6 +476,7 @@ extern int32 g_clusterArbiTime; extern bool g_isPauseArbitration; extern char g_cmManualPausePath[MAX_PATH_LEN]; extern uint32 g_waitStaticPrimaryTimes; +extern SSDoubleClusterMode g_ssDoubleClusterMode; extern void clean_init_cluster_state(); extern void instance_delay_arbitrate_time_out_direct_clean(uint32 group_index, int member_index,