!64 Datanode主节点被网络孤立后,Datanode未降备,导致datanode双主

Merge pull request !64 from alfredwang/I5TGP3
This commit is contained in:
opengauss-bot 2023-02-17 07:56:41 +00:00 committed by Gitee
commit a582181c2f
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
9 changed files with 56 additions and 1 deletions

View File

@ -44,4 +44,7 @@ agent_rhb_interval = 1000 # the heatbeat of
enable_ssl = on # enable cma to cma ssl
ssl_cert_expire_alert_threshold = 90
ssl_cert_expire_check_interval = 86400
enable_fence_dn = off #enable fencing of the datanode when cma cannot connect to any cms.
#if set to on, restart the datanode after 30 seconds; otherwise, don't restart the datanode.
#default off
############### must leave a new line at the end ###################

View File

@ -42,4 +42,7 @@ agent_rhb_interval = 1000 # the heatbeat of
enable_ssl = on # enable cma to cma ssl
ssl_cert_expire_alert_threshold = 90
ssl_cert_expire_check_interval = 86400
enable_fence_dn = off #enable fencing of the datanode when cma cannot connect to any cms.
#if set to on, restart the datanode after 30 seconds; otherwise, don't restart the datanode.
#default off
############### must leave a new line at the end ###################

View File

@ -45,4 +45,7 @@ agent_rhb_interval = 1000 # the heatbeat of
enable_ssl = off # enable cma to cma ssl
ssl_cert_expire_alert_threshold = 90
ssl_cert_expire_check_interval = 86400
enable_fence_dn = off #enable fencing of the datanode when cma cannot connect to any cms.
#if set to on, restart the datanode after 30 seconds; otherwise, don't restart the datanode.
#default off
############### must leave a new line at the end ###################

View File

@ -389,6 +389,11 @@ void ReloadParametersFromConfigfile()
log_saved_days = (uint32)get_int_value_from_config(configDir, "log_saved_days", 90);
log_max_count = (uint32)get_int_value_from_config(configDir, "log_max_count", 10000);
#ifndef ENABLE_MULTIPLE_NODES
if (get_config_param(configDir, "enable_fence_dn", g_enableFenceDn, sizeof(g_enableFenceDn)) < 0)
write_runlog(ERROR, "get_config_param() get enable_fence_dn fail.\n");
#endif
write_runlog(LOG,
"reload cm_agent parameters:\n"
" log_min_messages=%d, maxLogFileSize=%d, sys_log_path=%s, \n alarm_component=%s, "
@ -397,7 +402,11 @@ void ReloadParametersFromConfigfile()
"agent_check_interval=%u, agent_kill_instance_timeout=%u,\n"
" log_threshold_check_interval=%u, log_max_size=%ld, log_max_count=%u, log_saved_days=%u, upgrade_from=%u,\n"
" enableLogCompress=%s, security_mode=%s, incremental_build=%d, unix_socket_directory=%s, "
#ifndef ENABLE_MULTIPLE_NODES
"enable_e2e_rto=%u, disaster_recovery_type=%d, environment_threshold=%s, enable_fence_dn=%s\n",
#else
"enable_e2e_rto=%u, disaster_recovery_type=%d, environment_threshold=%s\n",
#endif
log_min_messages,
maxLogFileSize,
sys_log_path,
@ -421,7 +430,12 @@ void ReloadParametersFromConfigfile()
g_unixSocketDirectory,
g_enableE2ERto,
g_disasterRecoveryType,
#ifndef ENABLE_MULTIPLE_NODES
g_environmentThreshold,
g_enableFenceDn);
#else
g_environmentThreshold);
#endif
}
int ReadDBStateFile(GaussState *state, const char *statePath)

View File

@ -432,12 +432,20 @@ void* ConnCmsPMain(void* arg)
}
/* agentStopInstanceDelayTime: The delay time of stopping instances.
* If isToStopInstances is true and g_enableFenceDn is true,
* agentStopInstanceDelayTime is FENCE_TIMEOUT, 30 seconds.
* If isToStopInstances is true and g_enableFenceDn is false,
* agentStopInstanceDelayTime is DISABLE_TIMEOUT, 0 seconds, never timeout.
* (When ENABLE_MULTIPLE_NODES is defined, g_enableFenceDn is not consulted and
* DISABLE_TIMEOUT is always used in this case.)
* If isToStopInstances is false, agentStopInstanceDelayTime is agent_kill_instance_timeout,
* 0 seconds by default,
* and the operation of stopping instances will not be executed.
*/
#ifndef ENABLE_MULTIPLE_NODES
uint32 timeout = IsBoolCmParamTrue(g_enableFenceDn) ? FENCE_TIMEOUT : DISABLE_TIMEOUT;
uint32 agentStopInstanceDelayTime = isToStopInstances ? timeout : agent_kill_instance_timeout;
#else
uint32 agentStopInstanceDelayTime = isToStopInstances ? DISABLE_TIMEOUT : agent_kill_instance_timeout;
#endif
if (isDisconnectTimeout(g_disconnectTime, (int)agentStopInstanceDelayTime) && !have_killed_nodes) {
if ((undocumentedVersion == 0) && isMaintenanceModeDisableOperation(CMA_KILL_SELF_INSTANCES)) {
have_killed_nodes = false;
@ -451,6 +459,16 @@ void* ConnCmsPMain(void* arg)
"sync_dropped_coordinator change to false.\n", agentStopInstanceDelayTime);
g_syncDroppedCoordinator = false;
have_killed_nodes = true;
#ifndef ENABLE_MULTIPLE_NODES
/*
* Kill the datanode processes so that they can be restarted in pending mode.
*/
uint32 i;
for (i = 0; i < g_currentNode->datanodeCount; i++) {
immediate_stop_one_instance(g_currentNode->datanode[i].datanodeLocalDataPath, INSTANCE_DN);
}
#endif
}
}
}

View File

@ -225,6 +225,9 @@ char g_enableMesSsl[BOOL_STR_MAX_LEN] = {0};
uint32 g_sslCertExpireCheckInterval = SECONDS_PER_DAY;
uint32 g_cmaRhbItvl = 1000;
CmResConfList g_resConf[CM_MAX_RES_INST_COUNT] = {{{0}}};
#ifndef ENABLE_MULTIPLE_NODES
/*
 * String value of the "enable_fence_dn" cm_agent parameter, filled in by
 * get_config_param() and evaluated with IsBoolCmParamTrue().
 * Only defined when ENABLE_MULTIPLE_NODES is not set.
 */
char g_enableFenceDn[10] = {0};
#endif
bool &GetIsSharedStorageMode()
{

View File

@ -1430,6 +1430,11 @@ int get_agent_global_params_from_configfile()
write_runlog(ERROR, "get_config_param() get enable_dcf fail.\n");
}
#ifndef ENABLE_MULTIPLE_NODES
if (get_config_param(configDir, "enable_fence_dn", g_enableFenceDn, sizeof(g_enableFenceDn)) < 0)
write_runlog(ERROR, "get_config_param() get enable_fence_dn fail.\n");
#endif
#ifdef __aarch64__
agent_process_cpu_affinity = get_uint32_value_from_config(configDir, "process_cpu_affinity", 0);
if (agent_process_cpu_affinity > CPU_AFFINITY_MAX) {

View File

@ -53,6 +53,9 @@ const char *g_cmaParamInfo[] = {
"disk_timeout|int|0,2147483647|NULL|NULL|",
"voting_disk_path|string|0,0|NULL|NULL|",
"agent_rhb_interval|int|0,2147483647|NULL|NULL|",
#ifndef ENABLE_MULTIPLE_NODES
"enable_fence_dn|string|0,0|NULL|NULL|",
#endif
#ifdef ENABLE_MULTIPLE_NODES
"enable_cn_auto_repair|bool|0,0|NULL|NULL|",
"enable_gtm_phony_dead_check|int|0,1|NULL|NULL|",

View File

@ -284,6 +284,9 @@ extern char g_agentQueryBarrier[BARRIERLEN];
extern char g_agentTargetBarrier[BARRIERLEN];
extern char g_environmentThreshold[CM_PATH_LENGTH];
extern char g_doradoIp[CM_IP_LENGTH];
#ifndef ENABLE_MULTIPLE_NODES
extern char g_enableFenceDn[10];
#endif
extern uint32 g_diskTimeout;
extern char g_enableMesSsl[BOOL_STR_MAX_LEN];
extern uint32 g_sslCertExpireCheckInterval;