1763 lines
64 KiB
C++
1763 lines
64 KiB
C++
/*
|
|
* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
|
*
|
|
* CM is licensed under Mulan PSL v2.
|
|
* You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
* You may obtain a copy of Mulan PSL v2 at:
|
|
*
|
|
* http://license.coscl.org.cn/MulanPSL2
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
* See the Mulan PSL v2 for more details.
|
|
* -------------------------------------------------------------------------
|
|
*
|
|
* cms_monitor_main.cpp
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/cm_server/cms_monitor_main.cpp
|
|
*
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
#include "cm/cm_elog.h"
|
|
#include "cms_alarm.h"
|
|
#include "cms_ddb.h"
|
|
#include "cms_common.h"
|
|
#include "cms_common_res.h"
|
|
#include "cms_global_params.h"
|
|
#include "cms_process_messages.h"
|
|
#include "cms_write_dynamic_config.h"
|
|
#include "cms_arbitrate_cluster.h"
|
|
#include "cjson/cJSON.h"
|
|
#include "cms_monitor_main.h"
|
|
|
|
/*
 * File-scope tunables for the cm_server monitor code.
 * NOTE(review): the units of these values (seconds vs. monitor-loop ticks) and the places where
 * they are consumed are not visible in this part of the file - confirm before changing any of them.
 */
/* Default cluster unbalance check interval; used below to seed g_cluster_unbalance_check_interval. */
|
|
const int cluster_unbalance_check_interval = 10;
|
|
/* Mutable working copy, initialised from the default above. */
static int g_cluster_unbalance_check_interval = cluster_unbalance_check_interval;
|
|
static const uint32 CHECK_SLEEP_INTERVAL = 5;
|
|
static const uint32 MAX_VOTE_NUM = 2;
|
|
static const uint32 DDB_STATUS_CHECK_INTERVAL = 2;
|
|
static const uint32 CMS_ID_INDEX_ONE = 1;
|
|
static const uint32 CMS_ID_INDEX_TWO = 2;
|
|
/* Forward declaration; the definition is expected later in this file. */
static void RmAllBlackFile(const char *blackFile);
|
|
|
|
/*
 * State carried between two consecutive monitor checks.
 * takeTime holds the CLOCK_MONOTONIC second at which the previous check finished
 * (written by UpdateCheckInterval); 0 is treated there as "no previous check".
 */
struct StMonitorContext {
    long takeTime;
};
using MonitorContext = StMonitorContext;
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
static void coordinator_status_reset(int group_index, int member_index)
|
|
{
|
|
write_runlog(LOG,
|
|
"coordinator_status_reset: InstanceId[%d][%d]=%u.\n",
|
|
group_index,
|
|
member_index,
|
|
g_instance_role_group_ptr[group_index].instanceMember[member_index].instanceId);
|
|
|
|
g_instance_group_report_status_ptr[group_index].instance_status.coordinatemember.status.status =
|
|
INSTANCE_ROLE_UNKNOWN;
|
|
g_instance_group_report_status_ptr[group_index].instance_status.coordinatemember.status.db_state =
|
|
INSTANCE_HA_STATE_HEARTBEAT_TIMEOUT;
|
|
|
|
/*
|
|
* The CCN crashed, resetting the central node information
|
|
* and reselecting the next time the agent sent the message
|
|
*/
|
|
(void)pthread_mutex_lock(&g_centralNode.mt_lock);
|
|
if (g_centralNode.instanceId == g_instance_role_group_ptr[group_index].instanceMember[member_index].instanceId) {
|
|
errno_t rc =
|
|
memset_s(g_centralNode.cnodename, sizeof(g_centralNode.cnodename), 0, sizeof(g_centralNode.cnodename));
|
|
securec_check_errno(rc, (void)pthread_mutex_unlock(&g_centralNode.mt_lock));
|
|
|
|
g_centralNode.instanceId = 0;
|
|
g_centralNode.node = 0;
|
|
g_centralNode.recover = 1;
|
|
write_runlog(LOG,
|
|
"clear ccn info, %u.\n",
|
|
g_instance_role_group_ptr[group_index].instanceMember[member_index].instanceId);
|
|
}
|
|
(void)pthread_mutex_unlock(&g_centralNode.mt_lock);
|
|
}
|
|
|
|
/*
 * Reset the locally cached report of one GTM member after its heartbeat timed out.
 * Does nothing when the addressed member is not a GTM instance.
 *
 * group_index / member_index: position of the member in the global group arrays.
 * isNodeStop: true when the node was stopped on purpose; selects CON_MANUAL_STOPPED
 *             instead of CON_UNKNOWN as the resulting connect status.
 */
static void gtm_status_reset(uint32 group_index, int member_index, bool isNodeStop)
{
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[group_index].instanceMember[member_index];
    if (roleStatus->instanceType != INSTANCE_TYPE_GTM) {
        return;
    }

    cm_gtm_replconninfo *gtmLocal =
        &g_instance_group_report_status_ptr[group_index].instance_status.gtm_member[member_index].local_status;

    /* only log the transition once, i.e. when the role is not already unknown */
    if (gtmLocal->local_role != INSTANCE_ROLE_UNKNOWN) {
        write_runlog(LOG, "gtm_status_reset: InstanceId[%u][%d]=%u.\n",
            group_index, member_index, roleStatus->instanceId);
        gtmLocal->local_role = INSTANCE_ROLE_UNKNOWN;
    }

    gtmLocal->xid = 0;
    gtmLocal->send_msg_count = 0;
    gtmLocal->receive_msg_count = 0;
    gtmLocal->connect_status = isNodeStop ? CON_MANUAL_STOPPED : CON_UNKNOWN;
}
|
|
#endif
|
|
|
|
/*
 * Decide whether the arbitrate interval of a datanode group may be raised.
 *
 * Returns false as soon as one member of the group
 *   - has its arbitrateFlag set (failover in progress),
 *   - reports db_state INSTANCE_HA_STATE_PROMOTING, or
 *   - is a primary in INSTANCE_HA_STATE_NORMAL;
 * returns true when no member matches any of these.
 */
static bool CheckRaiseArbitrateInterval(uint32 groupIdx)
{
    cm_instance_datanode_report_status *members =
        g_instance_group_report_status_ptr[groupIdx].instance_status.data_node_member;

    for (int32 idx = 0; idx < g_instance_role_group_ptr[groupIdx].count; ++idx) {
        const cm_instance_datanode_report_status &dn = members[idx];

        if (dn.arbitrateFlag) {
            write_runlog(LOG, "instId(%u) is doing failover, cannot raise arbitrate interval.\n",
                GetInstanceIdInGroup(groupIdx, idx));
            return false;
        }

        if (dn.local_status.db_state == INSTANCE_HA_STATE_PROMOTING) {
            write_runlog(LOG, "instId(%u) is promoting, cannot raise arbitrate interval.\n",
                GetInstanceIdInGroup(groupIdx, idx));
            return false;
        }

        const bool isHealthyPrimary = (dn.local_status.local_role == INSTANCE_ROLE_PRIMARY) &&
            (dn.local_status.db_state == INSTANCE_HA_STATE_NORMAL);
        if (isHealthyPrimary) {
            write_runlog(DEBUG1, "instId(%u) is primary, cannot raise arbitrate interval.\n",
                GetInstanceIdInGroup(groupIdx, idx));
            return false;
        }
    }

    return true;
}
|
|
|
|
/*
 * Reset the cached report of one datanode member after its heartbeat timed out.
 * Does nothing when the addressed member is not a datanode.
 *
 * Steps, in order:
 *   1. set the member's local role to unknown;
 *   2. unless the cluster is still starting, and only if CheckRaiseArbitrateInterval() allows it,
 *      add 100 to the group's arbitrate time and to every member's arbiTime;
 *   3. log the resulting times;
 *   4. clear static_connections, buildReason, floatIp.count and the sender/receiver location areas;
 *   5. set db_state to MANUAL_STOPPED (isNodeStop) or UNKONWN.
 */
static void datanode_status_reset(uint32 group_index, int member_index, bool isNodeStop)
{
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[group_index].instanceMember[member_index];
    if (roleStatus->instanceType != INSTANCE_TYPE_DATANODE) {
        return;
    }

    const uint32 arbitrateIntervalStep = 100;
    /* NOTE(review): the 8-XLogRecPtr span is taken from the original code; confirm against the struct layout */
    const size_t locationBytes = 8 * sizeof(XLogRecPtr);
    cm_instance_report_status *groupStatus = &g_instance_group_report_status_ptr[group_index].instance_status;
    cm_instance_datanode_report_status *allDn = groupStatus->data_node_member;
    cm_instance_datanode_report_status *dn = &allDn[member_index];
    errno_t rc;

    dn->local_status.local_role = INSTANCE_ROLE_UNKNOWN;

    if (!g_clusterStarting && CheckRaiseArbitrateInterval(group_index)) {
        groupStatus->time += arbitrateIntervalStep;
        for (int32 idx = 0; idx < g_instance_role_group_ptr[group_index].count; ++idx) {
            allDn[idx].arbiTime += arbitrateIntervalStep;
        }
    }

    write_runlog(LOG,
        "datanode_status_reset, arbitrate time is : %u, InstanceId[%u][%d]=%u, local_arbitrate_time=%u.\n",
        groupStatus->time, group_index, member_index, roleStatus->instanceId, dn->arbiTime);

    dn->local_status.static_connections = 0;
    dn->local_status.buildReason = INSTANCE_HA_DATANODE_BUILD_REASON_UNKNOWN;
    dn->floatIp.count = 0;

    rc = memset_s(&dn->sender_status[0].sender_sent_location, locationBytes, 0, locationBytes);
    securec_check_errno(rc, (void)rc);
    rc = memset_s(&dn->receive_status.sender_sent_location, locationBytes, 0, locationBytes);
    securec_check_errno(rc, (void)rc);

    dn->local_status.db_state = isNodeStop ? INSTANCE_HA_STATE_MANUAL_STOPPED : INSTANCE_HA_STATE_UNKONWN;
}
|
|
|
|
static void check_cluster_balance_status()
|
|
{
|
|
if (!g_isStart && g_HA_status->local_role == CM_SERVER_PRIMARY) {
|
|
int switchedCount = isNodeBalanced(NULL);
|
|
if (switchedCount > 0) {
|
|
report_unbalanced_alarm(ALM_AT_Fault);
|
|
} else if (switchedCount == 0) {
|
|
report_unbalanced_alarm(ALM_AT_Resume);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void FindParam(FILE *fd, char* buf, size_t maxLen, const char *srcParam, char*& subStr, char*& saveptr1)
|
|
{
|
|
errno_t rc;
|
|
while (!feof(fd)) {
|
|
rc = memset_s(buf, maxLen, 0, maxLen);
|
|
securec_check_errno(rc, (void)rc);
|
|
(void)fgets(buf, (int)maxLen, fd);
|
|
buf[maxLen - 1] = 0;
|
|
|
|
/* skip # comment of agent configure file */
|
|
if (is_comment_line(buf) == 1) {
|
|
continue;
|
|
}
|
|
|
|
subStr = strstr(buf, srcParam);
|
|
if (subStr == NULL) {
|
|
continue;
|
|
}
|
|
subStr = strtok_r(buf, "=", &saveptr1);
|
|
if (subStr == NULL) {
|
|
continue;
|
|
}
|
|
|
|
if (strcmp(trim(subStr), srcParam) == 0) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
void get_config_param(const char *config_file, const char *srcParam, char *destParam, int destLen)
|
|
{
|
|
errno_t rc;
|
|
char buf[MAXPGPATH];
|
|
char *subStr = NULL;
|
|
char *saveptr1 = NULL;
|
|
|
|
if (config_file == NULL || srcParam == NULL || destParam == NULL) {
|
|
(void)printf(
|
|
"FATAL Get parameter failed,confDir=%s,srcParam = %s, destParam=%s\n", config_file, srcParam, destParam);
|
|
exit(1);
|
|
}
|
|
|
|
FILE *fd = fopen(config_file, "r");
|
|
if (fd == NULL) {
|
|
(void)printf("FATAL Open configure file failed \n");
|
|
exit(1);
|
|
}
|
|
|
|
FindParam(fd, buf, sizeof(buf), srcParam, subStr, saveptr1);
|
|
/* process each row to filter character */
|
|
if (subStr != NULL) {
|
|
subStr = trim(saveptr1);
|
|
if (subStr != NULL) {
|
|
subStr = strtok_r(subStr, "#", &saveptr1);
|
|
}
|
|
if (subStr != NULL) {
|
|
subStr = strtok_r(subStr, "\n", &saveptr1);
|
|
}
|
|
if (subStr != NULL) {
|
|
subStr = strtok_r(subStr, "\r", &saveptr1);
|
|
}
|
|
if (subStr != NULL) {
|
|
if (strlen(trim(subStr)) + 1 > (size_t)destLen) {
|
|
write_runlog(FATAL, "The value of parameter %s is invalid, subStr is %s.\n", srcParam, subStr);
|
|
(void)fclose(fd);
|
|
exit(1);
|
|
}
|
|
rc = memcpy_s(destParam, strlen(trim(subStr)) + 1, trim(subStr), strlen(trim(subStr)) + 1);
|
|
securec_check_errno(rc, (void)fclose(fd));
|
|
}
|
|
}
|
|
|
|
(void)fclose(fd);
|
|
}
|
|
/**
|
|
* @brief reload cm_server parameters from cm_server.conf without kill and restart the cm_server process
|
|
*
|
|
*/
|
|
static void ReloadParametersFromConfigfile()
|
|
{
|
|
const int min_switch_rto = 60;
|
|
write_runlog(LOG, "reload cm_server parameters from config file.\n");
|
|
int rcs;
|
|
canonicalize_path(configDir);
|
|
GetAlarmConfig(g_alarmConfigDir);
|
|
get_log_paramter(configDir);
|
|
instance_heartbeat_timeout = (uint32)get_int_value_from_config(configDir, "instance_heartbeat_timeout", 6);
|
|
if (instance_heartbeat_timeout == 0) {
|
|
instance_heartbeat_timeout = 6;
|
|
write_runlog(FATAL, "invalid value for parameter \'instance_heartbeat_timeout\' in %s.\n", configDir);
|
|
}
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
get_paramter_coordinator_heartbeat_timeout();
|
|
#endif
|
|
instance_keep_heartbeat_timeout =
|
|
(uint32)get_int_value_from_config(configDir, "instance_keep_heartbeat_timeout", 40);
|
|
cmserver_self_vote_timeout = (uint32)get_int_value_from_config(configDir, "cmserver_self_vote_timeout", 8);
|
|
cmserver_ha_connect_timeout = (uint32)get_int_value_from_config(configDir, "cmserver_ha_connect_timeout", 2);
|
|
instance_failover_delay_timeout =
|
|
(uint32)get_int_value_from_config(configDir, "instance_failover_delay_timeout", 0);
|
|
datastorage_threshold_check_interval =
|
|
get_uint32_value_from_config(configDir, "datastorage_threshold_check_interval", 10);
|
|
g_readOnlyThreshold = get_uint32_value_from_config(configDir, "datastorage_threshold_value_check", 85);
|
|
max_datastorage_threshold_check = get_int_value_from_config(configDir, "max_datastorage_threshold_check", 1800);
|
|
az_switchover_threshold = get_int_value_from_config(configDir, "az_switchover_threshold", 100);
|
|
az_check_and_arbitrate_interval = get_int_value_from_config(configDir, "az_check_and_arbitrate_interval", 2);
|
|
az1_and_az2_connect_check_interval = get_int_value_from_config(configDir, "az_connect_check_interval", 60);
|
|
az1_and_az2_connect_check_delay_time = get_int_value_from_config(configDir, "az_connect_check_delay_time", 150);
|
|
phony_dead_effective_time = get_int_value_from_config(configDir, "phony_dead_effective_time", 5);
|
|
if (phony_dead_effective_time <= 0) {
|
|
phony_dead_effective_time = DEFAULT_PHONY_DEAD_EFFECTIVE_TIME;
|
|
}
|
|
instance_phony_dead_restart_interval =
|
|
get_int_value_from_config(configDir, "instance_phony_dead_restart_interval", 21600);
|
|
enable_az_auto_switchover = get_int_value_from_config(configDir, "enable_az_auto_switchover", 1);
|
|
|
|
cmserver_demote_delay_on_etcd_fault =
|
|
get_int_value_from_config(configDir, "cmserver_demote_delay_on_etcd_fault", 8);
|
|
cm_auth_method = get_authentication_type(configDir);
|
|
get_krb_server_keyfile(configDir);
|
|
switch_rto = get_int_value_from_config(configDir, "switch_rto", 600);
|
|
if (switch_rto < min_switch_rto) {
|
|
switch_rto = min_switch_rto;
|
|
}
|
|
|
|
g_clusterStartingArbitDelay =
|
|
(uint32)get_int_value_from_config(configDir, "cluster_starting_aribt_delay", CLUSTER_STARTING_ARBIT_DELAY);
|
|
|
|
force_promote = get_int_value_from_config(configDir, "force_promote", 0);
|
|
g_enableE2ERto = (uint32)get_int_value_from_config(configDir, "enable_e2e_rto", 0);
|
|
if (g_enableE2ERto == 1) {
|
|
instance_heartbeat_timeout = INSTANCE_HEARTBEAT_TIMEOUT_FOR_E2E_RTO;
|
|
}
|
|
g_cm_agent_kill_instance_time = get_uint32_value_from_config(configDir, "agent_fault_timeout", 60);
|
|
get_config_param(configDir, "enable_transaction_read_only", g_enableSetReadOnly, sizeof(g_enableSetReadOnly));
|
|
if (!CheckBoolConfigParam(g_enableSetReadOnly)) {
|
|
rcs = strcpy_s(g_enableSetReadOnly, sizeof(g_enableSetReadOnly), "on");
|
|
securec_check_errno(rcs, (void)rcs);
|
|
write_runlog(FATAL, "invalid value for parameter \" enable_transaction_read_only \" in %s.\n", configDir);
|
|
}
|
|
GetDdbArbiCfg(RELOAD_PARAMTER);
|
|
GetDelayArbitTimeFromConf();
|
|
GetDelayArbitClusterTimeFromConf();
|
|
g_diskTimeout = get_uint32_value_from_config(configDir, "disk_timeout", 200);
|
|
g_agentNetworkTimeout = get_uint32_value_from_config(configDir, "agent_network_timeout", 6);
|
|
GetDnArbitrateMode();
|
|
#ifndef ENABLE_PRIVATEGAUSS
|
|
g_waitStaticPrimaryTimes = get_uint32_value_from_config(configDir, "wait_static_primary_times", 6);
|
|
if (g_waitStaticPrimaryTimes < 5) {
|
|
g_waitStaticPrimaryTimes = 5;
|
|
}
|
|
#endif
|
|
|
|
if (g_cm_server_num == CMS_ONE_PRIMARY_ONE_STANDBY) {
|
|
GetTwoNodesArbitrateParams();
|
|
}
|
|
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
write_runlog(LOG,
|
|
"reload cm_server parameters:\n"
|
|
" log_min_messages=%d, maxLogFileSize=%d, sys_log_path=%s, \n alarm_component=%s, "
|
|
"alarm_report_interval=%d, \n"
|
|
" instance_heartbeat_timeout=%u, coordinator_heartbeat_timeout=%u, "
|
|
"cmserver_ha_heartbeat_timeout=%u, cmserver_self_vote_timeout=%u,\n"
|
|
" cmserver_ha_status_interval=%u, cmserver_ha_connect_timeout=%u, "
|
|
"instance_failover_delay_timeout=%u, datastorage_threshold_check_interval=%d,\n"
|
|
" max_datastorage_threshold_check=%d, enableSetReadOnly=%s, enableSetReadOnlyThreshold=%u, "
|
|
"switch_rto=%d, force_promote=%d, cluster_starting_aribt_delay=%u, "
|
|
"enable_e2e_rto=%u, g_delayArbiTime=%u, g_clusterArbiTime=%d.\n",
|
|
log_min_messages, maxLogFileSize, sys_log_path, g_alarmComponentPath, g_alarmReportInterval,
|
|
instance_heartbeat_timeout, coordinator_heartbeat_timeout, g_ddbArbicfg.haHeartBeatTimeOut,
|
|
cmserver_self_vote_timeout, g_ddbArbicfg.haStatusInterval, cmserver_ha_connect_timeout,
|
|
instance_failover_delay_timeout, datastorage_threshold_check_interval,
|
|
max_datastorage_threshold_check, g_enableSetReadOnly, g_readOnlyThreshold,
|
|
switch_rto, force_promote, g_clusterStartingArbitDelay,
|
|
g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime);
|
|
#else
|
|
write_runlog(LOG,
|
|
"reload cm_server parameters:\n"
|
|
" log_min_messages=%d, maxLogFileSize=%d, sys_log_path=%s, \n alarm_component=%s, "
|
|
"alarm_report_interval=%d, \n"
|
|
" instance_heartbeat_timeout=%u, cmserver_ha_heartbeat_timeout=%u, "
|
|
"cmserver_self_vote_timeout=%u,\n"
|
|
" cmserver_ha_status_interval=%u, cmserver_ha_connect_timeout=%u, instance_failover_delay_timeout=%u, "
|
|
"datastorage_threshold_check_interval=%d,\n"
|
|
" max_datastorage_threshold_check=%d, enableSetReadOnly=%s, enableSetReadOnlyThreshold=%u, "
|
|
"switch_rto=%d, force_promote=%d, cluster_starting_aribt_delay=%u, enable_e2e_rto=%u, "
|
|
"g_delayArbiTime=%u, g_clusterArbiTime=%d, wait_static_primary_times=%u.\n",
|
|
log_min_messages, maxLogFileSize, sys_log_path, g_alarmComponentPath, g_alarmReportInterval,
|
|
instance_heartbeat_timeout, g_ddbArbicfg.haHeartBeatTimeOut, cmserver_self_vote_timeout,
|
|
g_ddbArbicfg.haStatusInterval, cmserver_ha_connect_timeout, instance_failover_delay_timeout,
|
|
datastorage_threshold_check_interval, max_datastorage_threshold_check, g_enableSetReadOnly,
|
|
g_readOnlyThreshold, switch_rto, force_promote, g_clusterStartingArbitDelay,
|
|
g_enableE2ERto, g_delayArbiTime, g_clusterArbiTime, g_waitStaticPrimaryTimes);
|
|
#endif
|
|
}
|
|
|
|
static void CheckKerberosHB()
|
|
{
|
|
uint32 kerberosHeartBeatTimeOut = 20;
|
|
|
|
/* monitor kerberos status heartbeat */
|
|
for (uint kk = 0; kk < KERBEROS_NUM; kk++) {
|
|
g_kerberos_group_report_status.kerberos_status.heartbeat[kk]++;
|
|
if (g_kerberos_group_report_status.kerberos_status.heartbeat[kk] > kerberosHeartBeatTimeOut) {
|
|
g_kerberos_group_report_status.kerberos_status.status[kk] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckMajorityReElect()
|
|
{
|
|
if (arbitration_majority_reelection_timeout > 0) {
|
|
arbitration_majority_reelection_timeout--;
|
|
if (arbitration_majority_reelection_timeout == 0) {
|
|
write_runlog(LOG,
|
|
"arbitration_majority_reelection_timeout elapsed "
|
|
"into 0. Majority re-election enabled now.\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckCmctlStop()
|
|
{
|
|
if (ctl_stop_cluster_server_halt_arbitration_timeout > 0) {
|
|
ctl_stop_cluster_server_halt_arbitration_timeout--;
|
|
if (ctl_stop_cluster_server_halt_arbitration_timeout == 0) {
|
|
write_runlog(LOG,
|
|
"ctl_stop_cluster_server_halt_arbitration_timeout elapsed into 0, and for some "
|
|
"reason cm_ctl stop-cluster did not succeed. Resume arbitration now.\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
static void CheckCnDelDelayTime()
|
|
{
|
|
if (g_cnDeleteDelayTimeForClusterStarting > 0) {
|
|
g_cnDeleteDelayTimeForClusterStarting--;
|
|
if (g_cnDeleteDelayTimeForClusterStarting == 0) {
|
|
write_runlog(LOG,
|
|
"cn delete delay time for cm_server_start_mode or big_cluster elapsed "
|
|
"into 0. Coordinator deletion enabled now.\n");
|
|
}
|
|
}
|
|
|
|
if (g_cnDeleteDelayTimeForDnWithoutPrimary > 0) {
|
|
g_cnDeleteDelayTimeForDnWithoutPrimary--;
|
|
if (g_cnDeleteDelayTimeForDnWithoutPrimary == 0) {
|
|
write_runlog(LOG,
|
|
"cn delete delay time for dn without primary elapsed "
|
|
"into 0. Coordinator deletion enabled now.\n");
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
static void CheckRoleChange()
|
|
{
|
|
static int historyCMSRole = g_HA_status->local_role;
|
|
|
|
if (historyCMSRole != g_HA_status->local_role) {
|
|
write_runlog(LOG,
|
|
"the instance state will reset, history role is %d, current role is %d.\n",
|
|
historyCMSRole,
|
|
g_HA_status->local_role);
|
|
historyCMSRole = g_HA_status->local_role;
|
|
for (uint32 i = 0; i < g_dynamic_header->relationCount; i++) {
|
|
(void)pthread_rwlock_wrlock(&(g_instance_group_report_status_ptr[i].lk_lock));
|
|
for (int j = 0; j < g_instance_role_group_ptr[i].count; j++) {
|
|
/* when cms change primary, the heart beat will ++ at the time of last role */
|
|
g_instance_group_report_status_ptr[i].instance_status.command_member[j].heat_beat = 0;
|
|
}
|
|
(void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[i].lk_lock));
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckETCD()
|
|
{
|
|
(void)pthread_rwlock_wrlock(&instance_status_rwlock);
|
|
|
|
for (uint32 ii = 0; ii < g_etcd_num; ii++) {
|
|
if (g_instance_status_for_etcd_timeout[ii] > 0) {
|
|
g_instance_status_for_etcd_timeout[ii]--;
|
|
} else {
|
|
write_runlog(LOG, "the %u etcd heartbeat timeout.\n", ii);
|
|
g_instance_status_for_etcd[ii] = CM_ETCD_DOWN;
|
|
}
|
|
}
|
|
|
|
(void)pthread_rwlock_unlock(&instance_status_rwlock);
|
|
}
|
|
|
|
static void CheckHB()
|
|
{
|
|
if (cmserver_switchover_timeout > 0) {
|
|
cmserver_switchover_timeout--;
|
|
}
|
|
|
|
if (g_instance_failover_delay_time_from_set > 0) {
|
|
g_instance_failover_delay_time_from_set--;
|
|
}
|
|
|
|
if (g_init_cluster_delay_time > 0) {
|
|
g_init_cluster_delay_time--;
|
|
} else {
|
|
g_init_cluster_delay_time = 0;
|
|
clean_init_cluster_state();
|
|
}
|
|
|
|
if (g_clusterStartingTimeout > 0) {
|
|
g_clusterStartingTimeout--;
|
|
} else {
|
|
g_clusterStartingTimeout = 0;
|
|
g_clusterStarting = false;
|
|
}
|
|
}
|
|
|
|
static inline void CheckDdbClusterStatusOn2Nodes()
|
|
{
|
|
if (g_ddbNetworkIsolationTimeout > 0) {
|
|
g_ddbNetworkIsolationTimeout--;
|
|
} else {
|
|
g_ddbNetworkIsolationTimeout = 0;
|
|
}
|
|
}
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
/*
 * Coordinator part of the heartbeat-timeout handling for one group member.
 *
 * 1. If the coordinator status is not yet INSTANCE_ROLE_UNKNOWN, reset it.
 * 2. Afterwards, reset it (again) when all of the following hold:
 *      - either coordinator_heartbeat_timeout is 0 and this instance is the one named by
 *        g_cmd_disable_coordinatorId, or coordinator_heartbeat_timeout is positive and the
 *        member's heat_beat exceeds it;
 *      - the coordinator status is INSTANCE_ROLE_UNKNOWN;
 *      - the coordinator db_state is INSTANCE_HA_STATE_STARTING.
 */
static void DoCNTimeout(uint32 groupIdx, int memIdx)
{
    cm_instance_report_status *groupReport = &g_instance_group_report_status_ptr[groupIdx].instance_status;
    cm_instance_command_status *cmdStatus = &groupReport->command_member[memIdx];
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[groupIdx].instanceMember[memIdx];

    if (groupReport->coordinatemember.status.status != INSTANCE_ROLE_UNKNOWN) {
        coordinator_status_reset((int)groupIdx, memIdx);
    }

    /* evaluated after the reset above, exactly like the original single condition */
    const bool disabledByCommand =
        (coordinator_heartbeat_timeout == 0 && g_cmd_disable_coordinatorId == roleStatus->instanceId);
    const bool heartbeatExpired =
        (coordinator_heartbeat_timeout > 0 && cmdStatus->heat_beat > (int)coordinator_heartbeat_timeout);
    const bool cnUnknownAndStarting =
        (groupReport->coordinatemember.status.status == INSTANCE_ROLE_UNKNOWN) &&
        (groupReport->coordinatemember.status.db_state == INSTANCE_HA_STATE_STARTING);

    if (!(disabledByCommand || heartbeatExpired) || !cnUnknownAndStarting) {
        return;
    }

    write_runlog(LOG,
        "CN heartbeat timeout, reset CN. InstanceId[%u][%d]=%u, heat_beat=%d, "
        "coordinator_heartbeat_timeout=%u, status=%d, db_state=%d\n",
        groupIdx, memIdx, roleStatus->instanceId, cmdStatus->heat_beat,
        coordinator_heartbeat_timeout, groupReport->coordinatemember.status.status,
        groupReport->coordinatemember.status.db_state);

    coordinator_status_reset((int)groupIdx, memIdx);
}
|
|
#endif
|
|
|
|
/*
 * Handle a member whose heartbeat counter exceeded instance_heartbeat_timeout.
 *
 * i / j: group index and member index in the global group arrays.
 *
 * A log line is written on every 5th call past the threshold. The member's datanode state
 * (and, in multi-node builds, its GTM and coordinator state) is then reset. Whether the
 * member's node is listed in g_stopNodes decides between the "manually stopped" and
 * "unknown" flavour of the reset. The lookup result is stored in the global g_stopNodeIter.
 */
static void DoCommandTimeout(uint32 i, int j)
{
    cm_instance_report_status *groupReport = &g_instance_group_report_status_ptr[i].instance_status;
    cm_instance_command_status *cmdStatus = &groupReport->command_member[j];
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[i].instanceMember[j];

    const int ticksPastThreshold = cmdStatus->heat_beat - (int)instance_heartbeat_timeout;
    if (ticksPastThreshold % 5 == 0) {
        write_runlog(LOG, "instance(%u) heartbeat timeout, heartbeat:%d, threshold:%u\n",
            roleStatus->instanceId, cmdStatus->heat_beat, instance_heartbeat_timeout);
    }

    uint32 checkNode = roleStatus->node;
    g_stopNodeIter = g_stopNodes.find(checkNode);
    const bool nodeStopped = (g_stopNodeIter != g_stopNodes.end());
    if (nodeStopped) {
        write_runlog(LOG, "node(%u) is stopped.\n", checkNode);
    }

    datanode_status_reset(i, j, nodeStopped);
#ifdef ENABLE_MULTIPLE_NODES
    gtm_status_reset(i, j, nodeStopped);
    DoCNTimeout(i, j);
#endif
}
|
|
|
|
/*
 * Advance the promoting countdown of one datanode member (called when g_multi_az_cluster is false).
 *
 * i / j: group index and member index in the global group arrays.
 *
 * While promoting_timeout is positive:
 *   - member is primary and normal: promotion succeeded, the countdown is cleared;
 *   - countdown is above 1: decrement it;
 *   - countdown is exactly 1:
 *       - db_state NEED_REPAIR: clear the countdown, make this member standby and the peer
 *         member (index 1 if j is 0, otherwise index 0) primary, flag the role change and
 *         persist the dynamic configuration;
 *       - db_state not PROMOTING: decrement to 0;
 *       - db_state PROMOTING: keep waiting at 1.
 */
static void DoInstancePromote(uint32 i, int j)
{
    cm_instance_report_status *groupReport = &g_instance_group_report_status_ptr[i].instance_status;
    cm_instance_command_status *cmdStatus = &groupReport->command_member[j];
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[i].instanceMember[j];
    cm_instance_arbitrate_status *arbitrateStatus = &groupReport->arbitrate_status_member[j];
    cm_instance_datanode_report_status *dnStatus = &groupReport->data_node_member[j];
    const int peerIndex = (j == 0) ? 1 : 0;

    if (arbitrateStatus->promoting_timeout <= 0) {
        return;
    }

    if (dnStatus->local_status.local_role == INSTANCE_ROLE_PRIMARY &&
        dnStatus->local_status.db_state == INSTANCE_HA_STATE_NORMAL) {
        arbitrateStatus->promoting_timeout = 0;
        write_runlog(LOG, "instance %u failover successful.\n", roleStatus->instanceId);
        return;
    }

    if (arbitrateStatus->promoting_timeout != 1) {
        arbitrateStatus->promoting_timeout--;
        return;
    }

    /* last tick of the countdown */
    if (dnStatus->local_status.db_state == INSTANCE_HA_STATE_NEED_REPAIR) {
        cm_instance_role_status *peerRole = &g_instance_role_group_ptr[i].instanceMember[peerIndex];

        arbitrateStatus->promoting_timeout = 0;
        roleStatus->role = INSTANCE_ROLE_STANDBY;
        peerRole->role = INSTANCE_ROLE_PRIMARY;
        cmdStatus->role_changed = INSTANCE_ROLE_CHANGED;
        (void)WriteDynamicConfigFile(false);
        write_runlog(LOG, "instance role is changed, instance %u is standby, instance %u is primary.\n",
            roleStatus->instanceId, peerRole->instanceId);
        return;
    }

    if (dnStatus->local_status.db_state != INSTANCE_HA_STATE_PROMOTING) {
        arbitrateStatus->promoting_timeout--;
    }
}
|
|
|
|
/*
 * Per-tick bookkeeping for one member: increment its heartbeat counter (heat_beat), trace it
 * at DEBUG5, and advance keep_heartbeat_timeout. A negative keep_heartbeat_timeout restarts at 0.
 *
 * i / j: group index and member index in the global group arrays.
 */
static void UpdateCommandStatus(uint32 i, int j)
{
    cm_instance_command_status *cmdStatus =
        &g_instance_group_report_status_ptr[i].instance_status.command_member[j];
    cm_instance_role_status *roleStatus = &g_instance_role_group_ptr[i].instanceMember[j];

    cmdStatus->heat_beat++;
    write_runlog(DEBUG5, "instance(%u) heartbeat is %d monitor count is %u!\n",
        roleStatus->instanceId, cmdStatus->heat_beat, instance_heartbeat_timeout);

    if (cmdStatus->keep_heartbeat_timeout < 0) {
        cmdStatus->keep_heartbeat_timeout = 0;
        return;
    }
    cmdStatus->keep_heartbeat_timeout++;
}
|
|
|
|
/*
 * Per-tick bookkeeping for the pending command of one member:
 *   - a pending switchover increases command_send_times;
 *   - no pending command (MSG_CM_AGENT_BUTT) clears command_send_times and command_send_num;
 *   - arbitrate_delay_time_out and time_out are each decremented while positive.
 *
 * i / j: group index and member index in the global group arrays.
 */
static void UpdateCommandStatus1(uint32 i, int j)
{
    cm_instance_command_status *cmdStatus =
        &g_instance_group_report_status_ptr[i].instance_status.command_member[j];

    if (cmdStatus->pengding_command == MSG_CM_AGENT_SWITCHOVER) {
        cmdStatus->command_send_times++;
    }

    if (cmdStatus->pengding_command == MSG_CM_AGENT_BUTT) {
        cmdStatus->command_send_times = 0;
        cmdStatus->command_send_num = 0;
    }

    if (cmdStatus->arbitrate_delay_time_out > 0) {
        cmdStatus->arbitrate_delay_time_out--;
    }

    if (cmdStatus->time_out > 0) {
        cmdStatus->time_out--;
    }
}
|
|
|
|
/*
 * Run one monitor tick for a single group member. The caller is expected to hold the group's
 * write lock (CheckAllInstanceGroup takes it before descending here).
 *
 * i / j: group index and member index in the global group arrays.
 *
 * Order of work:
 *   1. bump the heartbeat counters (UpdateCommandStatus);
 *   2. a datanode whose heartbeat counter exceeds 3 gets its local role set to unknown;
 *   3. a heartbeat counter above instance_heartbeat_timeout triggers DoCommandTimeout;
 *   4. tick down delaySwitchoverTime;
 *   5. advance the promoting countdown when g_multi_az_cluster is false;
 *   6. tick down / clamp the datanode and GTM phony_dead_interval;
 *   7. tick down buildFailedTimeout (stops at 1);
 *   8. pending-command bookkeeping (UpdateCommandStatus1);
 *   9. advance send_gs_guc_time up to CM_GS_GUC_SEND_INTERVAL;
 *  10. clean the command once time_out is no longer positive (multi-node builds keep a
 *      pending MSG_CM_AGENT_NOTIFY_CN command).
 */
static void CheckOneMember(uint32 i, int j)
{
    const int dnNormalTimeout = 3;
    cm_instance_report_status *groupReport = &g_instance_group_report_status_ptr[i].instance_status;
    cm_instance_command_status *cmdStatus = &groupReport->command_member[j];
    cm_instance_datanode_report_status *dnStatus = &groupReport->data_node_member[j];
    int instanceType = g_instance_role_group_ptr[i].instanceMember[j].instanceType;
    uint32 instanceId = g_instance_role_group_ptr[i].instanceMember[j].instanceId;

    UpdateCommandStatus(i, j);

    if (cmdStatus->heat_beat > dnNormalTimeout && instanceType == INSTANCE_TYPE_DATANODE) {
        dnStatus->local_status.local_role = INSTANCE_ROLE_UNKNOWN;
        write_runlog(LOG, "instance(%u) heartbeat abnormal, set dn INSTANCE_ROLE_UNKNOWN\n", instanceId);
    }

    if (cmdStatus->heat_beat > (int)instance_heartbeat_timeout) {
        DoCommandTimeout(i, j);
    }

    if (cmdStatus->delaySwitchoverTime > 0) {
        --cmdStatus->delaySwitchoverTime;
    }

    if (!g_multi_az_cluster) {
        DoInstancePromote(i, j);
    }

    /* datanode phony-dead countdown: decrement while positive, clamp negatives to zero */
    if (dnStatus->phony_dead_interval > 0) {
        dnStatus->phony_dead_interval--;
    } else if (dnStatus->phony_dead_interval < 0) {
        dnStatus->phony_dead_interval = 0;
    }

    /* same treatment for the GTM slot with the same member index */
    if (groupReport->gtm_member[j].phony_dead_interval > 0) {
        groupReport->gtm_member[j].phony_dead_interval--;
    } else if (groupReport->gtm_member[j].phony_dead_interval < 0) {
        groupReport->gtm_member[j].phony_dead_interval = 0;
    }

    if (cmdStatus->buildFailedTimeout > 1) {
        --cmdStatus->buildFailedTimeout;
    }

    UpdateCommandStatus1(i, j);

    if (dnStatus->send_gs_guc_time < CM_GS_GUC_SEND_INTERVAL) {
        dnStatus->send_gs_guc_time++;
    }

    if (cmdStatus->time_out <= 0) {
#ifdef ENABLE_MULTIPLE_NODES
        if (cmdStatus->pengding_command != (int)MSG_CM_AGENT_NOTIFY_CN) {
            CleanCommand(i, j);
        }
#else
        CleanCommand(i, j);
#endif
    }
}
|
|
|
|
/*
 * Run one monitor tick for a whole instance group: group-level countdowns first,
 * then every member via CheckOneMember().
 *
 * i: group index in the global group arrays.
 */
static void CheckOneInstanceGroup(uint32 i)
{
    cm_instance_report_status *groupReport = &g_instance_group_report_status_ptr[i].instance_status;

    /* this countdown stops at 1, it is never decremented to 0 here */
    if (groupReport->cma_kill_instance_timeout > 1) {
        groupReport->cma_kill_instance_timeout--;
    }

    /* decrement while positive, clamp negatives to zero */
    if (groupReport->coordinatemember.phony_dead_interval > 0) {
        groupReport->coordinatemember.phony_dead_interval--;
    } else if (groupReport->coordinatemember.phony_dead_interval < 0) {
        groupReport->coordinatemember.phony_dead_interval = 0;
    }

    /* coordinator-only counters; the group type is taken from its first member */
    const bool isCoordinatorGroup =
        (g_instance_role_group_ptr[i].instanceMember[0].instanceType == INSTANCE_TYPE_COORDINATE);
    if (isCoordinatorGroup) {
        groupReport->coordinatemember.auto_delete_delay_time++;
        groupReport->coordinatemember.cma_fault_timeout_to_killcn++;
        if (groupReport->coordinatemember.disable_time_out > 0) {
            groupReport->coordinatemember.disable_time_out--;
        }
    }

    for (int memIdx = 0; memIdx < g_instance_role_group_ptr[i].count; memIdx++) {
        CheckOneMember(i, memIdx);
    }
}
|
|
|
|
static void CheckAllInstanceGroup()
|
|
{
|
|
for (uint32 i = 0; i < g_dynamic_header->relationCount; i++) {
|
|
(void)pthread_rwlock_wrlock(&(g_instance_group_report_status_ptr[i].lk_lock));
|
|
CheckOneInstanceGroup(i);
|
|
(void)pthread_rwlock_unlock(&(g_instance_group_report_status_ptr[i].lk_lock));
|
|
}
|
|
}
|
|
|
|
static void CheckAllUDF()
|
|
{
|
|
for (uint32 i = 0; i < g_node_num; i++) {
|
|
(void)pthread_rwlock_wrlock(&(g_fenced_UDF_report_status_ptr[i].lk_lock));
|
|
g_fenced_UDF_report_status_ptr[i].heart_beat++;
|
|
write_runlog(DEBUG5,
|
|
"fenced UDF(%u) heartbeat is %d monitor count is %u!\n",
|
|
i,
|
|
g_fenced_UDF_report_status_ptr[i].heart_beat,
|
|
instance_heartbeat_timeout);
|
|
|
|
if (g_fenced_UDF_report_status_ptr[i].heart_beat > (int)instance_heartbeat_timeout) {
|
|
write_runlog(DEBUG1,
|
|
"fenced UDF(%u) heartbeat timeout, heartbeat:%d, threshold:%u.\n",
|
|
i,
|
|
g_fenced_UDF_report_status_ptr[i].heart_beat,
|
|
instance_heartbeat_timeout);
|
|
g_fenced_UDF_report_status_ptr[i].status = INSTANCE_ROLE_UNKNOWN;
|
|
}
|
|
(void)pthread_rwlock_unlock(&(g_fenced_UDF_report_status_ptr[i].lk_lock));
|
|
}
|
|
}
|
|
|
|
static void ResetInstanceStatus()
|
|
{
|
|
for (uint32 i = 0; i < g_dynamic_header->relationCount; i++) {
|
|
g_instance_group_report_status_ptr[i].instance_status.cma_kill_instance_timeout = 0;
|
|
g_instance_group_report_status_ptr[i].instance_status.coordinatemember.phony_dead_interval = 0;
|
|
for (int j = 0; j < g_instance_role_group_ptr[i].count; j++) {
|
|
g_instance_group_report_status_ptr[i].instance_status.data_node_member[j].phony_dead_interval = 0;
|
|
g_instance_group_report_status_ptr[i].instance_status.gtm_member[j].phony_dead_interval = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Track how long one monitor round took, using CLOCK_MONOTONIC seconds.
 *
 * ctx->takeTime holds the timestamp recorded by the previous call (0 means
 * no previous sample). A gap of more than 2 seconds increments
 * g_monitor_thread_check_invalid_times; any other outcome resets it to 0.
 * The current timestamp is always stored back into ctx->takeTime.
 */
static void UpdateCheckInterval(MonitorContext *ctx)
{
    struct timespec now = {0, 0};
    (void)clock_gettime(CLOCK_MONOTONIC, &now);

    if (ctx->takeTime == 0) {
        /* no previous sample to compare against */
        g_monitor_thread_check_invalid_times = 0;
    } else if ((now.tv_sec - ctx->takeTime) > 2) {
        g_monitor_thread_check_invalid_times++;
        write_runlog(LOG, "has find %d invalid check times, take %ld seconds.\n",
            g_monitor_thread_check_invalid_times, (now.tv_sec - ctx->takeTime));
    } else {
        if (g_monitor_thread_check_invalid_times > 0) {
            write_runlog(LOG, "reset invalid check times to zeros.\n");
        }
        g_monitor_thread_check_invalid_times = 0;
    }

    ctx->takeTime = now.tv_sec;
}
|
|
|
|
/*
 * Mark every custom-resource instance that lives on nodeId as CM_RES_STAT_UNKNOWN.
 *
 * For each instance whose status actually changes, the resource's version is
 * bumped, ProcessReportResChangedMsg is called and the status is saved to ddb.
 * Each resource is processed under its own write lock.
 */
static void SetResStatUnknown(uint32 nodeId)
{
    write_runlog(LOG, "nodeId(%u) report res stat heartbeat abnormal, set res status CM_RES_STAT_UNKNOWN.\n", nodeId);

    for (uint32 resIdx = 0; resIdx < CusResCount(); ++resIdx) {
        (void)pthread_rwlock_wrlock(&g_resStatus[resIdx].rwlock);

        for (uint32 instIdx = 0; instIdx < g_resStatus[resIdx].status.instanceCount; ++instIdx) {
            if (g_resStatus[resIdx].status.resStat[instIdx].nodeId != nodeId) {
                continue;
            }
            if (g_resStatus[resIdx].status.resStat[instIdx].status == (uint32)CM_RES_STAT_UNKNOWN) {
                /* already unknown, nothing to publish */
                continue;
            }

            g_resStatus[resIdx].status.resStat[instIdx].status = (uint32)CM_RES_STAT_UNKNOWN;
            ++g_resStatus[resIdx].status.version;
            ProcessReportResChangedMsg(false, g_resStatus[resIdx].status);
            SaveOneResStatusToDdb(&g_resStatus[resIdx].status);
        }

        (void)pthread_rwlock_unlock(&g_resStatus[resIdx].rwlock);
    }
}
|
|
|
|
static void CheckAllResReportByNode()
|
|
{
|
|
for (uint32 i = 0; i < g_node_num; ++i) {
|
|
uint32 inter = GetResStatReportInter(g_node[i].node);
|
|
if (inter > g_agentNetworkTimeout) {
|
|
SetResStatUnknown(g_node[i].node);
|
|
} else {
|
|
SetResStatReportInter(g_node[i].node);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckAllIsregByNode()
|
|
{
|
|
UpdateCheckListAfterTimeout();
|
|
UpdateReportInter();
|
|
}
|
|
|
|
static void CheckMaxCluster()
|
|
{
|
|
SetDelayArbiClusterTime();
|
|
CheckMaxClusterHeartbeartValue();
|
|
}
|
|
|
|
/*
 * Probe the peer cm_server of a two-node cluster with a plain TCP connect to
 * its first local HA ip/port.
 *
 * Returns CM_ERROR when auto failover on two nodes is not enabled, when the
 * socket cannot be created, or when connect() fails. Returns CM_SUCCESS when
 * the connect succeeds. If no entry other than the current node is found in
 * the first g_cm_server_num entries of g_node, nothing is probed and
 * CM_SUCCESS is returned.
 */
static status_t IsPeerCmsReachableOn2Nodes()
{
    if (!ENABLED_AUTO_FAILOVER_ON2NODES(g_cm_server_num, g_paramsOn2Nodes.cmsEnableFailoverOn2Nodes)) {
        write_runlog(ERROR, "should be called by two node cluster with enabling auto failover only.\n");
        return CM_ERROR;
    }

    // create socket
    int socketFd = socket(AF_INET, SOCK_STREAM, 0);
    if (socketFd == -1) {
        write_runlog(ERROR, "could not create socket.\n");
        return CM_ERROR;
    }

    // bound both directions so an isolated peer cannot block this thread for long
    struct timeval tv = { 0, 0 };
    tv.tv_sec = CM_TCP_TIMEOUT;
    (void)setsockopt(socketFd, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
    (void)setsockopt(socketFd, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));

    status_t ret = CM_SUCCESS;
    for (uint32 i = 0; i < g_cm_server_num; i++) {
        if (g_node[i].cmServerId == g_currentNode->cmServerId) {
            continue;
        }

        // got peer cms info; zero the whole address first so no field is left uninitialized
        struct sockaddr_in sockAddr = {0};
        sockAddr.sin_family = AF_INET;
        sockAddr.sin_addr.s_addr = inet_addr(g_node[i].cmServerLocalHAIP[0]);
        sockAddr.sin_port = htons(g_node[i].cmServerLocalHAPort);

        // Connect to peer cms dcc ip:port
        if (connect(socketFd, (struct sockaddr *)&sockAddr, sizeof(sockAddr)) < 0) {
            write_runlog(LOG, "could not connect to peer cms %s:%d.\n",
                g_node[i].cmServerLocalHAIP[0], g_node[i].cmServerLocalHAPort);
            ret = CM_ERROR;
        } else {
            write_runlog(DEBUG1, "connect to peer cms %s:%d successfuly.\n",
                g_node[i].cmServerLocalHAIP[0], g_node[i].cmServerLocalHAPort);
            ret = CM_SUCCESS;
        }

        // only the first peer entry is probed
        break;
    }

    (void)close(socketFd);
    return ret;
}
|
|
|
|
/*
 * Run the ddb "cluster info" command and place its output in info.
 *
 * info   - output buffer handed to DoDdbExecCmd.
 * outLen - length value whose address is handed to DoDdbExecCmd.
 *
 * In maintain mode no command is executed, info is left untouched and
 * CM_SUCCESS is returned. Otherwise the result of DoDdbExecCmd is returned,
 * with the error text logged on failure.
 */
static status_t GetClusterInfoFromDDb(char *info, int outLen)
{
    if (g_inMaintainMode) {
        write_runlog(LOG, "in maintain mode, can't do ddb cmd.\n");
        return CM_SUCCESS;
    }

    char errMsg[ERR_MSG_LENGTH] = {0};
    char cmd[DCC_CMD_MAX_LEN] = {0};

    errno_t rc = snprintf_s(cmd, DCC_CMD_MAX_LEN, DCC_CMD_MAX_LEN - 1, " %s", CM_DDB_CLUSTER_INFO_CMD);
    securec_check_intval(rc, (void)rc);

    status_t execRet = DoDdbExecCmd(cmd, info, &outLen, errMsg, DCC_CMD_MAX_OUTPUT_LEN);
    if (execRet == CM_SUCCESS) {
        return execRet;
    }

    write_runlog(ERROR, "get ddb cluster info failed. error: %s\n", errMsg);
    return execRet;
}
|
|
|
|
/*
 * Check whether the peer's "apply_index" moved since the previous call.
 *
 * nodes - the "nodes" JSON array of the ddb cluster info. The peer is taken
 *         from slot 1 when the local cmServerId is CMS_ID_INDEX_ONE and from
 *         slot 0 when it is CMS_ID_INDEX_TWO; any other id terminates the
 *         process with exit(1).
 *
 * The last seen value is kept in a function-local static (initially 0).
 * Returns CM_SUCCESS when the value differs from the previous one, CM_ERROR
 * when it is unchanged or the field cannot be found.
 */
static status_t IsPeerApplyIndexChanged(cJSON *nodes)
{
    static int peerApplyIndex = 0;
    const int lastSeenIndex = peerApplyIndex;

    int peerSlot = 0;
    if (g_currentNode->cmServerId == CMS_ID_INDEX_ONE) {
        peerSlot = 1;
    } else if (g_currentNode->cmServerId == CMS_ID_INDEX_TWO) {
        peerSlot = 0;
    } else {
        write_runlog(ERROR, "wrong cm server id: %d\n", g_currentNode->cmServerId);
        exit(1);
    }

    cJSON *applyIndex = cJSON_GetObjectItem(cJSON_GetArrayItem(nodes, peerSlot), "apply_index");
    if (applyIndex == NULL) {
        write_runlog(ERROR, "cannot parse ddb cluster info {apply_index}.\n");
        return CM_ERROR;
    }

    peerApplyIndex = applyIndex->valueint;
    write_runlog(DEBUG5, "prevPeerApplyIndex: %d peerApplyIndex: %d g_ddbNetworkIsolationTimeout: %d\n",
        lastSeenIndex, peerApplyIndex, g_ddbNetworkIsolationTimeout);

    return (peerApplyIndex == lastSeenIndex) ? CM_ERROR : CM_SUCCESS;
}
|
|
|
|
/*
 * Check whether the peer node reports the ddb role "LEADER".
 *
 * nodes - the "nodes" JSON array of the ddb cluster info. The peer is taken
 *         from slot 1 when the local cmServerId is CMS_ID_INDEX_ONE and from
 *         slot 0 when it is CMS_ID_INDEX_TWO; for any other id no lookup is
 *         done and the call fails as "cannot parse".
 *
 * Returns CM_SUCCESS only when the peer role string equals "LEADER";
 * CM_ERROR when the role is missing, is not a JSON string, or is anything else.
 */
static status_t IsPeerCmsRolePrimary(cJSON *nodes)
{
    cJSON *role = NULL;

    if (g_currentNode->cmServerId == CMS_ID_INDEX_ONE) {
        role = cJSON_GetObjectItem(cJSON_GetArrayItem(nodes, 1), "role");
    } else if (g_currentNode->cmServerId == CMS_ID_INDEX_TWO) {
        role = cJSON_GetObjectItem(cJSON_GetArrayItem(nodes, 0), "role");
    }

    if (role == NULL) {
        write_runlog(ERROR, "cannot parse ddb cluster info {role}.\n");
        return CM_ERROR;
    }

    /* cJSON_GetStringValue yields NULL for a non-string item; never hand that to strcmp */
    const char *roleStr = cJSON_GetStringValue(role);
    if (roleStr == NULL) {
        write_runlog(ERROR, "ddb cluster info {role} is not a string.\n");
        return CM_ERROR;
    }

    int peerCmserverRole = (strcmp(roleStr, "LEADER") == 0) ? CM_SERVER_PRIMARY : CM_SERVER_STANDBY;
    write_runlog(DEBUG5, "peerCmserverRole: %s g_ddbNetworkIsolationTimeout: %d\n",
        peerCmserverRole == CM_SERVER_PRIMARY ? "Primary" : "Standby", g_ddbNetworkIsolationTimeout);

    if (peerCmserverRole != CM_SERVER_PRIMARY) {
        return CM_ERROR;
    }

    return CM_SUCCESS;
}
|
|
|
|
/*
 * Decide from the ddb cluster info JSON whether the two nodes still see each other.
 *
 * info - JSON text of the ddb cluster info; NULL yields CM_ERROR.
 *
 * The first entry of "stream_list" is used and its "nodes" array is handed to:
 *   - IsPeerApplyIndexChanged when the local role is CM_SERVER_PRIMARY,
 *   - IsPeerCmsRolePrimary    when the local role is CM_SERVER_STANDBY.
 * Any other local role, or any parse failure, yields CM_ERROR.
 *
 * The parsed JSON tree is released on every path before returning.
 */
static status_t IsDdbLogSyncOn2Nodes(char * info)
{
    if (info == NULL) {
        return CM_ERROR;
    }

    write_runlog(DEBUG5, "ddb cluster info: %s\n", info);
    cJSON *root = cJSON_Parse(info);
    if (root == NULL) {
        write_runlog(ERROR, "cannot parse ddb cluster info {root}.\n");
        return CM_ERROR;
    }

    cJSON *stream = cJSON_GetArrayItem(cJSON_GetObjectItem(root, "stream_list"), 0);
    if (stream == NULL) {
        write_runlog(ERROR, "cannot parse ddb cluster info {stream_list}.\n");
        cJSON_Delete(root);
        return CM_ERROR;
    }

    cJSON *nodes = cJSON_GetObjectItem(stream, "nodes");
    if (nodes == NULL) {
        write_runlog(ERROR, "cannot parse ddb cluster info {nodes}.\n");
        cJSON_Delete(root);
        return CM_ERROR;
    }

    /* collect the verdict first: `nodes` belongs to `root`, which must be freed afterwards */
    status_t result = CM_ERROR;
    switch (g_HA_status->local_role) {
        case CM_SERVER_PRIMARY:
            result = IsPeerApplyIndexChanged(nodes);
            break;
        case CM_SERVER_STANDBY:
            result = IsPeerCmsRolePrimary(nodes);
            break;
        default:
            write_runlog(ERROR, "unexpected local_role: %d\n", g_HA_status->local_role);
            result = CM_ERROR;
            break;
    }

    cJSON_Delete(root);
    return result;
}
|
|
|
|
/*
 * Apply a ddb work mode through SetDdbWorkMode and mirror it in the globals.
 *
 * workMode     - mode to switch to.
 * voteNum      - vote number passed along to SetDdbWorkMode.
 * isBigVoteNum - stored in g_bigVoteNumInMinorityMode, but only when the
 *                new mode is DDB_WORK_MODE_MINORITY.
 *
 * On failure an error is logged and no global is changed.
 */
static inline void DdbSetDdbWorkMode(ddb_work_mode workMode, unsigned int voteNum, uint32 isBigVoteNum)
{
    const bool applied = (SetDdbWorkMode(workMode, voteNum) == CM_SUCCESS);
    if (!applied) {
        write_runlog(ERROR, "setting work mode: %d failed with minVoteNum: %d isBigVoteNum: %d\n",
            workMode, voteNum, isBigVoteNum);
        return;
    }

    g_ddbWorkMode = workMode;
    if (workMode != DDB_WORK_MODE_MINORITY) {
        return;
    }
    g_bigVoteNumInMinorityMode = isBigVoteNum;
}
|
|
|
|
/*
|
|
if reachale is true:
|
|
all ip is reachable, return CM_SUCCESS
|
|
else return CM_ERROR
|
|
if reachale is false:
|
|
all ip is not reachable, return CM_SUCCESS
|
|
else return CM_ERROR
|
|
*/
|
|
static status_t CheckAllIpStatus(char *ip, bool reachable) {
|
|
if (ip == nullptr) {
|
|
return CM_ERROR;
|
|
}
|
|
|
|
char tmpIp[CM_IP_LENGTH];
|
|
int rc = -1;
|
|
rc = strcpy_s(tmpIp, CM_IP_LENGTH, ip);
|
|
securec_check_errno(rc, (void)rc);
|
|
char *saveptr = NULL;
|
|
char *token = strtok_r(tmpIp, ",", &saveptr);
|
|
status_t ret = CM_SUCCESS;
|
|
bool flag = false;
|
|
while (token != NULL) {
|
|
if (reachable && IsReachableIP(token) != CM_SUCCESS) {
|
|
ret = CM_ERROR;
|
|
break;
|
|
}
|
|
else if (!reachable && IsReachableIP(token) == CM_SUCCESS) {
|
|
ret = CM_ERROR;
|
|
break;
|
|
}
|
|
flag = true;
|
|
token = strtok_r(NULL, ",", &saveptr);
|
|
}
|
|
|
|
return flag ? ret : CM_ERROR;
|
|
}
|
|
|
|
static void DdbMinorityWorkModeSetInMajority()
|
|
{
|
|
uint32 minVoteNum = 1;
|
|
if (CheckAllIpStatus(g_paramsOn2Nodes.thirdPartyGatewayIp, true) == CM_SUCCESS) {
|
|
// all third party gateway is reachable, setting a small vote num to make sure current node works as primary.
|
|
write_runlog(LOG, "promote node to primary\n");
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 0);
|
|
} else {
|
|
// not all third party gateway is reachable, setting a big vote num to make sure current node works as standby.
|
|
minVoteNum += MAX_VOTE_NUM;
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 1);
|
|
|
|
if (g_HA_status->local_role == CM_SERVER_PRIMARY) {
|
|
// primary node need to demote to standby
|
|
write_runlog(LOG, "demote node to standby\n");
|
|
if (DemoteDdbRole2Standby() != CM_SUCCESS) {
|
|
write_runlog(ERROR, "demote node to standby failed\n");
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
write_runlog(LOG, "go into minority work mode with minVoteNum: %d g_bigVoteNumInMinorityMode: %d.\n",
|
|
minVoteNum, g_bigVoteNumInMinorityMode);
|
|
(void)pthread_rwlock_wrlock(&term_update_rwlock);
|
|
IncrementTermToDdb(CM_INCREMENT_BIG_TERM_VALUE);
|
|
(void)pthread_rwlock_unlock(&term_update_rwlock);
|
|
}
|
|
|
|
static void DdbMinorityWorkModeSetInMinority()
|
|
{
|
|
uint32 minVoteNum = 1;
|
|
if (CheckAllIpStatus(g_paramsOn2Nodes.thirdPartyGatewayIp, true) == CM_SUCCESS && g_bigVoteNumInMinorityMode == 1) {
|
|
write_runlog(LOG, "reset minority work mode and become primary.\n");
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 0);
|
|
} else if (CheckAllIpStatus(g_paramsOn2Nodes.thirdPartyGatewayIp, false) == CM_SUCCESS && g_bigVoteNumInMinorityMode == 0) {
|
|
// every third party gateway is not reachable, setting a big vote num to make sure current node works as standby.
|
|
minVoteNum += MAX_VOTE_NUM;
|
|
write_runlog(LOG, "reset minority work mode and become standby.\n");
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 1);
|
|
|
|
if (g_HA_status->local_role == CM_SERVER_PRIMARY) {
|
|
write_runlog(LOG, "demote node to standby\n");
|
|
if (DemoteDdbRole2Standby() != CM_SUCCESS) {
|
|
write_runlog(ERROR, "demote node to standby failed\n");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void DdbMinorityWorkModeSetInStartup()
|
|
{
|
|
uint32 minVoteNum = 1;
|
|
if (CheckAllIpStatus(g_paramsOn2Nodes.thirdPartyGatewayIp, true) == CM_SUCCESS) {
|
|
write_runlog(LOG, "start up with minority work mode and minVoteNum: %d.\n", minVoteNum);
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 0);
|
|
} else {
|
|
minVoteNum += MAX_VOTE_NUM;
|
|
write_runlog(LOG, "start up with minority work mode and minVoteNum: %d.\n", minVoteNum);
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MINORITY, minVoteNum, 1);
|
|
DemoteDdbRole2Standby();
|
|
}
|
|
|
|
}
|
|
|
|
static int DdbStatusCheck() {
|
|
/*
|
|
* start to do dcc cluster status check.
|
|
*/
|
|
char info[DCC_CMD_MAX_OUTPUT_LEN] = {0};
|
|
if (IsPeerCmsReachableOn2Nodes() == CM_SUCCESS ||
|
|
(GetClusterInfoFromDDb(info, DCC_CMD_MAX_OUTPUT_LEN) == CM_SUCCESS
|
|
&& IsDdbLogSyncOn2Nodes(info) == CM_SUCCESS)) {
|
|
/*
|
|
* network are good between two nodes. reset g_ddbNetworkIsolationTimeout.
|
|
*/
|
|
g_ddbNetworkIsolationTimeout = g_paramsOn2Nodes.cmsNetworkIsolationTimeout;
|
|
return CM_SUCCESS;
|
|
}
|
|
|
|
return CM_ERROR;
|
|
}
|
|
|
|
static void DoDdbStatusCheckAndSet()
|
|
{
|
|
/*
|
|
* if the cluster are not two nodes cluster, or don't enable auto failover, skip check.
|
|
*/
|
|
if (!ENABLED_AUTO_FAILOVER_ON2NODES(g_cm_server_num, g_paramsOn2Nodes.cmsEnableFailoverOn2Nodes)) {
|
|
cm_sleep(DDB_STATUS_CHECK_INTERVAL);
|
|
return;
|
|
}
|
|
|
|
if (DdbStatusCheck() == CM_SUCCESS) {
|
|
if (g_ddbWorkMode != DDB_WORK_MODE_MAJORITY) {
|
|
write_runlog(LOG, "go into majority work mode.\n");
|
|
DdbSetDdbWorkMode(DDB_WORK_MODE_MAJORITY, 0, 0);
|
|
}
|
|
cm_sleep(DDB_STATUS_CHECK_INTERVAL);
|
|
return;
|
|
}
|
|
|
|
if (g_ddbNetworkIsolationTimeout != 0) {
|
|
cm_sleep(DDB_STATUS_CHECK_INTERVAL);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* go into minority work mode, because:
|
|
* 1. we cannot get ddb cluster info sync within g_cms_ddb_log_sync_timeout
|
|
* 2. we cannot reach peer node dcc ip:port
|
|
*/
|
|
switch (g_ddbWorkMode) {
|
|
case DDB_WORK_MODE_MAJORITY:
|
|
DdbMinorityWorkModeSetInMajority();
|
|
break;
|
|
case DDB_WORK_MODE_MINORITY:
|
|
DdbMinorityWorkModeSetInMinority();
|
|
break;
|
|
default:
|
|
DdbMinorityWorkModeSetInStartup();
|
|
}
|
|
cm_sleep(DDB_STATUS_CHECK_INTERVAL);
|
|
return;
|
|
}
|
|
|
|
/*
 * One round of the monitor thread. The checks run in a fixed order and the
 * round contains a one second sleep.
 *
 * ctx - per-thread context; passed to UpdateCheckInterval at the end of the
 *       round when the local role is primary.
 *
 * g_HA_status->local_role is re-read at every use on purpose, since the
 * checks in between (e.g. CheckRoleChange) run before those reads.
 */
static void DoMonitor(MonitorContext *ctx)
{
    CheckKerberosHB();
    CheckMajorityReElect();
    CheckCmctlStop();
#ifdef ENABLE_MULTIPLE_NODES
    CheckCnDelDelayTime();
#endif
    CheckHB();

    /*
     * Two-node cluster with auto failover enabled on dcc:
     * when network isolation happens, cms chooses a node as the new primary
     * if that node can reach the third party gateway.
     */
    const bool isDcc = (g_dbType == DB_DCC);
    if (isDcc && ENABLED_AUTO_FAILOVER_ON2NODES(g_cm_server_num, g_paramsOn2Nodes.cmsEnableFailoverOn2Nodes)) {
        CheckDdbClusterStatusOn2Nodes();
    }

    CheckMaxCluster();
    CheckRoleChange();

    if (g_HA_status->local_role != CM_SERVER_PRIMARY) {
        ResetInstanceStatus();
    } else {
        CheckETCD();
        if (g_cmserverDemoteDelayOnDdbFault > 0) {
            g_cmserverDemoteDelayOnDdbFault--;
        }
        CheckAllInstanceGroup();
        CheckAllUDF();
    }

    /* pending parameter reload request */
    if (g_gotParameterReload == 1) {
        ReloadParametersFromConfigfile();
        g_gotParameterReload = 0;
    }

    /* switchover window elapsed: forget the recorded switchover state */
    if (cmserver_switchover_timeout == 0) {
        switchOverInstances.clear();
        write_runlog(DEBUG1, "switchover timeout clear, no switchover in progress.\n");
        (void)pthread_rwlock_wrlock(&(switchover_az_rwlock));
        switchoverAZInProgress = false;
        (void)pthread_rwlock_unlock(&(switchover_az_rwlock));
    }

    if (g_instance_failover_delay_time_from_set == 1) {
        instance_failover_delay_timeout =
            (uint32)get_int_value_from_config(configDir, "instance_failover_delay_timeout", 0);
    }

    /* the balance check runs once every cluster_unbalance_check_interval rounds */
    g_cluster_unbalance_check_interval--;
    if (g_cluster_unbalance_check_interval <= 0) {
        g_cluster_unbalance_check_interval = cluster_unbalance_check_interval;
        check_cluster_balance_status();
    }

    if (IsCusResExist() && (g_HA_status->local_role == CM_SERVER_PRIMARY)) {
        CheckAllResReportByNode();
        CheckAllIsregByNode();
    }

    cm_sleep(1);

    if (g_HA_status->local_role == CM_SERVER_PRIMARY) {
        UpdateCheckInterval(ctx);
    }
}
|
|
|
|
void *CM_ThreadMonitorMain(void *argp)
|
|
{
|
|
CM_MonitorThread *monitor = (CM_MonitorThread *)argp;
|
|
/* unify log style */
|
|
thread_name = "Monitor";
|
|
|
|
write_runlog(LOG, "Starting Monitor thread\n");
|
|
|
|
monitor->thread.type = THREAD_TYPE_MONITOR;
|
|
|
|
MonitorContext ctx;
|
|
ctx.takeTime = 0;
|
|
|
|
for (;;) {
|
|
DoMonitor(&ctx);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
void *CM_ThreadDdbStatusCheckAndSetMain(void *argp)
|
|
{
|
|
CM_DdbStatusCheckAndSetThread *pCheckThread = (CM_DdbStatusCheckAndSetThread *)argp;
|
|
/* unify log style */
|
|
thread_name = "DdbStatusCheck";
|
|
|
|
write_runlog(LOG, "Starting Ddb Status Check thread\n");
|
|
|
|
pCheckThread->thread.type = THREAD_TYPE_DDB_STATUS_CHECKER;
|
|
|
|
for (;;) {
|
|
DoDdbStatusCheckAndSet();
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static void GetStopNodes(char *stopAzNodes, size_t len)
|
|
{
|
|
if (stopAzNodes == NULL || len == 0) {
|
|
write_runlog(WARNING, "az_stop_nodes is null, or len is zero.\n");
|
|
return;
|
|
}
|
|
|
|
write_runlog(LOG, "az_stop_nodes is: (%s).\n", stopAzNodes);
|
|
|
|
char *saveptr = NULL;
|
|
char *subStr = strtok_r(stopAzNodes, ",", &saveptr);
|
|
while (subStr) {
|
|
(void)g_stopNodes.insert(strtol(subStr, NULL, 10));
|
|
subStr = strtok_r(NULL, ",", &saveptr);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* thread to check whether node is manually stopped
|
|
* check node status must not be too often since ddb query cost some seconds
|
|
*/
|
|
void *CM_ThreadMonitorNodeStopMain(void *argp)
|
|
{
|
|
CM_MonitorNodeStopThread *monitor = (CM_MonitorNodeStopThread *)argp;
|
|
thread_name = "MonitorNodeStop";
|
|
monitor->thread.type = THREAD_TYPE_MONITOR;
|
|
write_runlog(LOG, "Starting MonitorNodeStop thread.\n");
|
|
|
|
const int checkInterval = 3;
|
|
int checkTrigger = 0;
|
|
int stopNodesKeyNum = 0;
|
|
char ddbValue[DDB_MIN_VALUE_LEN] = {0};
|
|
char ddbKey[MAX_PATH_LEN] = {0};
|
|
int tryTimes = TRY_TIME_GET_STATUSONLINE_FROM_DDB;
|
|
errno_t rc = 0;
|
|
status_t ret = CM_SUCCESS;
|
|
|
|
for (;;) {
|
|
if (checkTrigger != checkInterval) {
|
|
checkTrigger++;
|
|
cm_sleep(1);
|
|
continue;
|
|
}
|
|
|
|
checkTrigger = 0;
|
|
|
|
rc = snprintf_s(ddbKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/command/az_stop_nodes_num", pw->pw_name);
|
|
securec_check_intval(rc, (void)rc);
|
|
|
|
ret = TryDdbGet(ddbKey, ddbValue, DDB_MIN_VALUE_LEN, tryTimes, DEBUG1);
|
|
if (ret != CM_SUCCESS) {
|
|
write_runlog(DEBUG1, "get az_stop_nodes_num failed, key is %s\n", ddbKey);
|
|
cm_sleep(1);
|
|
continue;
|
|
} else {
|
|
stopNodesKeyNum = (int)strtol(ddbValue, NULL, 10);
|
|
rc = memset_s(ddbValue, DDB_MIN_VALUE_LEN, 0, DDB_MIN_VALUE_LEN);
|
|
securec_check_errno(rc, (void)rc);
|
|
}
|
|
|
|
g_stopNodes.clear();
|
|
|
|
for (int ii = 0; ii < stopNodesKeyNum; ii++) {
|
|
rc = snprintf_s(ddbKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/command/%d/az_stop_nodes", pw->pw_name, ii);
|
|
securec_check_intval(rc, (void)rc);
|
|
ret = TryDdbGet(ddbKey, ddbValue, DDB_MIN_VALUE_LEN, tryTimes);
|
|
if (ret != CM_SUCCESS) {
|
|
write_runlog(ERROR, "get az_stop_nodes failed, key is %s.\n", ddbKey);
|
|
} else {
|
|
GetStopNodes(ddbValue, DDB_MIN_VALUE_LEN);
|
|
rc = memset_s(ddbValue, DDB_MIN_VALUE_LEN, 0, DDB_MIN_VALUE_LEN);
|
|
securec_check_errno(rc, (void)rc);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void DeleteIgnoreNodeToDdb(char *key, uint32 keyLen)
|
|
{
|
|
status_t st = DelKeyInDdb(key, keyLen);
|
|
if (st != CM_SUCCESS) {
|
|
write_runlog(ERROR, "%d: ddb delete falied. Key=%s\n", __LINE__, key);
|
|
}
|
|
}
|
|
|
|
static void SetIgnoreNodeToDdb(char *key, uint32 keyLen, char *value, uint32 valueLen)
|
|
{
|
|
status_t st = SetKV2Ddb(key, keyLen, value, valueLen, NULL);
|
|
if (st != CM_SUCCESS) {
|
|
write_runlog(ERROR, "%d: ddb set failed. Key=%s, value=%s\n", __LINE__, key, value);
|
|
}
|
|
}
|
|
|
|
static void FindAndSetIgnoreNodeToDdb(const char *blackFile)
|
|
{
|
|
char ignoreNodeKey[MAX_PATH_LEN] = {0};
|
|
char ignoreNodeNumKey[MAX_PATH_LEN] = {0};
|
|
char nodeName[CM_NODE_NAME];
|
|
uint32 nodeNamenums = 1;
|
|
char nodeNamenumValue[MAX_PATH_LEN] = {0};
|
|
|
|
FILE *fp = fopen(blackFile, "r");
|
|
if (fp == NULL) {
|
|
write_runlog(ERROR, "%d: failed to open file %s\n", __LINE__, blackFile);
|
|
return;
|
|
}
|
|
int rc = memset_s(nodeName, CM_NODE_NAME, 0, CM_NODE_NAME);
|
|
securec_check_errno(rc, (void)fclose(fp));
|
|
while (!feof(fp)) {
|
|
if (fgets(nodeName, CM_NODE_NAME, fp) == NULL) {
|
|
write_runlog(ERROR, "%d: failed to get nodename\n", __LINE__);
|
|
continue;
|
|
}
|
|
(void)trim(nodeName);
|
|
rc = snprintf_s(ignoreNodeKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignorenode/%u", pw->pw_name, nodeNamenums);
|
|
securec_check_intval(rc, (void)fclose(fp));
|
|
SetIgnoreNodeToDdb(ignoreNodeKey, MAX_PATH_LEN, nodeName, CM_NODE_NAME);
|
|
nodeNamenums++;
|
|
}
|
|
(void)fclose(fp);
|
|
fp = NULL;
|
|
nodeNamenums--;
|
|
rc = snprintf_s(ignoreNodeNumKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignoreNodeNumKey", pw->pw_name);
|
|
securec_check_intval(rc, (void)rc);
|
|
rc = snprintf_s(nodeNamenumValue, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%u", nodeNamenums);
|
|
securec_check_intval(rc, (void)rc);
|
|
SetIgnoreNodeToDdb(ignoreNodeNumKey, MAX_PATH_LEN, nodeNamenumValue, MAX_PATH_LEN);
|
|
}
|
|
|
|
static void DdbGetIgnoreNode(const char *key, char *value, uint32 valueLen)
|
|
{
|
|
DDB_RESULT dbResult = SUCCESS_GET_VALUE;
|
|
status_t st = GetKVAndLogLevel(key, value, valueLen, &dbResult, DEBUG1);
|
|
if (st != CM_SUCCESS) {
|
|
write_runlog(DEBUG1, "get ddb key %s error %d\n", key, dbResult);
|
|
}
|
|
}
|
|
|
|
static bool CheckIgnoreNode()
|
|
{
|
|
char ignoreNodeKey[MAX_PATH_LEN] = {0};
|
|
char getIgnoreNodeValue[MAX_PATH_LEN] = {0};
|
|
char ignoreNodeNumKey[MAX_PATH_LEN] = {0};
|
|
char getignoreNodeNumValue[MAX_PATH_LEN] = {0};
|
|
|
|
errno_t rc = snprintf_s(ignoreNodeNumKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignoreNodeNumKey", pw->pw_name);
|
|
securec_check_intval(rc, (void)rc);
|
|
DdbGetIgnoreNode(ignoreNodeNumKey, getignoreNodeNumValue, MAX_PATH_LEN);
|
|
uint32 ignoreNum = (uint32)strtoul(getignoreNodeNumValue, NULL, 0);
|
|
if (ignoreNum == 0) {
|
|
return false;
|
|
}
|
|
for (uint32 i = 0; i < ignoreNum; i++) {
|
|
rc = snprintf_s(ignoreNodeKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignorenode/%u", pw->pw_name, i + 1);
|
|
securec_check_intval(rc, (void)rc);
|
|
getIgnoreNodeValue[0] = 0;
|
|
DdbGetIgnoreNode(ignoreNodeKey, getIgnoreNodeValue, MAX_PATH_LEN);
|
|
if (strncmp(g_currentNode->nodeName, getIgnoreNodeValue, MAX_PATH_LEN) == 0) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void StopIgnoreNode()
|
|
{
|
|
int rcs;
|
|
char stopCmd[MAXPGPATH] = {0};
|
|
|
|
if (!CheckIgnoreNode()) {
|
|
return;
|
|
}
|
|
write_runlog(LOG, "stop hostname is %s.\n", g_currentNode->nodeName);
|
|
rcs =
|
|
snprintf_s(stopCmd, MAXPGPATH, MAXPGPATH - 1, "cm_ctl stop -n %u -m i > /dev/null 2>&1 &", g_currentNode->node);
|
|
securec_check_intval(rcs, (void)rcs);
|
|
rcs = system(stopCmd);
|
|
if (rcs != 0) {
|
|
write_runlog(ERROR, "cmd execute failed : %s, errno=%d.\n", stopCmd, errno);
|
|
}
|
|
write_runlog(FATAL, "The current node(%u) has been ignored.\n", g_currentNode->node);
|
|
FreeNotifyMsg();
|
|
exit(1);
|
|
}
|
|
|
|
static void RmAllBlackFile(const char *blackFile)
|
|
{
|
|
int ret;
|
|
char rmCmd[MAXPGPATH] = {0};
|
|
for (uint32 i = 0; i < g_node_num; i++) {
|
|
/* cmServerLevel used to check if the node has cm_server */
|
|
if (g_node[i].cmServerLevel != 1 || g_node[i].sshCount == 0) {
|
|
continue;
|
|
}
|
|
|
|
ret = snprintf_s(rmCmd,
|
|
MAXPGPATH,
|
|
MAXPGPATH - 1,
|
|
"pssh %s -s -H %s \"if [ -f %s ];then rm -f %s;fi\"",
|
|
PSSH_TIMEOUT_OPTION,
|
|
g_node[i].sshChannel[0],
|
|
blackFile,
|
|
blackFile);
|
|
securec_check_intval(ret, (void)ret);
|
|
ret = system(rmCmd);
|
|
if (ret != 0) {
|
|
write_runlog(ERROR, "Remove blackfile fail cmd is: %s, errno=%d.\n", rmCmd, errno);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void DeleteIgnoreNodeFromDdb()
|
|
{
|
|
char ignoreNodeNumKey1[MAX_PATH_LEN] = {0};
|
|
char ignoreNodeKey[MAX_PATH_LEN] = {0};
|
|
char getIgnoreNodeValue[MAX_PATH_LEN] = {0};
|
|
char getignoreNodeNumValue[MAX_PATH_LEN] = {0};
|
|
|
|
errno_t rc = snprintf_s(ignoreNodeNumKey1, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignoreNodeNumKey", pw->pw_name);
|
|
securec_check_intval(rc, (void)rc);
|
|
DdbGetIgnoreNode(ignoreNodeNumKey1, getignoreNodeNumValue, MAX_PATH_LEN);
|
|
uint32 ignoreNum = (uint32)strtoul(getignoreNodeNumValue, NULL, 0);
|
|
if (ignoreNum == 0) {
|
|
return;
|
|
}
|
|
for (uint32 i = 0; i < ignoreNum; i++) {
|
|
rc = snprintf_s(ignoreNodeKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignorenode/%u", pw->pw_name, i + 1);
|
|
securec_check_intval(rc, (void)rc);
|
|
getIgnoreNodeValue[0] = 0;
|
|
DdbGetIgnoreNode(ignoreNodeKey, getIgnoreNodeValue, MAX_PATH_LEN);
|
|
if (strncmp(g_currentNode->nodeName, getIgnoreNodeValue, MAX_PATH_LEN) == 0) {
|
|
return;
|
|
}
|
|
}
|
|
for (uint32 i = 0; i < ignoreNum; i++) {
|
|
rc = snprintf_s(ignoreNodeKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/ignorenode/%u", pw->pw_name, i + 1);
|
|
securec_check_intval(rc, (void)rc);
|
|
DeleteIgnoreNodeToDdb(ignoreNodeKey, MAX_PATH_LEN);
|
|
}
|
|
DeleteIgnoreNodeToDdb(ignoreNodeNumKey1, MAX_PATH_LEN);
|
|
}
|
|
|
|
static void CheckIgnoreFile(struct stat *beforeStat, const char *blackFile, bool *firstGetFile)
|
|
{
|
|
struct stat afterStat = {0};
|
|
if (stat(blackFile, &afterStat) != 0) {
|
|
DeleteIgnoreNodeFromDdb();
|
|
return;
|
|
}
|
|
if ((*firstGetFile) && stat(blackFile, beforeStat) == 0) {
|
|
FindAndSetIgnoreNodeToDdb(blackFile);
|
|
*firstGetFile = false;
|
|
} else if (stat(blackFile, &afterStat) == 0 && (beforeStat->st_mtim.tv_sec != afterStat.st_mtim.tv_sec ||
|
|
beforeStat->st_mtim.tv_nsec != afterStat.st_mtim.tv_nsec)) {
|
|
int ret;
|
|
FindAndSetIgnoreNodeToDdb(blackFile);
|
|
ret = memcpy_s(beforeStat, sizeof(afterStat), &afterStat, sizeof(afterStat));
|
|
securec_check_errno(ret, (void)ret);
|
|
}
|
|
}
|
|
|
|
static void CheckOneIP(char *ip, uint32 ipLen, bool *isIncluster)
|
|
{
|
|
int ret;
|
|
char stopCmd[MAXPGPATH] = {0};
|
|
|
|
for (uint32 i = 0; i < g_node_num; i++) {
|
|
if (strncmp(ip, g_node[i].nodeName, CM_IP_LENGTH) == 0) {
|
|
*isIncluster = true;
|
|
|
|
if (StopCheckNode(i) == -1) {
|
|
write_runlog(LOG, "stop hostname is %s.\n", g_node[i].nodeName);
|
|
ret = snprintf_s(
|
|
stopCmd, MAXPGPATH, MAXPGPATH - 1, "cm_ctl stop -n %u -m i > /dev/null 2>&1 &", g_node[i].node);
|
|
securec_check_intval(ret, (void)ret);
|
|
ret = system(stopCmd);
|
|
if (ret != 0) {
|
|
write_runlog(ERROR, "cmd execute failed : %s, errno=%d.\n", stopCmd, errno);
|
|
}
|
|
ret = memset_s(ip, ipLen, 0, ipLen);
|
|
securec_check_errno(ret, (void)ret);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckBlackFile(const char* blackFile)
|
|
{
|
|
char ip[CM_IP_LENGTH];
|
|
int ret = memset_s(ip, CM_IP_LENGTH, 0, CM_IP_LENGTH);
|
|
securec_check_errno(ret, (void)ret);
|
|
|
|
FILE *fp = fopen(blackFile, "r");
|
|
if (fp != NULL) {
|
|
bool isIncluster = false;
|
|
while (!feof(fp)) {
|
|
if (fgets(ip, CM_NODE_NAME, fp) != NULL) {
|
|
(void)trim(ip);
|
|
CheckOneIP(ip, CM_IP_LENGTH, &isIncluster);
|
|
}
|
|
}
|
|
(void)fclose(fp);
|
|
if (!isIncluster) {
|
|
RmAllBlackFile(blackFile);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void CheckAgentFile(const char* execPath)
|
|
{
|
|
int ret;
|
|
char upgradeStopFile[MAX_PATH_LEN] = {0};
|
|
char stopCmd[MAXPGPATH] = {0};
|
|
struct stat stopStatBuf = {0};
|
|
|
|
// check agent version file
|
|
for (uint32 i = 0; i < g_node_num; i++) {
|
|
ret = snprintf_s(
|
|
upgradeStopFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/check_version_result-%u", execPath, i);
|
|
securec_check_intval(ret, (void)ret);
|
|
if (stat(upgradeStopFile, &stopStatBuf) == 0) {
|
|
ret = snprintf_s(stopCmd,
|
|
MAXPGPATH,
|
|
MAXPGPATH - 1,
|
|
"cm_ctl stop -n %u -m i > /dev/null 2>&1 & rm %s;",
|
|
i,
|
|
upgradeStopFile);
|
|
securec_check_intval(ret, (void)ret);
|
|
ret = system(stopCmd);
|
|
if (ret != 0) {
|
|
write_runlog(ERROR, "cmd execute failed : %s, errno=%d.\n", stopCmd, errno);
|
|
} else {
|
|
write_runlog(LOG, "cmd execute successed : %s.\n", stopCmd);
|
|
}
|
|
char rmCmd[MAXPGPATH] = {0};
|
|
ret = snprintf_s(rmCmd, MAXPGPATH, MAXPGPATH - 1, "rm %s &", upgradeStopFile);
|
|
securec_check_intval(ret, (void)ret);
|
|
ret = system(rmCmd);
|
|
if (ret != 0) {
|
|
write_runlog(ERROR, "cmd execute failed : %s, errno=%d.\n", rmCmd, errno);
|
|
} else {
|
|
write_runlog(LOG, "cmd execute successed : %s.\n", stopCmd);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void DoCheckBlackList(const char* execPath, const char* blackFile)
|
|
{
|
|
StopIgnoreNode();
|
|
CheckBlackFile(blackFile);
|
|
CheckAgentFile(execPath);
|
|
}
|
|
|
|
void *CheckBlackList(void *arg)
|
|
{
|
|
int ret;
|
|
char pghostPath[MAXPGPATH] = {0};
|
|
char execPath[MAXPGPATH] = {0};
|
|
char blackFile[MAX_PATH_LEN] = {0};
|
|
bool firstGetIgnore = true;
|
|
struct stat beforeStat = {0};
|
|
|
|
ret = cmserver_getenv("PGHOST", pghostPath, sizeof(pghostPath), ERROR);
|
|
if (ret != EOK) {
|
|
write_runlog(ERROR, "Get PGHOST failed, please check.\n");
|
|
return NULL;
|
|
}
|
|
check_input_for_security(pghostPath);
|
|
if (GetHomePath(execPath, sizeof(execPath)) != 0) {
|
|
return NULL;
|
|
}
|
|
|
|
ret = snprintf_s(blackFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/ignore_node_record", pghostPath);
|
|
securec_check_intval(ret, (void)ret);
|
|
check_input_for_security(blackFile);
|
|
canonicalize_path(blackFile);
|
|
for (;;) {
|
|
if (got_stop == 1) {
|
|
write_runlog(LOG, "receive exit request in CheckBlackList.\n");
|
|
break;
|
|
}
|
|
CheckIgnoreFile(&beforeStat, blackFile, &firstGetIgnore);
|
|
if (g_HA_status->local_role == CM_SERVER_PRIMARY) {
|
|
DoCheckBlackList(execPath, blackFile);
|
|
}
|
|
cm_sleep(CHECK_SLEEP_INTERVAL);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
static int GetGtmMode()
|
|
{
|
|
if (!IsNeedSyncDdb()) {
|
|
return -1;
|
|
}
|
|
char ddbKey[MAX_PATH_LEN] = {0};
|
|
char ddbValue[MAX_PATH_LEN] = {0};
|
|
int tryTimes = TRY_TIME_GET_STATUSONLINE_FROM_DDB;
|
|
|
|
int erc = snprintf_s(ddbKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/command/gtm_mode", pw->pw_name);
|
|
securec_check_intval(erc, (void)erc);
|
|
status_t getResult = TryDdbGet(ddbKey, ddbValue, MAX_PATH_LEN, tryTimes);
|
|
if (getResult != CM_SUCCESS) {
|
|
write_runlog(WARNING, "get gtm_mode failed, key is %s.\n", ddbKey);
|
|
return -1;
|
|
} else {
|
|
write_runlog(LOG, "get gtm_mode successfully, values is %s.\n", ddbValue);
|
|
g_gtm_free_mode = (strcmp(ddbValue, "on") == 0);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void *CheckGtmModMain(void *arg)
|
|
{
|
|
uint32 sleepInterval = 1;
|
|
thread_name = "CheckGtmMod";
|
|
write_runlog(LOG, "Starting check gtm mod thread.\n");
|
|
for (;;) {
|
|
if (got_stop == 1) {
|
|
write_runlog(LOG, "receive exit request in CheckGtmModMain.\n");
|
|
cm_sleep(sleepInterval);
|
|
continue;
|
|
}
|
|
/* gtm_mode can not change after cluster install in actual situation, this guc param will set in install guc */
|
|
if (GetGtmMode() == 0) {
|
|
write_runlog(LOG, "success get gtm mod (%d), and CheckGtmModMain will exit.\n", (int)g_gtm_free_mode);
|
|
break;
|
|
}
|
|
cm_sleep(sleepInterval);
|
|
}
|
|
return NULL;
|
|
}
|
|
#endif
|
|
|