/* * Copyright (c) 2021 Huawei Technologies Co.,Ltd. * * CM is licensed under Mulan PSL v2. * You can use this software according to the terms and conditions of the Mulan PSL v2. * You may obtain a copy of Mulan PSL v2 at: * * http://license.coscl.org.cn/MulanPSL2 * * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PSL v2 for more details. * ------------------------------------------------------------------------- * * cma_instance_management.cpp * * * IDENTIFICATION * src/cm_agent/cma_instance_management.cpp * * ------------------------------------------------------------------------- */ #include #include #include #include "cma_global_params.h" #include "cm/stringinfo.h" #include "cm/libpq-fe.h" #include "cm/libpq-int.h" #include "common/config/cm_config.h" #include "cma_alarm.h" #include "cma_common.h" #include "cma_client.h" #include "cma_process_messages.h" #ifdef ENABLE_MULTIPLE_NODES #include "cma_coordinator.h" #include "cma_coordinator_utils.h" #include "cma_cn_gtm_instance_management.h" #endif #include "cma_instance_management_res.h" #include "cma_network_check.h" #include "cma_instance_management.h" #ifdef ENABLE_UT #define static #endif static bool IsCmsReplaceFlagFileExists(); static void StopCmInstance(); static void StopOneZengine(uint32 index); static bool StopCurDnFloatIp(uint32 index); #ifdef ENABLE_GCOV static const int SIG_TYPE = 2; #else static const int SIG_TYPE = 9; #endif bool g_isDnFirstStart = true; typedef void (*StartCheck)(void); typedef bool (*StartDnCheck)(void); /* * @brief A helper function to indicate whether the gs_relpace command is running to repair the faulty cms * * @return true gs_replace command is running, and the cms replace flag file is existed. * @return false gs_replace command is not executed or finished, and the cms replace flag file is not existed. * */ static bool IsCmsReplaceFlagFileExists() { errno_t rc; struct stat cmsStatBuf = {0}; bool cmsReplace = false; char instanceReplace[MAX_PATH_LEN] = {0}; rc = snprintf_s(instanceReplace, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s_%s_%u", g_binPath, CM_INSTANCE_REPLACE, "cmserver", g_currentNode->cmServerId); securec_check_intval(rc, (void)rc); if (stat(instanceReplace, &cmsStatBuf) == 0) { cmsReplace = true; write_runlog(LOG, "the cms %u is being repaired by gs_replace, do not start it!\n", g_currentNode->cmServerId); } else { write_runlog(DEBUG1, "the cms %u is not being repaired by gs_replace!\n", g_currentNode->cmServerId); } return cmsReplace; } void start_cmserver_check(void) { int ret; int alarmReason = UNKNOWN_BAD_REASON; int rc; char command[MAXPGPATH] = {0}; char instanceName[CM_NODE_NAME] = {0}; struct stat cluster_stat_buf = {0}; AlarmAdditionalParam tempAdditionalParam; bool cdt; /* If the current node does not deploy cms, we do not need to execute the operation of cms start-detection. */ if (g_currentNode->cmServerLevel != 1) { return; } uint32 alarmIndex = g_currentNode->datanodeCount; rc = snprintf_s( instanceName, sizeof(instanceName), sizeof(instanceName) - 1, "%s_%u", "cms", g_currentNode->cmServerId); securec_check_intval(rc, (void)rc); rc = memset_s(command, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_binPath, CM_SERVER_BIN_NAME); securec_check_intval(rc, (void)rc); /* * If cmserver data path disk fault, kill cmserver */ if (stat(g_cmManualStartPath, &cluster_stat_buf) != 0) { cdt = (!agentCheckDisc(g_currentNode->cmDataPath) || !agentCheckDisc(g_logBasePath)); if (cdt) { write_runlog(ERROR, "data path disk writable test failed, %s.\n", g_currentNode->cmDataPath); g_cmsDiskDamage = true; set_instance_not_exist_alarm_value(&alarmReason, DISC_BAD_REASON); } else { g_cmsDiskDamage = false; } } else { g_cmsDiskDamage = false; } if (!GetNicStatus(g_currentNode->cmServerId, CM_INSTANCE_TYPE_CMS)) { write_runlog(WARNING, "nic related with cmserver not up.\n"); g_cmsNicDown = true; set_instance_not_exist_alarm_value(&alarmReason, NIC_BAD_REASON); } else { g_cmsNicDown = false; } ret = check_one_instance_status(CM_SERVER_BIN_NAME, command, NULL); switch (ret) { case PROCESS_RUNNING: cdt = (g_cmsDiskDamage || g_cmsNicDown); if (cdt) { write_runlog(LOG, "cms_%u is killed because disk fault or nic fault, g_cmsDiskDamage=%d, g_cmsNicDown=%d.\n", g_currentNode->cmServerId, g_cmsDiskDamage, g_cmsNicDown); if (g_startupAlarmList != NULL) { g_startCmsCount = 0; /* fill the alarm message */ WriteAlarmAdditionalInfo(&tempAdditionalParam, instanceName, "", "", "", &(g_startupAlarmList[alarmIndex]), ALM_AT_Fault, instanceName, instance_not_exist_reason_to_string(alarmReason)); /* report the alarm */ AlarmReporter(&(g_startupAlarmList[alarmIndex]), ALM_AT_Fault, &tempAdditionalParam); } StopCmInstance(); } else { if (g_startupAlarmList != NULL) { g_startCmsCount = 0; /* fill the alarm message */ WriteAlarmAdditionalInfo(&tempAdditionalParam, instanceName, "", "", "", &(g_startupAlarmList[alarmIndex]), ALM_AT_Resume); /* report the alarm */ AlarmReporter(&(g_startupAlarmList[alarmIndex]), ALM_AT_Resume, &tempAdditionalParam); } } break; case PROCESS_NOT_EXIST: if (g_startCmsCount < STARTUP_CMS_CHECK_TIMES) { /* * the value is -1, it meas the * cluster is starting now ,and cmserver don't start any one */ if (g_startCmsCount == -1) { g_startCmsCount = 1; } else { ++g_startCmsCount; } } else { if (g_startupAlarmList != NULL) { /* fill the alarm message */ WriteAlarmAdditionalInfo(&tempAdditionalParam, instanceName, "", "", "", &(g_startupAlarmList[alarmIndex]), ALM_AT_Fault, instanceName); /* report the alarm */ AlarmReporter(&(g_startupAlarmList[alarmIndex]), ALM_AT_Fault, &tempAdditionalParam); } } if (g_cmsDiskDamage || g_cmsNicDown) { write_runlog(LOG, "g_cmsDiskDamage is %d, and g_cmsNicDown is %d, cannot start cms.\n", g_cmsDiskDamage, g_cmsNicDown); return; } /* Judge the current node cms whether the cms is under replacing */ if (IsCmsReplaceFlagFileExists()) { write_runlog(LOG, "the node(%u) cms is being replaced, do not start it!\n", g_currentNode->node); return; } cdt = (agentCheckPort(g_currentNode->port) <= 0 && agentCheckPort(g_currentNode->cmServerLocalHAPort) <= 0); if (cdt) { rc = memset_s(command, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, SYSTEMQUOTE "%s/%s >> \"%s\" 2>&1 &" SYSTEMQUOTE, g_binPath, CM_SERVER_BIN_NAME, system_call_log); securec_check_intval(rc, (void)rc); write_runlog(LOG, "CM_SERVER START system(command:%s).\n", command); ret = system(command); if (ret != 0) { write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", ret, command, errno); } } break; default: write_runlog(ERROR, "error.cm_server,ret=%d\n", ret); break; } } static void CheckProcessNum(const char* cmdLine) { char command[MAXPGPATH] = { 0 }; char line[MAXPGPATH] = { 0 }; char buffer[MAXPGPATH] = { 0 }; uint32 processCount = 0; int ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "ps ux | grep -v grep | grep \"%s\"", cmdLine); securec_check_intval(ret, (void)ret); FILE *fp = popen(command, "re"); if (fp == NULL) { return; } while (!feof(fp)) { if (fgets(line, MAXPGPATH - 1, fp)) { ret = strcat_s(buffer, MAXPGPATH, line); securec_check_errno(ret, (void)ret); processCount++; } } (void)pclose(fp); if (processCount > 1) { write_runlog(ERROR, "Multiple processes <%s>, buf is\n%s", cmdLine, buffer); ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "ps ux | grep -v grep| grep \"%s\"| awk '{print $2}'| xargs kill -9", cmdLine); securec_check_intval(ret, (void)ret); ret = system(command); if (ret != 0) { write_runlog(ERROR, "[CheckProcessNum] Failed to execute the command: command=\"%s\"," "errno=%d.\n", command, errno); return; } write_runlog(LOG, "[CheckProcessNum] kill all <%s> process success.\n", cmdLine); } } /* * check if fenced UDF is not running and start it. */ void start_fenced_UDF_check(void) { int ret; ret = check_one_instance_status(FENCED_MASTER_BIN_NAME, "fenced", NULL); if (ret == PROCESS_NOT_EXIST) { char command[MAXPGPATH]; errno_t rc; int rcs; g_fencedUdfStopped = true; rc = memset_s(command, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rcs = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, SYSTEMQUOTE "%s/%s --fenced -k %s -D %s >> \"%s\" 2>&1 &" SYSTEMQUOTE, g_binPath, FENCED_MASTER_BIN_NAME, g_unixSocketDirectory, sys_log_path, system_call_log); securec_check_intval(rcs, (void)rcs); write_runlog(LOG, "FENCED UDF START system(command:%s).\n", command); ret = system(command); if (ret != 0) { write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", ret, command, errno); } } else { CheckProcessNum("gaussdb fenced UDF master"); g_fencedUdfStopped = false; } } void CheckAgentNicDown() { if (!GetNicStatus(g_currentNode->cmAgentId, CM_INSTANCE_TYPE_CMA)) { write_runlog(WARNING, "nic related with cm_agent not up.\n"); g_agentNicDown = true; } else { g_agentNicDown = false; } } static void ComputeCheckTime(const cmTime_t *checkBegin, const char *str) { const long ddbTime = 2; struct timespec checkEnd = {0, 0}; (void)clock_gettime(CLOCK_MONOTONIC, &checkEnd); if (checkEnd.tv_sec - checkBegin->tv_sec > ddbTime) { write_runlog(LOG, "%s, it takes %ld s.\n", str, (checkEnd.tv_sec - checkBegin->tv_sec)); } } static void StartInstanceAndCheck(StartCheck startCheck, const char *str) { if (startCheck == NULL) { return; } cmTime_t checkBegin = {0, 0}; (void)clock_gettime(CLOCK_MONOTONIC, &checkBegin); startCheck(); ComputeCheckTime(&checkBegin, str); } static bool CheckDnCanStart(StartDnCheck startCheck, const char *str) { if (startCheck == NULL) { return true; } cmTime_t checkBegin = {0, 0}; (void)clock_gettime(CLOCK_MONOTONIC, &checkBegin); bool needStartDnCheck = startCheck(); ComputeCheckTime(&checkBegin, str); return needStartDnCheck; } void start_instance_check(void) { if (g_shutdownRequest) { return; } #ifdef ENABLE_MULTIPLE_NODES StartInstanceAndCheck(start_gtm_check, "[start_gtm_check]"); StartInstanceAndCheck(start_coordinator_check, "[start_coordinator_check]"); #endif StartInstanceAndCheck(start_cmserver_check, "[start_cmserver_check]"); bool needStartDnCheck = CheckDnCanStart(CheckStartDN, "[CheckStartDN]"); /* ignore pausing state when firstly starting, otherwise * skip StartDatanodeCheck and StartResourceCheck in pausing state */ if (g_isDnFirstStart) { if (needStartDnCheck && !g_enableSharedStorage) { StartInstanceAndCheck(StartDatanodeCheck, "[StartDatanodeCheck]"); } } else { if (!g_isPauseArbitration && needStartDnCheck && !g_enableSharedStorage) { StartInstanceAndCheck(StartDatanodeCheck, "[StartDatanodeCheck]"); } } StartInstanceAndCheck(CheckAgentNicDown, "[CheckAgentNicDown]"); if (IsCusResExistLocal()) { StartInstanceAndCheck(StartResourceCheck, "[StartResourceCheck]"); } if (g_clusterType == V3SingleInstCluster) { return; } #ifdef ENABLE_MULTIPLE_NODES if (g_currentNode->coordinate == 1 || g_currentNode->datanodeCount > 0) { #else if (g_currentNode->datanodeCount > 0 && needStartDnCheck) { #endif StartInstanceAndCheck(start_fenced_UDF_check, "[start_fenced_UDF_check]"); } else { g_fencedUdfStopped = true; } } static ShutdownMode GetShutdownMode(int32 mode) { if (mode < (int32)FAST_MODE || mode >= (int32)SHUTDOWN_CEIL_MODE) { return FAST_MODE; } return (ShutdownMode)mode; } /* get the lines from a text file - return NULL if file can't be opened */ void get_stop_mode(const char *path) { FILE *infile; int linelen = 0; int nlines = 0; int c; const int bufferLen = 8; char buffer[bufferLen]; g_cmDoForce = false; g_cmShutdownMode = FAST_MODE; g_cmShutdownLevel = SINGLE_INSTANCE; if ((infile = fopen(path, "re")) == NULL) { write_runlog(ERROR, "fopen error.\n"); return; } while ((c = fgetc(infile)) != EOF) { linelen++; if (c == '\n') { nlines++; linelen = 0; } } /* handle last line without a terminating newline (yuck) */ if (linelen) { nlines++; } /* cluster_manual_start file damaged. */ if (nlines < 3) { write_runlog(LOG, "cluster_manual_start file damaged.\n"); (void)fclose(infile); return; } write_runlog(LOG, "[%s] nlines :%d\n", __FUNCTION__, nlines); rewind(infile); if (fgets(buffer, bufferLen, infile) != NULL) { g_cmDoForce = CmAtoBool(buffer); } if (fgets(buffer, bufferLen, infile) != NULL) { g_cmShutdownMode = GetShutdownMode(CmAtoi(buffer, (int)FAST_MODE)); } if (fgets(buffer, bufferLen, infile) != NULL) { g_cmShutdownLevel = CmAtoi(buffer, SINGLE_INSTANCE); } write_runlog(LOG, "g_cmDoForce :%d,g_cmShutdownMode:%d, g_cmShutdownLevel:%d\n", g_cmDoForce, g_cmShutdownMode, g_cmShutdownLevel); (void)fclose(infile); return; } /* * 0 if read pid failed, * pid if success. */ static pid_t get_instances_pid(const char *pidPath) { pid_t pid; FILE *pidf = fopen(pidPath, "re"); if (pidf == NULL) { /* No pid file, not an error on startup */ char errBuffer[ERROR_LIMIT_LEN]; if (errno == ENOENT) { write_runlog(ERROR, "PID file :\"%s\" does not exist: %s\n.", pidPath, strerror_r(errno, errBuffer, ERROR_LIMIT_LEN)); } else { write_runlog( ERROR, "could not open PID file \"%s\": %s\n.", pidPath, strerror_r(errno, errBuffer, ERROR_LIMIT_LEN)); } return 0; } if (fscanf_s(pidf, "%d", &pid) != 1) { write_runlog(ERROR, "invalid data in PID file \"%s\"\n", pidPath); (void)fclose(pidf); return 0; } (void)fclose(pidf); return pid; } void fast_stop_one_instance(const char *instDataPath, InstanceTypes instance_type) { int fast_sig = SIGINT; pid_t pid; char pid_path[MAXPGPATH] = {0}; int ret; int rcs = 0; if (instance_type == INSTANCE_CN) { rcs = snprintf_s(pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", instDataPath, "postmaster.pid"); } else if (instance_type == INSTANCE_DN) { rcs = snprintf_s(pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", instDataPath, "postmaster.pid"); } else if (instance_type == INSTANCE_GTM) { rcs = snprintf_s(pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", instDataPath, "gtm.pid"); } securec_check_intval(rcs, (void)rcs); pid = get_instances_pid(pid_path); if (pid == 0) { ret = check_one_instance_status(type_int_to_str_binname(instance_type), instDataPath, NULL); if (ret == PROCESS_RUNNING) { write_runlog(ERROR, "%s' pid is 0, but still running, use kill_instance_force(): %s.\n", type_int_to_str_name(instance_type), instDataPath); kill_instance_force(instDataPath, instance_type); } return; } /* now just send sig once. */ if (kill(pid, fast_sig) != 0) { write_runlog(ERROR, "fast shutdown ,could not send stop signal (PID: %d), kill_instance_force():%s.\n", pid, instDataPath); kill_instance_force(instDataPath, instance_type); return; } if ((instance_type == INSTANCE_GTM) && (g_cmShutdownLevel != ALL_NODES)) { cm_sleep(2); ret = check_one_instance_status(type_int_to_str_binname(instance_type), instDataPath, NULL); if (ret == PROCESS_RUNNING) { write_runlog(ERROR, "%s is still running, use kill_instance_force to kill : %s.\n", type_int_to_str_name(instance_type), instDataPath); kill_instance_force(instDataPath, instance_type); } } if (instance_type == INSTANCE_DN) { ExecuteEventTrigger(EVENT_STOP); } write_runlog(LOG, "%s shutting down.\n", type_int_to_str_name(instance_type)); } void CheckOfflineNode(uint32 i) { int rcs = 0; if (!CheckStartDN()) { rcs = check_one_instance_status(DATANODE_BIN_NAME, g_currentNode->datanode[i].datanodeLocalDataPath, NULL); if (rcs == PROCESS_RUNNING) { immediate_stop_one_instance(g_currentNode->datanode[i].datanodeLocalDataPath, INSTANCE_DN); } rcs = check_one_instance_status(FENCED_MASTER_BIN_NAME, "fenced", NULL); if (rcs == PROCESS_RUNNING) { g_fencedUdfStopped = true; kill_instance_force("fenced", INSTANCE_FENCED); } } } void stop_datanode_check(uint32 i) { bool cdt; struct stat instanceStatBuf = {0}; struct stat clusterStatBuf = {0}; errno_t rc; int rcs; char instanceManualStartPath[MAX_PATH_LEN] = {0}; char instanceReplace[MAX_PATH_LEN] = {0}; rc = memset_s(instanceManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); rcs = snprintf_s(instanceManualStartPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s_%u", g_cmInstanceManualStartPath, g_currentNode->datanode[i].datanodeId); securec_check_intval(rcs, (void)rcs); check_input_for_security(instanceManualStartPath); canonicalize_path(instanceManualStartPath); rcs = snprintf_s(instanceReplace, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s_%u", g_binPath, CM_INSTANCE_REPLACE, g_currentNode->datanode[i].datanodeId); securec_check_intval(rcs, (void)rcs); CheckOfflineNode(i); cdt = (stat(instanceManualStartPath, &instanceStatBuf) == 0 || stat(g_cmManualStartPath, &clusterStatBuf) == 0); if (cdt) { if (stat(instanceReplace, &instanceStatBuf) == 0) { write_runlog(LOG, "datanode(%s) is being replaced and can't be stopped.\n", g_currentNode->datanode[i].datanodeLocalDataPath); return; } get_stop_mode(instanceManualStartPath); cdt = (g_cmShutdownMode == IMMEDIATE_MODE || (g_cmShutdownMode == FAST_MODE && g_isCmaBuildingDn[i])); if (cdt) { char build_pid_path[MAXPGPATH]; rc = memset_s(build_pid_path, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rcs = snprintf_s(build_pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/gs_build.pid", g_currentNode->datanode[i].datanodeLocalDataPath); securec_check_intval(rcs, (void)rcs); pgpid_t pid = get_pgpid(build_pid_path, MAXPGPATH); cdt = (pid > 0 && is_process_alive(pid)); if (cdt) { char cmd[MAXPGPATH]; rc = memset_s(cmd, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rcs = snprintf_s(cmd, MAXPGPATH, MAXPGPATH - 1, "kill -9 %ld >>%s 2>&1", pid, system_call_log); securec_check_intval(rcs, (void)rcs); write_runlog(LOG, "datanode immediate shutdown: %s \n", cmd); const char *shutdownModeStr = (g_cmShutdownMode == IMMEDIATE_MODE) ? "immediate" : "fast"; write_runlog(LOG, "Shutdown the datanode %s : %s\n", shutdownModeStr, cmd); int ret = system(cmd); if (ret != 0) { write_runlog( ERROR, "datanode immediate shutdown: run system command failed! %s, errno=%d.\n", cmd, errno); } else { if (g_isCmaBuildingDn[i]) { g_isCmaBuildingDn[i] = false; write_runlog(LOG, "Shutdown the datanode %s successfully: %s. Then set g_isCmaBuildingDn to false.\n", shutdownModeStr, cmd); } ExecuteEventTrigger(EVENT_STOP); } } else { if (g_isCmaBuildingDn[i]) { g_isCmaBuildingDn[i] = false; write_runlog(LOG, "Datanode %u shutdown: set g_isCmaBuildingDn to false.\n", g_currentNode->datanode[i].datanodeId); } } } if (check_one_instance_status(GetDnProcessName(), g_currentNode->datanode[i].datanodeLocalDataPath, NULL) == PROCESS_RUNNING) { if (g_cmShutdownMode == FAST_MODE) { write_runlog( LOG, "datanode fast shutdown, datapath: %s.\n", g_currentNode->datanode[i].datanodeLocalDataPath); fast_stop_one_instance(g_currentNode->datanode[i].datanodeLocalDataPath, INSTANCE_DN); } else { write_runlog(LOG, "datanode immediate shutdown, kill_instance_force(): %s.\n", g_currentNode->datanode[i].datanodeLocalDataPath); immediate_stop_one_instance(g_currentNode->datanode[i].datanodeLocalDataPath, INSTANCE_DN); } } if (!g_isDnFirstStart) { g_isDnFirstStart = true; } (void)StopCurDnFloatIp(i); } } static void StopAllDatanode() { for (uint32 i = 0; i < g_currentNode->datanodeCount; i++) { if (g_clusterType == V3SingleInstCluster) { StopOneZengine(i); } else { stop_datanode_check(i); } } } void stop_instances_check(void) { #ifdef ENABLE_MULTIPLE_NODES if (g_currentNode->gtm == 1) { stop_gtm_check(); } if (g_currentNode->coordinate == 1) { stop_coordinator_check(); } #endif if (!g_enableSharedStorage) { StopAllDatanode(); } if (IsCusResExistLocal()) { StopResourceCheck(); } } static int cmserver_stopped_check(void) { char command[MAXPGPATH]; errno_t rc; int rcs; if (g_currentNode->cmServerLevel == 1) { rc = memset_s(command, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rcs = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_binPath, CM_SERVER_BIN_NAME); securec_check_intval(rcs, (void)rcs); return check_one_instance_status(CM_SERVER_BIN_NAME, command, NULL); } return PROCESS_NOT_EXIST; } static NetworkState CheckCurDnFloatIpStatus(uint32 index) { DnFloatIp *dnFloatIp = GetDnFloatIpByDnIdx(index); if (dnFloatIp == NULL || dnFloatIp->dnFloatIpCount == 0) { return NETWORK_STATE_UNKNOWN; } NetworkState state[MAX_FLOAT_IP_COUNT]; GetFloatIpNicStatus(dnFloatIp->instId, CM_INSTANCE_TYPE_DN, state, dnFloatIp->dnFloatIpCount); for (uint32 i = 0; i < dnFloatIp->dnFloatIpCount; ++i) { if (state[i] == NETWORK_STATE_UP) { return NETWORK_STATE_UP; } } return NETWORK_STATE_DOWN; } static int32 CheckFloatIpStateInDn(uint32 index) { uint32 count = DelFloatIpInDatanode(index); if (count != 0) { write_runlog( LOG, "line: %d: datanode(%u) floatIp is running.\n", __LINE__, g_currentNode->datanode[index].datanodeId); return PROCESS_RUNNING; } NetworkState state = CheckCurDnFloatIpStatus(index); if (state == NETWORK_STATE_UP) { write_runlog( LOG, "line: %d: datanode(%u) floatIp is running.\n", __LINE__, g_currentNode->datanode[index].datanodeId); return PROCESS_RUNNING; } return PROCESS_NOT_EXIST; } static int datanode_stopped_check(void) { int ret; errno_t rc; const char *processName = GetDnProcessName(); for (uint32 ii = 0; ii < g_currentNode->datanodeCount; ii++) { char build_pid_path[MAXPGPATH]; ret = check_one_instance_status(processName, g_currentNode->datanode[ii].datanodeLocalDataPath, NULL); rc = memset_s(build_pid_path, MAXPGPATH, 0, MAXPGPATH); securec_check_errno(rc, (void)rc); rc = snprintf_s(build_pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/gs_build.pid", g_currentNode->datanode[ii].datanodeLocalDataPath); securec_check_intval(rc, (void)rc); pgpid_t pid = get_pgpid(build_pid_path, MAXPGPATH); if ((ret == PROCESS_RUNNING) || (pid > 0 && is_process_alive(pid))) { write_runlog(LOG, "data node is running path is %s\n", g_currentNode->datanode[ii].datanodeLocalDataPath); return PROCESS_RUNNING; } if (CheckFloatIpStateInDn(ii) == PROCESS_RUNNING) { return PROCESS_RUNNING; } } return PROCESS_NOT_EXIST; } /** * @brief stop the internal processes of CM instance * * @param cm_path the path of CM instance */ void stop_cm_instance_internal(const char *cm_path) { char system_cmd[MAXPGPATH] = {'\0'}; int ret; char toolPath[MAX_PATH_LEN] = {'\0'}; char pyPstreePath[MAX_PATH_LEN] = {'\0'}; /* Get tool path */ ret = cmagent_getenv("GPHOME", toolPath, sizeof(toolPath)); if (ret != EOK) { write_runlog(FATAL, "get env GPHOME fail.\n"); exit(ret); } ret = snprintf_s(pyPstreePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/script/py_pstree.py", toolPath); securec_check_intval(ret, (void)ret); ret = access(pyPstreePath, F_OK); if (ret != EOK) { write_runlog(FATAL, "%s may be not exist.\n", pyPstreePath); exit(ret); } ret = snprintf_s(system_cmd, MAXPGPATH, MAXPGPATH - 1, "pidList=`ps aux | grep \"%s\" | grep -v 'grep' | awk '{print $2}' | xargs `; for pid in $pidList; do " "%s -c $pid -s $pid | xargs -r -n 100 kill -%d; done", cm_path, pyPstreePath, SIG_TYPE); securec_check_intval(ret, (void)ret); struct stat statBuf = {0}; if (stat(system_call_log, &statBuf) == 0) { /* redirect to system_call.log */ ret = strncat_s(system_cmd, MAXPGPATH, " >> ", strlen(" >> ")); securec_check_errno(ret, (void)ret); ret = strncat_s(system_cmd, MAXPGPATH, system_call_log, strlen(system_call_log)); securec_check_errno(ret, (void)ret); ret = strncat_s(system_cmd, MAXPGPATH, " 2>&1", strlen(" 2>&1")); securec_check_errno(ret, (void)ret); } struct timeval timeOut = {0}; timeOut.tv_sec = 10; timeOut.tv_usec = 0; write_runlog(LOG, "stop_cm_instance: command= %s \n", system_cmd); if (ExecuteCmd(system_cmd, timeOut)) { write_runlog(WARNING, "stop_cm_instance: execute command failed. %s \n", system_cmd); } else { write_runlog(LOG, "cm_server shutting down.\n"); } } static int check_process_status(const char *processName, int pid, char state, const char *cmd_line, int *isPhonyDead) { static bool persist_T_status = false; bool isCMS = (strcmp(processName, CM_SERVER_BIN_NAME) == 0); if (state == 'd' || state == 'D') { write_runlog(LOG, "process (%s %d) is pending, can not receive signal, path is %s," " state is D (TASK_UNINTERRUPTIBLE)\n", processName, pid, cmd_line); if (isPhonyDead != NULL) { *isPhonyDead = PROCESS_PHONY_DEAD_D; } } else if (state == 't' || state == 'T') { write_runlog(ERROR, "process (%s %d) is T (STOPPED), path is %s\n", processName, pid, cmd_line); if (isCMS && persist_T_status) { write_runlog(LOG, "kill CMS process (%s:%d) due to STOPPED!\n", processName, pid); stop_cm_instance_internal(cmd_line); return PROCESS_NOT_EXIST; } if (isPhonyDead != NULL) { *isPhonyDead = PROCESS_PHONY_DEAD_T; } /* mark the process as corpse, only do it for CMS now. */ persist_T_status = isCMS || persist_T_status; } else if (state == 'z' || state == 'Z') { write_runlog(ERROR, "process (%s %d) is Z (STOPPED), path is %s\n", processName, pid, cmd_line); if (isPhonyDead != NULL) { *isPhonyDead = PROCESS_PHONY_DEAD_Z; } } else { persist_T_status = !isCMS && persist_T_status; write_runlog(DEBUG5, "process (%s %d) is running, path is %s, haveFound is 1\n", processName, pid, cmd_line); } return PROCESS_RUNNING; } static int GetProcessInfo(const char *processName, const char *cmdLine, int *processId, char *processState) { DIR *dir; struct dirent *de = NULL; char pidPath[MAX_PATH_LEN]; char cmdPath[MAX_PATH_LEN]; FILE *fp = NULL; char getBuff[MAX_PATH_LEN]; char paraName[MAX_PATH_LEN]; char paraValue[MAX_PATH_LEN]; int pid = 0, ppid = 0; char state = '0'; uid_t uid = 0, uid1 = 0, uid2 = 0, uid3 = 0; bool nameFound = false, stateGet = false, ppidGet = false; bool nameGet = false, haveFound = false, uidGet = false; char *p = NULL; int i = 0; int paralen; errno_t rc; int rcs; bool isProcessFile = false; bool cdt; if ((dir = opendir("/proc")) == NULL) { write_runlog(LOG, "opendir(/proc) failed! \n"); return PROCESS_UNKNOWN; } while ((de = readdir(dir)) != NULL) { /* * judging whether the directory name is composed by digitals,if so,we will * check whether there are files under the directory ,these files includes * all detailed information about the process */ if (CM_is_str_all_digit(de->d_name) != 0) { continue; } isProcessFile = true; rc = memset_s(pidPath, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); pid = (int)strtol(de->d_name, NULL, 10); { rcs = snprintf_s(pidPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/proc/%d/status", pid); securec_check_intval(rcs, (void)rcs); } /* maybe fail because of privilege */ fp = fopen(pidPath, "re"); if (fp == NULL) { continue; } nameGet = false; ppidGet = false; stateGet = false; uidGet = false; rc = memset_s(paraValue, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); ppid = 0; state = '0'; uid = 0; rc = memset_s(getBuff, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); nameFound = false; while (fgets(getBuff, MAX_PATH_LEN - 1, fp) != NULL) { rc = memset_s(paraName, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); cdt = (!nameGet && (strstr(getBuff, "Name:") != NULL)); if (cdt) { nameGet = true; rcs = sscanf_s(getBuff, "%s %s", paraName, MAX_PATH_LEN, paraValue, MAX_PATH_LEN); check_sscanf_s_result(rcs, 2); securec_check_intval(rcs, (void)rcs); if (strcmp(processName, paraValue) != 0) { break; } nameFound = true; } cdt = (!ppidGet && (strstr(getBuff, "PPid:") != NULL)); if (cdt) { ppidGet = true; rcs = sscanf_s(getBuff, "%s %d", paraName, MAX_PATH_LEN, &ppid); check_sscanf_s_result(rcs, 2); securec_check_intval(rcs, (void)rcs); } cdt = (!stateGet && (strstr(getBuff, "State:") != NULL)); if (cdt) { stateGet = true; rcs = sscanf_s(getBuff, "%s %c", paraName, MAX_PATH_LEN, &state, 1); check_sscanf_s_result(rcs, 2); securec_check_intval(rcs, (void)rcs); } cdt = (!uidGet && (strstr(getBuff, "Uid:") != NULL)); if (cdt) { uidGet = true; rcs = sscanf_s( getBuff, "%s %u %u %u %u", paraName, MAX_PATH_LEN, &uid, &uid1, &uid2, &uid3); check_sscanf_s_result(rcs, 5); securec_check_intval(rcs, (void)rcs); } cdt = (nameGet && ppidGet && stateGet && uidGet); if (cdt) { break; } } (void)fclose(fp); if (!nameFound) { continue; } if (getuid() != uid) { continue; } rc = memset_s(cmdPath, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); rcs = snprintf_s(cmdPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/proc/%d/cmdline", pid); securec_check_intval(rcs, (void)rcs); fp = fopen(cmdPath, "re"); if (fp == NULL) { continue; } rc = memset_s(getBuff, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); if (fgets(getBuff, MAX_PATH_LEN - 1, fp) != NULL) { p = getBuff; i = 0; while (i < MAX_PATH_LEN - 1) { /* cmdline of CN,DN,GTM and CM begin with '/', and fenced UDF begin with '-', and kerberos with 'k'. */ if (*p == '/') { if (strcmp(p, cmdLine) == 0) { haveFound = true; break; } else { char *cmd_line_tmp = strdup(cmdLine); if (cmd_line_tmp != NULL) { canonicalize_path(cmd_line_tmp); if (strcmp(p, cmd_line_tmp) == 0) { haveFound = true; FREE_AND_RESET(cmd_line_tmp); break; } FREE_AND_RESET(cmd_line_tmp); } paralen = (int)strlen(p); p = p + paralen; i = i + paralen; } } else if (*p == 'f') { if (strstr(p, cmdLine) != NULL) { haveFound = true; break; } else { p++; i++; } } else if (*p == 'k') { if (strstr(p, cmdLine) != NULL) { haveFound = true; break; } else { p++; i++; } } else if (*p == 'l') { if (strstr(p, cmdLine) != NULL) { haveFound = true; break; } else { p++; i++; } } else { p++; i++; } } rc = memset_s(getBuff, MAX_PATH_LEN, 0, MAX_PATH_LEN); securec_check_errno(rc, (void)rc); } (void)fclose(fp); if (haveFound) { break; } } (void)closedir(dir); if (!isProcessFile) { write_runlog(ERROR, "the process files may not exist in /proc.\n"); return PROCESS_UNKNOWN; } if (haveFound) { *processId = pid; if (processState != NULL) { *processState = state; } return PROCESS_RUNNING; } write_runlog(LOG, "process (%s) is not running, path is %s, haveFound is 0\n", processName, cmdLine); return PROCESS_NOT_EXIST; } int killInstanceByPid(const char *processName, const char *cmdLine) { int pid = 0; int runningState = GetProcessInfo(processName, cmdLine, &pid, NULL); if (runningState == PROCESS_RUNNING) { char killCmd[MAX_PATH_LEN] = {0}; int rcs = snprintf_s(killCmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "kill -9 %d", pid); securec_check_intval(rcs, (void)rcs); write_runlog(LOG, "kill process %s, path is %s, command: %s\n", processName, cmdLine, killCmd); int ret = system(killCmd); if (ret != 0) { write_runlog(ERROR, "kill_instance_by_pid: system command failed, errno=%d.\n", errno); return -1; } } return 0; } int check_one_instance_status(const char *processName, const char *cmdLine, int *isPhonyDead) { int32 processId = 0; char processState = '0'; int32 runningState = GetProcessInfo(processName, cmdLine, &processId, &processState); if (runningState == PROCESS_RUNNING) { return check_process_status(processName, processId, processState, cmdLine, isPhonyDead); } write_runlog(LOG, "process (%s) is not running, path is %s, haveFound is 0\n", processName, cmdLine); return PROCESS_NOT_EXIST; } static int all_nodes_stopped_check() { int ret; int count = 0; #ifdef ENABLE_MULTIPLE_NODES ret = gtm_stopped_check(); if (ret == PROCESS_RUNNING) { count++; } ret = coordinator_stopped_check(); if (ret == PROCESS_RUNNING) { count++; } #endif ret = cmserver_stopped_check(); if (ret == PROCESS_RUNNING) { count++; } ret = datanode_stopped_check(); if (ret == PROCESS_RUNNING) { count++; } ret = ResourceStoppedCheck(); if (ret == PROCESS_RUNNING) { count++; } ret = check_one_instance_status(FENCED_MASTER_BIN_NAME, "fenced", NULL); if (ret == PROCESS_RUNNING) { count++; } return count; } static void StopCmInstance() { char pid_path[MAXPGPATH] = {0}; char cm_path[MAXPGPATH] = {0}; char instanceDataPath[MAX_PATH_LEN] = {0}; errno_t rcs = snprintf_s( instanceDataPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s", g_currentNode->cmDataPath, CM_SERVER_DATA_DIR); securec_check_intval(rcs, (void)rcs); rcs = snprintf_s(pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", instanceDataPath, "cm_server.pid"); securec_check_intval(rcs, (void)rcs); rcs = snprintf_s(cm_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_binPath, CM_SERVER_BIN_NAME); securec_check_intval(rcs, (void)rcs); pid_t pid = get_instances_pid(pid_path); if (pid == 0) { int ret = check_one_instance_status(type_int_to_str_binname(INSTANCE_CM), cm_path, NULL); if (ret == PROCESS_RUNNING) { write_runlog(ERROR, "%s' pid is 0, but still running, use kill_intance_force(): %s.\n", type_int_to_str_name(INSTANCE_CM), instanceDataPath); kill_instance_force(instanceDataPath, INSTANCE_CM); } } else { stop_cm_instance_internal(cm_path); } } static int stop_primary_check(const char *ssh_channel, const char *data_path) { char command[MAXPGPATH] = {0}; char result_str[MAX_BUF_LEN + 1] = {0}; char mpprvFile[MAXPGPATH] = {0}; int rc; int ret = cmagent_getenv("MPPDB_ENV_SEPARATE_PATH", mpprvFile, sizeof(mpprvFile)); if (ret != EOK) { rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "pssh %s -H %s \"cm_ctl check -B %s -T %s\" > /dev/null 2>&1; echo -e $? > %s", PSSH_TIMEOUT_OPTION, ssh_channel, DATANODE_BIN_NAME, data_path, result_path); } else { check_input_for_security(mpprvFile); rc = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "pssh %s -H %s \"source %s;cm_ctl check -B %s -T %s\" > /dev/null 2>&1; echo -e $? > %s", PSSH_TIMEOUT_OPTION, ssh_channel, mpprvFile, DATANODE_BIN_NAME, data_path, result_path); } securec_check_intval(rc, (void)rc); ret = system(command); if (ret != 0) { write_runlog(LOG, "exec command failed ! command is %s, errno=%d.\n", command, errno); (void)unlink(result_path); return -1; } FILE *fd = fopen(result_path, "re"); if (fd == NULL) { write_runlog(LOG, "fopen failed, errno[%d]!\n", errno); (void)unlink(result_path); return -1; } size_t bytesread = fread(result_str, 1, MAX_BUF_LEN, fd); if (bytesread > MAX_BUF_LEN) { write_runlog(LOG, "stop_primary_check fread file failed! file=%s, bytesread=%lu\n", result_path, bytesread); (void)fclose(fd); (void)unlink(result_path); return -1; } (void)fclose(fd); (void)unlink(result_path); return (int)strtol(result_str, NULL, 10); } static void normal_stop_one_instance(const char *instDataPath, InstanceTypes instance_type) { int fast_sig = SIGTERM; /* normal mode */ pid_t pid; char pid_path[MAXPGPATH] = {0}; int ret; int rcs = 0; if (instance_type == INSTANCE_DN) { rcs = snprintf_s(pid_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", instDataPath, "postmaster.pid"); securec_check_intval(rcs, (void)rcs); } pid = get_instances_pid(pid_path); if (pid == 0) { ret = check_one_instance_status(type_int_to_str_binname(instance_type), instDataPath, NULL); if (ret == PROCESS_RUNNING) { write_runlog(ERROR, "%s' pid is 0, but still running, use kill_instance_force(): %s.\n", type_int_to_str_name(instance_type), instDataPath); kill_instance_force(instDataPath, instance_type); } return; } /* now just send sig once. */ if (kill(pid, fast_sig) != 0) { write_runlog(ERROR, "normal shutdown ,could not send stop signal (PID: %d), kill_instance_force():%s.\n", pid, instDataPath); kill_instance_force(instDataPath, instance_type); return; } if (instance_type == INSTANCE_DN) { ExecuteEventTrigger(EVENT_STOP); } write_runlog(LOG, "%s shutting down.\n", type_int_to_str_name(instance_type)); } static status_t StopPrimaryDatanode(uint32 role, const char *ip, const char *path) { if (role != DUMMY_STANDBY_DN) { write_runlog(LOG, "peer ip: %s, datapath: %s.\n", ip, path); if (stop_primary_check(ip, path) == PROCESS_RUNNING) { write_runlog(LOG, "peer is still running.\n"); return CM_ERROR; } } return CM_SUCCESS; } static void NormalShutdownOneDatanode(const dataNodeInfo *dnInfo, int localRole) { if (localRole == INSTANCE_ROLE_STANDBY) { if (g_multi_az_cluster) { bool beContinue = false; for (uint32 j = 0; j < g_dn_replication_num - 1; ++j) { beContinue = false; if (StopPrimaryDatanode(dnInfo->peerDatanodes[j].datanodePeerRole, dnInfo->peerDatanodes[j].datanodePeerHAIP[0], dnInfo->peerDatanodes[j].datanodePeerDataPath) != CM_SUCCESS) { beContinue = true; break; } } if (beContinue && g_normalStopTryTimes < 3) { g_normalStopTryTimes++; return; } } else { if ((StopPrimaryDatanode(dnInfo->datanodePeerRole, dnInfo->datanodePeerHAIP[0], dnInfo->datanodePeerDataPath) != CM_SUCCESS) || (StopPrimaryDatanode(dnInfo->datanodePeer2Role, dnInfo->datanodePeer2HAIP[0], dnInfo->datanodePeer2DataPath) != CM_SUCCESS)) { return; } } } write_runlog(LOG, "datanode normal shutdown, datapath: %s.\n", dnInfo->datanodeLocalDataPath); normal_stop_one_instance(dnInfo->datanodeLocalDataPath, INSTANCE_DN); } static void NormalShutdownAllDatanode() { for (uint32 i = 0; i < g_currentNode->datanodeCount; ++i) { const dataNodeInfo *dnInfo = &g_currentNode->datanode[i]; write_runlog(LOG, "local_role is %s, datapath: %s.\n", datanode_role_int_to_string(g_dnReportMsg[i].dnStatus.reportMsg.local_status.local_role), dnInfo->datanodeLocalDataPath); if (g_clusterType == V3SingleInstCluster) { StopOneZengine(i); } else { NormalShutdownOneDatanode(dnInfo, g_dnReportMsg[i].dnStatus.reportMsg.local_status.local_role); } DelAndDownFloatIpInDn(i); } } static void normal_shutdown_nodes(void) { /* coordinator */ if (g_currentNode->coordinate == 1) { write_runlog(LOG, "coordinator normal shutdown, datapath: %s.\n", g_currentNode->DataPath); fast_stop_one_instance(g_currentNode->DataPath, INSTANCE_CN); } /* datanode */ if (!g_enableSharedStorage) { write_runlog(LOG, "datanode normal shutdown.\n"); NormalShutdownAllDatanode(); } /* cm_server */ if (g_currentNode->cmServerLevel == 1) { write_runlog(LOG, "cm_server normal shutdown, datapath: %s.\n", g_currentNode->cmDataPath); StopCmInstance(); } /* gtm */ if (g_currentNode->gtm == 1) { write_runlog(LOG, "gtm normal shutdown, path: %s.\n", g_currentNode->gtmLocalDataPath); fast_stop_one_instance(g_currentNode->gtmLocalDataPath, INSTANCE_GTM); } /* resource */ if (IsCusResExistLocal()) { write_runlog(LOG, "normal_shutdown_nodes, %u resource will be stopped.\n", GetLocalResConfCount()); StopAllResInst(); } } static void ShutdownOneDatanode(const dataNodeInfo *dnInfo) { char buildPidPath[MAX_PATH_LEN] = {0}; int ret = snprintf_s(buildPidPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/gs_build.pid", dnInfo->datanodeLocalDataPath); securec_check_intval(ret, (void)ret); pgpid_t pid = get_pgpid(buildPidPath, MAX_PATH_LEN); if (pid > 0 && is_process_alive(pid)) { char cmd[MAX_PATH_LEN] = {0}; ret = snprintf_s(cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "killall %s %s >>%s 2>&1", PG_CTL_NAME, PG_REWIND_NAME, system_call_log); securec_check_intval(ret, (void)ret); write_runlog(LOG, "immediate_shutdown_nodes: %s \n", cmd); if (system(cmd) != 0) { write_runlog(ERROR, "immediate_shutdown_nodes: run system command failed! %s, errno=%d.\n", cmd, errno); } } write_runlog(LOG, "datanode immediate shutdown, kill_instance_force(): %s.\n", dnInfo->datanodeLocalDataPath); immediate_stop_one_instance(dnInfo->datanodeLocalDataPath, INSTANCE_DN); } void DelAndDownFloatIpInDn(uint32 index) { (void)DelFloatIpInDatanode(index); SetNicOper(g_currentNode->datanode[index].datanodeId, CM_INSTANCE_TYPE_DN, NETWORK_TYPE_FLOATIP, NETWORK_OPER_DOWN); } static void ImmediateShutdownAllDatanode() { for (uint32 ii = 0; ii < g_currentNode->datanodeCount; ii++) { if (g_clusterType == V3SingleInstCluster) { StopOneZengine(ii); } else { ShutdownOneDatanode(&g_currentNode->datanode[ii]); } DelAndDownFloatIpInDn(ii); } } void immediate_shutdown_nodes(bool kill_cmserver, bool kill_cn) { /* coordinate */ if (g_currentNode->coordinate == 1 && kill_cn) { write_runlog(LOG, "coordinator immediate shutdown, kill_instance_force(): %s.\n", g_currentNode->DataPath); immediate_stop_one_instance(g_currentNode->DataPath, INSTANCE_CN); } /* datanode */ if (!g_enableSharedStorage) { write_runlog(LOG, "all datanode immediate shutdown.\n"); ImmediateShutdownAllDatanode(); } /* cm_server */ if (g_currentNode->cmServerLevel == 1 && kill_cmserver) { write_runlog(LOG, "cm_server immediate shutdown, kill_intance_force():%s.\n", g_currentNode->cmDataPath); StopCmInstance(); } /* gtm */ if (g_currentNode->gtm == 1) { write_runlog(LOG, "gtm immediate shutdown, kill_instance_force(): %s.\n", g_currentNode->gtmLocalDataPath); immediate_stop_one_instance(g_currentNode->gtmLocalDataPath, INSTANCE_GTM); } /* resource */ if (IsCusResExistLocal()) { write_runlog(LOG, "immediate_shutdown_nodes, %u resource will be stopped.\n", GetLocalResConfCount()); StopAllResInst(); } } static void FastShutdownAllDatanode() { for (uint32 ii = 0; ii < g_currentNode->datanodeCount; ii++) { write_runlog(LOG, "datanode fast shutdown, datapath: %s.\n", g_currentNode->datanode[ii].datanodeLocalDataPath); if (g_clusterType == V3SingleInstCluster) { StopOneZengine(ii); } else { fast_stop_one_instance(g_currentNode->datanode[ii].datanodeLocalDataPath, INSTANCE_DN); } DelAndDownFloatIpInDn(ii); } } void fast_shutdown_nodes(void) { /* coordinator */ if (g_currentNode->coordinate == 1) { write_runlog(LOG, "coordinator fast shutdown, datapath: %s.\n", g_currentNode->DataPath); fast_stop_one_instance(g_currentNode->DataPath, INSTANCE_CN); } /* datanode */ if (!g_enableSharedStorage) { write_runlog(LOG, "all datanode fast shutdown.\n"); FastShutdownAllDatanode(); } /* cm_server */ if (g_currentNode->cmServerLevel == 1) { write_runlog(LOG, "cm_server fast shutdown, datapath: %s.\n", g_currentNode->cmDataPath); StopCmInstance(); } /* gtm */ if (g_currentNode->gtm == 1) { write_runlog(LOG, "gtm fast shutdown, path: %s.\n", g_currentNode->gtmLocalDataPath); fast_stop_one_instance(g_currentNode->gtmLocalDataPath, INSTANCE_GTM); } /* resource */ if (IsCusResExistLocal()) { write_runlog(LOG, "fast shutdown, %u resource process will be stopped.\n", GetLocalResConfCount()); StopAllResInst(); } } void GetStopZengineCmd(char *cmd, unsigned long cmdLen, uint32 index) { int rcs = 0; if (!IsBoolCmParamTrue(g_agentEnableDcf)) { rcs = snprintf_s(cmd, cmdLen, cmdLen - 1, "%s/cm_script/dn_zenith_ha/stopdb.sh %s %s %u", g_binPath, g_currentNode->datanode[index].datanodeLocalDataPath, g_currentNode->datanode[index].datanodeListenIP[0], g_currentNode->datanode[index].datanodePort); } else { rcs = snprintf_s(cmd, cmdLen, cmdLen - 1, "%s/cm_script/dn_zenith_zpaxos/stopdb.sh %s %u", g_binPath, g_currentNode->datanode[index].datanodeLocalDataPath, g_currentNode->datanode[index].datanodePort); } securec_check_intval(rcs, (void)rcs); } void StopZengineByCmd(uint32 index) { int rcs; char instance_manual_start_path[MAX_PATH_LEN] = {0}; struct stat cluster_stat_buf = {0}; rcs = snprintf_s(instance_manual_start_path, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s_%u", g_cmInstanceManualStartPath, g_currentNode->datanode[index].datanodeId); securec_check_intval(rcs, (void)rcs); check_input_for_security(instance_manual_start_path); canonicalize_path(instance_manual_start_path); if (stat(g_cmManualStartPath, &cluster_stat_buf) == 0) { get_stop_mode(g_cmManualStartPath); } else { get_stop_mode(instance_manual_start_path); } if (g_cmShutdownMode == FAST_MODE || g_cmShutdownMode == SMART_MODE) { write_runlog( LOG, "datanode fast shutdown, datapath: %s.\n", g_currentNode->datanode[index].datanodeLocalDataPath); char cmd[MAXPGPATH] = {0}; GetStopZengineCmd(cmd, MAXPGPATH, index); rcs = ExecuteSystemCmd(cmd); if (rcs != 0) { return; } write_runlog(LOG, "%s %s stopped by cmd %s.\n", type_int_to_str_name(INSTANCE_DN), g_currentNode->datanode[index].datanodeLocalDataPath, cmd); return; } write_runlog( LOG, "datanode immediate shutdown, stopZengine(): %s.\n", g_currentNode->datanode[index].datanodeLocalDataPath); immediate_stop_one_instance(g_currentNode->datanode[index].datanodeLocalDataPath, INSTANCE_DN); } static bool StopCurDnFloatIp(uint32 index) { uint32 count = DelFloatIpInDatanode(index); if (count != 0) { return false; } NetworkState state = CheckCurDnFloatIpStatus(index); if (state != NETWORK_STATE_UP) { return true; } write_runlog( LOG, "instId(%u) FloatIp is running, it need to be stopped.\n", g_currentNode->datanode[index].datanodeId); SetNicOper(g_currentNode->datanode[index].datanodeId, CM_INSTANCE_TYPE_DN, NETWORK_TYPE_FLOATIP, NETWORK_OPER_DOWN); return false; } static void StopOneZengine(uint32 index) { bool dnManualStop = DnManualStop(index); if (dnManualStop) { char instance_replace[MAX_PATH_LEN] = {0}; struct stat instance_stat_buf = {0}; int rcs; rcs = snprintf_s(instance_replace, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s_%u", g_binPath, CM_INSTANCE_REPLACE, g_currentNode->datanode[index].datanodeId); securec_check_intval(rcs, (void)rcs); if (stat(instance_replace, &instance_stat_buf) == 0) { write_runlog(LOG, "datanode(%s) is being replaced and can't be stopped.\n", g_currentNode->datanode[index].datanodeLocalDataPath); return; } if (check_one_instance_status(ZENGINE_BIN_NAME, g_currentNode->datanode[index].datanodeLocalDataPath, NULL) == PROCESS_RUNNING) { StopZengineByCmd(index); return; } if (!StopCurDnFloatIp(index)) { return; } write_runlog(LOG, "datanode is not running, no need to shutdown: %s.\n", g_currentNode->datanode[index].datanodeLocalDataPath); } } int stop_instance_check(void) { struct stat stat_buf = {0}; int ret; if (stat(g_cmManualStartPath, &stat_buf) == 0) { g_shutdownRequest = true; write_runlog(LOG, "shutdown requested, find start file!\n"); } if (!g_shutdownRequest) { /* check and stop one instance */ stop_instances_check(); write_runlog(DEBUG5, "stat stop file error!\n"); return 1; } /* check and stop all instances on current node */ ret = all_nodes_stopped_check(); if (ret == 0) { write_runlog(LOG, "all instances have been stopped!\n"); return 0; } get_stop_mode(g_cmManualStartPath); if (g_cmShutdownMode == FAST_MODE) { write_runlog(LOG, "fast shutdown!\n"); fast_shutdown_nodes(); } else if (g_cmShutdownMode == IMMEDIATE_MODE) { write_runlog(LOG, "immediate shutdown!\n"); immediate_shutdown_nodes(true, true); } else { write_runlog(LOG, "normal shutdown!\n"); normal_shutdown_nodes(); } if (g_clusterType != V3SingleInstCluster) { kill_instance_force("fenced", INSTANCE_FENCED); } return 2; } bool isNodeNormal() { bool cdt; if (g_currentNode->gtm) { (void)pthread_rwlock_wrlock(&(g_gtmReportMsg.lk_lock)); cdt = (!((g_gtmReportMsg.report_msg.status.local_role == INSTANCE_ROLE_PRIMARY || g_gtmReportMsg.report_msg.status.local_role == INSTANCE_ROLE_STANDBY) && g_gtmReportMsg.report_msg.status.connect_status == CON_OK && g_gtmReportMsg.report_msg.status.sync_mode == INSTANCE_DATA_REPLICATION_SYNC)); if (cdt) { (void)pthread_rwlock_unlock(&(g_gtmReportMsg.lk_lock)); return false; } (void)pthread_rwlock_unlock(&(g_gtmReportMsg.lk_lock)); } if (g_currentNode->coordinate) { (void)pthread_rwlock_wrlock(&(g_cnReportMsg.lk_lock)); if (g_cnReportMsg.cnStatus.reportMsg.connectStatus != AGENT_TO_INSTANCE_CONNECTION_OK) { (void)pthread_rwlock_unlock(&(g_cnReportMsg.lk_lock)); return false; } (void)pthread_rwlock_unlock(&(g_cnReportMsg.lk_lock)); } for (uint32 ii = 0; ii < g_currentNode->datanodeCount; ii++) { (void)pthread_rwlock_wrlock(&(g_dnReportMsg[ii].lk_lock)); cdt = (!(g_dnReportMsg[ii].dnStatus.reportMsg.local_status.local_role == INSTANCE_ROLE_PRIMARY || (g_dnReportMsg[ii].dnStatus.reportMsg.local_status.local_role == INSTANCE_ROLE_STANDBY && g_dnReportMsg[ii].dnStatus.reportMsg.local_status.db_state == INSTANCE_HA_STATE_NORMAL) || (g_currentNode->datanode[ii].datanodeRole == DUMMY_STANDBY_DN && g_dnReportMsg[ii].dnStatus.reportMsg.processStatus == INSTANCE_PROCESS_RUNNING))); if (cdt) { (void)pthread_rwlock_unlock(&(g_dnReportMsg[ii].lk_lock)); return false; } (void)pthread_rwlock_unlock(&(g_dnReportMsg[ii].lk_lock)); } return true; } static void clean_semp_and_shm() { if (g_cmShutdownMode == IMMEDIATE_MODE) { char user_name[256] = {0}; int ret = cmagent_getenv("USER", user_name, sizeof(user_name)); if (ret == EOK) { errno_t rc; char cmd[MAX_PATH_LEN]; check_input_for_security(user_name); rc = snprintf_s( cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "ipcrm `ipcs -s | grep %s | awk '{print \"-s \" $2}'`", user_name); securec_check_intval(rc, (void)rc); if (system(cmd)) { write_runlog(ERROR, "clean semp failed!, erron=%d.\n", errno); } rc = snprintf_s(cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "ipcrm `ipcs -m | grep %s | awk '{if($6==\"0\") print \"-m \" $2}'`", user_name); securec_check_intval(rc, (void)rc); if (system(cmd)) { write_runlog(ERROR, "clean shm failed!, erron=%d.\n", errno); } } else { write_runlog(ERROR, "get USER failed!\n"); } } } void *agentStartAndStopMain(void *arg) { bool cdt; int status; int pid; int st; int rcs; char instance_replace[MAX_PATH_LEN] = {0}; struct stat stat_buf = {0}; char pg_host_path[MAX_PATH_LEN] = {0}; char gauss_replace[MAX_PATH_LEN] = {0}; pthread_t threadId = pthread_self(); thread_name = "StartAndStop"; write_runlog(LOG, "agent start and stop thread start, threadid %lu.\n", threadId); /* * init alarm check, check ALM_AI_AbnormalGTMProcess, * ALM_AI_AbnormalCoordinatorProcess and ALM_AI_AbnormalDatanodeProcess */ StartupAlarmItemInitialize(g_currentNode); rcs = snprintf_s(instance_replace, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/%s_%u", g_binPath, CM_INSTANCE_REPLACE, g_currentNode->coordinateId); securec_check_intval(rcs, (void)rcs); rcs = cmagent_getenv("PGHOST", pg_host_path, sizeof(pg_host_path)); if (rcs == EOK) { check_input_for_security(pg_host_path); rcs = snprintf_s(gauss_replace, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/GaussReplace.dat", pg_host_path); securec_check_intval(rcs, (void)rcs); } else { write_runlog(FATAL, "get PGHOST failed!\n"); exit(-1); } for (;;) { if (g_exitFlag) { write_runlog(LOG, "receive exit request in cma startAndStop.\n"); cm_sleep(1); continue; } set_thread_state(threadId); pid = waitpid(-1, &st, WNOHANG); if (pid > 0) { write_runlog(LOG, "child process have die! pid is %d exit status is %d\n ", pid, st); } status = stop_instance_check(); if (status == 0) { clean_semp_and_shm(); write_runlog(LOG, "stop_instance_check find.exit.\n"); exit(0); } #ifdef ENABLE_MULTIPLE_NODES if (cm_agent_need_check_libcomm_port) { g_autoRepairCnt = 0; RemoveStopAutoRepairFile(); write_runlog(LOG, "update libcomm config start.\n"); if (UpdateLibcommConfig()) { cm_agent_need_check_libcomm_port = false; write_runlog(LOG, "update libcomm config complete.\n"); } } #endif start_instance_check(); #ifdef ENABLE_MULTIPLE_NODES if (g_syncDroppedCoordinator) { cdt = (stat(instance_replace, &stat_buf) == 0 || g_repairCn || g_restoreCn); if (cdt) { write_runlog(LOG, "coordinator is being replaced/repiared/restore, can't create node or group.\n"); } else { cm_static_config_check_to_coordinate(); } } #endif cdt = (stat(gauss_replace, &stat_buf) == 0 && isNodeNormal()); if (cdt) { if (unlink(gauss_replace)) { write_runlog(ERROR, "could not remove gauss replace file, errno[%d].\n", errno); } } cm_sleep(1); } }