/* * Copyright (c) 2021 Huawei Technologies Co.,Ltd. * * CM is licensed under Mulan PSL v2. * You can use this software according to the terms and conditions of the Mulan PSL v2. * You may obtain a copy of Mulan PSL v2 at: * * http://license.coscl.org.cn/MulanPSL2 * * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PSL v2 for more details. * ------------------------------------------------------------------------- * * ctl_common.cpp * cm_ctl common functions * * IDENTIFICATION * src/cm_ctl/ctl_common.cpp * * ------------------------------------------------------------------------- */ #include #include "common/config/cm_config.h" #include "cm/libpq-fe.h" #include "cm/cm_misc.h" #include "cm/cm_msg.h" #include "cm/libpq-int.h" #include "cs_ssl.h" #include "cm_json_config.h" #include "ctl_common.h" #define STOP_DEFAULT_WAIT 1200 #define CONN_TO_CMSERVER_TIMEOUT 1 #define CONN_STRING_LEN 1024 #define START_DEFAULT_WAIT 600 #define MAX_INVALID_NODE_EXECTIMES 2 const int USEC_TO_TIMEOUT = 1000; const int32 MAX_CONN_TIMES = 10; const int32 SEND_MSG_TIMES = 20; const int32 WAIT_MSG_RES_TIMES = 3; const int32 SUCCESS_SEND_MSG = 0; const int32 NEED_SEND_AGAIN = 1; DdbConn *g_sess = NULL; TlsAuthPath g_tlsPath = {0}; extern CtlCommand ctl_command; extern uint32 g_normal_cm_server_node_index; extern char mpp_env_separate_file[MAXPGPATH]; extern passwd* pw; extern uint32 g_nodeIndexForCmServer[CM_PRIMARY_STANDBY_NUM]; extern char result_path[MAXPGPATH]; extern const char* g_cmServerState[CM_PRIMARY_STANDBY_NUM + 1]; extern char* g_command_operation_azName; extern uint32 g_commandOperationNodeId; extern uint32 g_nodeId; extern char cluster_static_config[MAXPGPATH]; extern char hosts_path[MAXPGPATH]; extern const char* g_progname; extern CM_Conn* CmServer_conn; extern CM_Conn* CmServer_conn1; extern CM_Conn* CmServer_conn2; const int g_max_buf_len = 10; /* estimated extra cost about one more operated node */ static const float g_node_operation_cost = 0.5; extern bool got_stop; extern char g_appPath[MAXPGPATH]; extern char manual_start_file[MAXPGPATH]; const int DEFAULT_GET_INFO_TIME = 5; int g_hostInfo[CM_NODE_MAXNUM][CM_IP_NUM] = {0}; uint32 g_execNodes = 0; bool g_stopAbnormal = false; bool g_isRestop = false; extern int g_waitSeconds; static void connect_to_first_normal_cmserver(uint32 cmsNodeIdx, CM_Conn **curConn); void DoAdvice(void) { write_runlog(LOG, "Try \"%s --help\" for more information.\n", g_progname); } const char *GetDnProcessName(void) { return g_clusterType == V3SingleInstCluster ? ZENGINE_BIN_NAME : DATANODE_BIN_NAME; } /* * Get node index in g_node by node_id. */ uint32 get_node_index(uint32 node_id) { for (uint32 ii = 0; ii < g_node_num; ii++) { if (g_node[ii].node == node_id) { return ii; } } return INVALID_NODE_NUM; } bool isMajority(const char* cm_arbitration_mode) { if (cm_arbitration_mode == NULL) { return false; } else if (strcmp("majority", cm_arbitration_mode) == 0) { return true; } else if (strcmp("MAJORITY", cm_arbitration_mode) == 0) { return true; } else { return false; } } bool isMinority(const char* cm_arbitration_mode) { if (cm_arbitration_mode == NULL) { return false; } else if (strcmp("minority", cm_arbitration_mode) == 0) { return true; } else if (strcmp("MINORITY", cm_arbitration_mode) == 0) { return true; } else { return false; } } int FindInstanceIdAndType(uint32 node, const char *dataPath, uint32 *instanceId, int *instanceType) { if ((node < 1) || (get_node_index(node) >= g_node_num)) { write_runlog(ERROR, "node(%u) is invalid, max node num(%u).\n", node, g_node_num); return -1; } for (uint32 j = 0; j < g_node_num; j++) { if (g_node[j].node != node) { continue; } if (g_node[j].gtm == 1) { if (strncmp(dataPath, g_node[j].gtmLocalDataPath, MAXPGPATH) == 0) { *instanceId = g_node[j].gtmId; *instanceType = INSTANCE_TYPE_GTM; return 0; } } if (g_node[j].coordinate == 1) { if (strncmp(dataPath, g_node[j].DataPath, MAXPGPATH) == 0) { *instanceId = g_node[j].coordinateId; *instanceType = INSTANCE_TYPE_COORDINATE; return 0; } } for (uint32 i = 0; i < g_node[j].datanodeCount; i++) { if (strncmp(dataPath, g_node[j].datanode[i].datanodeLocalDataPath, MAXPGPATH) == 0) { *instanceId = g_node[j].datanode[i].datanodeId; *instanceType = INSTANCE_TYPE_DATANODE; return 0; } } } write_runlog(ERROR, "can't find the node(%u) instance (%s).\n", node, dataPath); return -1; } /* * ssh_exec * exec command in remote host. */ int ssh_exec(const staticNodeConfig* node, const char* cmd, int32 logLevel) { char actualCmd[MAX_COMMAND_LEN] = {0}; int rc = -1; int ret; for (uint32 ii = 0; ii < node->sshCount; ii++) { if (mpp_env_separate_file[0] == '\0') { ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1, "pssh %s -s -H %s \"( %s ) > %s 2>&1\" < %s > /dev/null 2>&1", PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd, "/dev/null", "/dev/null"); securec_check_intval(ret, (void)ret); } else { ret = snprintf_s(actualCmd, MAX_COMMAND_LEN, MAX_COMMAND_LEN - 1, "pssh %s -s -H %s \"( source %s;%s ) > %s 2>&1\" < %s > /dev/null 2>&1", PSSH_TIMEOUT_OPTION, node->sshChannel[ii], mpp_env_separate_file, cmd, "/dev/null", "/dev/null"); securec_check_intval(ret, (void)ret); } rc = system(actualCmd); if (rc != 0) { write_runlog(logLevel, "ssh failed at \"%s\".\n", node->sshChannel[ii]); write_runlog(DEBUG1, "cmd is %s, rc=%d, errno=%d.\n", actualCmd, WEXITSTATUS(rc), errno); } } return rc; } /* * ssh_exec * exec command in remote host. */ int SshExec(const staticNodeConfig *node, const char *cmd) { char actualCmd[MAXPGPATH]; int rc = -1; int ret = 0; for (uint32 ii = 0; ii < node->sshCount; ii++) { if (mpp_env_separate_file[0] == '\0') { ret = snprintf_s(actualCmd, MAXPGPATH, MAXPGPATH - 1, "pssh %s -s -H %s \"( %s )\"", PSSH_TIMEOUT_OPTION, node->sshChannel[ii], cmd); securec_check_intval(ret, (void)ret); } else { ret = snprintf_s(actualCmd, MAXPGPATH, MAXPGPATH - 1, "pssh %s -s -H %s \"( source %s;%s )\"", PSSH_TIMEOUT_OPTION, node->sshChannel[ii], mpp_env_separate_file, cmd); securec_check_intval(ret, (void)ret); } rc = system(actualCmd); if (rc != 0) { write_runlog( ERROR, "cmd execute failed on remote node:\"%s(%s)\".\n", node->nodeName, node->sshChannel[ii]); write_runlog(DEBUG1, "cmd is %s, rc=%d, errno=%d.\n", actualCmd, WEXITSTATUS(rc), errno); } } return rc; } int RunEtcdCmd(const char* command, uint32 nodeIndex) { int ret = 0; if (g_node[nodeIndex].node == g_currentNode->node) { ret = system(command); } else { ret = ssh_exec(&g_node[nodeIndex], command); } if (ret != 0) { write_runlog(DEBUG1, "fail to execute command %s, errno=%d.\n", command, errno); } return ret; } int ProcessSslAck(const char *receiveMsg, bool *enableSsl) { const cm_msg_type *cm_msg_type_ptr = (const cm_msg_type *)receiveMsg; if (cm_msg_type_ptr->msg_type != MSG_CM_SSL_CONN_ACK) { write_runlog(ERROR, "fail to get ssl ack errno=%d.\n", errno); return -1; } const CmToAgentConnectAck *msgAck = (const CmToAgentConnectAck *)(receiveMsg); if (msgAck->status == SSL_ENABLE) { *enableSsl = true; return 0; } else if (msgAck->status == SSL_DISABLE) { *enableSsl = false; return 0; } return -1; } static inline void cs_securec_clear(char *content, uint32 len) { if (content != NULL) { errno_t rc = memset_s(content, len, 0, len); securec_check_errno(rc, (void)rc); } return; } static status_t CtlConnSslRequst(CM_Conn *conn, int ssl_req, bool *enableSsl) { AgentToCmConnectRequest req_msg; req_msg.msg_type = ssl_req; req_msg.nodeid = g_nodeId; const int32 timesPerSec = 5; int64 timeOut = g_waitSeconds * timesPerSec; if (cm_client_send_msg(conn, 'C', (const char *)&req_msg, sizeof(AgentToCmConnectRequest)) != 0) { return CM_ERROR; } char *receiveMsg = NULL; while (timeOut >= 0) { if (cm_client_flush_msg(conn) != 0) { return CM_ERROR; } receiveMsg = recv_cm_server_cmd(conn); if (receiveMsg != NULL) { if (ProcessSslAck(receiveMsg, enableSsl) != 0) { continue; } return CM_SUCCESS; } timeOut--; CmUsleep(CTL_RECV_CYCLE); } return CM_ERROR; } static status_t CtlConnSslEstablish(CM_Conn *conn, conn_option_t *option, bool *enableSsl) { const uint32 plainLen = CM_PASSWD_MAX_LEN + 1; char plain[plainLen] = {0}; CM_RETURN_IFERR(CtlConnSslRequst(conn, MSG_CM_SSL_CONN_REQUEST, enableSsl)); if (!*enableSsl) { return CM_SUCCESS; } write_runlog(DEBUG1, "begin to create ssl connection\n"); CM_RETURN_IFERR(cm_verify_ssl_key_pwd(plain, sizeof(plain), CLIENT_CIPHER)); g_sslOption.ssl_para.key_password = plain; g_sslOption.ssl_para.verify_peer = true; /* check certificate file access permission */ if (strlen(option->ssl_para.ca_file) > 0) { CM_RETURN_IFERR_EX(cm_ssl_verify_file_stat(option->ssl_para.ca_file), cs_securec_clear(plain, plainLen)); } if (strlen(option->ssl_para.key_file) > 0) { CM_RETURN_IFERR_EX(cm_ssl_verify_file_stat(option->ssl_para.key_file), cs_securec_clear(plain, plainLen)); } if (strlen(option->ssl_para.cert_file) > 0) { CM_RETURN_IFERR_EX(cm_ssl_verify_file_stat(option->ssl_para.cert_file), cs_securec_clear(plain, plainLen)); } /* create the ssl connector - init ssl and load certs */ ssl_ctx_t *ssl_fd = cm_ssl_create_connector_fd(&option->ssl_para); /* erase key_password for security issue */ cs_securec_clear(plain, plainLen); if (ssl_fd == NULL) { write_runlog(ERROR, "ssl_create_connector_fd failed.\n"); return CM_ERROR; } conn->ssl_connector_fd = ssl_fd; /* connect to the server */ if (cm_cs_ssl_connect(ssl_fd, &conn->pipe) != CM_SUCCESS) { write_runlog(ERROR, "create ssl connection failed.\n"); return CM_ERROR; } conn->status = CONNECTION_OK; return CM_SUCCESS; } status_t TryGetSslConnToCmserver(CM_Conn *conn, int timeOut) { const uint32 upgradeVersion = 92574; GetUpgradeVersionFromCmaConfig(); if (undocumentedVersion != 0 && undocumentedVersion < upgradeVersion) { return CM_SUCCESS; } const int32 socketTimeout = 3 * USEC_TO_TIMEOUT; conn->pipe.link.tcp.sock = conn->sock; conn->pipe.link.tcp.closed = CM_FALSE; conn->pipe.link.tcp.remote = *(sock_addr_t *)&conn->raddr; conn->pipe.link.tcp.local = *(sock_addr_t *)&conn->laddr; conn->pipe.connect_timeout = timeOut * USEC_TO_TIMEOUT; conn->pipe.socket_timeout = socketTimeout; conn->pipe.l_onoff = 1; conn->pipe.l_linger = 1; conn->pipe.type = CS_TYPE_TCP; conn->status = CONNECTION_SSL_STARTUP; bool enableSsl = false; static int sslEstablishFailedTime = 0; const int sslEstablishFailedTimeOut = 5; if (CtlConnSslEstablish(conn, &g_sslOption, &enableSsl) != CM_SUCCESS) { write_runlog(ERROR, "create ssl connection failed.\n"); sslEstablishFailedTime++; if (sslEstablishFailedTime > sslEstablishFailedTimeOut) { write_runlog(ERROR, "cm_ctl can't establish an SSL connection, please check certificate file\n"); exit(1); } return CM_ERROR; } sslEstablishFailedTime = 0; if (enableSsl) { write_runlog(DEBUG5, "create ssl connection success.\n"); } else { write_runlog(DEBUG5, "ssl connection not enable.\n"); conn->status = CONNECTION_OK; } return CM_SUCCESS; } static status_t DoConnCmserver(uint32 nodeIndex, uint32 cmsIndex, uint32 cmaIndex, CM_Conn **curConn, bool isFirstCms) { char connstr[CONN_STRING_LEN] = {0}; int ret; const int timeOut = 5; // in order to prevent connect timeout in big cluster const int32 ComputeConnectTimeOut = 50; int32 connectTimeout = ((int32)g_node_num) / ComputeConnectTimeOut + CONN_TO_CMSERVER_TIMEOUT; if (connectTimeout >= timeOut) { connectTimeout = timeOut; } ret = memset_s(connstr, CONN_STRING_LEN, 0, CONN_STRING_LEN); securec_check_errno(ret, (void)ret); ret = snprintf_s(connstr, sizeof(connstr), sizeof(connstr) - 1, "host=%s port=%u localhost=%s connect_timeout=%d user=%s node_id=%u node_name=%s " "remote_type=%d %s", g_node[nodeIndex].cmServer[cmsIndex], g_node[nodeIndex].port, g_currentNode->cmAgentIP[cmaIndex], connectTimeout, pw->pw_name, g_nodeHeader.node, "cm_ctl", CM_CTL, isFirstCms ? "" : "postmaster=1"); securec_check_intval(ret, (void)ret); CM_Conn *conn = PQconnectCM(connstr); if (conn != NULL && (CMPQstatus(conn) == CONNECTION_OK)) { write_runlog(DEBUG5, "socket is [%d]. try to get ssl connection: %s\n", conn->sock, connstr); if (TryGetSslConnToCmserver(conn, timeOut) != CM_SUCCESS) { write_runlog(ERROR, "socket is [%d], %d : create ssl failed: %s\n", conn->sock, __LINE__, CMPQerrorMessage(conn)); CMPQfinish(conn); conn = NULL; return CM_ERROR; } if (isFirstCms) { g_normal_cm_server_node_index = nodeIndex; } CMPQfinish(*curConn); *curConn = conn; write_runlog(DEBUG1, "connect to cmserver success, remotehost is %s:%u.\n", g_node[nodeIndex].cmServer[cmsIndex], g_node[nodeIndex].port); return CM_SUCCESS; } write_runlog(DEBUG1, "%d : connect to cmserver failed: %s.\n", __LINE__, CMPQerrorMessage(conn)); CMPQfinish(conn); return CM_ERROR; } static void ConnNormalCms(uint32 nodeIndex, CM_Conn **curConn) { for (uint32 i = 0; i < g_node[nodeIndex].cmServerListenCount; ++i) { for (uint32 j = 0; j < g_currentNode->cmAgentListenCount; ++j) { if (DoConnCmserver(nodeIndex, i, j, curConn, false) == CM_SUCCESS) { return; } } } } static void GetFirstNormalCmsConn(uint32 cmsNodeIdx, bool queryEtcd, CM_Conn **curConn) { if (curConn != NULL) { connect_to_first_normal_cmserver(cmsNodeIdx, curConn); } else if (queryEtcd) { connect_to_first_normal_cmserver(cmsNodeIdx, &CmServer_conn2); } else { connect_to_first_normal_cmserver(cmsNodeIdx, &CmServer_conn); } } static void ConnPrimaryCms(bool queryEtcd, CM_Conn **curConn) { if (g_normal_cm_server_node_index == PG_UINT32_MAX) { for (uint32 kk = 0; kk < g_cm_server_num; kk++) { if (strcmp("Down", g_cmServerState[kk]) == 0 || strcmp("Skip", g_cmServerState[kk]) == 0) { continue; } GetFirstNormalCmsConn(g_nodeIndexForCmServer[kk], queryEtcd, curConn); if (g_normal_cm_server_node_index != PG_UINT32_MAX) { break; } } } else { GetFirstNormalCmsConn(g_normal_cm_server_node_index, queryEtcd, curConn); } } /* quert_etcd is true, it means the conn for query etcd from cms primary * because query -v need the conn CmServer_conn, if use it, will find error with same conn. */ void do_conn_cmserver(bool queryCmserver, uint32 nodeIndex, bool queryEtcd, CM_Conn **curConn) { if (queryCmserver) { if (curConn != NULL) { ConnNormalCms(nodeIndex, curConn); } else { ConnNormalCms(nodeIndex, &CmServer_conn1); } } else { ConnPrimaryCms(queryEtcd, curConn); } } int cm_client_flush_msg(CM_Conn* conn) { if (conn != NULL) { int ret = cmpqFlush(conn); if (ret != 0) { write_runlog(ERROR, "flush data failed: %s.\n", CMPQerrorMessage(conn)); return ret; } } else { write_runlog(DEBUG1, "flush connection is NULL.\n"); return -1; } return 0; } int cm_client_send_msg(CM_Conn* conn, char msgtype, const char* s, size_t len) { int ret = CMPQPacketSend(conn, msgtype, s, len); if (ret != STATUS_OK) { if (ctl_command != CM_SWITCHOVER_COMMAND && ctl_command != CM_BUILD_COMMAND) { write_runlog(ERROR, "send message to server failed: %s.\n", CMPQerrorMessage(conn)); } else { write_runlog(DEBUG1, "send message to server failed: %s.\n", CMPQerrorMessage(conn)); } return -1; } return 0; } char* recv_cm_server_cmd(CM_Conn* conn) { if (conn == NULL) { if (ctl_command != CM_SWITCHOVER_COMMAND) { write_runlog(ERROR, "cm_ctl is not connect to the cm server.\n"); } return NULL; } if (cmpqReadData(conn) < 0) { return NULL; } CM_Result *res = cmpqGetResult(conn); if (res == NULL) { return NULL; } return (char*)&(res->gr_resdata); } /* * @Description: init hosts file used by pssh when starting or stopping cluster. */ void init_hosts() { uint32 i, j; g_execNodes = 0; FILE* fd = fopen(hosts_path, "w"); if (fd == NULL) { char errBuffer[ERROR_LIMIT_LEN]; write_runlog( ERROR, "could not open hosts file \"%s\": %s\n", hosts_path, strerror_r(errno, errBuffer, ERROR_LIMIT_LEN)); exit(1); } for (i = 0; i < g_node_num; i++) { for (j = 0; j < g_node[i].sshCount; j++) { if (g_hostInfo[i][j] > MAX_INVALID_NODE_EXECTIMES && ctl_command == STOP_COMMAND) { continue; } g_execNodes++; (void)fprintf(fd, "%s\n", g_node[i].sshChannel[j]); } } (void)fclose(fd); } /** * @brief * Check if the node is disconnected when stopping the cluster. * * @param * errCode with 255 represents the pssh error. */ void ReportAbnormalNode(const char *errInfo) { const int MAX_IP_LEN = CM_IP_LENGTH + 2; for (uint32 i = 0; i < g_node_num; i++) { for (uint32 j = 0; j < g_node[i].sshCount; j++) { char tmp[MAX_IP_LEN] = {0}; int rc = snprintf_s(tmp, MAX_IP_LEN, MAX_IP_LEN - 1, " %s ", g_node[i].sshChannel[j]); securec_check_intval(rc, (void)rc); if (strstr(errInfo, tmp) == NULL) { continue; } g_hostInfo[i][j]++; if (g_hostInfo[i][j] >= MAX_INVALID_NODE_EXECTIMES) { write_runlog(WARNING, "abnormal node %u.\n", g_node[i].node); g_stopAbnormal = true; } } } } /** * @brief * Check the cluster running status. * * @return * 0: Represents the following scenarios: * 1. The cluster is running. * 2. The cluster is starting. * 3. The cluster has been stopped. * -1: Represents the followign scenarios: * 1. Failed to check the cluster running status. * 2. The cluster is stopping. */ int CheckClusterRunningStatus() { uint32 errorCount = 0; char command[MAXPGPATH * 2] = {0}; char buffer[MAXPGPATH] = {0}; char invalidStr[MAXPGPATH] = {0}; char* exitStr = NULL; uint32 stoppedNode = 0; uint32 uninstallNode = 0; uint32 disConNode = 0; uint32 stoppingNode = 0; uint32 normalNode = 0; uint32 failedNode = 0; uint32 timeoutNode = 0; const int MAX_TRY_TIMES = 3; int rcs = 0; int errorCode = 0; struct timeval cluster_status_check_time_begin; struct timeval cluster_status_check_time_end; if (got_stop || g_isRestop) { return -1; } if (ctl_command == START_COMMAND) { write_runlog(LOG, "checking cluster status.\n"); } (void)gettimeofday(&cluster_status_check_time_begin, NULL); const int ret = snprintf_s(command, MAXPGPATH * 2, MAXPGPATH * 2 - 1, SYSTEMQUOTE "if [ -f \"/etc/profile\" ]; then " "source /etc/profile; " "fi; " "if [ -f \"$HOME/.bashrc\" ]; then source $HOME/.bashrc; fi; " "if [ -f \"%s\" ]; then source %s; fi; " "pssh %s -h %s \" " "if [ -f \\\"/etc/profile\\\" ]; then " "source /etc/profile; " "fi; " "if [ -f \\\"\\$HOME/.bashrc\\\" ]; then " "source \\$HOME/.bashrc; " "fi; " "if [ -f \\\"%s\\\" ]; then " "source %s; " "fi; " "%s/bin/%s check -B %s -T %s/bin/%s > /dev/null; " "\" 2>&1; " SYSTEMQUOTE, mpp_env_separate_file, mpp_env_separate_file, PSSH_TIMEOUT_OPTION, hosts_path, mpp_env_separate_file, mpp_env_separate_file, g_appPath, CM_CTL_BIN_NAME, CM_AGENT_BIN_NAME, g_appPath, CM_AGENT_BIN_NAME); securec_check_intval(ret,); init_hosts(); FILE *fp = popen(command, "re"); if (fp == NULL) { char error_buffer[ERROR_LIMIT_LEN] = {0}; (void)strerror_r(errno, error_buffer, ERROR_LIMIT_LEN); write_runlog(DEBUG1, "Failed to execute the shell command: error=\"[%d] %s\"," " command=\"%s\".\n", errno, error_buffer, command); (void)unlink(hosts_path); return -1; } write_runlog(DEBUG1, "start check cluster running status with command %s.\n", command); while (!feof(fp)) { if (fgets(buffer, MAXPGPATH - 1, fp)) { write_runlog(DEBUG1, "%s", buffer); if (strstr(buffer, "SUCCESS") != NULL) { stoppedNode++; } else if (strstr(buffer, "Timed out") != NULL) { ReportAbnormalNode(buffer); timeoutNode++; } else if (exitStr = strstr(buffer, "Exited with error code"), exitStr != NULL) { rcs = sscanf_s(exitStr, "%[^1-9]%d", invalidStr, MAXPGPATH, &errorCode); check_sscanf_s_result(rcs, 2); switch (errorCode) { case UNINSTALL_NODE: ReportAbnormalNode(buffer); uninstallNode++; break; case DISCONNECT_NODE: ReportAbnormalNode(buffer); disConNode++; break; case STOPPING_NODE: stoppingNode++; break; case ONLINE_NODE: case NORMAL_NODE: normalNode++; break; case FAILED_NODE: failedNode++; break; default: break; } } } else { errorCount++; } if (errorCount >= MAX_TRY_TIMES) { break; } } const int exitCode = pclose(fp); if (!g_isRestop) { (void)unlink(hosts_path); } (void)gettimeofday(&cluster_status_check_time_end, NULL); if (ctl_command == START_COMMAND) { write_runlog(LOG, "checking finished in %ld ms.\n", (cluster_status_check_time_end.tv_sec - cluster_status_check_time_begin.tv_sec) * 1000 + (cluster_status_check_time_end.tv_usec - cluster_status_check_time_begin.tv_usec) / 1000); /* ALL nodes were stopped or started. */ if (WEXITSTATUS(exitCode) == PSSH_SUCCESS) { write_runlog(DEBUG1, "end check cluster running status with pssh.\n"); return 0; } else if (timeoutNode > 0 && stoppingNode == 0 && failedNode == 0) { write_runlog(WARNING, "The ssh connection time out or the ssh trust relationship is" "abnormal on some nodes. But the cluster will continue to start.\n"); return 0; } else { switch (WEXITSTATUS(exitCode)) { /* The pssh exit with exit code 5 represents that the shell command exit with non-zero. */ case COMMAND_TIMEOUT: if (disConNode > 0) { write_runlog(ERROR, "Failed to execute the shell command with the pssh exit code %d.\n", WEXITSTATUS(exitCode)); return -1; } else if (failedNode > 0) { /* Some nodes checked failed. */ write_runlog(WARNING, "Failed to call the \"cm_ctl check\" operation.\n"); write_runlog(WARNING, "Failed to check the cluster running status.\n"); return 0; } else if (stoppingNode > 0) { /* Some nodes are stopping. */ write_runlog(ERROR, "cluster is already running. \n" "HINT: Mabybe the cluster is coninually stopping in the background.\n" "You can wait for a while and check whether the cluster stops, or immediately stop" " the cluster by \"cm_ctl stop -m i\".\n"); return -1; } return 0; /** * The pssh exit with exit code 4 represents the ssh connection times out or * the ssh trust relationship is abnormal on some nodes. */ case PSSH_TIMEOUT: if (disConNode > 0 && stoppingNode == 0 && failedNode == 0) { write_runlog(WARNING, "The ssh connection time out or the ssh trust relationship is" " abnormal on some nodes. But the cluster will continue to start.\n"); return 0; } break; default: write_runlog(ERROR, "Failed to execute the shell command with the pssh exit code %d.\n", WEXITSTATUS(exitCode)); write_runlog(ERROR, "failed to check the cluster running status.\n"); return -1; } } } else if (ctl_command == STOP_COMMAND) { if (timeoutNode > 0) { if (timeoutNode + disConNode + uninstallNode + stoppedNode + failedNode == g_execNodes) { write_runlog(DEBUG1, "end check cluster with timeoutNode %u, disConNode %u, unintallNode %u, " "stoppedNode %u, failedNode %u.\n", timeoutNode, disConNode, uninstallNode, stoppedNode, failedNode); return 0; } else { write_runlog(ERROR, "end check cluster: timeoutNode %u, disConNode %u, normalNode %u, " "stoppedNode %u, stoppingNode %u, failedNode %u, uninstallNode %u.\n", timeoutNode, disConNode, normalNode, stoppedNode, stoppingNode, failedNode, uninstallNode); return -1; } } else { switch (WEXITSTATUS(exitCode)) { /* The pssh exit with exit code 5 represents that the shell command exit with non-zero. */ case PSSH_TIMEOUT: if (disConNode > 0) { if (disConNode + uninstallNode + stoppedNode + failedNode == g_execNodes) { write_runlog(DEBUG1, "end check cluster with disConNode %u, unintallNode %u, " "stoppedNode %u, failedNode %u.\n", disConNode, uninstallNode, stoppedNode, failedNode); return 0; } } break; /** * The pssh exit with exit code 4 represents the ssh connection times out or * the ssh trust relationship is abnormal on some nodes. */ case COMMAND_TIMEOUT: if (uninstallNode + stoppedNode + failedNode == g_execNodes) { write_runlog(DEBUG1, "end check cluster with unintallNode %u, stoppedNode %u, failedNode %u.\n", uninstallNode, stoppedNode, failedNode); return 0; } break; case PSSH_SUCCESS: if (stoppedNode == g_execNodes) { write_runlog(DEBUG1, "end check cluster with stopped node %u.\n", stoppedNode); return 0; } break; /* The pssh exit with other exit codes represents an unexpected error. */ default: write_runlog(ERROR, "end check cluster pssh result %d, disConNode %u, normalNode %u, " "stoppedNode %u, stoppingNode %u, failedNode %u, uninstallNode %u.\n", WEXITSTATUS(exitCode), disConNode, normalNode, stoppedNode, stoppingNode, failedNode, uninstallNode); return -1; } } } return -1; } int CheckSingleClusterRunningStatus() { struct timeval cluster_status_check_time_begin; struct timeval cluster_status_check_time_end; long expired_time; int ret = 0; (void)gettimeofday(&cluster_status_check_time_begin, NULL); if (is_node_stopping(0, 0, manual_start_file, result_path, mpp_env_separate_file)) { ret = stop_check_node(0); } (void)gettimeofday(&cluster_status_check_time_end, NULL); expired_time = (cluster_status_check_time_end.tv_sec - cluster_status_check_time_begin.tv_sec); write_runlog(LOG, "check node status take %ld seconds.\n", expired_time); return ret; } bool is_node_stopping(uint32 checkNode, uint32 currentNode, const char *manualStartFile, const char *resultFile, const char *mppEnvSeperateFile) { int result = -1; char command[MAX_PATH_LEN] = {0}; int ret; if (checkNode == currentNode && strstr(g_appPath, "/var/chroot") == NULL) { ret = snprintf_s(command, MAX_PATH_LEN, MAX_PATH_LEN - 1, "ls %s > /dev/null 2>&1 \n echo -e $? > %s", manualStartFile, resultFile); securec_check_intval(ret, (void)ret); exec_system(command, &result, resultFile); } else { ret = snprintf_s(command, MAX_PATH_LEN, MAX_PATH_LEN - 1, "ls %s\" > /dev/null 2>&1; echo -e $? > %s", manualStartFile, resultFile); securec_check_intval(ret, (void)ret); exec_system_ssh(checkNode, command, &result, resultFile, mppEnvSeperateFile); } return result == 0; } char* xstrdup(const char* s) { char* result = NULL; result = strdup(s); if (result == NULL) { write_runlog(FATAL, "out of memory\n"); exit(1); } return result; } void CheckDnNodeStatusById(uint32 node_id_check, int* result, uint32 dnIndex) { char command[MAXPGPATH] = {0}; char resultPath[MAXPGPATH] = {0}; char checkDnProcessResultPath[MAX_PATH_LEN] = {0}; int fd; bool flag = false; int ret = GetHomePath(resultPath, sizeof(resultPath)); if (ret != EOK) { return; } errno_t tnRet = snprintf_s(checkDnProcessResultPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/checkDnProcessResult-XXXXXX", resultPath); securec_check_intval(tnRet, (void)tnRet); fd = mkstemp(checkDnProcessResultPath); if (fd <= 0) { write_runlog(ERROR, "failed to create the dn process check result file: errno=%d.\n", errno); flag = true; } if (node_id_check == g_nodeId && strstr(g_appPath, "/var/chroot") == NULL) { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s \n echo -e $? > %s", GetDnProcessName(), g_node[node_id_check].datanode[dnIndex].datanodeLocalDataPath, flag ? result_path : checkDnProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system(command, result, flag ? result_path : checkDnProcessResultPath); } else { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s\" > /dev/null 2>&1; echo -e $? > %s", GetDnProcessName(), g_node[node_id_check].datanode[dnIndex].datanodeLocalDataPath, flag ? result_path : checkDnProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system_ssh(node_id_check, command, result, flag ? result_path : checkDnProcessResultPath, mpp_env_separate_file); } if (fd > 0) { (void)close(fd); } (void)unlink(flag ? result_path : checkDnProcessResultPath); } void CheckCnNodeStatusById(uint32 node_id_check, int* result) { errno_t tnRet = 0; char command[MAXPGPATH] = {0}; char resultPath[MAXPGPATH] = {0}; char checkCnProcessResultPath[MAX_PATH_LEN] = {0}; int fd; bool flag = false; int ret = GetHomePath(resultPath, MAXPGPATH); if (ret != EOK) { return; } ret = snprintf_s(checkCnProcessResultPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/checkCnProcessResult-XXXXXX", resultPath); securec_check_intval(ret, (void)ret); fd = mkstemp(checkCnProcessResultPath); if (fd <= 0) { write_runlog(ERROR, "failed to create the cn process check result file: errno=%d.\n", errno); flag = true; } if (node_id_check == g_nodeId && strstr(g_appPath, "/var/chroot") == NULL) { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s \n echo -e $? > %s", COORDINATE_BIN_NAME, g_node[node_id_check].DataPath, flag ? result_path : checkCnProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system(command, result, flag ? result_path : checkCnProcessResultPath); } else { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s\" > /dev/null 2>&1; echo -e $? > %s", COORDINATE_BIN_NAME, g_node[node_id_check].DataPath, flag ? result_path : checkCnProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system_ssh(node_id_check, command, result, flag ? result_path : checkCnProcessResultPath, mpp_env_separate_file); } if (fd > 0) { (void)close(fd); } (void)unlink(flag ? result_path : checkCnProcessResultPath); } void CheckGtmNodeStatusById(uint32 node_id_check, int* result) { errno_t tnRet = 0; char command[MAXPGPATH] = {0}; char resultPath[MAXPGPATH] = {0}; char checkGTMProcessResultPath[MAX_PATH_LEN] = {0}; int fd; bool flag = false; int ret = GetHomePath(resultPath, MAXPGPATH); if (ret != EOK) { return; } ret = snprintf_s(checkGTMProcessResultPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/checkGTMProcessResult-XXXXXX", resultPath); securec_check_intval(ret, (void)ret); fd = mkstemp(checkGTMProcessResultPath); if (fd <= 0) { write_runlog(ERROR, "failed to create the gtm process check result file: errno=%d.\n", errno); flag = true; } if (node_id_check == g_nodeId && strstr(g_appPath, "/var/chroot") == NULL) { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s \n echo -e $? > %s", GTM_BIN_NAME, g_node[node_id_check].gtmLocalDataPath, flag ? result_path : checkGTMProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system(command, result, flag ? result_path : checkGTMProcessResultPath); } else { tnRet = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "cm_ctl check -B %s -T %s\" > /dev/null 2>&1; echo -e $? > %s", GTM_BIN_NAME, g_node[node_id_check].gtmLocalDataPath, flag ? result_path : checkGTMProcessResultPath); securec_check_intval(tnRet, (void)tnRet); exec_system_ssh(node_id_check, command, result, flag ? result_path : checkGTMProcessResultPath, mpp_env_separate_file); } if (fd > 0) { (void)close(fd); } (void)unlink(flag ? result_path : checkGTMProcessResultPath); } /** * @brief Check whether the static config file exist in the target node. * * @param [in] nodeIndex: The node index in the cluster config. * * @return 0: if the static config file exist, return 0. * @return 1: if the static config file does not exist, return 1. * @return -1: if failed to check the static config file status, return -1. */ int checkStaticConfigExist(uint32 nodeIndex) { int result = -1; char command[MAXPGPATH] = {0}; int ret; char resultPath[MAXPGPATH] = {0}; char checkStaticConfigPath[MAX_PATH_LEN] = {0}; int fd; ret = GetHomePath(resultPath, MAXPGPATH); if (ret != EOK) { return -1; } ret = snprintf_s(checkStaticConfigPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/checkStaticConfig-XXXXXX", resultPath); securec_check_intval(ret, (void)ret); fd = mkstemp(checkStaticConfigPath); if (fd <= 0) { write_runlog(ERROR, "failed to create the result file: errno=%d.\n", errno); return -1; } /* Check whether the cluster static config file exist. */ if (nodeIndex == g_nodeId && strstr(g_appPath, "/var/chroot") == NULL) { ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "ls %s > /dev/null 2>&1 \n echo -e $? > %s", cluster_static_config, checkStaticConfigPath); securec_check_intval(ret, (void)ret); exec_system(command, &result, checkStaticConfigPath); } else { ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "ls %s\" > /dev/null 2>&1; echo -e $? > %s", cluster_static_config, checkStaticConfigPath); securec_check_intval(ret, (void)ret); exec_system_ssh(nodeIndex, command, &result, checkStaticConfigPath, mpp_env_separate_file); } (void)close(fd); (void)unlink(checkStaticConfigPath); return result; } static status_t GetCmsConnect(CM_Conn **curConn) { status_t st = CM_SUCCESS; int32 tryTime = MAX_CONN_TIMES; const uint32 waitForCms = 10; const uint32 sleepInterval = 3; static bool isFirst = true; if (isFirst) { cm_sleep(waitForCms); isFirst = false; } do { do_conn_cmserver(false, 0, false, curConn); if ((*curConn) == NULL) { write_runlog(DEBUG1, "send ddb msg to cm_server, connect fail. node_id:%u.\n", g_commandOperationNodeId); st = CM_ERROR; write_runlog(LOG, "."); cm_sleep(sleepInterval); } else { break; } --tryTime; } while (st != CM_SUCCESS && (tryTime > 0)); if ((*curConn) == NULL) { return CM_ERROR; } return CM_SUCCESS; } static void GetCltSendDdbOper( const char *key, const char *value, const char *threadName, CltSendDdbOper *sendOper, DDB_OPER dbOper) { sendOper->msgType = (int32)MSG_CLIENT_CM_DDB_OPER; sendOper->dbOper = dbOper; sendOper->node = g_currentNode->node; errno_t rc = strcpy_s(sendOper->threadName, THREAD_NAME_LEN, threadName); securec_check_errno(rc, (void)rc); sendOper->keyLen = (uint32)strlen(key); rc = strcpy_s(sendOper->key, MAX_PATH_LEN, key); securec_check_errno(rc, (void)rc); if (value != NULL) { sendOper->valueLen = (uint32)strlen(value); rc = strcpy_s(sendOper->value, MAX_PATH_LEN, value); securec_check_errno(rc, (void)rc); } } static bool HandleCmsDdbMsg(const char *key, const char *threadName, DDB_OPER dbOper, CM_Conn **curConn) { if (*curConn == NULL) { return false; } int32 ret = cm_client_flush_msg(*curConn); if (ret == TCP_SOCKET_ERROR_EPIPE) { CMPQfinish(*curConn); *curConn = NULL; return false; } char *receiveMsg = recv_cm_server_cmd(*curConn); if (receiveMsg == NULL) { return false; } cm_msg_type *cmMsgType = (cm_msg_type *)receiveMsg; if (cmMsgType->msg_type != (int32)MSG_CM_CLIENT_DDB_OPER_ACK) { return false; } CmSendDdbOperRes *msgDdbOper = (CmSendDdbOperRes *)receiveMsg; if (msgDdbOper->dbOper != dbOper) { return false; } if (strcmp(key, msgDdbOper->key) != 0 || strcmp(threadName, msgDdbOper->threadName) != 0) { write_runlog(DEBUG1, "key is [%s: %s], threadName is [%s: %s].\n", key, msgDdbOper->key, threadName, msgDdbOper->threadName); return false; } return msgDdbOper->exeStatus; } static int32 SendDdbMsgAndGetDdbRes( const char *key, const char *threadName, CltSendDdbOper *sendOper, CM_Conn **curConn) { int32 ret = cm_client_send_msg(*curConn, 'C', (char *)sendOper, sizeof(CltSendDdbOper)); if (ret != 0) { FINISH_CONNECTION2((*curConn)); return -1; } int32 tryTimes = WAIT_MSG_RES_TIMES; bool rt = false; do { rt = HandleCmsDdbMsg(key, threadName, sendOper->dbOper, curConn); if (!rt) { cm_sleep(1); } --tryTimes; } while (!rt && (tryTimes > 0)); if (!rt) { return NEED_SEND_AGAIN; } write_runlog(DEBUG1, "success to get handleCmsDdbMsg, key_value is (%s: %s), threadName is %s.\n", key, sendOper->value, threadName); return SUCCESS_SEND_MSG; } status_t SendKVToCms(const char *key, const char *value, const char *threadName) { CM_Conn *curConn = NULL; status_t st = GetCmsConnect(&curConn); if (st != CM_SUCCESS) { return CM_ERROR; } CltSendDdbOper sendOper = {0}; GetCltSendDdbOper(key, value, threadName, &sendOper, DDB_SET_OPER); int32 tryTime = SEND_MSG_TIMES; int32 ret = 0; do { ret = SendDdbMsgAndGetDdbRes(key, threadName, &sendOper, &curConn); if (ret == -1) { FINISH_CONNECTION2(curConn); return CM_ERROR; } if (ret != SUCCESS_SEND_MSG) { write_runlog(LOG, "."); cm_sleep(1); } --tryTime; } while (ret != SUCCESS_SEND_MSG && (tryTime > 0)); FINISH_CONNECTION2(curConn); if (ret != SUCCESS_SEND_MSG) { write_runlog(DEBUG1, "Failed to send msg(%s: %s) threadName is %s to cms.\n", key, value, threadName); return CM_ERROR; } return CM_SUCCESS; } int cmctl_getenv(const char* env_var, char* output_env_value, uint32 env_value_len) { return cm_getenv(env_var, output_env_value, env_value_len); } static void connect_to_first_normal_cmserver(uint32 cmsNodeIdx, CM_Conn **curConn) { for (uint32 jj = 0; jj < g_node[cmsNodeIdx].cmServerListenCount; jj++) { for (uint32 ii = 0; ii < g_currentNode->cmAgentListenCount; ii++) { if (DoConnCmserver(cmsNodeIdx, jj, ii, curConn, true) == CM_SUCCESS) { return; } g_normal_cm_server_node_index = PG_UINT32_MAX; } } } /** * @brief * Obtains the current time and get the exact number of seconds (counted from January 1, 1970). * * @note * You do not need to pay attention to the return value of the system function "gettimeofday". * If the "timeval" parameter is NULL, an error is returned. * * @return * Returns the exact number of seconds (counted from January 1, 1970). */ time_t get_start_time() { timespec tv = {0, 0}; (void)clock_gettime(CLOCK_MONOTONIC, &tv); return tv.tv_sec; } /** * @brief * Obtain the current time and compare it with the input time. * * @note * We will not think about that the current time minus the start time will be out of range. * * @return * Return the number of the current time minus the start time. */ time_t check_with_end_time(const time_t start_time) { Assert(start_time > 0); timespec tv = {0, 0}; (void)clock_gettime(CLOCK_MONOTONIC, &tv); return tv.tv_sec - start_time; } void exec_system(const char *cmd, int *result, const char *resultPath) { char result_str[g_max_buf_len + 1] = {0}; if (resultPath == NULL) { resultPath = result_path; } int rc = system(cmd); if (rc != 0) { write_runlog(ERROR, "failed to execute the command: command=\"%s\", systemReturn=%d, commandReturn=%d," " errno=%d.\n", cmd, rc, SHELL_RETURN_CODE(rc), errno); *result = -1; return; } char realPath[MAX_PATH_LEN] = {0}; GetRealFile(realPath, MAX_PATH_LEN, resultPath); FILE *fd = fopen(realPath, "r"); if (fd == NULL) { write_runlog(DEBUG1, "failed to open the result file: errno=%d.\n", errno); *result = -1; return; } /* read result */ size_t bytesread = fread(result_str, 1, (size_t)g_max_buf_len, fd); if (bytesread > (size_t)g_max_buf_len) { write_runlog(ERROR, "exec_system fread failed! file=%s, bytesread=%u\n", realPath, (uint32)bytesread); (void)fclose(fd); *result = -1; return; } *result = atoi(result_str); (void)fclose(fd); } void exec_system_ssh(uint32 remote_nodeid, const char *cmd, int *result, const char *resultPath, const char *mppEnvSeperateFile) { const int SHELL_COMMAND_NOT_EXIST = 127; const int COMMAND_NOT_EXIST_FIND_NUM = 20; int rc; char command[MAXPGPATH] = {0}; char result_str[g_max_buf_len + 1] = {0}; int command_not_exist_num = 0; int ret; if (resultPath == NULL) { resultPath = result_path; } if (g_node[remote_nodeid].sshCount != 0) { if (mppEnvSeperateFile[0] == '\0') { ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "pssh %s -s -H %s \"%s", PSSH_TIMEOUT_OPTION, g_node[remote_nodeid].sshChannel[0], cmd); securec_check_intval(ret, (void)ret); } else { ret = snprintf_s(command, MAXPGPATH, MAXPGPATH - 1, "pssh %s -s -H %s \"source %s;%s", PSSH_TIMEOUT_OPTION, g_node[remote_nodeid].sshChannel[0], mppEnvSeperateFile, cmd); securec_check_intval(ret, (void)ret); } rc = system(command); if (rc != 0) { write_runlog(ERROR, "failed to execute the ssh command: nodeId=%u, command=\"%s\", systemReturn=%d," " commandReturn=%d, errno=%d.\n", g_node[remote_nodeid].node, command, rc, SHELL_RETURN_CODE(rc), errno); *result = -1; return; } } char realPath[MAX_PATH_LEN] = {0}; GetRealFile(realPath, MAX_PATH_LEN, resultPath); FILE *fd = fopen(realPath, "r"); if (fd == NULL) { write_runlog(ERROR, "failed to open the result file: errno=%d.\n", errno); *result = -1; return; } /* read result */ size_t bytesread = fread(result_str, 1, g_max_buf_len, fd); if (bytesread > (size_t)g_max_buf_len) { write_runlog(ERROR, "exec_system_ssh fread failed! file=%s, bytesread=%u\n", realPath, (uint32)bytesread); (void)fclose(fd); *result = -1; return; } *result = atoi(result_str); if (*result != 0) { write_runlog(DEBUG1, "execute the ssh command: nodeId=%u, command=\"%s\", " " commandReturn=%d.\n", g_node[remote_nodeid].node, cmd, *result); } if (*result == SHELL_COMMAND_NOT_EXIST) { command_not_exist_num++; if (command_not_exist_num >= COMMAND_NOT_EXIST_FIND_NUM) { (void)fclose(fd); write_runlog(FATAL, "command:%s failed, error is 127, command not exist on node %u. \n", command, remote_nodeid); exit(-1); } } (void)fclose(fd); } /* * routines to check memory allocations and fail noisily. */ void* pg_malloc(size_t size) { /* Avoid unportable behavior of malloc(0) */ if (size == 0) { size = 1; } void *result = malloc(size); if (result == NULL) { write_runlog(FATAL, "out of memory\n"); exit(1); } return result; } int runCmdByNodeId(const char* command, uint32 nodeid) { int ret = 0; uint32 ii; if (nodeid == g_currentNode->node) { ret = system(command); } else { for (ii = 0; ii < g_node_num; ii++) { if (g_node[ii].node == nodeid) { break; } } if (ii < g_node_num) { ret = ssh_exec(&g_node[ii], command); } else { write_runlog(ERROR, "can't find the nodeid: %u\n", nodeid); ret = -1; } } if (ret != 0) { write_runlog(DEBUG1, "fail to execute command %s, errno=%d.\n", command, errno); } return ret; } /* estimate how many times required to wait for operation's completion. */ int caculate_default_timeout(CtlCommand cmd) { uint32 base_timeout, node_count; switch (cmd) { case STOP_COMMAND: base_timeout = STOP_DEFAULT_WAIT; break; case START_COMMAND: base_timeout = START_DEFAULT_WAIT; break; default: base_timeout = DEFAULT_WAIT; break; } if (g_command_operation_azName == NULL && g_commandOperationNodeId == 0) { /* cluster */ node_count = g_node_num; } else if (g_command_operation_azName != NULL) { /* AZ */ node_count = 0; for (uint32 ii = 0; ii < g_node_num; ii++) { if (strcmp(g_node[ii].azName, g_command_operation_azName) == 0) { node_count++; } } } else { /* one node or instance */ node_count = 1; } return (int)(base_timeout + g_node_operation_cost * node_count); } int GetDatanodeRelationInfo(uint32 nodeId, const char *cmData, cm_to_ctl_get_datanode_relation_ack *getInstanceMsg) { uint32 instanceId = 0; int instanceType = 0; int ret; int i = 0; int timePass = 0; char* receiveMsg = NULL; cm_to_ctl_get_datanode_relation_ack* getInstanceMsgPtr = NULL; ctl_to_cm_datanode_relation_info cmDatanodeRelationInfoContent = {0}; ret = FindInstanceIdAndType(nodeId, cmData, &instanceId, &instanceType); if (ret != 0) { write_runlog(ERROR, "can't find the nodeId:%u, data_path:%s.\n", nodeId, cmData); return -1; } do_conn_cmserver(false, 0); if (CmServer_conn == NULL) { write_runlog(ERROR, "this time connect cms failed is NULL.\n "); return -1; } cmDatanodeRelationInfoContent.msg_type = (int)MSG_CTL_CM_GET_DATANODE_RELATION; cmDatanodeRelationInfoContent.instanceId = instanceId; cmDatanodeRelationInfoContent.instance_type = instanceType; cmDatanodeRelationInfoContent.node = nodeId; ret = cm_client_send_msg( CmServer_conn, 'C', (char*)&cmDatanodeRelationInfoContent, sizeof(cmDatanodeRelationInfoContent)); if (ret != 0) { FINISH_CONNECTION(); } for (;;) { (void)sleep(1); timePass++; if (CmServer_conn != NULL) { ret = cm_client_flush_msg(CmServer_conn); if (ret == TCP_SOCKET_ERROR_EPIPE) { FINISH_CONNECTION(); } receiveMsg = recv_cm_server_cmd(CmServer_conn); } if (receiveMsg != NULL) { getInstanceMsgPtr = (cm_to_ctl_get_datanode_relation_ack*)receiveMsg; getInstanceMsg->command_result = getInstanceMsgPtr->command_result; getInstanceMsg->member_index = getInstanceMsgPtr->member_index; for (i = 0; i < CM_PRIMARY_STANDBY_MAX_NUM; i++) { getInstanceMsg->data_node_member[i] = getInstanceMsgPtr->data_node_member[i]; getInstanceMsg->instanceMember[i] = getInstanceMsgPtr->instanceMember[i]; getInstanceMsg->gtm_member[i] = getInstanceMsgPtr->gtm_member[i]; } break; } if (timePass > DEFAULT_GET_INFO_TIME) { write_runlog(ERROR, "Get the datanode relation information timeout in %d.\n", DEFAULT_GET_INFO_TIME); FINISH_CONNECTION(); } } CMPQfinish(CmServer_conn); CmServer_conn = NULL; return 0; } void InstanceInformationRecord(uint32 nodeIndex, const cm_to_ctl_instance_status* cmToCtlInstanceStatusPtr) { uint32 j = 0; uint32 instanceIndex = 0; switch (cmToCtlInstanceStatusPtr->instance_type) { case INSTANCE_TYPE_COORDINATE: write_runlog(DEBUG1, "Coordinator State: node=%u nodeName=%s ip=%s port=%u instanceId=%u DataPath=%s status=%s\n", g_node[nodeIndex].node, g_node[nodeIndex].nodeName, g_node[nodeIndex].coordinateListenIP[0], g_node[nodeIndex].coordinatePort, cmToCtlInstanceStatusPtr->instanceId, g_node[nodeIndex].DataPath, datanode_role_int_to_string(cmToCtlInstanceStatusPtr->coordinatemember.status)); break; case INSTANCE_TYPE_GTM: write_runlog(DEBUG1, "GTM State: node=%u nodeName=%s ip=%s instanceId=%u DataPath=%s static_role=%s role=%s " "connect_status=%s sync_mode=%s\n", g_node[nodeIndex].node, g_node[nodeIndex].nodeName, g_node[nodeIndex].gtmLocalListenIP[0], cmToCtlInstanceStatusPtr->instanceId, g_node[nodeIndex].gtmLocalDataPath, datanode_static_role_int_to_string(g_node[nodeIndex].gtmRole), datanode_role_int_to_string(cmToCtlInstanceStatusPtr->gtm_member.local_status.local_role), gtm_con_int_to_string(cmToCtlInstanceStatusPtr->gtm_member.local_status.connect_status), datanode_wal_sync_state_int_to_string( cmToCtlInstanceStatusPtr->gtm_member.local_status.sync_mode)); break; case INSTANCE_TYPE_DATANODE: for (j = 0; j < g_node[nodeIndex].datanodeCount; j++) { if (g_node[nodeIndex].datanode[j].datanodeId == cmToCtlInstanceStatusPtr->instanceId) { instanceIndex = j; break; } } write_runlog(DEBUG1, "Datanode State: node=%u nodeName=%s ip=%s port=%u instanceId=%u DataPath=%s static_role=%s role=%s " "state=%s buildReason=%s\n", g_node[nodeIndex].node, g_node[nodeIndex].nodeName, g_node[nodeIndex].datanode[instanceIndex].datanodeListenIP[0], g_node[nodeIndex].datanode[instanceIndex].datanodePort, cmToCtlInstanceStatusPtr->instanceId, g_node[nodeIndex].datanode[instanceIndex].datanodeLocalDataPath, datanode_static_role_int_to_string(g_node[nodeIndex].datanode[instanceIndex].datanodeRole), datanode_role_int_to_string(cmToCtlInstanceStatusPtr->data_node_member.local_status.local_role), datanode_dbstate_int_to_string(cmToCtlInstanceStatusPtr->data_node_member.local_status.db_state), datanode_rebuild_reason_int_to_string( cmToCtlInstanceStatusPtr->data_node_member.local_status.buildReason)); break; default: write_runlog(DEBUG1, "Unknown instance_type\n"); break; } return; } void SetServerSocketWithEtcdInfo(ServerSocket *server, staticNodeConfig *node) { server->nodeIdInfo.azName = node->azName; server->nodeIdInfo.nodeId = node->node; server->nodeIdInfo.instd = node->etcdId; server->nodeInfo.nodeName = node->etcdName; server->nodeInfo.len = CM_NODE_NAME; server->host = node->etcdClientListenIPs[0]; server->port = node->etcdClientListenPort; } void EtcdIpPortInfoBalance(ServerSocket *server) { uint32 j = 0; for (uint32 i = 0; i < g_node_num; i++) { if (g_node[i].etcd) { SetServerSocketWithEtcdInfo(&(server[j]), &(g_node[i])); ++j; } } } static status_t InitDdbServerList(DrvApiInfo *drvApiInfo) { size_t len = (g_etcd_num + 1) * sizeof(ServerSocket); ServerSocket *server = (ServerSocket *)malloc(len); if (server == NULL) { write_runlog(FATAL, "out of memory!\n"); return CM_ERROR; } errno_t rc = memset_s(server, len, 0, len); securec_check_errno(rc, FREE_AND_RESET(server)); EtcdIpPortInfoBalance(server); server[g_etcd_num].host = NULL; drvApiInfo->serverList = server; drvApiInfo->serverLen = g_etcd_num + 1; drvApiInfo->nodeNum = g_etcd_num; return CM_SUCCESS; } status_t InitDdbCfgApi(DrvApiInfo *drvApiInfo, int32 timeOut) { status_t initServer = InitDdbServerList(drvApiInfo); if (initServer != CM_SUCCESS) { FREE_AND_RESET(drvApiInfo->serverList); return CM_ERROR; } drvApiInfo->modId = MOD_CMCTL; drvApiInfo->nodeId = g_currentNode->node; drvApiInfo->timeOut = timeOut; drvApiInfo->client_t.tlsPath = &g_tlsPath; return CM_SUCCESS; } status_t ServerDdbInit() { if (g_etcd_num == 0) { write_runlog(DEBUG1, "g_etcd_num is %u, cannot create ddb conn.\n", g_etcd_num); return CM_SUCCESS; } g_sess = (DdbConn *)malloc(sizeof(DdbConn)); if (g_sess == NULL) { write_runlog(ERROR, "g_sess is NULL.\n"); return CM_ERROR; } errno_t rc = memset_s(g_sess, sizeof(DdbConn), 0, sizeof(DdbConn)); securec_check_errno(rc, FREE_AND_RESET(g_sess)); DdbInitConfig config; rc = memset_s(&config, sizeof(DdbInitConfig), 0, sizeof(DdbInitConfig)); securec_check_errno(rc, (void)rc); config.type = DB_ETCD; status_t st = InitDdbCfgApi(&config.drvApiInfo, DDB_DEFAULT_TIMEOUT); CM_RETURN_IFERR(st); st = InitDdbConn(g_sess, &config); FREE_AND_RESET(config.drvApiInfo.serverList); return st; } void FreeDdbInfo() { if (g_sess == NULL) { return; } DdbFreeNodeInfo(g_sess); if (DdbFreeConn(g_sess) != CM_SUCCESS) { write_runlog(DEBUG1, "failed to free conn.\n"); } FREE_AND_RESET(g_sess); } bool CheckDdbHealth() { if (g_sess == NULL) { return true; } const int ddbHealthTimeout = 4000; return DdbIsValid(g_sess, DDB_HEAL_COUNT, ddbHealthTimeout); } bool IsCmsPrimary(const staticNodeConfig *node) { const char *primaryIp = CmServer_conn->pghost; for (uint32 i = 0; i < node->sshCount; ++i) { if (strcmp(node->sshChannel[i], primaryIp) == 0) { return true; } } return false; } static status_t KillOneCms(uint32 nodeIndex) { int ret; char killCmd[CM_PATH_LENGTH] = {0}; char gausshomePath[CM_PATH_LENGTH] = {0}; if (GetHomePath(gausshomePath, sizeof(gausshomePath)) != EOK) { return CM_ERROR; } if (g_node[nodeIndex].node == g_currentNode->node) { ret = snprintf_s(killCmd, CM_PATH_LENGTH, CM_PATH_LENGTH - 1, "ps -eo pid,cmd|grep -v grep|grep %s/bin/cm_server |awk '{print $1}'|xargs kill -9 > /dev/null 2>&1 &", gausshomePath); securec_check_intval(ret, (void)ret); ret = system(killCmd); } else { ret = snprintf_s(killCmd, CM_PATH_LENGTH, CM_PATH_LENGTH - 1, "ps -eo pid,cmd|grep -v grep|grep %s/bin/cm_server |awk '{print \\$1}'|xargs kill -9 > /dev/null 2>&1 &", gausshomePath); securec_check_intval(ret, (void)ret); ret = ssh_exec(&g_node[nodeIndex], killCmd); } if (ret != 0) { write_runlog(ERROR, "cm_ctl exec ssh failed, node(%u), errno=%d.\n", g_node[nodeIndex].node, errno); return CM_ERROR; } write_runlog(DEBUG1, "kill cms node(%u) success.\n", g_node[nodeIndex].node); return CM_SUCCESS; } // if not need kill primary cms, connect primary cms first, then kill cms status_t KillAllCms(bool isNeedKillPrimaryCms) { status_t killResult = CM_SUCCESS; uint32 *cmsNodeIndex = GetCmsNodeIndex(); for (uint32 i = 0; i < g_cm_server_num; ++i) { if (!isNeedKillPrimaryCms && IsCmsPrimary(&g_node[cmsNodeIndex[i]])) { write_runlog(DEBUG1, "The node(%u) is primary or has no cms, can't kill it.\n", g_node[i].node); continue; } if (KillOneCms(cmsNodeIndex[i]) != CM_SUCCESS) { killResult = CM_ERROR; write_runlog(DEBUG1, "kill the cms(node=%u) failed.\n", g_node[i].node); } } return killResult; } void ReleaseConn(CM_Conn *con) { if (con != NULL) { CMPQfinish(con); } } bool SetOfflineNode(uint32 nodeIndex, CM_Conn *con) { if (!IsCmSharedStorageMode()) { return false; } int times = 0; char *receiveMsg = NULL; cm_msg_type *msgType = NULL; GetSharedStorageInfo sendMsg = {0}; CmsSharedStorageInfo *msgAck = NULL; sendMsg.msg_type = (int)MSG_GET_SHARED_STORAGE_INFO; if (cm_client_send_msg(con, 'C', (char*)&sendMsg, sizeof(sendMsg)) != 0) { write_runlog(DEBUG1, "SetOfflineNode send msg to cms fail!\n"); return false; } for (;;) { if (times++ > SHARED_STORAGE_MODE_TIMEOUT) { break; } if (cm_client_flush_msg(con) == TCP_SOCKET_ERROR_EPIPE) { ReleaseConn(con); return false; } receiveMsg = recv_cm_server_cmd(con); if (receiveMsg != NULL) { msgType = (cm_msg_type*)receiveMsg; if (msgType->msg_type != (int)MSG_GET_SHARED_STORAGE_INFO_ACK) { write_runlog(DEBUG1, "SetOfflineNode get unknown msg!\n"); return false; } msgAck = (CmsSharedStorageInfo*)(receiveMsg); if (msgAck->doradoIp[0] == '\0') { write_runlog(DEBUG1, "can't get dorado ip!\n"); return false; } break; } cm_sleep(1); } if (msgAck == NULL) { write_runlog(DEBUG1, "SetOfflineNode msgAck is NULL.\n"); return false; } if (strcmp(trim(msgAck->doradoIp), g_node[nodeIndex].sshChannel[0]) == 0) { write_runlog(DEBUG1, "Line:%d node is offline, ip is %s.\n", __LINE__, g_currentNode->sshChannel[0]); return true; } return false; } void GetUpgradeVersionFromCmaConfig() { int rc; char cmAgentConfigFile[MAX_PATH_LEN] = {0}; char gausshomePath[MAXPGPATH] = {0}; rc = cmctl_getenv("GAUSSHOME", gausshomePath, sizeof(gausshomePath)); if (rc != EOK) { write_runlog(FATAL, "Line: %d.Get GAUSSHOME failed, please check.\n", __LINE__); return; } if (strstr(gausshomePath, "/var/chroot") == NULL) { rc = snprintf_s(cmAgentConfigFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/cm_agent/cm_agent.conf", g_currentNode->cmDataPath); } else { rc = snprintf_s(cmAgentConfigFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/var/chroot/%s/cm_agent/cm_agent.conf", g_currentNode->cmDataPath); } securec_check_intval(rc, (void)rc); if (access(cmAgentConfigFile, R_OK) != 0) { write_runlog(WARNING, "The cm_agent.conf is unreadable, set undocumentedVersion 0\n"); undocumentedVersion = 0; return; } undocumentedVersion = get_uint32_value_from_config(cmAgentConfigFile, "upgrade_from", 0); } void CtlGetCmJsonConf() { int ret = ReadCmConfJson(NULL); if (!IsReadConfJsonSuccess(ret)) { write_runlog(DEBUG1, "read cm conf json failed, ret=%d, reason=\"%s\".\n", ret, ReadConfJsonFailStr(ret)); } if (InitAllResStat(DEBUG1) != CM_SUCCESS) { write_runlog(DEBUG1, "init res status failed.\n"); } }