修复cm_ctl start时core问题
This commit is contained in:
parent
6d9f6fa88e
commit
b179cc3a67
@ -2254,7 +2254,6 @@ static void CtlCommandProcessCore(int *status, CtlOption *ctlCtx)
|
||||
#endif
|
||||
case START_COMMAND:
|
||||
*status = do_start();
|
||||
removeStartingFile();
|
||||
break;
|
||||
case CM_SWITCHOVER_COMMAND:
|
||||
*status = DoSwitchover(ctlCtx);
|
||||
|
@ -1868,3 +1868,15 @@ void CtlGetCmJsonConf()
|
||||
write_runlog(DEBUG1, "init res status failed.\n");
|
||||
}
|
||||
}
|
||||
|
||||
bool IsTimeOut(const cmTime_t *lastTime, const char *str)
|
||||
{
|
||||
cmTime_t curTime = {0};
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &curTime);
|
||||
const long maxTimeInterval = 60;
|
||||
if(curTime.tv_sec - lastTime->tv_sec > maxTimeInterval) {
|
||||
write_runlog(DEBUG1, "%s this has timeout(%ld), it will exit.\n", str, maxTimeInterval);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -50,10 +50,13 @@ static inline void ResInstCheckGetQueryMsg(QueryOneResInstStat *queryMsg, uint32
|
||||
|
||||
static ResStatus ResInstCheckGetResult(CM_Conn *pCmsCon)
|
||||
{
|
||||
struct timespec timeBegin = {0, 0};
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &timeBegin);
|
||||
for (;;) {
|
||||
if (cm_client_flush_msg(pCmsCon) == TCP_SOCKET_ERROR_EPIPE) {
|
||||
break;
|
||||
}
|
||||
CM_BREAK_IF_TRUE(IsTimeOut(&timeBegin, "[ResInstCheckGetResult]"));
|
||||
char *recvMsg = recv_cm_server_cmd(pCmsCon);
|
||||
while (recvMsg != NULL) {
|
||||
cm_msg_type *msgTypePtr = (cm_msg_type*)recvMsg;
|
||||
|
@ -54,7 +54,7 @@ static void start_datanode_instance_relation(uint32 node, const char *dataPath);
|
||||
static void start_az_try_more_one(const char* azName);
|
||||
static void* check_cluster_start_status(void* arg);
|
||||
static void StartFailQueryAndReport();
|
||||
static int start_check();
|
||||
static void* start_check(void* arg);
|
||||
static int start_check_cluster();
|
||||
static int start_check_az(const char* azName);
|
||||
static int start_check_node(uint32 node_id_check);
|
||||
@ -82,6 +82,7 @@ static int g_instance_start_status = CM_STATUS_UNKNOWN;
|
||||
static int g_node_start_status = CM_STATUS_UNKNOWN;
|
||||
static int g_dn_relation_start_status = CM_STATUS_UNKNOWN;
|
||||
static int g_resStartStatus = CM_STATUS_UNKNOWN;
|
||||
static StartExitCode g_startExitCode = CM_START_EXIT_INIT;
|
||||
|
||||
static int startaz_try_heartbeat = START_AZ_TRY_HEARTBEAT;
|
||||
static struct timespec g_startTime;
|
||||
@ -113,6 +114,7 @@ extern char* cm_arbitration_mode_set;
|
||||
extern const char* g_progname;
|
||||
extern CM_Conn* CmServer_conn;
|
||||
extern uint32 g_commandOperationInstanceId;
|
||||
extern char manualPauseFile[MAXPGPATH];
|
||||
|
||||
static int StartResInstCheck(uint32 instId)
|
||||
{
|
||||
@ -235,11 +237,14 @@ status_t do_start(void)
|
||||
{
|
||||
CtlGetCmJsonConf();
|
||||
int ret;
|
||||
pthread_t thr_id;
|
||||
pthread_t checkStatusThrId;
|
||||
pthread_t startCheckThrId;
|
||||
g_startExitCode = CM_START_EXIT_INIT;
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
int nodeNumsInAz = 0;
|
||||
struct stat libnetManualStat = {0};
|
||||
int ltranCheckTimes = 0;
|
||||
(void)atexit(RemoveStartingFile);
|
||||
#endif
|
||||
if (g_commandOperationNodeId > 0 && get_node_index(g_commandOperationNodeId) >= g_node_num) {
|
||||
write_runlog(FATAL, "node_id specified is illegal. \n");
|
||||
@ -389,7 +394,6 @@ status_t do_start(void)
|
||||
} else {
|
||||
if (CheckOfflineInstance(g_commandOperationNodeId)) {
|
||||
write_runlog(LOG, "the instance(node:%u) is Offline, no need to start.\n", g_commandOperationNodeId);
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
}
|
||||
write_runlog(LOG, "start the node:%u,datapath:%s. \n", g_commandOperationNodeId, g_cmData);
|
||||
@ -397,18 +401,21 @@ status_t do_start(void)
|
||||
}
|
||||
|
||||
/* create a thread to check cluster's status */
|
||||
ret = pthread_create(&thr_id, NULL, &check_cluster_start_status, NULL);
|
||||
ret = pthread_create(&checkStatusThrId, NULL, &check_cluster_start_status, NULL);
|
||||
if (ret != 0) {
|
||||
write_runlog(FATAL, "failed to create thread to check if cluster started.\n");
|
||||
return CM_ERROR;
|
||||
}
|
||||
|
||||
/* check node's status */
|
||||
if (start_check() != 0) {
|
||||
ret = pthread_create(&startCheckThrId, NULL, &start_check, NULL);
|
||||
if (ret != 0) {
|
||||
write_runlog(FATAL, "failed to create start check thread.\n");
|
||||
return CM_ERROR;
|
||||
}
|
||||
|
||||
return CM_SUCCESS;
|
||||
(void)pthread_join(startCheckThrId, NULL);
|
||||
exit((int)g_startExitCode);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1421,11 +1428,16 @@ void start_instance(uint32 nodeid, const char* datapath)
|
||||
}
|
||||
}
|
||||
|
||||
void removeStartingFile()
|
||||
void RemoveStartingFile()
|
||||
{
|
||||
int ret;
|
||||
char command[MAX_COMMAND_LEN] = {0};
|
||||
|
||||
ret = access(manualPauseFile, F_OK);
|
||||
if (ret != 0) {
|
||||
exit(0);
|
||||
}
|
||||
|
||||
init_hosts();
|
||||
if (mpp_env_separate_file[0] == '\0') {
|
||||
ret = snprintf_s(command,
|
||||
@ -1465,6 +1477,20 @@ void removeStartingFile()
|
||||
SHELL_RETURN_CODE(ret),
|
||||
errno);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static void ContinueCheckClsStatus(long *startingTime)
|
||||
{
|
||||
(void)sleep(1);
|
||||
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &g_endTime);
|
||||
*startingTime = (g_endTime.tv_sec - g_startTime.tv_sec);
|
||||
if (*startingTime > EXPECTED_CLUSTER_START_TIME && *startingTime % CLUSTER_STATE_CHECK_INTERVAL == 0) {
|
||||
write_runlog(DEBUG1, "starting exceeds 2 mins, instance status:g_cluster_start_status=%d,"
|
||||
"g_az_start_status=%d, g_node_start_status=%d, g_instance_start_status=%d\n",
|
||||
g_cluster_start_status, g_az_start_status, g_node_start_status, g_instance_start_status);
|
||||
}
|
||||
}
|
||||
|
||||
static void* check_cluster_start_status(void* arg)
|
||||
@ -1478,14 +1504,19 @@ static void* check_cluster_start_status(void* arg)
|
||||
}
|
||||
|
||||
while (startingTime < g_waitSeconds) {
|
||||
if (g_startExitCode != CM_START_EXIT_INIT) {
|
||||
// wait start_check thread exit until g_waitSeconds
|
||||
ContinueCheckClsStatus(&startingTime);
|
||||
continue;
|
||||
}
|
||||
if (g_cluster_start_status == CM_STATUS_NORMAL) {
|
||||
write_runlog(LOG, "start cluster successfully.\n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_cluster_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
|
||||
write_runlog(LOG, "start cluster successfully. There is a coordinator that has been deleted. \n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_az_start_status == CM_STATUS_NORMAL) {
|
||||
for (uint32 ii = 0; ii < g_node_num; ii++) {
|
||||
if (g_command_operation_azName != NULL && strcmp(g_node[ii].azName, g_command_operation_azName) == 0) {
|
||||
@ -1494,8 +1525,8 @@ static void* check_cluster_start_status(void* arg)
|
||||
}
|
||||
|
||||
write_runlog(LOG, "start availability zone successfully.\n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_az_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
|
||||
for (uint32 ii = 0; ii < g_node_num; ii++) {
|
||||
if (g_command_operation_azName != NULL && strcmp(g_node[ii].azName, g_command_operation_azName) == 0) {
|
||||
@ -1504,16 +1535,16 @@ static void* check_cluster_start_status(void* arg)
|
||||
}
|
||||
|
||||
write_runlog(LOG, "start availability zone successfully. There is a coordinator that has been deleted. \n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_node_start_status == CM_STATUS_NORMAL) {
|
||||
write_runlog(LOG, "start node successfully.\n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_node_start_status == CM_STATUS_NORMAL_WITH_CN_DELETED) {
|
||||
write_runlog(LOG, "start node successfully. There is a coordinator that has been deleted. \n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_instance_start_status == CM_STATUS_NORMAL) {
|
||||
/*
|
||||
* CM Client found the instance running, but maybe it quit immediately after startup.
|
||||
@ -1522,34 +1553,26 @@ static void* check_cluster_start_status(void* arg)
|
||||
count++;
|
||||
if (count > INSTANCE_START_CONFIRM_TIME) {
|
||||
write_runlog(LOG, "start instance successfully.\n");
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
}
|
||||
} else if (g_dn_relation_start_status == CM_STATUS_NORMAL) {
|
||||
/* check whether the relation datanodes have been started successfully */
|
||||
write_runlog(LOG, "start relation datanodes successfully(node:%u, path:%s).\n",
|
||||
g_commandOperationNodeId, g_cmData);
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else if (g_resStartStatus == CM_STATUS_NORMAL) {
|
||||
write_runlog(LOG, "start resource instance successfully(nodeId:%u, instId:%u).\n",
|
||||
g_commandOperationNodeId, g_commandOperationInstanceId);
|
||||
removeStartingFile();
|
||||
exit(0);
|
||||
g_startExitCode = CM_START_EXIT_SUCCESS;
|
||||
continue;
|
||||
} else {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
(void)sleep(1);
|
||||
ContinueCheckClsStatus(&startingTime);
|
||||
write_runlog(LOG, ".");
|
||||
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &g_endTime);
|
||||
startingTime = (g_endTime.tv_sec - g_startTime.tv_sec);
|
||||
if (startingTime > EXPECTED_CLUSTER_START_TIME && startingTime % CLUSTER_STATE_CHECK_INTERVAL == 0) {
|
||||
write_runlog(DEBUG1, "starting exceeds 2 mins, instance status:g_cluster_start_status=%d,"
|
||||
"g_az_start_status=%d,g_node_start_status=%d,g_instance_start_status=%d\n",
|
||||
g_cluster_start_status, g_az_start_status, g_node_start_status, g_instance_start_status);
|
||||
}
|
||||
}
|
||||
|
||||
/* query cluster and report when start failed */
|
||||
@ -1596,7 +1619,10 @@ static void* check_cluster_start_status(void* arg)
|
||||
g_waitSeconds);
|
||||
}
|
||||
|
||||
removeStartingFile();
|
||||
g_startExitCode = CM_START_EXIT_FAILED;
|
||||
// wait start_check thread exit
|
||||
// cm_ctl maybe core if start_check thread not exit conn cm_server with openssl
|
||||
(void)sleep(6);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@ -1644,11 +1670,15 @@ static void StartFailQueryAndReport()
|
||||
return;
|
||||
}
|
||||
|
||||
static int start_check()
|
||||
static void* start_check(void* arg)
|
||||
{
|
||||
uint32 ii;
|
||||
|
||||
for (;;) {
|
||||
if (g_startExitCode != CM_START_EXIT_INIT) {
|
||||
write_runlog(DEBUG1, "start_check thread exit:%d\n", (int)g_startExitCode);
|
||||
break;
|
||||
}
|
||||
if (g_command_operation_azName == NULL && g_commandOperationNodeId == 0) {
|
||||
g_cluster_start_status = start_check_cluster();
|
||||
} else if (g_command_operation_azName != NULL) {
|
||||
@ -1665,7 +1695,8 @@ static int start_check()
|
||||
|
||||
if (ii >= g_node_num) {
|
||||
write_runlog(ERROR, "can't find the nodeid: %u\n", g_commandOperationNodeId);
|
||||
return 1;
|
||||
g_startExitCode = CM_START_EXIT_FAILED;
|
||||
break;
|
||||
}
|
||||
g_node_start_status = start_check_node(ii);
|
||||
} else if (g_commandRelationship) {
|
||||
@ -1679,7 +1710,8 @@ static int start_check()
|
||||
|
||||
if (ii >= g_node_num) {
|
||||
write_runlog(ERROR, "can't find the nodeid: %u\n", g_commandOperationNodeId);
|
||||
return 1;
|
||||
g_startExitCode = CM_START_EXIT_FAILED;
|
||||
break;
|
||||
}
|
||||
|
||||
g_instance_start_status = start_check_instance(ii, g_cmData);
|
||||
@ -1687,7 +1719,7 @@ static int start_check()
|
||||
|
||||
(void)sleep(1);
|
||||
}
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void NotifyCMSClusterStarting()
|
||||
@ -1704,18 +1736,6 @@ static void NotifyCMSClusterStarting()
|
||||
return;
|
||||
}
|
||||
|
||||
static bool IsTimeOut(const cmTime_t *lastTime, const char *str)
|
||||
{
|
||||
cmTime_t curTime = {0};
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &curTime);
|
||||
const long maxTimeInterval = 60;
|
||||
if (curTime.tv_sec - lastTime->tv_sec > maxTimeInterval) {
|
||||
write_runlog(DEBUG1, "%s this has timeout(%ld), it will exit.\n", str, maxTimeInterval);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static int start_check_cluster()
|
||||
{
|
||||
ctl_to_cm_query cm_ctl_cm_query_content;
|
||||
@ -1921,6 +1941,8 @@ static int start_check_node(uint32 node_id_check)
|
||||
CmServer_conn = NULL;
|
||||
return CM_STATUS_UNKNOWN;
|
||||
}
|
||||
struct timespec timeBegin = {0, 0};
|
||||
(void)clock_gettime(CLOCK_MONOTONIC, &timeBegin);
|
||||
for (;;) {
|
||||
ret = cm_client_flush_msg(CmServer_conn);
|
||||
if (ret == TCP_SOCKET_ERROR_EPIPE) {
|
||||
@ -1929,6 +1951,12 @@ static int start_check_node(uint32 node_id_check)
|
||||
return CM_STATUS_UNKNOWN;
|
||||
}
|
||||
|
||||
if (IsTimeOut(&timeBegin, "[start_check_node]")) {
|
||||
CMPQfinish(CmServer_conn);
|
||||
CmServer_conn = NULL;
|
||||
return CM_STATUS_UNKNOWN;
|
||||
}
|
||||
|
||||
receive_msg = recv_cm_server_cmd(CmServer_conn);
|
||||
while (receive_msg != NULL) {
|
||||
cm_msg_type_ptr = (cm_msg_type*)receive_msg;
|
||||
|
@ -312,6 +312,7 @@ int DoRhbPrint();
|
||||
int DoPause();
|
||||
int DoResume();
|
||||
bool CheckTrustAndNet();
|
||||
void removeStartingFile();
|
||||
bool IsTimeOut(const cmTime_t *lastTime, const char *str);
|
||||
void RemoveStartingFile();
|
||||
|
||||
#endif
|
||||
|
@ -62,6 +62,12 @@ typedef enum ClientErrorEn {
|
||||
CM_RES_CLIENT_NO_LOCK_OWNER = 6,
|
||||
} ClientError;
|
||||
|
||||
typedef enum StartExitCodeEn {
|
||||
CM_START_EXIT_FAILED = -1,
|
||||
CM_START_EXIT_SUCCESS = 0,
|
||||
CM_START_EXIT_INIT = 2,
|
||||
} StartExitCode;
|
||||
|
||||
typedef unsigned char bool8;
|
||||
|
||||
#define CMS_ONE_PRIMARY_ONE_STANDBY 2
|
||||
|
Loading…
x
Reference in New Issue
Block a user