【资源池化】【bugfix】1. 限制 switchover 降备时间:10 分钟内未完成即判定为失败;2. 增加定位手段,用于找出降备失败时 pmState 始终停留在 PM_WAIT_BACKENDS 的原因
This commit is contained in:
@ -352,13 +352,27 @@ static inline void SSResetDemoteReqType(void)
|
||||
SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
||||
}
|
||||
|
||||
static void SSHandleReformFailDuringDemote(DemoteMode demote_mode)
|
||||
static void SSHandleReformFailDuringDemote(bool timeout, DemoteMode demote_mode)
|
||||
{
|
||||
ereport(WARNING,
|
||||
(errmodule(MOD_DMS),
|
||||
errmsg("[SS reform][SS switchover] Failure in %s primary demote, pmState=%d, need reform rcy.",
|
||||
DemoteModeDesc(demote_mode), pmState)));
|
||||
|
||||
if (timeout) {
|
||||
g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = false;
|
||||
pg_memory_barrier();
|
||||
SendPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK);
|
||||
const int WAIT_SIGNAL_HANDLED = 100; /* only wait 10s*/
|
||||
for (int ntries = 0; ntries < WAIT_SIGNAL_HANDLED; ntries++) {
|
||||
if (g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled) {
|
||||
break;
|
||||
}
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
pg_usleep(100000L); /* wait 0.1 sec, then retry */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Shutdown checkpoint would cause concurrency as DMS is starting next round of reform.
|
||||
* If we allow ckpt to finish and recover, DMS would not be aware of the recovery process.
|
||||
@ -413,10 +427,10 @@ static int CBSwitchoverDemote(void *db_handle)
|
||||
return DMS_SUCCESS;
|
||||
} else {
|
||||
if (ntries >= WAIT_DEMOTE || dms_reform_failed()) {
|
||||
SSHandleReformFailDuringDemote(demote_mode);
|
||||
bool timeout = ntries >= WAIT_DEMOTE ? true : false;
|
||||
SSHandleReformFailDuringDemote(timeout, demote_mode);
|
||||
return DMS_ERROR;
|
||||
}
|
||||
ntries = 0;
|
||||
}
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
@ -10695,6 +10695,34 @@ static void sigusr1_handler(SIGNAL_ARGS)
|
||||
}
|
||||
}
|
||||
|
||||
if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK)) {
|
||||
if (pmState == PM_WAIT_BACKENDS) {
|
||||
int backend_count = SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC);
|
||||
if (backend_count > 0) {
|
||||
ereport(WARNING, (errmodule(MOD_DMS),
|
||||
errmsg("[SS reform][SS switchover] demote fail reason: backends exist, backend_count:%d, "
|
||||
"pmState:%d",
|
||||
backend_count, pmState)));
|
||||
}
|
||||
|
||||
const int check_list_num = 4;
|
||||
ThreadId check_list[check_list_num] = {
|
||||
g_instance.pid_cxt.AshPID,
|
||||
g_instance.pid_cxt.TwoPhaseCleanerPID,
|
||||
g_instance.pid_cxt.StatementPID,
|
||||
g_instance.pid_cxt.PercentilePID
|
||||
};
|
||||
for (int i = 0; i < check_list_num; i++) {
|
||||
if (check_list[i] != 0 ) {
|
||||
ereport(WARNING, (errmodule(MOD_DMS),
|
||||
errmsg("[SS reform][SS switchover] demote fail reason: thread name:%s exist, pid:%lu, pmState:%d",
|
||||
GetProcName(check_list[i]), check_list[i], pmState)));
|
||||
}
|
||||
}
|
||||
g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (CheckPromoteSignal()) {
|
||||
handle_promote_signal();
|
||||
}
|
||||
@ -15429,7 +15457,7 @@ int SSCountAndPrintChildren(int target)
|
||||
|
||||
cnt++;
|
||||
ereport(WARNING, (errmodule(MOD_DMS),
|
||||
errmsg("[SS reform][SS failover] print thread no exiting, thread id:%lu, thread role:%d",
|
||||
errmsg("[SS reform] print thread no exiting, thread id:%lu, thread role:%d",
|
||||
bp->pid, bp->role)));
|
||||
}
|
||||
|
||||
|
||||
@ -193,6 +193,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt)
|
||||
dms_cxt->SSReformInfo.redo_total_bytes = 0;
|
||||
dms_cxt->SSReformInfo.reform_ver = 0;
|
||||
dms_cxt->SSReformInfo.reform_ver_startup_wait = 0;
|
||||
dms_cxt->SSReformInfo.switchover_demote_failure_signal_handled = false;
|
||||
dms_cxt->SSClusterState = NODESTATE_NORMAL;
|
||||
dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID;
|
||||
dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;
|
||||
|
||||
@ -88,6 +88,7 @@ typedef struct st_reform_info {
|
||||
bool is_hashmap_constructed;
|
||||
TimestampTz reform_ver;
|
||||
TimestampTz reform_ver_startup_wait;
|
||||
bool switchover_demote_failure_signal_handled;
|
||||
} ss_reform_info_t;
|
||||
|
||||
typedef enum st_failover_ckpt_status {
|
||||
|
||||
@ -57,6 +57,7 @@ typedef enum {
|
||||
PMSIGNAL_DMS_REFORM, /* dms reform start during PM_RUN */
|
||||
PMSIGNAL_DMS_REFORM_DONE, /* dms reform done */
|
||||
PMSIGNAL_DMS_TERM_STARTUP, /* term startup thread*/
|
||||
PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK, /* figure out why switchover demote failed */
|
||||
NUM_PMSIGNALS /* Must be last value of enum! */
|
||||
} PMSignalReason;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user