【资源池化】【bugfix】1.限制switchover降备时间,10min完不成认为失败;2.增加定位手段,找出降备失败,始终处于PM_WAIT_BACKENDS的原因

This commit is contained in:
dongning12
2024-06-27 11:37:28 +08:00
parent ca44bc145f
commit e0bf2f88f7
5 changed files with 49 additions and 4 deletions

View File

@ -352,13 +352,27 @@ static inline void SSResetDemoteReqType(void)
SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex);
}
static void SSHandleReformFailDuringDemote(DemoteMode demote_mode)
static void SSHandleReformFailDuringDemote(bool timeout, DemoteMode demote_mode)
{
ereport(WARNING,
(errmodule(MOD_DMS),
errmsg("[SS reform][SS switchover] Failure in %s primary demote, pmState=%d, need reform rcy.",
DemoteModeDesc(demote_mode), pmState)));
/*
 * Timeout path only: ask the postmaster to log why the demote is stuck, and
 * give it a bounded amount of time to do so before continuing.
 */
if (timeout) {
/* Clear the acknowledgement flag before raising the signal. */
g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = false;
/* Barrier between the flag reset above and the signal sent below. */
pg_memory_barrier();
SendPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK);
const int WAIT_SIGNAL_HANDLED = 100; /* 100 polls x 0.1 s each = wait at most 10 s */
/*
 * Poll until the flag is set by the signal handler. If it is never set,
 * the loop simply gives up once the bound is reached and execution
 * continues with the rest of the failure handling.
 */
for (int ntries = 0; ntries < WAIT_SIGNAL_HANDLED; ntries++) {
if (g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled) {
break;
}
CHECK_FOR_INTERRUPTS();
pg_usleep(100000L); /* wait 0.1 sec, then retry */
}
}
/*
* Shutdown checkpoint would cause concurrency as DMS is starting next round of reform.
* If we allow ckpt to finish and recover, DMS would not be aware of the recovery process.
@ -413,10 +427,10 @@ static int CBSwitchoverDemote(void *db_handle)
return DMS_SUCCESS;
} else {
if (ntries >= WAIT_DEMOTE || dms_reform_failed()) {
SSHandleReformFailDuringDemote(demote_mode);
bool timeout = ntries >= WAIT_DEMOTE ? true : false;
SSHandleReformFailDuringDemote(timeout, demote_mode);
return DMS_ERROR;
}
ntries = 0;
}
CHECK_FOR_INTERRUPTS();

View File

@ -10695,6 +10695,34 @@ static void sigusr1_handler(SIGNAL_ARGS)
}
}
/*
 * Diagnostic request raised when a switchover demote timed out: report what
 * is still alive while the postmaster sits in PM_WAIT_BACKENDS, then
 * acknowledge the request so the requester can stop polling.
 */
if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK)) {
    if (pmState == PM_WAIT_BACKENDS) {
        /* Count (and let the helper print) remaining normal/autovacuum children. */
        int backend_count = SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC);
        if (backend_count > 0) {
            ereport(WARNING, (errmodule(MOD_DMS),
                errmsg("[SS reform][SS switchover] demote fail reason: backends exist, backend_count:%d, "
                       "pmState:%d",
                       backend_count, pmState)));
        }

        /* Auxiliary threads whose non-zero pid is reported as a possible blocker. */
        const int check_list_num = 4;
        ThreadId check_list[check_list_num] = {
            g_instance.pid_cxt.AshPID,
            g_instance.pid_cxt.TwoPhaseCleanerPID,
            g_instance.pid_cxt.StatementPID,
            g_instance.pid_cxt.PercentilePID
        };

        for (int i = 0; i < check_list_num; i++) {
            if (check_list[i] != 0) {
                ereport(WARNING, (errmodule(MOD_DMS),
                    errmsg("[SS reform][SS switchover] demote fail reason: thread name:%s exist, pid:%lu, pmState:%d",
                           GetProcName(check_list[i]), check_list[i], pmState)));
            }
        }
    }

    /*
     * Acknowledge the request regardless of pmState. The requester polls this
     * flag for a bounded time after sending the signal; setting it only inside
     * the PM_WAIT_BACKENDS branch would make the requester wait out the whole
     * bound whenever the postmaster has already moved to another state and
     * there is nothing to report.
     */
    g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = true;
}
if (CheckPromoteSignal()) {
handle_promote_signal();
}
@ -15429,7 +15457,7 @@ int SSCountAndPrintChildren(int target)
cnt++;
ereport(WARNING, (errmodule(MOD_DMS),
errmsg("[SS reform][SS failover] print thread no exiting, thread id:%lu, thread role:%d",
errmsg("[SS reform] print thread no exiting, thread id:%lu, thread role:%d",
bp->pid, bp->role)));
}

View File

@ -193,6 +193,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt)
dms_cxt->SSReformInfo.redo_total_bytes = 0;
dms_cxt->SSReformInfo.reform_ver = 0;
dms_cxt->SSReformInfo.reform_ver_startup_wait = 0;
dms_cxt->SSReformInfo.switchover_demote_failure_signal_handled = false;
dms_cxt->SSClusterState = NODESTATE_NORMAL;
dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID;
dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;

View File

@ -88,6 +88,7 @@ typedef struct st_reform_info {
bool is_hashmap_constructed;
TimestampTz reform_ver;
TimestampTz reform_ver_startup_wait;
bool switchover_demote_failure_signal_handled;
} ss_reform_info_t;
typedef enum st_failover_ckpt_status {

View File

@ -57,6 +57,7 @@ typedef enum {
PMSIGNAL_DMS_REFORM, /* dms reform start during PM_RUN */
PMSIGNAL_DMS_REFORM_DONE, /* dms reform done */
PMSIGNAL_DMS_TERM_STARTUP, /* term startup thread*/
PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK, /* figure out why switchover demote failed */
NUM_PMSIGNALS /* Must be last value of enum! */
} PMSignalReason;