diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index 1ed0dd5bb..45001113e 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -352,13 +352,27 @@ static inline void SSResetDemoteReqType(void) SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex); } -static void SSHandleReformFailDuringDemote(DemoteMode demote_mode) +static void SSHandleReformFailDuringDemote(bool timeout, DemoteMode demote_mode) { ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Failure in %s primary demote, pmState=%d, need reform rcy.", DemoteModeDesc(demote_mode), pmState))); + if (timeout) { + g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = false; + pg_memory_barrier(); + SendPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK); + const int WAIT_SIGNAL_HANDLED = 100; /* only wait 10s*/ + for (int ntries = 0; ntries < WAIT_SIGNAL_HANDLED; ntries++) { + if (g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled) { + break; + } + CHECK_FOR_INTERRUPTS(); + pg_usleep(100000L); /* wait 0.1 sec, then retry */ + } + } + /* * Shutdown checkpoint would cause concurrency as DMS is starting next round of reform. * If we allow ckpt to finish and recover, DMS would not be aware of the recovery process. @@ -413,10 +427,10 @@ static int CBSwitchoverDemote(void *db_handle) return DMS_SUCCESS; } else { if (ntries >= WAIT_DEMOTE || dms_reform_failed()) { - SSHandleReformFailDuringDemote(demote_mode); + bool timeout = ntries >= WAIT_DEMOTE ? true : false; + SSHandleReformFailDuringDemote(timeout, demote_mode); return DMS_ERROR; } - ntries = 0; } CHECK_FOR_INTERRUPTS(); diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 1878f0d34..16ae58749 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -10695,6 +10695,34 @@ static void sigusr1_handler(SIGNAL_ARGS) } } + if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK)) { + if (pmState == PM_WAIT_BACKENDS) { + int backend_count = SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC); + if (backend_count > 0) { + ereport(WARNING, (errmodule(MOD_DMS), + errmsg("[SS reform][SS switchover] demote fail reason: backends exist, backend_count:%d, " + "pmState:%d", + backend_count, pmState))); + } + + const int check_list_num = 4; + ThreadId check_list[check_list_num] = { + g_instance.pid_cxt.AshPID, + g_instance.pid_cxt.TwoPhaseCleanerPID, + g_instance.pid_cxt.StatementPID, + g_instance.pid_cxt.PercentilePID + }; + for (int i = 0; i < check_list_num; i++) { + if (check_list[i] != 0 ) { + ereport(WARNING, (errmodule(MOD_DMS), + errmsg("[SS reform][SS switchover] demote fail reason: thread name:%s exist, pid:%lu, pmState:%d", + GetProcName(check_list[i]), check_list[i], pmState))); + } + } + g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = true; + } + } + if (CheckPromoteSignal()) { handle_promote_signal(); } @@ -15429,7 +15457,7 @@ int SSCountAndPrintChildren(int target) cnt++; ereport(WARNING, (errmodule(MOD_DMS), - errmsg("[SS reform][SS failover] print thread no exiting, thread id:%lu, thread role:%d", + errmsg("[SS reform] print thread no exiting, thread id:%lu, thread role:%d", bp->pid, bp->role))); } diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index c1ba0b33d..d5358fe26 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -193,6 +193,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSReformInfo.redo_total_bytes = 0; dms_cxt->SSReformInfo.reform_ver = 0; dms_cxt->SSReformInfo.reform_ver_startup_wait = 0; + dms_cxt->SSReformInfo.switchover_demote_failure_signal_handled = false; dms_cxt->SSClusterState = NODESTATE_NORMAL; dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID; dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL; diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index f208b0116..b700340ce 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -88,6 +88,7 @@ typedef struct st_reform_info { bool is_hashmap_constructed; TimestampTz reform_ver; TimestampTz reform_ver_startup_wait; + bool switchover_demote_failure_signal_handled; } ss_reform_info_t; typedef enum st_failover_ckpt_status { diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 195af874f..6bad09513 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -57,6 +57,7 @@ typedef enum { PMSIGNAL_DMS_REFORM, /* dms reform start during PM_RUN */ PMSIGNAL_DMS_REFORM_DONE, /* dms reform done */ PMSIGNAL_DMS_TERM_STARTUP, /* term startup thread*/ + PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK, /* figure out why switchover demote failed */ NUM_PMSIGNALS /* Must be last value of enum! */ } PMSignalReason;