From e0bf2f88f77a3755ca73e9fc539eaf673d2d5a2d Mon Sep 17 00:00:00 2001 From: dongning12 Date: Thu, 27 Jun 2024 11:37:28 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E8=B5=84=E6=BA=90=E6=B1=A0=E5=8C=96?= =?UTF-8?q?=E3=80=91=E3=80=90bugfix=E3=80=911.=E9=99=90=E5=88=B6switchover?= =?UTF-8?q?=E9=99=8D=E5=A4=87=E6=97=B6=E9=97=B4=EF=BC=8C10min=E5=AE=8C?= =?UTF-8?q?=E4=B8=8D=E6=88=90=E8=AE=A4=E4=B8=BA=E5=A4=B1=E8=B4=A5=EF=BC=9B?= =?UTF-8?q?2.=E5=A2=9E=E5=8A=A0=E5=AE=9A=E4=BD=8D=E6=89=8B=E6=AE=B5?= =?UTF-8?q?=EF=BC=8C=E6=89=BE=E5=87=BA=E9=99=8D=E5=A4=87=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?,=E5=A7=8B=E7=BB=88=E5=A4=84=E4=BA=8EPM=5FWAIT=5FBACKENDS?= =?UTF-8?q?=E7=9A=84=E5=8E=9F=E5=9B=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ddes/adapter/ss_dms_callback.cpp | 20 +++++++++++-- .../process/postmaster/postmaster.cpp | 30 ++++++++++++++++++- .../process/threadpool/knl_instance.cpp | 1 + src/include/ddes/dms/ss_dms_recovery.h | 1 + src/include/storage/pmsignal.h | 1 + 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index 1ed0dd5bb..45001113e 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -352,13 +352,27 @@ static inline void SSResetDemoteReqType(void) SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex); } -static void SSHandleReformFailDuringDemote(DemoteMode demote_mode) +static void SSHandleReformFailDuringDemote(bool timeout, DemoteMode demote_mode) { ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Failure in %s primary demote, pmState=%d, need reform rcy.", DemoteModeDesc(demote_mode), pmState))); + if (timeout) { + g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = false; + pg_memory_barrier(); + SendPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK); + const int WAIT_SIGNAL_HANDLED = 100; /* only wait 10s*/ + for (int ntries = 0; ntries < WAIT_SIGNAL_HANDLED; ntries++) { + if (g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled) { + break; + } + CHECK_FOR_INTERRUPTS(); + pg_usleep(100000L); /* wait 0.1 sec, then retry */ + } + } + /* * Shutdown checkpoint would cause concurrency as DMS is starting next round of reform. * If we allow ckpt to finish and recover, DMS would not be aware of the recovery process. @@ -413,10 +427,10 @@ static int CBSwitchoverDemote(void *db_handle) return DMS_SUCCESS; } else { if (ntries >= WAIT_DEMOTE || dms_reform_failed()) { - SSHandleReformFailDuringDemote(demote_mode); + bool timeout = ntries >= WAIT_DEMOTE ? true : false; + SSHandleReformFailDuringDemote(timeout, demote_mode); return DMS_ERROR; } - ntries = 0; } CHECK_FOR_INTERRUPTS(); diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 1878f0d34..16ae58749 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -10695,6 +10695,34 @@ static void sigusr1_handler(SIGNAL_ARGS) } } + if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK)) { + if (pmState == PM_WAIT_BACKENDS) { + int backend_count = SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC); + if (backend_count > 0) { + ereport(WARNING, (errmodule(MOD_DMS), + errmsg("[SS reform][SS switchover] demote fail reason: backends exist, backend_count:%d, " + "pmState:%d", + backend_count, pmState))); + } + + const int check_list_num = 4; + ThreadId check_list[check_list_num] = { + g_instance.pid_cxt.AshPID, + g_instance.pid_cxt.TwoPhaseCleanerPID, + g_instance.pid_cxt.StatementPID, + g_instance.pid_cxt.PercentilePID + }; + for (int i = 0; i < check_list_num; i++) { + if (check_list[i] != 0 ) { + ereport(WARNING, (errmodule(MOD_DMS), + errmsg("[SS reform][SS switchover] demote fail reason: thread name:%s exist, pid:%lu, pmState:%d", + GetProcName(check_list[i]), check_list[i], pmState))); + } + } + g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = true; + } + } + if (CheckPromoteSignal()) { handle_promote_signal(); } @@ -15429,7 +15457,7 @@ int SSCountAndPrintChildren(int target) cnt++; ereport(WARNING, (errmodule(MOD_DMS), - errmsg("[SS reform][SS failover] print thread no exiting, thread id:%lu, thread role:%d", + errmsg("[SS reform] print thread no exiting, thread id:%lu, thread role:%d", bp->pid, bp->role))); } diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index c1ba0b33d..d5358fe26 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -193,6 +193,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSReformInfo.redo_total_bytes = 0; dms_cxt->SSReformInfo.reform_ver = 0; dms_cxt->SSReformInfo.reform_ver_startup_wait = 0; + dms_cxt->SSReformInfo.switchover_demote_failure_signal_handled = false; dms_cxt->SSClusterState = NODESTATE_NORMAL; dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID; dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL; diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index f208b0116..b700340ce 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -88,6 +88,7 @@ typedef struct st_reform_info { bool is_hashmap_constructed; TimestampTz reform_ver; TimestampTz reform_ver_startup_wait; + bool switchover_demote_failure_signal_handled; } ss_reform_info_t; typedef enum st_failover_ckpt_status { diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 195af874f..6bad09513 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -57,6 +57,7 @@ typedef enum { PMSIGNAL_DMS_REFORM, /* dms reform start during PM_RUN */ PMSIGNAL_DMS_REFORM_DONE, /* dms reform done */ PMSIGNAL_DMS_TERM_STARTUP, /* term startup thread*/ + PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK, /* figure out why switchover demote failed */ NUM_PMSIGNALS /* Must be last value of enum! */ } PMSignalReason;