From 6e2e33f756509f0fcdee06d80277f05f82daa536 Mon Sep 17 00:00:00 2001 From: congzhou2603 Date: Fri, 13 Dec 2024 11:21:20 +0800 Subject: [PATCH] =?UTF-8?q?bugfix=20=E4=BF=AE=E5=A4=8Dswitchover=E6=97=B6c?= =?UTF-8?q?heckpoint=E7=BA=BF=E7=A8=8B=E6=9C=89=E6=A6=82=E7=8E=87core?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ddes/adapter/ss_dms_callback.cpp | 9 +++++---- .../process/threadpool/knl_instance.cpp | 2 +- .../storage/access/transam/xlog.cpp | 20 ++++++++++++++----- src/include/ddes/dms/ss_dms_recovery.h | 6 +++--- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index e01bcc916..ae745d9b6 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -434,7 +434,7 @@ static int CBSwitchoverDemote(void *db_handle) if (pmState == PM_RUN && g_instance.dms_cxt.SSClusterState == NODESTATE_PROMOTE_APPROVE) { SSResetDemoteReqType(); ereport(LOG, - (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as" + (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as " "standby, waiting for reformer setting new role.", DemoteModeDesc(demote_mode)))); return DMS_SUCCESS; } else { @@ -1809,6 +1809,7 @@ static void CBReformSetDmsRole(void *db_handle, unsigned int reformer_id) dms_role_t new_dms_role = reformer_id == (unsigned int)SS_MY_INST_ID ? DMS_ROLE_REFORMER : DMS_ROLE_PARTNER; if (new_dms_role == DMS_ROLE_REFORMER) { ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] begin to set currrent DSS as primary"))); + g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT; SSGrantDSSWritePermission(); g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING; } @@ -1980,7 +1981,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt) g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true; if (rs_cxt->role == DMS_ROLE_REFORMER) { g_instance.dms_cxt.dw_init = false; - /* variable set order: SharedRecoveryInProgress -> failover_ckpt_status -> dms_role */ + /* variable set order: SharedRecoveryInProgress -> reform_ckpt_status -> dms_role */ volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; SpinLockAcquire(&xlogctl->info_lck); xlogctl->IsRecoveryDone = false; @@ -1988,7 +1989,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt) SpinLockRelease(&xlogctl->info_lck); t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY; pg_memory_barrier(); - g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ALLOW_CKPT; + g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT; g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_FAILOVER_PROMOTING; /* @@ -2093,7 +2094,7 @@ static int CBReformDoneNotify(void *db_handle) /* SSClusterState and in_reform must be set atomically */ g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false; g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false; - g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE; + g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE; Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false); g_instance.dms_cxt.SSReformInfo.new_bitmap = g_instance.dms_cxt.SSReformerControl.list_stable; ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] new cluster node bitmap: %lu", diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index 42fa68f79..9790581f4 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -199,7 +199,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID; dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL; dms_cxt->SSRecoveryInfo.recovery_pause_flag = true; - dms_cxt->SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE; + dms_cxt->SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE; dms_cxt->SSRecoveryInfo.new_primary_reset_walbuf_flag = false; dms_cxt->SSRecoveryInfo.ready_to_startup = false; dms_cxt->SSRecoveryInfo.startup_reform = true; diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index 03b24ba5a..6bce82b74 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -11107,8 +11107,10 @@ void StartupXLOG(void) } if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || !SSOndemandRecoveryExitNormal) { - if (SS_STANDBY_FAILOVER) { - g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT; + if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { + g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = ALLOW_CKPT; + ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] allow checkpoint execute.", + SS_STANDBY_FAILOVER ? "failover" : "switchover"))); pg_memory_barrier(); } if (!SS_IN_ONDEMAND_RECOVERY) { @@ -11577,7 +11579,7 @@ bool RecoveryInProgress(void) */ if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) { if (!ENABLE_DMS || (ENABLE_DMS && !SS_STANDBY_PROMOTING && - g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ACTIVE)) { + g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ACTIVE)) { return false; } } @@ -12231,14 +12233,22 @@ void CreateCheckPoint(int flags) END_CRIT_SECTION(); } return; - } else if (g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ALLOW_CKPT) { - ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] do not do CreateCheckpoint during failover"))); + } else if (g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ALLOW_CKPT) { + ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] do not do CreateCheckpoint during %s.", + SS_STANDBY_FAILOVER ? "failover" : "switchover", + SS_STANDBY_FAILOVER ? "failover" : "switchover"))); return; } else if (SS_IN_ONDEMAND_RECOVERY && !SS_ONDEMAND_REDO_DONE) { /* do not allow ckpt in ondemand recovery if xlog do not redo done, for valid ckpt loc in control file */ return; + } else if (SS_PRIMARY_DEMOTED && + !(dms_reform_failed() || dms_reform_last_failed() || g_instance.dms_cxt.SSReformInfo.in_reform == false)) { + ereport(LOG, (errmodule(MOD_DMS), + errmsg("[SS reform][SS switchover] do not do CreateCheckpoint during primary demoted."))); + return; } + /* CHECKPOINT_IS_SHUTDOWN CHECKPOINT_END_OF_RECOVERY CHECKPOINT_FORCE shuld do full checkpoint */ if (shutdown || ((unsigned int)flags & (CHECKPOINT_FORCE))) { doFullCheckpoint = true; diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index 8cd55a0ab..727e1b7f6 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -100,11 +100,11 @@ typedef struct st_reform_info { bool switchover_demote_failure_signal_handled; } ss_reform_info_t; -typedef enum st_failover_ckpt_status { +typedef enum st_reform_ckpt_status { NOT_ACTIVE = 0, NOT_ALLOW_CKPT, ALLOW_CKPT -} failover_ckpt_status_t; +} reform_ckpt_status_t; typedef enum st_ondemand_realtime_build_status { DISABLED = 0, @@ -151,9 +151,9 @@ typedef struct realtime_build_log_ctrl { typedef struct ss_recovery_info { bool recovery_pause_flag; - volatile failover_ckpt_status_t failover_ckpt_status; char recovery_xlog_dir[MAXPGPATH]; int recovery_inst_id; + volatile reform_ckpt_status_t reform_ckpt_status; volatile SSGlobalClusterState cluster_ondemand_status; char xlog_list[DMS_MAX_INSTANCE][MAXPGPATH]; LWLock* update_seg_lock;