bugfix 修复switchover时checkpoint线程有概率core的问题

This commit is contained in:
congzhou2603
2024-12-13 11:21:20 +08:00
parent c3b0561a12
commit 6e2e33f756
4 changed files with 24 additions and 13 deletions

View File

@ -434,7 +434,7 @@ static int CBSwitchoverDemote(void *db_handle)
if (pmState == PM_RUN && g_instance.dms_cxt.SSClusterState == NODESTATE_PROMOTE_APPROVE) {
SSResetDemoteReqType();
ereport(LOG,
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as"
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as "
"standby, waiting for reformer setting new role.", DemoteModeDesc(demote_mode))));
return DMS_SUCCESS;
} else {
@ -1809,6 +1809,7 @@ static void CBReformSetDmsRole(void *db_handle, unsigned int reformer_id)
dms_role_t new_dms_role = reformer_id == (unsigned int)SS_MY_INST_ID ? DMS_ROLE_REFORMER : DMS_ROLE_PARTNER;
if (new_dms_role == DMS_ROLE_REFORMER) {
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] begin to set currrent DSS as primary")));
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
SSGrantDSSWritePermission();
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING;
}
@ -1980,7 +1981,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt)
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true;
if (rs_cxt->role == DMS_ROLE_REFORMER) {
g_instance.dms_cxt.dw_init = false;
/* variable set order: SharedRecoveryInProgress -> failover_ckpt_status -> dms_role */
/* variable set order: SharedRecoveryInProgress -> reform_ckpt_status -> dms_role */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->IsRecoveryDone = false;
@ -1988,7 +1989,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt)
SpinLockRelease(&xlogctl->info_lck);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
pg_memory_barrier();
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ALLOW_CKPT;
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_FAILOVER_PROMOTING;
/*
@ -2093,7 +2094,7 @@ static int CBReformDoneNotify(void *db_handle)
/* SSClusterState and in_reform must be set atomically */
g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false;
g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false;
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE;
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE;
Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false);
g_instance.dms_cxt.SSReformInfo.new_bitmap = g_instance.dms_cxt.SSReformerControl.list_stable;
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] new cluster node bitmap: %lu",

View File

@ -199,7 +199,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt)
dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID;
dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;
dms_cxt->SSRecoveryInfo.recovery_pause_flag = true;
dms_cxt->SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE;
dms_cxt->SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE;
dms_cxt->SSRecoveryInfo.new_primary_reset_walbuf_flag = false;
dms_cxt->SSRecoveryInfo.ready_to_startup = false;
dms_cxt->SSRecoveryInfo.startup_reform = true;

View File

@ -11107,8 +11107,10 @@ void StartupXLOG(void)
}
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || !SSOndemandRecoveryExitNormal) {
if (SS_STANDBY_FAILOVER) {
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT;
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = ALLOW_CKPT;
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] allow checkpoint execute.",
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
pg_memory_barrier();
}
if (!SS_IN_ONDEMAND_RECOVERY) {
@ -11577,7 +11579,7 @@ bool RecoveryInProgress(void)
*/
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
if (!ENABLE_DMS || (ENABLE_DMS && !SS_STANDBY_PROMOTING &&
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ACTIVE)) {
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ACTIVE)) {
return false;
}
}
@ -12231,14 +12233,22 @@ void CreateCheckPoint(int flags)
END_CRIT_SECTION();
}
return;
} else if (g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ALLOW_CKPT) {
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] do not do CreateCheckpoint during failover")));
} else if (g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ALLOW_CKPT) {
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] do not do CreateCheckpoint during %s.",
SS_STANDBY_FAILOVER ? "failover" : "switchover",
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
return;
} else if (SS_IN_ONDEMAND_RECOVERY && !SS_ONDEMAND_REDO_DONE) {
/* Do not allow a checkpoint during on-demand recovery until xlog redo is done, to keep a valid checkpoint location in the control file */
return;
} else if (SS_PRIMARY_DEMOTED &&
!(dms_reform_failed() || dms_reform_last_failed() || g_instance.dms_cxt.SSReformInfo.in_reform == false)) {
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS reform][SS switchover] do not do CreateCheckpoint during primary demoted.")));
return;
}
/* CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY and CHECKPOINT_FORCE should do a full checkpoint */
if (shutdown || ((unsigned int)flags & (CHECKPOINT_FORCE))) {
doFullCheckpoint = true;

View File

@ -100,11 +100,11 @@ typedef struct st_reform_info {
bool switchover_demote_failure_signal_handled;
} ss_reform_info_t;
typedef enum st_failover_ckpt_status {
typedef enum st_reform_ckpt_status {
NOT_ACTIVE = 0,
NOT_ALLOW_CKPT,
ALLOW_CKPT
} failover_ckpt_status_t;
} reform_ckpt_status_t;
typedef enum st_ondemand_realtime_build_status {
DISABLED = 0,
@ -151,9 +151,9 @@ typedef struct realtime_build_log_ctrl {
typedef struct ss_recovery_info {
bool recovery_pause_flag;
volatile failover_ckpt_status_t failover_ckpt_status;
char recovery_xlog_dir[MAXPGPATH];
int recovery_inst_id;
volatile reform_ckpt_status_t reform_ckpt_status;
volatile SSGlobalClusterState cluster_ondemand_status;
char xlog_list[DMS_MAX_INSTANCE][MAXPGPATH];
LWLock* update_seg_lock;