bugfix 修复switchover时checkpoint线程有概率core的问题
This commit is contained in:
@ -434,7 +434,7 @@ static int CBSwitchoverDemote(void *db_handle)
|
||||
if (pmState == PM_RUN && g_instance.dms_cxt.SSClusterState == NODESTATE_PROMOTE_APPROVE) {
|
||||
SSResetDemoteReqType();
|
||||
ereport(LOG,
|
||||
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as"
|
||||
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as "
|
||||
"standby, waiting for reformer setting new role.", DemoteModeDesc(demote_mode))));
|
||||
return DMS_SUCCESS;
|
||||
} else {
|
||||
@ -1809,6 +1809,7 @@ static void CBReformSetDmsRole(void *db_handle, unsigned int reformer_id)
|
||||
dms_role_t new_dms_role = reformer_id == (unsigned int)SS_MY_INST_ID ? DMS_ROLE_REFORMER : DMS_ROLE_PARTNER;
|
||||
if (new_dms_role == DMS_ROLE_REFORMER) {
|
||||
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] begin to set currrent DSS as primary")));
|
||||
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
|
||||
SSGrantDSSWritePermission();
|
||||
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING;
|
||||
}
|
||||
@ -1980,7 +1981,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt)
|
||||
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true;
|
||||
if (rs_cxt->role == DMS_ROLE_REFORMER) {
|
||||
g_instance.dms_cxt.dw_init = false;
|
||||
/* variable set order: SharedRecoveryInProgress -> failover_ckpt_status -> dms_role */
|
||||
/* variable set order: SharedRecoveryInProgress -> reform_ckpt_status -> dms_role */
|
||||
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
|
||||
SpinLockAcquire(&xlogctl->info_lck);
|
||||
xlogctl->IsRecoveryDone = false;
|
||||
@ -1988,7 +1989,7 @@ static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt)
|
||||
SpinLockRelease(&xlogctl->info_lck);
|
||||
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
|
||||
pg_memory_barrier();
|
||||
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ALLOW_CKPT;
|
||||
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
|
||||
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_FAILOVER_PROMOTING;
|
||||
|
||||
/*
|
||||
@ -2093,7 +2094,7 @@ static int CBReformDoneNotify(void *db_handle)
|
||||
/* SSClusterState and in_reform must be set atomically */
|
||||
g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false;
|
||||
g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false;
|
||||
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE;
|
||||
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE;
|
||||
Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false);
|
||||
g_instance.dms_cxt.SSReformInfo.new_bitmap = g_instance.dms_cxt.SSReformerControl.list_stable;
|
||||
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] new cluster node bitmap: %lu",
|
||||
|
||||
@ -199,7 +199,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt)
|
||||
dms_cxt->SSRecoveryInfo.recovery_inst_id = INVALID_INSTANCEID;
|
||||
dms_cxt->SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL;
|
||||
dms_cxt->SSRecoveryInfo.recovery_pause_flag = true;
|
||||
dms_cxt->SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE;
|
||||
dms_cxt->SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE;
|
||||
dms_cxt->SSRecoveryInfo.new_primary_reset_walbuf_flag = false;
|
||||
dms_cxt->SSRecoveryInfo.ready_to_startup = false;
|
||||
dms_cxt->SSRecoveryInfo.startup_reform = true;
|
||||
|
||||
@ -11107,8 +11107,10 @@ void StartupXLOG(void)
|
||||
}
|
||||
|
||||
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || !SSOndemandRecoveryExitNormal) {
|
||||
if (SS_STANDBY_FAILOVER) {
|
||||
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT;
|
||||
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
|
||||
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = ALLOW_CKPT;
|
||||
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] allow checkpoint execute.",
|
||||
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
|
||||
pg_memory_barrier();
|
||||
}
|
||||
if (!SS_IN_ONDEMAND_RECOVERY) {
|
||||
@ -11577,7 +11579,7 @@ bool RecoveryInProgress(void)
|
||||
*/
|
||||
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
|
||||
if (!ENABLE_DMS || (ENABLE_DMS && !SS_STANDBY_PROMOTING &&
|
||||
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ACTIVE)) {
|
||||
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ACTIVE)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -12231,14 +12233,22 @@ void CreateCheckPoint(int flags)
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
return;
|
||||
} else if (g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ALLOW_CKPT) {
|
||||
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] do not do CreateCheckpoint during failover")));
|
||||
} else if (g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status == NOT_ALLOW_CKPT) {
|
||||
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS %s] do not do CreateCheckpoint during %s.",
|
||||
SS_STANDBY_FAILOVER ? "failover" : "switchover",
|
||||
SS_STANDBY_FAILOVER ? "failover" : "switchover")));
|
||||
return;
|
||||
} else if (SS_IN_ONDEMAND_RECOVERY && !SS_ONDEMAND_REDO_DONE) {
|
||||
/* do not allow ckpt in ondemand recovery until xlog redo is done, to keep a valid ckpt loc in the control file */
|
||||
return;
|
||||
} else if (SS_PRIMARY_DEMOTED &&
|
||||
!(dms_reform_failed() || dms_reform_last_failed() || g_instance.dms_cxt.SSReformInfo.in_reform == false)) {
|
||||
ereport(LOG, (errmodule(MOD_DMS),
|
||||
errmsg("[SS reform][SS switchover] do not do CreateCheckpoint during primary demoted.")));
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* CHECKPOINT_IS_SHUTDOWN CHECKPOINT_END_OF_RECOVERY CHECKPOINT_FORCE should do full checkpoint */
|
||||
if (shutdown || ((unsigned int)flags & (CHECKPOINT_FORCE))) {
|
||||
doFullCheckpoint = true;
|
||||
|
||||
@ -100,11 +100,11 @@ typedef struct st_reform_info {
|
||||
bool switchover_demote_failure_signal_handled;
|
||||
} ss_reform_info_t;
|
||||
|
||||
typedef enum st_failover_ckpt_status {
|
||||
typedef enum st_reform_ckpt_status {
|
||||
NOT_ACTIVE = 0,
|
||||
NOT_ALLOW_CKPT,
|
||||
ALLOW_CKPT
|
||||
} failover_ckpt_status_t;
|
||||
} reform_ckpt_status_t;
|
||||
|
||||
typedef enum st_ondemand_realtime_build_status {
|
||||
DISABLED = 0,
|
||||
@ -151,9 +151,9 @@ typedef struct realtime_build_log_ctrl {
|
||||
|
||||
typedef struct ss_recovery_info {
|
||||
bool recovery_pause_flag;
|
||||
volatile failover_ckpt_status_t failover_ckpt_status;
|
||||
char recovery_xlog_dir[MAXPGPATH];
|
||||
int recovery_inst_id;
|
||||
volatile reform_ckpt_status_t reform_ckpt_status;
|
||||
volatile SSGlobalClusterState cluster_ondemand_status;
|
||||
char xlog_list[DMS_MAX_INSTANCE][MAXPGPATH];
|
||||
LWLock* update_seg_lock;
|
||||
|
||||
Reference in New Issue
Block a user