[fix] fix committed_end_lsn not being advanced after removing a member

This commit is contained in:
BinChenn
2023-09-15 02:40:16 +00:00
committed by ob-robot
parent 14f411060e
commit eeacd9c6bc
5 changed files with 103 additions and 33 deletions

View File

@ -437,8 +437,10 @@ int LogConfigMgr::get_alive_member_list_with_arb(
// require rlock of PalfHandleImpl
int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
ObMemberList &member_list,
int64_t &replica_num,
ObMemberList &prev_member_list,
int64_t &prev_replica_num,
ObMemberList &curr_member_list,
int64_t &curr_replica_num,
bool &is_before_barrier,
LSN &barrier_lsn) const
{
@ -451,6 +453,9 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
} else if (OB_FAIL(curr_member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else if (FALSE_IT(curr_replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_)) {
} else if (OB_UNLIKELY(prev_committed_end_lsn < reconfig_barrier_.prev_end_lsn_ &&
reconfig_barrier_.prev_end_lsn_.is_valid() &&
prev_mode_pid == reconfig_barrier_.prev_mode_pid_)) {
@ -465,16 +470,12 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
// be used only when reconfig_barrier_.prev_mode_pid_ is equal to the current mode
// proposal_id. That means the access mode hasn't been changed (PALF hasn't been flashed back)
// since last reconfiguration.
if (OB_FAIL(member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
if (OB_FAIL(prev_member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
prev_replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
}
} else if (OB_FAIL(member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_;
}
} else { }
return ret;
}

View File

@ -406,8 +406,10 @@ public:
// return OB_SUCCESS if success
// else return other errno
virtual int get_log_sync_member_list_for_generate_committed_lsn(
ObMemberList &member_list,
int64_t &replica_num,
ObMemberList &prev_member_list,
int64_t &prev_replica_num,
ObMemberList &curr_member_list,
int64_t &curr_replica_num,
bool &is_before_barrier,
LSN &barrier_lsn) const;
virtual int get_arbitration_member(common::ObMember &arb_member) const;
@ -498,11 +500,12 @@ public:
SpinLockGuard guard(lock_);
int64_t pos = 0;
J_OBJ_START();
J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \
K_(log_ms_meta), K_(checking_barrier), K_(reconfig_barrier), K_(persistent_config_version), \
K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \
K_(last_submit_config_log_time_us), K_(region), K_(paxos_member_region_map), \
K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \
J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \
K_(log_ms_meta), K_(running_args), K_(state), K_(checking_barrier), K_(reconfig_barrier), \
K_(persistent_config_version), K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \
K_(last_submit_config_log_time_us), K_(need_change_config_bkgd), K_(bkgd_config_version), \
K_(region), K_(paxos_member_region_map), \
K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \
K_(last_submit_register_req_time_us), K_(children), K_(last_submit_keepalive_time_us), KP(this));
J_OBJ_END();
return pos;

View File

@ -2598,27 +2598,31 @@ int64_t LogSlidingWindow::get_start_id() const
int LogSlidingWindow::gen_committed_end_lsn_(LSN &new_committed_end_lsn)
{
int ret = OB_SUCCESS;
ObMemberList member_list;
int64_t replica_num = 0;
LSN result_lsn;
ObMemberList curr_member_list, prev_member_list;
int64_t curr_replica_num = 0, prev_replica_num = 0;
LSN curr_result_lsn, prev_result_lsn;
bool is_before_barrier = false;
LSN barrier_lsn;
if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(member_list,
replica_num, is_before_barrier, barrier_lsn))) {
if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(prev_member_list,
prev_replica_num, curr_member_list, curr_replica_num, is_before_barrier, barrier_lsn))) {
PALF_LOG(WARN, "get_log_sync_member_list failed", K(ret), K_(palf_id), K_(self));
} else if (OB_FAIL(get_majority_lsn_(member_list, replica_num, result_lsn))) {
} else if (OB_FAIL(get_majority_lsn_(curr_member_list, curr_replica_num, curr_result_lsn))) {
PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self));
} else if (OB_UNLIKELY(true == is_before_barrier) &&
OB_FAIL(get_majority_lsn_(prev_member_list, prev_replica_num, prev_result_lsn))) {
PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self));
} else {
// Note: the leader generates committed_end_lsn based on different memberlists before
// and after a reconfiguration, barrier_lsn is the boundary.
// - Logs which are before barrier_lsn should be committed by previous memberlist.
// - Logs which are before barrier_lsn could be committed by previous/current memberlist.
LSN result_lsn = (OB_UNLIKELY(is_before_barrier))? MAX(prev_result_lsn, curr_result_lsn): curr_result_lsn;
// - Logs which are after barrier_lsn should be committed by current memberlist.
// - If current committed_end_lsn is smaller than barrier_lsn, then new committed_end_lsn
// generated by previous memberlist must be smaller than barrier_lsn.
// For example, memberlist:{A} + replica:B. After adding B successfully, Logs after the
// barrier may have been persisted by A, but not B. The leader A can not commit logs after
// the barrier with memberlist:{A}.
result_lsn = (is_before_barrier)? MIN(result_lsn, barrier_lsn): result_lsn;
result_lsn = (OB_UNLIKELY(is_before_barrier))? MIN(result_lsn, barrier_lsn): result_lsn;
// Note: The leader is not allowed to generate new committed_end_lsn while changing configs with arb.
// 1. {A, B, C(arb)}, A is the leader, end_lsns of A and B are both 100.
// 2. B crashes and A decides to degrade B to a learner, A changes memberlist to {A, C(arb)} and