[fix] fix the committed_end_lsn can not be advanced after removing member

This commit is contained in:
BinChenn 2023-09-15 02:40:16 +00:00 committed by ob-robot
parent 14f411060e
commit eeacd9c6bc
5 changed files with 103 additions and 33 deletions

View File

@ -362,6 +362,67 @@ TEST_F(TestObSimpleLogClusterConfigChangeMockEle, test_remove_if_another_rebuild
PALF_LOG(INFO, "end test test_committed_end_lsn_after_removing_member", K(id));
}
// 1. (ABCD), A is the leader, D crashed
// 2. remove C from member list (ABCD) and match_lsn_map, committed_end_lsn is 100 and last_submit_end_lsn is 200.
// 2. because committed_end_lsn is smaller than last_submit_end_lsn, the leader will use prev_member_list (ABCD) to generate committed_end_lsn
// 3. C is not in match_lsn_map, D crashed, the leader may can not generate committed_end_lsn
TEST_F(TestObSimpleLogClusterConfigChangeMockEle, test_committed_end_lsn_after_removing_member2)
{
int ret = OB_SUCCESS;
const int64_t id = ATOMIC_AAF(&palf_id_, 1);
const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s
SET_CASE_LOG_FILE(TEST_NAME, "test_committed_end_lsn_after_removing_member");
PALF_LOG(INFO, "begin test test_committed_end_lsn_after_removing_member", K(id));
{
int64_t leader_idx = 0;
PalfHandleImplGuard leader;
std::vector<PalfHandleImplGuard*> palf_list;
EXPECT_EQ(OB_SUCCESS, create_paxos_group_with_mock_election(id, leader_idx, leader));
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 200, id));
EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list));
const int64_t b_idx = (leader_idx + 1) % 4;
const int64_t c_idx = (leader_idx + 2) % 4;
const int64_t d_idx = (leader_idx + 3) % 4;
const common::ObAddr b_addr = get_cluster()[b_idx]->get_addr();
const common::ObAddr c_addr = get_cluster()[c_idx]->get_addr();
const common::ObAddr d_addr = get_cluster()[d_idx]->get_addr();
PalfHandleImplGuard *a_handle = palf_list[leader_idx];
PalfHandleImplGuard *b_handle = palf_list[b_idx];
PalfHandleImplGuard *c_handle = palf_list[c_idx];
PalfHandleImplGuard *d_handle = palf_list[d_idx];
LogConfigVersion config_version;
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version));
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_member(common::ObMember(d_addr, 1), 4, config_version, CONFIG_CHANGE_TIMEOUT));
// 1. D crashed
block_all_net(d_idx);
// 2. leader can not commit logs
block_pcode(leader_idx, ObRpcPacketCode::OB_LOG_PUSH_RESP);
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id));
sleep(1);
EXPECT_GT(leader.palf_handle_impl_->sw_.last_submit_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_);
// 3. remove C
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(common::ObMember(c_addr, 1), 3, CONFIG_CHANGE_TIMEOUT));
EXPECT_GT(leader.palf_handle_impl_->sw_.last_submit_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_);
EXPECT_EQ(leader.palf_handle_impl_->config_mgr_.reconfig_barrier_.prev_end_lsn_, leader.palf_handle_impl_->sw_.last_submit_end_lsn_);
EXPECT_GT(leader.palf_handle_impl_->config_mgr_.reconfig_barrier_.prev_end_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_);
// 3. leader can commit logs
unblock_pcode(leader_idx, ObRpcPacketCode::OB_LOG_PUSH_RESP);
// 4. check if the leader can commit logs after C has been removed from match_lsn_map
EXPECT_UNTIL_EQ(leader.palf_handle_impl_->sw_.committed_end_lsn_, leader.palf_handle_impl_->sw_.last_submit_end_lsn_);
leader.reset();
revert_cluster_palf_handle_guard(palf_list);
}
delete_paxos_group(id);
PALF_LOG(INFO, "end test test_committed_end_lsn_after_removing_member", K(id));
}
} // end unittest
} // end oceanbase

View File

@ -437,8 +437,10 @@ int LogConfigMgr::get_alive_member_list_with_arb(
// require rlock of PalfHandleImpl
int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
ObMemberList &member_list,
int64_t &replica_num,
ObMemberList &prev_member_list,
int64_t &prev_replica_num,
ObMemberList &curr_member_list,
int64_t &curr_replica_num,
bool &is_before_barrier,
LSN &barrier_lsn) const
{
@ -451,6 +453,9 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
} else if (OB_FAIL(curr_member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else if (FALSE_IT(curr_replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_)) {
} else if (OB_UNLIKELY(prev_committed_end_lsn < reconfig_barrier_.prev_end_lsn_ &&
reconfig_barrier_.prev_end_lsn_.is_valid() &&
prev_mode_pid == reconfig_barrier_.prev_mode_pid_)) {
@ -465,16 +470,12 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn(
// be used only when the reconfir_barrier_.prev_mode_pid_ is equal to current mode
// proposal_id. That means access mode hasn’t been changed (PALF hasn’t been flashed back)
// since last reconfiguration.
if (OB_FAIL(member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
if (OB_FAIL(prev_member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
prev_replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
}
} else if (OB_FAIL(member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_;
}
} else { }
return ret;
}

View File

@ -406,8 +406,10 @@ public:
// return OB_SUCCESS if success
// else return other errno
virtual int get_log_sync_member_list_for_generate_committed_lsn(
ObMemberList &member_list,
int64_t &replica_num,
ObMemberList &prev_member_list,
int64_t &prev_replica_num,
ObMemberList &curr_member_list,
int64_t &curr_replica_num,
bool &is_before_barrier,
LSN &barrier_lsn) const;
virtual int get_arbitration_member(common::ObMember &arb_member) const;
@ -498,11 +500,12 @@ public:
SpinLockGuard guard(lock_);
int64_t pos = 0;
J_OBJ_START();
J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \
K_(log_ms_meta), K_(checking_barrier), K_(reconfig_barrier), K_(persistent_config_version), \
K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \
K_(last_submit_config_log_time_us), K_(region), K_(paxos_member_region_map), \
K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \
J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \
K_(log_ms_meta), K_(running_args), K_(state), K_(checking_barrier), K_(reconfig_barrier), \
K_(persistent_config_version), K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \
K_(last_submit_config_log_time_us), K_(need_change_config_bkgd), K_(bkgd_config_version), \
K_(region), K_(paxos_member_region_map), \
K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \
K_(last_submit_register_req_time_us), K_(children), K_(last_submit_keepalive_time_us), KP(this));
J_OBJ_END();
return pos;

View File

@ -2598,27 +2598,31 @@ int64_t LogSlidingWindow::get_start_id() const
int LogSlidingWindow::gen_committed_end_lsn_(LSN &new_committed_end_lsn)
{
int ret = OB_SUCCESS;
ObMemberList member_list;
int64_t replica_num = 0;
LSN result_lsn;
ObMemberList curr_member_list, prev_member_list;
int64_t curr_replica_num = 0, prev_replica_num = 0;
LSN curr_result_lsn, prev_result_lsn;
bool is_before_barrier = false;
LSN barrier_lsn;
if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(member_list,
replica_num, is_before_barrier, barrier_lsn))) {
if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(prev_member_list,
prev_replica_num, curr_member_list, curr_replica_num, is_before_barrier, barrier_lsn))) {
PALF_LOG(WARN, "get_log_sync_member_list failed", K(ret), K_(palf_id), K_(self));
} else if (OB_FAIL(get_majority_lsn_(member_list, replica_num, result_lsn))) {
} else if (OB_FAIL(get_majority_lsn_(curr_member_list, curr_replica_num, curr_result_lsn))) {
PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self));
} else if (OB_UNLIKELY(true == is_before_barrier) &&
OB_FAIL(get_majority_lsn_(prev_member_list, prev_replica_num, prev_result_lsn))) {
PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self));
} else {
// Note: the leader generates committed_end_lsn based on different memberlists before
// and after a reconfiguration, barrier_lsn is the boundary.
// - Logs which is before barrier_lsn should be committed by previous memberlist.
// - Logs which is before barrier_lsn could be committed by previous/current memberlist.
LSN result_lsn = (OB_UNLIKELY(is_before_barrier))? MAX(prev_result_lsn, curr_result_lsn): curr_result_lsn;
// - Logs which is after barrier_lsn should be committed by current memberlist.
// - If current committed_end_lsn is smaller than barrier_lsn, then new committed_end_lsn
// generated by previous memberlist must be smaller than barrier_lsn.
// For example, memberlist:{A} + replica:B. After adding B successfully, Logs after the
// barrier may have been persisted by A, but not B. The leader A can not commit logs after
// the barrier with memberlist:{A}.
result_lsn = (is_before_barrier)? MIN(result_lsn, barrier_lsn): result_lsn;
result_lsn = (OB_UNLIKELY(is_before_barrier))? MIN(result_lsn, barrier_lsn): result_lsn;
// Note: The leader is not allowed to generate new committed_end_lsn while changing configs with arb.
// 1. {A, B, C(arb)}, A is the leader, end_lsns of A and B are both 100.
// 2. B crashes and A decicdes to degrade B to a learner, A changes memberlist to {A, C(arb)} and

View File

@ -113,8 +113,10 @@ public:
return ret;
}
int get_log_sync_member_list_for_generate_committed_lsn(
ObMemberList &member_list,
int64_t &replica_num,
ObMemberList &prev_member_list,
int64_t &prev_replica_num,
ObMemberList &curr_member_list,
int64_t &curr_replica_num,
bool &is_before_barrier,
LSN &barrier_lsn) const
{
@ -126,20 +128,19 @@ public:
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
} else if (OB_FAIL(curr_member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else if (FALSE_IT(curr_replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_)) {
} else if (OB_UNLIKELY(prev_committed_end_lsn < reconfig_barrier_.prev_end_lsn_ &&
reconfig_barrier_.prev_end_lsn_.is_valid())) {
is_before_barrier = true;
barrier_lsn = reconfig_barrier_.prev_end_lsn_;
if (OB_FAIL(member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
if (OB_FAIL(prev_member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
prev_replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_;
}
} else if (OB_FAIL(member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) {
PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self));
} else {
replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_;
}
} else { }
return ret;
}
int get_alive_member_list_with_arb(common::ObMemberList &member_list, int64_t &replica_num) const