From eeacd9c6bc39af7d34d393858c50a9c63db5bc6f Mon Sep 17 00:00:00 2001 From: BinChenn Date: Fri, 15 Sep 2023 02:40:16 +0000 Subject: [PATCH] [fix] fix the committed_end_lsn can not be advanced after removing member --- ...t_ob_simple_log_config_change_mock_ele.cpp | 61 +++++++++++++++++++ src/logservice/palf/log_config_mgr.cpp | 19 +++--- src/logservice/palf/log_config_mgr.h | 17 +++--- src/logservice/palf/log_sliding_window.cpp | 20 +++--- .../mock_log_config_mgr.h | 19 +++--- 5 files changed, 103 insertions(+), 33 deletions(-) diff --git a/mittest/logservice/test_ob_simple_log_config_change_mock_ele.cpp b/mittest/logservice/test_ob_simple_log_config_change_mock_ele.cpp index 5773da021..8f5b714e1 100644 --- a/mittest/logservice/test_ob_simple_log_config_change_mock_ele.cpp +++ b/mittest/logservice/test_ob_simple_log_config_change_mock_ele.cpp @@ -362,6 +362,67 @@ TEST_F(TestObSimpleLogClusterConfigChangeMockEle, test_remove_if_another_rebuild PALF_LOG(INFO, "end test test_committed_end_lsn_after_removing_member", K(id)); } +// 1. (ABCD), A is the leader, D crashed +// 2. remove C from member list (ABCD) and match_lsn_map, committed_end_lsn is 100 and last_submit_end_lsn is 200. +// 2. because committed_end_lsn is smaller than last_submit_end_lsn, the leader will use prev_member_list (ABCD) to generate committed_end_lsn +// 3. C is not in match_lsn_map, D crashed, the leader may can not generate committed_end_lsn +TEST_F(TestObSimpleLogClusterConfigChangeMockEle, test_committed_end_lsn_after_removing_member2) +{ + int ret = OB_SUCCESS; + const int64_t id = ATOMIC_AAF(&palf_id_, 1); + const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s + SET_CASE_LOG_FILE(TEST_NAME, "test_committed_end_lsn_after_removing_member"); + PALF_LOG(INFO, "begin test test_committed_end_lsn_after_removing_member", K(id)); + { + int64_t leader_idx = 0; + PalfHandleImplGuard leader; + std::vector palf_list; + EXPECT_EQ(OB_SUCCESS, create_paxos_group_with_mock_election(id, leader_idx, leader)); + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 200, id)); + EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list)); + + const int64_t b_idx = (leader_idx + 1) % 4; + const int64_t c_idx = (leader_idx + 2) % 4; + const int64_t d_idx = (leader_idx + 3) % 4; + const common::ObAddr b_addr = get_cluster()[b_idx]->get_addr(); + const common::ObAddr c_addr = get_cluster()[c_idx]->get_addr(); + const common::ObAddr d_addr = get_cluster()[d_idx]->get_addr(); + PalfHandleImplGuard *a_handle = palf_list[leader_idx]; + PalfHandleImplGuard *b_handle = palf_list[b_idx]; + PalfHandleImplGuard *c_handle = palf_list[c_idx]; + PalfHandleImplGuard *d_handle = palf_list[d_idx]; + LogConfigVersion config_version; + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_member(common::ObMember(d_addr, 1), 4, config_version, CONFIG_CHANGE_TIMEOUT)); + + // 1. D crashed + block_all_net(d_idx); + + // 2. leader can not commit logs + block_pcode(leader_idx, ObRpcPacketCode::OB_LOG_PUSH_RESP); + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id)); + sleep(1); + EXPECT_GT(leader.palf_handle_impl_->sw_.last_submit_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_); + + // 3. remove C + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(common::ObMember(c_addr, 1), 3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_GT(leader.palf_handle_impl_->sw_.last_submit_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_); + EXPECT_EQ(leader.palf_handle_impl_->config_mgr_.reconfig_barrier_.prev_end_lsn_, leader.palf_handle_impl_->sw_.last_submit_end_lsn_); + EXPECT_GT(leader.palf_handle_impl_->config_mgr_.reconfig_barrier_.prev_end_lsn_, leader.palf_handle_impl_->sw_.committed_end_lsn_); + + // 3. leader can commit logs + unblock_pcode(leader_idx, ObRpcPacketCode::OB_LOG_PUSH_RESP); + + // 4. check if the leader can commit logs after C has been removed from match_lsn_map + EXPECT_UNTIL_EQ(leader.palf_handle_impl_->sw_.committed_end_lsn_, leader.palf_handle_impl_->sw_.last_submit_end_lsn_); + + leader.reset(); + revert_cluster_palf_handle_guard(palf_list); + } + delete_paxos_group(id); + PALF_LOG(INFO, "end test test_committed_end_lsn_after_removing_member", K(id)); +} + } // end unittest } // end oceanbase diff --git a/src/logservice/palf/log_config_mgr.cpp b/src/logservice/palf/log_config_mgr.cpp index bea83873f..7fab6242c 100755 --- a/src/logservice/palf/log_config_mgr.cpp +++ b/src/logservice/palf/log_config_mgr.cpp @@ -437,8 +437,10 @@ int LogConfigMgr::get_alive_member_list_with_arb( // require rlock of PalfHandleImpl int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn( - ObMemberList &member_list, - int64_t &replica_num, + ObMemberList &prev_member_list, + int64_t &prev_replica_num, + ObMemberList &curr_member_list, + int64_t &curr_replica_num, bool &is_before_barrier, LSN &barrier_lsn) const { @@ -451,6 +453,9 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn( if (IS_NOT_INIT) { ret = OB_NOT_INIT; PALF_LOG(WARN, "LogConfigMgr not init", KR(ret)); + } else if (OB_FAIL(curr_member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) { + PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); + } else if (FALSE_IT(curr_replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_)) { } else if (OB_UNLIKELY(prev_committed_end_lsn < reconfig_barrier_.prev_end_lsn_ && reconfig_barrier_.prev_end_lsn_.is_valid() && prev_mode_pid == reconfig_barrier_.prev_mode_pid_)) { @@ -465,16 +470,12 @@ int LogConfigMgr::get_log_sync_member_list_for_generate_committed_lsn( // be used only when the reconfir_barrier_.prev_mode_pid_ is equal to current mode // proposal_id. That means access mode hasn’t been changed (PALF hasn’t been flashed back) // since last reconfiguration. - if (OB_FAIL(member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) { + if (OB_FAIL(prev_member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) { PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); } else { - replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_; + prev_replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_; } - } else if (OB_FAIL(member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) { - PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); - } else { - replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_; - } + } else { } return ret; } diff --git a/src/logservice/palf/log_config_mgr.h b/src/logservice/palf/log_config_mgr.h index 0209c9ef3..ddabd879a 100755 --- a/src/logservice/palf/log_config_mgr.h +++ b/src/logservice/palf/log_config_mgr.h @@ -406,8 +406,10 @@ public: // return OB_SUCCESS if success // else return other errno virtual int get_log_sync_member_list_for_generate_committed_lsn( - ObMemberList &member_list, - int64_t &replica_num, + ObMemberList &prev_member_list, + int64_t &prev_replica_num, + ObMemberList &curr_member_list, + int64_t &curr_replica_num, bool &is_before_barrier, LSN &barrier_lsn) const; virtual int get_arbitration_member(common::ObMember &arb_member) const; @@ -498,11 +500,12 @@ public: SpinLockGuard guard(lock_); int64_t pos = 0; J_OBJ_START(); - J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \ - K_(log_ms_meta), K_(checking_barrier), K_(reconfig_barrier), K_(persistent_config_version), \ - K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \ - K_(last_submit_config_log_time_us), K_(region), K_(paxos_member_region_map), \ - K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \ + J_KV(K_(palf_id), K_(self), K_(alive_paxos_memberlist), K_(alive_paxos_replica_num), \ + K_(log_ms_meta), K_(running_args), K_(state), K_(checking_barrier), K_(reconfig_barrier), \ + K_(persistent_config_version), K_(ms_ack_list), K_(resend_config_version), K_(resend_log_list), \ + K_(last_submit_config_log_time_us), K_(need_change_config_bkgd), K_(bkgd_config_version), \ + K_(region), K_(paxos_member_region_map), \ + K_(register_time_us), K_(parent), K_(parent_keepalive_time_us), \ K_(last_submit_register_req_time_us), K_(children), K_(last_submit_keepalive_time_us), KP(this)); J_OBJ_END(); return pos; diff --git a/src/logservice/palf/log_sliding_window.cpp b/src/logservice/palf/log_sliding_window.cpp index ac9dea68f..9399e7c55 100644 --- a/src/logservice/palf/log_sliding_window.cpp +++ b/src/logservice/palf/log_sliding_window.cpp @@ -2598,27 +2598,31 @@ int64_t LogSlidingWindow::get_start_id() const int LogSlidingWindow::gen_committed_end_lsn_(LSN &new_committed_end_lsn) { int ret = OB_SUCCESS; - ObMemberList member_list; - int64_t replica_num = 0; - LSN result_lsn; + ObMemberList curr_member_list, prev_member_list; + int64_t curr_replica_num = 0, prev_replica_num = 0; + LSN curr_result_lsn, prev_result_lsn; bool is_before_barrier = false; LSN barrier_lsn; - if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(member_list, - replica_num, is_before_barrier, barrier_lsn))) { + if (OB_FAIL(mm_->get_log_sync_member_list_for_generate_committed_lsn(prev_member_list, + prev_replica_num, curr_member_list, curr_replica_num, is_before_barrier, barrier_lsn))) { PALF_LOG(WARN, "get_log_sync_member_list failed", K(ret), K_(palf_id), K_(self)); - } else if (OB_FAIL(get_majority_lsn_(member_list, replica_num, result_lsn))) { + } else if (OB_FAIL(get_majority_lsn_(curr_member_list, curr_replica_num, curr_result_lsn))) { + PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self)); + } else if (OB_UNLIKELY(true == is_before_barrier) && + OB_FAIL(get_majority_lsn_(prev_member_list, prev_replica_num, prev_result_lsn))) { PALF_LOG(WARN, "get_majority_lsn failed", K(ret), K_(palf_id), K_(self)); } else { // Note: the leader generates committed_end_lsn based on different memberlists before // and after a reconfiguration, barrier_lsn is the boundary. - // - Logs which is before barrier_lsn should be committed by previous memberlist. + // - Logs which is before barrier_lsn could be committed by previous/current memberlist. + LSN result_lsn = (OB_UNLIKELY(is_before_barrier))? MAX(prev_result_lsn, curr_result_lsn): curr_result_lsn; // - Logs which is after barrier_lsn should be committed by current memberlist. // - If current committed_end_lsn is smaller than barrier_lsn, then new committed_end_lsn // generated by previous memberlist must be smaller than barrier_lsn. // For example, memberlist:{A} + replica:B. After adding B successfully, Logs after the // barrier may have been persisted by A, but not B. The leader A can not commit logs after // the barrier with memberlist:{A}. - result_lsn = (is_before_barrier)? MIN(result_lsn, barrier_lsn): result_lsn; + result_lsn = (OB_UNLIKELY(is_before_barrier))? MIN(result_lsn, barrier_lsn): result_lsn; // Note: The leader is not allowed to generate new committed_end_lsn while changing configs with arb. // 1. {A, B, C(arb)}, A is the leader, end_lsns of A and B are both 100. // 2. B crashes and A decicdes to degrade B to a learner, A changes memberlist to {A, C(arb)} and diff --git a/unittest/logservice/mock_logservice_container/mock_log_config_mgr.h b/unittest/logservice/mock_logservice_container/mock_log_config_mgr.h index 20d9728bf..c790baff0 100755 --- a/unittest/logservice/mock_logservice_container/mock_log_config_mgr.h +++ b/unittest/logservice/mock_logservice_container/mock_log_config_mgr.h @@ -113,8 +113,10 @@ public: return ret; } int get_log_sync_member_list_for_generate_committed_lsn( - ObMemberList &member_list, - int64_t &replica_num, + ObMemberList &prev_member_list, + int64_t &prev_replica_num, + ObMemberList &curr_member_list, + int64_t &curr_replica_num, bool &is_before_barrier, LSN &barrier_lsn) const { @@ -126,20 +128,19 @@ public: if (IS_NOT_INIT) { ret = OB_NOT_INIT; PALF_LOG(WARN, "LogConfigMgr not init", KR(ret)); + } else if (OB_FAIL(curr_member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) { + PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); + } else if (FALSE_IT(curr_replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_)) { } else if (OB_UNLIKELY(prev_committed_end_lsn < reconfig_barrier_.prev_end_lsn_ && reconfig_barrier_.prev_end_lsn_.is_valid())) { is_before_barrier = true; barrier_lsn = reconfig_barrier_.prev_end_lsn_; - if (OB_FAIL(member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) { + if (OB_FAIL(prev_member_list.deep_copy(log_ms_meta_.prev_.config_.log_sync_memberlist_))) { PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); } else { - replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_; + prev_replica_num = log_ms_meta_.prev_.config_.log_sync_replica_num_; } - } else if (OB_FAIL(member_list.deep_copy(log_ms_meta_.curr_.config_.log_sync_memberlist_))) { - PALF_LOG(WARN, "deep_copy member_list failed", KR(ret), K_(palf_id), K_(self)); - } else { - replica_num = log_ms_meta_.curr_.config_.log_sync_replica_num_; - } + } else { } return ret; } int get_alive_member_list_with_arb(common::ObMemberList &member_list, int64_t &replica_num) const