diff --git a/mittest/logservice/test_ob_simple_log_config_change.cpp b/mittest/logservice/test_ob_simple_log_config_change.cpp index c13612481..55875b1f8 100644 --- a/mittest/logservice/test_ob_simple_log_config_change.cpp +++ b/mittest/logservice/test_ob_simple_log_config_change.cpp @@ -1109,10 +1109,92 @@ TEST_F(TestObSimpleLogClusterConfigChange, learner_loop) } } + for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) { + unblock_pcode(i, ObRpcPacketCode::OB_LOG_PUSH_REQ); + } revert_cluster_palf_handle_guard(palf_list); PALF_LOG(INFO, "end test learner_loop", K(id)); } +// 1. 3 paxos members (A, B, C), A is the leader. D is a learner, C is the parent of D. +// 2. remove C, otherwise step 4 cannot be executed successfully +// 3. block_net A <-> C, submit logs, so the logs of C and D fall behind A +// 4. switch D from learner to acceptor, D can not accept the reconfig log because it misses some logs +// 5. remove B +TEST_F(TestObSimpleLogClusterConfigChange, switch_lagged_learner_to_acceptor) +{ + SET_CASE_LOG_FILE(TEST_NAME, "switch_lagged_learner_to_acceptor"); + int ret = OB_SUCCESS; + const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s + const int64_t id = ATOMIC_AAF(&palf_id_, 1); + int64_t leader_idx = 0; + PalfHandleImplGuard leader; + LogLearnerList all_learner; + const ObMemberList &node_list = get_node_list(); + std::vector<PalfHandleImplGuard*> palf_list; + common::ObRegion beijing_region("BEIJING"); + common::ObRegion shanghai_region("SHANGHAI"); + + EXPECT_EQ(OB_SUCCESS, create_paxos_group(id, &loc_cb, leader_idx, leader)); + EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list)); + + // 1. 
add a learner, idx = 3 + const int64_t learner_idx = 3; + const int64_t followerc_idx = (leader_idx + 1) % 3; + const int64_t followerb_idx = (leader_idx + 2) % 3; + const ObAddr followerc_addr = get_cluster()[followerc_idx]->get_addr(); + const ObAddr followerb_addr = get_cluster()[followerb_idx]->get_addr(); + + // set region, beijing(leader, follower b), shanghai(follower c, learner) + for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) { + const common::ObAddr addr = palf_list[i]->palf_handle_impl_->self_; + if (leader.palf_handle_impl_->config_mgr_.alive_paxos_memberlist_.contains(addr) && + i != followerc_idx) { + get_cluster()[0]->get_locality_manager()->set_server_region(addr, beijing_region); + } else { + get_cluster()[0]->get_locality_manager()->set_server_region(addr, shanghai_region); + } + } + for (auto palf_handle: palf_list) { palf_handle->palf_handle_impl_->update_self_region_(); } + // add D to learner_list + common::ObMember learner; + EXPECT_EQ(OB_SUCCESS, node_list.get_member_by_index(learner_idx, learner)); + EXPECT_EQ(OB_SUCCESS, all_learner.add_learner(LogLearner(learner.get_server(), 1))); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(learner, CONFIG_CHANGE_TIMEOUT)); + + // check topo + EXPECT_UNTIL_EQ(true, check_children_valid(palf_list, all_learner)); + EXPECT_UNTIL_EQ(0, leader.palf_handle_impl_->config_mgr_.children_.get_member_number()); + EXPECT_UNTIL_EQ(0, palf_list[followerb_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number()); + EXPECT_UNTIL_EQ(1, palf_list[followerc_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number()); + EXPECT_UNTIL_EQ(followerc_addr, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.parent_); + loc_cb.leader_ = get_cluster()[leader_idx]->get_addr(); + + // 2. 
remove C, otherwise step 4 cannot be executed successfully + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerc_addr, 1), 2, CONFIG_CHANGE_TIMEOUT)); + LogConfigVersion leader_config_version; + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version)); + // ensure the learner can be added successfully + EXPECT_EQ(leader_config_version, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.config_version_); + + // 3. block_net between leader and followerc, submit logs + block_net(leader_idx, followerc_idx); + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id)); + EXPECT_UNTIL_EQ(leader.palf_handle_impl_->get_max_lsn().val_, leader.palf_handle_impl_->get_end_lsn().val_); + + // 4. switch D from learner to acceptor, D can not accept the reconfig log because it misses some logs + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->switch_learner_to_acceptor(learner, 3, leader_config_version, CONFIG_CHANGE_TIMEOUT)); + + // 5. 
remove B + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerb_addr, 1), 2, CONFIG_CHANGE_TIMEOUT)); + + unblock_net(leader_idx, followerc_idx); + leader.reset(); + revert_cluster_palf_handle_guard(palf_list); + PALF_LOG(INFO, "end switch_lagged_learner_to_acceptor", K(id)); +} + } // end unittest } // end oceanbase diff --git a/src/logservice/palf/log_config_mgr.cpp b/src/logservice/palf/log_config_mgr.cpp index c876ba3af..7eb89f643 100755 --- a/src/logservice/palf/log_config_mgr.cpp +++ b/src/logservice/palf/log_config_mgr.cpp @@ -2727,6 +2727,21 @@ int LogConfigMgr::handle_register_parent_resp(const LogLearner &server, return ret; } +int LogConfigMgr::retire_parent() +{ + int ret = OB_SUCCESS; + const RetireParentReason reason = RetireParentReason::IS_FULL_MEMBER; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + PALF_LOG(WARN, "LogConfigMgr not init", KR(ret)); + } else if (OB_FAIL(retire_parent_(reason))) { + PALF_LOG(WARN, "retire_parent_ failed", KR(ret), "reason", retire_parent_reason_2_str_(reason)); + } else { + PALF_LOG(INFO, "retire_parent success", KR(ret), "reason", retire_parent_reason_2_str_(reason)); + } + return ret; +} + int LogConfigMgr::retire_parent_(const RetireParentReason &reason) { int ret = OB_SUCCESS; diff --git a/src/logservice/palf/log_config_mgr.h b/src/logservice/palf/log_config_mgr.h index 973edcaf5..0752eca8d 100755 --- a/src/logservice/palf/log_config_mgr.h +++ b/src/logservice/palf/log_config_mgr.h @@ -494,6 +494,7 @@ public: // NB: no handle_retire_parent and retire_children virtual int handle_retire_child(const LogLearner &parent); virtual int handle_learner_keepalive_req(const LogLearner &parent); + int retire_parent(); // failure detector int check_parent_health(); // ==================== Child ======================== diff --git a/src/logservice/palf/palf_handle_impl.cpp b/src/logservice/palf/palf_handle_impl.cpp index 211ef2144..526defacb 100755 --- a/src/logservice/palf/palf_handle_impl.cpp +++ b/src/logservice/palf/palf_handle_impl.cpp @@ 
-4120,6 +4120,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server, const LogConfigMeta &meta) { int ret = OB_SUCCESS; + bool receive_newer_config_log = false; if (IS_NOT_INIT) { ret = OB_NOT_INIT; PALF_LOG(WARN, "PalfHandleImpl not init", KR(ret)); @@ -4136,7 +4137,6 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server, PALF_LOG(WARN, "try_update_proposal_id_ failed", KR(ret), KPC(this), K(server), K(msg_proposal_id)); } else { TruncateLogInfo truncate_log_info; - bool need_print_register_log = false; // need wlock in case of truncating log and writing log_ms_meta in LogConfigMgr WLockGuard guard(lock_); // max_scn of multiple replicas may be different in FLASHBACK mode, @@ -4148,6 +4148,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server, if (REACH_TIME_INTERVAL(100 * 1000)) { PALF_LOG(WARN, "can not receive log", KR(ret), KPC(this), K(msg_proposal_id), "role", state_mgr_.get_role()); } + } else if (FALSE_IT(receive_newer_config_log = true)) { } else if (mode_mgr_.get_accepted_mode_meta().proposal_id_ < prev_mode_pid) { // need fetch mode_meta if (OB_FAIL(sw_.try_fetch_log(FetchTriggerType::MODE_META_BARRIER))) { @@ -4168,18 +4169,23 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server, } else if (OB_FAIL(config_mgr_.receive_config_log(server, meta))) { PALF_LOG(WARN, "receive_config_log failed", KR(ret), KPC(this), K(server), K(msg_proposal_id), K(prev_log_proposal_id), K(prev_lsn)); - } else if (!meta.curr_.config_.log_sync_memberlist_.contains(self_) && - meta.curr_.config_.arbitration_member_.get_server() != self_ && - !FALSE_IT(config_mgr_.register_parent()) && - FALSE_IT(need_print_register_log = true)) { - // it's a optimization. If self isn't in memberlist, then register parent right now, - // otherwise this new added learner will register parent in 4s at most and its log will be far behind. 
} else { PALF_LOG(INFO, "receive_config_log success", KR(ret), KPC(this), K(server), K(msg_proposal_id), K(prev_lsn), K(prev_log_proposal_id), K(meta)); } - if (need_print_register_log) { - PALF_LOG(INFO, "re_register_parent reason: self may in learnerlist", KPC(this), K(server), K(meta)); + } + if (receive_newer_config_log) { + RLockGuard guard(lock_); + int tmp_ret = OB_SUCCESS; // best-effort topology update, must not overwrite ret of the receive path + const bool in_paxos_member_list = meta.curr_.config_.log_sync_memberlist_.contains(self_) || + meta.curr_.config_.degraded_learnerlist_.contains(self_); + // 1. self is not in paxos memberlist, try register parent + if (!in_paxos_member_list && OB_SUCCESS != (tmp_ret = config_mgr_.register_parent())) { + PALF_LOG(WARN, "register_parent failed", K(tmp_ret), KPC(this), K(server), K(meta)); + } + // 2. self is in paxos memberlist, try retire parent + if (in_paxos_member_list && OB_SUCCESS != (tmp_ret = config_mgr_.retire_parent())) { + PALF_LOG(WARN, "retire_parent failed", K(tmp_ret), KPC(this), K(server), K(meta)); } } return ret;