[fix] paxos member shouldn't be registered as a child

This commit is contained in:
BinChenn 2024-11-08 03:13:48 +00:00 committed by ob-robot
parent 886b2b1e5e
commit b44bd6e669
4 changed files with 112 additions and 9 deletions

View File

@ -1109,10 +1109,92 @@ TEST_F(TestObSimpleLogClusterConfigChange, learner_loop)
}
}
for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) {
unblock_pcode(i, ObRpcPacketCode::OB_LOG_PUSH_REQ);
}
revert_cluster_palf_handle_guard(palf_list);
PALF_LOG(INFO, "end test learner_loop", K(id));
}
// 1. 3 paxos members (A, B, C), A is the leader. D is a learner, C is the parent of D.
// 2. block_net A <-> C, submit logs, logs of C and D fall behind A
// 3. remove C, otherwise step 4 cannot be executed successfully
// 4. switch D from learner to acceptor, D cannot accept the reconfig log because it misses some logs
// 5. remove B
TEST_F(TestObSimpleLogClusterConfigChange, switch_lagged_learner_to_acceptor)
{
SET_CASE_LOG_FILE(TEST_NAME, "switch_lagged_learner_to_acceptor");
int ret = OB_SUCCESS;
const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s
const int64_t id = ATOMIC_AAF(&palf_id_, 1);
int64_t leader_idx = 0;
PalfHandleImplGuard leader;
LogLearnerList all_learner;
const ObMemberList &node_list = get_node_list();
std::vector<PalfHandleImplGuard*> palf_list;
common::ObRegion beijing_region("BEIJING");
common::ObRegion shanghai_region("SHANGHAI");
// create the paxos group and collect one palf handle guard per node
EXPECT_EQ(OB_SUCCESS, create_paxos_group(id, &loc_cb, leader_idx, leader));
EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list));
// 1. add a learner, idx = 3
const int64_t learner_idx = 3;
// follower C and follower B are the two indices in [0, 3) other than leader_idx
const int64_t followerc_idx = (leader_idx + 1) % 3;
const int64_t followerb_idx = (leader_idx + 2) % 3;
const ObAddr followerc_addr = get_cluster()[followerc_idx]->get_addr();
const ObAddr followerb_addr = get_cluster()[followerb_idx]->get_addr();
// set region, beijing(leader, follower b), shanghai(follower c, learner)
// every node that is not in the leader's alive paxos member list also goes to shanghai
for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) {
const common::ObAddr addr = palf_list[i]->palf_handle_impl_->self_;
if (leader.palf_handle_impl_->config_mgr_.alive_paxos_memberlist_.contains(addr) &&
i != followerc_idx) {
get_cluster()[0]->get_locality_manager()->set_server_region(addr, beijing_region);
} else {
get_cluster()[0]->get_locality_manager()->set_server_region(addr, shanghai_region);
}
}
// make every replica re-read its own region after the locality change
for (auto palf_handle: palf_list) { palf_handle->palf_handle_impl_->update_self_region_(); }
// add D to learner_list
common::ObMember learner;
EXPECT_EQ(OB_SUCCESS, node_list.get_member_by_index(learner_idx, learner));
EXPECT_EQ(OB_SUCCESS, all_learner.add_learner(LogLearner(learner.get_server(), 1)));
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(learner, CONFIG_CHANGE_TIMEOUT));
// check topo
// expected: leader and follower B have no children, follower C has exactly one child,
// and the learner's parent is follower C
EXPECT_UNTIL_EQ(true, check_children_valid(palf_list, all_learner));
EXPECT_UNTIL_EQ(0, leader.palf_handle_impl_->config_mgr_.children_.get_member_number());
EXPECT_UNTIL_EQ(0, palf_list[followerb_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number());
EXPECT_UNTIL_EQ(1, palf_list[followerc_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number());
EXPECT_UNTIL_EQ(followerc_addr, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.parent_);
// point the location callback at the current leader
loc_cb.leader_ = get_cluster()[leader_idx]->get_addr();
// 2. remove C, otherwise step 4 cannot be executed successfully
ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerc_addr, 1), 2, CONFIG_CHANGE_TIMEOUT));
LogConfigVersion leader_config_version;
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version));
// ensure the learner can be added successfully
// (the learner must already hold the same config version as the leader)
EXPECT_EQ(leader_config_version, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.config_version_);
// 3. block_net between leader and followerc, submit logs
block_net(leader_idx, followerc_idx);
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id));
// wait until the leader's end_lsn reaches its max_lsn
EXPECT_UNTIL_EQ(leader.palf_handle_impl_->get_max_lsn().val_, leader.palf_handle_impl_->get_end_lsn().val_);
// 4. switch D from learner to acceptor, D cannot accept the reconfig log because it misses some logs
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version));
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->switch_learner_to_acceptor(learner, 3, leader_config_version, CONFIG_CHANGE_TIMEOUT));
// 5. remove B
ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerb_addr, 1), 2, CONFIG_CHANGE_TIMEOUT));
// cleanup: restore the network and release all handles
unblock_net(leader_idx, followerc_idx);
leader.reset();
revert_cluster_palf_handle_guard(palf_list);
PALF_LOG(INFO, "end switch_lagged_learner_to_acceptor", K(id));
}
} // end unittest
} // end oceanbase

View File

@ -2727,6 +2727,21 @@ int LogConfigMgr::handle_register_parent_resp(const LogLearner &server,
return ret;
}
// Public wrapper around retire_parent_() that always uses the reason
// RetireParentReason::IS_FULL_MEMBER.
//
// @return OB_NOT_INIT if this LogConfigMgr has not been initialized,
//         otherwise the return code of retire_parent_().
int LogConfigMgr::retire_parent()
{
  int ret = OB_SUCCESS;
  const RetireParentReason reason = RetireParentReason::IS_FULL_MEMBER;
  if (IS_NOT_INIT) {
    ret = OB_NOT_INIT;
    PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
  } else if (OB_FAIL(retire_parent_(reason))) {
    // This branch is a retire failure, not an init failure: report it as such
    // and include the reason so the log is actionable.
    PALF_LOG(WARN, "retire_parent_ failed", KR(ret), "reason", retire_parent_reason_2_str_(reason));
  } else {
    PALF_LOG(INFO, "retire_parent success", KR(ret), "reason", retire_parent_reason_2_str_(reason));
  }
  return ret;
}
int LogConfigMgr::retire_parent_(const RetireParentReason &reason)
{
int ret = OB_SUCCESS;

View File

@ -494,6 +494,7 @@ public:
// NB: no handle_retire_parent and retire_children
virtual int handle_retire_child(const LogLearner &parent);
virtual int handle_learner_keepalive_req(const LogLearner &parent);
// retire parent with reason IS_FULL_MEMBER; returns OB_NOT_INIT if not inited
int retire_parent();
// failure detector
int check_parent_health();
// ==================== Child ========================

View File

@ -4120,6 +4120,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
const LogConfigMeta &meta)
{
int ret = OB_SUCCESS;
bool receive_newer_config_log = false;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
PALF_LOG(WARN, "PalfHandleImpl not init", KR(ret));
@ -4136,7 +4137,6 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
PALF_LOG(WARN, "try_update_proposal_id_ failed", KR(ret), KPC(this), K(server), K(msg_proposal_id));
} else {
TruncateLogInfo truncate_log_info;
bool need_print_register_log = false;
// need wlock in case of truncating log and writing log_ms_meta in LogConfigMgr
WLockGuard guard(lock_);
// max_scn of multiple replicas may be different in FLASHBACK mode,
@ -4148,6 +4148,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
if (REACH_TIME_INTERVAL(100 * 1000)) {
PALF_LOG(WARN, "can not receive log", KR(ret), KPC(this), K(msg_proposal_id), "role", state_mgr_.get_role());
}
} else if (FALSE_IT(receive_newer_config_log = true)) {
} else if (mode_mgr_.get_accepted_mode_meta().proposal_id_ < prev_mode_pid) {
// need fetch mode_meta
if (OB_FAIL(sw_.try_fetch_log(FetchTriggerType::MODE_META_BARRIER))) {
@ -4168,18 +4169,22 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
} else if (OB_FAIL(config_mgr_.receive_config_log(server, meta))) {
PALF_LOG(WARN, "receive_config_log failed", KR(ret), KPC(this), K(server), K(msg_proposal_id),
K(prev_log_proposal_id), K(prev_lsn));
} else if (!meta.curr_.config_.log_sync_memberlist_.contains(self_) &&
meta.curr_.config_.arbitration_member_.get_server() != self_ &&
!FALSE_IT(config_mgr_.register_parent()) &&
FALSE_IT(need_print_register_log = true)) {
// it's a optimization. If self isn't in memberlist, then register parent right now,
// otherwise this new added learner will register parent in 4s at most and its log will be far behind.
} else {
PALF_LOG(INFO, "receive_config_log success", KR(ret), KPC(this), K(server), K(msg_proposal_id),
K(prev_lsn), K(prev_log_proposal_id), K(meta));
}
if (need_print_register_log) {
PALF_LOG(INFO, "re_register_parent reason: self may in learnerlist", KPC(this), K(server), K(meta));
}
if (receive_newer_config_log) {
  // After seeing a newer config log, align the parent relationship with the
  // membership described by the received meta.
  // NOTE(review): OB_FAIL below appears to assign `ret`, so a failure recorded by the
  // branches above could be overwritten here, and a register/retire failure is
  // returned to the caller -- confirm this is intended.
  RLockGuard guard(lock_);
  const bool in_paxos_member_list = meta.curr_.config_.log_sync_memberlist_.contains(self_) ||
      meta.curr_.config_.degraded_learnerlist_.contains(self_);
  // 1. self is not in paxos memberlist, try register parent
  if (!in_paxos_member_list && OB_FAIL(config_mgr_.register_parent())) {
    PALF_LOG(WARN, "register_parent failed", KR(ret), KPC(this), K(server), K(meta));
  }
  // 2. self is in paxos memberlist, try retire parent
  if (in_paxos_member_list && OB_FAIL(config_mgr_.retire_parent())) {
    PALF_LOG(WARN, "retire_parent failed", KR(ret), KPC(this), K(server), K(meta));
  }
}
return ret;