[fix] paxos member shouldn't be registered as a child
This commit is contained in:
parent
886b2b1e5e
commit
b44bd6e669
@ -1109,10 +1109,92 @@ TEST_F(TestObSimpleLogClusterConfigChange, learner_loop)
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) {
|
||||
unblock_pcode(i, ObRpcPacketCode::OB_LOG_PUSH_REQ);
|
||||
}
|
||||
revert_cluster_palf_handle_guard(palf_list);
|
||||
PALF_LOG(INFO, "end test learner_loop", K(id));
|
||||
}
|
||||
|
||||
// 1. 3 paxos member (A, B ,C), A is the leader. D is a learner, C is the parent of D.
|
||||
// 2. block_net A <-> C, submit logs, logs of C and D are behind from A
|
||||
// 3. remove C, or the step 4 cannot be executed succcessfully
|
||||
// 4. switch D from learner to acceptor, D can not accept the reconfig log because it miss some logs
|
||||
// 5. remove B
|
||||
TEST_F(TestObSimpleLogClusterConfigChange, switch_lagged_learner_to_acceptor)
|
||||
{
|
||||
SET_CASE_LOG_FILE(TEST_NAME, "switch_lagged_learner_to_acceptor");
|
||||
int ret = OB_SUCCESS;
|
||||
const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s
|
||||
const int64_t id = ATOMIC_AAF(&palf_id_, 1);
|
||||
int64_t leader_idx = 0;
|
||||
PalfHandleImplGuard leader;
|
||||
LogLearnerList all_learner;
|
||||
const ObMemberList &node_list = get_node_list();
|
||||
std::vector<PalfHandleImplGuard*> palf_list;
|
||||
common::ObRegion beijing_region("BEIJING");
|
||||
common::ObRegion shanghai_region("SHANGHAI");
|
||||
|
||||
EXPECT_EQ(OB_SUCCESS, create_paxos_group(id, &loc_cb, leader_idx, leader));
|
||||
EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list));
|
||||
|
||||
// 1. add a learner, idx = 3
|
||||
const int64_t learner_idx = 3;
|
||||
const int64_t followerc_idx = (leader_idx + 1) % 3;
|
||||
const int64_t followerb_idx = (leader_idx + 2) % 3;
|
||||
const ObAddr followerc_addr = get_cluster()[followerc_idx]->get_addr();
|
||||
const ObAddr followerb_addr = get_cluster()[followerb_idx]->get_addr();
|
||||
|
||||
// set region, beijing(leader, follower b), shanghai(follower c, learner)
|
||||
for (int i = 0; i < ObSimpleLogClusterTestBase::node_cnt_; i++) {
|
||||
const common::ObAddr addr = palf_list[i]->palf_handle_impl_->self_;
|
||||
if (leader.palf_handle_impl_->config_mgr_.alive_paxos_memberlist_.contains(addr) &&
|
||||
i != followerc_idx) {
|
||||
get_cluster()[0]->get_locality_manager()->set_server_region(addr, beijing_region);
|
||||
} else {
|
||||
get_cluster()[0]->get_locality_manager()->set_server_region(addr, shanghai_region);
|
||||
}
|
||||
}
|
||||
for (auto palf_handle: palf_list) { palf_handle->palf_handle_impl_->update_self_region_(); }
|
||||
// add D to learner_list
|
||||
common::ObMember learner;
|
||||
EXPECT_EQ(OB_SUCCESS, node_list.get_member_by_index(learner_idx, learner));
|
||||
EXPECT_EQ(OB_SUCCESS, all_learner.add_learner(LogLearner(learner.get_server(), 1)));
|
||||
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(learner, CONFIG_CHANGE_TIMEOUT));
|
||||
|
||||
// check topo
|
||||
EXPECT_UNTIL_EQ(true, check_children_valid(palf_list, all_learner));
|
||||
EXPECT_UNTIL_EQ(0, leader.palf_handle_impl_->config_mgr_.children_.get_member_number());
|
||||
EXPECT_UNTIL_EQ(0, palf_list[followerb_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number());
|
||||
EXPECT_UNTIL_EQ(1, palf_list[followerc_idx]->palf_handle_impl_->config_mgr_.children_.get_member_number());
|
||||
EXPECT_UNTIL_EQ(followerc_addr, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.parent_);
|
||||
loc_cb.leader_ = get_cluster()[leader_idx]->get_addr();
|
||||
|
||||
// 2. remove C, or the step 4 cannot be executed succcessfully
|
||||
ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerc_addr, 1), 2, CONFIG_CHANGE_TIMEOUT));
|
||||
LogConfigVersion leader_config_version;
|
||||
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version));
|
||||
// ensure the learner can be added successfully
|
||||
EXPECT_EQ(leader_config_version, palf_list[learner_idx]->palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.config_version_);
|
||||
|
||||
// 3. block_net between leader and followerc, submit logs
|
||||
block_net(leader_idx, followerc_idx);
|
||||
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id));
|
||||
EXPECT_UNTIL_EQ(leader.palf_handle_impl_->get_max_lsn().val_, leader.palf_handle_impl_->get_end_lsn().val_);
|
||||
|
||||
// 4. switch D from learner to acceptor, D can not accept the reconfig log because it miss some logs
|
||||
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(leader_config_version));
|
||||
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->switch_learner_to_acceptor(learner, 3, leader_config_version, CONFIG_CHANGE_TIMEOUT));
|
||||
|
||||
// 5. remove B
|
||||
ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(ObMember(followerb_addr, 1), 2, CONFIG_CHANGE_TIMEOUT));
|
||||
|
||||
unblock_net(leader_idx, followerc_idx);
|
||||
leader.reset();
|
||||
revert_cluster_palf_handle_guard(palf_list);
|
||||
PALF_LOG(INFO, "end switch_lagged_learner_to_acceptor", K(id));
|
||||
}
|
||||
|
||||
} // end unittest
|
||||
} // end oceanbase
|
||||
|
||||
|
@ -2727,6 +2727,21 @@ int LogConfigMgr::handle_register_parent_resp(const LogLearner &server,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int LogConfigMgr::retire_parent()
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
const RetireParentReason reason = RetireParentReason::IS_FULL_MEMBER;
|
||||
if (IS_NOT_INIT) {
|
||||
ret = OB_NOT_INIT;
|
||||
PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
|
||||
} else if (OB_FAIL(retire_parent_(reason))) {
|
||||
PALF_LOG(WARN, "LogConfigMgr not init", KR(ret));
|
||||
} else {
|
||||
PALF_LOG(INFO, "retire_parent success", KR(ret), "reason", retire_parent_reason_2_str_(reason));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int LogConfigMgr::retire_parent_(const RetireParentReason &reason)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
|
@ -494,6 +494,7 @@ public:
|
||||
// NB: no handle_retire_parent and retire_children
|
||||
virtual int handle_retire_child(const LogLearner &parent);
|
||||
virtual int handle_learner_keepalive_req(const LogLearner &parent);
|
||||
// public wrapper of retire_parent_(), retires the parent with reason IS_FULL_MEMBER;
// returns OB_NOT_INIT if LogConfigMgr is not initialized
int retire_parent();
|
||||
// failure detector
|
||||
int check_parent_health();
|
||||
// ==================== Child ========================
|
||||
|
@ -4120,6 +4120,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
|
||||
const LogConfigMeta &meta)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
bool receive_newer_config_log = false;
|
||||
if (IS_NOT_INIT) {
|
||||
ret = OB_NOT_INIT;
|
||||
PALF_LOG(WARN, "PalfHandleImpl not init", KR(ret));
|
||||
@ -4136,7 +4137,6 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
|
||||
PALF_LOG(WARN, "try_update_proposal_id_ failed", KR(ret), KPC(this), K(server), K(msg_proposal_id));
|
||||
} else {
|
||||
TruncateLogInfo truncate_log_info;
|
||||
bool need_print_register_log = false;
|
||||
// need wlock in case of truncating log and writing log_ms_meta in LogConfigMgr
|
||||
WLockGuard guard(lock_);
|
||||
// max_scn of multiple replicas may be different in FLASHBACK mode,
|
||||
@ -4148,6 +4148,7 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
|
||||
if (REACH_TIME_INTERVAL(100 * 1000)) {
|
||||
PALF_LOG(WARN, "can not receive log", KR(ret), KPC(this), K(msg_proposal_id), "role", state_mgr_.get_role());
|
||||
}
|
||||
} else if (FALSE_IT(receive_newer_config_log = true)) {
|
||||
} else if (mode_mgr_.get_accepted_mode_meta().proposal_id_ < prev_mode_pid) {
|
||||
// need fetch mode_meta
|
||||
if (OB_FAIL(sw_.try_fetch_log(FetchTriggerType::MODE_META_BARRIER))) {
|
||||
@ -4168,18 +4169,22 @@ int PalfHandleImpl::receive_config_log(const common::ObAddr &server,
|
||||
} else if (OB_FAIL(config_mgr_.receive_config_log(server, meta))) {
|
||||
PALF_LOG(WARN, "receive_config_log failed", KR(ret), KPC(this), K(server), K(msg_proposal_id),
|
||||
K(prev_log_proposal_id), K(prev_lsn));
|
||||
} else if (!meta.curr_.config_.log_sync_memberlist_.contains(self_) &&
|
||||
meta.curr_.config_.arbitration_member_.get_server() != self_ &&
|
||||
!FALSE_IT(config_mgr_.register_parent()) &&
|
||||
FALSE_IT(need_print_register_log = true)) {
|
||||
// it's a optimization. If self isn't in memberlist, then register parent right now,
|
||||
// otherwise this new added learner will register parent in 4s at most and its log will be far behind.
|
||||
} else {
|
||||
PALF_LOG(INFO, "receive_config_log success", KR(ret), KPC(this), K(server), K(msg_proposal_id),
|
||||
K(prev_lsn), K(prev_log_proposal_id), K(meta));
|
||||
}
|
||||
if (need_print_register_log) {
|
||||
PALF_LOG(INFO, "re_register_parent reason: self may in learnerlist", KPC(this), K(server), K(meta));
|
||||
}
|
||||
if (receive_newer_config_log) {
  // A newer config log was accepted for processing above: make the parent/child
  // relationship of this replica agree with its membership in the received meta.
  RLockGuard guard(lock_);
  const bool in_paxos_member_list = meta.curr_.config_.log_sync_memberlist_.contains(self_) ||
      meta.curr_.config_.degraded_learnerlist_.contains(self_);
  if (!in_paxos_member_list) {
    // 1. self is not in paxos memberlist, try register parent
    if (OB_FAIL(config_mgr_.register_parent())) {
      PALF_LOG(WARN, "register_parent failed", KR(ret), KPC(this), K(server), K(meta));
    }
  } else {
    // 2. self is in paxos memberlist, try retire parent
    if (OB_FAIL(config_mgr_.retire_parent())) {
      PALF_LOG(WARN, "retire_parent failed", KR(ret), KPC(this), K(server), K(meta));
    }
  }
}
|
||||
return ret;
|
||||
|
Loading…
x
Reference in New Issue
Block a user