[PALF] fix is_changing_config_with_arb_ remaining true after a reconfiguration fails

This commit is contained in:
BinChenn
2023-06-05 10:14:15 +00:00
committed by ob-robot
parent a3610e5262
commit 9968621711
4 changed files with 70 additions and 19 deletions

View File

@ -274,7 +274,7 @@ TEST_F(TestObSimpleLogClusterAccessMode, prev_log_slide)
LogConfigVersion config_version;
const int64_t proposal_id = leader.palf_handle_impl_->state_mgr_.get_proposal_id();
const int64_t leader_epoch = leader.palf_handle_impl_->state_mgr_.get_leader_epoch();
EXPECT_EQ(OB_EAGAIN, leader.palf_handle_impl_->config_mgr_.change_config(args, proposal_id, leader_epoch, config_version));
EXPECT_EQ(OB_ERR_UNEXPECTED, leader.palf_handle_impl_->config_mgr_.change_config(args, proposal_id, leader_epoch, config_version));
const LogConfigMeta new_config_meta = leader.palf_handle_impl_->config_mgr_.log_ms_meta_;
EXPECT_EQ(config_meta.curr_.config_version_, new_config_meta.curr_.config_version_);
// wait prepare req reaches majority

View File

@ -398,9 +398,10 @@ TEST_F(TestObSimpleLogClusterArbService, test_2f1a_arb_with_highest_version)
sleep(2);
LogConfigChangeArgs args(ObMember(palf_list[3]->palf_handle_impl_->self_, 1), 0, ADD_LEARNER);
const int64_t proposal_id = leader.palf_handle_impl_->state_mgr_.get_proposal_id();
const int64_t election_epoch = leader.palf_handle_impl_->state_mgr_.get_leader_epoch();
int64_t proposal_id = 0;
int64_t election_epoch = 0;
LogConfigVersion config_version;
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->config_mgr_.start_change_config(proposal_id, election_epoch, args.type_));
EXPECT_EQ(OB_EAGAIN, leader.palf_handle_impl_->config_mgr_.change_config(args, proposal_id, election_epoch, config_version));
// learner list and state_ has been changed
EXPECT_TRUE(config_version.is_valid());
@ -480,9 +481,10 @@ TEST_F(TestObSimpleLogClusterArbService, test_2f1a_defensive)
// add a member, do not allow to append logs until config log reaches majority
LogConfigChangeArgs args(added_member, 3, ADD_MEMBER);
const int64_t proposal_id = leader.palf_handle_impl_->state_mgr_.get_proposal_id();
const int64_t election_epoch = leader.palf_handle_impl_->state_mgr_.get_leader_epoch();
int64_t proposal_id = 0;
int64_t election_epoch = 0;
LogConfigVersion config_version;
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->config_mgr_.start_change_config(proposal_id, election_epoch, args.type_));
EXPECT_EQ(OB_EAGAIN, leader.palf_handle_impl_->config_mgr_.change_config(args, proposal_id, election_epoch, config_version));
// do not allow to append log when changing config with arb
EXPECT_TRUE(leader.palf_handle_impl_->state_mgr_.is_changing_config_with_arb());
@ -664,9 +666,10 @@ TEST_F(TestObSimpleLogClusterArbService, test_2f1a_degrade_when_no_leader)
sleep(2);
LogConfigChangeArgs args(ObMember(palf_list[another_f_idx]->palf_handle_impl_->self_, 1), 0, DEGRADE_ACCEPTOR_TO_LEARNER);
const int64_t proposal_id = leader.palf_handle_impl_->state_mgr_.get_proposal_id();
const int64_t election_epoch = leader.palf_handle_impl_->state_mgr_.get_leader_epoch();
int64_t proposal_id = 0;
int64_t election_epoch = 0;
LogConfigVersion config_version;
EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->config_mgr_.start_change_config(proposal_id, election_epoch, args.type_));
EXPECT_EQ(OB_EAGAIN, leader.palf_handle_impl_->config_mgr_.change_config(args, proposal_id, election_epoch, config_version));
// leader appended config meta, but did not apply config meta

View File

@ -384,6 +384,54 @@ TEST_F(TestObSimpleLogClusterArbMockEleService, test_2f1a_degrade_when_no_leader
PALF_LOG(INFO, "end test test_2f1a_degrade_when_no_leader2", K(id));
}
// Scenario: with the leader's arbitration service stopped, start an ADD_MEMBER
// reconfiguration for replica D, cut the network between the leader and D, and
// check that change_config_() ends up returning OB_LOG_NOT_SYNC without
// producing a valid config version.
//
// NOTE(review): judging by the test name and the helper used, the group is
// presumably a 2F1A paxos group with a mock election — confirm against
// create_paxos_group_with_arb_mock_election().
TEST_F(TestObSimpleLogClusterArbMockEleService, test_2f1a_change_config_fail)
{
  OB_LOGGER.set_log_level("INFO");
  // Kept although not referenced directly below: `ret` is declared in every
  // sibling test; presumably project macros may expect it — TODO confirm.
  int ret = OB_SUCCESS;
  const int64_t id = ATOMIC_AAF(&palf_id_, 1);
  SET_CASE_LOG_FILE(TEST_NAME, "test_2f1a_change_config_fail");
  PALF_LOG(INFO, "begin test test_2f1a_change_config_fail", K(id));
  {
    int64_t leader_idx = 0;
    int64_t arb_replica_idx = 0;
    PalfHandleImplGuard leader;
    std::vector<PalfHandleImplGuard*> palf_list;
    EXPECT_EQ(OB_SUCCESS, create_paxos_group_with_arb_mock_election(id, arb_replica_idx, leader_idx, leader));
    EXPECT_EQ(OB_SUCCESS, submit_log(leader, 200, id));
    // The guards are only acquired here and released at the end of the scope;
    // no element of palf_list is dereferenced, so a failed acquisition above
    // cannot lead to out-of-range indexing.
    EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list));
    // Stop the arbitration service on the leader's server for the duration of
    // the reconfiguration attempt; it is restarted before the scope ends.
    dynamic_cast<ObSimpleLogServer*>(get_cluster()[leader_idx])->log_service_.get_arbitration_service()->stop();

    // Replica D is the server three slots after the leader (modulo 4).
    const int64_t d_idx = (leader_idx + 3) % 4;
    const common::ObAddr d_addr = get_cluster()[d_idx]->get_addr();

    LogConfigChangeArgs add_d_arg(common::ObMember(d_addr, 1), 4, ADD_MEMBER);
    int64_t add_d_pid = 0;
    int64_t add_d_epoch = 0;
    LogConfigVersion add_d_version;
    // start_change_config() fills in add_d_pid / add_d_epoch, which are then
    // handed to change_config_() below.
    EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->config_mgr_.start_change_config(add_d_pid, add_d_epoch, add_d_arg.type_));

    // block the network from the leader to the follower
    block_net(leader_idx, d_idx);
    // EXPECT_UNTIL_EQ presumably re-evaluates the call until it yields the
    // expected code — TODO confirm against the macro definition.
    EXPECT_UNTIL_EQ(OB_LOG_NOT_SYNC, leader.palf_handle_impl_->config_mgr_.change_config_(add_d_arg, add_d_pid, add_d_epoch, add_d_version));
    // A failed reconfiguration must not hand back a valid config version.
    EXPECT_FALSE(add_d_version.is_valid());

    // Restore the environment for the tests that follow.
    unblock_net(leader_idx, d_idx);
    dynamic_cast<ObSimpleLogServer*>(get_cluster()[leader_idx])->log_service_.get_arbitration_service()->start();
    revert_cluster_palf_handle_guard(palf_list);
  }
  delete_paxos_group(id);
  PALF_LOG(INFO, "end test test_2f1a_change_config_fail", K(id));
}
TEST_F(TestObSimpleLogClusterArbMockEleService, test_2f1a_degrade_when_arb_crash)
{
OB_LOGGER.set_log_level("INFO");

View File

@ -774,8 +774,8 @@ int LogConfigMgr::change_config_(const LogConfigChangeArgs &args,
"role", state_mgr_->get_role(), "state", state_mgr_->get_state());
} else if (false == mode_mgr_->can_do_paxos_accept()) {
// do not allow change_config when ModeMgr is in prepare state
ret = OB_EAGAIN;
PALF_LOG(WARN, "is changing access_mode, try again", KR(ret), K_(palf_id), K_(self),
ret = OB_ERR_UNEXPECTED;
PALF_LOG(ERROR, "is changing access_mode, try again", KR(ret), K_(palf_id), K_(self),
"role", state_mgr_->get_role(), "state", state_mgr_->get_state());
} else if (OB_FAIL(check_config_version_matches_state_(args.type_, config_version))) {
PALF_LOG(WARN, "config_version does not match with state, try again", KR(ret), K_(palf_id), K_(self),
@ -1949,13 +1949,22 @@ int LogConfigMgr::wait_log_barrier_(const LogConfigChangeArgs &args,
int64_t unused_id = INT64_MAX;
bool unused_bool = false;
int64_t curr_ts_us = common::ObTimeUtility::current_time();
constexpr int64_t conn_timeout_us = 3 * 1000 * 1000L; // 3s
constexpr bool need_purge_throttling = true;
constexpr bool need_remote_check = false;
const bool need_skip_log_barrier = mode_mgr_->need_skip_log_barrier();
LSN prev_log_end_lsn;
start_wait_barrier_time_us_ = (OB_INVALID_TIMESTAMP == start_wait_barrier_time_us_)? \
curr_ts_us: start_wait_barrier_time_us_;
if (new_config_info.log_sync_memberlist_.get_member_number() == 0) {
ret = OB_INVALID_ARGUMENT;
} else if (curr_ts_us - start_wait_barrier_time_us_ > MAX_WAIT_BARRIER_TIME_US_FOR_RECONFIGURATION &&
args.type_ != LogConfigChangeType::STARTWORKING) {
ret = OB_LOG_NOT_SYNC;
PALF_LOG(WARN, "waiting for log barrier timeout, skip", KR(ret), K_(palf_id), K_(self),
K_(start_wait_barrier_time_us), K(first_committed_end_lsn), K(prev_log_end_lsn));
start_wait_barrier_time_us_ = curr_ts_us;
} else if (OB_FAIL(sync_get_committed_end_lsn_(args, new_config_info, need_purge_throttling,
need_remote_check, conn_timeout_us, first_committed_end_lsn, unused_bool, unused_lsn, unused_id))) {
PALF_LOG(WARN, "sync_get_committed_end_lsn failed", K(ret), K_(palf_id), K_(self), K(new_config_info));
@ -1968,7 +1977,7 @@ int LogConfigMgr::wait_log_barrier_(const LogConfigChangeArgs &args,
} else if (FALSE_IT(ret = (first_committed_end_lsn >= prev_log_end_lsn)? OB_SUCCESS: OB_EAGAIN)) {
} else if (OB_EAGAIN == ret) {
// committed_end_lsn did not change during 2s, skip the reconfiguration
const int64_t curr_ts_us = common::ObTimeUtility::current_time();
curr_ts_us = common::ObTimeUtility::current_time();
if (OB_INVALID_TIMESTAMP == last_wait_barrier_time_us_) {
last_wait_committed_end_lsn_ = first_committed_end_lsn;
last_wait_barrier_time_us_ = curr_ts_us;
@ -1984,15 +1993,6 @@ int LogConfigMgr::wait_log_barrier_(const LogConfigChangeArgs &args,
last_wait_barrier_time_us_ = curr_ts_us;
}
}
if (OB_INVALID_TIMESTAMP == start_wait_barrier_time_us_) {
start_wait_barrier_time_us_ = curr_ts_us;
} else if (curr_ts_us - start_wait_barrier_time_us_ > MAX_WAIT_BARRIER_TIME_US_FOR_RECONFIGURATION &&
args.type_ != LogConfigChangeType::STARTWORKING) {
ret = OB_LOG_NOT_SYNC;
PALF_LOG(WARN, "waiting for log barrier timeout, skip", KR(ret), K_(palf_id), K_(self),
K_(start_wait_barrier_time_us), K(first_committed_end_lsn), K(prev_log_end_lsn));
start_wait_barrier_time_us_ = curr_ts_us;
}
}
PALF_LOG(INFO, "waiting for log barrier", K(ret), K_(palf_id), K_(self),
K(first_committed_end_lsn), K(prev_log_end_lsn), K(new_config_info));