[4.2] fix paxos replica number problem

This commit is contained in:
obdev
2023-09-08 13:26:16 +08:00
committed by ob-robot
parent c6e0f14d20
commit 83c9a54dfb
5 changed files with 37 additions and 11 deletions

View File

@ -919,6 +919,7 @@ int PalfHandleImpl::replace_learners(const common::ObMemberList &added_learners,
return ret;
}
ERRSIM_POINT_DEF(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR);
int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_member,
const common::ObMember &removed_member,
const palf::LogConfigVersion &config_version,
@ -944,6 +945,8 @@ int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_me
PALF_LOG(WARN, "get_curr_member_list failed", KR(ret), KPC(this));
} else if (OB_FAIL(one_stage_config_change_(args, timeout_us))) {
PALF_LOG(WARN, "add_member in replace_member_with_learner failed", KR(ret), KPC(this), K(args));
} else if (OB_UNLIKELY(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR)) {
// do nothing
} else if (FALSE_IT(args.server_ = removed_member)) {
} else if (FALSE_IT(args.type_ = REMOVE_MEMBER_AND_NUM)) {
} else if (OB_FAIL(one_stage_config_change_(args, timeout_us + begin_time_us - common::ObTimeUtility::current_time()))) {

View File

@ -102,6 +102,7 @@ static const char* disaster_recovery_task_ret_comment_strs[] = {
"[rs] task can not execute because server is not alive",
"[rs] task can not execute because fail to check paxos replica number",
"[rs] task can not execute because replica is not in service",
"[rs] task can not execute because server is permanent offline",
""/*default max*/
};
@ -678,13 +679,13 @@ int ObMigrateLSReplicaTask::check_paxos_number(
int ret = OB_SUCCESS;
const ObLSReplica *leader = nullptr;
if (OB_FAIL(ls_info.find_leader(leader))) {
LOG_WARN("fail to get leader", K(ret));
} else if (OB_UNLIKELY(nullptr == leader)) {
LOG_WARN("fail to get leader", KR(ret), K(ls_info));
} else if (OB_ISNULL(leader)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("leader replica is null", KR(ret));
} else if (leader->get_paxos_replica_number() <= 0) {
ret = OB_REBALANCE_TASK_CANT_EXEC;
LOG_WARN("paxos replica number not report", K(ret), KPC(leader));
LOG_WARN("paxos replica number not report", KR(ret), KPC(leader));
} else if (leader->get_paxos_replica_number() != paxos_replica_number_) {
ret = OB_REBALANCE_TASK_CANT_EXEC;
LOG_WARN("paxos replica number not match", KR(ret),

View File

@ -68,6 +68,7 @@ enum ObDRTaskRetComment
CANNOT_EXECUTE_DUE_TO_SERVER_NOT_ALIVE = 6,
CANNOT_EXECUTE_DUE_TO_PAXOS_REPLICA_NUMBER = 7,
CANNOT_EXECUTE_DUE_TO_REPLICA_NOT_INSERVICE = 8,
CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE = 9,
MAX
};

View File

@ -718,13 +718,31 @@ void ObDRTaskMgr::run3()
if (OB_FAIL(try_pop_task(allocator, task))) {
LOG_WARN("fail to try pop task", KR(ret));
} else if (OB_NOT_NULL(task)) {
tmp_ret = task->log_execute_start();
if (OB_SUCCESS != tmp_ret) {
const ObAddr &dst_server = task->get_dst_server();
ObServerInfoInTable server_info;
if (OB_FAIL(SVR_TRACER.get_server_info(dst_server, server_info))) {
LOG_WARN("fail to get server_info", KR(ret), K(dst_server));
} else if (server_info.is_permanent_offline()) {
// dest server is permanently offline: do not execute this task, just clean it up
LOG_INFO("[DRTASK_NOTICE] dest server is permanent offline, task can not execute", K(dst_server), K(server_info));
ObThreadCondGuard guard(cond_);
if (OB_SUCCESS != (tmp_ret = async_add_cleaning_task_to_updater(
task->get_task_id(),
task->get_task_key(),
OB_REBALANCE_TASK_CANT_EXEC,
false/*need_record_event*/,
ObDRTaskRetComment::CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE,
false/*reach_data_copy_concurrency*/))) {
LOG_WARN("fail to do execute over", KR(tmp_ret), KPC(task));
}
} else {
if (OB_SUCCESS != (tmp_ret = task->log_execute_start())) {
LOG_WARN("fail to log task start", KR(tmp_ret), KPC(task));
}
if (OB_FAIL(execute_task(*task))) {
LOG_WARN("fail to send", KR(ret), KPC(task));
}
}
free_task_(allocator, task);
} else {
LOG_TRACE("task is nullptr after try_pop_task");

View File

@ -4656,7 +4656,10 @@ int ObDRWorker::generate_disaster_recovery_paxos_replica_number(
if (locality_paxos_replica_number >= member_list_cnt_after) {
new_paxos_replica_number = curr_paxos_replica_number;
found = true;
} else {} // new member cnt greater than paxos_replica_number, not good
} else if (locality_paxos_replica_number + 1 == member_list_cnt_after) {
new_paxos_replica_number = curr_paxos_replica_number + 1;
found = true;
}
} else if (curr_paxos_replica_number > locality_paxos_replica_number) {
if (curr_paxos_replica_number >= member_list_cnt_after) {
new_paxos_replica_number = curr_paxos_replica_number;