[4.2] fix paxos replica number problem
@@ -919,6 +919,7 @@ int PalfHandleImpl::replace_learners(const common::ObMemberList &added_learners,
  return ret;
}

ERRSIM_POINT_DEF(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR);
int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_member,
                                                const common::ObMember &removed_member,
                                                const palf::LogConfigVersion &config_version,
@@ -944,6 +945,8 @@ int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_me
    PALF_LOG(WARN, "get_curr_member_list failed", KR(ret), KPC(this));
  } else if (OB_FAIL(one_stage_config_change_(args, timeout_us))) {
    PALF_LOG(WARN, "add_member in replace_member_with_learner failed", KR(ret), KPC(this), K(args));
  } else if (OB_UNLIKELY(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR)) {
    // do nothing
  } else if (FALSE_IT(args.server_ = removed_member)) {
  } else if (FALSE_IT(args.type_ = REMOVE_MEMBER_AND_NUM)) {
  } else if (OB_FAIL(one_stage_config_change_(args, timeout_us + begin_time_us - common::ObTimeUtility::current_time()))) {

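Taken together, these two palf hunks make replace_member_with_learner() do the replacement as two one-stage config changes on the same args object, with the new ERRSIM point able to skip the remove step in tests. The remove step is charged only with whatever is left of the caller's budget, via timeout_us + begin_time_us - current_time(). A minimal, self-contained sketch of that timeout bookkeeping; now_us() and do_config_change() are illustrative stand-ins, not the real PALF API:

// Sketch only: now_us() and do_config_change() stand in for
// common::ObTimeUtility::current_time() and one_stage_config_change_() above.
#include <chrono>
#include <cstdint>
#include <cstdio>

static int64_t now_us() {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

// Pretend config change: succeeds only if it still has a positive time budget.
static int do_config_change(const char *step, int64_t timeout_us) {
  std::printf("%s: budget %lld us\n", step, static_cast<long long>(timeout_us));
  return timeout_us > 0 ? 0 : -1;  // 0 plays the role of OB_SUCCESS
}

int main() {
  const int64_t timeout_us = 10LL * 1000 * 1000;   // caller's total budget: 10 s
  const int64_t begin_time_us = now_us();          // captured at function entry

  int ret = do_config_change("add step (added_member)", timeout_us);
  const bool errsim_skip_remove = false;           // the ERRSIM point would flip this in tests
  if (ret == 0 && !errsim_skip_remove) {
    // The remove step gets only the remaining budget:
    // timeout_us + begin_time_us - now_us().
    ret = do_config_change("remove step (REMOVE_MEMBER_AND_NUM)",
                           timeout_us + begin_time_us - now_us());
  }
  return ret == 0 ? 0 : 1;
}
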
@@ -102,6 +102,7 @@ static const char* disaster_recovery_task_ret_comment_strs[] = {
  "[rs] task can not execute because server is not alive",
  "[rs] task can not execute because fail to check paxos replica number",
  "[rs] task can not execute because replica is not in service",
  "[rs] task can not execute because server is permanent offline",
  ""/*default max*/
};

@@ -678,13 +679,13 @@ int ObMigrateLSReplicaTask::check_paxos_number(
  int ret = OB_SUCCESS;
  const ObLSReplica *leader = nullptr;
  if (OB_FAIL(ls_info.find_leader(leader))) {
    LOG_WARN("fail to get leader", K(ret));
  } else if (OB_UNLIKELY(nullptr == leader)) {
    LOG_WARN("fail to get leader", KR(ret), K(ls_info));
  } else if (OB_ISNULL(leader)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("leader replica is null", KR(ret));
  } else if (leader->get_paxos_replica_number() <= 0) {
    ret = OB_REBALANCE_TASK_CANT_EXEC;
    LOG_WARN("paxos replica number not report", K(ret), KPC(leader));
    LOG_WARN("paxos replica number not report", KR(ret), KPC(leader));
  } else if (leader->get_paxos_replica_number() != paxos_replica_number_) {
    ret = OB_REBALANCE_TASK_CANT_EXEC;
    LOG_WARN("paxos replica number not match", KR(ret),

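The reworked check in ObMigrateLSReplicaTask::check_paxos_number() treats a missing leader as an unexpected error and refuses to execute the task both when the leader has not reported a paxos replica number yet and when the reported number no longer matches the one the task was scheduled with. A compilable sketch of that decision; the types and error codes below are simplified stand-ins for ObLSReplica, OB_ERR_UNEXPECTED and OB_REBALANCE_TASK_CANT_EXEC:

// Sketch with stand-in names and placeholder error-code values.
#include <cstdint>
#include <cstdio>

enum Ret { SUCCESS = 0, ERR_UNEXPECTED = 1, REBALANCE_TASK_CANT_EXEC = 2 };

struct LeaderReplica { int64_t paxos_replica_number; };

static Ret check_paxos_number(const LeaderReplica *leader, int64_t task_paxos_replica_number) {
  if (leader == nullptr) {
    return ERR_UNEXPECTED;              // leader lookup gave nothing back: unexpected
  } else if (leader->paxos_replica_number <= 0) {
    return REBALANCE_TASK_CANT_EXEC;    // leader has not reported its number yet
  } else if (leader->paxos_replica_number != task_paxos_replica_number) {
    return REBALANCE_TASK_CANT_EXEC;    // number changed since the task was generated
  }
  return SUCCESS;
}

int main() {
  LeaderReplica leader{3};
  std::printf("match: %d\n", check_paxos_number(&leader, 3));  // SUCCESS
  std::printf("stale: %d\n", check_paxos_number(&leader, 5));  // REBALANCE_TASK_CANT_EXEC
  return 0;
}
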
@@ -68,6 +68,7 @@ enum ObDRTaskRetComment
  CANNOT_EXECUTE_DUE_TO_SERVER_NOT_ALIVE = 6,
  CANNOT_EXECUTE_DUE_TO_PAXOS_REPLICA_NUMBER = 7,
  CANNOT_EXECUTE_DUE_TO_REPLICA_NOT_INSERVICE = 8,
  CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE = 9,
  MAX
};

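This enum is what the disaster_recovery_task_ret_comment_strs table in the earlier hunk is indexed by, so the two one-line additions have to stay paired: a new ret comment needs both an enumerator before MAX and a string at the same position. A compilable sketch of that pairing, with entries 0..5 abridged to empty strings and the size check written as the usual invariant for this pattern (not copied from the source):

#include <cstdio>

enum ObDRTaskRetCommentSketch {
  CANNOT_EXECUTE_DUE_TO_SERVER_NOT_ALIVE = 6,
  CANNOT_EXECUTE_DUE_TO_PAXOS_REPLICA_NUMBER = 7,
  CANNOT_EXECUTE_DUE_TO_REPLICA_NOT_INSERVICE = 8,
  CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE = 9,
  MAX
};

static const char *comment_strs[] = {
  "", "", "", "", "", "",                                                  // 0..5 abridged
  "[rs] task can not execute because server is not alive",                 // 6
  "[rs] task can not execute because fail to check paxos replica number",  // 7
  "[rs] task can not execute because replica is not in service",           // 8
  "[rs] task can not execute because server is permanent offline",         // 9
  ""  /* default max */                                                    // 10 == MAX
};

// One string per enumerator, MAX included, or the lookup below goes out of bounds.
static_assert(sizeof(comment_strs) / sizeof(comment_strs[0]) == MAX + 1,
              "ret comment string table out of sync with enum");

int main() {
  std::printf("%s\n", comment_strs[CANNOT_EXECUTE_DUE_TO_PAXOS_REPLICA_NUMBER]);
  return 0;
}
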
@@ -718,13 +718,31 @@ void ObDRTaskMgr::run3()
    if (OB_FAIL(try_pop_task(allocator, task))) {
      LOG_WARN("fail to try pop task", KR(ret));
    } else if (OB_NOT_NULL(task)) {
      tmp_ret = task->log_execute_start();
      if (OB_SUCCESS != tmp_ret) {
      const ObAddr &dst_server = task->get_dst_server();
      ObServerInfoInTable server_info;
      if (OB_FAIL(SVR_TRACER.get_server_info(dst_server, server_info))) {
        LOG_WARN("fail to get server_info", KR(ret), K(dst_server));
      } else if (server_info.is_permanent_offline()) {
        // dest server permanent offline, do not execute this task, just clean it
        LOG_INFO("[DRTASK_NOTICE] dest server is permanent offline, task can not execute", K(dst_server), K(server_info));
        ObThreadCondGuard guard(cond_);
        if (OB_SUCCESS != (tmp_ret = async_add_cleaning_task_to_updater(
                task->get_task_id(),
                task->get_task_key(),
                OB_REBALANCE_TASK_CANT_EXEC,
                false/*need_record_event*/,
                ObDRTaskRetComment::CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE,
                false/*reach_data_copy_concurrency*/))) {
          LOG_WARN("fail to do execute over", KR(tmp_ret), KPC(task));
        }
      } else {
        if (OB_SUCCESS != (tmp_ret = task->log_execute_start())) {
          LOG_WARN("fail to log task start", KR(tmp_ret), KPC(task));
        }
        if (OB_FAIL(execute_task(*task))) {
          LOG_WARN("fail to send", KR(ret), KPC(task));
        }
      }
      free_task_(allocator, task);
    } else {
      LOG_TRACE("task is nullptr after try_pop_task");

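With the enlarged branch, run3() no longer executes a popped task blindly: the destination server is looked up first, and a permanently offline destination turns the task into a cleanup (async_add_cleaning_task_to_updater with CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE) instead of an RPC that can never succeed. A stripped-down sketch of that short-circuit, with simplified stand-ins for the task, ObServerInfoInTable and the cleanup call:

#include <cstdio>

struct ServerInfo { bool permanent_offline; };
struct Task { const char *name; ServerInfo dst; };

static void clean_task(const Task &t)   { std::printf("clean %s (dest permanently offline)\n", t.name); }
static void execute_task(const Task &t) { std::printf("execute %s\n", t.name); }

static void handle_popped_task(const Task &task) {
  if (task.dst.permanent_offline) {
    clean_task(task);        // do not send to a dead server; just schedule cleanup
  } else {
    execute_task(task);      // normal path: log start and execute
  }
}

int main() {
  handle_popped_task(Task{"migrate_ls_replica", ServerInfo{false}});
  handle_popped_task(Task{"migrate_ls_replica", ServerInfo{true}});
  return 0;
}
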
@@ -4656,7 +4656,10 @@ int ObDRWorker::generate_disaster_recovery_paxos_replica_number(
      if (locality_paxos_replica_number >= member_list_cnt_after) {
        new_paxos_replica_number = curr_paxos_replica_number;
        found = true;
      } else {} // new member cnt greater than paxos_replica_number, not good
      } else if (locality_paxos_replica_number + 1 == member_list_cnt_after) {
        new_paxos_replica_number = curr_paxos_replica_number + 1;
        found = true;
      }
    } else if (curr_paxos_replica_number > locality_paxos_replica_number) {
      if (curr_paxos_replica_number >= member_list_cnt_after) {
        new_paxos_replica_number = curr_paxos_replica_number;

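The shown branches of generate_disaster_recovery_paxos_replica_number() weigh three numbers against each other: the paxos replica number demanded by locality, the current one, and the member count after the change. The hunk appears to replace the empty "not good" else with the case where the member list grows by exactly one past locality, bumping the replica number by one instead of giving up. A compilable sketch of just these branches, assuming (as the later curr > locality else-if suggests) that the surrounding, unshown condition is curr == locality:

#include <cstdint>
#include <cstdio>

// Returns true and fills new_num when a safe choice exists for the branches above.
static bool pick_paxos_replica_number(int64_t locality_num, int64_t curr_num,
                                      int64_t member_cnt_after, int64_t &new_num) {
  bool found = false;
  if (curr_num == locality_num) {
    if (locality_num >= member_cnt_after) {
      new_num = curr_num;            // member list still fits: keep the current number
      found = true;
    } else if (locality_num + 1 == member_cnt_after) {
      new_num = curr_num + 1;        // adding exactly one member: grow the number by one
      found = true;
    }                                // otherwise: member count too large, no safe number
  } else if (curr_num > locality_num) {
    if (curr_num >= member_cnt_after) {
      new_num = curr_num;            // already above locality and still covers the members
      found = true;
    }
  }
  return found;
}

int main() {
  int64_t n = 0;
  if (pick_paxos_replica_number(3, 3, 4, n)) {
    std::printf("grow to %lld when member count becomes 4\n", static_cast<long long>(n));
  }
  if (!pick_paxos_replica_number(3, 3, 5, n)) {
    std::printf("no safe paxos replica number for 5 members\n");
  }
  return 0;
}
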