diff --git a/src/logservice/palf/palf_handle_impl.cpp b/src/logservice/palf/palf_handle_impl.cpp index 6d06b1ae57..591bc0a5a9 100755 --- a/src/logservice/palf/palf_handle_impl.cpp +++ b/src/logservice/palf/palf_handle_impl.cpp @@ -919,6 +919,7 @@ int PalfHandleImpl::replace_learners(const common::ObMemberList &added_learners, return ret; } +ERRSIM_POINT_DEF(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR); int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_member, const common::ObMember &removed_member, const palf::LogConfigVersion &config_version, @@ -944,6 +945,8 @@ int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_me PALF_LOG(WARN, "get_curr_member_list failed", KR(ret), KPC(this)); } else if (OB_FAIL(one_stage_config_change_(args, timeout_us))) { PALF_LOG(WARN, "add_member in replace_member_with_learner failed", KR(ret), KPC(this), K(args)); + } else if (OB_UNLIKELY(ERRSIM_REPLACE_MEMBER_NOT_REMOVE_ERROR)) { + // do nothing } else if (FALSE_IT(args.server_ = removed_member)) { } else if (FALSE_IT(args.type_ = REMOVE_MEMBER_AND_NUM)) { } else if (OB_FAIL(one_stage_config_change_(args, timeout_us + begin_time_us - common::ObTimeUtility::current_time()))) { diff --git a/src/rootserver/ob_disaster_recovery_task.cpp b/src/rootserver/ob_disaster_recovery_task.cpp index 38d9f2c0e0..a84a0a9fb8 100644 --- a/src/rootserver/ob_disaster_recovery_task.cpp +++ b/src/rootserver/ob_disaster_recovery_task.cpp @@ -102,6 +102,7 @@ static const char* disaster_recovery_task_ret_comment_strs[] = { "[rs] task can not execute because server is not alive", "[rs] task can not execute because fail to check paxos replica number", "[rs] task can not execute because replica is not in service", + "[rs] task can not execute because server is permanent offline", ""/*default max*/ }; @@ -678,13 +679,13 @@ int ObMigrateLSReplicaTask::check_paxos_number( int ret = OB_SUCCESS; const ObLSReplica *leader = nullptr; if (OB_FAIL(ls_info.find_leader(leader))) { - LOG_WARN("fail to get leader", K(ret)); - } else if (OB_UNLIKELY(nullptr == leader)) { + LOG_WARN("fail to get leader", KR(ret), K(ls_info)); + } else if (OB_ISNULL(leader)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("leader replica is null", KR(ret)); } else if (leader->get_paxos_replica_number() <= 0) { ret = OB_REBALANCE_TASK_CANT_EXEC; - LOG_WARN("paxos replica number not report", K(ret), KPC(leader)); + LOG_WARN("paxos replica number not report", KR(ret), KPC(leader)); } else if (leader->get_paxos_replica_number() != paxos_replica_number_) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("paxos replica number not match", KR(ret), diff --git a/src/rootserver/ob_disaster_recovery_task.h b/src/rootserver/ob_disaster_recovery_task.h index 202fb69a57..5472532eb6 100644 --- a/src/rootserver/ob_disaster_recovery_task.h +++ b/src/rootserver/ob_disaster_recovery_task.h @@ -68,6 +68,7 @@ enum ObDRTaskRetComment CANNOT_EXECUTE_DUE_TO_SERVER_NOT_ALIVE = 6, CANNOT_EXECUTE_DUE_TO_PAXOS_REPLICA_NUMBER = 7, CANNOT_EXECUTE_DUE_TO_REPLICA_NOT_INSERVICE = 8, + CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE = 9, MAX }; diff --git a/src/rootserver/ob_disaster_recovery_task_mgr.cpp b/src/rootserver/ob_disaster_recovery_task_mgr.cpp index 787f7af40d..9f957dde6e 100644 --- a/src/rootserver/ob_disaster_recovery_task_mgr.cpp +++ b/src/rootserver/ob_disaster_recovery_task_mgr.cpp @@ -718,12 +718,30 @@ void ObDRTaskMgr::run3() if (OB_FAIL(try_pop_task(allocator, task))) { LOG_WARN("fail to try pop task", KR(ret)); } else if (OB_NOT_NULL(task)) { - tmp_ret = task->log_execute_start(); - if (OB_SUCCESS != tmp_ret) { - LOG_WARN("fail to log task start", KR(tmp_ret), KPC(task)); - } - if (OB_FAIL(execute_task(*task))) { - LOG_WARN("fail to send", KR(ret), KPC(task)); + const ObAddr &dst_server = task->get_dst_server(); + ObServerInfoInTable server_info; + if (OB_FAIL(SVR_TRACER.get_server_info(dst_server, server_info))) { + LOG_WARN("fail to get server_info", KR(ret), K(dst_server)); + } else if (server_info.is_permanent_offline()) { + // dest server permanent offline, do not execute this task, just clean it + LOG_INFO("[DRTASK_NOTICE] dest server is permanent offline, task can not execute", K(dst_server), K(server_info)); + ObThreadCondGuard guard(cond_); + if (OB_SUCCESS != (tmp_ret = async_add_cleaning_task_to_updater( + task->get_task_id(), + task->get_task_key(), + OB_REBALANCE_TASK_CANT_EXEC, + false/*need_record_event*/, + ObDRTaskRetComment::CANNOT_EXECUTE_DUE_TO_SERVER_PERMANENT_OFFLINE, + false/*reach_data_copy_concurrency*/))) { + LOG_WARN("fail to do execute over", KR(tmp_ret), KPC(task)); + } + } else { + if (OB_SUCCESS != (tmp_ret = task->log_execute_start())) { + LOG_WARN("fail to log task start", KR(tmp_ret), KPC(task)); + } + if (OB_FAIL(execute_task(*task))) { + LOG_WARN("fail to send", KR(ret), KPC(task)); + } } free_task_(allocator, task); } else { @@ -734,7 +752,7 @@ void ObDRTaskMgr::run3() LOG_WARN("fail to try dump statistic", KR(tmp_ret), K(last_dump_ts)); } if (OB_SUCCESS != (tmp_ret = try_clean_not_in_schedule_task_in_schedule_list_( - last_check_task_in_progress_ts))) { + last_check_task_in_progress_ts))) { LOG_WARN("fail to try check task in progress", KR(tmp_ret), K(last_check_task_in_progress_ts)); } } diff --git a/src/rootserver/ob_disaster_recovery_worker.cpp b/src/rootserver/ob_disaster_recovery_worker.cpp index 411280af1b..5b9f9e89b7 100755 --- a/src/rootserver/ob_disaster_recovery_worker.cpp +++ b/src/rootserver/ob_disaster_recovery_worker.cpp @@ -4656,7 +4656,10 @@ int ObDRWorker::generate_disaster_recovery_paxos_replica_number( if (locality_paxos_replica_number >= member_list_cnt_after) { new_paxos_replica_number = curr_paxos_replica_number; found = true; - } else {} // new member cnt greater than paxos_replica_number, not good + } else if (locality_paxos_replica_number + 1 == member_list_cnt_after) { + new_paxos_replica_number = curr_paxos_replica_number + 1; + found = true; + } } else if (curr_paxos_replica_number > locality_paxos_replica_number) { if (curr_paxos_replica_number >= member_list_cnt_after) { new_paxos_replica_number = curr_paxos_replica_number;