/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #define USING_LOG_PREFIX RS #include "ob_rebalance_task_executor.h" #include "share/ob_debug_sync.h" #include "share/ob_srv_rpc_proxy.h" #include "share/ob_rpc_struct.h" #include "share/partition_table/ob_partition_table_operator.h" #include "share/schema/ob_schema_getter_guard.h" #include "share/schema/ob_multi_version_schema_service.h" #include "share/ob_cluster_version.h" #include "share/partition_table/ob_partition_table_proxy.h" #include "ob_rs_event_history_table_operator.h" #include "ob_server_manager.h" #include "ob_zone_manager.h" #include "ob_leader_coordinator.h" #include "ob_rebalance_task_mgr.h" #include "ob_rebalance_task.h" #include "observer/ob_server.h" #include "lib/utility/ob_tracepoint.h" #include "sql/executor/ob_bkgd_dist_task.h" #include "sql/executor/ob_executor_rpc_proxy.h" #include "sql/executor/ob_executor_rpc_processor.h" #include "share/ob_multi_cluster_util.h" namespace oceanbase { using namespace common; using namespace share; namespace rootserver { int ObRebalanceTaskExecutor::init(share::ObPartitionTableOperator& pt_operator, obrpc::ObSrvRpcProxy& rpc_proxy, ObServerManager& server_mgr, const ObZoneManager& zone_mgr, ObLeaderCoordinator& leader_coordinator, schema::ObMultiVersionSchemaService& schema_service, ObUnitManager& unit_manager) { int ret = OB_SUCCESS; if (OB_UNLIKELY(inited_)) { ret = OB_INIT_TWICE; LOG_WARN("ObRebalanceTaskExecutor inited twice", K(ret)); } else { pt_operator_ = &pt_operator; rpc_proxy_ = &rpc_proxy; server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; leader_coordinator_ = &leader_coordinator; schema_service_ = &schema_service; unit_mgr_ = &unit_manager; inited_ = true; } return ret; } int ObRebalanceTaskExecutor::execute(const ObRebalanceTask& task, ObIArray& rc_array) { int ret = OB_SUCCESS; DEBUG_SYNC(BEFORE_REBALANCE_TASK_EXECUTE); if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not inited", K(ret)); } else if (OB_UNLIKELY(task.get_sub_task_count() <= 0)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid task", K(ret), "task_info_count", task.get_sub_task_count()); } else { share::ObClusterInfo cluster_info; // check dst server avaiable const ObAddr& online_server = task.get_dest(); bool is_alive = false; bool in_service = false; common::ObAddr leader; if (OB_FAIL(server_mgr_->check_server_alive(online_server, is_alive))) { LOG_WARN("check server alive failed", K(ret), K(online_server)); } else if (!is_alive) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("dst server not alive", K(ret), K(online_server)); } else if (OB_FAIL(server_mgr_->check_in_service(online_server, in_service))) { LOG_WARN("check in service failed", K(ret), K(online_server)); } else if (!in_service) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("dst server not in service", K(ret), K(online_server)); } else if (OB_UNLIKELY(nullptr == pt_operator_ || nullptr == rpc_proxy_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("null ptr", K(ret), KP(pt_operator_), KP(rpc_proxy_)); } else if (OB_FAIL(task.check_before_execute(*pt_operator_, leader))) { LOG_WARN("fail to check before execute", K(ret)); } else { task.log_execute_start(); if (OB_FAIL(task.execute(*rpc_proxy_, leader, rc_array))) { LOG_WARN("fail to execute task", K(ret)); } } } LOG_INFO("execute rebalance task", K(ret), K(task)); return ret; } int ObRebalanceTaskUtil::get_remove_member_leader(const ObRemoveMemberTaskInfo& task_info, ObAddr& leader) { int ret = OB_SUCCESS; ObArenaAllocator allocator(ObModIds::OB_RS_PARTITION_TABLE_TEMP); ObPartitionInfo partition; partition.set_allocator(&allocator); const ObPartitionKey& pkey = task_info.get_partition_key(); const ObPartitionReplica* replica = NULL; ObZone leader_zone; if (OB_ISNULL(GCTX.pt_operator_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("pt_operator is null", K(ret)); } else if (OB_FAIL(GCTX.pt_operator_->get(pkey.get_table_id(), pkey.get_partition_id(), partition))) { LOG_WARN("fail to get partition info", K(ret), K(pkey)); } else if (OB_FAIL(partition.find_leader_by_election(replica))) { LOG_WARN("find leader failed", K(ret), K(partition)); } else if (NULL == replica) { ret = OB_ERR_UNEXPECTED; LOG_WARN("NULL replica returned", K(ret)); } else { leader = replica->server_; leader_zone = replica->zone_; if (leader != task_info.get_dest().get_server()) { // when the remove target is not leader, we can execute using batch mode, // if there are leade switch operation after the check, observer is responsible to handle } else { // need to switch leader to another replica when the remove target is leader ObArray excluded_servers; ObArray excluded_zones; if (OB_FAIL(excluded_servers.push_back(leader))) { LOG_WARN("array push back failed", K(ret)); } else if (OB_FAIL(excluded_zones.push_back(leader_zone))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_ISNULL(GCTX.root_service_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("leader_coordinator is null", K(ret)); } else if (OB_FAIL(GCTX.root_service_->get_leader_coordinator().coordinate_partition_group( pkey.get_table_id(), pkey.get_partition_id(), excluded_servers, excluded_zones))) { LOG_WARN("coordinate partition group leader failed", K(ret), "replica", *replica); } else { partition.reuse(); replica = NULL; if (OB_FAIL(GCTX.pt_operator_->get(pkey.get_table_id(), pkey.get_partition_id(), partition))) { LOG_WARN("fail to get partition info", K(ret), K(pkey)); } else if (OB_FAIL(partition.find_leader_by_election(replica))) { LOG_WARN("find leader failed", K(ret), K(partition)); } else if (NULL == replica) { ret = OB_ERR_UNEXPECTED; LOG_WARN("NULL replica returned", K(ret)); } else if (replica->server_ == leader) { // still leader, switch leader failed ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("still leader, can't execute task now", K(task_info), K(leader), K(ret)); } else { // switch leader succeed leader = replica->server_; } } } } return ret; } int ObRebalanceTaskUtil::build_batch_remove_member_task(ObRebalanceTaskMgr& task_mgr, ObAddr& leader, const ObRemoveMemberTaskInfo& task_info, const char* comment, ObIArray& remove_member_tasks, bool& check_pg_leader) { int ret = OB_SUCCESS; share::ObClusterInfo cluster_info; if (!leader.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("leader is invalid", K(ret), K(leader), K(task_info)); } else if (leader == task_info.get_dest().get_server() || check_pg_leader) { // need to do execute check_remove_member_leader under the following two situations: // 1. the remove target is the leader // 2. pg has partitions because of check_remove_member_leader if (OB_FAIL(get_remove_member_leader(task_info, leader))) { LOG_WARN("fail to get leader", K(ret), K(task_info)); } check_pg_leader = true; } if (OB_FAIL(ret)) { // skip } else if (!leader.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("leader is invalid, wait for next round", K(ret), K(task_info), K(leader)); } else { int64_t index = 0; const bool is_standby = common::STANDBY_CLUSTER == cluster_info.cluster_type_; common::ObArray task_info_array; bool check_leader = false; // no need to check leader when batch_remove_member for (/*nop*/; OB_SUCC(ret) && index < remove_member_tasks.count(); index++) { ObRemoveMemberTask& task = remove_member_tasks.at(index); if (leader == task.get_dest()) { // not up to the upper limit, go on to accumulate bool can_gather = false; if (OB_FAIL(check_can_gather_task(task, task_info, is_standby, can_gather))) { LOG_WARN("failed to check can gather task", KR(ret), K(task_info), K(task)); } else if (can_gather) { if (OB_FAIL(task.add_remove_member_task_info(task_info))) { LOG_WARN("fail to add remove member task info", K(ret)); } } else { // cannot accumulate any more, execute the previous task the reset if (OB_FAIL(task_mgr.add_task(task))) { LOG_WARN("fail to add task", K(ret)); } task.clear_task_info(); // reset this task whatever if (OB_SUCC(ret)) { if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, leader, comment, check_leader))) { LOG_WARN("fail to build remove member task", K(ret)); } } } break; } } if (OB_SUCC(ret) && index == remove_member_tasks.count()) { // create a new task when no one found ObRemoveMemberTask task; if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, leader, comment, check_leader))) { LOG_WARN("fail to build remove member task", K(ret)); } else if (OB_FAIL(remove_member_tasks.push_back(task))) { LOG_WARN("fail to add task", K(ret), K(task)); } } } return ret; } int ObRebalanceTaskUtil::check_can_gather_task( const ObRemoveMemberTask& task, const ObRemoveMemberTaskInfo& task_info, const bool is_standby, bool& can_gather) { int ret = OB_SUCCESS; can_gather = false; if (!is_standby) { if (MAX_BATCH_REMOVE_MEMBER_COUNT > task.get_sub_task_count()) { can_gather = true; } else { can_gather = false; } } else if (0 == task.get_sub_task_count()) { can_gather = true; } else { // private partitions and non-private partitons should not be accumulated in the same task const ObRebalanceTaskInfo* info = task.get_sub_task(0); if (OB_ISNULL(info)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("task info is null", KR(ret)); } else { can_gather = info->can_accumulate_in_standby(task_info); } } return ret; } int ObRebalanceTaskUtil::build_batch_remove_non_paxos_replica_task(ObRebalanceTaskMgr& task_mgr, const ObRemoveNonPaxosTaskInfo& task_info, const char* comment, ObIArray& remove_non_paxos_replica_tasks) { int ret = OB_SUCCESS; const ObAddr& dest = task_info.get_dest().get_server(); int64_t index = 0; common::ObArray task_info_array; for (/*nop*/; OB_SUCC(ret) && index < remove_non_paxos_replica_tasks.count(); index++) { ObRemoveNonPaxosTask& task = remove_non_paxos_replica_tasks.at(index); if (dest == task.get_dest()) { if (task.get_sub_task_count() < MAX_BATCH_REMOVE_NON_PAXOS_REPLICA_COUNT) { // not up to the upper limit if (OB_FAIL(task.add_remove_non_paxos_replica_task_info(task_info))) { LOG_WARN("fail to add remove non paxos replica task info", K(ret)); } } else { // up to the upper limit execute the previous task then reset if (OB_FAIL(task_mgr.add_task(task))) { LOG_WARN("fail to add task", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, dest, comment))) { LOG_WARN("fail to build remove non paxos replica task", K(ret)); } } break; } } if (OB_SUCC(ret) && index == remove_non_paxos_replica_tasks.count()) { // create a new one when no one found ObRemoveNonPaxosTask task; if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, dest, comment))) { LOG_WARN("fail to build remove non paxo replica task", K(ret)); } else if (OB_FAIL(remove_non_paxos_replica_tasks.push_back(task))) { LOG_WARN("fail to add task", K(ret), K(task)); } } return ret; } int ObRebalanceTaskUtil::build_batch_modify_quorum_task(ObRebalanceTaskMgr& task_mgr, const ObModifyQuorumTaskInfo& task_info, const char* comment, ObIArray& modify_quorum_tasks) { int ret = OB_SUCCESS; const ObAddr& leader = task_info.get_dest().get_server(); int64_t index = 0; common::ObArray task_info_array; for (/*nop*/; OB_SUCC(ret) && index < modify_quorum_tasks.count(); index++) { ObModifyQuorumTask& task = modify_quorum_tasks.at(index); if (leader == task.get_dest()) { if (task.get_sub_task_count() < MAX_BATCH_MODIFY_QUORUM_COUNT) { // not up to the upper limit if (OB_FAIL(task.add_modify_quorum_task_info(task_info))) { LOG_WARN("fail to add remove member task info", K(ret)); } } else { // up to the upper limit execute the previous task then reset if (OB_FAIL(task_mgr.add_task(task))) { LOG_WARN("fail to add task", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, leader, comment))) { LOG_WARN("fail to build remove member task", K(ret)); } } break; } } if (OB_SUCC(ret) && index == modify_quorum_tasks.count()) { // create a new task when no one found ObModifyQuorumTask task; if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to add task_info", K(ret), K(task_info)); } else if (OB_FAIL(task.build(task_info_array, leader, comment))) { LOG_WARN("fail to build remove member task", K(ret)); } else if (OB_FAIL(modify_quorum_tasks.push_back(task))) { LOG_WARN("fail to add task", K(ret), K(task)); } } return ret; } } // end namespace rootserver } // end namespace oceanbase