/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #define USING_LOG_PREFIX RS_LB #include "ob_root_balancer.h" #include "lib/ob_define.h" #include "lib/container/ob_se_array.h" #include "lib/container/ob_se_array_iterator.h" #include "lib/container/ob_array_iterator.h" #include "lib/mysqlclient/ob_mysql_proxy.h" #include "share/partition_table/ob_partition_table_operator.h" #include "share/config/ob_server_config.h" #include "share/schema/ob_schema_getter_guard.h" #include "share/schema/ob_multi_version_schema_service.h" #include "share/ob_task_define.h" #include "share/ob_define.h" #include "share/ob_multi_cluster_util.h" #include "observer/ob_server_struct.h" #include "observer/ob_restore_ctx.h" #include "rootserver/ob_rs_event_history_table_operator.h" #include "rootserver/ob_partition_creator.h" #include "rootserver/ob_ddl_service.h" #include "ob_leader_coordinator.h" #include "ob_unit_manager.h" #include "ob_locality_checker.h" #include "lib/stat/ob_diagnose_info.h" #include "rootserver/restore/ob_restore_mgr.h" #include "rootserver/ob_root_service.h" #include "rootserver/ob_single_partition_balance.h" #include "share/ob_unit_replica_counter.h" namespace oceanbase { using namespace common; using namespace share; using namespace share::schema; using namespace obrpc; namespace rootserver { using namespace balancer; int64_t ObRootBalanceIdling::get_idle_interval_us() { const int64_t min_idle_time = 10 * 1000000; int64_t idle_time = ((host_.has_locality_modification() || host_.has_task() || host_.have_server_deleting()) ? min_idle_time : GCONF.balancer_idle_time); return idle_time; } ///////////////////// int ObBalancerTargetSchemaInfo::set(const common::ObIArray& tenant_schemas) { int ret = OB_SUCCESS; if (tenant_schemas.count() <= 0) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(tenant_schemas)); } else { SpinWLockGuard guard(lock_); tenant_schema_versions_.reset(); if (OB_FAIL(tenant_schema_versions_.assign(tenant_schemas))) { LOG_WARN("fail to assign", KR(ret)); } else { LOG_INFO("set balance target schema", K(tenant_schemas)); } } return ret; } int ObBalancerTargetSchemaInfo::update(const TenantIdAndSchemaVersion& tenant_schema) { int ret = OB_SUCCESS; if (!tenant_schema.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(tenant_schema)); } else { SpinWLockGuard guard(lock_); bool find = false; for (int64_t i = 0; i < tenant_schema_versions_.count() && OB_SUCC(ret); i++) { TenantIdAndSchemaVersion old_tenant_schema = tenant_schema_versions_.at(i); if (tenant_schema.tenant_id_ == old_tenant_schema.tenant_id_) { find = true; if (tenant_schema.schema_version_ < old_tenant_schema.schema_version_) { LOG_WARN("schema version should not fallback", K(tenant_schema), K(old_tenant_schema)); } else { tenant_schema_versions_.at(i).schema_version_ = tenant_schema.schema_version_; LOG_INFO("update tenant balance target schema version", K(tenant_schema), K(old_tenant_schema)); } break; } } // end for if (OB_SUCC(ret) && !find) { if (OB_FAIL(tenant_schema_versions_.push_back(tenant_schema))) { LOG_WARN("fail to push back", KR(ret), K(tenant_schema)); } else { LOG_INFO("set tenant balance target schema version", K(tenant_schema)); } } } // end else return ret; } void ObBalancerTargetSchemaInfo::reset() { SpinWLockGuard guard(lock_); tenant_schema_versions_.reset(); } // when switchover primary/standby, in order to ensure that the backup database copy is complete, // it relies on the load balancing task to refresh the schema enough // At the same time, the replication tasks corresponding to the schema have been completed. task_status_ is NO_TASK; // The best way is to switch the process and directly iterate the PT table, to prove that all replicas under the latest // schema are complete. But this query is slower, which seriously affects the execution time of switchover. Therefore, // it is judged by the memory status of load balancing, in theory, as long as the schema seen by the load balancing task // is new enough and there are no tasks to do, it is considered that it can be switched currently; The standby database // playback thread is responsible for the broadcast of the schema, It is guaranteed to directly set the latest (new // enough) schema_version to root_balancer; However, in order to prevent the load balancing task from failing to see a // sufficiently new schema and misjudgment, it is necessary to proactively notify the load balancing when the standby // database finds a new partition. E.g: build new tenant / build new partitions/ build new tables. need to notify the // corresponding schema_version of the load balancing thread, and modify the current memory status to task_status_ to // UNKNOWN; Only when the schema version of the load balancer brushed is greater than the specified version and no task // is generated, then there is really no task. // // Ensure that all tenants can be seen // 1. Judge by the schema_version of the system tenant. // As long as the system tenant's schema is new enough, the tenants in curr_schema_version can be guaranteed to be // complete. // 2. At this time, if the tenant is in the target_schema but not in the curr_schema, the tenant has been deleted. // 3. If the tenant is in the curr_schema, but not in the target_schema, it means that the tenant has not created a user // table, // as long as the full replica of the system table is completed. // This is the common tenant system table visible through the system tenant's schema. // Only need to consider which schemas have both bool ObBalancerTargetSchemaInfo::is_schema_new_enough( const ObIArray& curr_schema_versions) const { bool bret = true; SpinRLockGuard guard(lock_); for (int64_t i = 0; i < tenant_schema_versions_.count() && bret; i++) { const TenantIdAndSchemaVersion& target_version = tenant_schema_versions_.at(i); for (int64_t j = 0; j < curr_schema_versions.count() && bret; j++) { const TenantIdAndSchemaVersion& curr_version = curr_schema_versions.at(j); if (target_version.tenant_id_ == curr_version.tenant_id_) { if (target_version.schema_version_ > curr_version.schema_version_) { bret = false; LOG_INFO("tenant schema is not new enough", K(target_version), K(curr_version), K(tenant_schema_versions_), K(curr_schema_versions)); } } } } return bret; } ///////////////////// ObRootBalancer::ObRootBalancer() : ObRsReentrantThread(true), inited_(false), active_(0), idling_(stop_, *this), config_(NULL), schema_service_(NULL), ddl_service_(NULL), leader_coordinator_(NULL), task_mgr_(NULL), unit_mgr_(NULL), unit_stat_mgr_(), server_balancer_(), replica_creator_(), tenant_stat_(), rereplication_(), unit_balancer_(), pg_balancer_(), migrate_unit_(), server_checker_(), pg_coordinator_(), locality_checker_(), pt_operator_(NULL), rpc_proxy_(NULL), zone_mgr_(NULL), alter_locality_checker_(stop_), restore_ctx_(NULL), task_status_(UNKNOWN), shrink_resource_pool_checker_(stop_), unit_stat_getter_(), server_manager_(NULL), single_zone_mode_migrate_replica_(), has_server_deleting_(false), target_schema_info_(), curr_schema_info_(), single_part_balance_(), refresh_unit_replica_cnt_lock_() {} ObRootBalancer::~ObRootBalancer() { tenant_stat_.reuse(); } int ObRootBalancer::init(common::ObServerConfig& cfg, share::schema::ObMultiVersionSchemaService& schema_service, ObDDLService& ddl_service, ObUnitManager& unit_mgr, ObServerManager& server_mgr, share::ObPartitionTableOperator& pt_operator, share::ObRemotePartitionTableOperator& remote_pt_operator, ObLeaderCoordinator& leader_coordinator, ObZoneManager& zone_mgr, ObEmptyServerChecker& empty_server_checker, ObRebalanceTaskMgr& task_mgr, ObSrvRpcProxy& rpc_proxy, observer::ObRestoreCtx& restore_ctx, ObAddr& self_addr, ObMySQLProxy& sql_proxy, ObSinglePartBalance& single_part_balance) { int ret = OB_SUCCESS; static const int64_t root_balancer_thread_cnt = 1; if (inited_) { ret = OB_INIT_TWICE; LOG_WARN("root balancer already inited", K(ret)); } else if (OB_FAIL(unit_stat_getter_.init(pt_operator, schema_service, *this))) { LOG_WARN("fail init unit stat getter", K(ret)); } else if (OB_FAIL(tenant_stat_.init(unit_mgr, server_mgr, pt_operator, remote_pt_operator, zone_mgr, task_mgr, *this, ddl_service.get_sql_proxy()))) { LOG_WARN("init tenant stat failed", K(ret)); } else if (OB_FAIL(unit_stat_mgr_.init(schema_service, unit_mgr, unit_stat_getter_))) { LOG_WARN("init unit stat mgr failed", K(ret)); } else if (OB_FAIL(replica_creator_.init(schema_service, unit_mgr, server_mgr, pt_operator, zone_mgr, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(server_balancer_.init( schema_service, unit_mgr, leader_coordinator, zone_mgr, server_mgr, unit_stat_mgr_, tenant_stat_))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(unit_balancer_.init(cfg, pt_operator, task_mgr, zone_mgr, tenant_stat_, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(pg_balancer_.init(cfg, schema_service, pt_operator, task_mgr, zone_mgr, tenant_stat_, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(server_checker_.init(unit_mgr, server_mgr, empty_server_checker, tenant_stat_))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(pg_coordinator_.init(task_mgr, tenant_stat_, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(migrate_unit_.init(unit_mgr, task_mgr, tenant_stat_, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL( rereplication_.init(schema_service, zone_mgr, pt_operator, task_mgr, tenant_stat_, *this, unit_mgr))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(lost_replica_checker_.init(server_mgr, pt_operator, schema_service))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(create(root_balancer_thread_cnt, "RootBalance"))) { LOG_WARN("create root balancer thread failed", K(ret), K(root_balancer_thread_cnt)); } else if (OB_FAIL(locality_checker_.init( schema_service, tenant_stat_, task_mgr, zone_mgr, *this, server_mgr, unit_mgr))) { LOG_WARN("fail to init locality checker", K(ret)); } else if (OB_FAIL(alter_locality_checker_.init(&schema_service, &leader_coordinator, GCTX.rs_rpc_proxy_, &pt_operator, &server_mgr, self_addr, &zone_mgr, &unit_mgr))) { LOG_WARN("fail to init alter locality checker", K(ret)); } else if (OB_FAIL(shrink_resource_pool_checker_.init(&schema_service, &leader_coordinator, &rpc_proxy, self_addr, &unit_mgr, &pt_operator, &tenant_stat_, &task_mgr, &server_mgr))) { LOG_WARN("fail to init shrink resource pool checker", K(ret)); } else if (OB_FAIL(single_zone_mode_migrate_replica_.init( cfg, schema_service, pt_operator, task_mgr, zone_mgr, tenant_stat_, *this))) { LOG_WARN("fail to init single zone mode migrate replica", K(ret)); } else if (OB_FAIL(pg_backup_.init( cfg, schema_service, sql_proxy, pt_operator, task_mgr, zone_mgr, tenant_stat_, server_mgr, *this))) { LOG_WARN("init failed", K(ret)); } else if (OB_FAIL(pg_validate_.init(sql_proxy, task_mgr, server_mgr, zone_mgr))) { LOG_WARN("failed to init partition validator", K(ret)); } else { config_ = &cfg; zone_mgr_ = &zone_mgr; schema_service_ = &schema_service; ddl_service_ = &ddl_service; leader_coordinator_ = &leader_coordinator; unit_mgr_ = &unit_mgr; task_mgr_ = &task_mgr; pt_operator_ = &pt_operator; rpc_proxy_ = &rpc_proxy; (void)ATOMIC_SET(&task_status_, UNKNOWN); restore_ctx_ = &restore_ctx; server_manager_ = &server_mgr; has_server_deleting_ = false; target_schema_info_.reset(); curr_schema_info_.reset(); single_part_balance_ = &single_part_balance; inited_ = true; } return ret; } int ObRootBalancer::alloc_tablegroup_partitions_for_create(const share::schema::ObTablegroupSchema& tablegroup_schema, const obrpc::ObCreateTableMode create_mode, common::ObIArray& tablegroup_addr) { int ret = OB_SUCCESS; SpinWLockGuard guard(refresh_unit_replica_cnt_lock_); ObArray ten_unit_arr; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(single_part_balance_->get_tenant_unit_array(ten_unit_arr, tablegroup_schema.get_tenant_id()))) { LOG_WARN("fail to get tenant unit array", K(ret)); } else if (OB_FAIL(replica_creator_.alloc_tablegroup_partitions_for_create( tablegroup_schema, create_mode, tablegroup_addr, ten_unit_arr))) { LOG_WARN("fail to alloc tablegroup partition for create", K(ret)); } else if (OB_FAIL(single_part_balance_->update_tenant_unit_replica_capacity( tablegroup_schema.get_tenant_id(), ten_unit_arr))) { LOG_WARN("fail to set tenant unit", K(tablegroup_schema.get_tenant_id()), K(ten_unit_arr)); } return ret; } int ObRootBalancer::standby_alloc_partitions_for_split(const share::schema::ObSimpleTableSchemaV2& table, const common::ObIArray& source_partition_ids, const common::ObIArray& dest_partition_ids, ObITablePartitionAddr& addr) { return replica_creator_.standby_alloc_partitions_for_split(table, source_partition_ids, dest_partition_ids, addr); } int ObRootBalancer::alloc_partitions_for_create(const share::schema::ObSimpleTableSchemaV2& table, obrpc::ObCreateTableMode create_mode, ObITablePartitionAddr& addr) { int ret = OB_SUCCESS; SpinWLockGuard guard(refresh_unit_replica_cnt_lock_); ObArray ten_unit_arr; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(single_part_balance_->get_tenant_unit_array(ten_unit_arr, table.get_tenant_id()))) { LOG_WARN("fail to get tenant unit array", K(ret)); } else if (OB_FAIL(replica_creator_.alloc_partitions_for_create(table, create_mode, addr, ten_unit_arr))) { LOG_WARN("fail to alloc tablegroup partition for create", K(ret)); } else if (OB_FAIL(single_part_balance_->update_tenant_unit_replica_capacity(table.get_tenant_id(), ten_unit_arr))) { LOG_WARN("fail to set tenant unit", K(table.get_tenant_id()), K(ten_unit_arr)); } return ret; } int ObRootBalancer::alloc_table_partitions_for_standby(const share::schema::ObSimpleTableSchemaV2& table, const common::ObIArray& keys, obrpc::ObCreateTableMode create_mode, ObITablePartitionAddr& addr, share::schema::ObSchemaGetterGuard& guard) { return replica_creator_.alloc_table_partitions_for_standby(table, keys, create_mode, addr, guard); } int ObRootBalancer::alloc_tablegroup_partitions_for_standby(const share::schema::ObTablegroupSchema& table_group, const common::ObIArray& keys, obrpc::ObCreateTableMode create_mode, ObITablePartitionAddr& addr, share::schema::ObSchemaGetterGuard& guard) { return replica_creator_.alloc_tablegroup_partitions_for_standby(table_group, keys, create_mode, addr, guard); } int ObRootBalancer::refresh_unit_replica_counter(const uint64_t tenant_id) { int ret = OB_SUCCESS; ObSchemaGetterGuard schema_guard; ObArray tmp_ten_unit_reps; const int64_t start = ObTimeUtility::current_time(); ObArray unit_infos; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(schema_service_->get_tenant_schema_guard(tenant_id, schema_guard))) { LOG_WARN("fail to get tenant schema guard", K(ret)); } else if (OB_FAIL(unit_mgr_->get_active_unit_infos_by_tenant(tenant_id, unit_infos))) { LOG_WARN("fail to get active unit infos by tenant", KR(ret), K(tenant_id)); } else if (OB_FAIL(single_part_balance_->refresh_unit_replica_counter( tenant_id, start, unit_infos, schema_guard, tmp_ten_unit_reps))) { LOG_WARN("fail to refresh unit replica counter", K(ret)); } else { SpinWLockGuard guard(refresh_unit_replica_cnt_lock_); if (OB_FAIL( single_part_balance_->set_tenant_unit_replica_capacity(tenant_id, start, unit_infos, tmp_ten_unit_reps))) { LOG_WARN("fail to set tenant unti replica capacity", K(ret)); } } return ret; } int ObRootBalancer::create_unit_replica_counter(const uint64_t tenant_id) { int ret = OB_SUCCESS; SpinWLockGuard guard(refresh_unit_replica_cnt_lock_); if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(single_part_balance_->create_unit_replica_counter(tenant_id))) { LOG_WARN("fail to create unit replica counter", K(ret)); } return ret; } int ObRootBalancer::destory_tenant_unit_array() { int ret = OB_SUCCESS; SpinWLockGuard guard(refresh_unit_replica_cnt_lock_); if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(single_part_balance_->destory_tenant_unit_array())) { LOG_WARN("fail to destory tenant unit array", K(ret)); } return ret; } int ObRootBalancer::idle() const { int ret = OB_SUCCESS; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(idling_.idle())) { LOG_WARN("idle failed", K(ret)); } else { LOG_INFO("root balance idle", "idle_time", idling_.get_idle_interval_us()); } return ret; } void ObRootBalancer::wakeup() { if (!inited_) { LOG_WARN("not init"); } else { reset_task_count(); idling_.wakeup(); } } void ObRootBalancer::stop() { int ret = OB_SUCCESS; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else { ObRsReentrantThread::stop(); idling_.wakeup(); target_schema_info_.reset(); curr_schema_info_.reset(); } } int ObRootBalancer::check_stop() const { int ret = OB_SUCCESS; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else { ret = stop_ ? OB_CANCELED : OB_SUCCESS; } return ret; } void ObRootBalancer::set_active() { const int64_t old_val = ATOMIC_SET(&active_, 1); UNUSED(old_val); wakeup(); } void ObRootBalancer::notify_locality_modification() { alter_locality_checker_.notify_locality_modification(); } bool ObRootBalancer::has_task() const { return UNKNOWN == ATOMIC_LOAD(&task_status_) || HAS_TASK == ATOMIC_LOAD(&task_status_); } bool ObRootBalancer::has_locality_modification() const { return alter_locality_checker_.has_locality_modification(); } void ObRootBalancer::check_server_deleting() { if (OB_ISNULL(server_manager_)) { has_server_deleting_ = false; } else { has_server_deleting_ = server_manager_->have_server_deleting(); } } bool ObRootBalancer::have_server_deleting() const { return has_server_deleting_; } void ObRootBalancer::run3() { LOG_INFO("root balance start"); int ret = OB_SUCCESS; // every time the root balance start to run, we need to try check locality modification notify_locality_modification(); if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(do_balance())) { LOG_WARN("all balance failed", K(ret)); } tenant_stat_.reuse(); LOG_INFO("root balance stop", K(ret)); } void ObRootBalancer::reset_task_count() { LOG_INFO("reset task count, maybe has task to do"); (void)ATOMIC_SET(&task_status_, UNKNOWN); } int ObRootBalancer::do_balance() { int ret = OB_SUCCESS; int64_t failed_times = 0; int64_t total_task_cnt = 0; int64_t start_time = 0; int64_t finish_time = 0; int64_t remained_task_count = -1; // Whether auxiliary calculation needs to write balance_start/balance_finish log int64_t task_count_in_round = 0; // Count the tasks generated in a round of load balancing if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not inited", K(ret)); } ObArray all_tenant; while (OB_SUCC(ret)) { update_last_run_timestamp(); if (OB_FAIL(check_stop())) { break; } check_server_deleting(); if (!ATOMIC_LOAD(&active_)) { if (OB_FAIL(idle())) { LOG_WARN("idle failed", K(ret)); break; } else { continue; } } if (OB_SUCC(ret)) { if (-1 == remained_task_count) { } else { remained_task_count = task_mgr_->get_high_priority_task_info_cnt() + task_mgr_->get_low_priority_task_info_cnt(); } } curr_schema_info_.reset(); start_time = ObTimeUtility::current_time(); if (OB_FAIL(all_balance(total_task_cnt))) { failed_times++; LOG_WARN("do balance round failed", K(ret), K(failed_times)); ret = OB_SUCCESS; EVENT_ADD(RS_BALANCER_FAIL_COUNT, 1); } else { failed_times = 0; EVENT_ADD(RS_BALANCER_SUCC_COUNT, 1); } finish_time = ObTimeUtility::current_time(); // balance strategy: // 1. The number of tasks changes from 0 to non-zero, and counts as balance to start // 2. There are no tasks in the queue, and the number of tasks changes from non-zero to zero, // which is counted as the end of balance if (OB_SUCC(ret)) { task_count_in_round += total_task_cnt; if (0 >= remained_task_count && 0 != total_task_cnt) { // New effective round LOG_INFO("tenant balance start", K(remained_task_count), K(total_task_cnt)); ROOTSERVICE_EVENT_ADD("balancer", "tenant_balance_started", "start_time", start_time); remained_task_count = total_task_cnt; } else if (0 == remained_task_count && 0 == total_task_cnt) { // End of this round LOG_INFO("tenant balance finish", K(task_count_in_round), K(total_task_cnt), K(remained_task_count)); ROOTSERVICE_EVENT_ADD( "balancer", "tenant_balance_finished", "finish_time", finish_time, "task_count", task_count_in_round); task_count_in_round = 0; remained_task_count = -1; } } if (OB_SUCC(ret)) { // nothing todo bool all_server_alive = false; if (total_task_cnt > 0 || remained_task_count > 0) { LOG_INFO("root balance has task to do"); (void)ATOMIC_SET(&task_status_, HAS_TASK); } else if (!config_->is_rereplication_enabled()) { // Check whether migration replication and balance are allowed, // otherwise it is impossible to confirm whether there are tasks if (REACH_TIME_INTERVAL(LOG_INTERVAL)) { // Print every 30 minutes LOG_INFO("disable rereplication or disable rebalance, maybe has task todo", "enable_rereplication", config_->is_rereplication_enabled(), "enable_rebalance", config_->is_rebalance_enabled()); } (void)ATOMIC_SET(&task_status_, UNKNOWN); } else if (OB_FAIL(server_manager_->check_all_server_active(all_server_alive))) { LOG_WARN("failed to check all server active", K(ret)); (void)ATOMIC_SET(&task_status_, UNKNOWN); } else if (!all_server_alive) { // When the state of a server is not active, tasks may not be generated, // and there is no way to guarantee that no tasks are generated if (REACH_TIME_INTERVAL(LOG_INTERVAL)) { // Print every 30 minutes LOG_INFO("has server is not active, maybe has task todo", K(ret), K(all_server_alive)); } (void)ATOMIC_SET(&task_status_, UNKNOWN); } else if (target_schema_info_.is_schema_new_enough(curr_schema_info_)) { (void)ATOMIC_SET(&task_status_, NO_TASK); } } else { LOG_INFO("reset task count, maybe has task to do"); (void)ATOMIC_SET(&task_status_, UNKNOWN); } // idle after success or failed 2 times if (0 == failed_times || failed_times >= 2) { if (OB_FAIL(idle())) { LOG_WARN("idle failed", K(ret)); } } } return ret; } int ObRootBalancer::all_balance(int64_t& total_task_cnt) { int ret = OB_SUCCESS; total_task_cnt = 0; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(tenant_stat_.reuse_replica_count_mgr())) { LOG_WARN("reuse_replica_count_mgr failed", K(ret)); } else if (OB_FAIL(unit_stat_mgr_.gather_stat())) { LOG_WARN("gather all tenant unit stat failed, refuse to do server_balance", K(ret)); unit_stat_mgr_.reuse(); ret = OB_SUCCESS; } if (OB_SUCC(ret)) { // ignore unit balance result int tmp_ret = OB_SUCCESS; if (OB_SUCCESS != (tmp_ret = server_balancer_.balance_servers())) { LOG_WARN("do unit balance failed", K(tmp_ret)); } } if (OB_SUCC(ret)) { if (OB_FAIL(restore_tenant())) { LOG_WARN("fail do restore", K(ret)); } } if (OB_SUCC(ret)) { int64_t non_emergency_task_cnt = 0; ObArray pools; if (OB_FAIL(non_emergency_balance(non_emergency_task_cnt))) { LOG_WARN("fail to tenants balance", K(ret)); ret = OB_SUCCESS; // ignore ret } if (OB_FAIL(unit_mgr_->get_pools(pools))) { LOG_WARN("fail to get pools", K(ret)); } else { FOREACH_CNT_X(pool, pools, OB_SUCC(ret)) { if (OB_FAIL(unit_mgr_->finish_migrate_unit_not_in_tenant(pool, total_task_cnt))) { LOG_WARN("fail to finish migrate unit not in tenant", K(ret)); } ret = OB_SUCCESS; // ignore ret } } total_task_cnt += non_emergency_task_cnt; LOG_INFO("finish one round balance", "rereplication_enabled", config_->is_rereplication_enabled(), "rebalance_enabled", config_->is_rebalance_enabled(), "high_prio_task_info_cnt", task_mgr_->get_high_priority_task_info_cnt(), "low_prio_task_info_cnt", task_mgr_->get_low_priority_task_info_cnt(), K(total_task_cnt), K(non_emergency_task_cnt), K(ret)); } int tmp_ret = OB_SUCCESS; if (OB_SUCCESS != (tmp_ret = server_checker_.try_notify_empty_server_checker())) { LOG_WARN("try notify empty server checker failed", K(ret)); } if (OB_SUCCESS != (tmp_ret = server_checker_.try_delete_server())) { LOG_WARN("try_delete_server failed", K(tmp_ret)); } if (OB_SUCCESS != (tmp_ret = lost_replica_checker_.check_lost_replicas())) { LOG_WARN("fail check lost replicas", K(tmp_ret)); } if (OB_SUCCESS != (tmp_ret = shrink_resource_pool_checker_.check_shrink_resource_pool_finished())) { LOG_WARN("fail to check shrink resource pool", K(tmp_ret)); } if (OB_SUCC(ret) && (NULL != task_mgr_)) { ObRootBalanceHelp::BalanceController balance_controller; ObString switch_config_str = GCONF.__balance_controller.str(); if (OB_FAIL(ObRootBalanceHelp::parse_balance_info(switch_config_str, balance_controller))) { LOG_WARN("fail to parse balance switch", K(ret), "balance_switch", switch_config_str); } else if (GCONF.is_rebalance_enabled() || balance_controller.at(ObRootBalanceHelp::ENABLE_SERVER_BALANCE)) { if (OB_FAIL(server_balancer_.tenant_group_balance())) { LOG_WARN("fail to do tenant group balance", K(ret)); } } } return ret; } int ObRootBalancer::restore_tenant() { LOG_INFO("restore tenant second phase begin"); int ret = OB_SUCCESS; ObRestoreMgr restore_mgr(stop_); ObRestoreMgrCtx restore_ctx; restore_ctx.conn_env_ = restore_ctx_; restore_ctx.sql_proxy_ = &(ddl_service_->get_sql_proxy()); restore_ctx.task_mgr_ = task_mgr_; restore_ctx.tenant_stat_ = &tenant_stat_; restore_ctx.schema_service_ = schema_service_; restore_ctx.ddl_service_ = ddl_service_; restore_ctx.root_balancer_ = const_cast(this); if (OB_FAIL(restore_mgr.init(&restore_ctx))) { LOG_WARN("fail init restore mgr", K(ret)); } else if (OB_FAIL(restore_mgr.restore())) { LOG_WARN("fail do restore", K(ret)); } return ret; } int ObRootBalancer::non_emergency_balance(int64_t& total_task_cnt) { int ret = OB_SUCCESS; ObArray all_tenant; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(leader_coordinator_->get_tenant_ids(all_tenant))) { LOG_WARN("fetch all tenant id failed", K(ret)); } else { int tmp_ret = OB_SUCCESS; DEBUG_SYNC(UNIT_BALANCE_BEFORE_PARTITION_BALANCE); FOREACH_X(t, all_tenant, OB_SUCCESS == ret) { update_last_run_timestamp(); const uint64_t tenant_id = *t; // if have enough task in task queue, idle if (task_mgr_->is_busy()) { LOG_INFO("has enough rebalance tasks, idle"); while (OB_SUCC(ret) && task_mgr_->is_busy()) { if (OB_FAIL(idle())) { LOG_WARN("idle failed", K(ret)); } } } if (OB_FAIL(ret)) { // nop } else if (OB_FAIL(check_stop())) { LOG_WARN("balancer stop", K(ret)); } else { // In order to achieve tenant balance error isolation, // the error code of each balance is ignored int64_t task_cnt = 0; if (OB_SUCCESS != (tmp_ret = tenant_balance(tenant_id, task_cnt))) { LOG_WARN("tenant balance failed, mock task count as 1", K(tmp_ret), K(tenant_id)); task_cnt = 1; } total_task_cnt += task_cnt; if (OB_SYS_TENANT_ID == tenant_id) { leader_coordinator_->set_sys_balanced(); } } } // end foreach } return ret; } int ObRootBalancer::can_check_locality(const uint64_t tenant_id, bool& can) const { int ret = OB_SUCCESS; can = false; int64_t high_priority_task_cnt = 0; int64_t low_priority_task_cnt = 0; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_INVALID_ID == tenant_id) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(tenant_id)); } else if (OB_FAIL(task_mgr_->get_tenant_task_info_cnt(tenant_id, high_priority_task_cnt, low_priority_task_cnt))) { LOG_WARN("fail to get tenant task cnt", K(ret), K(tenant_id)); } else if (high_priority_task_cnt + low_priority_task_cnt <= 0) { can = true; } return ret; } int ObRootBalancer::check_can_do_balance(const uint64_t tenant_id, bool& can_do_recovery, bool& can_do_rebalance) const { int ret = OB_SUCCESS; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_INVALID_ID == tenant_id) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(tenant_id)); } else { can_do_recovery = false; can_do_rebalance = false; if (config_->is_rereplication_enabled() || config_->is_rebalance_enabled()) { int64_t high_priority_task_info_cnt = 0; int64_t low_priority_task_info_cnt = 0; if (OB_FAIL(task_mgr_->get_tenant_task_info_cnt( tenant_id, high_priority_task_info_cnt, low_priority_task_info_cnt))) { LOG_WARN("get tenant task count failed", K(ret), K(tenant_id)); } else { // Design ideas: // 1. Rebuild is the highest priority task; // Always look at the high priority queue first when the task is executed; // 2. When there are high-priority tasks, low-priority tasks can be generated. // The low-priority queue will not affect the high-priority execution of the rebuild, // which helps to improve concurrency. // 3. Replication and migration are low-priority tasks; // need to wait for the completion of the previous batch of tasks before continuing to the next batch of // tasks; There may be dependencies between tasks and tasks. // 4. For low-priority tasks, // the total number of tasks is also limited to prevent multiple tenants from balancing at the same time // and generating too many tasks if (0 == low_priority_task_info_cnt && config_->is_rereplication_enabled()) { can_do_recovery = true; } if (0 == low_priority_task_info_cnt && config_->is_rebalance_enabled() && task_mgr_->get_low_priority_task_info_cnt() < ObRebalanceTaskMgr::QUEUE_LOW_TASK_CNT) { can_do_rebalance = true; } } } } return ret; } int ObRootBalancer::multiple_zone_deployment_tenant_balance(const uint64_t tenant_id, ObLeaderBalanceGroupContainer& balance_group_container, ObRootBalanceHelp::BalanceController& balance_controller, int64_t& task_cnt) { int ret = OB_SUCCESS; int64_t backup_task_cnt = 0; DEBUG_SYNC(AFTER_TENANT_BALANCE_GATHER_STAT); bool can_do_rebalance = false; bool can_do_recovery = false; if (OB_FAIL(check_can_do_balance(tenant_id, can_do_recovery, can_do_rebalance))) { LOG_WARN("check can do balance failed", K(ret)); } // step1: Kick out the permanent offline replica // Do not consider the two configuration items // is_rereplication_enable and is_rebalance_enable if (OB_SUCC(ret) && 0 == task_cnt) { // Remove the replica on the permanently offline server from the member list if (OB_FAIL(rereplication_.remove_permanent_offline_replicas(task_cnt))) { LOG_WARN("remove permanent offline replicas failed", K(ret), K(tenant_id)); } else { if (task_cnt > 0) { LOG_INFO("remove permanent offline replicas", K(tenant_id), K(task_cnt)); } } } if (OB_SUCC(ret) && 0 == task_cnt) { if (OB_FAIL(rereplication_.process_only_in_memberlist_replica(task_cnt))) { LOG_WARN("fail to process only in member list replica", K(ret)); } else { if (task_cnt > 0) { LOG_INFO("process only in member list", K(tenant_id), K(task_cnt)); } } } // First do type transform and then copy, reuse old data // Delete first and then copy and migrate to avoid copying and migrating redundant copies // step2: Locality verification; first do type transform and then delete, try to reuse copy data // Not controlled by the switch. Always ensure that the locality is consistent if (balance_controller.at(ObRootBalanceHelp::ENABLE_MODIFY_QUORUM)) { if (OB_SUCC(ret) && 0 == task_cnt) { LOG_INFO("start locality check do modify quorum", K(tenant_id)); bool can = false; if (OB_FAIL(can_check_locality(tenant_id, can))) { LOG_WARN("fail to check", K(ret), K(tenant_id)); } else if (!can) { // nothing todo } else if (OB_FAIL(locality_checker_.do_modify_quorum(balance_group_container.get_hash_index(), task_cnt))) { LOG_WARN("fail to do modify quorum", K(ret), K(tenant_id)); } else if (task_cnt > 0) { LOG_INFO("locality check do modify quorum", K(tenant_id), K(task_cnt)); } } } if (balance_controller.at(ObRootBalanceHelp::ENABLE_TYPE_TRANSFORM)) { if (OB_SUCC(ret) && 0 == task_cnt) { LOG_INFO("start locality check do type transform", K(tenant_id)); bool can = false; if (OB_FAIL(can_check_locality(tenant_id, can))) { LOG_WARN("fail to check", K(ret), K(tenant_id)); } else if (!can) { // nothing todo } else if (OB_FAIL(locality_checker_.do_type_transform(balance_group_container.get_hash_index(), task_cnt))) { LOG_WARN("fail to do type transform", K(ret), K(tenant_id)); } else if (task_cnt > 0) { LOG_INFO("locality check do type transform", K(tenant_id), K(task_cnt)); } } } if (balance_controller.at(ObRootBalanceHelp::ENABLE_DELETE_REDUNDANT)) { if (OB_SUCC(ret) && 0 == task_cnt) { bool can = false; LOG_INFO("start locality check do delete redundant", K(tenant_id)); if (OB_FAIL(can_check_locality(tenant_id, can))) { LOG_WARN("fail to check", K(ret), K(tenant_id)); } else if (!can) { // nothing todo } else if (OB_FAIL(locality_checker_.do_delete_redundant(balance_group_container.get_hash_index(), task_cnt))) { LOG_WARN("fail to do delete redundant", K(ret), K(tenant_id)); } else if (task_cnt > 0) { LOG_INFO("locality check do delete redundant", K(tenant_id), K(task_cnt)); } } } // Disaster recovery, supplement or reconstruction replica // Both rebuild/copy are regarded as disaster recovery operations; // they are of parallel priority; // failures in tasks will not block the entire disaster recovery operation if (OB_SUCC(ret) && 0 == task_cnt) { // Not restricted by is_rereplication_enable, unit has been migrated, keep replica and unit consistent // replicate replica to unit if unit is migrate from permanent offline server int tmp_ret = OB_SUCCESS; if (balance_controller.at(ObRootBalanceHelp::ENABLE_REPLICATE_TO_UNIT)) { int64_t tmp_task_cnt = 0; if (OB_FAIL(rereplication_.replicate_to_unit(tmp_task_cnt))) { LOG_WARN("replicate to unit failed", K(ret), K(tenant_id)); tmp_ret = ret; // tmp_ret may be overwritten ret = OB_SUCCESS; } else { if (tmp_task_cnt > 0) { LOG_INFO("replicate to unit", K(tenant_id), K(tmp_task_cnt)); task_cnt += tmp_task_cnt; } } } // If a copy task has been generated before, it cannot continue to be generated, // otherwise it may cause redundant tasks; if (OB_SUCC(ret) && 0 == task_cnt && (can_do_recovery || balance_controller.at(ObRootBalanceHelp::ENABLE_REPLICATE))) { // make sure active replica count >= schema replica count int64_t tmp_task_cnt = 0; if (OB_FAIL(rereplication_.replicate_enough_replica(balance_group_container.get_hash_index(), tmp_task_cnt))) { LOG_WARN("replicate enough replicas failed", K(ret), K(tenant_id)); } else { if (tmp_task_cnt > 0) { LOG_INFO("replica enough replicas", K(tenant_id), K(tmp_task_cnt)); task_cnt += tmp_task_cnt; } } } if (OB_SUCC(ret)) { ret = tmp_ret; } } // Disaster recovery operation ends // Unit shrinking operation of shrink resource pool if (balance_controller.at(ObRootBalanceHelp::ENABLE_SHRINK)) { if (OB_SUCC(ret) && 0 == task_cnt) { if (OB_FAIL(shrink_resource_pool_checker_.try_shrink_tenant_resource_pools(task_cnt))) { LOG_WARN("fail to try shrink resource pool", K(ret)); } else { if (task_cnt > 0) { LOG_INFO("try shrink resoure pool", K(tenant_id), K(task_cnt)); } } } } // Backup priority is lower than disaster tolerance, // but it is not mutually exclusive with migration, // and the priority is the same as migration if (OB_SUCCESS == ret && 0 == task_cnt) { if (OB_FAIL(pg_backup_.partition_backup(backup_task_cnt, tenant_id))) { LOG_WARN("failed to do partition backup", K(ret), K(tenant_id), K(task_cnt)); } if (OB_SUCC(ret)) { if (backup_task_cnt > 0) { LOG_INFO("do partition backup", K(ret), K(tenant_id), K(backup_task_cnt)); } } } if (OB_SUCCESS == ret && 0 == task_cnt) { int64_t validate_task_cnt = 0; if (OB_FAIL(pg_validate_.partition_validate(tenant_id, validate_task_cnt))) { LOG_WARN("failed to validate partition of tenant", K(ret), K(tenant_id)); } if (OB_SUCC(ret)) { if (validate_task_cnt > 0) { LOG_INFO("do partition validate", K(ret), K(tenant_id)); } } } // step4: The status of the operation unit migration if (OB_FAIL(migrate_unit_.unit_migrate_finish(task_cnt))) { LOG_WARN("check unit migrate finish failed", K(ret), K(tenant_id)); } else if (task_cnt > 0) { LOG_INFO("check unit migrate finish", K(tenant_id), K(task_cnt)); } // step 5: Copy type alignment if (OB_SUCCESS == ret && 0 == task_cnt && (can_do_rebalance || balance_controller.at(ObRootBalanceHelp::ENABLE_COORDINATE_PG))) { // coordinate partition group member to the same unit // We count this kind of rebalance task as replicate task. if (OB_FAIL(check_stop())) { LOG_WARN("balancer stop", K(ret)); } else if (OB_FAIL(pg_coordinator_.coordinate_pg_member(task_cnt))) { LOG_WARN("coordinate partition group member to the same failed", K(ret), K(tenant_id)); } else { if (task_cnt > 0) { LOG_INFO("coordinate partition group member to the same", K(tenant_id), K(task_cnt)); } } } if (OB_SUCCESS == ret && 0 == task_cnt && (can_do_recovery || can_do_rebalance || balance_controller.at(ObRootBalanceHelp::ENABLE_CANCEL_UNIT_MIGRATION))) { LOG_INFO("start to cancel unit migrate for blocked server", K(tenant_id)); if (OB_FAIL(cancel_unit_migration(task_cnt))) { LOG_WARN("fail to cancel unit migration", K(ret), K(tenant_id)); } else if (task_cnt > 0) { LOG_INFO("cancel unit migration finish", K(ret), K(tenant_id), K(task_cnt)); } } if (OB_SUCCESS == ret && 0 == task_cnt && (can_do_rebalance || balance_controller.at(ObRootBalanceHelp::ENABLE_MIGRATE_TO_UNIT))) { // The unit has been migrated and remains consistent with the unit LOG_INFO("start to check unit migrate finish", K(ret), K(tenant_id)); if (OB_FAIL(migrate_unit_.migrate_to_unit(task_cnt))) { LOG_WARN("migrate replica to unit failed", K(ret), K(tenant_id)); } else { if (task_cnt > 0) { LOG_INFO("migrate replica to unit", K(tenant_id), K(task_cnt)); } } } // step 6: Start load balancing in the unit // Tenant is a prerequisite for load balancing // 1. There is no flag replica. Because flag replica only exists in meta, // migration is not easy to handle bool check_passed = true; if (OB_SUCC(ret) && 0 == task_cnt) { FOREACH_X(r, tenant_stat_.all_replica_, check_passed) { if (r->only_in_member_list()) { check_passed = false; LOG_INFO("tenant can not do rebalance task right now, " "because it still has flag replica", "replica", *r); } } } if (OB_SUCCESS == ret && 0 == task_cnt && ((check_passed && can_do_rebalance) || balance_controller.at(ObRootBalanceHelp::ENABLE_PARTITION_BALANCE))) { // tenant_stat_.print_stat(); // balance tablegroup partition group count // Partition groups are evenly distributed to different Units to avoid partition group aggregation if (OB_FAIL(pg_balancer_.partition_balance(task_cnt, balance_group_container.get_hash_index()))) { LOG_WARN("partition group balance failed", K(ret), K(tenant_id)); } else { if (task_cnt > 0) { LOG_INFO("partition group balance", K(tenant_id), K(task_cnt)); } } } if (OB_SUCCESS == ret && 0 == task_cnt && ((check_passed && can_do_rebalance) || balance_controller.at(ObRootBalanceHelp::ENABLE_UNIT_BALANCE))) { // balance tenant unit load if (OB_FAIL(unit_balancer_.unit_load_balance_v2( task_cnt, balance_group_container.get_hash_index(), balance_group_container.get_processed_tids()))) { LOG_WARN("unit load balance failed", K(ret), K(tenant_id)); } if (OB_SUCC(ret)) { if (task_cnt > 0) { LOG_INFO("unit load balance", K(ret), K(tenant_id), K(task_cnt)); } } } if (OB_SUCC(ret)) { task_cnt += backup_task_cnt; } int tmp_ret = OB_SUCCESS; if (OB_SUCCESS != (tmp_ret = alter_locality_checker_.check_alter_locality_finished( tenant_id, balance_group_container.get_hash_index(), tenant_stat_))) { LOG_WARN("fail to check alter locality finished", K(tmp_ret)); } // Record a log for easy checking of tasks that are not scheduled LOG_INFO( "finish one round tenant_balance", K(task_cnt), K(can_do_rebalance), K(can_do_recovery), K(tenant_id), K(ret)); return ret; } // Note: // - is_rereplication_enabled to control disaster tolerance, // replication is not accurate, // and disaster tolerance can also be achieved through migration. // Continue this name for compatibility // - is_rebalance_enabled Control load balancing // // server_balancer does both disaster recovery and balance // - The server_balancer moves the unit out because the server goes offline, // which belongs to the category of disaster tolerance control // - Server_balancer moves the unit out due to uneven load, // which belongs to the category of load balancing control int ObRootBalancer::tenant_balance(const uint64_t tenant_id, int64_t& task_cnt) { int ret = OB_SUCCESS; task_cnt = 0; ObSchemaGetterGuard schema_guard; ObRootBalanceHelp::BalanceController balance_controller; tenant_stat_.reuse(); ObString swtich_config_str = config_->__balance_controller.str(); common::ObArenaAllocator allocator(ObModIds::OB_RS_PARTITION_BALANCER); TenantSchemaGetter stat_finder(tenant_id); ObLeaderBalanceGroupContainer balance_group_container(schema_guard, stat_finder, allocator); if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not inited", K(ret)); } else if (OB_INVALID_ID == tenant_id) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid tenant_id", K(ret), K(tenant_id)); } else if (OB_FAIL(schema_service_->get_tenant_schema_guard(tenant_id, schema_guard))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get schema manager failed", K(ret)); } else if (OB_FAIL(balance_group_container.init(tenant_id))) { LOG_WARN("fail to init balance group container", K(ret), K(tenant_id)); } else if (OB_FAIL(balance_group_container.build())) { LOG_WARN("fail to build balance index builder", K(ret)); } else if (OB_FAIL(tenant_stat_.gather_stat(tenant_id, &schema_guard, balance_group_container.get_hash_index()))) { LOG_WARN("gather tenant balance statistics failed", K(ret), K(tenant_id)); } else if (OB_FAIL(ObRootBalanceHelp::parse_balance_info(swtich_config_str, balance_controller))) { LOG_WARN("fail to parse balance switch", K(ret), "balance_swtich", swtich_config_str); } else { // assign unit id to replicas with invalid unit id if (OB_FAIL(tenant_stat_.unit_assign())) { LOG_WARN("unit assign failed", K(ret), K(tenant_id)); ret = OB_SUCCESS; } else { if (OB_FAIL(multiple_zone_deployment_tenant_balance( tenant_id, balance_group_container, balance_controller, task_cnt))) { LOG_WARN("fail to execute multiple zone deployment tenant balance", K(ret)); ret = OB_SUCCESS; } } // finish unit not in locality ObArray zone_list; if (OB_FAIL(unit_mgr_->get_tenant_pool_zone_list(tenant_id, zone_list))) { LOG_WARN("fail to get tenant pool zone list", K(ret)); } else if (OB_FAIL(unit_mgr_->finish_migrate_unit_not_in_locality(tenant_id, &schema_guard, zone_list, task_cnt))) { LOG_WARN("fail to finish migrate unit not in locality", K(ret)); } } return ret; } int ObRootBalancer::cancel_unit_migration(int64_t& task_cnt) { int ret = OB_SUCCESS; TenantBalanceStat& ts = tenant_stat_; task_cnt = 0; FOREACH_X(zu, ts.all_zone_unit_, OB_SUCC(ret)) { FOREACH_X(u, zu->all_unit_, OB_SUCC(ret)) { if ((*u)->info_.unit_.migrate_from_server_.is_valid() && (*u)->server_->blocked_) { // Clean up the replicas that have been migrated in the past LOG_WARN("unit migrate blocked destination server, try to cancel migrate", K(ret), "unit", (*u)->info_.unit_, "unit_server", (*u)->server_->server_); FOREACH_X(p, ts.sorted_partition_, OB_SUCCESS == ret) { if (OB_FAIL(check_stop())) { LOG_WARN("balancer stop", K(ret)); break; } if (!(*p)->has_leader()) { LOG_WARN("partition have no leader, can't replicate", "partition", *(*p)); continue; } else if (!(*p)->is_valid_quorum()) { // The quorum of partition was introduced in 1.4.7.1. // It depends on the report and may be an illegal value. // In this case, skip it first to avoid affecting the generation of subsequent tasks. LOG_WARN("partition quorum is invalid, just skip", KPC(*p)); continue; } FOR_BEGIN_END_E(r, *(*p), ts.all_replica_, OB_SUCCESS == ret) { if (r->unit_->info_.unit_.unit_id_ == (*u)->info_.unit_.unit_id_ && r->server_->server_ == (*u)->server_->server_) { task_cnt++; ObPartitionKey pkey; const bool is_valid_paxos_replica = ObReplicaTypeCheck::is_paxos_replica_V2(r->replica_type_); // need to delete; if (!is_valid_paxos_replica) { const char* comment = balancer::REMOVE_MIGRATE_UNIT_BLOCK_REPLICA; common::ObArray task_info_array; ObRemoveNonPaxosTask task; ObRemoveNonPaxosTaskInfo task_info; OnlineReplica remove_member; remove_member.member_ = ObReplicaMember(r->server_->server_, r->member_time_us_, r->replica_type_); remove_member.zone_ = r->zone_; if (r->is_in_blacklist( ObRebalanceTaskType::REMOVE_NON_PAXOS_REPLICA, r->server_->server_, &tenant_stat_)) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN( "repica do remove non paxos replica frequent failed, now in black list", K(ret), " replice", *r); } else if (OB_FAIL(ts.gen_partition_key(*(*p), pkey))) { LOG_WARN("fail to gen partition key", K(ret), "partition", *(*p), "replica", *r); } else if (OB_FAIL(task_info.build(remove_member, pkey))) { LOG_WARN("fail to build remove non paxos replica task info", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_FAIL(task.build(task_info_array, remove_member.member_.get_server(), comment))) { LOG_WARN("fail to build remove member task", K(ret)); } else if (OB_FAIL(task_mgr_->add_task(task))) { LOG_WARN("fail to add task", K(ret), K(task)); } else { LOG_INFO("add remove non paxos replica task success", K(task), "partition", *(*p), K(task_cnt)); } } else if ((*p)->valid_member_cnt_ <= majority((*p)->schema_replica_cnt_) || !(*p)->can_balance()) { // nothing todo // TODO: need to add additional checks when supporting L-replica copies } else { const char* comment = balancer::REMOVE_MIGRATE_UNIT_BLOCK_MEMBER; ObRemoveMemberTask task; common::ObArray task_info_array; int64_t quorum = 0; ObRemoveMemberTaskInfo task_info; OnlineReplica remove_member; remove_member.member_ = ObReplicaMember(r->server_->server_, r->member_time_us_, r->replica_type_); remove_member.zone_ = r->zone_; if (r->is_in_blacklist(ObRebalanceTaskType::MEMBER_CHANGE, r->server_->server_, &tenant_stat_)) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("replica do member frequent failed, now in black list", K(ret), "replica", *r); } else if (OB_FAIL(tenant_stat_.gen_partition_key(*(*p), pkey))) { LOG_WARN("fail to gen partition key", K(ret), "partition", *(*p), "replica", *r); } else if (OB_FAIL(tenant_stat_.get_remove_replica_quorum_size( *(*p), r->zone_, r->replica_type_, quorum))) { LOG_WARN("fail to get quorum size", K(ret), "partition", *(*p), "replica", *r); } else if (OB_FAIL(task_info.build(remove_member, pkey, quorum, (*p)->get_quorum()))) { LOG_WARN("fail to build remove member task info", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_FAIL(task.build(task_info_array, remove_member.member_.get_server(), comment))) { LOG_WARN("fail to build remove member task", K(ret)); } else if (OB_FAIL(task_mgr_->add_task(task))) { LOG_WARN("fail to add task", K(ret), K(task)); } else { LOG_INFO("add task to remove member", K(task), "partition", *(*p), K(task_cnt)); } } } } } if (OB_SUCC(ret)) { ObArray tenant_ids; ObAdminClearBalanceTaskArg::TaskType type = obrpc::ObAdminClearBalanceTaskArg::ALL; if (OB_FAIL(task_mgr_->clear_task(tenant_ids, type))) { LOG_WARN("fail ti flush task", K(ret)); } else { task_cnt++; } } break; } else { // nothing todo } } } return ret; } int ObRootBalancer::check_tenant_group_config_legality( common::ObIArray& tenant_groups, bool& legal) { int ret = OB_SUCCESS; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(server_balancer_.check_tenant_group_config_legality(tenant_groups, legal))) { LOG_WARN("fail to check tenant group config legality", K(ret)); } else { } // no more to do return ret; } int ObRootBalancer::fill_faillist(const int64_t count, const common::ObIArray& table_ids, const common::ObIArray& partition_ids, const common::ObIArray& servers, const share::ObPartitionReplica::FailList& fail_msgs) { int ret = OB_SUCCESS; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_FAIL(GCTX.root_service_->submit_black_list_task(count, table_ids, partition_ids, servers, fail_msgs))) { LOG_WARN( "failed to add task to asyn queue", K(ret), K(count), K(table_ids), K(partition_ids), K(servers), K(fail_msgs)); } return ret; } int64_t ObRootBalancer::get_schedule_interval() const { return idling_.get_idle_interval_us(); } int64_t ObBlacklistProcess::get_deep_copy_size() const { return sizeof(*this); } ObAsyncTask* ObBlacklistProcess::deep_copy(char* buf, const int64_t buf_size) const { ObAsyncTask* task = NULL; int ret = OB_SUCCESS; const int64_t need_size = get_deep_copy_size(); if (!init_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (NULL == buf) { ret = OB_INVALID_ARGUMENT; LOG_WARN("buf is null", K(ret)); } else if (buf_size < need_size) { ret = OB_INVALID_ARGUMENT; LOG_WARN("buf is not long enough", K(need_size), K(buf_size), K(ret)); } else { task = new (buf) ObBlacklistProcess(); if (OB_FAIL(static_cast(task)->init( count_, *pt_, table_ids_, partition_ids_, servers_, fail_msgs_))) { LOG_WARN("failed to init new task", K(ret)); } } return task; } int ObBlacklistProcess::init(const int64_t count, share::ObPartitionTableOperator& pt, const common::ObIArray& table_ids, const common::ObIArray& partition_ids, const common::ObIArray& servers, const share::ObPartitionReplica::FailList& fail_msgs) { int ret = OB_SUCCESS; if (OB_FAIL(table_ids_.assign(table_ids))) { LOG_WARN("failed to copy table id", K(ret)); } else if (OB_FAIL(partition_ids_.assign(partition_ids))) { LOG_WARN("failed to copy partition id", K(ret)); } else if (OB_FAIL(servers_.assign(servers))) { LOG_WARN("failed to set server", K(ret)); } else if (OB_FAIL(copy_assign(fail_msgs_, fail_msgs))) { LOG_WARN("failed to set failmsg", K(ret)); } else { init_ = true; count_ = count; pt_ = &pt; } return ret; } int ObBlacklistProcess::process() { int ret = OB_SUCCESS; LOG_INFO("do insert black list", K(count_), K(fail_msgs_)); if (!init_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (OB_ISNULL(pt_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("partition operator is null", K(ret)); } else if (count_ != table_ids_.count() || count_ != partition_ids_.count() || count_ != servers_.count() || count_ != fail_msgs_.count()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("failed to do check", K(count_), K(table_ids_), K(partition_ids_), K(servers_), K(fail_msgs_)); } for (int64_t i = 0; OB_SUCC(ret) && i < table_ids_.count(); ++i) { ObPartitionReplica::FailList fail_list; if (OB_FAIL(fail_list.push_back(fail_msgs_.at(i)))) { LOG_WARN("failed to push back fail msg", K(ret), K(fail_msgs_)); } else if (OB_FAIL(pt_->update_fail_list(table_ids_.at(i), partition_ids_.at(i), servers_.at(i), fail_list))) { LOG_WARN("failed to process update fail list", K(ret), K(fail_list), K(i)); } } // end for return ret; } } // end namespace rootserver } // end namespace oceanbase