/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #define USING_LOG_PREFIX RS #include "ob_partition_group_coordinator.h" #include "lib/container/ob_array.h" #include "lib/container/ob_array_iterator.h" #include "lib/container/ob_se_array.h" #include "lib/container/ob_se_array_iterator.h" #include "observer/ob_server_struct.h" #include "share/ob_multi_cluster_util.h" #include "share/partition_table/ob_partition_table_operator.h" #include "ob_rebalance_task_mgr.h" #include "ob_root_utils.h" #include "ob_locality_checker.h" using namespace oceanbase::common; using namespace oceanbase::rootserver; using namespace oceanbase::share; ObPartitionGroupCoordinator::ObPartitionGroupCoordinator() : inited_(false), task_mgr_(NULL), tenant_stat_(NULL), check_stop_provider_(NULL), two_paxos_zones_(), logonly_units_() {} int ObPartitionGroupCoordinator::init( ObRebalanceTaskMgr& task_mgr, TenantBalanceStat& tenant_stat, share::ObCheckStopProvider& check_stop_provider) { int ret = OB_SUCCESS; if (inited_) { ret = OB_INIT_TWICE; } else { task_mgr_ = &task_mgr; tenant_stat_ = &tenant_stat; check_stop_provider_ = &check_stop_provider; two_paxos_zones_.reset(); logonly_units_.reset(); inited_ = true; } return ret; } int ObPartitionGroupCoordinator::prepare() { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; for (int64_t i = 0; i < ts.all_zone_paxos_info_.count() && OB_SUCC(ret); i++) { const ZonePaxosInfo& zone_paxos_info = ts.all_zone_paxos_info_.at(i); if (zone_paxos_info.full_replica_num_ == 1 && zone_paxos_info.logonly_replica_num_ == 1) { if (OB_FAIL(two_paxos_zones_.push_back(zone_paxos_info.zone_))) { LOG_WARN("fail to push back", K(ret), K(zone_paxos_info)); } } else { } } for (int64_t i = 0; i < ts.all_zone_unit_.count() && OB_SUCC(ret); i++) { const ZoneUnit& zu = ts.all_zone_unit_.at(i); for (int64_t j = 0; j < zu.all_unit_.count() && OB_SUCC(ret); j++) { UnitStat* u = zu.all_unit_.at(j); if (REPLICA_TYPE_LOGONLY == u->info_.unit_.replica_type_) { if (OB_FAIL(logonly_units_.push_back(u))) { LOG_WARN("fail to push back", K(ret)); } } } } return ret; } int ObPartitionGroupCoordinator::normal_pg_member_coordinate( common::ObArray::iterator& p, Partition& primary_partition, int64_t& task_cnt) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; bool primary_partition_migrated = false; int64_t tmp_task_cnt = 0; if (!primary_partition.can_balance()) { LOG_INFO("primary partition has no leader", K(primary_partition)); } else if (!(*p)->is_valid_quorum()) { LOG_WARN("quorum not report yet, just skip", K(ret), KPC(*p)); } else if (OB_FAIL(try_coordinate_primary_by_unit(primary_partition, primary_partition_migrated, tmp_task_cnt))) { LOG_WARN("fail to coordiante logonly replica", K(ret), K(primary_partition)); } else if (primary_partition_migrated) { task_cnt += tmp_task_cnt; } else if (!can_do_coordinate(primary_partition)) { LOG_WARN("can't do coordinate", K(ret), K(primary_partition)); } else { do { int64_t tmp_task_cnt = 0; __typeof__(p) next = p + 1; if (next >= ts.sorted_partition_.end() || (*next)->partition_idx_ != (*p)->partition_idx_ || (*next)->is_primary()) { break; } p = next; if (!(*p)->is_valid_quorum()) { LOG_WARN("quorum not report yet, just skip", K(ret), KPC(*p)); } else if (OB_FAIL( coordinate_partition_member(primary_partition, *(*p), primary_partition_migrated, tmp_task_cnt))) { LOG_WARN("coordinate partition member failed", K(ret), K(primary_partition), "partition", *(*p)); } else { task_cnt += tmp_task_cnt; if (primary_partition_migrated) { break; } } } while (OB_SUCC(ret)); } return ret; } // aligned step: // 1. Using the first partition of pg as a template, called the primary partition, // first solve the alignment of the primary partition; // 1.1 try_coordinate_primary_by_unit: Ensure that the replica of type L is stored on the logonly unit, // and if there is a logonly unit, the logonly replica must be on the logonlg unit; // 1.2 In the case where the primary partition is already aligned, solve the alignment of the remaining partitions // 2.1 migrate_primary_partition: For each replica in pg, // check whether the destination OBS can migrate in occasionally when aligning, // if not, try to migrate back // 2.2 Solve the alignment of each type by type without moving back // 2.2.1 There are two cases of paxos copy alignment. // One is that there are two replicas of paxos in the zone; // the other is that there is only one replica of paxos in the zone; // 2.2.2 Solve alignment issues for other replica types int ObPartitionGroupCoordinator::coordinate_pg_member(int64_t& task_cnt) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; task_cnt = 0; two_paxos_zones_.reset(); logonly_units_.reset(); if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (!ts.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), "tenant stat valid", ts.is_valid()); } else if (OB_FAIL(prepare())) { LOG_WARN("fail to prepare", K(ret)); } else { FOREACH_X(p, ts.sorted_partition_, OB_SUCCESS == ret) { if (OB_FAIL(check_stop())) { LOG_WARN("balancer stop", K(ret)); break; } else { if ((*p)->is_primary() && task_cnt > ObRebalanceTaskMgr::ONCE_ADD_TASK_CNT) { break; } } if ((*p)->is_primary()) { const uint64_t partition_entity_id = (*p)->table_id_; Partition& primary_partition = *(*p); if (OB_FAIL(normal_pg_member_coordinate(p, primary_partition, task_cnt))) { LOG_WARN("fail to do normal pg member coordinate", K(ret)); } } } } return ret; } int ObPartitionGroupCoordinator::try_coordinate_primary_by_unit( const Partition& primary_partition, bool& primary_partition_migrated, int64_t& task_count) { int ret = OB_SUCCESS; task_count = 0; TenantBalanceStat& ts = *tenant_stat_; primary_partition_migrated = false; bool can_add_task = false; for (int64_t i = 0; i < logonly_units_.count() && OB_SUCC(ret) && !primary_partition_migrated; i++) { const Replica* replica_on_logonly = NULL; const Replica* logonly_replica = NULL; FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret)) { if (!ppr->is_in_service()) { // nothing todo } else if (ppr->zone_ != logonly_units_.at(i)->info_.unit_.zone_) { // nothing todo } else { if (REPLICA_TYPE_LOGONLY == ppr->replica_type_) { logonly_replica = ppr; } if (REPLICA_TYPE_LOGONLY == ppr->unit_->info_.unit_.replica_type_) { replica_on_logonly = ppr; } } } // end FOR_BEGIN_END_E(ppr // Check for alignment if (OB_FAIL(ret)) { // nothing todo } else if (OB_ISNULL(logonly_replica)) { // Lack of logonly replica; waiting to make up the replica if (!OB_ISNULL(replica_on_logonly)) { // Try to move the replica on the logonly unit LOG_WARN("need coordiante because non_logonly replica in logonly unit", K(primary_partition), K(*replica_on_logonly)); primary_partition_migrated = true; Replica dest; if (OB_FAIL(get_random_dest(primary_partition, *replica_on_logonly, dest))) { LOG_WARN("fail to get random dest", K(ret)); if (OB_ENTRY_NOT_EXIST == ret) { ret = OB_SUCCESS; } else { LOG_WARN("fail to get random dest", K(ret)); } } else if (OB_FAIL(do_migrate_replica(primary_partition, *replica_on_logonly, dest, can_add_task))) { LOG_WARN("fail to migrate replica", K(ret), "partition_id", primary_partition.partition_id_, "table_id", primary_partition.table_id_, "replica", *replica_on_logonly); } else if (can_add_task) { task_count++; } } } else if (OB_ISNULL(replica_on_logonly)) { primary_partition_migrated = true; Replica dest_replica; dest_replica.zone_ = logonly_units_.at(i)->info_.unit_.zone_; dest_replica.server_ = logonly_units_.at(i)->server_; dest_replica.unit_ = logonly_units_.at(i); LOG_WARN("need coordiante becase logonly_replica not in logonly unit", K(primary_partition), K(*logonly_replica), K(dest_replica)); // Empty unit, logonly replica needs to be migrated if (OB_FAIL(do_migrate_replica(primary_partition, *logonly_replica, dest_replica, can_add_task))) { LOG_WARN("fail to migreate replica", K(ret)); } else if (can_add_task) { task_count++; } } else if (logonly_replica != replica_on_logonly) { LOG_WARN("need coordiante becase logonly_replica not in logonly unit", K(primary_partition), K(*replica_on_logonly), K(*logonly_replica)); primary_partition_migrated = true; Replica dest; // Try to move away first, if the migration fails, need to delete first and then migrate if (OB_FAIL(get_random_dest(primary_partition, *replica_on_logonly, dest))) { if (OB_ENTRY_NOT_EXIST != ret) { LOG_WARN("fail to get random dest", K(ret)); } else { // Priority to keep the replica of F if (REPLICA_TYPE_FULL == replica_on_logonly->replica_type_) { if (OB_FAIL(try_remove_and_migrate( primary_partition, *replica_on_logonly, *logonly_replica, *logonly_replica, can_add_task))) { LOG_WARN("fail to remove and migrate", K(ret)); } else if (can_add_task) { task_count++; } } else if (OB_FAIL(try_remove_and_migrate(primary_partition, *logonly_replica, *replica_on_logonly, *replica_on_logonly, can_add_task))) { LOG_WARN("fail to remove and migrate", K(ret)); } else if (can_add_task) { task_count++; } } } else if (OB_FAIL(do_migrate_replica(primary_partition, *replica_on_logonly, dest, can_add_task))) { LOG_WARN("fail to do migrare replica", K(ret), K(primary_partition), K(*replica_on_logonly), K(dest)); } else if (can_add_task) { task_count++; } } else { // nothing todo } } // end for (int64_t i = 0; i < logonly_zones return ret; } int ObPartitionGroupCoordinator::coordinate_partition_member( const Partition& primary_partition, Partition& partition, bool& primary_partition_migrated, int64_t& task_count) { int ret = OB_SUCCESS; task_count = 0; TenantBalanceStat& ts = *tenant_stat_; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else if (!ts.is_valid() || OB_INVALID_ID == primary_partition.table_id_ || OB_INVALID_ID == partition.table_id_ || primary_partition.tablegroup_id_ != partition.tablegroup_id_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), "tenant stat valid", ts.is_valid(), K(primary_partition), K(partition)); } // ppr : primary partition replica primary_partition_migrated = false; bool skip_coordinate_paxos_replica = false; int64_t tmp_task_count = 0; // 1. Align the replica of the paxos type first // 2. Then align replicas of non-paxos types if (OB_SUCC(ret)) { // Check whether ppr needs to be migrated back. // Need to iterate all the partitions until a suitable location is found for migration back if (OB_FAIL(migrate_primary_partition(primary_partition, partition, primary_partition_migrated, tmp_task_count))) { LOG_WARN("migrate primary partition", K(ret), K(primary_partition), K(partition)); } else if (primary_partition_migrated) { task_count += tmp_task_count; // nothing todo } else if (!can_do_coordinate(partition)) { LOG_WARN("fail to to coordinate", K(partition)); } else if (OB_FAIL(coordinate_paxos_replica( primary_partition, partition, tmp_task_count, skip_coordinate_paxos_replica))) { LOG_WARN("fail to process replica in same addr", K(ret)); } else if (0 == tmp_task_count // Do only one replica at a time && !skip_coordinate_paxos_replica) { if (OB_FAIL(coordinate_non_paxos_replica(primary_partition, partition, tmp_task_count))) { LOG_WARN("fail to process replica in same zone", K(ret), K(primary_partition), K(partition)); } else { task_count += tmp_task_count; } } else { task_count += tmp_task_count; } } return ret; } // Try to find an available machine on the same server/zone in the partition for migration; // Check whether the primary partition needs to find the correct unit to migrate. // In addition, if there is a block, need to migrate back or find the correct unit to migrate. int ObPartitionGroupCoordinator::migrate_primary_partition( const Partition& primary_partition, const Partition& partition, bool& primary_partition_migrated, int64_t& task_cnt) { int ret = OB_SUCCESS; Replica dest; primary_partition_migrated = false; task_cnt = 0; TenantBalanceStat& ts = *tenant_stat_; bool can_add_task = false; FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret) && !primary_partition_migrated) { if (!ppr->is_in_service()) { // nothing todo } else if (!ppr->server_->need_rebalance()) { // nothing todo } else { primary_partition_migrated = true; if (OB_FAIL(get_migrate_dest(primary_partition, *ppr, partition, dest))) { if (OB_ENTRY_NOT_EXIST == ret) { ret = OB_SUCCESS; } else { LOG_WARN("fail to get migrate dest addr", K(ret), K(primary_partition), K(*ppr), K(partition)); } } else if (OB_FAIL(do_migrate_replica(primary_partition, *ppr, dest, can_add_task))) { LOG_WARN("fail to build migrate task", KR(ret)); } else if (can_add_task) { task_cnt++; LOG_INFO("migrate primary partition back", K(ret), K(primary_partition), K(dest)); } } } return ret; } // Only handle the alignment of the paxos replica of the zone containing two replicas of paxos int ObPartitionGroupCoordinator::coordinate_paxos_replica(const Partition& primary_partition, const Partition& partition, int64_t& task_count, bool& skip_coordinate_paxos_replica) { int ret = OB_SUCCESS; task_count = 0; skip_coordinate_paxos_replica = false; if (OB_FAIL(ret)) { // nothing todo } else if (two_paxos_zones_.count() <= 0) { // nothing todo } else if (OB_FAIL(coordinate_specific_paxos_replica( primary_partition, partition, task_count, skip_coordinate_paxos_replica))) { LOG_WARN("fail to coornate sepcific paxos replica", K(ret), K(two_paxos_zones_)); } if (OB_FAIL(ret) || task_count > 0 || skip_coordinate_paxos_replica) { // nothing todo } else if (OB_FAIL(coordinate_normal_paxos_replica( primary_partition, partition, task_count, skip_coordinate_paxos_replica))) { LOG_WARN("fail to coordinate paxos replica", K(ret)); } return ret; } // If there are two replicas of paxos on the zone, first check whether the replica of F is aligned. // If the replica of F is not aligned, it needs to be migrated for alignment, // or it needs to be deleted and then migrated for alignment // Secondly, under the premise that the replica of F is aligned, the alignment of the replica of L is performed int ObPartitionGroupCoordinator::coordinate_specific_paxos_replica(const Partition& primary_partition, const Partition& partition, int64_t& task_count, bool& skip_coordinate_paxos_replica) { int ret = OB_SUCCESS; skip_coordinate_paxos_replica = false; task_count = 0; TenantBalanceStat& ts = *tenant_stat_; bool can_add_task = false; for (int64_t i = 0; i < two_paxos_zones_.count() && OB_SUCC(ret) && task_count == 0; i++) { const ObZone& zone = two_paxos_zones_.at(i); Replica* ppr_full_replica = NULL; Replica* ppr_logonly_replica = NULL; FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!ppr->is_in_service()) { // not a normal replica } else if (!ppr->is_valid_paxos_member() || ppr->zone_ != zone) { // nothing todo } else if (REPLICA_TYPE_LOGONLY == ppr->replica_type_) { if (REPLICA_TYPE_LOGONLY != ppr->unit_->info_.unit_.replica_type_) { ret = OB_ERR_UNEXPECTED; LOG_WARN("primary partition not coordinate now", K(ret), K(*ppr), "unit", ppr->unit_->info_); } else if (OB_ISNULL(ppr_logonly_replica)) { ppr_logonly_replica = ppr; } else { ret = OB_ERR_UNEXPECTED; LOG_ERROR("there are two logonly replica in zone", K(ret), K(zone), K(primary_partition)); } } else if (REPLICA_TYPE_FULL == ppr->replica_type_) { if (REPLICA_TYPE_LOGONLY == ppr->unit_->info_.unit_.replica_type_) { ret = OB_ERR_UNEXPECTED; LOG_WARN("primary partition not coordinate now", K(ret), K(*ppr), "unit", ppr->unit_->info_); } else if (OB_ISNULL(ppr_full_replica)) { ppr_full_replica = ppr; } else { ret = OB_ERR_UNEXPECTED; LOG_ERROR("there are two full replica in zone", K(ret), K(zone), K(primary_partition)); } } } // end FOR_BEGIN_END_E Replica* pr_full_replica = NULL; Replica* pr_logonly_replica = NULL; Replica* pr_barrier_full = NULL; Replica* pr_barrier_logonly = NULL; FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!pr->is_in_service()) { // not a normal replica } else if (!pr->is_valid_paxos_member() || pr->zone_ != zone) { // nothing todo } else if (REPLICA_TYPE_LOGONLY == pr->replica_type_) { if (OB_ISNULL(pr_logonly_replica)) { pr_logonly_replica = pr; } else { ret = OB_ERR_UNEXPECTED; LOG_ERROR("there are two logonly replica in zone", K(ret), K(zone), K(partition)); } } else if (REPLICA_TYPE_FULL == pr->replica_type_) { if (OB_ISNULL(pr_full_replica)) { pr_full_replica = pr; } else { ret = OB_ERR_UNEXPECTED; LOG_ERROR("there are two full replica in zone", K(ret), K(zone), K(partition)); } } if (!OB_ISNULL(ppr_full_replica) && pr->server_->server_ == ppr_full_replica->server_->server_) { pr_barrier_full = pr; } if (!OB_ISNULL(ppr_logonly_replica) && pr->server_->server_ == ppr_logonly_replica->server_->server_) { pr_barrier_logonly = pr; } } // end FOR_BEGIN_END_E(pr, partition // Start to check whether it needs to be migrated or deleted // First check whether the replica of F needs coordinate // Second, check whether the replica of L needs coordinate if (OB_FAIL(ret)) { // nothing todo } else if (OB_ISNULL(ppr_full_replica) || OB_ISNULL(pr_full_replica)) { // nothing todo } else if (ppr_full_replica->server_->server_ == pr_full_replica->server_->server_) { // nothing todo } else { // need to migrate if (OB_ISNULL(pr_barrier_full)) { skip_coordinate_paxos_replica = true; // try migrate if (OB_FAIL(do_migrate_replica(partition, *pr_full_replica, *ppr_full_replica, can_add_task))) { LOG_WARN("fail to do migrate replica", K(ret)); } else if (can_add_task) { task_count++; } } else if (OB_FAIL(try_remove_and_migrate( partition, *pr_full_replica, *pr_barrier_full, *ppr_full_replica, can_add_task))) { LOG_WARN("fail to delete and migrate", K(ret)); skip_coordinate_paxos_replica = true; } else if (can_add_task) { skip_coordinate_paxos_replica = true; task_count++; } } // end check FULL replica // begin to check logonly replica if (OB_FAIL(ret) || skip_coordinate_paxos_replica) { // nothing todo } else if (OB_ISNULL(ppr_logonly_replica) || OB_ISNULL(pr_logonly_replica)) { // nothing todo } else if (ppr_logonly_replica->server_->server_ == pr_logonly_replica->server_->server_) { // nothing todo } else { // need to migrate if (OB_ISNULL(pr_barrier_logonly)) { skip_coordinate_paxos_replica = true; // try migrate if (OB_FAIL(do_migrate_replica(partition, *pr_logonly_replica, *ppr_logonly_replica, can_add_task))) { LOG_WARN("fail to do migrate replica", K(ret)); } else if (can_add_task) { task_count++; } } else if (OB_FAIL(try_remove_and_migrate( partition, *pr_logonly_replica, *pr_barrier_logonly, *ppr_logonly_replica, can_add_task))) { LOG_WARN("fail to delete and migrate", K(ret)); skip_coordinate_paxos_replica = true; } else if (can_add_task) { skip_coordinate_paxos_replica = true; task_count++; } } // end check logonly replica } // end for (int64_t i = 0; i < two_paxos_zones_.count() return ret; } int ObPartitionGroupCoordinator::coordinate_normal_paxos_replica(const Partition& primary_partition, const Partition& partition, int64_t& task_count, bool& skip_coordinate_paxos_replica) { int ret = OB_SUCCESS; task_count = 0; skip_coordinate_paxos_replica = false; TenantBalanceStat& ts = *tenant_stat_; bool can_add_task = false; // Handling of paxos type replicas separately FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!ppr->is_valid_paxos_member()) { continue; } else if (!ppr->server_->can_migrate_in() || !ppr->unit_->in_pool_) { skip_coordinate_paxos_replica = true; continue; } else if (has_exist_in_array(two_paxos_zones_, ppr->zone_)) { // nothing todo } else { Replica* paxos_replica = NULL; Replica* same_addr = NULL; FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!pr->is_in_service()) { continue; } if (pr->zone_ == ppr->zone_ && pr->is_valid_paxos_member() && pr->server_->can_migrate_in()) { paxos_replica = &(*pr); } if (pr->server_->server_ == ppr->server_->server_) { same_addr = &(*pr); } } // FOR_BEGIN_END_E(pr if (OB_SUCC(ret) && 0 == task_count) { if (NULL == paxos_replica) { // Missing replica of paxos; waiting for additional replicas of other disaster recovery processes // nothing to do } else if (NULL == same_addr) { skip_coordinate_paxos_replica = true; // There is an unaligned replica of paxos type, which needs to be migrated; //(it may also need to do type conversion, but we only do one operation in one step) if (OB_FAIL(do_migrate_replica(partition, *paxos_replica, *ppr, can_add_task))) { LOG_WARN("fail to migrate replica", K(ret), K(partition), "target", *paxos_replica, "dest replica", *ppr); } else if (can_add_task) { task_count++; } } else if (same_addr == paxos_replica) { // An aligned replica of the paxos type exists, but type conversion may be required if (ppr->replica_type_ == paxos_replica->replica_type_) { // nothing todo } else { // Type conversion can be moved outside to do. Avoid comparing locality } } else { skip_coordinate_paxos_replica = true; // There are two replicas, for example, ppr is located on machine A, // but partition R is located on A, and F is located on machine B // Need to delete first, then migrate if (OB_FAIL(do_remove_and_migrate(partition, *paxos_replica, *ppr, can_add_task))) { LOG_WARN( "fail to do remove and migrate", K(ret), K(partition), "target", *paxos_replica, "dest replica", *ppr); } else if (can_add_task) { task_count++; } } } } } return ret; } int ObPartitionGroupCoordinator::coordinate_non_paxos_replica( const Partition& primary_partition, Partition& partition, int64_t& task_count) { int ret = OB_SUCCESS; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } ObArray ppr_coordinated_replica; // Record the aligned ppr ObArray pr_coordinated_replica; // Record the aligned pr task_count = 0; //(1) On the same OBS, see if there is a replica, if there are and the types are the same, do nothing, // otherwise perform type conversion //(2) In the same zone, check to see if there is no replica of the same type, and if there is one, perform a // migration. // Otherwise, do a type conversion first.(Need to exclude aligned replicas). if (OB_SUCC(ret)) { if (OB_FAIL(process_non_paxos_replica_in_same_addr( primary_partition, partition, ppr_coordinated_replica, pr_coordinated_replica, task_count))) { LOG_WARN("fail to process replica in same addr", K(ret)); } else if (task_count == 0 // Do only one replica at a time && OB_FAIL(process_non_paxos_replica_in_same_zone( primary_partition, partition, ppr_coordinated_replica, pr_coordinated_replica, task_count))) { LOG_WARN("fail to process replica in same zone", K(ret), K(primary_partition), K(partition)); } } return ret; } int ObPartitionGroupCoordinator::process_non_paxos_replica_in_same_zone(const Partition& primary_partition, Partition& partition, ObIArray& ppr_coordinated_replica, ObIArray& pr_coordinated_replica, int64_t& task_count) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; bool can_add_task = false; FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!ppr->is_in_service() || !ppr->server_->active_ || !ppr->server_->online_ || !ppr->unit_->in_pool_ || ppr->server_->server_ != ppr->unit_->info_.unit_.server_) { continue; } else if (ObReplicaTypeCheck::is_paxos_replica_V2(ppr->replica_type_)) { continue; } else if (has_exist_in_array(ppr_coordinated_replica, &(*ppr))) { continue; } bool found_addr = false; FOR_BEGIN_END_E(pr, partition, ts.all_replica_, !found_addr && OB_SUCCESS == ret) { if (!pr->is_in_service() || has_exist_in_array(pr_coordinated_replica, &(*pr))) { continue; } else if (ObReplicaTypeCheck::is_paxos_replica_V2(pr->replica_type_)) { continue; } else if (pr->zone_ == ppr->zone_) { // need to migrate; found_addr = true; if (OB_FAIL(do_migrate_replica(partition, *pr, *ppr, can_add_task))) { LOG_WARN("fail to do migrate", K(ret), K(partition), K(primary_partition)); } else if (OB_FAIL(pr_coordinated_replica.push_back(&(*pr)))) { LOG_WARN("fail to push back", K(ret)); } else if (can_add_task) { task_count++; } } } } return ret; } int ObPartitionGroupCoordinator::process_non_paxos_replica_in_same_addr(const Partition& primary_partition, Partition& partition, ObIArray& ppr_coordinated_replica, ObIArray& pr_coordinated_replica, int64_t& task_count) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; FOR_BEGIN_END_E(ppr, primary_partition, ts.all_replica_, OB_SUCC(ret) && 0 == task_count) { if (!ppr->is_in_service() || !ppr->server_->can_migrate_in() || !ppr->unit_->in_pool_ || ppr->server_->server_ != ppr->unit_->info_.unit_.server_) { continue; } else if (ObReplicaTypeCheck::is_paxos_replica_V2(ppr->replica_type_)) { continue; } FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCCESS == ret) { if (pr->is_in_service() && ppr->server_->server_ == pr->server_->server_) { if (pr->replica_type_ == ppr->replica_type_) { // nothing todo } else if (ObReplicaTypeCheck::is_paxos_replica_V2(pr->replica_type_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("fail to be here", K(ret), K(primary_partition), K(*pr), K(partition)); } else { // Don't do type conversion here // For the time being, the situation where there are three types of replicas in a zone will not be processed // If there are three types of replicas in a zone, // except for the paxos type, the other two types should also be aligned. // At this time, the problem of type conversion may be involved. } if (OB_SUCC(ret)) { if (OB_FAIL(pr_coordinated_replica.push_back(pr))) { LOG_WARN("fail to push back coordinated replica", K(ret), K(*pr)); } else if (OB_FAIL(ppr_coordinated_replica.push_back(&(*ppr)))) { LOG_WARN("fail to push back", K(ret)); } } } // if server=server } // end FOR_BEGIN_END_E(pr, partition } // end FOR_BEGIN_END_E(ppr, primary_partition return ret; } // 14x new deployment method: F/L@zone //(1) Check whether a zone has only one paxos type; // if a zone has multiple replicas of paxos, it will affect the coordinate // NOTE: In the pg coordinate stage, type conversion is not considered. // In such a situation, skip all //(2) If there is a unit migration operation, do not coordinate; // wait for the unit migration to be completed bool ObPartitionGroupCoordinator::can_do_coordinate(const Partition& partition) { bool bret = true; int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; if (!ts.is_valid()) { bret = false; ret = OB_ERR_UNEXPECTED; LOG_WARN("get invalid tenant stat", K(ret)); } else { FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCC(ret) && bret) { if (!pr->is_in_service()) { continue; } if (pr->server_->server_ != pr->unit_->info_.unit_.server_) { bret = false; LOG_INFO("can not do coordiante now, partition have unit balance task todo", K(partition), "unit", pr->unit_->info_.unit_, "replica_addr", pr->server_->server_); } } } return bret; ; } // If the replica that you want to delete is an L replica, // need to determine whether there is a lack of PAXOS replicas. // If the paxos replica is not enough, you cannot delete int ObPartitionGroupCoordinator::try_remove_and_migrate(const Partition& partition, const Replica& to_migrate_replica, const Replica& to_delete_replica, const Replica& migrate_dest_in_ppr, bool& can_add_task) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; can_add_task = false; if (!ts.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get invalid tenant stat", K(ret)); } else if (ObReplicaTypeCheck::is_paxos_replica_V2(to_delete_replica.replica_type_)) { if (partition.valid_member_cnt_ < partition.schema_replica_cnt_) { ret = OB_NOT_SUPPORTED; LOG_WARN("already lost paxos replica, refuse to delete one", K(ret), K(partition)); } } if (OB_FAIL(ret)) { // nothing todo } else if (OB_FAIL(do_remove_replica(partition, to_delete_replica))) { LOG_WARN("fail to remove replica", K(ret)); } else if (OB_FAIL(do_migrate_replica(partition, to_migrate_replica, migrate_dest_in_ppr, can_add_task))) { LOG_WARN("fail to migrate replica", K(ret)); } return ret; } // partition: objects to be deleted and migrated; // First find the replica on the machine where the ppr in the partition is located, and delete it; // then migrate the replica to the machine where the ppr is located; int ObPartitionGroupCoordinator::do_remove_and_migrate( const Partition& partition, const Replica& replica, const Replica& ppr, bool& can_add_task) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; can_add_task = false; if (!ts.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get invalid tenant stat", K(ret)); } else { // find replica in ppr->server in partition and remove it FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCC(ret)) { if (pr->is_in_service() && pr->server_->server_ == ppr.server_->server_) { if (ObReplicaTypeCheck::is_paxos_replica_V2(pr->replica_type_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN( "should not remove paxos member in partition group", K(ret), K(partition), K(replica), K(ppr), K(*pr)); } else if (OB_FAIL(do_remove_replica(partition, *pr))) { LOG_WARN("fail to remove replica", K(ret), K(partition), K(replica), K(ppr)); } else if (OB_FAIL(do_migrate_replica(partition, replica, ppr, can_add_task))) { LOG_WARN("fail to migrate replica", K(ret), K(partition), K(replica), K(ppr)); } break; } } } return ret; } int ObPartitionGroupCoordinator::do_remove_replica(const Partition& partition, const Replica& replica) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; ObPartitionKey key; OnlineReplica remove_member; remove_member.member_ = ObReplicaMember(replica.server_->server_, replica.member_time_us_, replica.replica_type_); remove_member.zone_ = replica.zone_; const char* comment = balancer::PG_COORDINATE_REMOVE_REPLICA; if (!ts.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get invalid tenant stat", K(ret)); } else if (OB_FAIL(ts.gen_partition_key(partition, key))) { LOG_WARN("fail to gen partition key", K(ret), K(partition)); } else if (ObReplicaTypeCheck::is_paxos_replica_V2(replica.replica_type_)) { ObRemoveMemberTask task; ObRemoveMemberTaskInfo task_info; common::ObArray task_info_array; int64_t quorum = 0; if (replica.is_in_blacklist(ObRebalanceTaskType::MEMBER_CHANGE, replica.server_->server_, tenant_stat_)) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("replica member change frequent failed, now in black list", K(replica)); } else if (OB_FAIL(ts.get_remove_replica_quorum_size(partition, replica.zone_, replica.replica_type_, quorum))) { LOG_WARN("fail to get quorum size", K(ret)); } else if (OB_FAIL(task_info.build(remove_member, key, quorum, partition.get_quorum()))) { LOG_WARN("fail to build remove member task info", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_FAIL(task.build(task_info_array, remove_member.member_.get_server(), comment))) { LOG_WARN("fail to build remove member task", K(ret)); } else if (OB_FAIL(task_mgr_->add_task(task))) { LOG_WARN("fail to add task", K(ret), K(replica)); } else { LOG_INFO("add task to remove member for coordinate", K(replica), K(partition)); } } else { ObRemoveNonPaxosTask task; ObRemoveNonPaxosTaskInfo task_info; common::ObArray task_info_array; if (replica.is_in_blacklist( ObRebalanceTaskType::REMOVE_NON_PAXOS_REPLICA, replica.server_->server_, tenant_stat_)) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("task in black list", K(replica)); } else if (OB_FAIL(task_info.build(remove_member, key))) { LOG_WARN("fail to build remove non paxos replica task info", K(ret), K(key)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_FAIL(task.build(task_info_array, remove_member.member_.get_server(), comment))) { LOG_WARN("fail to build remove non paxos replcia", K(ret), K(partition), K(remove_member)); } else if (OB_FAIL(task_mgr_->add_task(task))) { LOG_WARN("add task failed", K(ret), K(task)); } else { LOG_INFO("add remove replica task for partition group coordinate", K(task)); } } return ret; } // Migrate replica to the machine where dest_replica is located // Don't worry about the migration target is the leader. // When executing, the leader will be moved first. int ObPartitionGroupCoordinator::do_migrate_replica( const Partition& partition, const Replica& replica, const Replica& dest_replica, bool& can_add_task) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; can_add_task = false; if (!ts.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get invalid tenant stat", K(ret)); } else if (!ts.has_leader_while_member_change(partition, replica.server_->server_)) { // Ignore the error code, // otherwise it will fail to initiate the migration replica task due to // the inability to align the standby database for a single zone and multiple units, // affecting the swtich_over operation LOG_WARN("can not do migrate, may no leader", K(ret), K(partition), K(replica), K(dest_replica)); } else if (replica.is_in_blacklist( ObRebalanceTaskType::MIGRATE_REPLICA, dest_replica.server_->server_, tenant_stat_)) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_INFO("replica in black list, wait", K(replica), K(dest_replica)); } else { ObMigrateReplicaTask task; ObMigrateTaskInfo task_info; common::ObArray task_info_array; ObPartitionKey key; ObReplicaMember data_source; ObReplicaMember src = ObReplicaMember( replica.server_->server_, replica.member_time_us_, replica.replica_type_, replica.get_memstore_percent()); const char* comment = balancer::PG_COORDINATE_MIGRATE; OnlineReplica dst; dst.member_ = ObReplicaMember(dest_replica.server_->server_, ObTimeUtility::current_time(), replica.replica_type_, replica.get_memstore_percent()); dst.zone_ = dest_replica.zone_; dst.member_.set_region(dest_replica.region_); dst.unit_id_ = dest_replica.unit_->info_.unit_.unit_id_; int64_t quorum = 0; int64_t transmit_data_size = 0; int64_t cluster_id = OB_INVALID_ID; bool is_restore = partition.in_physical_restore(); const obrpc::MigrateMode migrate_mode = obrpc::MigrateMode::MT_LOCAL_FS_MODE; if (OB_FAIL(ts.gen_partition_key(partition, key))) { LOG_WARN("fail to gen partition key", K(ret), K(partition)); } else if (OB_FAIL(ts.choose_data_source(partition, dst.member_, src, false, data_source, transmit_data_size, ObRebalanceTaskType::MIGRATE_REPLICA, cluster_id))) { LOG_WARN("fail to choose data source", K(ret), K(partition), K(replica)); } else if (OB_FAIL(ts.get_migrate_replica_quorum_size(partition, quorum))) { LOG_WARN("fail to get quorum size", K(ret)); } else if (FALSE_IT(task_info.set_transmit_data_size(transmit_data_size))) { // nothing todo } else if (OB_FAIL(task_info.build(migrate_mode, dst, key, src, data_source, quorum, is_restore))) { LOG_WARN("fail to build migrate task info", K(ret)); } else if (OB_FAIL(task_info_array.push_back(task_info))) { LOG_WARN("fail to push back", K(ret)); } else if (OB_FAIL(task.build(migrate_mode, task_info_array, dst.member_.get_server(), ObRebalanceTaskPriority::LOW_PRI, comment))) { LOG_WARN("fail to build migrate task", K(ret), K(key), K(partition)); } else if (OB_FAIL(task_mgr_->add_task(task))) { LOG_WARN("add task failed", K(ret), K(task)); } else { can_add_task = true; LOG_INFO("add migrate task for partition group coordinate", K(task)); } } return ret; } // Choose a migration destination for ppr // First select the replica location of the same type in this zone; secondly, // choose a replica location in this zone at will int ObPartitionGroupCoordinator::get_migrate_dest( const Partition& primary_partition, const Replica& ppr, const Partition& partition, Replica& dest) { int ret = OB_SUCCESS; if (ppr.server_->can_migrate_in()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(ppr)); } else { Replica* same_type_replica = NULL; Replica* same_zone_repilca = NULL; TenantBalanceStat& ts = *tenant_stat_; bool find = false; FOR_BEGIN_END_E(pr, partition, ts.all_replica_, OB_SUCC(ret)) { if (pr->is_in_service() && pr->zone_ == ppr.zone_) { if (pr->server_->server_ == ppr.server_->server_) { // ppr->server has been blocked // nothing todo } else if (pr->unit_->info_.unit_.replica_type_ != ppr.unit_->info_.unit_.replica_type_) { // If the unit types are not the same, can't migrate back and wait for the fault to be resolved // nothing todo } else if (pr->server_->can_migrate_in() && !ts.has_replica(primary_partition, pr->server_->server_)) { same_zone_repilca = &(*pr); if (pr->replica_type_ == ppr.replica_type_) { same_type_replica = &(*pr); } } } } // end FOR_BEGIN_END_E(pr, if (OB_SUCC(ret) && (same_type_replica != NULL || same_zone_repilca != NULL)) { if (same_type_replica != NULL) { dest = *same_type_replica; find = true; } else { dest = *same_zone_repilca; find = true; } } // The above logic may not find a suitable location, need to find a unit to move in if (OB_FAIL(ret) || find) { // nothing todo } else if (OB_FAIL(get_random_dest(primary_partition, ppr, dest))) { LOG_WARN("fail to get random dest", K(ret)); } } return ret; } int ObPartitionGroupCoordinator::get_random_dest(const Partition& primary_partition, const Replica& ppr, Replica& dest) { int ret = OB_SUCCESS; TenantBalanceStat& ts = *tenant_stat_; bool find = false; for (int64_t i = 0; i < ts.all_zone_unit_.count() && OB_SUCC(ret) && !find; i++) { const ZoneUnit& zu = ts.all_zone_unit_.at(i); if (zu.zone_ != ppr.zone_) { // nothing todo } else { for (int64_t j = 0; j < zu.all_unit_.count() && OB_SUCC(ret); j++) { UnitStat* us = zu.all_unit_.at(j); if (!us->server_->can_migrate_in() || ts.has_replica(primary_partition, us->server_->server_) || (REPLICA_TYPE_LOGONLY == us->info_.unit_.replica_type_ && ppr.unit_->info_.unit_.replica_type_ != us->info_.unit_.replica_type_)) { // nothing todo } else { find = true; dest.unit_ = us; dest.server_ = us->server_; dest.zone_ = ppr.zone_; dest.region_ = ppr.region_; } } } } if (OB_SUCC(ret) && !find) { ret = OB_ENTRY_NOT_EXIST; LOG_WARN("fail to find dest for replica", K(primary_partition), K(ppr)); } return ret; }