/**
 * Copyright (c) 2021 OceanBase
 * OceanBase CE is licensed under Mulan PubL v2.
 * You can use this software according to the terms and conditions of the Mulan PubL v2.
 * You may obtain a copy of Mulan PubL v2 at:
 *          http://license.coscl.org.cn/MulanPubL-2.0
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PubL v2 for more details.
 */

// NOTE(review): this copy of the file appears to have been mangled by text
// extraction: every template argument list (`<...>`) has been stripped, e.g.
// `const_cast(...)`, `ObLockGuard lock_guard(lock_)`, `ObIArray &infos`.
// The lost type arguments must be restored from the original repository before
// this translation unit can compile — TODO confirm against upstream source.

#define USING_LOG_PREFIX SQL_EXE

#include "ob_granule_pump.h"
#include "sql/engine/px/ob_granule_iterator_op.h"
#include "sql/engine/px/ob_granule_util.h"
#include "sql/engine/px/ob_px_util.h"
#include "sql/session/ob_basic_session_info.h"
#include "share/config/ob_server_config.h"
#include "share/schema/ob_part_mgr_util.h"
#include "sql/engine/dml/ob_table_modify_op.h"
#include "sql/engine/ob_engine_op_traits.h"
#include "sql/engine/px/ob_px_sqc_handler.h"

namespace oceanbase
{
namespace sql
{

using namespace oceanbase::share::schema;

// Whether the pump should split work at partition granularity:
// true only before any tablet has been consumed (cur_tablet_idx_ == 0) and
// when the GI attribute flags force partition granule.
bool ObGranulePumpArgs::need_partition_granule()
{
  return 0 == cur_tablet_idx_ && ObGranuleUtil::force_partition_granule(gi_attri_flag_);
}

//------------------------------end ObGranulePumpArgs

// Read-only fetch of the task located at `pos` without advancing cur_pos_.
// Collects into `info` every consecutive entry sharing the same idx_ as the
// entry at `pos` (entries with equal idx_ form one logical GI task).
// Returns OB_INVALID_ARGUMENT when `pos` is out of [0, count).
int ObGITaskSet::get_task_at_pos(ObGranuleTaskInfo &info, const int64_t &pos) const
{
  int ret = OB_SUCCESS;
  if (pos < 0 || pos >= gi_task_set_.count()) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", K(ret), K(pos));
  } else {
    int64_t cur_idx = gi_task_set_.at(pos).idx_;
    // NOTE(review): template argument of const_cast was lost in extraction.
    info.tablet_loc_ = const_cast(gi_task_set_.at(pos).tablet_loc_);
    info.ranges_.reset();
    info.ss_ranges_.reset();
    for (int64_t i = pos; OB_SUCC(ret) && i < gi_task_set_.count(); i++) {
      if (cur_idx == gi_task_set_.at(i).idx_) {
        if (OB_FAIL(info.ranges_.push_back(gi_task_set_.at(i).range_))) {
          LOG_WARN("push back ranges failed", K(ret));
        } else if (OB_FAIL(info.ss_ranges_.push_back(gi_task_set_.at(i).ss_range_))) {
          LOG_WARN("push back skip scan ranges failed", K(ret));
        }
      } else {
        // entries are grouped by idx_; a different idx_ ends this task
        break;
      }
    }
  }
  return ret;
}

// Return in `pos` the start position of the next unconsumed GI task and
// advance cur_pos_ past that whole task (all consecutive entries with the
// same idx_). Returns OB_ITER_END when every task has been consumed.
int ObGITaskSet::get_next_gi_task_pos(int64_t &pos)
{
  int ret = OB_SUCCESS;
  if (cur_pos_ == gi_task_set_.count()) {
    ret = OB_ITER_END;
  } else if (cur_pos_ > gi_task_set_.count()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("cur_pos_ is out of range", K(ret), K(cur_pos_), K(gi_task_set_.count()));
  } else {
    pos = cur_pos_;
    int64_t cur_idx = gi_task_set_.at(cur_pos_).idx_;
    for (int64_t i = cur_pos_; OB_SUCC(ret) && i < gi_task_set_.count(); i++) {
      if (cur_idx == gi_task_set_.at(i).idx_) {
        if (i == (gi_task_set_.count() - 1)) {
          // last entry belongs to this task: everything is consumed
          cur_pos_ = gi_task_set_.count();
        }
      } else {
        // first entry of the next task
        cur_pos_ = i;
        break;
      }
    }
  }
  return ret;
}

// Pop the next GI task into `info` and advance cur_pos_ past it.
// Same grouping-by-idx_ rule as get_task_at_pos/get_next_gi_task_pos.
// Returns OB_ITER_END when the task set is exhausted.
int ObGITaskSet::get_next_gi_task(ObGranuleTaskInfo &info)
{
  int ret = OB_SUCCESS;
  if (cur_pos_ == gi_task_set_.count()) {
    ret = OB_ITER_END;
  } else if (cur_pos_ > gi_task_set_.count()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("cur_pos_ is out of range", K(ret), K(cur_pos_), K(gi_task_set_.count()));
  } else {
    int64_t cur_idx = gi_task_set_.at(cur_pos_).idx_;
    info.tablet_loc_ = gi_task_set_.at(cur_pos_).tablet_loc_;
    info.ranges_.reset();
    info.ss_ranges_.reset();
    for (int64_t i = cur_pos_; OB_SUCC(ret) && i < gi_task_set_.count(); i++) {
      if (cur_idx == gi_task_set_.at(i).idx_) {
        if (OB_FAIL(info.ranges_.push_back(gi_task_set_.at(i).range_))) {
          LOG_WARN("push back ranges failed", K(ret));
        } else if (OB_FAIL(info.ss_ranges_.push_back(gi_task_set_.at(i).ss_range_))) {
          LOG_WARN("push back skip scan ranges failed", K(ret));
        }
        if (i == (gi_task_set_.count() - 1)) {
          cur_pos_ = gi_task_set_.count();
        }
      } else {
        cur_pos_ = i;
        break;
      }
    }
  }
  return ret;
}

// Deep copy of another task set, including its consumption cursor.
int ObGITaskSet::assign(const ObGITaskSet &other)
{
  int ret = OB_SUCCESS;
  IGNORE_RETURN gi_task_set_.reset();
  if (OB_FAIL(gi_task_set_.assign(other.gi_task_set_))) {
    LOG_WARN("failed to assign gi_task_set", K(ret));
  } else {
    cur_pos_ = other.cur_pos_;
  }
  return ret;
}

// Make the task order inside this (partition-wise affinitized) task set match
// the requested partition order: ascending when `asc`, else descending.
// Implemented as a conditional full reversal of the array.
int ObGITaskSet::set_pw_affi_partition_order(bool asc)
{
  int ret = OB_SUCCESS;
  if (gi_task_set_.count() <= 1) {
    // Two cases need no sorting:
    // 1. the task set is empty.
    //    (empty-check added, see aone issue: under affinitize, tasks are split
    //    at partition granularity, so when parallelism exceeds the table's
    //    partition count some task sets come out "empty"; an empty task set
    //    simply skips `set_pw_affi_partition_order`.)
    // 2. the task set holds a single partition, so there is nothing to order.
    // do nothing
  } else {
    // first we do a defensive check. if data already sorted as expected, we just skip reverse
    // FIXME YISHEN , need check
    // NOTE(review): with asc == false the first disjunct `!(asc && ...)` is
    // always true, so the reversal below is unreachable in the descending
    // case; the parentheses look wrong (likely meant `!((asc && A) || (!asc && B))`)
    // — TODO confirm against upstream.
    if (!(asc && (gi_task_set_.at(0).tablet_loc_->tablet_id_ > gi_task_set_.at(1).tablet_loc_->tablet_id_)) ||
        (!asc && (gi_task_set_.at(0).tablet_loc_->tablet_id_ < gi_task_set_.at(1).tablet_loc_->tablet_id_))) {
      // no need to reverse this taskset
    } else {
      // NOTE(review): element type of ObArray was lost in extraction.
      common::ObArray reverse_task_info;
      if (OB_FAIL(reverse_task_info.reserve(gi_task_set_.count()))) {
        LOG_WARN("fail reserve memory for array", K(ret));
      }
      for (int64_t i = gi_task_set_.count() - 1; OB_SUCC(ret) && i >=0; --i) {
        if (OB_FAIL(reverse_task_info.push_back(gi_task_set_.at(i)))) {
          LOG_WARN("failed to push back task info", K(ret));
        }
      }
      if (OB_SUCC(ret)) {
        if (OB_FAIL(gi_task_set_.assign(reverse_task_info))) {
          LOG_WARN("failed to assign task info", K(ret));
        }
      }
      LOG_TRACE("reverse this pw affinitize task info", K(ret), K(gi_task_set_));
    }
  }
  return ret;
}

// reverse block order for every partition
// When `desc`, reverse the block (range) order inside each partition while
// keeping the partitions themselves in their original order.
int ObGITaskSet::set_block_order(bool desc)
{
  int ret = OB_SUCCESS;
  if (desc && gi_task_set_.count() > 1) {
    // NOTE(review): element type of ObArray was lost in extraction.
    common::ObArray reverse_task_info;
    if (OB_FAIL(reverse_task_info.reserve(gi_task_set_.count()))) {
      LOG_WARN("fail reserve memory for array", K(ret));
    }
    int64_t lower_inclusive = 0;
    while (lower_inclusive < gi_task_set_.count() && OB_SUCC(ret)) {
      // step1: in order to reverse block inside a partition, look for partition boundary
      int64_t upper_exclusive = lower_inclusive + 1;
      for (; upper_exclusive < gi_task_set_.count() && OB_SUCC(ret); ++upper_exclusive) {
        if (gi_task_set_.at(upper_exclusive).tablet_loc_->tablet_id_.id() !=
            gi_task_set_.at(lower_inclusive).tablet_loc_->tablet_id_.id()) {
          break;
        }
      }
      // step2: reverse gi_task_set_[lower_inclusive, upper_exclusive)
      int64_t pos = upper_exclusive;
      for (;lower_inclusive < upper_exclusive && OB_SUCC(ret); ++lower_inclusive) {
        pos--;
        if (OB_FAIL(reverse_task_info.push_back(gi_task_set_.at(pos)))) {
          LOG_WARN("failed to push back task info", K(ret));
        }
      }
    }
    if (OB_SUCC(ret)) {
      if (OB_FAIL(gi_task_set_.assign(reverse_task_info))) {
        LOG_WARN("failed to assign task info", K(ret));
      }
    }
    LOG_TRACE("reverse block task info", K(ret), K(gi_task_set_));
  }
  return ret;
}

// Build gi_task_set_ from parallel arrays of tablets / ranges / idxs
// (taskset_tablets[i], taskset_ranges[i], taskset_idxs[i] describe entry i;
// ss_ranges carries at most one skip-scan range shared by all entries).
// When random_type requests randomization, entries are hashed by idx_ and
// sorted by that hash to shuffle the execution order deterministically.
// NOTE(review): the ObIArray parameters below lost their template arguments
// in extraction.
int ObGITaskSet::construct_taskset(ObIArray &taskset_tablets,
                                   ObIArray &taskset_ranges,
                                   ObIArray &ss_ranges,
                                   ObIArray &taskset_idxs,
                                   ObGIRandomType random_type)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(taskset_tablets.count() != taskset_ranges.count() ||
                  taskset_tablets.count() != taskset_idxs.count() ||
                  taskset_tablets.empty() ||
                  ss_ranges.count() > 1)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("taskset count err", K(taskset_tablets.count()), K(taskset_ranges),
             K(taskset_idxs), K(ss_ranges.count()));
  } else if (!(GI_RANDOM_NONE <= random_type && random_type <= GI_RANDOM_RANGE)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("random type err", K(random_type));
  } else if (gi_task_set_.empty() && OB_FAIL(gi_task_set_.reserve(taskset_tablets.count()))) {
    LOG_WARN("failed to prepare allocate", K(ret));
  } else {
    ObNewRange whole_range;
    whole_range.set_whole_range();
    // single shared skip-scan range; whole range when none was supplied
    ObNewRange &ss_range = ss_ranges.empty() ? whole_range : ss_ranges.at(0);
    for (int64_t i = 0; OB_SUCC(ret) && i < taskset_tablets.count(); i++) {
      ObGITaskInfo task_info(taskset_tablets.at(i), taskset_ranges.at(i), ss_range, taskset_idxs.at(i));
      if (random_type != ObGITaskSet::GI_RANDOM_NONE) {
        // deterministic shuffle key derived from the task idx
        task_info.hash_value_ = common::murmurhash(&task_info.idx_, sizeof(task_info.idx_), 0);
      }
      if (OB_FAIL(gi_task_set_.push_back(task_info))) {
        LOG_WARN("add partition key failed", K(ret));
      }
    }
    if (OB_SUCC(ret) && random_type != GI_RANDOM_NONE) {
      auto compare_fun = [](const ObGITaskInfo &a, const ObGITaskInfo &b) -> bool {
        return a.hash_value_ > b.hash_value_;
      };
      std::sort(gi_task_set_.begin(), gi_task_set_.end(), compare_fun);
    }
  }
  return ret;
}

///////////////////////////////////////////////////////////////////////////////////////

// Fetch one partition-wise-join task bundle (one ObGranuleTaskInfo per op in
// `op_ids`) for the given worker. Only valid for the two partition-wise
// splitter types; OB_ITER_END is propagated when tasks are exhausted.
// NOTE(review): ObIArray parameters lost their template arguments in extraction.
int ObGranulePump::try_fetch_pwj_tasks(ObIArray &infos,
                                       const ObIArray &op_ids,
                                       int64_t worker_id)
{
  int ret = OB_SUCCESS;
  /*try get gi task*/
  if (GIT_UNINITIALIZED == splitter_type_) {
    ret = OB_NOT_INIT;
    LOG_WARN("granule pump is not init", K(ret));
  } else if (worker_id < 0) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("thread_id out of range", K(ret), K(worker_id));
  } else {
    switch(splitter_type_) {
      case GIT_FULL_PARTITION_WISE :
        if (OB_FAIL(fetch_pw_granule_from_shared_pool(infos, op_ids))) {
          if (ret != OB_ITER_END) {
            LOG_WARN("fetch granule from shared pool failed", K(ret));
          }
        }
        break;
      case GIT_PARTITION_WISE_WITH_AFFINITY:
        if (OB_FAIL(fetch_pw_granule_by_worker_id(infos, op_ids, worker_id))) {
          if (ret != OB_ITER_END) {
            LOG_WARN("fetch pw granule by worker id failed", K(ret), K(worker_id));
          }
        }
        break;
      default:
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("unexpected type", K(ret), K(splitter_type_));
    }
  }
  return ret;
}

// Fetch the next granule task position for one TSC op: affinitized splitters
// route through the per-worker task array, GIT_RANDOM through the shared pool.
int ObGranulePump::fetch_granule_task(const ObGITaskSet *&res_task_set,
                                      int64_t &pos,
                                      int64_t worker_id,
                                      uint64_t tsc_op_id)
{
  int ret = OB_SUCCESS;
  /*try get gi task*/
  LOG_DEBUG("fetch granule task from granule pump");
  if (GIT_UNINITIALIZED == splitter_type_) {
    ret = OB_NOT_INIT;
    LOG_WARN("granule pump is not init", K(ret));
  } else if (worker_id < 0) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("thread_id out of range", K(ret), K(worker_id));
  } else {
    switch(splitter_type_) {
      case GIT_AFFINITY:
      case GIT_ACCESS_ALL:
      case GIT_FULL_PARTITION_WISE:
      case GIT_PARTITION_WISE_WITH_AFFINITY:
        if (OB_FAIL(fetch_granule_by_worker_id(res_task_set, pos, worker_id, tsc_op_id))) {
          if (ret != OB_ITER_END) {
            LOG_WARN("fetch granule by worker id failed", K(ret));
          }
        }
        break;
      case GIT_RANDOM:
        if (OB_FAIL(fetch_granule_from_shared_pool(res_task_set, pos, tsc_op_id))) {
          if (ret != OB_ITER_END) {
            LOG_WARN("fetch granule from shared pool failed", K(ret));
          }
        }
        break;
      default:
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("unexpected type", K(ret), K(splitter_type_));
    }
  }
  return ret;
}

// Per-worker fetch: each worker owns taskset_array[worker_id]; no lock is
// taken here because workers never touch each other's slot.
int ObGranulePump::fetch_granule_by_worker_id(const ObGITaskSet *&res_task_set,
                                              int64_t &pos,
                                              int64_t worker_id,
                                              uint64_t tsc_op_id)
{
  int ret = OB_SUCCESS;
  ObGITaskArray *taskset_array = nullptr;
  if (OB_FAIL(find_taskset_by_tsc_id(tsc_op_id, taskset_array))) {
    LOG_WARN("the op_id do not have task set", K(ret), K(tsc_op_id));
  } else if (OB_ISNULL(taskset_array)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("the taskset_array is null", K(ret));
  } else if (taskset_array->count() < worker_id + 1) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("the taskset_array size is invalid", K(taskset_array->count()), K(ret));
  } else {
    res_task_set = &taskset_array->at(worker_id);
    ObGITaskSet &taskset = taskset_array->at(worker_id);
    if (OB_FAIL(taskset.get_next_gi_task_pos(pos))) {
      if (OB_ITER_END != ret) {
        LOG_WARN("fail to get next gi task pos", K(ret));
      }
    } else {
      LOG_TRACE("get GI task", K(taskset), K(ret));
    }
  }
  return ret;
}

// Shared-pool fetch (GIT_RANDOM): all workers compete for the single task set
// at OB_GRANULE_SHARED_POOL_POS under lock_. no_more_task_from_shared_pool_ is
// checked once before taking the lock as a fast path and re-checked under it.
int ObGranulePump::fetch_granule_from_shared_pool(const ObGITaskSet *&res_task_set,
                                                  int64_t &pos,
                                                  uint64_t tsc_op_id)
{
  int ret = OB_SUCCESS;
  if (no_more_task_from_shared_pool_) {
    // when worker threads count >> shared task count, it performs better
    ret = OB_ITER_END;
  } else {
    // NOTE(review): template argument of ObLockGuard was lost in extraction.
    ObLockGuard lock_guard(lock_);
    if (no_more_task_from_shared_pool_) {
      ret = OB_ITER_END;
    }
    ObGITaskArray *taskset_array = nullptr;
    if (OB_FAIL(ret)) {
      //has been failed. do nothing.
    } else if (OB_FAIL(find_taskset_by_tsc_id(tsc_op_id, taskset_array))) {
      LOG_WARN("the tsc_op_id do not have task set", K(ret), K(tsc_op_id));
    } else if (OB_ISNULL(taskset_array)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("the taskset_array is null", K(ret));
    } else if (taskset_array->count() < OB_GRANULE_SHARED_POOL_POS + 1) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("taskset array count is invalid", K(ret), K(taskset_array->count()));
    } else {
      res_task_set = &taskset_array->at(OB_GRANULE_SHARED_POOL_POS);
      ObGITaskSet &taskset = taskset_array->at(OB_GRANULE_SHARED_POOL_POS);
      if (OB_FAIL(taskset.get_next_gi_task_pos(pos))) {
        if (OB_ITER_END != ret) {
          LOG_WARN("fail to get next gi task pos", K(ret));
        } else {
          // remember exhaustion so later callers can skip the lock
          no_more_task_from_shared_pool_ = true;
        }
      } else {
        LOG_TRACE("get GI task", K(taskset), K(ret));
      }
    }
  }
  return ret;
}

// Partition-wise-with-affinity fetch: for each op in `op_ids`, pop the next
// task from that op's per-worker task set. All ops must hit OB_ITER_END
// together (verified by check_pw_end).
// NOTE(review): ObIArray parameters lost their template arguments in extraction.
int ObGranulePump::fetch_pw_granule_by_worker_id(ObIArray &infos,
                                                 const ObIArray &op_ids,
                                                 int64_t thread_id)
{
  int ret = OB_SUCCESS;
  int64_t end_tsc_count = 0;
  if (GIT_PARTITION_WISE_WITH_AFFINITY != splitter_type_) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("only partition wise join granule pump offer this service", K(splitter_type_), K(ret));
  }
  ARRAY_FOREACH_X(op_ids, idx, cnt, OB_SUCC(ret)) {
    ObGITaskArray *taskset_array = nullptr;
    ObGranuleTaskInfo info;
    uint64_t op_id = op_ids.at(idx);
    if (OB_FAIL(find_taskset_by_tsc_id(op_id, taskset_array))) {
      LOG_WARN("the op_id do not have task set", K(ret), K(op_id));
    } else if (OB_ISNULL(taskset_array)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("the taskset_array is null", K(ret));
    } else if (taskset_array->count() < thread_id + 1) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("the taskset_array size is invalid", K(taskset_array->count()), K(ret));
    } else if (OB_FAIL(taskset_array->at(thread_id).get_next_gi_task(info))) {
      if (ret != OB_ITER_END) {
        LOG_WARN("failed to get info", K(ret));
      } else {
        // this op is exhausted; keep iterating the remaining ops
        ret = OB_SUCCESS;
        end_tsc_count++;
      }
    } else if (OB_FAIL(infos.push_back(info))) {
      LOG_WARN("push back task info failed", K(ret));
    }
  }
  if (OB_FAIL(ret)) {
  } else if (OB_FAIL(check_pw_end(end_tsc_count, op_ids.count(), infos.count()))) {
    LOG_WARN("incorrect state", K(ret));
  }
  LOG_TRACE("get a new partition wise join gi tasks", K(infos), K(ret));
  return ret;
}

// Full-partition-wise fetch from the shared (single) task set of each op,
// serialized by lock_. Sets no_more_task_from_shared_pool_ when every op's
// tasks are consumed.
// NOTE(review): ObIArray parameters lost their template arguments in extraction.
int ObGranulePump::fetch_pw_granule_from_shared_pool(ObIArray &infos,
                                                     const ObIArray &op_ids)
{
  int ret = OB_SUCCESS;
  if (no_more_task_from_shared_pool_) {
    ret = OB_ITER_END;
  } else {
    // NOTE(review): template argument of ObLockGuard was lost in extraction.
    ObLockGuard lock_guard(lock_);
    // Number of ops for which no next GI task could be fetched.
    // In theory end_op_count can only be 0 (GI tasks not yet exhausted) or
    // op_ids.count() (all GI tasks fully consumed).
    int64_t end_op_count = 0;
    if (no_more_task_from_shared_pool_) {
      ret = OB_ITER_END;
    } else if (GIT_FULL_PARTITION_WISE != splitter_type_) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("only partition wise join granule pump offer this service", K(splitter_type_), K(ret));
    }
    ARRAY_FOREACH_X(op_ids, idx, cnt, OB_SUCC(ret)) {
      ObGITaskArray *taskset_array = nullptr;
      ObGranuleTaskInfo info;
      uint64_t op_id = op_ids.at(idx);
      if (OB_FAIL(find_taskset_by_tsc_id(op_id, taskset_array))) {
        LOG_WARN("the op_id do not have task set", K(ret), K(op_id));
      } else if (OB_ISNULL(taskset_array)) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("the taskset_array is null", K(ret));
      } else if (taskset_array->count() != 1) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("the taskset_array size is invalid", K(taskset_array->count()), K(ret));
      } else if (OB_FAIL(taskset_array->at(0).get_next_gi_task(info))) {
        if (ret != OB_ITER_END) {
          LOG_WARN("failed to get info", K(ret));
        } else {
          ret = OB_SUCCESS;
          end_op_count++;
        }
      } else if (OB_FAIL(infos.push_back(info))) {
        LOG_WARN("push back task info failed", K(ret));
      }
    }
    // Defensive check: under full partition wise, the GI tasks of every op
    // must run out at the same time.
    if (OB_FAIL(ret)) {
    } else if (OB_FAIL(check_pw_end(end_op_count, op_ids.count(), infos.count()))) {
      if (OB_ITER_END != ret) {
        LOG_WARN("incorrect state", K(ret));
      } else {
        no_more_task_from_shared_pool_ = true;
      }
    }
    LOG_TRACE("get a new partition wise join gi tasks", K(infos), K(ret));
  }
  return ret;
}

// Sanity check for partition-wise fetches: either no op is exhausted and
// every op yielded a task (task_count == op_count), or all ops are exhausted
// simultaneously (end_op_count == op_count, mapped to OB_ITER_END).
int ObGranulePump::check_pw_end(int64_t end_op_count, int64_t op_count, int64_t task_count)
{
  int ret = OB_SUCCESS;
  if (end_op_count !=0 && end_op_count != op_count) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("the end op count does not match partition wise join ops count",
             K(end_op_count), K(op_count), K(ret));
  } else if (end_op_count != 0) {
    ret = OB_ITER_END;
  } else if (end_op_count == 0 && task_count != op_count) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("the infos count does not match partition wise join ops count",
             K(end_op_count), K(task_count), K(op_count), K(ret));
  } else if (end_op_count == 0) {
    /* we get tasks for every tsc */
  }
  return ret;
}

/**
 * This function is special: within one SQC it may be called multiple times,
 * e.g. for a plan like
 *
 *                [Join]
 *                  |
 *           ------------
 *           |          |
 *         [Join]       GI
 *           |          |
 *       -----------   TSC3
 *       |         |
 *    EX(pkey)     GI
 *       |         |
 *      ....      TSC2
 *
 * In the SQC's setup_op_input flow, each GI that is found calls this
 * interface once. Picks the splitter type from the GI attribute flags and
 * builds gi_task_array_map_ for all scan (and optional modify) ops.
 */
int ObGranulePump::add_new_gi_task(ObGranulePumpArgs &args)
{
  int ret = OB_SUCCESS;
  partition_wise_join_ = ObGranuleUtil::pwj_gi(args.gi_attri_flag_);
  ObTableModifySpec *modify_op = args.op_info_.get_modify_op();
  // NOTE(review): template argument of ObIArray was lost in extraction.
  ObIArray &scan_ops = args.op_info_.get_scan_ops();
  LOG_DEBUG("init granule", K(args));
  // GI splits tasks not only for TSC but also for INSERT, so the map must
  // account for both the scan ops and the optional modify op.
  int map_size = 0;
  if (OB_NOT_NULL(modify_op)) {
    map_size++;
  }
  map_size += scan_ops.count();
  if (gi_task_array_map_.empty()) {
    if (OB_FAIL(gi_task_array_map_.prepare_allocate(map_size))) {
      LOG_WARN("failed to prepare allocate", K(ret));
    }
  }
  ObGITaskSet::ObGIRandomType random_type = ObGITaskSet::GI_RANDOM_NONE;
  if (OB_SUCC(ret)) {
    // only GIT_FULL_PARTITION_WISE and GIT_RANDOM are possible now
    bool can_randomize = false;
    if (OB_FAIL(check_can_randomize(args, can_randomize))) {
      LOG_WARN("check can randomize failed", K(ret));
    } else if (can_randomize) {
      random_type = ObGITaskSet::GI_RANDOM_RANGE;
      LOG_TRACE("split random task/range for online ddl and pdml");
    }
  }
  if (OB_FAIL(ret)) {
  } else if (ObGranuleUtil::access_all(args.gi_attri_flag_)) {
    splitter_type_ = GIT_ACCESS_ALL;
    ObAccessAllGranuleSplitter splitter;
    if (OB_FAIL(splitter.split_granule(args, scan_ops, gi_task_array_map_, random_type))) {
      LOG_WARN("failed to prepare access all gi task", K(ret));
    }
  } else if (ObGranuleUtil::pwj_gi(args.gi_attri_flag_) &&
             ObGranuleUtil::affinitize(args.gi_attri_flag_)) {
    splitter_type_ = GIT_PARTITION_WISE_WITH_AFFINITY;
    ObPWAffinitizeGranuleSplitter splitter;
    if (OB_FAIL(splitter.partitions_info_.assign(args.partitions_info_))) {
      LOG_WARN("Failed to assign partitions info", K(ret));
    } else if (OB_FAIL(splitter.split_granule(args, scan_ops, gi_task_array_map_, random_type))) {
      LOG_WARN("failed to prepare affinity gi task", K(ret));
    }
  } else if (ObGranuleUtil::affinitize(args.gi_attri_flag_)) {
    splitter_type_ = GIT_AFFINITY;
    ObNormalAffinitizeGranuleSplitter splitter;
    if (OB_FAIL(splitter.partitions_info_.assign(args.partitions_info_))) {
      LOG_WARN("Failed to assign partitions info", K(ret));
    } else if (OB_FAIL(splitter.split_granule(args, scan_ops, gi_task_array_map_, random_type))) {
      LOG_WARN("failed to prepare affinity gi task", K(ret));
    }
  } else if (ObGranuleUtil::pwj_gi(args.gi_attri_flag_)) {
    splitter_type_ = GIT_FULL_PARTITION_WISE;
    ObPartitionWiseGranuleSplitter splitter;
    if (OB_FAIL(splitter.split_granule(args, scan_ops, modify_op, gi_task_array_map_, random_type))) {
      LOG_WARN("failed to prepare pw gi task", K(ret));
    }
  } else {
    splitter_type_ = GIT_RANDOM;
    ObRandomGranuleSplitter splitter;
    bool partition_granule = args.need_partition_granule();
    // TODO: randomize GI
    // if (!(args.asc_order() || args.desc_order() || ObGITaskSet::GI_RANDOM_NONE != random_type)) {
    //   random_type = ObGITaskSet::GI_RANDOM_TASK;
    // }
    if (OB_FAIL(splitter.split_granule(args, scan_ops, gi_task_array_map_, random_type,
                                       partition_granule))) {
      LOG_WARN("failed to prepare random gi task", K(ret), K(partition_granule));
    }
  }
  return ret;
}

// Randomized task order is allowed only for DDL or PDML plans, and never when
// the SQL demands a specific scan order (asc/desc flag set).
int ObGranulePump::check_can_randomize(ObGranulePumpArgs &args, bool &can_randomize)
{
  int ret = OB_SUCCESS;
  ObSQLSessionInfo *my_session = nullptr;
  ObPhysicalPlanCtx *phy_plan_ctx = NULL;
  const ObPhysicalPlan *phy_plan = NULL;
  bool need_start_ddl = false;
  bool need_start_pdml = false;
  if (OB_ISNULL(args.ctx_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("error unexpected, arg ctx must not be nullptr", K(ret));
  } else if (OB_ISNULL(my_session = GET_MY_SESSION(*args.ctx_))) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("error unexpected, session must not be nullptr", K(ret));
  } else if (my_session->get_ddl_info().is_ddl()) {
    need_start_ddl = true;
  }
  if(OB_SUCC(ret)) {
    if (OB_ISNULL(phy_plan_ctx = GET_PHY_PLAN_CTX(*args.ctx_)) ||
        OB_ISNULL(phy_plan = phy_plan_ctx->get_phy_plan())) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("some params are NULL", K(ret), K(phy_plan_ctx), K(phy_plan));
    } else if(phy_plan->is_use_pdml()) {
      need_start_pdml = true;
    }
  }
  // Only when in ddl and pdml, can randomize. Specially, can not randomize when sql specifies the order
  can_randomize = (need_start_ddl || need_start_pdml) &&
                  (!(ObGranuleUtil::asc_order(args.gi_attri_flag_) ||
                     ObGranuleUtil::desc_order(args.gi_attri_flag_)));
  LOG_DEBUG("scan order is ", K(ObGranuleUtil::asc_order(args.gi_attri_flag_)),
            K(ObGranuleUtil::desc_order(args.gi_attri_flag_)), K(can_randomize),
            K(need_start_ddl), K(need_start_pdml));
  return ret;
}

// Release all task arrays and pump arguments.
void ObGranulePump::destroy()
{
  gi_task_array_map_.reset();
  pump_args_.reset();
}

// Drop only the split task arrays, keeping the pump arguments.
void ObGranulePump::reset_task_array()
{
  gi_task_array_map_.reset();
}

// Number of range entries in the first TSC's first task set; errors when the
// pump holds no task array yet.
int ObGranulePump::get_first_tsc_range_cnt(int64_t &cnt)
{
  int ret = OB_SUCCESS;
  if (gi_task_array_map_.empty()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("error happen, empty task array", K(ret));
  } else {
    ObGITaskArray &taskset_array = gi_task_array_map_.at(0).taskset_array_;
    if (taskset_array.empty()) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected taskset array", K(ret));
    } else {
      cnt = taskset_array.at(0).gi_task_set_.count();
    }
  }
  return ret;
}
int64_t ObGranulePump::to_string(char *buf, const int64_t buf_len) const { int64_t pos = 0; J_OBJ_START(); J_KV(K_(parallelism), K_(tablet_size), K_(partition_wise_join)); J_OBJ_END(); return pos; } /////////////////////////////////////////////////////////////////////////////////////// int ObGranuleSplitter::split_gi_task(ObGranulePumpArgs &args, const ObTableScanSpec *tsc, int64_t table_id, int64_t op_id, const common::ObIArray &tablets, bool partition_granule, ObGITaskSet &task_set, ObGITaskSet::ObGIRandomType random_type) { int ret = OB_SUCCESS; ObSEArray ranges; ObSEArray ss_ranges; DASTabletLocSEArray taskset_tablets; ObSEArray taskset_ranges; ObSEArray taskset_idxs; bool range_independent = random_type == ObGITaskSet::GI_RANDOM_RANGE; if (0 > args.parallelism_ || OB_ISNULL(tsc)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the parallelism is invalid", K(ret), K(args.parallelism_), K(tsc)); } else if (0 > args.tablet_size_) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the parallelism is invalid", K(ret), K(args.tablet_size_)); } else if (tablets.count() <= 0 || OB_ISNULL(args.ctx_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the task has an empty tablets", K(ret), K(tablets)); } else if (OB_FAIL(get_query_range(*args.ctx_, tsc->get_query_range(), ranges, ss_ranges, table_id, op_id, partition_granule, ObGranuleUtil::with_param_down(args.gi_attri_flag_)))) { LOG_WARN("get query range failed", K(ret)); } else if (ranges.count() <= 0) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the task has an empty range", K(ret), K(ranges)); } else { bool is_external_table = tsc->tsc_ctdef_.scan_ctdef_.is_external_table_; if (is_external_table) { ret = ObGranuleUtil::split_granule_for_external_table(args.ctx_->get_allocator(), tsc, ranges, tablets, args.external_table_files_, args.parallelism_, taskset_tablets, taskset_ranges, taskset_idxs); } else { ret = ObGranuleUtil::split_block_ranges(*args.ctx_, args.ctx_->get_allocator(), tsc, ranges, tablets, args.parallelism_, args.tablet_size_, partition_granule, 
taskset_tablets, taskset_ranges, taskset_idxs, range_independent); } if (OB_FAIL(ret)) { LOG_WARN("failed to get granule task", K(ret), K(ranges), K(tablets), K(is_external_table)); } else if (OB_FAIL(task_set.construct_taskset(taskset_tablets, taskset_ranges, ss_ranges, taskset_idxs, random_type))) { LOG_WARN("construct taskset failed", K(ret), K(taskset_tablets), K(taskset_ranges), K(ss_ranges), K(taskset_idxs), K(random_type)); } } return ret; } int ObGranuleSplitter::get_query_range(ObExecContext &ctx, const ObQueryRange &tsc_pre_query_range, ObIArray &ranges, ObIArray &ss_ranges, int64_t table_id, int64_t op_id, bool partition_granule, bool with_param_down /* = false */) { int ret = OB_SUCCESS; ObQueryRangeArray scan_ranges; ObQueryRangeArray skip_scan_ranges; ObPhysicalPlanCtx *plan_ctx = nullptr; bool has_extract_query_range = false; // 如果tsc有对应的query range,就预先抽取对应的query range LOG_DEBUG("set partition granule to whole range", K(table_id), K(op_id), K(partition_granule), K(with_param_down), K(tsc_pre_query_range.get_column_count()), K(tsc_pre_query_range.has_exec_param())); if (OB_ISNULL(plan_ctx = GET_PHY_PLAN_CTX(ctx))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("fail to get physical plan ctx", K(ret)); } else if (partition_granule) { // For partition granule, we will prepare query range in table scan. 
ObNewRange whole_range; // partition granule情况下,尽量提前抽取有效的query range,如果无法抽取有效的query range // 就使用whole range if (0 == tsc_pre_query_range.get_column_count()) { whole_range.set_whole_range(); has_extract_query_range = !tsc_pre_query_range.has_exec_param(); } else if (OB_FAIL(ObSQLUtils::make_whole_range(ctx.get_allocator(), table_id, tsc_pre_query_range.get_column_count(), whole_range))) { LOG_WARN("Failed to make whole range", K(ret)); } else if (!tsc_pre_query_range.has_exec_param()) { // 没有动态参数才能够进行query range的提前抽取 LOG_DEBUG("try to get scan range for partition granule"); if (OB_FAIL(ObSQLUtils::extract_pre_query_range( tsc_pre_query_range, ctx.get_allocator(), ctx, scan_ranges, ObBasicSessionInfo::create_dtc_params(ctx.get_my_session())))) { LOG_WARN("failed to get scan ranges", K(ret)); } else if (OB_FAIL(tsc_pre_query_range.get_ss_tablet_ranges( ctx.get_allocator(), ctx, skip_scan_ranges, ObBasicSessionInfo::create_dtc_params(ctx.get_my_session())))) { LOG_WARN("failed to final extract index skip query range", K(ret)); } else { has_extract_query_range = true; } } if (OB_SUCC(ret)) { // 没有抽取出来query range, 就使用whole range if (scan_ranges.empty()) { LOG_DEBUG("the scan ranges is invalid, use the whole range", K(scan_ranges)); if (OB_FAIL(ranges.push_back(whole_range))) { LOG_WARN("Failed to push back scan range", K(ret)); } } } } else { if (tsc_pre_query_range.has_exec_param() && with_param_down) { ret = OB_ERR_UNEXPECTED; LOG_WARN("block iterater cannot have exec param", K(ret), K(op_id), K(tsc_pre_query_range)); } else if (OB_FAIL(ObSQLUtils::extract_pre_query_range( tsc_pre_query_range, ctx.get_allocator(), ctx, scan_ranges, ObBasicSessionInfo::create_dtc_params(ctx.get_my_session())))) { LOG_WARN("failed to get scan ranges", K(ret)); } else if (OB_FAIL(tsc_pre_query_range.get_ss_tablet_ranges( ctx.get_allocator(), ctx, skip_scan_ranges, ObBasicSessionInfo::create_dtc_params(ctx.get_my_session())))) { LOG_WARN("failed to final extract index skip query range", 
K(ret)); } else { has_extract_query_range = true; /* Here is an improvement made: 1. By default, access to each partition is always in sequential ascending order. Compared to not sorting, this is an enhancement that is not always 100% necessary (some scenarios do not require order), but the logic is acceptable. 2. If reverse order is required outside, then the reverse sorting should be done outside on its own. */ std::sort(scan_ranges.begin(), scan_ranges.end(), ObNewRangeCmp()); } } LOG_DEBUG("gi get the scan range", K(ret), K(partition_granule), K(has_extract_query_range), K(scan_ranges), K(skip_scan_ranges)); if (OB_SUCC(ret)) { // index skip scan, ranges from extract_pre_query_range/get_ss_tablet_ranges, // prefix range and postfix range is single range ObNewRange *ss_range = NULL; ObNewRange whole_range; whole_range.set_whole_range(); if (!skip_scan_ranges.empty() && (OB_ISNULL(skip_scan_ranges.at(0)) || OB_UNLIKELY(1 != skip_scan_ranges.count() || 1 != scan_ranges.count()))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected index skip scan range", K(ret), K(scan_ranges), K(skip_scan_ranges)); } else if (OB_FAIL(ss_ranges.push_back(skip_scan_ranges.empty() ? 
whole_range : *skip_scan_ranges.at(0)))) { LOG_WARN("push back ranges failed", K(ret)); } else { ss_ranges.at(ss_ranges.count() - 1).table_id_ = table_id; } } for (int64_t i = 0; i < scan_ranges.count() && OB_SUCC(ret); ++i) { if (OB_ISNULL(scan_ranges.at(i))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the scan range is null", K(ret)); } else if (OB_FAIL(ranges.push_back(*scan_ranges.at(i)))) { LOG_WARN("push back ranges failed", K(ret)); } else { ranges.at(ranges.count() - 1).table_id_ = table_id; } } if (OB_FAIL(ret)) { // do nothing } else { ObOperatorKit *kit = ctx.get_operator_kit(op_id); if (OB_ISNULL(kit) || OB_ISNULL(kit->input_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("operator is NULL", K(ret), KP(kit), K(table_id), K(op_id)); } else { ObTableScanOpInput *tsc_input = static_cast(kit->input_); tsc_input->set_need_extract_query_range(!has_extract_query_range); } } return ret; } int ObRandomGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { int ret = OB_SUCCESS; if (scan_ops.count() != gi_task_array_result.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid scan ops and gi task array result", K(ret), K(scan_ops.count()), K(gi_task_array_result.count())); } else if (ObGITaskSet::GI_RANDOM_NONE != random_type && (ObGranuleUtil::asc_order(args.gi_attri_flag_) || (ObGranuleUtil::desc_order(args.gi_attri_flag_)))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("conflict order config", K(random_type), K(ObGranuleUtil::desc_order(args.gi_attri_flag_))); } const common::ObIArray &tablet_arrays = args.tablet_arrays_; ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { const ObTableScanSpec *tsc = scan_ops.at(idx); if (OB_ISNULL(tsc) || scan_ops.count() != tablet_arrays.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get a null tsc ptr", K(ret), K(scan_ops.count()), K(tablet_arrays.count())); } else { uint64_t scan_key_id = tsc->get_scan_key_id(); 
uint64_t op_id = tsc->get_id(); ObGITaskSet total_task_set; ObGITaskArray &taskset_array = gi_task_array_result.at(idx).taskset_array_; partition_granule = is_virtual_table(scan_key_id) || partition_granule; if (OB_FAIL(split_gi_task(args, tsc, scan_key_id, op_id, tablet_arrays.at(idx), partition_granule, total_task_set, random_type))) { LOG_WARN("failed to init granule iter pump", K(ret), K(idx), K(tablet_arrays)); } else if (OB_FAIL(total_task_set.set_block_order( ObGranuleUtil::desc_order(args.gi_attri_flag_)))) { LOG_WARN("fail set block order", K(ret)); } else if (OB_FAIL(taskset_array.push_back(total_task_set))) { LOG_WARN("failed to push back task set", K(ret)); } else { gi_task_array_result.at(idx).tsc_op_id_ = op_id; } LOG_TRACE("random granule split a task_array", K(op_id), K(scan_key_id), K(taskset_array), K(ret), K(scan_ops.count())); } } return ret; } // duplicate all scan ranges to each worker, so that every worker can // access all data int ObAccessAllGranuleSplitter::split_tasks_access_all(ObGITaskSet &taskset, int64_t parallelism, ObGITaskArray &taskset_array) { int ret = OB_SUCCESS; for (int64_t i = 0; i < parallelism && OB_SUCC(ret); ++i) { if(OB_FAIL(taskset_array.at(i).assign(taskset))) { LOG_WARN("failed to assign taskset", K(ret)); } } return ret; } int ObAccessAllGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { int ret = OB_SUCCESS; if (scan_ops.count() != gi_task_array_result.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid scan ops and gi task array result", K(ret), K(scan_ops.count()), K(gi_task_array_result.count())); } const common::ObIArray &tablet_arrays = args.tablet_arrays_; ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { const ObTableScanSpec *tsc = scan_ops.at(idx); ObGITaskSet total_task_set; uint64_t op_id = OB_INVALID_ID; uint64_t scan_key_id = OB_INVALID_ID; ObGITaskArray 
&taskset_array = gi_task_array_result.at(idx).taskset_array_; if (OB_ISNULL(tsc)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get a null tsc ptr", K(ret)); } else if (FALSE_IT(op_id = tsc->get_id())) { } else if (FALSE_IT(scan_key_id = tsc->get_scan_key_id())) { } else if (OB_FAIL(taskset_array.prepare_allocate(args.parallelism_))) { LOG_WARN("failed to prepare allocate", K(ret)); } else if (OB_FAIL(split_gi_task(args, tsc, scan_key_id, op_id, tablet_arrays.at(idx), partition_granule, total_task_set, random_type))) { LOG_WARN("failed to init granule iter pump", K(ret)); } else if (OB_FAIL(split_tasks_access_all(total_task_set, args.parallelism_, taskset_array))) { LOG_WARN("failed to split "); } else { gi_task_array_result.at(idx).tsc_op_id_ = op_id; } LOG_TRACE("access all granule split a task_array", K(op_id), K(tsc->get_loc_ref_table_id()), K(taskset_array), K(ret), K(scan_ops.count())); } return ret; } int ObAffinitizeGranuleSplitter::split_tasks_affinity(ObExecContext &ctx, ObGITaskSet &taskset, int64_t parallelism, ObGITaskArray &taskset_array) { int ret = OB_SUCCESS; ObSchemaGetterGuard schema_guard; const ObTableSchema *table_schema = NULL; ObSQLSessionInfo *my_session = NULL; ObPxTabletInfo partition_row_info; ObTabletIdxMap idx_map; bool qc_order_gi_tasks = false; if (OB_ISNULL(my_session = GET_MY_SESSION(ctx)) || OB_ISNULL(ctx.get_sqc_handler())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("fail to get my session", K(ret), K(my_session), K(ctx.get_sqc_handler())); } else { qc_order_gi_tasks = ctx.get_sqc_handler()->get_sqc_init_arg().qc_order_gi_tasks_; } int64_t cur_idx = -1; ObPxAffinityByRandom affinitize_rule(qc_order_gi_tasks); ARRAY_FOREACH_X(taskset.gi_task_set_, idx, cnt, OB_SUCC(ret)) { if (cur_idx != taskset.gi_task_set_.at(idx).idx_) { cur_idx = taskset.gi_task_set_.at(idx).idx_; // get all different parition key in Affinitize const ObDASTabletLoc &tablet_loc = *taskset.gi_task_set_.at(idx).tablet_loc_; int64_t tablet_idx = -1; if (NULL == table_schema || 
// ObAffinitizeGranuleSplitter::split_tasks_affinity (continued). For each new
// partition index in the task set: refresh the schema guard / table schema /
// tablet-idx map only when the referenced table id changes (cheap cache across
// consecutive tasks of the same table); resolve the tablet's partition index
// (virtual tables mock it as tablet_id + 1, matching what
// build_tablet_idx_map would produce -- see the original comment below); then
// feed (tablet_id, tablet_idx, row info) into the ObPxAffinityByRandom rule.
// Finally do_random() assigns a worker to every partition.
table_schema->get_table_id() != tablet_loc.loc_meta_->ref_table_id_) { uint64_t table_id = tablet_loc.loc_meta_->ref_table_id_; if (OB_FAIL(GCTX.schema_service_->get_tenant_schema_guard( my_session->get_effective_tenant_id(), schema_guard))) { LOG_WARN("Failed to get schema guard", K(ret)); } else if (OB_FAIL(schema_guard.get_table_schema( my_session->get_effective_tenant_id(), table_id, table_schema))) { LOG_WARN("Failed to get table schema", K(ret), K(table_id)); } else if (OB_ISNULL(table_schema)) { ret = OB_SCHEMA_ERROR; LOG_WARN("Table schema is null", K(ret), K(table_id)); } else if (OB_FAIL(ObPXServerAddrUtil::build_tablet_idx_map(table_schema, idx_map))) { LOG_WARN("fail to build tablet idx map", K(ret)); } } if (OB_SUCC(ret)) { // see issue // for virtual table, we can directly mock a tablet id // function build_tablet_idx_map will mock a idx map whose key // varies from 1 to table_schema->get_all_part_num(), and the value = key + 1 // so we can directly set tablet_idx = tablet_loc.tablet_id_.id() + 1, the result is same if (is_virtual_table(table_schema->get_table_id())) { tablet_idx = tablet_loc.tablet_id_.id() + 1; } else if (OB_FAIL(idx_map.get_refactored(tablet_loc.tablet_id_.id(), tablet_idx))) { LOG_WARN("fail to get tablet idx", K(ret)); } } if (OB_FAIL(ret)) { } else if (OB_FAIL(ObPxAffinityByRandom::get_tablet_info(tablet_loc.tablet_id_.id(), partitions_info_, partition_row_info))) { LOG_WARN("Failed to get tablet info", K(ret)); } else if (OB_FAIL(affinitize_rule.add_partition(tablet_loc.tablet_id_.id(), tablet_idx, parallelism, my_session->get_effective_tenant_id(), partition_row_info))) { LOG_WARN("Failed to get affinitize taskid" , K(ret)); } } } if (OB_FAIL(ret)) { } else if (OB_FAIL(affinitize_rule.do_random(!partitions_info_.empty(), my_session->get_effective_tenant_id()))) { LOG_WARN("failed to do random", K(ret)); } else { const ObIArray &partition_worker_pairs = affinitize_rule.get_result(); ARRAY_FOREACH(partition_worker_pairs, rt_idx)
// Scatter phase: for every (tablet_id, worker_id) pair produced by the
// affinity rule, append each matching task to the chosen worker's task set.
// NOTE(review): when task_id >= parallelism, ret is set without a break;
// presumably the ARRAY_FOREACH macros' OB_SUCC(ret) condition stops further
// iteration before taskset_array.at(task_id) is touched -- confirm against
// the ARRAY_FOREACH definition.
//
// After this function ends, ObNormalAffinitizeGranuleSplitter::split_granule
// begins (continues on the next line): per-scan-op split + affinity scatter.
{ int64_t task_id = partition_worker_pairs.at(rt_idx).worker_id_; int64_t tablet_id = partition_worker_pairs.at(rt_idx).tablet_id_; if (task_id >= parallelism) { ret = OB_ERR_UNEXPECTED; LOG_WARN("Task id is invalid", K(ret), K(task_id), K(parallelism)); } ARRAY_FOREACH(taskset.gi_task_set_, idx) { const ObDASTabletLoc &tablet_key = *taskset.gi_task_set_.at(idx).tablet_loc_; if (tablet_id == tablet_key.tablet_id_.id()) { ObGITaskSet &real_task_set = taskset_array.at(task_id); if (OB_FAIL(real_task_set.gi_task_set_.push_back(taskset.gi_task_set_.at(idx)))) { LOG_WARN("Failed to push back task info", K(ret)); } } } LOG_TRACE("affinitize granule split a task_array", K(tablet_id), K(task_id), K(parallelism), K(taskset_array), K(ret)); } } return ret; } int ObNormalAffinitizeGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { int ret = OB_SUCCESS; if (scan_ops.count() != gi_task_array_result.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid scan ops and gi task array result", K(ret), K(scan_ops.count()), K(gi_task_array_result.count())); } const common::ObIArray &tablet_arrays = args.tablet_arrays_; ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { const ObTableScanSpec *tsc = scan_ops.at(idx); ObGITaskSet total_task_set; uint64_t op_id = OB_INVALID_ID; uint64_t scan_key_id = OB_INVALID_ID; ObGITaskArray &taskset_array = gi_task_array_result.at(idx).taskset_array_; if (OB_ISNULL(tsc) || OB_ISNULL(args.ctx_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get a null tsc ptr", K(ret)); } else if (FALSE_IT(op_id = tsc->get_id())) { } else if (FALSE_IT(scan_key_id = tsc->get_scan_key_id())) { } else if (OB_FAIL(gi_task_array_result.at(idx).taskset_array_.prepare_allocate(args.parallelism_))) { LOG_WARN("failed to prepare allocate", K(ret)); } else if (OB_FAIL(split_gi_task(args, tsc, scan_key_id, op_id, tablet_arrays.at(idx),
// Tail of ObNormalAffinitizeGranuleSplitter::split_granule, then the two
// ObPartitionWiseGranuleSplitter::split_granule overloads. The first (TSC-only)
// overload is deliberately disabled and returns OB_NOT_SUPPORTED: per the
// original (Chinese) comment, full partition-wise splitting must also split
// INSERT tasks, which the generic ObGranuleSplitter::split_granule interface
// cannot express -- callers must use the modify_op-aware overload that
// follows. That overload's (Chinese) comment describes the expected plan
// shape: when a DML op is present, tablet_arrays.at(0) holds the
// INSERT/REPLACE table's partition keys and the TSC arrays start at index 1.
partition_granule, total_task_set, random_type))) { LOG_WARN("failed to init granule iter pump", K(ret)); } else if (OB_FAIL(split_tasks_affinity(*args.ctx_, total_task_set, args.parallelism_, taskset_array))) { LOG_WARN("failed to split task affinity", K(ret)); } else { gi_task_array_result.at(idx).tsc_op_id_ = op_id; } LOG_TRACE("normal affinitize granule split a task_array", K(op_id), K(tsc->get_loc_ref_table_id()), K(taskset_array), K(ret), K(scan_ops.count())); } return ret; } int ObPartitionWiseGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { // 由于FULL PARTITION WISE的split方法较为特殊(需要对INSERT进行任务切分),而目前的`ObGranuleSplitter`的接口`split_granule`仅仅考虑了TSC的处理, // 因此将`ObPartitionWiseGranuleSplitter`的该接口废弃掉 UNUSED(args); UNUSED(scan_ops); UNUSED(gi_task_array_result); UNUSED(random_type); UNUSED(partition_granule); int ret = OB_NOT_SUPPORTED; LOG_USER_ERROR(OB_NOT_SUPPORTED, "split granule method"); return ret; } // FULL PARTITION WISE 独有的split方法,可以处理INSERT/REPLACE的任务切分 int ObPartitionWiseGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, const ObTableModifySpec * modify_op, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { int ret = OB_SUCCESS; int expected_map_size = 0; // 如果GI需要切分INSERT/REPLACE任务,那么tablet_arrays中不仅包含了table_scans表对应的partition keys信息,还包含了 // insert/replace表对应的partition keys信息;例如这样的计划: // ....
// Validation: every entry of tablet_arrays must have the same tablet count so
// the DML target's partitions line up one-to-one with the TSC partitions
// (full partition-wise requirement). Then, when modify_op is present, the
// INSERT/REPLACE task set is built from tablet_arrays.at(0) and stored at
// gi_task_array_result index 0 under the modify op's id.
// NOTE(review): the `split_insert_gi_task(...)` call below is not wrapped in
// OB_FAIL(); the branch still behaves correctly because a non-zero ret is
// truthy, but it bypasses OB_FAIL's error bookkeeping -- flag for cleanup.
// NOTE(review): `DASTabletLocArray tablets = tablet_arrays.at(0);` makes an
// unused copy, and `int tablet_count` / `int expected_map_size` are plain
// ints against int64_t count() values -- harmless today, worth tidying.
// (The next line boundary splits a string literal -- an artifact of this
// chunk's mangling -- so no comment is inserted there.)
// GI // INSERT/REPLACE // JOIN // TSC1 // TSC2 // `tablet_arrays`的第一个元素对应的是INSERT/REPLACE表的partition keys,其他元素对应的是TSC的表的partition keys int tsc_begin_idx = 0; const common::ObIArray &tablet_arrays = args.tablet_arrays_; if (OB_NOT_NULL(modify_op)) { expected_map_size++; tsc_begin_idx = 1; // 目前最多只有一个INSERT/REPLACE算子 } expected_map_size += scan_ops.count(); if (expected_map_size != gi_task_array_result.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid scan ops and gi task array result", K(ret), K(expected_map_size), K(gi_task_array_result.count()), K(modify_op!=NULL), K(scan_ops.count())); } else if (tablet_arrays.count() != expected_map_size) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid args", K(ret), K(tablet_arrays.count()), K(expected_map_size)); } else if (0 >= tablet_arrays.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid args", K(ret), K(tablet_arrays.count())); } // 校验:校验REPLACE对应的分区与TSC对应的分区是否在逻辑上是相同的 if (OB_SUCC(ret)) { int tablet_count = tablet_arrays.at(0).count(); DASTabletLocArray tablets = tablet_arrays.at(0); ARRAY_FOREACH(tablet_arrays, idx) { // 校验每一个op对应的partition key的个数是相同的 DASTabletLocArray tablet_array = tablet_arrays.at(idx); if (tablet_count != tablet_array.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("the tablet count is not equal", K(ret), K(tablet_count), K(tablet_arrays.count())); } } } // 处理 insert/replace的任务划分 if (OB_SUCC(ret) && OB_NOT_NULL(modify_op)) { int index_idx = 0; ObGITaskSet total_task_set; ObGITaskArray &taskset_array = gi_task_array_result.at(index_idx).taskset_array_; const ObDMLBaseCtDef *dml_ctdef = nullptr; LOG_TRACE("handler split dml op task", K(modify_op->get_type())); if (OB_FAIL(modify_op->get_single_dml_ctdef(dml_ctdef))) { LOG_WARN("get single table loc id failed", K(ret)); } else if (split_insert_gi_task(args, dml_ctdef->das_base_ctdef_.index_tid_, dml_ctdef->das_base_ctdef_.rowkey_cnt_, // insert对应的row key count tablet_arrays.at(0), partition_granule, total_task_set, random_type)){ LOG_WARN("failed to
prepare pw insert gi task", K(ret)); } else if (OB_FAIL(taskset_array.push_back(total_task_set))) { LOG_WARN("failed to push back task set", K(ret)); } else { // 获得对应的insert/replace op id LOG_TRACE("split modify gi task successfully", K(modify_op->get_id())); gi_task_array_result.at(index_idx).tsc_op_id_ = modify_op->get_id(); } } // 处理 tsc的任务划分 if(OB_SUCC(ret)) { ObSEArray tsc_tablet_arrays; for (int i = tsc_begin_idx; i < tablet_arrays.count(); i++) { if (OB_FAIL(tsc_tablet_arrays.push_back(tablet_arrays.at(i)))) { LOG_WARN("failed to push back tsc tablet arrays", K(ret)); } } if (OB_FAIL(ret)) { // pass } else if (OB_FAIL(split_tsc_gi_task(args, scan_ops, tsc_tablet_arrays, tsc_begin_idx, gi_task_array_result, partition_granule, random_type))) { LOG_WARN("failed to prepare pw tsc gi task", K(ret)); } } return ret; } int ObPartitionWiseGranuleSplitter::split_insert_gi_task(ObGranulePumpArgs &args, const uint64_t insert_table_id, const int64_t row_key_count, const common::ObIArray &tablets, bool partition_granule, ObGITaskSet &task_set, ObGITaskSet::ObGIRandomType random_type) { // 目前INSERT对应的GI一定是full partition wise类型,任务的划分粒度一定是按照partition进行划分 int ret = OB_SUCCESS; // insert的每一个partition对应的区间默认是[min_rowkey,max_rowkey] ObNewRange each_partition_range; ObSEArray ranges; ObSEArray empty_ss_ranges; DASTabletLocSEArray taskset_tablets; ObSEArray taskset_ranges; ObSEArray taskset_idxs; bool range_independent = random_type == ObGITaskSet::GI_RANDOM_RANGE; if (0 >= args.parallelism_ || tablets.count() <= 0) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected args", K(ret), K(args.parallelism_), K(tablets.count())); } else if (OB_FAIL(ObSQLUtils::make_whole_range(args.ctx_->get_allocator(), insert_table_id, row_key_count, each_partition_range))) { LOG_WARN("failed to make whole range", K(ret)); } else if (OB_FAIL(ranges.push_back(each_partition_range))) { LOG_WARN("failed to push partition range to ranges", K(ret)); } else if
// Continuation of ObPartitionWiseGranuleSplitter::split_insert_gi_task: the
// single whole-range [min_rowkey, max_rowkey] is split per INSERT partition by
// ObGranuleUtil::split_block_ranges and packed into the task set via
// construct_taskset(). Per the original (Chinese) comment, INSERT rescans only
// need the per-task partition key, not the ranges/idxs. Then
// ObPartitionWiseGranuleSplitter::split_tsc_gi_task splits each TSC's granules
// and stores them at gi_task_array_result[idx + tsc_begin_idx] (offset past
// the optional DML entry at index 0).
(OB_FAIL(ObGranuleUtil::split_block_ranges(*args.ctx_, args.ctx_->get_allocator(), NULL, ranges, tablets, args.parallelism_, args.tablet_size_, partition_granule, // true taskset_tablets, taskset_ranges, taskset_idxs, range_independent))) { LOG_WARN("failed to get insert granule task", K(ret), K(each_partition_range), K(tablets)); } else if (OB_FAIL(task_set.construct_taskset(taskset_tablets, taskset_ranges, empty_ss_ranges, taskset_idxs, random_type))) { // INSERT的任务划分一定是 partition wise的,并且INSERT算子每次rescan仅仅需要每一个task对应的partition key, // `ranges`,`idx`等任务参数是不需要 LOG_WARN("construct taskset failed", K(ret), K(taskset_tablets), K(taskset_ranges), K(taskset_idxs), K(random_type)); } return ret; } int ObPartitionWiseGranuleSplitter::split_tsc_gi_task(ObGranulePumpArgs &args, ObIArray &scan_ops, const common::ObIArray &tablet_arrays, int64_t tsc_begin_idx, GITaskArrayMap &gi_task_array_result, bool partition_granule, ObGITaskSet::ObGIRandomType random_type) { int ret = OB_SUCCESS; ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { const ObTableScanSpec *tsc = scan_ops.at(idx); ObGITaskSet total_task_set; uint64_t op_id = OB_INVALID_ID; uint64_t scan_key_id = OB_INVALID_ID; int64_t task_array_idx = idx + tsc_begin_idx; ObGITaskArray &taskset_array = gi_task_array_result.at(task_array_idx).taskset_array_; if (OB_ISNULL(tsc)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get a null tsc ptr", K(ret)); } else if (FALSE_IT(op_id = tsc->get_id())) { } else if (FALSE_IT(scan_key_id = tsc->get_scan_key_id())) { } else if (OB_FAIL(split_gi_task(args, tsc, scan_key_id, op_id, tablet_arrays.at(idx), partition_granule, total_task_set, random_type))) { LOG_WARN("failed to init granule iter pump", K(ret)); } else if (OB_FAIL(taskset_array.push_back(total_task_set))) { LOG_WARN("failed to push back task set", K(ret)); } else { gi_task_array_result.at(task_array_idx).tsc_op_id_ = op_id; } LOG_TRACE("partition wise tsc granule split a task_array", K(op_id), K(tsc->get_loc_ref_table_id()),
// End of split_tsc_gi_task's trace log, then
// ObPWAffinitizeGranuleSplitter::split_granule: appends one fresh task-array
// item per scan op (task_idx remembers the pre-existing count), walks
// get_parent() from each TSC up to its PHY_GRANULE_ITERATOR spec to read the
// GI's asc/desc flag, splits granules, scatters them by affinity, then (on
// the next line) adjusts per-worker task order.
// NOTE(review): the local `gi` initialized from static_cast(tsc->get_parent())
// appears unused -- the gi_spec walk below supersedes it; candidate for
// removal. (Template arguments on the casts/containers here were stripped by
// the extraction that produced this chunk.)
K(taskset_array), K(ret), K(scan_ops.count())); } return ret; } int ObPWAffinitizeGranuleSplitter::split_granule(ObGranulePumpArgs &args, ObIArray &scan_ops, GITaskArrayMap &gi_task_array_result, ObGITaskSet::ObGIRandomType random_type, bool partition_granule /* = true */) { int ret = OB_SUCCESS; int64_t task_idx = gi_task_array_result.count(); ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { GITaskArrayItem empty_task_array_item; if (OB_FAIL(gi_task_array_result.push_back(empty_task_array_item))) { LOG_WARN("push back new task array failed", K(ret)); } } const common::ObIArray &tablet_arrays = args.tablet_arrays_; ARRAY_FOREACH_X(scan_ops, idx, cnt, OB_SUCC(ret)) { const ObTableScanSpec *tsc = scan_ops.at(idx); const ObGranuleIteratorSpec *gi = static_cast(tsc->get_parent()); const ObOpSpec *gi_spec = tsc->get_parent(); while (OB_NOT_NULL(gi_spec) && PHY_GRANULE_ITERATOR != gi_spec->get_type()) { gi_spec = gi_spec->get_parent(); } ObGITaskSet total_task_set; uint64_t op_id = OB_INVALID_ID; uint64_t scan_key_id = OB_INVALID_ID; bool asc_gi_task_order = true; ObGITaskArray &taskset_array = gi_task_array_result.at(idx + task_idx).taskset_array_; if (OB_ISNULL(tsc) || OB_ISNULL(gi_spec)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get a null tsc/gi ptr", K(ret), K(tsc), K(gi_spec)); } else if (FALSE_IT(op_id = tsc->get_id())) { } else if (FALSE_IT(asc_gi_task_order = !ObGranuleUtil::desc_order(static_cast(gi_spec)->gi_attri_flag_))) { } else if (FALSE_IT(scan_key_id = tsc->get_scan_key_id())) { } else if (OB_FAIL(taskset_array.prepare_allocate(args.parallelism_))) { LOG_WARN("failed to prepare allocate", K(ret)); } else if (OB_FAIL(split_gi_task(args, tsc, scan_key_id, op_id, tablet_arrays.at(idx), partition_granule, total_task_set, random_type))) { LOG_WARN("failed to init granule iter pump", K(ret)); } else if (OB_FAIL(split_tasks_affinity(*args.ctx_, total_task_set, args.parallelism_, taskset_array))) { LOG_WARN("failed to split task affinity", K(ret)); } else if
// Tail of ObPWAffinitizeGranuleSplitter::split_granule, then:
// - adjust_task_order: per the original comment (issue/22963231), workers in
//   the same partition-wise-affinity group must see partitions in the same
//   order or the join pairs up wrong; sets pw-affi partition order on every
//   worker's task set.
// - find_taskset_by_tsc_id: linear scan of gi_task_array_map_ by op id.
//   NOTE(review): the not-found case is detected via OB_ISNULL(taskset_array),
//   which relies on the caller passing taskset_array == NULL -- confirm all
//   call sites do.
// - init_pump_args_inner begins (continues on the next line): unless the GI is
//   both PARTITION_WISE and AFFINITIZE, an existing single pump arg is reused.
(OB_FAIL(adjust_task_order(asc_gi_task_order, taskset_array))) { LOG_WARN("failed to adjust task order", K(ret)); } else { gi_task_array_result.at(idx + task_idx).tsc_op_id_ = op_id; } LOG_TRACE("partition wise with affinitize granule split a task_array", K(op_id), K(taskset_array), K(ret), K(scan_ops.count())); } return ret; } int ObPWAffinitizeGranuleSplitter::adjust_task_order(bool asc, ObGITaskArray &taskset_array) { // In same pw affi task group, worker has there own task order, // we must adjust task order to get right join result, just see issue/22963231. int ret = OB_SUCCESS; for (int64_t i = 0; i < taskset_array.count() && OB_SUCC(ret); ++i) { if (OB_FAIL(taskset_array.at(i).set_pw_affi_partition_order(asc))) { LOG_WARN("failed to set partition order", K(ret)); } } return ret; } int ObGranulePump::find_taskset_by_tsc_id(uint64_t op_id, ObGITaskArray *&taskset_array) { int ret = OB_SUCCESS; for (int64_t i = 0; i < gi_task_array_map_.count() && OB_SUCC(ret); ++i) { if (op_id == gi_task_array_map_.at(i).tsc_op_id_) { taskset_array = &gi_task_array_map_.at(i).taskset_array_; break; } } if (OB_SUCC(ret) && OB_ISNULL(taskset_array)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("task don't exist", K(ret), K(op_id)); } return ret; } int ObGranulePump::init_pump_args_inner(ObExecContext *ctx, ObIArray &scan_ops, const common::ObIArray &tablet_arrays, common::ObIArray &partitions_info, common::ObIArray &external_table_files, const ObTableModifySpec* modify_op, int64_t parallelism, int64_t tablet_size, uint64_t gi_attri_flag) { int ret = OB_SUCCESS; if (OB_ISNULL(ctx)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("ctx or partition service is null", K(ret)); } else { ObGranulePumpArgs new_arg; if (!(ObGranuleUtil::gi_has_attri(gi_attri_flag, GI_PARTITION_WISE) && ObGranuleUtil::gi_has_attri(gi_attri_flag, GI_AFFINITIZE)) && pump_args_.count() > 0) { if (pump_args_.count() != 1) { ret = OB_ERR_UNEXPECTED; LOG_WARN("args is unexpected", K(ret)); } else { if
// init_pump_args_inner (continued): either re-init pump_args_.at(0) in place
// and regenerate its GI tasks, or init a new arg, append it, and generate
// tasks for it. init_pump_args is a thin forwarder to the _inner variant.
// init_arg begins here (continues on the next line): resets op_info_ /
// tablet_arrays_, then copies scan ops, tablet arrays, partition infos, and
// assigns external-table files into the arg (OZ = OB_FAIL + warn shorthand).
(OB_FAIL(init_arg(pump_args_.at(0), ctx, scan_ops, tablet_arrays, partitions_info, external_table_files, modify_op, parallelism, tablet_size, gi_attri_flag))) { LOG_WARN("fail to init arg", K(ret)); } else if (OB_FAIL(add_new_gi_task(pump_args_.at(0)))) { LOG_WARN("fail to add new gi task", K(ret)); } } } else if (OB_FAIL(init_arg(new_arg, ctx, scan_ops, tablet_arrays, partitions_info, external_table_files, modify_op, parallelism, tablet_size, gi_attri_flag))) { LOG_WARN("fail to init arg", K(ret)); } else if (OB_FAIL(pump_args_.push_back(new_arg))) { LOG_WARN("fail to push back new arg", K(ret)); } else if (OB_FAIL(add_new_gi_task(new_arg))) { LOG_WARN("fail to add new gi task", K(ret)); } } return ret; } int ObGranulePump::init_pump_args(ObExecContext *ctx, ObIArray &scan_ops, const common::ObIArray &tablet_arrays, common::ObIArray &partitions_info, common::ObIArray &external_table_files, const ObTableModifySpec* modify_op, int64_t parallelism, int64_t tablet_size, uint64_t gi_attri_flag) { return init_pump_args_inner(ctx, scan_ops, tablet_arrays, partitions_info, external_table_files, modify_op, parallelism, tablet_size, gi_attri_flag); } int ObGranulePump::init_arg( ObGranulePumpArgs &arg, ObExecContext *ctx, ObIArray &scan_ops, const common::ObIArray &tablet_arrays, common::ObIArray &partitions_info, const common::ObIArray &external_table_files, const ObTableModifySpec* modify_op, int64_t parallelism, int64_t tablet_size, uint64_t gi_attri_flag) { int ret = OB_SUCCESS; arg.op_info_.reset(); arg.tablet_arrays_.reset(); for (int i = 0; OB_SUCC(ret) && i < scan_ops.count(); ++i) { OZ(arg.op_info_.push_back_scan_ops(scan_ops.at(i))); } for (int i = 0; OB_SUCC(ret) && i < tablet_arrays.count(); ++i) { OZ(arg.tablet_arrays_.push_back(tablet_arrays.at(i))); } for (int i = 0; OB_SUCC(ret) && i < partitions_info.count(); ++i) { OZ(arg.partitions_info_.push_back(partitions_info.at(i))); } OZ(arg.external_table_files_.assign(external_table_files)); if (OB_SUCC(ret)) {
// Tail of ObGranulePump::init_arg: store the scalar args, then -- when
// partition filtering is on and not in partition-task mode -- require exactly
// one tablet array and size run_time_pruning_flags_ to its tablet count (all
// false initially), resetting cur_tablet_idx_ to 0.
// regenerate_gi_task: bump pump_version_, clear the task arrays, and re-split
// every stored pump arg.
// reset_gi_task: rewind every task set's cursor (cur_pos_ = 0) exactly once.
// NOTE(review): reset_gi_task uses a double-checked pattern on the plain bool
// is_taskset_reset_ (unlocked first read, re-check under lock_); the unlocked
// read is racy in theory -- confirm callers are serialized or consider an
// atomic flag.
arg.ctx_ = ctx; arg.op_info_.init_modify_op(modify_op); arg.parallelism_ = parallelism; arg.tablet_size_= tablet_size; arg.gi_attri_flag_ = gi_attri_flag; if (ObGranuleUtil::partition_filter(gi_attri_flag) && !ObGranuleUtil::is_partition_task_mode(gi_attri_flag)) { if (1 != tablet_arrays.count()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tablet array cnt is unexpected", K(tablet_arrays.count())); } else { const DASTabletLocArray &array = tablet_arrays.at(0); arg.cur_tablet_idx_ = 0; for (int i = 0; OB_SUCC(ret) && i < array.count(); ++i) { OZ(arg.run_time_pruning_flags_.push_back(false)); } } } } return ret; } int ObGranulePump::regenerate_gi_task() { int ret = common::OB_SUCCESS; pump_version_ += 1; reset_task_array(); no_more_task_from_shared_pool_ = false; for (int i = 0; i < pump_args_.count() && OB_SUCC(ret); ++i) { ObGranulePumpArgs &arg = pump_args_.at(i); if (OB_FAIL(add_new_gi_task(arg))) { LOG_WARN("failed to add new gi task", K(ret)); } } return ret; } int ObGranulePump::reset_gi_task() { int ret = common::OB_SUCCESS; if (is_taskset_reset_) { } else { ObLockGuard lock_guard(lock_); if (is_taskset_reset_) { /*do nothing*/ } else { is_taskset_reset_ = true; no_more_task_from_shared_pool_ = false; for (int64_t i = 0; i < gi_task_array_map_.count() && OB_SUCC(ret); ++i) { GITaskArrayItem &item = gi_task_array_map_.at(i); for(int64_t j = 0; j < item.taskset_array_.count() && OB_SUCC(ret); ++j) { ObGITaskSet &taskset = item.taskset_array_.at(j); taskset.cur_pos_ = 0; } } } } return ret; } }//sql }//oceanbase