Files
oceanbase/src/sql/engine/px/ob_dfo_mgr.cpp
2023-08-18 12:11:53 +08:00

971 lines
40 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX SQL_ENG
#include "sql/engine/px/ob_dfo_mgr.h"
#include "sql/engine/px/ob_px_util.h"
#include "sql/engine/basic/ob_temp_table_access_op.h"
#include "sql/engine/basic/ob_temp_table_insert_op.h"
#include "sql/engine/px/exchange/ob_transmit_op.h"
#include "sql/engine/basic/ob_material_op.h"
#include "lib/utility/ob_tracepoint.h"
#include "sql/engine/join/ob_join_filter_op.h"
#include "sql/engine/px/exchange/ob_px_repart_transmit_op.h"
#include "sql/optimizer/ob_px_resource_analyzer.h"
#include "sql/engine/px/ob_px_scheduler.h"
#include "share/detect/ob_detect_manager_utils.h"
#include "sql/engine/px/ob_px_coord_op.h"
using namespace oceanbase::common;
using namespace oceanbase::sql;
// Generate the DFO scheduling order: normalize the DFO tree first, then
// flatten it into dfo_mgr's edge array via a post-order traversal.
int ObDfoSchedOrderGenerator::generate_sched_order(ObDfoMgr &dfo_mgr)
{
  int ret = OB_SUCCESS;
  ObDfo *root = dfo_mgr.get_root_dfo();
  if (OB_ISNULL(root)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("NULL unexpected", K(ret));
  } else {
    if (OB_FAIL(DfoTreeNormalizer<ObDfo>::normalize(*root))) {
      LOG_WARN("fail normalize dfo tree", K(ret));
    } else if (OB_FAIL(do_generate_sched_order(dfo_mgr, *root))) {
      LOG_WARN("fail generate dfo edges", K(ret));
    }
  }
  return ret;
}
// The post-order traversal of the normalized dfo_tree IS the schedule
// order; it is materialized as the edge array held by dfo_mgr.
int ObDfoSchedOrderGenerator::do_generate_sched_order(ObDfoMgr &dfo_mgr, ObDfo &root)
{
  int ret = OB_SUCCESS;
  int64_t i = 0;
  const int64_t child_cnt = root.get_child_count();
  while (OB_SUCC(ret) && i < child_cnt) {
    ObDfo *child = NULL;
    if (OB_FAIL(root.get_child_dfo(i, child))) {
      LOG_WARN("fail get child dfo", K(i), K(root), K(ret));
    } else if (OB_ISNULL(child)) {
      ret = OB_ERR_UNEXPECTED;
    } else if (OB_FAIL(do_generate_sched_order(dfo_mgr, *child))) {
      LOG_WARN("fail do generate edge", K(*child), K(ret));
    } else if (OB_FAIL(dfo_mgr.add_dfo_edge(child))) {
      LOG_WARN("fail add edge to array", K(ret));
    }
    ++i;
  }
  return ret;
}
// Entry point for the scheduling-depth pass. Only runs when the pipeline
// depth config allows more than 2 concurrently scheduled DFO layers.
int ObDfoSchedDepthGenerator::generate_sched_depth(ObExecContext &exec_ctx,
                                                   ObDfoMgr &dfo_mgr)
{
  int ret = OB_SUCCESS;
  const bool deep_pipeline_enabled = GCONF._px_max_pipeline_depth > 2;
  if (deep_pipeline_enabled) {
    ObDfo *root = dfo_mgr.get_root_dfo();
    if (OB_ISNULL(root)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("NULL unexpected", K(ret));
    } else if (OB_FAIL(do_generate_sched_depth(exec_ctx, dfo_mgr, *root))) {
      LOG_WARN("fail generate dfo edges", K(ret));
    }
  }
  return ret;
}
// Post-order traversal of the dfo tree, deciding which DFOs may have their
// material op turned into bypass mode (earlier-scheduling optimization).
// Fix: the warning printed when try_set_dfo_unblock() fails used to read
// "fail set dfo block" (copy-paste from the branch below), which was
// misleading when debugging; the duplicated comment was also misplaced.
int ObDfoSchedDepthGenerator::do_generate_sched_depth(ObExecContext &exec_ctx,
                                                      ObDfoMgr &dfo_mgr,
                                                      ObDfo &parent)
{
  int ret = OB_SUCCESS;
  for (int64_t i = 0; OB_SUCC(ret) && i < parent.get_child_count(); ++i) {
    ObDfo *child = NULL;
    if (OB_FAIL(parent.get_child_dfo(i, child))) {
      LOG_WARN("fail get child dfo", K(i), K(parent), K(ret));
    } else if (OB_ISNULL(child)) {
      ret = OB_ERR_UNEXPECTED;
    } else if (OB_FAIL(do_generate_sched_depth(exec_ctx, dfo_mgr, *child))) {
      LOG_WARN("fail do generate edge", K(*child), K(ret));
    } else {
      bool need_earlier_sched = check_if_need_do_earlier_sched(*child);
      if (need_earlier_sched) {
        // child's material op is turned into bypass mode, so parent must be
        // scheduled earlier to consume child's output.
        if (OB_FAIL(try_set_dfo_unblock(exec_ctx, *child))) {
          LOG_WARN("fail set dfo unblock", K(ret), K(*child), K(parent));
        } else if (OB_FAIL(try_set_dfo_block(exec_ctx, parent))) {
          // Since parent is scheduled earlier, parent itself MUST block:
          // if it emitted rows before its own consumer is scheduled, the
          // pipeline would hang.
          LOG_WARN("fail set dfo block", K(ret), K(*child), K(parent));
        } else {
          parent.set_earlier_sched(true);
          LOG_DEBUG("parent dfo can do earlier scheduling", K(*child), K(parent));
        }
      }
    }
  }
  return ret;
}
// Return true when `child`'s parent should be scheduled earlier: i.e. the
// DFO's root is a PX transmit whose direct child is a material op (which
// can then be switched to bypass mode).
bool ObDfoSchedDepthGenerator::check_if_need_do_earlier_sched(ObDfo &child)
{
  bool do_earlier_sched = false;
  // A DFO already marked earlier-sched has a blocking material op, so it
  // produces no rows for its parent until done; the parent gains nothing
  // from being started ahead of the regular 2-DFO scheduling.
  if (!child.is_earlier_sched()) {
    const ObOpSpec *root_spec = child.get_root_op_spec();
    if (NULL != root_spec && IS_PX_TRANSMIT(root_spec->type_)) {
      const ObOpSpec *below = static_cast<const ObTransmitSpec *>(root_spec)->get_child();
      do_earlier_sched = (NULL != below && PHY_MATERIAL == below->type_);
    }
  }
  return do_earlier_sched;
}
// Convenience wrapper: clear the block flag (i.e. enable bypass) on dfo's
// material op, if it has one.
int ObDfoSchedDepthGenerator::try_set_dfo_unblock(ObExecContext &exec_ctx, ObDfo &dfo)
{
  const bool block = false;
  return try_set_dfo_block(exec_ctx, dfo, block);
}
// If the root of `dfo` is a transmit whose direct child is a material op,
// toggle that material op's mode: block == true keeps the material
// buffering (no bypass); block == false lets rows stream through it.
// DFOs without a material op directly under the transmit are left as-is.
int ObDfoSchedDepthGenerator::try_set_dfo_block(ObExecContext &exec_ctx, ObDfo &dfo, bool block)
{
  int ret = OB_SUCCESS;
  const ObOpSpec *root_spec = dfo.get_root_op_spec();
  if (OB_ISNULL(root_spec)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("phy_op is null", K(ret));
  } else {
    const ObOpSpec *below = static_cast<const ObTransmitSpec *>(root_spec)->get_child();
    if (OB_ISNULL(below)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("phy_op is null", K(ret));
    } else if (PHY_MATERIAL == below->type_) {
      const ObMaterialSpec *mat_spec = static_cast<const ObMaterialSpec *>(below);
      ObOperatorKit *kit = exec_ctx.get_operator_kit(mat_spec->id_);
      if (OB_ISNULL(kit) || OB_ISNULL(kit->input_)) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("operator is NULL", K(ret), KP(kit));
      } else {
        // bypass is simply the inverse of block
        ObMaterialOpInput *mat_input = static_cast<ObMaterialOpInput *>(kit->input_);
        mat_input->set_bypass(!block);
      }
    }
  }
  return ret;
}
// Compute how many PX workers this PX coordinator is granted (px_admited)
// by scaling the optimizer-expected count against the query-level
// admission result.
// @param dfos          all DFO edges of this PX (used to derive px_minimal)
// @param exec_ctx      execution context; provides the task executor ctx
// @param root_op_spec  root spec of this PX; cast to ObPxCoordSpec below,
//                      so callers must pass a PX coordinator spec
// @param px_expected   [out] workers the optimizer expects for this PX
// @param px_minimal    [out] minimal workers needed to schedule this PX
// @param px_admited    [out] workers actually granted to this PX
int ObDfoWorkerAssignment::calc_admited_worker_count(const ObIArray<ObDfo*> &dfos,
                                                     ObExecContext &exec_ctx,
                                                     const ObOpSpec &root_op_spec,
                                                     int64_t &px_expected,
                                                     int64_t &px_minimal,
                                                     int64_t &px_admited)
{
  int ret = OB_SUCCESS;
  px_admited = 0;
  const ObTaskExecutorCtx *task_exec_ctx = NULL;
  if (OB_ISNULL(task_exec_ctx = GET_TASK_EXECUTOR_CTX(exec_ctx))) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("task exec ctx NULL", K(ret));
  } else if (OB_FAIL(ObDfoWorkerAssignment::get_dfos_worker_count(dfos, true, px_minimal))) {
    LOG_WARN("failed to get dfos worker count", K(ret));
  } else {
    // PX level: thread count the optimizer computed for this PX
    px_expected = static_cast<const ObPxCoordSpec*>(&root_op_spec)->get_expected_worker_count();
    // query level: thread count the optimizer computed for the whole query
    const int64_t query_expected = task_exec_ctx->get_expected_worker_cnt();
    // query level: minimal thread count required for scheduling
    const int64_t query_minimal = task_exec_ctx->get_minimal_worker_cnt();
    // query level: thread count actually granted by admission
    const int64_t query_admited = task_exec_ctx->get_admited_worker_cnt();
    if (query_expected > 0 && 0 >= query_admited) {
      ret = OB_ERR_INSUFFICIENT_PX_WORKER;
      LOG_WARN("not enough thread resource", K(ret), K(px_expected), K(query_admited), K(query_expected));
    } else if (0 == query_expected) {
      // note: single-table dop=1 queries take the fast-DFO path, in which
      // case query_expected is 0 and no PX worker is needed
      px_admited = 0;
    } else if (query_admited >= query_expected) {
      // full grant: this PX gets everything the optimizer asked for
      px_admited = px_expected;
    } else if (OB_UNLIKELY(query_minimal <= 0)) {
      // compatible with version before 4.2
      px_admited = static_cast<int64_t>((double) query_admited * (double)px_expected / (double) query_expected);
    } else if (OB_UNLIKELY(query_admited < query_minimal || query_expected <= query_minimal)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("unexpected query admited worker count", K(ret), K(query_minimal), K(query_admited), K(query_expected));
    } else {
      // partial grant: keep the minimal share, then give this PX a slice of
      // the extra grant proportional to its extra expectation
      const int64_t extra_worker = query_admited - query_minimal;
      const int64_t extra_expected = px_expected - px_minimal;
      px_admited = px_minimal + extra_expected * extra_worker / (query_expected - query_minimal);
    }
    LOG_TRACE("calc px worker count", K(query_expected), K(query_minimal), K(query_admited),
              K(px_expected), K(px_minimal), K(px_admited));
  }
  return ret;
}
/* Assign worker threads to every DFO.
 *
 * Algorithm:
 * 1. Each DFO gets workers based on the dop chosen by the optimizer.
 * 2. Ideally assigned workers == dop.
 * 3. If the admitted count is insufficient, every DFO degrades its worker
 *    count proportionally (scale_rate).
 * 4. expected/minimal_worker_count are derived from the concurrently
 *    scheduled DFO group that consumes the most threads, so scaling by that
 *    baseline guarantees every group fits within the grant.
 *
 * Known limitation (TODO): degradation is uniform. In e.g. a right-deep
 * tree, some DFOs could keep a higher dop without exceeding the admitted
 * total, so the current scheme may under-utilize CPU; left for later.
 *
 * @param dfo_mgr                owns the DFOs to assign
 * @param expected_worker_count  optimizer-expected peak concurrent workers
 * @param minimal_worker_count   minimal peak concurrent workers (1 per DFO)
 * @param admited_worker_count   workers granted by admission; it is 0 in
 *                               the rpc-as-worker scenario
 */
int ObDfoWorkerAssignment::assign_worker(ObDfoMgr &dfo_mgr,
                                         int64_t expected_worker_count,
                                         int64_t minimal_worker_count,
                                         int64_t admited_worker_count)
{
  int ret = OB_SUCCESS;
  const ObIArray<ObDfo *> & dfos = dfo_mgr.get_all_dfos();
  // Actual assignment per DFO never exceeds its dop, but may be lower.
  double scale_rate = 1.0;
  bool match_expected = false;
  bool compatible_before_420 = false;
  if (OB_UNLIKELY(admited_worker_count < 0 || expected_worker_count <= 0 || minimal_worker_count <= 0)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("should have at least one worker", K(ret), K(admited_worker_count),
             K(expected_worker_count), K(minimal_worker_count));
  } else if (admited_worker_count >= expected_worker_count) {
    // full grant: every DFO runs at its optimizer-chosen dop
    match_expected = true;
  } else if (minimal_worker_count <= 0) {
    // compatible with version before 4.2 (defensive: unreachable given the
    // parameter check above, kept for safety)
    compatible_before_420 = true;
    scale_rate = static_cast<double>(admited_worker_count) / static_cast<double>(expected_worker_count);
  } else if (0 == admited_worker_count || minimal_worker_count == admited_worker_count) {
    // BUGFIX: condition was `0 <= admited_worker_count`, which is always
    // true here (negatives are rejected above) and made the proportional
    // branch below unreachable, collapsing every degraded query to its
    // minimal dop. admited == 0 is the rpc-as-worker scenario and
    // admited == minimal means no extra workers beyond the floor — in both
    // cases scale to 0 so each DFO gets exactly its minimal share.
    scale_rate = 0.0;
  } else if (OB_UNLIKELY(minimal_worker_count > admited_worker_count
                         || minimal_worker_count >= expected_worker_count)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("unexpected params", K(ret), K(minimal_worker_count), K(admited_worker_count), K(expected_worker_count));
  } else {
    // partial grant: distribute the extra workers proportionally
    scale_rate = static_cast<double>(admited_worker_count - minimal_worker_count)
        / static_cast<double>(expected_worker_count - minimal_worker_count);
  }
  ARRAY_FOREACH_X(dfos, idx, cnt, OB_SUCC(ret)) {
    ObDfo *child = dfos.at(idx);
    int64_t val = 0;
    if (match_expected) {
      val = child->get_dop();
    } else if (compatible_before_420) {
      val = std::max(1L, static_cast<int64_t>(static_cast<double>(child->get_dop()) * scale_rate));
    } else {
      // each DFO keeps a floor of 1 worker; only the remainder is scaled
      val = 1L + static_cast<int64_t>(std::max(static_cast<double>(child->get_dop() - 1), 0.0) * scale_rate);
    }
    child->set_assigned_worker_count(val);
    if (child->is_single() && val > 1) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("local dfo do should not have more than 1", K(*child), K(val), K(ret));
    }
    LOG_TRACE("assign worker count to dfo",
              "dfo_id", child->get_dfo_id(), K(admited_worker_count),
              K(expected_worker_count), "dop", child->get_dop(), K(scale_rate), K(val));
  }
  // Because of the 1-worker floor the total may exceed the admitted count;
  // that would mean some DFO effectively gets no thread, so report an error.
  int64_t total_assigned = 0;
  if (OB_FAIL(ret)) {
  } else if (OB_FAIL(get_dfos_worker_count(dfos, false, total_assigned))) {
    LOG_WARN("failed to get dfos worker count", K(ret));
  } else if (total_assigned > admited_worker_count && admited_worker_count != 0) {
    ret = OB_ERR_PARALLEL_SERVERS_TARGET_NOT_ENOUGH;
    LOG_WARN("total assigned worker to dfos is more than admited_worker_count",
             K(total_assigned),
             K(admited_worker_count),
             K(minimal_worker_count),
             K(expected_worker_count),
             K(ret));
  }
  return ret;
}
/* Compute the peak worker count over all concurrently scheduled DFO groups.
 * For each edge the group is parent + child + the largest depend sibling.
 * @param dfos            all DFO edges, in schedule order
 * @param get_minimal     true: count 1 worker per DFO (minimal mode);
 *                        false: use the already-assigned worker counts
 * @param total_assigned  [out] maximum worker count over all groups
 */
int ObDfoWorkerAssignment::get_dfos_worker_count(const ObIArray<ObDfo*> &dfos,
                                                 const bool get_minimal,
                                                 int64_t &total_assigned)
{
  int ret = OB_SUCCESS;
  total_assigned = 0;
  ARRAY_FOREACH_X(dfos, idx, cnt, OB_SUCC(ret)) {
    const ObDfo *child = dfos.at(idx);
    const ObDfo *parent = NULL;
    // BUGFIX: null-check child BEFORE dereferencing child->parent(); the
    // original fetched parent() first and only then checked child for NULL.
    if (OB_ISNULL(child) || OB_ISNULL(parent = child->parent())) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("dfo edges expect to have parent", KPC(parent), KPC(child), K(ret));
    } else {
      // threads consumed when this DFO and its parent run together; track
      // the group with the largest total
      int64_t child_assigned = get_minimal ? 1 : child->get_assigned_worker_count();
      int64_t parent_assigned = get_minimal ? 1 : parent->get_assigned_worker_count();
      int64_t assigned = parent_assigned + child_assigned;
      // In a locally right-deep tree, the depend siblings are scheduled
      // together with the current child DFO.
      /* Why need extra flag has_depend_sibling_? Why not use NULL != depend_sibling_?
       *            dfo5 (dop=2)
       *          /   |   \
       *   dfo1(2) dfo2 (4) dfo4 (1)
       *     |
       *   dfo3 (2)
       * Schedule order: (4,3) => (4,5) => (4,5,1) => (4,5,2) => (4,5) => (5). max_dop = (4,5,2) = 1 + 2 + 4 = 7.
       * Depend sibling: dfo4 -> dfo1 -> dfo2.
       * Depend sibling is stored as a list.
       * We thought dfo2 is depend sibling of dfo1, and calculated incorrect max_dop = (1,2,5) = 2 + 4 + 2 = 8.
       * Actually, dfo1 and dfo2 are depend sibling of dfo4, but dfo2 is not depend sibling of dfo1.
       * So we use has_depend_sibling_ record whether the dfo is the header of list.
       */
      int64_t max_depend_sibling_assigned_worker = 0;
      if (child->has_depend_sibling()) {
        while (NULL != child->depend_sibling()) {
          child = child->depend_sibling();
          child_assigned = get_minimal ? 1 : child->get_assigned_worker_count();
          if (max_depend_sibling_assigned_worker < child_assigned) {
            max_depend_sibling_assigned_worker = child_assigned;
          }
        }
      }
      assigned += max_depend_sibling_assigned_worker;
      if (assigned > total_assigned) {
        total_assigned = assigned;
        LOG_TRACE("update total assigned", K(idx), K(get_minimal), K(parent_assigned),
                  K(child_assigned), K(max_depend_sibling_assigned_worker), K(total_assigned));
      }
    }
  }
  return ret;
}
void ObDfoMgr::destroy()
{
// release all dfos
for (int64_t i = 0; i < edges_.count(); ++i) {
ObDfo *dfo = edges_.at(i);
ObDfo::reset_resource(dfo);
}
edges_.reset();
// release root dfo
ObDfo::reset_resource(root_dfo_);
root_dfo_ = nullptr;
inited_ = false;
}
// Initialize the DFO manager: split the plan into DFOs, describe runtime
// filter dependencies, decide schedule order/depth, then assign workers.
// @param exec_ctx       execution context
// @param root_op_spec   root of the plan fragment (the PX coordinator spec)
// @param dfo_int_gen    interrupt-id generator shared by this PX's DFOs
// @param px_coord_info  coordinator-level info filled in during do_split
// @return OB_INIT_TWICE when already initialized, otherwise the first error
int ObDfoMgr::init(ObExecContext &exec_ctx,
                   const ObOpSpec &root_op_spec,
                   const ObDfoInterruptIdGen &dfo_int_gen,
                   ObPxCoordInfo &px_coord_info)
{
  int ret = OB_SUCCESS;
  root_dfo_ = NULL;
  ObDfo *rpc_dfo = nullptr;  // note: currently unused in this function
  int64_t px_expected = 0;
  int64_t px_minimal = 0;
  int64_t px_admited = 0;
  if (inited_) {
    ret = OB_INIT_TWICE;
    LOG_WARN("dfo mgr init twice", K(ret));
  } else if (OB_FAIL(do_split(exec_ctx, allocator_, &root_op_spec, root_dfo_, dfo_int_gen, px_coord_info))) {
    LOG_WARN("fail split ops into dfo", K(ret));
  } else if (OB_ISNULL(root_dfo_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("NULL dfo unexpected", K(ret));
  } else if (!px_coord_info.rf_dpd_info_.is_empty()
             && OB_FAIL(px_coord_info.rf_dpd_info_.describe_dependency(root_dfo_))) {
    LOG_WARN("failed to describe rf dependency");
  } else if (OB_FAIL(ObDfoSchedOrderGenerator::generate_sched_order(*this))) {
    LOG_WARN("fail init dfo mgr", K(ret));
  } else if (OB_FAIL(ObDfoSchedDepthGenerator::generate_sched_depth(exec_ctx, *this))) {
    LOG_WARN("fail init dfo mgr", K(ret));
  } else if (OB_FAIL(ObDfoWorkerAssignment::calc_admited_worker_count(get_all_dfos(),
                                                                      exec_ctx,
                                                                      root_op_spec,
                                                                      px_expected,
                                                                      px_minimal,
                                                                      px_admited))) {
    LOG_WARN("fail to calc admited worler count", K(ret));
  } else if (OB_FAIL(ObDfoWorkerAssignment::assign_worker(*this, px_expected, px_minimal, px_admited))) {
    LOG_WARN("fail assign worker to dfos", K(ret), K(px_expected), K(px_minimal), K(px_admited));
  } else {
    inited_ = true;
  }
  return ret;
}
// Split the physical operator tree into DFOs.
// parent_dfo is an in/out parameter: it is an output only when the current
// op is the (top) PX coordinator; otherwise it is the DFO the current
// operator belongs to.
// Fix: in the transmit branch, parent_dfo used to be dereferenced
// (set_slave_mapping_type) BEFORE the OB_ISNULL(parent_dfo) check; the null
// check is now performed first.
int ObDfoMgr::do_split(ObExecContext &exec_ctx,
                       ObIAllocator &allocator,
                       const ObOpSpec *phy_op,
                       ObDfo *&parent_dfo,
                       const ObDfoInterruptIdGen &dfo_int_gen,
                       ObPxCoordInfo &px_coord_info) const
{
  int ret = OB_SUCCESS;
  bool top_px = (nullptr == parent_dfo);
  bool got_fulltree_dfo = false;
  ObDfo *dfo = NULL;
  bool is_stack_overflow = false;
  if (OB_FAIL(check_stack_overflow(is_stack_overflow))) {
    LOG_WARN("failed to check stack overflow", K(ret));
  } else if (is_stack_overflow) {
    ret = OB_SIZE_OVERFLOW;
    LOG_WARN("stack overflow, maybe too deep recursive", K(ret));
  } else if (OB_ISNULL(phy_op)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("NULL unexpected", K(ret));
  } else if (NULL == parent_dfo && !IS_PX_COORD(phy_op->type_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("the first phy_op must be a coord op", K(ret));
  } else if (phy_op->is_table_scan() && NULL != parent_dfo) {
    parent_dfo->set_scan(true);
    parent_dfo->inc_tsc_op_cnt();
    auto tsc_op = static_cast<const ObTableScanSpec *>(phy_op);
    // classify table access: a user table wins over virtual tables
    if (TableAccessType::HAS_USER_TABLE == px_coord_info.table_access_type_) {
      // nop
    } else if (!is_virtual_table(tsc_op->get_ref_table_id())) {
      px_coord_info.table_access_type_ = TableAccessType::HAS_USER_TABLE;
    } else {
      px_coord_info.table_access_type_ = TableAccessType::PURE_VIRTUAL_TABLE;
    }
    if (parent_dfo->need_p2p_info_ && parent_dfo->get_p2p_dh_addrs().empty()) {
      // collect the server addresses this scan covers for p2p datahub routing
      ObDASTableLoc *table_loc = nullptr;
      if (OB_ISNULL(table_loc = DAS_CTX(exec_ctx).get_table_loc_by_id(
          tsc_op->get_table_loc_id(), tsc_op->get_loc_ref_table_id()))) {
        OZ(ObTableLocation::get_full_leader_table_loc(DAS_CTX(exec_ctx).get_location_router(),
            exec_ctx.get_allocator(),
            exec_ctx.get_my_session()->get_effective_tenant_id(),
            tsc_op->get_table_loc_id(),
            tsc_op->get_loc_ref_table_id(),
            table_loc));
      }
      if (OB_FAIL(ret)) {
      } else {
        const DASTabletLocList &locations = table_loc->get_tablet_locs();
        parent_dfo->set_p2p_dh_loc(table_loc);
        if (OB_FAIL(get_location_addrs<DASTabletLocList>(locations,
                                                         parent_dfo->get_p2p_dh_addrs()))) {
          LOG_WARN("fail get location addrs", K(ret));
        }
      }
    }
  } else if (phy_op->is_dml_operator() && NULL != parent_dfo) {
    // the current op is a DML operator: mark the owning DFO accordingly
    parent_dfo->set_dml_op(true);
  } else if (phy_op->get_type() == PHY_TEMP_TABLE_ACCESS && NULL != parent_dfo) {
    parent_dfo->set_temp_table_scan(true);
    const ObTempTableAccessOpSpec *access = static_cast<const ObTempTableAccessOpSpec*>(phy_op);
    parent_dfo->set_temp_table_id(access->get_table_id());
    if (parent_dfo->need_p2p_info_ && parent_dfo->get_p2p_dh_addrs().empty()) {
      OZ(px_coord_info.p2p_temp_table_info_.temp_access_ops_.push_back(phy_op));
      OZ(px_coord_info.p2p_temp_table_info_.dfos_.push_back(parent_dfo));
    }
  } else if (IS_PX_GI(phy_op->get_type()) && NULL != parent_dfo) {
    const ObGranuleIteratorSpec *gi_spec =
        static_cast<const ObGranuleIteratorSpec *>(phy_op);
    if (gi_spec->bf_info_.is_inited_) {
      ObP2PDfoMapNode node;
      node.target_dfo_id_ = parent_dfo->get_dfo_id();
      if (OB_FAIL(px_coord_info.p2p_dfo_map_.set_refactored(
          gi_spec->bf_info_.p2p_dh_id_,
          node))) {
        LOG_WARN("fail to set p2p dh id to map", K(ret));
      } else {
        parent_dfo->set_need_p2p_info(true);
      }
    }
  } else if (IS_PX_JOIN_FILTER(phy_op->get_type()) && NULL != parent_dfo) {
    const ObJoinFilterSpec *filter_spec = static_cast<const ObJoinFilterSpec *>(phy_op);
    if (filter_spec->is_create_mode() && OB_FAIL(px_coord_info.rf_dpd_info_.rf_create_ops_.push_back(phy_op))) {
      LOG_WARN("failed to push back create op");
    } else if (filter_spec->is_use_mode() && OB_FAIL(px_coord_info.rf_dpd_info_.rf_use_ops_.push_back(phy_op))) {
      LOG_WARN("failed to push back use op");
    }
    if (OB_SUCC(ret) && filter_spec->is_shared_join_filter() && filter_spec->is_shuffle_) {
      ObP2PDfoMapNode node;
      node.target_dfo_id_ = parent_dfo->get_dfo_id();
      for (int i = 0; i < filter_spec->rf_infos_.count() && OB_SUCC(ret); ++i) {
        if (filter_spec->is_create_mode()) {
          if (OB_FAIL(parent_dfo->add_p2p_dh_ids(
              filter_spec->rf_infos_.at(i).p2p_datahub_id_))) {
            LOG_WARN("fail to add p2p dh ids", K(ret));
          }
        } else if (OB_FAIL(px_coord_info.p2p_dfo_map_.set_refactored(
            filter_spec->rf_infos_.at(i).p2p_datahub_id_,
            node))) {
          LOG_WARN("fail to set p2p dh id to map", K(ret));
        } else {
          parent_dfo->set_need_p2p_info(true);
        }
      }
    }
  } else if (IS_PX_COORD(phy_op->type_)) {
    if (top_px) {
      if (OB_FAIL(create_dfo(allocator, phy_op, dfo))) {
        LOG_WARN("fail create dfo", K(ret));
      }
    } else {
      // Do NOT create a DFO for a nested px coord here. A nested px coord
      // acts as a regular operator inside the leaf DFO; the leaf DFO drives
      // its next_row interface, which starts the nested scheduling.
      got_fulltree_dfo = true;
    }
  } else if (IS_PX_TRANSMIT(phy_op->type_)) {
    if (OB_FAIL(create_dfo(allocator, phy_op, dfo))) {
      LOG_WARN("fail create dfo", K(ret));
    } else {
      dfo->set_parent(parent_dfo);
      if (NULL != parent_dfo) {
        if (OB_FAIL(parent_dfo->append_child_dfo(dfo))) {
          LOG_WARN("fail append child dfo", K(ret));
        }
      }
    }
  }
  if (OB_SUCC(ret) && nullptr != dfo) {
    if (IS_PX_COORD(phy_op->type_)) {
      // root DFO: runs locally on the QC with dop 1
      dfo->set_coord_info_ptr(&px_coord_info);
      dfo->set_root_dfo(true);
      dfo->set_single(true);
      dfo->set_dop(1);
      dfo->set_execution_id(exec_ctx.get_my_session()->get_current_execution_id());
      dfo->set_px_sequence_id(dfo_int_gen.get_px_sequence_id());
      if (OB_NOT_NULL(phy_op->get_phy_plan()) && phy_op->get_phy_plan()->is_enable_px_fast_reclaim()) {
        ObDetectableId sqc_detectable_id;
        // if generate_detectable_id failed, means that server id is not ready
        if (OB_FAIL(ObDetectManagerUtils::generate_detectable_id(sqc_detectable_id, GET_TENANT_ID()))) {
          LOG_WARN("[DM] failed to generate_detectable_id for sqc");
        } else {
          ObPxDetectableIds px_detectable_ids(px_coord_info.qc_detectable_id_, sqc_detectable_id);
          dfo->set_px_detectable_ids(px_detectable_ids);
        }
      }
      if (OB_SUCC(ret)) {
        // With nested PX the DFO may already carry some info; never
        // overwrite what is already set.
        if (OB_INVALID_ID == dfo->get_dfo_id()) {
          // Only the top-level coord's receive lacks a dfo id; nested DFOs
          // always get one, derived from their transmit op.
          dfo->set_dfo_id(ObDfo::MAX_DFO_ID);
        }
        if (OB_INVALID_ID == dfo->get_qc_id()) {
          // the px id of a receive is recorded on its transmit
          const ObTransmitSpec *transmit = static_cast<const ObTransmitSpec *>(phy_op->get_child());
          if (OB_INVALID_ID != transmit->get_px_id()) {
            dfo->set_qc_id(transmit->get_px_id());
          }
        }
        // The root DFO is not a real DFO and has no allocated id;
        // ObDfo::MAX_DFO_ID serves as its placeholder.
        if (OB_FAIL(dfo_int_gen.gen_id(dfo->get_dfo_id(), dfo->get_interrupt_id()))) {
          LOG_WARN("fail gen dfo int id", K(ret));
        }
        LOG_TRACE("cur dfo info", K(dfo->get_qc_id()), K(dfo->get_dfo_id()), K(dfo->get_dop()));
      }
    } else {
      const ObTransmitSpec *transmit = static_cast<const ObTransmitSpec *>(phy_op);
      // If the subtree below this transmit contains a px coord op, the
      // settings below are later overwritten to is_local = true, dop = 1.
      dfo->set_coord_info_ptr(&px_coord_info);
      dfo->set_single(transmit->is_px_single());
      dfo->set_dop(transmit->get_px_dop());
      dfo->set_qc_id(transmit->get_px_id());
      dfo->set_dfo_id(transmit->get_dfo_id());
      dfo->set_execution_id(exec_ctx.get_my_session()->get_current_execution_id());
      dfo->set_px_sequence_id(dfo_int_gen.get_px_sequence_id());
      if (OB_NOT_NULL(phy_op->get_phy_plan()) && phy_op->get_phy_plan()->is_enable_px_fast_reclaim()) {
        ObDetectableId sqc_detectable_id;
        // if generate_detectable_id failed, means that server id is not ready
        if (OB_FAIL(ObDetectManagerUtils::generate_detectable_id(sqc_detectable_id, GET_TENANT_ID()))) {
          LOG_WARN("[DM] failed to generate_detectable_id for sqc");
        } else {
          ObPxDetectableIds px_detectable_ids(px_coord_info.qc_detectable_id_, sqc_detectable_id);
          dfo->set_px_detectable_ids(px_detectable_ids);
        }
      }
      if (OB_SUCC(ret)) {
        dfo->set_dist_method(transmit->dist_method_);
        dfo->set_slave_mapping_type(transmit->get_slave_mapping_type());
        dfo->set_pkey_table_loc_id(
            (reinterpret_cast<const ObPxTransmitSpec *>(transmit))->repartition_table_id_);
        if (OB_ISNULL(parent_dfo)) {
          // BUGFIX: this check now precedes any parent_dfo dereference
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("parent dfo should not be null", K(ret));
        } else {
          parent_dfo->set_slave_mapping_type(transmit->get_slave_mapping_type());
          if (transmit->get_px_dop() <= 0) {
            ret = OB_ERR_UNEXPECTED;
            LOG_WARN("should have dop set by optimizer", K(ret), K(transmit->get_px_dop()));
          } else if (OB_FAIL(dfo_int_gen.gen_id(transmit->get_dfo_id(),
                                                dfo->get_interrupt_id()))) {
            LOG_WARN("fail gen dfo int id", K(ret));
          } else {
            dfo->set_qc_server_id(GCTX.server_id_);
            dfo->set_parent_dfo_id(parent_dfo->get_dfo_id());
            LOG_TRACE("cur dfo dop",
                      "dfo_id", dfo->get_dfo_id(),
                      "is_local", transmit->is_px_single(),
                      "dop", transmit->get_px_dop(),
                      K(dfo->get_qc_id()),
                      "parent dfo_id", parent_dfo->get_dfo_id(),
                      "slave mapping", transmit->is_slave_mapping());
          }
        }
      }
    }
  }
  if (nullptr != dfo) {
    parent_dfo = dfo;
  }
  if (OB_SUCC(ret)) {
    if (got_fulltree_dfo) {
      // A DFO containing a nested px coord must be serialized together with
      // its whole subtree (fulltree) and runs locally with dop 1.
      if (OB_ISNULL(parent_dfo)) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("inner px coord op should be in a dfo", K(ret));
      } else {
        parent_dfo->set_fulltree(true);
        parent_dfo->set_single(true);
        parent_dfo->set_dop(1);
      }
      // we have reach to a inner qc operator,
      // no more monkeys jumping on the bed!
    } else {
      for (int32_t i = 0; OB_SUCC(ret) && i < phy_op->get_child_cnt(); ++i) {
        ObDfo *tmp_parent_dfo = parent_dfo;
        if (OB_FAIL(do_split(exec_ctx, allocator, phy_op->get_child(i),
                             tmp_parent_dfo, dfo_int_gen, px_coord_info))) {
          LOG_WARN("fail split op into dfo", K(ret));
        }
      }
    }
  }
  return ret;
}
// Allocate and placement-construct an ObDfo rooted at dfo_root_op.
// On success `dfo` owns the new object (freed via ObDfo::reset_resource).
int ObDfoMgr::create_dfo(ObIAllocator &allocator,
                         const ObOpSpec *dfo_root_op,
                         ObDfo *&dfo) const
{
  int ret = OB_SUCCESS;
  dfo = NULL;
  void *mem = NULL;
  if (OB_ISNULL(dfo_root_op)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("NULL unexpected", K(ret));
  } else if (OB_ISNULL(mem = allocator.alloc(sizeof(ObDfo)))) {
    ret = OB_ALLOCATE_MEMORY_FAILED;
    LOG_ERROR("fail to alloc ObDfo", K(ret));
  } else if (OB_ISNULL(dfo = new(mem) ObDfo(allocator))) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("fail to new ObDfo", K(ret));
  } else {
    dfo->set_root_op_spec(dfo_root_op);
    dfo->set_phy_plan(dfo_root_op->get_phy_plan());
  }
  return ret;
}
// get_ready_dfo is used only for single-layer DFO scheduling.
// Each call yields one not-yet-active, unfinished DFO (marked active);
// returns OB_ITER_END once every edge has finished.
int ObDfoMgr::get_ready_dfo(ObDfo *&dfo) const
{
  int ret = OB_SUCCESS;
  bool all_finish = true;
  dfo = NULL;
  const int64_t edge_cnt = edges_.count();
  for (int64_t i = 0; OB_SUCC(ret) && i < edge_cnt && NULL == dfo; ++i) {
    ObDfo *edge = edges_.at(i);
    if (!edge->is_thread_finish()) {
      all_finish = false;
      if (!edge->is_active()) {
        dfo = edge;
        dfo->set_active();
      }
    }
  }
  if (OB_SUCC(ret) && all_finish) {
    ret = OB_ITER_END;
  }
  return ret;
}
// Note the two distinct return states:
// - if some edge is unfinished but nothing more can be scheduled right now,
//   an empty set is returned (caller keeps waiting);
// - if every edge has finished, OB_ITER_END is returned.
// Each call yields at most one pair of DFOs: child & parent.
int ObDfoMgr::get_ready_dfos(ObIArray<ObDfo*> &dfos) const
{
  int ret = OB_SUCCESS;
  bool all_finish = true;
  bool got_pair_dfo = false;
  dfos.reset();
  LOG_TRACE("ready dfos", K(edges_.count()));
  // edges_ is sorted in schedule order; earlier edges are scheduled first
  for (int64_t i = 0; OB_SUCC(ret) && i < edges_.count(); ++i) {
    ObDfo *edge = edges_.at(i);
    ObDfo *root_edge = edges_.at(edges_.count() - 1);
    if (edge->is_thread_finish()) {
      LOG_TRACE("finish dfo", K(*edge));
      continue;
    } else {
      // The edge is unfinished; the scheduling goal is to drive it to
      // completion, which includes starting DFOs it depends on:
      // - if the edge itself is not scheduled, schedule it immediately;
      // - if it is scheduled, check whether it depends on other DFOs.
      all_finish = false;
      if (!edge->is_active()) {
        if (OB_FAIL(dfos.push_back(edge))) {
          LOG_WARN("fail push dfo", K(ret));
        } else if (NULL == edge->parent()) {
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("parent is NULL, unexpected", K(ret));
        } else if (OB_FAIL(dfos.push_back(edge->parent()))) {
          LOG_WARN("fail push dfo", K(ret));
        } else {
          edge->set_active();
          got_pair_dfo = true;
        }
      } else if (edge->has_depend_sibling()) {
        // Walk the dependency chain to the first sibling DFO that is
        // still unfinished.
        ObDfo *sibling_edge = edge->depend_sibling();
        for (/* nop */;
             nullptr != sibling_edge && sibling_edge->is_thread_finish();
             sibling_edge = sibling_edge->depend_sibling()) {
          // search forward: [leaf] --> [leaf] --> [leaf]
        }
        if (OB_UNLIKELY(nullptr == sibling_edge)) {
          // nop, all sibling finish
        } else if (sibling_edge->is_active()) {
          // nop, wait for a sibling finish.
          // after then can we shedule next edge
        } else if (OB_FAIL(dfos.push_back(sibling_edge))) {
          LOG_WARN("fail push dfo", K(ret));
        } else if (NULL == sibling_edge->parent()) {
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("parent is NULL, unexpected", K(ret));
        } else if (OB_FAIL(dfos.push_back(sibling_edge->parent()))) {
          LOG_WARN("fail push dfo", K(ret));
        } else {
          sibling_edge->set_active();
          got_pair_dfo = true;
          LOG_TRACE("start schedule dfo", K(*sibling_edge), K(*sibling_edge->parent()));
        }
      } else {
        // The current edge is unfinished and no sibling edge needs
        // scheduling; fall through (dfos may stay empty).
      }
      // Three-layer DFO scheduling.
      // Note: even if a sibling was scheduled above (3 DFOs already
      // active), we still try to schedule a 4th depend-parent DFO.
      if (OB_SUCC(ret) && !got_pair_dfo && GCONF._px_max_pipeline_depth > 2) {
        ObDfo *parent_edge = edge->parent();
        if (NULL != parent_edge &&
            !parent_edge->is_active() &&
            NULL != parent_edge->parent() &&
            parent_edge->parent()->is_earlier_sched()) {
          /* Consider the following scenario:
           *    parent-parent
           *     |
           *    parent
           *     |
           *    edge
           * When this branch is reached, edge is active and the pair
           * (edge, parent) has been scheduled. Furthermore, parent's
           * execution depends on parent-parent being scheduled as well
           * (2+ DFO scheduling optimization: e.g. a hash join's output can
           * be emitted directly without a material op on top).
           */
          if (OB_FAIL(dfos.push_back(parent_edge))) {
            LOG_WARN("fail push dfo", K(ret));
          } else if (OB_FAIL(dfos.push_back(parent_edge->parent()))) {
            LOG_WARN("fail push dfo", K(ret));
          } else {
            parent_edge->set_active();
            got_pair_dfo = true;
            LOG_DEBUG("dfo do earlier scheduling", K(*parent_edge->parent()));
          }
        }
      }
      // If one of root_edge's child has scheduled, we try to start the root_dfo.
      if (OB_SUCC(ret) && !got_pair_dfo) {
        if (edge->is_active() &&
            !root_edge->is_active() &&
            edge->has_parent() &&
            edge->parent() == root_edge) {
          // Optimization: start the root DFO early so it can pull data
          // promptly; in some plans this avoids inserting an unnecessary
          // blocking material op in the DFO below.
          //
          // This is safe because the root DFO occupies its resources
          // whether scheduled or not — scheduling it costs nothing extra.
          if (OB_ISNULL(root_edge->parent()) || root_edge->parent() != root_dfo_) {
            ret = OB_ERR_UNEXPECTED;
            LOG_WARN("The root edge is null or it's parent not root dfo", K(ret));
          } else if (OB_FAIL(dfos.push_back(root_edge))) {
            LOG_WARN("Fail to push dfo", K(ret));
          } else if (OB_FAIL(dfos.push_back(root_dfo_))) {
            LOG_WARN("Fail to push dfo", K(ret));
          } else {
            root_edge->set_active();
            got_pair_dfo = true;
            LOG_TRACE("Try to schedule root dfo", KP(root_edge), KP(root_dfo_));
          }
        }
      }
      // Only one pair of DFOs is yielded per call.
      //
      // If:
      // - the current edge is unfinished,
      // - no sibling edge needs scheduling,
      // - no depend-parent edge needs scheduling,
      // - and the root DFO does not need scheduling either,
      // then dfos stays empty and the caller keeps waiting.
      break;
    }
  }
  if (all_finish && OB_SUCCESS == ret) {
    ret = OB_ITER_END;
  }
  return ret;
}
// Append a DFO to the schedule-order edge array. The plan is capped at
// ObDfo::MAX_DFO_ID DFOs. Ownership note: if the push fails, the edge's
// resources are released here so the caller must not free it again.
int ObDfoMgr::add_dfo_edge(ObDfo *edge)
{
  int ret = OB_SUCCESS;
  if (edges_.count() >= ObDfo::MAX_DFO_ID) {
    ret = OB_NOT_SUPPORTED;
    LOG_USER_ERROR(OB_NOT_SUPPORTED, "plan with more than 128 DFOs");
  } else {
    if (OB_FAIL(edges_.push_back(edge))) {
      LOG_WARN("fail to push back dfo", K(*edge), K(ret));
      // push failed: free the edge here to avoid leaking it
      ObDfo::reset_resource(edge);
      edge = nullptr;
    }
  }
  return ret;
}
// Look up the edge whose dfo id equals `id`. On miss, `edge` is left
// untouched and OB_ENTRY_NOT_EXIST is returned.
int ObDfoMgr::find_dfo_edge(int64_t id, ObDfo *&edge)
{
  int ret = OB_SUCCESS;
  const int64_t cnt = edges_.count();
  if (id < 0 || id >= ObDfo::MAX_DFO_ID || id >= cnt) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("invalid dfo id", K(id), K(edges_.count()), K(ret));
  } else {
    bool found = false;
    for (int64_t i = 0; !found && i < cnt; ++i) {
      if (edges_.at(i)->get_dfo_id() == id) {
        edge = edges_.at(i);
        found = true;
      }
    }
    if (!found) {
      ret = OB_ENTRY_NOT_EXIST;
      LOG_WARN("not found dfo", K(id), K(cnt), K(ret));
    }
  }
  return ret;
}
// Collect every edge currently in active state, in schedule order.
int ObDfoMgr::get_active_dfos(ObIArray<ObDfo*> &dfos) const
{
  int ret = OB_SUCCESS;
  dfos.reset();
  const int64_t edge_cnt = edges_.count();
  for (int64_t idx = 0; OB_SUCC(ret) && idx < edge_cnt; ++idx) {
    ObDfo *edge = edges_.at(idx);
    if (!edge->is_active()) {
      // skip: not scheduled yet
    } else if (OB_FAIL(dfos.push_back(edge))) {
      LOG_WARN("fail push back edge", K(ret));
    }
  }
  return ret;
}
// Collect every edge that has been through schedule_dfo. A DFO is marked
// scheduled regardless of whether its scheduling succeeded or failed.
int ObDfoMgr::get_scheduled_dfos(ObIArray<ObDfo*> &dfos) const
{
  int ret = OB_SUCCESS;
  dfos.reset();
  const int64_t edge_cnt = edges_.count();
  for (int64_t idx = 0; OB_SUCC(ret) && idx < edge_cnt; ++idx) {
    ObDfo *edge = edges_.at(idx);
    if (!edge->is_scheduled()) {
      // skip: schedule_dfo has not been invoked for this edge
    } else if (OB_FAIL(dfos.push_back(edge))) {
      LOG_WARN("fail push back edge", K(ret));
    }
  }
  return ret;
}
// Collect every edge that has been scheduled (successfully or not) but
// whose worker threads have not yet finished.
int ObDfoMgr::get_running_dfos(ObIArray<ObDfo*> &dfos) const
{
  int ret = OB_SUCCESS;
  dfos.reset();
  const int64_t edge_cnt = edges_.count();
  for (int64_t idx = 0; OB_SUCC(ret) && idx < edge_cnt; ++idx) {
    ObDfo *edge = edges_.at(idx);
    const bool running = edge->is_scheduled() && !edge->is_thread_finish();
    if (!running) {
      // skip: either never scheduled or already finished
    } else if (OB_FAIL(dfos.push_back(edge))) {
      LOG_WARN("fail push back edge", K(ret));
    }
  }
  return ret;
}