/* -------------------------------------------------------------------------
 *
 * streamplan_utils.cpp
 *     Functions related to stream plan checking.
 *
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/optimizer/plan/streamplan_utils.cpp
 *
 * -------------------------------------------------------------------------
 */
#include <math.h>

#include "access/transam.h"
#include "commands/explain.h"
#include "nodes/makefuncs.h"
#include "optimizer/cost.h"
#include "optimizer/dataskew.h"
#include "optimizer/pathnode.h"
#include "optimizer/planner.h"
#include "optimizer/restrictinfo.h"
#include "optimizer/tlist.h"
#include "parser/parse_collate.h"
#include "parser/parse_coerce.h"
#include "parser/parse_clause.h"
#include "parser/parse_oper.h"
#include "parser/parse_relation.h"
#include "pgxc/groupmgr.h"
#include "pgxc/poolmgr.h"
#include "pgxc/pruningslice.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"

/*
 * Description: return the op list in the plan
 * @param[IN] result_plan: input plan
 * @param[IN] check_eval: evaluation function for this op
 * @return: list of ops in the plan.
 */
List* check_op_list_template(Plan* result_plan, List* (*check_eval)(Node*))
{
    List* res_list = NIL;

    res_list = check_eval((Node*)result_plan->targetlist);
    res_list = list_concat_unique(res_list, check_eval((Node*)result_plan->qual));

    switch (nodeTag(result_plan)) {
        case T_SeqScan:
        case T_CStoreScan:
#ifdef ENABLE_MULTIPLE_NODES
        case T_TsStoreScan:
#endif /* ENABLE_MULTIPLE_NODES */
        case T_ExtensiblePlan:
            break;
        case T_ForeignScan:
        case T_VecForeignScan: {
            ForeignScan* foreignScan = (ForeignScan*)result_plan;
            if (!OidIsValid(foreignScan->scan_relid)) {
                break;
            }

            ForeignTable* ftbl = NULL;
            ForeignServer* fsvr = NULL;
            ftbl = GetForeignTable(foreignScan->scan_relid);
            AssertEreport(NULL != ftbl, MOD_OPT, "The foreign table is NULL");
            fsvr = GetForeignServer(ftbl->serverid);
            AssertEreport(NULL != fsvr, MOD_OPT, "The foreign server is NULL");

            /*
             * If the predicate is pushed down, we must find the subplan from
             * the item->hdfsQual struct.
             */
            if (isObsOrHdfsTableFormSrvName(fsvr->servername)) {
                List* foreignPrivateList = (List*)foreignScan->fdw_private;
                DfsPrivateItem* item = (DfsPrivateItem*)((DefElem*)linitial(foreignPrivateList))->arg;
                res_list = list_concat_unique(res_list, check_eval((Node*)item->hdfsQual));
            }
            break;
        }
        case T_IndexScan: {
            IndexScan* splan = (IndexScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->indexqual));
        } break;
        case T_IndexOnlyScan: {
            IndexOnlyScan* splan = (IndexOnlyScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->indexqual));
        } break;
        case T_CStoreIndexScan: {
            CStoreIndexScan* splan = (CStoreIndexScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->indexqual));
        } break;
        case T_BitmapIndexScan: {
            BitmapIndexScan* splan = (BitmapIndexScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->indexqual));
        } break;
        case T_CStoreIndexCtidScan: {
            CStoreIndexCtidScan* splan = (CStoreIndexCtidScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->indexqual));
        } break;
        case T_TidScan: {
            TidScan* splan = (TidScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->tidquals));
        } break;
        case T_FunctionScan: {
            FunctionScan* splan = (FunctionScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->funcexpr));
        } break;
        case T_ValuesScan: {
            ValuesScan* splan = (ValuesScan*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->values_lists));
        } break;
        case T_NestLoop:
        case T_VecNestLoop: {
            Join* splan = (Join*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->joinqual));
        } break;
        case T_MergeJoin:
        case T_VecMergeJoin: {
            MergeJoin* splan = (MergeJoin*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->join.joinqual));
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->mergeclauses));
        } break;
        case T_HashJoin:
        case T_VecHashJoin: {
            HashJoin* splan = (HashJoin*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->join.joinqual));
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->hashclauses));
        } break;
        case T_Limit:
        case T_VecLimit: {
            Limit* splan = (Limit*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->limitOffset));
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->limitCount));
        } break;
        case T_VecWindowAgg:
        case T_WindowAgg: {
            WindowAgg* splan = (WindowAgg*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->startOffset));
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->endOffset));
        } break;
        case T_BaseResult:
        case T_VecResult: {
            BaseResult* splan = (BaseResult*)result_plan;
            res_list = list_concat_unique(res_list, check_eval((Node*)splan->resconstantqual));
        } break;
        case T_ModifyTable: {
            ModifyTable* splan = (ModifyTable*)result_plan;
            if (splan->upsertAction == UPSERT_UPDATE && splan->updateTlist != NULL) {
                res_list = list_concat_unique(res_list, check_eval((Node*)splan->updateTlist));
            }
        } break;
        default:
            break;
    }

    return res_list;
}
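/*
 * Usage sketch (illustrative only, not part of the original code): the
 * template is combined with a List* (*)(Node*) evaluator, exactly as the
 * wrappers further below (check_func_list, check_subplan_list,
 * check_vartype_list) do. For instance, with check_func as the evaluator a
 * non-NIL result means the plan references a function whose funcid is above
 * FirstNormalObjectId:
 *
 *     List* hits = check_op_list_template(plan, (List* (*)(Node*)) check_func);
 *     if (hits != NIL) {
 *         // the plan contains a user-defined function call
 *     }
 */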
/*
 * @Description: Search node and check funcid > FirstNormalObjectId.
 * @in node: Current node.
 * @in found: set to true once such a funcid is found.
 */
static bool check_func_walker(Node* node, bool* found)
{
    if (node == NULL) {
        return false;
    } else if (IsA(node, FuncExpr)) {
        FuncExpr* fexpr = (FuncExpr*)node;
        if (fexpr->funcid > FirstNormalObjectId) {
            *found = true;
            return true;
        } else {
            return false;
        }
    }
    return expression_tree_walker(node, (bool (*)())check_func_walker, found);
}

/*
 * @Description: Check whether this node contains a funcid > FirstNormalObjectId.
 * @in node: Checked node.
 */
List* check_func(Node* node)
{
    bool found = false;
    List* rs = NIL;
    (void)check_func_walker(node, &found);
    if (found)
        rs = lappend(rs, NULL);
    return rs;
}

static bool contains_specified_func_walker(Node* node, void* context)
{
    if (node == NULL) {
        return false;
    }

    contain_func_context* ctxt = (contain_func_context*)context;
    if (IsA(node, FuncExpr)) {
        FuncExpr* func = (FuncExpr*)node;
        if (list_member_oid(ctxt->funcids, func->funcid)) {
            ctxt->func_exprs = lappend(ctxt->func_exprs, func);
            if (!ctxt->find_all) {
                return true;
            }
        }
    }
    if (IsA(node, Query)) {
        /* Recurse into subselects */
        bool result = query_tree_walker((Query*)node, (bool (*)())contains_specified_func_walker, context, 0);
        return result;
    }
    return expression_tree_walker(node, (bool (*)())contains_specified_func_walker, context);
}

/*
 * contains_specified_func:
 *     Walk through the node, search for the func_exprs specified by
 *     context->funcids, and append them to context->func_exprs.
 *
 * @in node: the node that will be checked.
 * @in context->funcids: the funcs that will be searched for in the node.
 * @in context->find_all: false means return once any func in context->funcids
 *     is found, true means walk through all the branches to search for them.
 * @in/out context->func_exprs: the func exprs we find are appended to this list.
 */
List* contains_specified_func(Node* node, contain_func_context* context)
{
    /* context->funcids is the list of funcs to be checked, it can't be NIL */
    AssertEreport(context->funcids != NIL, MOD_OPT, "The context funcids is NIL");
    (void)query_or_expression_tree_walker(node, (bool (*)())contains_specified_func_walker, (void*)context, 0);
    return context->func_exprs;
}
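/*
 * Usage sketch (illustrative, not part of the original code; target_funcid
 * and expr are placeholders): searching an expression for a known function
 * oid and stopping at the first match:
 *
 *     contain_func_context context;
 *     context.funcids = list_make1_oid(target_funcid);
 *     context.func_exprs = NIL;
 *     context.find_all = false;
 *     List* hits = contains_specified_func(expr, &context);
 *     // a non-NIL hits list carries the matched FuncExpr nodes
 */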
/*
 * stream_path_walker:
 *     traverse a path node to find out whether it contains a stream node or
 *     a parameterized path
 *
 * Parameters:
 *     @in path: path that needs to be checked
 *     @in context: context for checking stream node
 */
void stream_path_walker(Path* path, ContainStreamContext* context)
{
    if (NULL == path) {
        return;
    }

    /* Record if there's a parameterized path */
    if (path->param_info) {
        Relids req_outer = bms_intersect(path->param_info->ppi_req_outer, context->outer_relids);
#ifdef ENABLE_MULTIPLE_NODES
        Bitmapset* req_upper = bms_union(path->param_info->ppi_req_upper, context->upper_params);
        if (!bms_is_empty(req_outer) || !bms_is_empty(req_upper)) {
#else
        if (!bms_is_empty(req_outer)) {
#endif
            context->has_parameterized_path = true;
            /*
             * If a param path is under materialize, it definitely crosses a
             * stream node. Set the flag here because stream nodes are skipped
             * when traversing under materialize.
             */
            if (context->under_materialize_all)
                context->has_stream = true;
            bms_free_ext(req_outer);
        }
    }

    /* Quit earlier if the check condition is met */
    if (context->has_stream && (context->only_check_stream || context->has_parameterized_path)) {
        return;
    }

    switch (path->pathtype) {
        case T_Stream: {
            /* Don't count stream nodes under an added materialize node */
            if (!context->under_materialize_all)
                context->has_stream = true;
            stream_path_walker(((StreamPath*)path)->subpath, context);
        } break;
        /* For subqueryscan, we should traverse into its child plan */
        case T_SubqueryScan: {
            Plan* splan = path->parent->subplan;
            if (contain_special_plan_node(splan, T_Stream))
                context->has_stream = true;
            else
                context->has_stream = false;
        } break;
        case T_Append: {
            ListCell* cell = NULL;
            foreach (cell, ((AppendPath*)path)->subpaths) {
                stream_path_walker((Path*)lfirst(cell), context);
            }
        } break;
        case T_MergeAppend: {
            ListCell* cell = NULL;
            foreach (cell, ((MergeAppendPath*)path)->subpaths) {
                stream_path_walker((Path*)lfirst(cell), context);
            }
        } break;
        case T_Material: {
            /*
             * For an added materialize node above stream, we can safely return if there are
             * no parameters, or we will check further whether the parameter is used under
             * materialize
             */
            if (((MaterialPath*)path)->materialize_all && context->outer_relids == NULL)
                return;

            bool saved_value = context->under_materialize_all;
            /* If stream is not materialize_all, we should count streams under it */
            if (((MaterialPath*)path)->materialize_all)
                context->under_materialize_all = true;
            stream_path_walker(((MaterialPath*)path)->subpath, context);
            context->under_materialize_all = saved_value;
        } break;
        case T_Unique: {
            stream_path_walker(((UniquePath*)path)->subpath, context);
        } break;
        case T_PartIterator: {
            stream_path_walker(((PartIteratorPath*)path)->subPath, context);
        } break;
        case T_NestLoop:
        case T_HashJoin:
        case T_MergeJoin: {
            JoinPath* jp = (JoinPath*)path;
            stream_path_walker(jp->innerjoinpath, context);
            stream_path_walker(jp->outerjoinpath, context);
        } break;
        case T_IndexScan:
        case T_IndexOnlyScan:
        case T_BitmapHeapScan: {
            IndexPath* indexPath = (IndexPath*)path;
            if (g_instance.attr.attr_storage.enable_delta_store &&
                (indexPath->indexinfo->relam == PSORT_AM_OID || indexPath->indexinfo->relam == CBTREE_AM_OID ||
                    indexPath->indexinfo->relam == CGIN_AM_OID))
                context->has_cstore_index_delta = true;
        } break;
        default:
            break;
    }

    return;
}
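/*
 * Usage sketch (illustrative, not part of the original code): a typical
 * caller zeroes the context, records the relids whose parameters it cares
 * about, and inspects the flags after the walk:
 *
 *     ContainStreamContext context;
 *     errno_t rc = memset_s(&context, sizeof(context), 0, sizeof(context));
 *     securec_check(rc, "\0", "\0");
 *     context.outer_relids = outer_relids;    // relids of interest (placeholder)
 *     context.only_check_stream = true;       // quit early once a stream is seen
 *     stream_path_walker(path, &context);
 *     if (context.has_stream) {
 *         // the path crosses a stream node
 *     }
 */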
/*
 * @Description: Recurse the whole plan tree to do some extra work.
 *     1. Search the plan tree to see whether there is a subplan/initplan
 *     2. Change the plan with a subplan/initplan to serialization.
 *
 * @param[IN] result_plan: plan node
 * @param[IN] parent: parent plan of result_plan
 * @param[IN] cell: set if result_plan is in parent's plan list cell
 * @param[IN] is_left: whether result_plan is parent's lefttree
 * @param[IN] initplans: initplans list
 * @param[IN] is_search: whether we are doing the search job or the rollback job.
 * @return: whether there is a subplan/initplan
 */
bool has_subplan(Plan* result_plan, Plan* parent, ListCell* cell, bool is_left, List** initplans, bool is_search)
{
    bool hasSubplan_under = false;
    bool hasSubplan_this = false;
    ListCell* lc = NULL;

    if (result_plan == NULL)
        return false;

    if (is_search) {
        /* Check the subplan. */
        if (result_plan->initPlan && IS_STREAM_PLAN) {
            *initplans = list_concat(*initplans, list_copy(result_plan->initPlan));
        }

        List* subplan_list = check_subplan_list(result_plan);
        foreach (lc, subplan_list) {
            Node* node = (Node*)lfirst(lc);
            SubPlan* subplan = NULL;
            if (IsA(node, SubPlan)) {
                hasSubplan_this = true;
            } else if (isIntergratedMachine == false) {
                AssertEreport(IsA(node, Param), MOD_OPT, "The node is NOT a param");
                Param* param = (Param*)node;
                ListCell* lc2 = NULL;
                foreach (lc2, *initplans) {
                    subplan = (SubPlan*)lfirst(lc2);
                    if (list_member_int(subplan->setParam, param->paramid)) {
                        hasSubplan_this = true;
                    }
                }
            }
        }
    } else {
        /*
         * Do not parallelize a plan node that runs in the same thread
         * as a plan node which has a subplan.
         */
        result_plan->dop = 1;
        result_plan->parallel_enabled = false;
    }

    List* plan_list = NIL;
    /* Find the plan list in special plan nodes. */
    switch (nodeTag(result_plan)) {
        case T_Append:
        case T_VecAppend: {
            Append* append = (Append*)result_plan;
            plan_list = append->appendplans;
        } break;
        case T_ModifyTable:
        case T_VecModifyTable: {
            ModifyTable* mt = (ModifyTable*)result_plan;
            plan_list = mt->plans;
        } break;
        case T_MergeAppend:
        case T_VecMergeAppend: {
            MergeAppend* ma = (MergeAppend*)result_plan;
            plan_list = ma->mergeplans;
        } break;
        case T_BitmapAnd:
        case T_CStoreIndexAnd: {
            BitmapAnd* ba = (BitmapAnd*)result_plan;
            plan_list = ba->bitmapplans;
        } break;
        case T_BitmapOr:
        case T_CStoreIndexOr: {
            BitmapOr* bo = (BitmapOr*)result_plan;
            plan_list = bo->bitmapplans;
        } break;
        default: {
            plan_list = NIL;
        } break;
    }

    if (NIL != plan_list) {
        foreach (lc, plan_list) {
            Plan* plan = (Plan*)lfirst(lc);
            hasSubplan_under |= has_subplan(plan, result_plan, lc, false, initplans, is_search);
        }
    } else {
        switch (nodeTag(result_plan)) {
            case T_SubqueryScan:
            case T_VecSubqueryScan: {
                SubqueryScan* ss = (SubqueryScan*)result_plan;
                if (ss->subplan) {
                    if (has_subplan(ss->subplan, result_plan, NULL, false, initplans, is_search))
                        hasSubplan_under = true;
                }
            } break;
            case T_Stream:
            case T_VecStream: {
                Stream* stream = (Stream*)result_plan;
                bool removed = false;

                if (is_search) {
                    hasSubplan_under =
                        has_subplan(result_plan->lefttree, result_plan, NULL, true, initplans, is_search);
                    /* Only do rollback at the stream node. */
                    if (hasSubplan_under) {
                        (void)has_subplan(result_plan->lefttree, result_plan, NULL, true, initplans, false);
                        stream->smpDesc.producerDop = 1;
                    }
                    hasSubplan_under = false;
                } else {
                    stream->smpDesc.consumerDop = 1;
                }

                if (!stream->is_recursive_local) {
                    removed = remove_local_plan(result_plan, parent, cell, is_left);
                } else {
                    /* For MPP Recursive, don't remove the LOCAL GATHER(1:1) upon CteScan, just keep it */
                    Assert(LOCAL_ROUNDROBIN == stream->smpDesc.distriType && IsA(result_plan->lefttree, CteScan));
                }
#ifdef ENABLE_MULTIPLE_NODES
                /*
                 * Push down the exec nodes of a local stream to prevent hang.
                 * If the parent is a remote query, its exec_nodes covers all
                 * nodes, so we don't need to push it down.
                 */
                bool underRemoteQuery = IsA(parent, RemoteQuery) || IsA(parent, VecRemoteQuery);
                if (!removed && !underRemoteQuery && STREAM_IS_LOCAL_NODE(stream->smpDesc.distriType) &&
                    list_length(result_plan->exec_nodes->nodeList) > 0) {
                    List* diff_1 = list_difference_int(result_plan->exec_nodes->nodeList, parent->exec_nodes->nodeList);
                    List* diff_2 = list_difference_int(parent->exec_nodes->nodeList, result_plan->exec_nodes->nodeList);
                    if (diff_1 != NIL || diff_2 != NIL) {
                        list_free(diff_1);
                        list_free(diff_2);
                        pushdown_execnodes(result_plan, parent->exec_nodes, true);
                    }
                }
#endif
                return false;
            } break;
            default: {
                hasSubplan_under |= has_subplan(result_plan->lefttree, result_plan, NULL, true, initplans, is_search);
                hasSubplan_under |= has_subplan(result_plan->righttree, result_plan, NULL, false, initplans, is_search);
            } break;
        }
    }

    return hasSubplan_this || hasSubplan_under;
}

void compute_stream_cost(StreamType type, char locator_type, double subrows, double subgblrows, double skew_ratio,
    int width, bool isJoin, List* distribute_keys, Cost* total_cost, double* gblrows, unsigned int producer_num_dn,
    unsigned int consumer_num_dn, ParallelDesc* smpDesc, List* ssinfo)
{
    double size, send_size, receive_size;
    double stream_ratio_send = 1.0;
    double stream_ratio_receive = 1.0;
    int sendDop = 1;
    int receiveDop = 1;
    SmpStreamType pa_stream = PARALLEL_NONE;
    Cost send_cost = 0;
    Cost receive_cost = 0;
    Cost send_data_cost = DEFAULT_SEND_KDATA_COST;
    Cost receive_data_cost = DEFAULT_RECEIVE_KDATA_COST;
    Cost cpu_hashcal_cost = DEFAULT_CPU_HASH_COST;

#ifdef ENABLE_MULTIPLE_NODES
    /* For compatibility with the original single-installation nodegroup mode */
    if (producer_num_dn == 0)
        producer_num_dn = 1;
    else if (ng_is_all_in_installation_nodegroup_scenario())
        producer_num_dn = u_sess->pgxc_cxt.NumDataNodes;
    if (consumer_num_dn == 0)
        consumer_num_dn = 1;
    else if (ng_is_all_in_installation_nodegroup_scenario())
        consumer_num_dn = u_sess->pgxc_cxt.NumDataNodes;
#else
    producer_num_dn = 1;
    consumer_num_dn = 1;
#endif

    /* Get the dop of the stream send side and receive side if parallel. */
    if (smpDesc != NULL) {
        sendDop = SET_DOP(smpDesc->producerDop);
        receiveDop = SET_DOP(smpDesc->consumerDop);
        pa_stream = smpDesc->distriType;
        /* Add the thread cost at the consumer side. */
        receive_cost += (receiveDop - 1) * u_sess->opt_cxt.smp_thread_cost;
        parallel_stream_info_print(smpDesc, type);
    }

    /*
     * There are two kinds of stream nodes:
     * 1. Transmit data through the network, including the original
     *    redistribute/broadcast nodes, and split
     *    redistribute/broadcast for smp.
     * 2. Transmit data through memory, including local redistribute,
     *    local broadcast, and local roundrobin for smp.
     * These two kinds of streams have different cost parameters
     * but share the same cost formula.
     */
    if (STREAM_IS_LOCAL_NODE(pa_stream)) {
        send_data_cost = LOCAL_SEND_KDATA_COST;
        receive_data_cost = LOCAL_RECEIVE_KDATA_COST;
        /* local broadcast and local roundrobin do not need hash calculation. */
        if (LOCAL_BROADCAST == pa_stream || LOCAL_ROUNDROBIN == pa_stream)
            cpu_hashcal_cost = 0;
        else
            cpu_hashcal_cost = DEFAULT_CPU_HASH_COST;
    }

    ereport(DEBUG2,
        (errmodule(MOD_OPT),
            (errmsg("Stream cost: SMP INFO: sendDop: %d, receiveDop: %d, distriType: %d",
                sendDop,
                receiveDop,
                pa_stream))));

    /*
     * Time consumption is higher for join than for agg at small widths
     * according to tests, possibly because of wait time during join, so we
     * simulate it by adjusting the width.
     */
    if (isJoin) {
        width = ((width - 1) / 128 + 1) * 128;
#ifdef ENABLE_MULTIPLE_NODES
        stream_ratio_send = pow(u_sess->pgxc_cxt.NumDataNodes, (double)1 / 3);
        stream_ratio_receive = pow(u_sess->pgxc_cxt.NumDataNodes, (double)1 / 3);
#else
        stream_ratio_send = pow(1, (double)1 / 3);
        stream_ratio_receive = pow(1, (double)1 / 3);
#endif
    } else if (width <= 0) {
        width = 8;
    }

    /*
     * We count the amount according to the number of network sends and
     * receives because we send the data in 8K units.
     */
    size = (subrows)*width / 8192.0;
    ereport(DEBUG1, (errmodule(MOD_OPT), (errmsg("Stream cost: data size: %lf", size))));

    /*
     * Don't consider latency.
     * The cost of stream is the cost of broadcasting or distributing one data node's data.
     */
    switch (type) {
        case STREAM_BROADCAST: {
            double broadcast_cost = 0.0;
            /*
             * The data in one send thread is size/sendDop, and we need to
             * broadcast to receiveDop * u_sess->pgxc_cxt.NumDataNodes threads.
             */
            send_size = size / sendDop * receiveDop * consumer_num_dn * stream_ratio_send;
            receive_size = size * producer_num_dn * stream_ratio_receive;

            /* Cost of send and receive */
            send_cost += send_size * send_data_cost;
            receive_cost += receive_size * receive_data_cost;
            broadcast_cost = send_cost + receive_cost;
            broadcast_cost *= u_sess->attr.attr_sql.stream_multiple;

            ereport(DEBUG1,
                (errmodule(MOD_OPT),
                    (errmsg("Stream cost: broadcast. Send cost: %lf, Receive cost: %lf, "
                            "total_cost: %lf",
                        send_cost,
                        receive_cost,
                        broadcast_cost))));

            if (!u_sess->attr.attr_sql.enable_broadcast)
                broadcast_cost = Max(g_instance.cost_cxt.disable_cost, broadcast_cost * 1.0e4);

            *total_cost += broadcast_cost;
            /* We'll set path rows (used for join) as local rows and plan rows as global rows */
            *gblrows = ((isJoin) ? subgblrows : (subgblrows * consumer_num_dn));
            /* If SPLIT_BROADCAST, the data at the receive side will increase. */
            *gblrows *= receiveDop;
        } break;
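        /*
         * Worked example for the broadcast branch above (illustrative numbers,
         * non-join so both stream ratios stay 1.0): with subrows = 100000,
         * width = 16, sendDop = receiveDop = 1 and
         * producer_num_dn = consumer_num_dn = 4, the 8K unit count is
         * size = 100000 * 16 / 8192.0 ~= 195.3, hence
         * send_size = receive_size ~= 195.3 * 4 = 781.25; these sizes are then
         * scaled by DEFAULT_SEND_KDATA_COST / DEFAULT_RECEIVE_KDATA_COST, and
         * the sum by stream_multiple.
         */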
        case STREAM_REDISTRIBUTE: {
            double hash_cost = 0.0;
            double redistribute_cost = 0.0;
#ifdef ENABLE_MULTIPLE_NODES
            double nodegroup_stream_weight = ng_get_nodegroup_stream_weight(producer_num_dn, consumer_num_dn);
#else
            double nodegroup_stream_weight = 1.0;
#endif
            skew_ratio = (skew_ratio < 1.0) ? 1.0 : skew_ratio;

            if (!IsLocatorReplicated(locator_type) || pa_stream == LOCAL_BROADCAST) {
                if (LOCAL_BROADCAST != pa_stream) {
                    send_size = size / sendDop;
                    receive_size = size / receiveDop;
                } else {
                    send_size = size / sendDop * receiveDop;
                    receive_size = size;
                }

                /* Cost of send and receive */
                send_cost += send_size * send_data_cost * nodegroup_stream_weight;
                receive_cost += receive_size * receive_data_cost * skew_ratio * nodegroup_stream_weight;
                /* Hash calculation cost is related to the sending end. */
                hash_cost += list_length(distribute_keys) * subrows * cpu_hashcal_cost / sendDop;

                redistribute_cost = hash_cost + send_cost + receive_cost;
                redistribute_cost *= u_sess->attr.attr_sql.stream_multiple;

                ereport(DEBUG1,
                    (errmodule(MOD_OPT),
                        (errmsg("Stream cost: redistribute. Send cost: %lf, Receive cost: %lf, "
                                "total_cost: %lf",
                            send_cost,
                            receive_cost,
                            redistribute_cost))));

                *total_cost += redistribute_cost;
                *gblrows = subgblrows;
            } else {
                /*
                 * The total cost should be recomputed for hash filter; the hash
                 * filter expr is added in function stream_join_plan.
                 */
                *total_cost += list_length(distribute_keys) * subrows * cpu_hashcal_cost;
                *gblrows = subgblrows;
            }

            if (LOCAL_BROADCAST == pa_stream) {
                *gblrows *= receiveDop;
                if (!u_sess->attr.attr_sql.enable_broadcast)
                    *total_cost += Max(g_instance.cost_cxt.disable_cost, redistribute_cost * 1.0e4);
            }
        } break;
        case STREAM_GATHER: {
            send_size = size;
            receive_size = size * producer_num_dn;

            send_cost += send_size * send_data_cost;
            receive_cost += receive_size * receive_data_cost;
            *total_cost = *total_cost + send_cost + receive_cost;
            *gblrows = subgblrows;
        } break;
        case STREAM_HYBRID: {
            AssertEreport(ssinfo != NULL, MOD_OPT, "The ssinfo is NULL");

            double hash_cost = 0.0;
            double filter_cost = 0.0;
            double hybrid_cost = 0.0;
            double nodegroup_stream_weight = 1.0;
            double broadcast_send_size = 0.0;
            double broadcast_recv_size = 0.0;
            double broadcast_ratio = 0.0;
            QualCost qual_cost;
            ListCell* lc = NULL;

            qual_cost.per_tuple = 0;
            qual_cost.startup = 0;
            foreach (lc, ssinfo) {
                QualSkewInfo* qsinfo = (QualSkewInfo*)lfirst(lc);
                qual_cost.per_tuple += qsinfo->qual_cost.per_tuple;
                qual_cost.startup += qsinfo->qual_cost.startup;
                if (qsinfo->skew_stream_type == PART_REDISTRIBUTE_PART_BROADCAST ||
                    qsinfo->skew_stream_type == PART_LOCAL_PART_BROADCAST) {
                    broadcast_ratio = qsinfo->broadcast_ratio;
                }
            }

            /* Calculate stream cost for broadcast data. */
            broadcast_send_size = size * broadcast_ratio / sendDop * receiveDop * consumer_num_dn;
            broadcast_recv_size = size * broadcast_ratio * producer_num_dn;
            send_cost += broadcast_send_size * send_data_cost;
            receive_cost += broadcast_recv_size * receive_data_cost;

            /* Calculate stream cost for the rest of the data. */
            send_size = size * (1.0 - broadcast_ratio) / sendDop;
            receive_size = size * (1.0 - broadcast_ratio) / receiveDop;

            /* For PART_LOCAL_PART_BROADCAST, most data is sent to the local consumer. */
            if (distribute_keys == NIL) {
                send_data_cost = LOCAL_SEND_KDATA_COST;
                receive_data_cost = LOCAL_RECEIVE_KDATA_COST;
                cpu_hashcal_cost = 0;
            } else {
#ifdef ENABLE_MULTIPLE_NODES
                nodegroup_stream_weight = ng_get_nodegroup_stream_weight(producer_num_dn, consumer_num_dn);
#else
                nodegroup_stream_weight = 1.0;
#endif
            }

            /* Cost of send and receive */
            send_cost += send_size * send_data_cost * nodegroup_stream_weight;
            receive_cost += receive_size * receive_data_cost * nodegroup_stream_weight;

            /* Hash calculation cost is related to the sending end. */
            hash_cost = list_length(distribute_keys) * subrows * (1 - broadcast_ratio) * cpu_hashcal_cost / sendDop;
            filter_cost = subrows * qual_cost.per_tuple / sendDop + qual_cost.startup;

            hybrid_cost = hash_cost + filter_cost + send_cost + receive_cost;
            hybrid_cost *= u_sess->attr.attr_sql.stream_multiple;
            *total_cost += hybrid_cost;
            *gblrows = subgblrows * broadcast_ratio * receiveDop * consumer_num_dn + subgblrows * (1 - broadcast_ratio);
        } break;
        default:
            break;
    }

    return;
}

/*
 * contain_special_plan_node: check if there are planTag nodes under plan
 *
 * @param: (in) plan
 *     the plan node
 * @param: (in) planTag
 *     the target plan tag
 * @param: (in) mode
 *     the search mode (see more in ContainPlanNodeMode)
 *
 * @return:
 *     whether the target plan node is found, true means found
 */
bool contain_special_plan_node(Plan* plan, NodeTag planTag, ContainPlanNodeMode mode)
{
    if (plan == NULL)
        return false;
    else if (nodeTag(plan) == planTag) {
        if (IsA(plan, ForeignScan)) {
            ForeignScan* fScan = (ForeignScan*)plan;
            if (IS_OBS_CSV_TXT_FOREIGN_TABLE(fScan->scan_relid)) {
                return true;
            }
        } else {
            return true;
        }
    } else if (IsA(plan, ModifyTable)) {
        ModifyTable* mt = (ModifyTable*)plan;
        ListCell* cell = NULL;
        if ((CPLN_ONE_WAY & mode) && list_length(mt->plans) > 1) {
            return false;
        }
        foreach (cell, mt->plans) {
            if (contain_special_plan_node((Plan*)lfirst(cell), planTag, mode))
                return true;
        }
    } else if (IsA(plan, Append)) {
        ListCell* cell = NULL;
        if ((CPLN_ONE_WAY & mode) && list_length(((Append*)plan)->appendplans) > 1) {
            return false;
        }
        foreach (cell, ((Append*)plan)->appendplans) {
            if (contain_special_plan_node((Plan*)lfirst(cell), planTag, mode))
                return true;
        }
    } else if (IsA(plan, MergeAppend)) {
        ListCell* cell = NULL;
        if ((CPLN_ONE_WAY & mode) && list_length(((MergeAppend*)plan)->mergeplans) > 1) {
            return false;
        }
        foreach (cell, ((MergeAppend*)plan)->mergeplans) {
            if (contain_special_plan_node((Plan*)lfirst(cell), planTag, mode))
                return true;
        }
    } else if (IsA(plan, SubqueryScan)) {
        if (contain_special_plan_node(((SubqueryScan*)plan)->subplan, planTag, mode))
            return true;
    } else if (IsA(plan, CteScan)) {
        if (planTag == T_Stream && ((CteScan*)plan)->subplan != NULL) {
            if (((CteScan*)plan)->subplan->has_inner_stream || ((CteScan*)plan)->subplan->has_outer_stream)
                return true;
        }
    } else if (IsA(plan, Material) && ((Material*)plan)->materialize_all) {
        return false;
    } else {
        if ((CPLN_ONE_WAY & mode) && NULL != plan->lefttree && NULL != plan->righttree) {
            return false;
        }
        if (contain_special_plan_node(plan->lefttree, planTag, mode))
            return true;
        if (CPLN_NO_IGNORE_MATERIAL & mode) {
            if (contain_special_plan_node(plan->righttree, planTag, mode)) {
                return true;
            }
        } else {
            /*
             * If a NestLoop is set material all, the right tree is only executed once,
             * so skip the check of the right tree then.
             */
            bool material_all =
                IsA(plan, NestLoop) && ((NestLoop*)plan)->materialAll && IsA(plan->righttree, Material);
            if (!material_all && contain_special_plan_node(plan->righttree, planTag, mode))
                return true;
        }
    }
    return false;
}
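/*
 * Usage sketch (illustrative, not part of the original code): checking
 * whether a subquery's plan crosses a stream node, as the SubqueryScan
 * branch of stream_path_walker above does:
 *
 *     if (contain_special_plan_node(splan, T_Stream)) {
 *         // at least one stream node lies below splan
 *     }
 *
 * Passing CPLN_ONE_WAY in mode stops the search at any fork in the tree;
 * the two-argument form used above searches all branches.
 */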
/*
 * Check whether the type cast changes distribution.
 * If the type cast changes the hash func or the hash value,
 * we have to add an extra redistribution to put the cast
 * value on the right datanode.
 *
 * example:
 *
 * create table t_cast_redis(a int, b int);
 * insert into t_cast_redis values (generate_series(1,1000));
 * analyze t_cast_redis;
 * select * from t_cast_redis t1, t_cast_redis t2 where t1.a=t2.a::boolean;
 *
 *                                   QUERY PLAN
 * -------------------------------------------------------------------------------
 *  Streaming (type: GATHER)  (cost=6.25..16.75 rows=100 width=16)
 *    Output: t1.a, t1.b, t2.a, t2.b
 *    Node/s: All datanodes
 *    ->  Hash Join  (cost=3.12..12.06 rows=100 width=16)
 *          Output: t1.a, t1.b, t2.a, t2.b
 *          Hash Cond: ((((t2.a)::boolean)::bigint) = t1.a)
 *          ->  Streaming(type: REDISTRIBUTE)  (cost=0.00..7.88 rows=100 width=8)
 *                Output: t2.a, t2.b, (((t2.a)::boolean)::bigint) --should redis even if column a is the distribute key
 *                Distribute Key: (((t2.a)::boolean)::bigint)
 *                Spawn on: All datanodes
 *                ->  Seq Scan on public.t t2  (cost=0.00..2.50 rows=100 width=8)
 *                      Output: t2.a, t2.b, (t2.a)::boolean
 *          ->  Hash  (cost=2.50..2.50 rows=100 width=8)
 *                Output: t1.a, t1.b
 *                ->  Seq Scan on public.t t1  (cost=0.00..2.50 rows=100 width=8)
 *                      Output: t1.a, t1.b
 */
bool is_type_cast_hash_compatible(FuncExpr* func)
{
    Oid arg_type = exprType((Node*)linitial(func->args));
    Oid fun_type = func->funcresulttype;

    /* the type has changed and is not compatible for hashing */
    if (!is_compatible_type(arg_type, fun_type))
        return false;

    /*
     * does the type cast possibly change the value?
     *
     * we consider the cast safe if no precision is lost:
     *
     * (1) from a small range type to a big range type, say, int1->int2->int4:
     *     this is safe casting, no precision will be lost.
     * (2) from a big range type to a small range type, say, int4->int2->int1:
     *     the overflow or underflow must be checked.
     *
     * we consider the cast unsafe if the value is changed:
     *
     * (3) from a floating point value to an integer value, usually we round
     *     the value, say, float(1.1)->int(1). this is not safe.
     */
    switch (func->funcid) {
        case 77:    // chartoi4
        case 78:    // i4tochar
        case 235:   // i2tod
        case 236:   // i2tof
        case 311:   // ftod
        case 312:   // dtof
        case 313:   // i2toi4
        case 314:   // i4toi2
        case 316:   // i4tod
        case 318:   // i4tof
        case 406:   // name_text
        case 408:   // name_bpchar
        case 480:   // int84
        case 481:   // int48
        case 482:   // i8tod
        case 652:   // i8tof
        case 714:   // int82
        case 754:   // int28
        case 860:   // char_bpchar
        case 1287:  // i8tooid
        case 1288:  // oidtoi8
        case 1401:  // name_text
        case 1740:  // int4_numeric
        case 1742:  // float4_numeric
        case 1743:  // float8_numeric
        case 1781:  // int8_numeric
        case 1782:  // int2_numeric
        case 4065:  // int1_varchar
        case 4066:  // int1_nvarchar2
        case 4067:  // int1_bpchar
        case 4068:  // int2_bpchar
        case 4069:  // int8_bpchar
        case 4070:  // float4_bpchar
        case 4071:  // float8_bpchar
        case 4072:  // numeric_bpchar
        case 4165:  // int1_text
        case 4166:  // int2_text
        case 4167:  // int4_text
        case 4168:  // int8_text
        case 4169:  // float4_text
        case 4170:  // float8_text
        case 4171:  // numeric_text
        case 4180:  // int2_varchar
        case 4181:  // int4_varchar
        case 4182:  // int8_varchar
        case 4183:  // numeric_varchar
        case 4184:  // float4_varchar
        case 4185:  // float8_varchar
        case 5521:  // int1_numeric
        case 5523:  // i1toi2
        case 5524:  // i2toi1
        case 5525:  // i1toi4
        case 5526:  // i4toi1
        case 5527:  // i1toi8
        case 5528:  // i8toi1
        case 5529:  // i1tof4
        case 5531:  // i1tof8
            /* safe casting */
            return true;
        default:
            break;
    }

    /* oops, not safe */
    return false;
}
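/*
 * Illustrative example (not part of the original code): for a join qual
 * t1.a = t2.a::int8 where a is a distributed int4 column, the cast goes
 * through int48 (funcid 481 above); int4 and int8 share the same hash type
 * (see get_hash_type below), so no extra redistribution is needed on the
 * cast side. A float8-to-int4 cast, by contrast, rounds the value (rule (3)
 * above), is not in the safe list, and forces a redistribution on the cast
 * result.
 */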
Oid get_hash_type(Oid type_in)
{
    switch (type_in) {
        /* Int8 hash arithmetic is compatible with int4, so we return the same type. */
        case INT8OID:
        case CASHOID:
        case INT1OID:
        case INT2OID:
        case OIDOID:
        case INT4OID:
        case BOOLOID:
        case CHAROID:
        case ABSTIMEOID:
        case RELTIMEOID:
        case DATEOID:
            return INT4OID;
        case INT2VECTOROID:
        case OIDVECTOROID:
            return OIDVECTOROID;
        case NVARCHAR2OID:
        case VARCHAROID:
        case TEXTOID:
            return TEXTOID;
        case RAWOID:
        case BYTEAOID:
            return RAWOID;
        case TIMEOID:
        case TIMESTAMPOID:
        case TIMESTAMPTZOID:
        case SMALLDATETIMEOID:
            return TIMEOID;
        case FLOAT4OID:
        case FLOAT8OID:
            return FLOAT4OID;
        case NAMEOID:
        case INTERVALOID:
        case TIMETZOID:
        case NUMERICOID:
            return type_in;
        case BPCHAROID:
#ifdef PGXC
            if (g_instance.attr.attr_sql.string_hash_compatible)
                return TEXTOID;
            else
#endif
                return type_in;
        default:
            return type_in;
    }
}

/*
 * An aggregate with ORDER BY or DISTINCT directives needs to be
 * computed at the Coordinator using all the rows. An aggregate
 * without a collection function needs to be computed at the
 * Coordinator.
 * Polymorphic transition types need to be resolved to
 * correctly interpret the transition results from Datanodes.
 * For now compute such aggregates at the Coordinator.
 */
static bool is_agg_unsupport_dn_compute(Node* node)
{
    Aggref* aggref = (Aggref*)node;

    /* DISTINCT can be computed at a datanode if this is a stream plan */
    if (IS_STREAM_PLAN) {
        if (aggref->aggorder || aggref->agglevelsup ||
            (!aggref->agghas_collectfn && need_adjust_agg_inner_func_type(aggref)) ||
            IsPolymorphicType(aggref->aggtrantype)) {
#ifndef ENABLE_MULTIPLE_NODES
            errno_t sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                NOTPLANSHIPPING_LENGTH,
                "aggregate %d is not supported in stream plan",
                aggref->aggfnoid);
            securec_check_ss_c(sprintf_rc, "\0", "\0");
            mark_stream_unsupport();
#endif
            return true;
        }
    } else {
        if (aggref->aggorder || aggref->aggdistinct || aggref->agglevelsup || !aggref->agghas_collectfn ||
            IsPolymorphicType(aggref->aggtrantype)) {
            return true;
        }
    }
    return false;
}

/*
 * Return true if the node cannot be evaluated on the foreign server.
 */
bool foreign_qual_walker(Node* node, foreign_qual_context* context)
{
    bool ret_val = false;
    bool saved_collect_vars = false;
    if (node == NULL)
        return false;

    switch (nodeTag(node)) {
        case T_ExprState:
            return foreign_qual_walker((Node*)((ExprState*)node)->expr, NULL);
        case T_GroupingFunc: {
            if (context != NULL) {
                GroupingFunc* groupingFun = (GroupingFunc*)node;
                saved_collect_vars = context->collect_vars;
                context->collect_vars = false;
                context->aggs = lappend(context->aggs, groupingFun);
            }
        } break;
        case T_Aggref: {
            Aggref* aggref = (Aggref*)node;
            if (is_agg_unsupport_dn_compute(node)) {
                return true;
            }

            /*
             * A Datanode can compute transition results, so add the
             * aggregate to the context if a context is present
             */
            if (context != NULL) {
                /*
                 * Don't collect VARs under the Aggref node. See
                 * pgxc_process_grouping_targetlist() for details.
                 */
                saved_collect_vars = context->collect_vars;
                context->collect_vars = false;
                context->aggs = lappend(context->aggs, aggref);
            }
        } break;
        case T_Var:
            if (context != NULL && context->collect_vars)
                context->vars = lappend(context->vars, node);
            break;
        default:
            break;
    }

    ret_val = expression_tree_walker(node, (bool (*)())foreign_qual_walker, context);

    /*
     * restore the value of collect_vars in the context, since we have finished
     * traversing the tree rooted under an Aggref node
     */
    if (context && IsA(node, Aggref))
        context->collect_vars = saved_collect_vars;

    return ret_val;
}
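/*
 * Usage sketch (illustrative, not part of the original code; only the
 * fields the walker actually touches are initialized here -- a dedicated
 * context initializer, if one exists elsewhere, should be preferred):
 *
 *     foreign_qual_context context;
 *     context.collect_vars = true;
 *     context.vars = NIL;
 *     context.aggs = NIL;
 *     if (!foreign_qual_walker(qual, &context)) {
 *         // qual can be evaluated on the foreign side;
 *         // context.vars / context.aggs hold the collected nodes
 *     }
 */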
/*
 * @Description:
 *     Check if we need to add a local redistribute above the subplan.
 *
 * @param[IN] subplan: the plan to check.
 * @return bool: true -- local redistribute is needed.
 */
bool is_local_redistribute_needed(Plan* subplan)
{
    if (subplan == NULL)
        return true;

    bool ret = true;
    switch (nodeTag(subplan)) {
        case T_Stream: {
            Stream* st = (Stream*)subplan;
            if (STREAM_REDISTRIBUTE == st->type) {
                /*
                 * If a local distribute node is already added,
                 * then there is no need to add another.
                 */
                if (LOCAL_DISTRIBUTE == st->smpDesc.distriType || REMOTE_SPLIT_DISTRIBUTE == st->smpDesc.distriType)
                    ret = false;
            }
            break;
        }
        case T_HashJoin:
        case T_NestLoop: {
            /*
             * If either side needs to add a local distribute,
             * then we have to add a new one.
             */
            if (is_local_redistribute_needed(subplan->lefttree) || is_local_redistribute_needed(subplan->righttree))
                ret = true;
            else
                ret = false;
            break;
        }
        case T_SubqueryScan: {
            SubqueryScan* ss = (SubqueryScan*)subplan;
            ret = is_local_redistribute_needed(ss->subplan);
            break;
        }
        case T_Append: {
            Append* app = (Append*)subplan;
            ListCell* lc = NULL;
            ret = false;
            /*
             * If any branch needs to add a local distribute,
             * then we have to add a new one.
             */
            foreach (lc, app->appendplans) {
                Plan* plan = (Plan*)lfirst(lc);
                if (is_local_redistribute_needed(plan))
                    ret = true;
            }
            break;
        }
        default: {
            if (subplan->lefttree)
                ret = is_local_redistribute_needed(subplan->lefttree);
            else
                ret = true;
            break;
        }
    }
    return ret;
}

/*
 * The function returns all the exprs with func_oid > FirstNormalObjectId
 * for the input plan.
 */
List* check_func_list(Plan* result_plan)
{
    return check_op_list_template(result_plan, (List* (*)(Node*)) check_func);
}

/*
 * @Description: Wrapper to pass in the default parameter for the function pointer.
 *     This function will call check_subplan_expr with the default input.
 * @param[IN] node: input node.
 * @return: subplan list.
 */
List* check_subplan_expr_default(Node* node)
{
    return check_subplan_expr(node);
}

/* The function returns all the subplan exprs for the input plan */
List* check_subplan_list(Plan* result_plan)
{
    return check_op_list_template(result_plan, (List* (*)(Node*)) check_subplan_expr_default);
}

/*
 * The function returns all the exprs with vartype > FirstNormalObjectId
 * for the input plan.
 */
List* check_vartype_list(Plan* result_plan)
{
    return check_op_list_template(result_plan, (List* (*)(Node*)) check_vartype);
}

static void check_plannerInfo(PlannerInfo* root, Index result_rte_idx)
{
    if (root->simple_rte_array == NULL) {
        ereport(ERROR,
            (errmodule(MOD_OPT),
                errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                errmsg("simple_rte_array is NULL unexpectedly")));
    }
    if (root->simple_rte_array[result_rte_idx] == NULL) {
        ereport(ERROR,
            (errmodule(MOD_OPT),
                errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                errmsg("simple_rte_array[result_rte_idx] is NULL unexpectedly")));
    }
}

static bool judge_lockrows_need_redistribute_keyLen_equal(
    PlannerInfo* root, Plan* subplan, Form_pgxc_class target_classForm, Index result_rte_idx)
{
    check_plannerInfo(root, result_rte_idx);

    bool need_redistribute = false;
    Var* lockrelvar = NULL;
    Form_pg_attribute attTup = NULL;
    Relation rel = heap_open(root->simple_rte_array[result_rte_idx]->relid, AccessShareLock);
    int disKeyLen = target_classForm->pcattnum.dim1;

    /*
     * In the natural shuffle scenario, we need to use redistribution to send
     * the data back to the original node.
     */
    if (!ng_is_same_group(rel->rd_locator_info->nodeList, (ng_get_dest_execnodes(subplan))->nodeList)) {
        heap_close(rel, AccessShareLock);
        return true;
    }

    for (int i = 0; i < disKeyLen; i++) {
        AttrNumber distributeKeyIdx = target_classForm->pcattnum.values[i];
        Node* subKey = (Node*)list_nth(subplan->distributed_keys, i);
        Var* subVar = locate_distribute_var((Expr*)subKey);
        attTup = &rel->rd_att->attrs[distributeKeyIdx - 1];

        /*
         * Call pfree_ext to free the lockrelvar created in this loop;
         * we don't need a NULL check outside since it is checked inside.
         */
        pfree_ext(lockrelvar);

        /*
         * Unlike judge_dml_need_redistribute, we just compare the target table's
         * distribute key with the subplan's key but do not check the targetlist.
         * Make a var of the target table for judge_node_compatible to check
         * whether redistribution is needed in FOR UPDATE/SHARE.
         */
        lockrelvar = (Var*)makeVar(
            result_rte_idx, distributeKeyIdx, attTup->atttypid, attTup->atttypmod, attTup->attcollation, 0);

        /* Check if the current key is exactly equal */
        if (subVar != NULL && subVar->varno == result_rte_idx && subVar->varattno == distributeKeyIdx)
            continue;

        /* Check if the current key comes from the same equivalence class */
        if (subVar != NULL && lockrelvar != NULL && judge_node_compatible(root, (Node*)subVar, (Node*)lockrelvar))
            continue;

        need_redistribute = true;
        break;
    }

    if (!need_redistribute && IsLocatorDistributedBySlice(rel->rd_locator_info->locatorType)) {
        if (subplan->exec_nodes == NULL ||
            !IsSliceInfoEqualByOid(subplan->exec_nodes->rangelistOid, rel->rd_locator_info->relid)) {
            need_redistribute = true;
        }
    }

    heap_close(rel, AccessShareLock);
    return need_redistribute;
}

/*
 * @Description: check whether redistribution is needed for FOR UPDATE/SHARE.
 * @in root: planner info of current query level
 * @in subplan: subplan that needs to be checked against the target table
 * @in target_classForm: distribute info of the target table
 * @result_rte_idx: range table entry index of the target table
 * @return: true if redistribution is needed, else false
 */
bool judge_lockrows_need_redistribute(
    PlannerInfo* root, Plan* subplan, Form_pgxc_class target_classForm, Index result_rte_idx)
{
    bool need_redistribute = false;
    int disKeyLen = target_classForm->pcattnum.dim1;
    int subKeyLen = list_length(subplan->distributed_keys);

    /*
     * We use the locked table's distribute key as the redistribute key when adding
     * a redistribute node for lockrows, but tables without a real distribute key
     * cannot do accurate redistribution.
     * e.g. data of a replicated table may be redistributed to other nodes with one
     * of its columns as the redistribute key, but it is very difficult to
     * redistribute the data back without the table's distribute key.
     */
    if (target_classForm->pclocatortype != LOCATOR_TYPE_HASH &&
        !IsLocatorDistributedBySlice(target_classForm->pclocatortype))
        ereport(ERROR,
            (errmodule(MOD_OPT),
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                    errmsg("Unsupported FOR UPDATE/SHARE non-hash/range/list table in stream plan."))));

    if (disKeyLen != subKeyLen) {
        need_redistribute = true;
    } else {
        need_redistribute =
            judge_lockrows_need_redistribute_keyLen_equal(root, subplan, target_classForm, result_rte_idx);
    }

    return need_redistribute;
}

/*
 * @Description: Set the can_push flag of the query
 *
 * @in query - query tree.
 * @return: void.
 */
void mark_query_canpush_flag(Node *query)
{
    if (query == NULL) {
        return;
    }

    errno_t rc = EOK;
    shipping_context spCtxt;
    rc = memset_s(&spCtxt, sizeof(spCtxt), 0, sizeof(spCtxt));
    securec_check(rc, "\0", "\0");
    spCtxt.is_randomfunc_shippable = u_sess->opt_cxt.is_randomfunc_shippable && IS_STREAM_PLAN;
    spCtxt.is_ecfunc_shippable = true;
    spCtxt.query_list = NIL;
    spCtxt.query_count = 0;
    spCtxt.query_shippable = true;
    spCtxt.current_shippable = true;
    spCtxt.global_shippable = true;
    (void)stream_walker(query, &spCtxt);
}

/*
 * Delete a useless local stream node.
 *
 * @param[IN] stream_plan: the plan to check.
 * @param[IN] parent: parent plan of the stream.
 * @param[IN] lc: subplan list cell.
 * @return bool: removed -- true, not removed -- false.
 */
bool remove_local_plan(Plan* stream_plan, Plan* parent, ListCell* lc, bool is_left)
{
    Stream* stream = NULL;

    if (stream_plan == NULL || parent == NULL)
        return false;

#ifndef ENABLE_MULTIPLE_NODES
    /* A plan that has an initPlan cannot be removed. */
    if (stream_plan->initPlan) {
        return false;
    }
#endif

    /* Check the plan type. */
    if (IsA(stream_plan, Stream) || IsA(stream_plan, VecStream))
        stream = (Stream*)stream_plan;
    else
        return false;

    /* Delete the useless local stream node. */
    if (STREAM_IS_LOCAL_NODE(stream->smpDesc.distriType) && stream->smpDesc.consumerDop == 1 &&
        stream->smpDesc.producerDop == 1) {
        if (lc != NULL) {
            lfirst(lc) = (void*)stream_plan->lefttree;
        } else if (is_left) {
            parent->lefttree = stream_plan->lefttree;
        } else if (parent->righttree == stream_plan) {
            parent->righttree = stream_plan->lefttree;
        }
        return true;
    } else if (stream->smpDesc.distriType == REMOTE_SPLIT_DISTRIBUTE && stream->smpDesc.consumerDop == 1) {
        /* Set the remote stream type. */
        stream->smpDesc.distriType = REMOTE_DISTRIBUTE;
    } else if (stream->smpDesc.distriType == REMOTE_SPLIT_BROADCAST && stream->smpDesc.consumerDop == 1) {
        stream->smpDesc.distriType = REMOTE_BROADCAST;
    }

    return false;
}
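/*
 * Illustrative effect (not part of the original code): a LOCAL_ROUNDROBIN
 * stream whose producerDop and consumerDop have both degraded to 1 is a
 * useless 1:1 local exchange, so
 *
 *     parent -> Stream(LOCAL_ROUNDROBIN, 1:1) -> child
 *
 * is rewired to
 *
 *     parent -> child
 *
 * while a REMOTE_SPLIT_DISTRIBUTE stream with consumerDop 1 is merely
 * downgraded to a plain REMOTE_DISTRIBUTE and kept in place.
 */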
/*
 * Description: if RANDOM/GS_ENCRYPT_AES128 functions exist in the plan,
 *     return the node list.
 */
List* check_random_expr(Plan* result_plan)
{
    List* random_list = check_op_list_template(result_plan, (List *(*)(Node*)) check_random_expr);
    return random_list;
}

static void set_bucketmap_index_by_group(ExecNodes *exec_node, NodeGroupInfoContext *node_group_info_context,
    Oid groupoid)
{
    char *group_name = get_pgxc_groupname(groupoid);
    if (group_name == NULL)
        return;

    char *installation_group_name = pstrdup(PgxcGroupGetInstallationGroup());
    char *group_parent_name = NULL;
    int current_num_bucketmaps = node_group_info_context->num_bucketmaps;
    int bucketlen;

    if (strncmp(group_name, installation_group_name, NAMEDATALEN) == 0 ||
        (((group_parent_name = get_pgxc_groupparent(groupoid)) != NULL) &&
            strncmp(group_parent_name, installation_group_name, NAMEDATALEN) == 0)) {
        node_group_info_context->groupOids[current_num_bucketmaps] = groupoid;
        node_group_info_context->bucketMap[current_num_bucketmaps] =
            BucketMapCacheGetBucketmap(group_name, &bucketlen);
        node_group_info_context->bucketCnt[current_num_bucketmaps] = bucketlen;
        exec_node->bucketmapIdx = node_group_info_context->num_bucketmaps;
        node_group_info_context->num_bucketmaps++;
    } else {
        char in_redistribution = get_pgxc_group_redistributionstatus(groupoid);
        if ('y' != in_redistribution) {
            char *group_parent = get_pgxc_groupparent(groupoid);
            if (group_parent != NULL) {
                Oid group_parent_oid = get_pgxc_groupoid(group_parent, false);
                in_redistribution = get_pgxc_group_redistributionstatus(group_parent_oid);
            }
        }

        uint2 *bucketmap = BucketMapCacheGetBucketmap(group_name, &bucketlen);
        if ('y' == in_redistribution) {
            node_group_info_context->groupOids[current_num_bucketmaps] = groupoid;
            node_group_info_context->bucketMap[current_num_bucketmaps] = bucketmap;
            node_group_info_context->bucketCnt[current_num_bucketmaps] = bucketlen;
            exec_node->bucketmapIdx = node_group_info_context->num_bucketmaps;
            node_group_info_context->num_bucketmaps++;
        } else {
            exec_node->bucketmapIdx = (int)(BUCKETMAP_DEFAULT_INDEX_BIT | bucketlen);
        }
    }

    AssertEreport(node_group_info_context->num_bucketmaps <= MAX_SPECIAL_BUCKETMAP_NUM,
        MOD_OPT,
        "The number of bucketmaps exceeded the max special bucketmap num");
    pfree_ext(group_name);
    pfree_ext(group_parent_name);
    pfree_ext(installation_group_name);
}

/*
 * Record the bucketmap and the index of the bucketmap.
 */
static void set_bucketmap_index(ExecNodes* exec_node, NodeGroupInfoContext* node_group_info_context)
{
    if (exec_node == NULL)
        return;

    exec_node->bucketmapIdx = (int)(BUCKETMAP_DEFAULT_INDEX_BIT | BUCKETDATALEN);
    Oid groupoid = exec_node->distribution.group_oid;
    if (groupoid == InvalidOid)
        return;

    for (int i = 0; i < node_group_info_context->num_bucketmaps; i++) {
        if (node_group_info_context->groupOids[i] == groupoid) {
            exec_node->bucketmapIdx = i;
            return;
        }
    }

    set_bucketmap_index_by_group(exec_node, node_group_info_context, groupoid);
}

static bool is_execute_on_multinodes(Plan* plan)
{
    return (plan->exec_nodes && list_length(plan->exec_nodes->nodeList) != 1);
}

/*
 * We can get the groupoid from the exec nodes of the plan;
 * if the group is the installation group or the group is in redistribution,
 * we need to record the bucketmap and the index of the bucketmap.
 *
 * @param[IN] plan: the plan to check.
 * @param[IN] node_group_info_context: record the bucketmap
 */
static void set_bucketmap_index(Plan* plan, NodeGroupInfoContext* node_group_info_context)
{
    if (IsA(plan, Stream) || IsA(plan, VecStream)) {
        Stream* stream_plan = (Stream*)plan;
        set_bucketmap_index(stream_plan->scan.plan.exec_nodes, node_group_info_context);
        set_bucketmap_index(stream_plan->consumer_nodes, node_group_info_context);
    } else if (IsA(plan, RemoteQuery)) {
        RemoteQuery* remote_query_plan = (RemoteQuery*)plan;
        set_bucketmap_index(remote_query_plan->scan.plan.exec_nodes, node_group_info_context);
        set_bucketmap_index(remote_query_plan->exec_nodes, node_group_info_context);
    } else {
        set_bucketmap_index(plan->exec_nodes, node_group_info_context);
    }
}
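/*
 * Illustrative numbering (not part of the original code): for a simple
 * gathered plan, the pre-order walk in finalize_node_id below yields
 *
 *     RemoteQuery (GATHER)    plan_node_id = 1
 *       -> Stream             plan_node_id = 2, parent_node_id = 1
 *            -> SeqScan       plan_node_id = 3, parent_node_id = 2
 *
 * The ids only need to be unique across the whole plan; the sequence itself
 * carries no meaning.
 */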
/*
 * finalize_node_id
 *     Finalize the node id and parent node id for the result plan. The sequence of plan node ids
 *     doesn't mean anything. We just keep them unique across the entire plan.
 *     Also, we determine the plan node id of each subplan, which is the top plan node id of the
 *     thread that executes the subplan, and make proper changes to the subplans. Currently, we
 *     only support one subplan executing in one thread, and we don't support main query and
 *     subplan blocks executing on different cn and dn sides.
 *     Not sure how to utilize the plan node id and parent node id to do communication in the
 *     future, so use the simplest way to finalize it.
 *
 * Parameters:
 *     result_plan: the main query plan to handle.
 *     plan_node_id: the global counter used to assign a plan node id to each plan node
 *     parent_node_id: the recorded plan node id of the plan node above the current plan node
 *     num_streams: global counter to record how many streams are under the gather node
 *     num_plannodes: global counter to record how many plan nodes in total
 *     total_num_streams: global counter to record how many streams in total
 *     subplans: subplan list of the top plan (including cte, correlated and uncorrelated subplans)
 *     subroots: planner info list for each subplan, with the same number of members as subplans
 *     initplans: global init plans, used to find whether a subplan is uncorrelated
 *     subplan_ids: list of top plan node ids of the threads that execute the subplans
 *     is_under_stream: flag whether the current plan is under a stream/gather node
 *     is_data_node_exec: flag to judge whether plans with all-node exec run on data nodes or the cn
 */
void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, int* num_streams,
    int* num_plannodes, int* total_num_streams, int* max_push_sql_num, int* gather_count, List* subplans,
    List* subroots, List** initplans, int* subplan_ids, bool is_under_stream, bool is_under_ctescan,
    bool is_data_node_exec, bool is_read_only, NodeGroupInfoContext* node_group_info_context)
{
    errno_t sprintf_rc = 0;
    if (result_plan != NULL) {
        result_plan->plan_node_id = *plan_node_id;
        result_plan->parent_node_id = *parent_node_id;
        if (is_execute_on_datanodes(result_plan)) {
            is_data_node_exec = true;
            /* set the index of the bucketmap */
            set_bucketmap_index(result_plan, node_group_info_context);
        }
        if (is_under_stream)
            subplan_ids[0] = *plan_node_id;
        *parent_node_id = *plan_node_id;
        int save_parent_id = *parent_node_id;
        (*plan_node_id)++;
        (*num_plannodes)++;

        /*
         * Since we replace the cte with its parse tree, we don't need the cte sublink any more.
         * Specifically, we don't support expressions like values(cte), so mark them stream-unsupported
         */
        if (result_plan->initPlan && IS_STREAM_PLAN) {
            List* cteLinkList = NIL;
            ListCell* lc = NULL;
            foreach (lc, result_plan->initPlan) {
                SubPlan* plan = (SubPlan*)lfirst(lc);
                if (plan->subLinkType == CTE_SUBLINK)
                    cteLinkList = lappend(cteLinkList, plan);
            }
            if (cteLinkList != NIL) {
                if (IsA(result_plan, ValuesScan)) {
                    sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                        NOTPLANSHIPPING_LENGTH,
                        "ValuesScan in CTE SubLink can't be shipped");
                    securec_check_ss_c(sprintf_rc, "\0", "\0");
                    mark_stream_unsupport();
                }
            }
            if (IS_STREAM_PLAN)
                *initplans = list_concat(*initplans, list_copy(result_plan->initPlan));
        }

        switch (nodeTag(result_plan)) {
            case T_Append:
            case T_VecAppend: {
                Append* append = (Append*)result_plan;
                ListCell* lc = NULL;
                foreach (lc, append->appendplans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_CteScan: {
                if (STREAM_RECURSIVECTE_SUPPORTED) {
                    CteScan* cte_plan = (CteScan*)result_plan;
                    int subplanid = cte_plan->ctePlanId;

                    /* mark unsupported if this CteScan is nested in another CteScan */
                    if (is_under_ctescan) {
                        sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                            NOTPLANSHIPPING_LENGTH,
                            "With-Recursive CteScan runs as subplan nested in another CteScan");
                        securec_check_ss_c(sprintf_rc, "\0", "\0");
                        mark_stream_unsupport();
                    }

                    /* Fetch the cte's underlying subplan (RecursiveUnion) */
                    RecursiveUnion* ru_plan = (RecursiveUnion*)list_nth(subplans, cte_plan->ctePlanId - 1);
                    if (IsA(ru_plan, RecursiveUnion)) {
                        /* Do plan node finalization on the subplan plantree (RecursiveUnion) */
                        finalize_node_id((Plan*)ru_plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                            total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                            subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                            node_group_info_context);
                    } else
                        break;

                    /*
                     * Note, in recursive-cte processing (stream mode), the RecursiveUnion
                     * operator is processed in a way like SubPlan initialization; we just
                     * record the recursive-union's top node planid in the subplan_ids array
                     */
                    subplan_ids[subplanid] = subplan_ids[0];
                    *parent_node_id = save_parent_id;
                }

                /* Mainly for ctes that execute on the CN. */
                if (IsExplainPlanStmt) {
                    /* Fetch the cte's underlying subplan. */
                    CteScan* cte_plan = (CteScan*)result_plan;
                    Plan* ru_plan = (Plan*)list_nth(subplans, cte_plan->ctePlanId - 1);

                    /*
                     * Set plan node ids for a cte that cannot be pushed down.
                     * For a with-recursive cte that cannot be pushed down, we only collect plan
                     * info down to the ctescan.
                     * For a cte that executes on the cn, we should collect the info of all nodes
                     * in the cte.
                     */
                    if (!IsA(ru_plan, RecursiveUnion)) {
                        finalize_node_id((Plan*)ru_plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                            total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                            subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                            node_group_info_context);
                    }
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_ModifyTable:
            case T_VecModifyTable: {
                ModifyTable* mt = (ModifyTable*)result_plan;
                ListCell* lc = NULL;
                foreach (lc, mt->plans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_SubqueryScan:
            case T_VecSubqueryScan: {
                SubqueryScan* ss = (SubqueryScan*)result_plan;
                if (ss->subplan) {
                    /* Check whether the SubqueryScan will be reduced in set_plan_references. */
                    if (IsExplainPlanStmt && trivial_subqueryscan(ss)) {
                        /*
                         * For explain plan, we must make sure num_plannodes and plan_node_id are
                         * absolutely correct. So, if the SubqueryScan will be reduced, we reduce
                         * num_plannodes too.
                         */
                        (*plan_node_id)--;
                        (*num_plannodes)--;
                        finalize_node_id(ss->subplan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                            total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                            subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                            node_group_info_context);
                    } else {
                        finalize_node_id(ss->subplan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                            total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                            subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                            node_group_info_context);
                    }
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_MergeAppend:
            case T_VecMergeAppend: {
                MergeAppend* ma = (MergeAppend*)result_plan;
                ListCell* lc = NULL;
                foreach (lc, ma->mergeplans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_RemoteQuery:
            case T_VecRemoteQuery: {
                RemoteQuery* rq = (RemoteQuery*)result_plan;
                int saved_plan_id = subplan_ids[0];
                (*gather_count)++;
                rq->read_only = is_read_only;
                if (GATHER == rq->position) {
                    (*num_streams) = 0;
                }
                if (result_plan->lefttree) {
                    finalize_node_id(result_plan->lefttree, plan_node_id, parent_node_id, num_streams,
                        num_plannodes, total_num_streams, max_push_sql_num, gather_count, subplans, subroots,
                        initplans, subplan_ids, true, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    subplan_ids[0] = saved_plan_id;
                    *parent_node_id = save_parent_id;
                }
                if (result_plan->righttree) {
                    finalize_node_id(result_plan->righttree, plan_node_id, parent_node_id, num_streams,
                        num_plannodes, total_num_streams, max_push_sql_num, gather_count, subplans, subroots,
                        initplans, subplan_ids, true, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    subplan_ids[0] = saved_plan_id;
                    *parent_node_id = save_parent_id;
                }
                if (GATHER == rq->position) {
                    rq->num_stream = *num_streams;
                    /* We adjusted exec_nodes for subplans in the multi-nodegroup case, so also adjust the max execnodes */
                    if (IS_STREAM_PLAN && subplans != NIL)
                        rq->exec_nodes = get_plan_max_ExecNodes(result_plan->lefttree, subplans);
                    (*num_streams) = 0;
                }
                if (!rq->is_simple)
                    (*max_push_sql_num)++;
                /* num_gather counts scan_gather, plan_router and gather nodes in the whole plan */
                rq->num_gather = *gather_count;
            } break;
            case T_Stream:
            case T_VecStream: {
                Stream* stream = (Stream*)result_plan;
                int saved_plan_id = subplan_ids[0];
                (*num_streams)++;
                (*total_num_streams)++;
                finalize_node_id(result_plan->lefttree, plan_node_id, parent_node_id, num_streams, num_plannodes,
                    total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans, subplan_ids,
                    true, is_under_ctescan, is_data_node_exec, is_read_only, node_group_info_context);
                *parent_node_id = save_parent_id;
                subplan_ids[0] = saved_plan_id;
                if (is_gather_stream(stream))
                    stream->is_dummy = false;
            } break;
            case T_BitmapAnd:
            case T_CStoreIndexAnd: {
                BitmapAnd* ba = (BitmapAnd*)result_plan;
                ListCell* lc = NULL;
                foreach (lc, ba->bitmapplans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_BitmapOr:
            case T_CStoreIndexOr: {
                BitmapOr* bo = (BitmapOr*)result_plan;
                ListCell* lc = NULL;
                foreach (lc, bo->bitmapplans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            case T_ExtensiblePlan: {
                ListCell* lc = NULL;
                foreach (lc, ((ExtensiblePlan*)result_plan)->extensible_plans) {
                    Plan* plan = (Plan*)lfirst(lc);
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            } break;
            default:
                if (result_plan->lefttree) {
                    finalize_node_id(result_plan->lefttree, plan_node_id, parent_node_id, num_streams,
                        num_plannodes, total_num_streams, max_push_sql_num, gather_count, subplans, subroots,
                        initplans, subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
                if (result_plan->righttree) {
                    finalize_node_id(result_plan->righttree, plan_node_id, parent_node_id, num_streams,
                        num_plannodes, total_num_streams, max_push_sql_num, gather_count, subplans, subroots,
                        initplans, subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
                break;
        }

        /*
         * We handle subplans in the following code, from several aspects:
         * (1) Set the owner of each subplan in the subplan_ids array
         * (2) Mark stream unsupport if the main query and the subplan come from the cn and a dn
         *     respectively (exception: (3))
         * (3) For an initplan, if the main query is on the cn, just gather the subplan
         * (4) Vectorize the subplan and set plan references
         */
        if (IS_STREAM_PLAN) {
            /* Block pushing down Random()/GS_ENCRYPT_AES128() in a Replicated plan temporarily */
            if (is_replicated_plan(result_plan) && is_execute_on_multinodes(result_plan)) {
                List* nodelist = check_random_expr(result_plan);
                if (list_length(nodelist) > 0) {
                    sprintf_rc =
        /*
         * We handle subplans in the following code, from several aspects:
         * (1) Set the owner of each subplan in the subplan_ids array
         * (2) Mark the plan stream-unsupported if the main query and a subplan run on
         *     the CN and a DN respectively (exception: (3))
         * (3) For an initplan whose main query runs on the CN, just gather the subplan
         * (4) Vectorize the subplan and set plan references
         */
        if (IS_STREAM_PLAN) {
            /* Block pushing down Random()/GS_ENCRYPT_AES128() in a replicated plan temporarily. */
            if (is_replicated_plan(result_plan) && is_execute_on_multinodes(result_plan)) {
                List* nodelist = check_random_expr(result_plan);
                if (list_length(nodelist) > 0) {
                    sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                        NOTPLANSHIPPING_LENGTH, "Function %s() can not be shipped",
                        get_func_name(((FuncExpr*)linitial(nodelist))->funcid));
                    securec_check_ss_c(sprintf_rc, "\0", "\0");
                    list_free_ext(nodelist);
                    mark_stream_unsupport();
                }
            }

            /* Find all subplan exprs from the main plan. */
            List* subplan_list = check_subplan_list(result_plan);
            ListCell* lc = NULL;
            foreach (lc, subplan_list) {
                Node* node = (Node*)lfirst(lc);
                Plan* plan = NULL;
                SubPlan* subplan = NULL;
                bool has_finalized = false;
                if (IsA(node, SubPlan)) {
                    subplan = (SubPlan*)lfirst(lc);
                    /* This covers the case of an initplan hidden in the testexpr of a subplan. */
                    subplan_list = list_concat(subplan_list, check_subplan_expr(subplan->testexpr));
                } else {
                    AssertEreport(IsA(node, Param), MOD_OPT, "The node is NOT a param");
                    Param* param = (Param*)node;
                    ListCell* lc2 = NULL;
                    foreach (lc2, *initplans) {
                        subplan = (SubPlan*)lfirst(lc2);
                        if (list_member_int(subplan->setParam, param->paramid))
                            break;
                    }
                    if (subplan == NULL || lc2 == NULL)
                        continue;
                }
                plan = (Plan*)list_nth(subplans, subplan->plan_id - 1);
                /*
                 * Check whether the subplan has already been finalized: if it belongs
                 * to another thread, mark the plan stream-unsupported; otherwise skip
                 * the repeated finalization.
                 */
                if (subplan_ids[subplan->plan_id] != 0) {
                    if (subplan_ids[subplan->plan_id] != subplan_ids[0]) {
                        sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                            NOTPLANSHIPPING_LENGTH, "SubPlan shared on multi-thread can't be shipped");
                        securec_check_ss_c(sprintf_rc, "\0", "\0");
                        mark_stream_unsupport();
                    } else
                        has_finalized = true;
                } else
                    subplan_ids[subplan->plan_id] = subplan_ids[0];
                if (!has_finalized) {
#ifdef ENABLE_MULTIPLE_NODES
                    /*
                     * The subplan runs on DNs while the main plan runs on the CN. In such
                     * a case we only support an initplan, and gather its result to the CN.
                     *
                     * Single-node mode need not consider this situation, because the
                     * subplan and the node containing it will not run in parallel.
                     */
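                    /*
                     * Illustrative example (hypothetical query, for orientation only): in
                     *     SELECT * FROM t1 WHERE a > (SELECT max(b) FROM t2);
                     * the uncorrelated initplan computing max(b) may execute on the DNs
                     * while the main plan runs on the CN; the branch below then gathers
                     * the initplan result to the CN through a simple RemoteQuery instead
                     * of abandoning the stream plan.
                     */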
                    if (is_execute_on_coordinator(result_plan) ||
                        (is_execute_on_allnodes(result_plan) && !is_data_node_exec)) {
                        Plan* child_plan = NULL;
                        ListCell* lc2 = NULL;
                        int i = subplan->plan_id - 1;
                        PlannerInfo* subroot = (PlannerInfo*)list_nth(subroots, i);
                        if (is_execute_on_datanodes(plan)) {
                            if (IsA(node, Param) && !subroot->is_correlated) {
                                List* child_init_plan = NULL;
                                if (IsA(plan, Stream)) {
                                    child_plan = outerPlan(plan);
                                    child_init_plan = plan->initPlan;
                                } else {
                                    child_plan = plan;
                                }
                                plan = make_simple_RemoteQuery(child_plan, subroot, false);
                                plan->initPlan = child_init_plan;
                                foreach (lc2, subplans) {
                                    if (i == 0) {
                                        lfirst(lc2) = plan;
                                        break;
                                    }
                                    i--;
                                }
                            } else {
                                subplan_ids[subplan->plan_id] = 0;
                                sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                                    NOTPLANSHIPPING_LENGTH,
                                    "main plan exec on CN and SubPlan exec on DN can't be shipped");
                                securec_check_ss_c(sprintf_rc, "\0", "\0");
                                mark_stream_unsupport();
                            }
                        } else if (is_execute_on_coordinator(plan)) {
                            if (subroot->is_correlated && contain_special_plan_node(plan, T_RemoteQuery)) {
                                subplan_ids[subplan->plan_id] = 0;
                                sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                                    NOTPLANSHIPPING_LENGTH,
                                    "main plan exec on CN and SubPlan containing RemoteQuery exec on "
                                    "CN can't be shipped");
                                securec_check_ss_c(sprintf_rc, "\0", "\0");
                                mark_stream_unsupport();
                            }
                        }
                    } else if (is_execute_on_coordinator(plan)) {
                        subplan_ids[subplan->plan_id] = 0;
                        sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason,
                            NOTPLANSHIPPING_LENGTH, "SubPlan exec on CN can't be shipped");
                        securec_check_ss_c(sprintf_rc, "\0", "\0");
                        mark_stream_unsupport();
                    }
                    /* We only need to push down partial-datanodes execution. */
                    else if (!is_replicated_plan(result_plan) ||
                             (result_plan->exec_nodes && result_plan->exec_nodes->nodeList != NIL)) {
                        /* For an initplan, we should add a broadcast if the node lists do not match. */
                        if (subplan->args == NIL && (!IsA(node, SubPlan)) && is_execute_on_datanodes(plan) &&
                            !contain_special_plan_node(plan, T_Stream, CPLN_ONE_WAY)) {
                            if (result_plan->exec_nodes && plan->exec_nodes && plan->exec_nodes->nodeList != NIL &&
                                list_difference_int(result_plan->exec_nodes->nodeList, plan->exec_nodes->nodeList)) {
                                Distribution* result_distribution = ng_get_dest_distribution(result_plan);
                                plan = make_stream_plan(NULL, plan, NIL, 0.0, result_distribution);
                            }
                        }
                        /* Push only the nodelist, not the entire exec_nodes, here. */
                        pushdown_execnodes(plan, result_plan->exec_nodes, false, true);
                    }
#endif
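                    /*
                     * set_plan_references() may rewrite the subplan tree, so the
                     * corresponding slot in subplans (position plan_id - 1) is replaced
                     * in place and the local pointer follows it afterwards.
                     */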
                    if (check_stream_support()) {
                        PlannerInfo* subroot = NULL;
                        Plan* child_root = NULL;
                        ListCell* lr = NULL;
                        int j = subplan->plan_id - 1;
                        subroot = (PlannerInfo*)list_nth(subroots, j);
                        /* For a subplan or a correlated initplan, check whether rescan is supported. */
                        plan = try_vectorize_plan(
                            plan, subroot->parse, IsA(node, SubPlan) || subroot->is_correlated, subroot);
                        child_root = set_plan_references(subroot, plan);
                        foreach (lr, subplans) {
                            if (j == 0) {
                                lfirst(lr) = child_root;
                                break;
                            }
                            j--;
                        }
                        /* Also reset the local pointer to child_root. */
                        plan = child_root;
                        if (isIntergratedMachine && !IsA(node, SubPlan)) {
                            if (u_sess->opt_cxt.query_dop > 1) {
                                List* subPlanList = NIL;
                                (void)has_subplan(plan, result_plan, NULL, true, &subPlanList, true);
                            }
                            confirm_parallel_info(plan, result_plan->dop);
                        }
                    }
                    /* Check whether this subplan is under a CteScan. */
                    if (IsA(result_plan, CteScan)) {
                        is_under_ctescan = true;
                    }
                    finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes,
                        total_num_streams, max_push_sql_num, gather_count, subplans, subroots, initplans,
                        subplan_ids, false, is_under_ctescan, is_data_node_exec, is_read_only,
                        node_group_info_context);
                    *parent_node_id = save_parent_id;
                }
            }
            list_free_ext(subplan_list);
        }
    }
}

/*
 * Description: wrap the branches of a set operation that must run on the
 *     coordinator in RemoteQuery nodes, and pin the setop plan itself to the CN.
 * @param[IN] root: planner info for the current query level
 * @param[IN] node: the setop plan node (Append or MergeAppend; RecursiveUnion is not shippable)
 * @param[IN] plan: the plan whose execution attributes are adjusted
 * @param[IN] subPlans: the branch plans of the set operation
 * @return : void
 */
void mark_distribute_setop_remotequery(PlannerInfo* root, Node* node, Plan* plan, List* subPlans)
{
    Plan* remotePlan = NULL;
    Plan* subPlan = NULL;
    ListCell* cell = NULL;
    List* newSubPlans = NIL;
    MergeAppend* mergeAppend = NULL;
    Append* append = NULL;
    errno_t sprintf_rc = 0;

    if (IsA(node, MergeAppend)) {
        mergeAppend = (MergeAppend*)node;
    } else if (IsA(node, RecursiveUnion)) {
        sprintf_rc = sprintf_s(u_sess->opt_cxt.not_shipping_info->not_shipping_reason, NOTPLANSHIPPING_LENGTH,
            "With-Recursive in subplan which executes on CN is not shippable");
        securec_check_ss_c(sprintf_rc, "\0", "\0");
        mark_stream_unsupport();
    } else {
        AssertEreport(IsA(node, Append), MOD_OPT, "The node is NOT an Append");
        append = (Append*)node;
    }

    foreach (cell, subPlans) {
        subPlan = (Plan*)lfirst(cell);
        remotePlan = make_simple_RemoteQuery(subPlan, root, true);
        /* Keep the merge order: re-sort the gathered branch for a MergeAppend. */
        if (mergeAppend && is_execute_on_datanodes(subPlan)) {
            remotePlan = (Plan*)make_sort(root, remotePlan, mergeAppend->numCols, mergeAppend->sortColIdx,
                mergeAppend->sortOperators, mergeAppend->collations, mergeAppend->nullsFirst, -1);
        }
        newSubPlans = lappend(newSubPlans, remotePlan);
    }
    list_free_ext(subPlans);

    if (PointerIsValid(mergeAppend)) {
        mergeAppend->mergeplans = newSubPlans;
    } else {
        AssertEreport(PointerIsValid(append), MOD_OPT, "The append is NULL");
        append->appendplans = newSubPlans;
    }

    plan->dop = 1;
    plan->distributed_keys = NIL;
    plan->exec_type = EXEC_ON_COORDS;
    plan->exec_nodes = ng_get_single_node_group_exec_node();
}
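/*
 * Illustrative usage (an assumption, not taken from this file): for a set
 * operation that must execute on the CN, the caller passes the setop node and
 * its branch list, e.g.
 *
 *     mark_distribute_setop_remotequery(root, (Node*)append, plan, append->appendplans);
 *
 * after which each DN-executed branch is fetched through a RemoteQuery and the
 * setop plan itself is pinned to a single coordinator node group.
 */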