From 5dfa9d8ed1ced7fcdee6b84dc16075a1a15efa1e Mon Sep 17 00:00:00 2001 From: TotaJ Date: Wed, 16 Sep 2020 20:10:45 +0800 Subject: [PATCH] Parallel query, sequence scan. --- src/bin/gs_guc/cluster_guc.conf | 7 + src/common/backend/catalog/index.cpp | 9 + src/common/backend/catalog/namespace.cpp | 52 +- src/common/backend/nodes/copyfuncs.cpp | 26 + src/common/backend/nodes/nodeFuncs.cpp | 114 ++ src/common/backend/nodes/nodes.cpp | 1 + src/common/backend/nodes/outfuncs.cpp | 32 + src/common/backend/nodes/params.cpp | 177 +++ src/common/backend/nodes/readfuncs.cpp | 2 + src/common/backend/utils/adt/datum.cpp | 126 ++ src/common/backend/utils/adt/lockfuncs.cpp | 37 +- src/common/backend/utils/cache/lsyscache.cpp | 10 + src/common/backend/utils/cache/relmapper.cpp | 7 + src/common/backend/utils/misc/guc.cpp | 150 +++ src/common/backend/utils/time/snapmgr.cpp | 221 +++- src/common/pl/plpgsql/src/pl_exec.cpp | 25 +- src/gausskernel/optimizer/commands/async.cpp | 5 + src/gausskernel/optimizer/commands/copy.cpp | 4 +- .../optimizer/commands/explain.cpp | 18 +- .../optimizer/commands/extension.cpp | 2 +- .../optimizer/commands/prepare.cpp | 1 + .../optimizer/commands/sequence.cpp | 15 + .../optimizer/commands/variable.cpp | 32 +- src/gausskernel/optimizer/path/allpaths.cpp | 166 ++- src/gausskernel/optimizer/path/costsize.cpp | 46 +- src/gausskernel/optimizer/plan/createplan.cpp | 50 + src/gausskernel/optimizer/plan/planmain.cpp | 4 + src/gausskernel/optimizer/plan/planner.cpp | 48 + src/gausskernel/optimizer/plan/setrefs.cpp | 7 +- src/gausskernel/optimizer/plan/subselect.cpp | 1 + src/gausskernel/optimizer/util/clauses.cpp | 192 ++- src/gausskernel/optimizer/util/optcommon.cpp | 3 + src/gausskernel/optimizer/util/pathnode.cpp | 66 +- src/gausskernel/optimizer/util/relnode.cpp | 22 + .../process/postmaster/bgworker.cpp | 5 + .../process/postmaster/postmaster.cpp | 128 +- src/gausskernel/process/tcop/dest.cpp | 8 + src/gausskernel/process/tcop/postgres.cpp | 18 +- src/gausskernel/process/tcop/utility.cpp | 20 +- .../process/threadpool/knl_thread.cpp | 6 + src/gausskernel/runtime/executor/Makefile | 6 +- src/gausskernel/runtime/executor/execAmi.cpp | 22 +- src/gausskernel/runtime/executor/execMain.cpp | 53 +- .../runtime/executor/execParallel.cpp | 589 +++++++++ .../runtime/executor/execProcnode.cpp | 35 + .../runtime/executor/functions.cpp | 6 +- .../runtime/executor/instrument.cpp | 77 +- .../runtime/executor/nodeGather.cpp | 434 +++++++ .../runtime/executor/nodeSamplescan.cpp | 14 +- .../runtime/executor/nodeSeqscan.cpp | 119 +- src/gausskernel/runtime/executor/spi.cpp | 41 +- src/gausskernel/runtime/executor/tqueue.cpp | 905 ++++++++++++++ .../storage/access/hbstore/hbucket_am.cpp | 6 +- .../storage/access/heap/heapam.cpp | 343 +++++- .../storage/access/transam/Makefile | 4 +- .../storage/access/transam/parallel.cpp | 1093 +++++++++++++++++ .../storage/access/transam/varsup.cpp | 9 + .../storage/access/transam/xact.cpp | 327 ++++- src/gausskernel/storage/buffer/localbuf.cpp | 14 + src/gausskernel/storage/ipc/Makefile | 2 +- src/gausskernel/storage/ipc/dsm.cpp | 63 + src/gausskernel/storage/ipc/procarray.cpp | 47 +- src/gausskernel/storage/ipc/procsignal.cpp | 4 + src/gausskernel/storage/ipc/shm_mq.cpp | 4 +- src/gausskernel/storage/lmgr/lock.cpp | 35 +- src/gausskernel/storage/lmgr/predicate.cpp | 10 + src/include/access/heapam.h | 6 + src/include/access/parallel.h | 68 + src/include/access/relscan.h | 33 +- src/include/access/tableam.h | 20 + src/include/access/xact.h | 7 + 
src/include/catalog/namespace.h | 4 +- src/include/catalog/pg_proc.h | 9 + src/include/executor/execParallel.h | 38 + src/include/executor/executor.h | 1 + src/include/executor/instrument.h | 10 + src/include/executor/nodeGather.h | 25 + src/include/executor/nodeSeqscan.h | 6 + src/include/executor/spi.h | 1 + src/include/executor/tqueue.h | 29 + src/include/gs_thread.h | 1 + .../knl/knl_guc/knl_instance_attr_common.h | 4 + .../knl/knl_guc/knl_session_attr_sql.h | 5 + src/include/knl/knl_session.h | 65 + src/include/knl/knl_thread.h | 44 + src/include/libpq/pqmq.h | 2 +- src/include/miscadmin.h | 4 + src/include/nodes/execnodes.h | 20 + src/include/nodes/nodeFuncs.h | 2 + src/include/nodes/nodes.h | 3 + src/include/nodes/params.h | 4 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/plannodes.h | 17 + src/include/nodes/relation.h | 21 + src/include/optimizer/clauses.h | 1 + src/include/optimizer/cost.h | 6 +- src/include/optimizer/pathnode.h | 5 +- src/include/optimizer/planner.h | 7 + src/include/postmaster/bgworker.h | 1 + src/include/postmaster/postmaster.h | 2 + src/include/storage/dsm.h | 48 + src/include/storage/procarray.h | 2 + src/include/storage/shm_mq.h | 2 +- src/include/tcop/dest.h | 3 +- src/include/utils/datum.h | 8 + src/include/utils/lsyscache.h | 1 + src/include/utils/snapmgr.h | 7 + .../expected/bypass_simplequery_support.out | 1 + .../sql/bypass_simplequery_support.sql | 1 + 109 files changed, 6485 insertions(+), 185 deletions(-) create mode 100644 src/gausskernel/runtime/executor/execParallel.cpp create mode 100644 src/gausskernel/runtime/executor/nodeGather.cpp create mode 100644 src/gausskernel/runtime/executor/tqueue.cpp create mode 100644 src/gausskernel/storage/access/transam/parallel.cpp create mode 100644 src/gausskernel/storage/ipc/dsm.cpp create mode 100644 src/include/access/parallel.h create mode 100644 src/include/executor/execParallel.h create mode 100644 src/include/executor/nodeGather.h create mode 100644 src/include/executor/tqueue.h create mode 100644 src/include/storage/dsm.h diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 00837318f..4b4d8bc4c 100644 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -527,6 +527,13 @@ tcp_recv_timeout|int|0,86400|s|Specify the receiving timeouts until reporting an max_inner_tool_connections|int|1,8388607|NULL|NULL| max_keep_log_seg|int|0,2147483647|NULL|NULL| max_background_workers|int|0,262143|NULL|NULL| +min_parallel_table_scan_size|int|0,715827882|kB|NULL| +max_parallel_workers|int|0,1024|NULL|NULL| +max_parallel_workers_per_gather|int|0,1024|NULL|NULL| +parallel_tuple_cost|real|0,1.79769e+308|NULL|NULL| +parallel_setup_cost|real|0,1.79769e+308|NULL|NULL| +force_parallel_mode|enum|off,on,regress|NULL|NULL| +parallel_leader_participation|bool|0,0|NULL|NULL| [gtm] nodename|string|0,0|NULL|Name of this GTM/GTM-Standby.| port|int|1,65535|NULL|Listen Port of GTM or GTM standby server.| diff --git a/src/common/backend/catalog/index.cpp b/src/common/backend/catalog/index.cpp index 9a4f922ae..e9404b1b7 100755 --- a/src/common/backend/catalog/index.cpp +++ b/src/common/backend/catalog/index.cpp @@ -4334,6 +4334,11 @@ static void SetReindexPending(List* indexes) /* Reindexing is not re-entrant. 
*/ if (u_sess->catalog_cxt.pendingReindexedIndexes) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot reindex while reindexing"))); + + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot modify reindex state during a parallel operation"))); + } u_sess->catalog_cxt.pendingReindexedIndexes = list_copy(indexes); } @@ -4343,6 +4348,10 @@ static void SetReindexPending(List* indexes) */ static void RemoveReindexPending(Oid indexOid) { + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot modify reindex state during a parallel operation"))); + } u_sess->catalog_cxt.pendingReindexedIndexes = list_delete_oid(u_sess->catalog_cxt.pendingReindexedIndexes, indexOid); } diff --git a/src/common/backend/catalog/namespace.cpp b/src/common/backend/catalog/namespace.cpp index 911aa4509..c4bc7a418 100755 --- a/src/common/backend/catalog/namespace.cpp +++ b/src/common/backend/catalog/namespace.cpp @@ -20,6 +20,7 @@ #include "postgres.h" #include "knl/knl_variable.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #ifdef PGXC @@ -2978,6 +2979,47 @@ Oid GetTempToastNamespace(void) return u_sess->catalog_cxt.myTempToastNamespace; } +/* + * GetTempNamespaceState - fetch status of session's temporary namespace + * + * This is used for conveying state to a parallel worker, and is not meant + * for general-purpose access. + */ +void GetTempNamespaceState(Oid *tempNamespaceId, Oid *tempToastNamespaceId) +{ + /* Return namespace OIDs, or 0 if session has not created temp namespace */ + *tempNamespaceId = u_sess->catalog_cxt.myTempNamespace; + *tempToastNamespaceId = u_sess->catalog_cxt.myTempToastNamespace; +} + +/* + * SetTempNamespaceState - set status of session's temporary namespace + * + * This is used for conveying state to a parallel worker, and is not meant for + * general-purpose access. By transferring these namespace OIDs to workers, + * we ensure they will have the same notion of the search path as their leader + * does. + */ +void SetTempNamespaceState(Oid tempNamespaceId, Oid tempToastNamespaceId) +{ + /* Worker should not have created its own namespaces ... */ + Assert(u_sess->catalog_cxt.myTempNamespace == InvalidOid); + Assert(u_sess->catalog_cxt.myTempToastNamespace == InvalidOid); + Assert(u_sess->catalog_cxt.myTempNamespaceSubID == InvalidSubTransactionId); + + /* Assign same namespace OIDs that leader has */ + u_sess->catalog_cxt.myTempNamespace = tempNamespaceId; + u_sess->catalog_cxt.myTempToastNamespace = tempToastNamespaceId; + + /* + * It's fine to leave myTempNamespaceSubID == InvalidSubTransactionId. + * Even if the namespace is new so far as the leader is concerned, it's + * not new to the worker, and we certainly wouldn't want the worker trying + * to destroy it. + */ + u_sess->catalog_cxt.baseSearchPathValid = false; /* may need to rebuild list */ +} + /* * GetOverrideSearchPath - fetch current search path definition in form * used by PushOverrideSearchPath. @@ -3622,6 +3664,12 @@ static void InitTempTableNamespace(void) ereport(ERROR, (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), errmsg("cannot create temporary tables during recovery"))); + /* Parallel workers can't create temporary tables, either. 
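/*
 * A minimal usage sketch, not from this patch: how leader-side code could
 * capture the temp-namespace state with GetTempNamespaceState() and how a
 * worker would adopt it with SetTempNamespaceState().  The struct and the
 * two helper names are hypothetical; the real transfer is done by
 * src/gausskernel/storage/access/transam/parallel.cpp.
 */
#include "postgres.h"
#include "catalog/namespace.h"

typedef struct TempNamespaceFixup {
    Oid tempNamespaceId;
    Oid tempToastNamespaceId;
} TempNamespaceFixup;

/* Leader: snapshot the session's temp-namespace OIDs for the worker. */
static void PackTempNamespaceState(TempNamespaceFixup *fixup)
{
    GetTempNamespaceState(&fixup->tempNamespaceId, &fixup->tempToastNamespaceId);
}

/* Worker: adopt the leader's notion of the temp namespace and its toast peer. */
static void ApplyTempNamespaceState(const TempNamespaceFixup *fixup)
{
    SetTempNamespaceState(fixup->tempNamespaceId, fixup->tempToastNamespaceId);
}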
*/ + if (IsParallelWorker()) { + ereport(ERROR, (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot create temporary tables during a parallel operation"))); + } + timeLineId = get_controlfile_timeline(); tempID = __sync_add_and_fetch(>_tempID_seed, 1); @@ -3762,7 +3810,7 @@ static void InitTempTableNamespace(void) /* * End-of-transaction cleanup for namespaces. */ -void AtEOXact_Namespace(bool isCommit) +void AtEOXact_Namespace(bool isCommit, bool parallel) { /* * If we abort the transaction in which a temp namespace was selected, @@ -3772,7 +3820,7 @@ void AtEOXact_Namespace(bool isCommit) * at backend shutdown. (We only want to register the callback once per * session, so this is a good place to do it.) */ - if (u_sess->catalog_cxt.myTempNamespaceSubID != InvalidSubTransactionId) { + if (u_sess->catalog_cxt.myTempNamespaceSubID != InvalidSubTransactionId && !parallel) { //@Temp table. No need to register RemoveTempRelationsCallback here, // because we don't drop temp objects by porc_exit(); if (!isCommit) { diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 5e2a4d631..64153f766 100644 --- a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -152,6 +152,7 @@ static PlannedStmt* _copyPlannedStmt(const PlannedStmt* from) COPY_SCALAR_FIELD(gather_count); COPY_SCALAR_FIELD(isRowTriggerShippable); COPY_SCALAR_FIELD(is_stream_plan); + COPY_SCALAR_FIELD(parallelModeNeeded); /* * Not copy ng_queryMem to avoid memory leak in CachedPlan context, * and dywlm_client_manager always calls CalculateQueryMemMain to generate it. @@ -175,6 +176,7 @@ static void CopyPlanFields(const Plan* from, Plan* newnode) COPY_SCALAR_FIELD(plan_rows); COPY_SCALAR_FIELD(multiple); COPY_SCALAR_FIELD(plan_width); + COPY_SCALAR_FIELD(parallel_aware); COPY_SCALAR_FIELD(dop); COPY_NODE_FIELD(targetlist); COPY_NODE_FIELD(qual); @@ -421,6 +423,27 @@ static BitmapAnd* _copyBitmapAnd(const BitmapAnd* from) return newnode; } +/* + * _copyGather + */ +static Gather *_copyGather(const Gather *from) +{ + Gather *newnode = makeNode(Gather); + + /* + * copy node superclass fields + */ + CopyPlanFields((const Plan *)from, (Plan *)newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(num_workers); + COPY_SCALAR_FIELD(single_copy); + + return newnode; +} + /* * _copyBitmapOr */ @@ -5834,6 +5857,9 @@ void* copyObject(const void* from) case T_Scan: retval = _copyScan((Scan*)from); break; + case T_Gather: + retval = _copyGather((Gather*)from); + break; case T_BucketInfo: retval = _copyBucketInfo((BucketInfo*)from); break; diff --git a/src/common/backend/nodes/nodeFuncs.cpp b/src/common/backend/nodes/nodeFuncs.cpp index 698fa8ca1..38e0fb9e1 100755 --- a/src/common/backend/nodes/nodeFuncs.cpp +++ b/src/common/backend/nodes/nodeFuncs.cpp @@ -28,6 +28,8 @@ static bool expression_returns_set_walker(Node* node, void* context); static int leftmostLoc(int loc1, int loc2); +static bool planstate_walk_subplans(List *plans, bool (*walker)(), void *context); +static bool planstate_walk_members(List *plans, PlanState **planstates, bool (*walker)(), void *context); /* * exprType - @@ -3185,3 +3187,115 @@ bool lockNextvalWalker(Node* node, void* context) lockSeqForNextvalFunc(node); return expression_tree_walker(node, (bool (*)())lockNextvalWalker, context); } + +/* + * planstate_tree_walker --- walk plan state trees + * + * The walker has already visited the current node, and so we need only + * recurse into any sub-nodes it has. 
+ */ +bool planstate_tree_walker(PlanState *planstate, bool (*walker)(), void *context) +{ + Plan *plan = planstate->plan; + bool (*p2walker)(PlanState *, void *) = (bool (*)(PlanState *, void *))walker; + + /* initPlan-s */ + if (planstate_walk_subplans(planstate->initPlan, walker, context)) { + return true; + } + + /* lefttree */ + if (outerPlanState(planstate)) { + if (p2walker(outerPlanState(planstate), context)) { + return true; + } + } + + /* righttree */ + if (innerPlanState(planstate)) { + if (p2walker(innerPlanState(planstate), context)) { + return true; + } + } + + /* special child plans */ + switch (nodeTag(plan)) { + case T_ModifyTable: + if (planstate_walk_members(((ModifyTable *)plan)->plans, ((ModifyTableState *)planstate)->mt_plans, walker, + context)) + return true; + break; + case T_Append: + if (planstate_walk_members(((Append *)plan)->appendplans, ((AppendState *)planstate)->appendplans, walker, + context)) + return true; + break; + case T_MergeAppend: + if (planstate_walk_members(((MergeAppend *)plan)->mergeplans, ((MergeAppendState *)planstate)->mergeplans, + walker, context)) + return true; + break; + case T_BitmapAnd: + if (planstate_walk_members(((BitmapAnd *)plan)->bitmapplans, ((BitmapAndState *)planstate)->bitmapplans, + walker, context)) + return true; + break; + case T_BitmapOr: + if (planstate_walk_members(((BitmapOr *)plan)->bitmapplans, ((BitmapOrState *)planstate)->bitmapplans, + walker, context)) + return true; + break; + case T_SubqueryScan: + if (p2walker(((SubqueryScanState *)planstate)->subplan, context)) + return true; + break; + default: + break; + } + + /* subPlan-s */ + if (planstate_walk_subplans(planstate->subPlan, walker, context)) { + return true; + } + + return false; +} + +/* + * Walk a list of SubPlans (or initPlans, which also use SubPlan nodes). + */ +static bool planstate_walk_subplans(List *plans, bool (*walker)(), void *context) +{ + ListCell *lc = NULL; + bool (*p2walker)(PlanState *, void *) = (bool (*)(PlanState *, void *))walker; + + foreach (lc, plans) { + SubPlanState *sps = (SubPlanState *)lfirst(lc); + + Assert(IsA(sps, SubPlanState)); + if (p2walker(sps->planstate, context)) + return true; + } + + return false; +} + +/* + * Walk the constituent plans of a ModifyTable, Append, MergeAppend, + * BitmapAnd, or BitmapOr node. + * + * Note: we don't actually need to examine the Plan list members, but + * we need the list in order to determine the length of the PlanState array. 
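/*
 * A minimal usage sketch, not from this patch: a walker written against
 * planstate_tree_walker().  The walker visits the node it is handed first
 * and then asks planstate_tree_walker() to recurse; returning true would
 * stop the walk early.  count_planstate_walker/count_planstates are
 * hypothetical names used only for illustration.
 */
#include "postgres.h"
#include "nodes/execnodes.h"
#include "nodes/nodeFuncs.h"

static bool count_planstate_walker(PlanState *planstate, void *context)
{
    if (planstate == NULL)
        return false;

    /* "Visit" the current node: bump the counter passed as context. */
    (*(int *)context)++;

    /* Let planstate_tree_walker recurse into children, subplans and initplans. */
    return planstate_tree_walker(planstate, (bool (*)())count_planstate_walker, context);
}

static int count_planstates(PlanState *root)
{
    int count = 0;

    (void)count_planstate_walker(root, &count);
    return count;
}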
+ */ +static bool planstate_walk_members(List *plans, PlanState **planstates, bool (*walker)(), void *context) +{ + int nplans = list_length(plans); + bool (*p2walker)(PlanState *, void *) = (bool (*)(PlanState *, void *))walker; + + for (int j = 0; j < nplans; j++) { + if (p2walker(planstates[j], context)) + return true; + } + + return false; +} diff --git a/src/common/backend/nodes/nodes.cpp b/src/common/backend/nodes/nodes.cpp index 98d84cd08..0ea1b29b4 100755 --- a/src/common/backend/nodes/nodes.cpp +++ b/src/common/backend/nodes/nodes.cpp @@ -245,6 +245,7 @@ static const TagStr g_tagStrArr[] = {{T_Invalid, "Invalid"}, {T_ResultPath, "ResultPath"}, {T_MaterialPath, "MaterialPath"}, {T_UniquePath, "UniquePath"}, + {T_GatherPath, "Gather"}, {T_PartIteratorPath, "PartIteratorPath"}, {T_EquivalenceClass, "EquivalenceClass"}, {T_EquivalenceMember, "EquivalenceMember"}, diff --git a/src/common/backend/nodes/outfuncs.cpp b/src/common/backend/nodes/outfuncs.cpp index ab85460e0..1e935bb06 100755 --- a/src/common/backend/nodes/outfuncs.cpp +++ b/src/common/backend/nodes/outfuncs.cpp @@ -593,6 +593,7 @@ static void _outPlannedStmt(StringInfo str, PlannedStmt* node) } WRITE_BOOL_FIELD(isRowTriggerShippable); WRITE_BOOL_FIELD(is_stream_plan); + WRITE_BOOL_FIELD(parallelModeNeeded); } /* @@ -609,6 +610,7 @@ static void _outPlanInfo(StringInfo str, Plan* node) appendStringInfo(str, " :plan_rows %.0f", PLAN_LOCAL_ROWS(node)); WRITE_FLOAT_FIELD(multiple, "%.0f"); WRITE_INT_FIELD(plan_width); + WRITE_BOOL_FIELD(parallel_aware); WRITE_NODE_FIELD(targetlist); WRITE_NODE_FIELD(qual); WRITE_NODE_FIELD(lefttree); @@ -897,6 +899,16 @@ static void _outBucketInfo(StringInfo str, BucketInfo* node) WRITE_NODE_FIELD(buckets); } +static void _outGather(StringInfo str, Gather *node) +{ + WRITE_NODE_TYPE("GATHER"); + + _outPlanInfo(str, (Plan *)node); + + WRITE_INT_FIELD(num_workers); + WRITE_BOOL_FIELD(single_copy); +} + static void _outScan(StringInfo str, Scan* node) { WRITE_NODE_TYPE("SCAN"); @@ -2829,6 +2841,17 @@ static void _outUniquePath(StringInfo str, UniquePath* node) WRITE_BOOL_FIELD(hold_tlist); } +static void _outGatherPath(StringInfo str, GatherPath *node) +{ + WRITE_NODE_TYPE("GATHERPATH"); + + _outPathInfo(str, (Path *)node); + + WRITE_NODE_FIELD(subpath); + WRITE_INT_FIELD(num_workers); + WRITE_BOOL_FIELD(single_copy); +} + static void _outNestPath(StringInfo str, NestPath* node) { WRITE_NODE_TYPE("NESTPATH"); @@ -2877,6 +2900,8 @@ static void _outPlannerGlobal(StringInfo str, PlannerGlobal* node) WRITE_UINT_FIELD(lastRowMarkId); WRITE_BOOL_FIELD(transientPlan); WRITE_BOOL_FIELD(dependsOnRole); + WRITE_BOOL_FIELD(parallelModeOK); + WRITE_BOOL_FIELD(parallelModeNeeded); } /* @@ -2956,6 +2981,7 @@ static void _outRelOptInfo(StringInfo str, RelOptInfo* node) WRITE_ENUM_FIELD(partflag, PartitionFlag); WRITE_FLOAT_FIELD(rows, "%.0f"); WRITE_INT_FIELD(width); + WRITE_BOOL_FIELD(consider_parallel); WRITE_NODE_FIELD(reltargetlist); WRITE_NODE_FIELD(pathlist); WRITE_NODE_FIELD(ppilist); @@ -4860,6 +4886,9 @@ static void _outNode(StringInfo str, const void* obj) case T_BitmapOr: _outBitmapOr(str, (BitmapOr*)obj); break; + case T_Gather: + _outGather(str, (Gather*)obj); + break; case T_Scan: _outScan(str, (Scan*)obj); break; @@ -5189,6 +5218,9 @@ static void _outNode(StringInfo str, const void* obj) case T_UniquePath: _outUniquePath(str, (UniquePath*)obj); break; + case T_GatherPath: + _outGatherPath(str, (GatherPath*)obj); + break; case T_NestPath: _outNestPath(str, (NestPath*)obj); break; diff --git 
a/src/common/backend/nodes/params.cpp b/src/common/backend/nodes/params.cpp index 7f28bb5e9..cb9baf633 100644 --- a/src/common/backend/nodes/params.cpp +++ b/src/common/backend/nodes/params.cpp @@ -17,6 +17,7 @@ #include "knl/knl_variable.h" #include "nodes/params.h" +#include "storage/shmem.h" #include "utils/datum.h" #include "utils/lsyscache.h" @@ -49,6 +50,7 @@ ParamListInfo copyParamList(ParamListInfo from) retval->parserSetupArg = NULL; retval->params_need_process = false; retval->numParams = from->numParams; + retval->paramMask = NULL; for (i = 0; i < from->numParams; i++) { ParamExternData* oprm = &from->params[i]; @@ -56,6 +58,15 @@ ParamListInfo copyParamList(ParamListInfo from) int16 typLen; bool typByVal = false; + /* Ignore parameters we don't need, to save cycles and space. */ + if (retval->paramMask != NULL && !bms_is_member(i, retval->paramMask)) { + nprm->value = (Datum)0; + nprm->isnull = true; + nprm->pflags = 0; + nprm->ptype = InvalidOid; + continue; + } + /* give hook a chance in case parameter is dynamic */ if (!OidIsValid(oprm->ptype) && from->paramFetch != NULL) { (*from->paramFetch)(from, i + 1); @@ -74,3 +85,169 @@ ParamListInfo copyParamList(ParamListInfo from) return retval; } + +/* + * Estimate the amount of space required to serialize a ParamListInfo. + */ +Size EstimateParamListSpace(ParamListInfo paramLI) +{ + Size sz = sizeof(int); + + if (paramLI == NULL || paramLI->numParams <= 0) + return sz; + + for (int i = 0; i < paramLI->numParams; i++) { + ParamExternData *prm = ¶mLI->params[i]; + Oid typeOid; + int16 typLen; + bool typByVal = false; + + /* Ignore parameters we don't need, to save cycles and space. */ + if (paramLI->paramMask != NULL && !bms_is_member(i, paramLI->paramMask)) { + typeOid = InvalidOid; + } else { + /* give hook a chance in case parameter is dynamic */ + if (!OidIsValid(prm->ptype) && paramLI->paramFetch != NULL) + (*paramLI->paramFetch)(paramLI, i + 1); + typeOid = prm->ptype; + } + + sz = add_size(sz, sizeof(Oid)); /* space for type OID */ + sz = add_size(sz, sizeof(uint16)); /* space for pflags */ + + /* space for datum/isnull */ + if (OidIsValid(typeOid)) { + get_typlenbyval(typeOid, &typLen, &typByVal); + } else { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + sz = add_size(sz, datumEstimateSpace(prm->value, prm->isnull, typByVal, typLen)); + } + + return sz; +} + +/* + * Serialize a paramListInfo structure into caller-provided storage. + * + * We write the number of parameters first, as a 4-byte integer, and then + * write details for each parameter in turn. The details for each parameter + * consist of a 4-byte type OID, 2 bytes of flags, and then the datum as + * serialized by datumSerialize(). The caller is responsible for ensuring + * that there is enough storage to store the number of bytes that will be + * written; use EstimateParamListSpace to find out how many will be needed. + * *start_address is updated to point to the byte immediately following those + * written. + * + * RestoreParamList can be used to recreate a ParamListInfo based on the + * serialized representation; this will be a static, self-contained copy + * just as copyParamList would create. + */ +void SerializeParamList(ParamListInfo paramLI, char *start_address, Size len) +{ + int nparams; + + /* Write number of parameters. 
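/*
 * A minimal usage sketch, not from this patch: the estimate/serialize/restore
 * round trip described in the comment above.  In the patch the buffer is a
 * chunk of the parallel DSM segment built by execParallel.cpp; here it is
 * simply palloc'd so the flow stays self-contained.  The function name is
 * hypothetical.
 */
#include "postgres.h"
#include "nodes/params.h"

static ParamListInfo copy_params_via_serialization(ParamListInfo src)
{
    Size len = EstimateParamListSpace(src);
    char *buf = (char *)palloc(len);

    /* Leader side: flatten the parameters into the buffer. */
    SerializeParamList(src, buf, len);

    /* Worker side: rebuild a static, self-contained ParamListInfo. */
    ParamListInfo copy = RestoreParamList(buf, len);

    pfree(buf);
    return copy;
}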
*/ + if (paramLI == NULL || paramLI->numParams <= 0) { + nparams = 0; + } else { + nparams = paramLI->numParams; + } + int rc = memcpy_s(start_address, len, &nparams, sizeof(int)); + securec_check_c(rc, "", ""); + Size remainLen = len - sizeof(int); + start_address += sizeof(int); + + /* Write each parameter in turn. */ + for (int i = 0; i < nparams; i++) { + ParamExternData *prm = ¶mLI->params[i]; + Oid typeOid; + int16 typLen; + bool typByVal; + + /* Ignore parameters we don't need, to save cycles and space. */ + if (paramLI->paramMask != NULL && !bms_is_member(i, paramLI->paramMask)) { + typeOid = InvalidOid; + } else { + /* give hook a chance in case parameter is dynamic */ + if (!OidIsValid(prm->ptype) && paramLI->paramFetch != NULL) + (*paramLI->paramFetch)(paramLI, i + 1); + typeOid = prm->ptype; + } + + /* Write type OID. */ + rc = memcpy_s(start_address, remainLen, &typeOid, sizeof(Oid)); + securec_check_c(rc, "", ""); + remainLen -= sizeof(Oid); + start_address += sizeof(Oid); + + /* Write flags. */ + rc = memcpy_s(start_address, remainLen, &prm->pflags, sizeof(uint16)); + securec_check_c(rc, "", ""); + remainLen -= sizeof(uint16); + start_address += sizeof(uint16); + + /* Write datum/isnull. */ + if (OidIsValid(typeOid)) { + get_typlenbyval(typeOid, &typLen, &typByVal); + } else { + /* If no type OID, assume by-value, like copyParamList does. */ + typLen = sizeof(Datum); + typByVal = true; + } + datumSerialize(prm->value, prm->isnull, typByVal, typLen, &start_address, &remainLen); + } +} + +/* + * Copy a ParamListInfo structure. + * + * The result is allocated in CurrentMemoryContext. + * + * Note: the intent of this function is to make a static, self-contained + * set of parameter values. If dynamic parameter hooks are present, we + * intentionally do not copy them into the result. Rather, we forcibly + * instantiate all available parameter values and copy the datum values. + */ +ParamListInfo RestoreParamList(char *start_address, Size len) +{ + int nparams; + + int rc = memcpy_s(&nparams, len, start_address, sizeof(int)); + securec_check_c(rc, "", ""); + Size remainLen = len - sizeof(int); + start_address += sizeof(int); + + Size size = offsetof(ParamListInfoData, params) + nparams * sizeof(ParamExternData); + + ParamListInfo paramLI = (ParamListInfo)palloc(size); + paramLI->paramFetch = NULL; + paramLI->paramFetchArg = NULL; + paramLI->parserSetup = NULL; + paramLI->parserSetupArg = NULL; + paramLI->numParams = nparams; + paramLI->paramMask = NULL; + + for (int i = 0; i < nparams; i++) { + ParamExternData *prm = ¶mLI->params[i]; + + /* Read type OID. */ + rc = memcpy_s(&prm->ptype, remainLen, start_address, sizeof(Oid)); + securec_check_c(rc, "", ""); + remainLen -= sizeof(Oid); + start_address += sizeof(Oid); + + /* Read flags. */ + rc = memcpy_s(&prm->pflags, remainLen, start_address, sizeof(uint16)); + securec_check_c(rc, "", ""); + remainLen -= sizeof(uint16); + start_address += sizeof(uint16); + + /* Read datum/isnull. 
*/ + prm->value = datumRestore(&start_address, &remainLen, &prm->isnull); + } + + return paramLI; +} diff --git a/src/common/backend/nodes/readfuncs.cpp b/src/common/backend/nodes/readfuncs.cpp index ae1059864..3885ed18a 100644 --- a/src/common/backend/nodes/readfuncs.cpp +++ b/src/common/backend/nodes/readfuncs.cpp @@ -2684,6 +2684,7 @@ static Plan* _readPlan(Plan* local_node) READ_FLOAT_FIELD(plan_rows); READ_FLOAT_FIELD(multiple); READ_INT_FIELD(plan_width); + READ_BOOL_FIELD(parallel_aware); READ_NODE_FIELD(targetlist); READ_NODE_FIELD(qual); READ_NODE_FIELD(lefttree); @@ -3722,6 +3723,7 @@ static PlannedStmt* _readPlannedStmt(void) } READ_BOOL_FIELD(isRowTriggerShippable); READ_BOOL_FIELD(is_stream_plan); + READ_BOOL_FIELD(parallelModeNeeded); READ_DONE(); } diff --git a/src/common/backend/utils/adt/datum.cpp b/src/common/backend/utils/adt/datum.cpp index 84ec7fed8..fcfde6794 100755 --- a/src/common/backend/utils/adt/datum.cpp +++ b/src/common/backend/utils/adt/datum.cpp @@ -200,3 +200,129 @@ bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen) } return res; } + +/* ------------------------------------------------------------------------- + * datumEstimateSpace + * + * Compute the amount of space that datumSerialize will require for a + * particular Datum. + * ------------------------------------------------------------------------- + */ +Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen) +{ + Size sz = sizeof(int); + + if (!isnull) { + /* no need to use add_size, can't overflow */ + if (typByVal) + sz += sizeof(Datum); + else + sz += datumGetSize(value, typByVal, typLen); + } + + return sz; +} + +/* ------------------------------------------------------------------------- + * datumSerialize + * + * Serialize a possibly-NULL datum into caller-provided storage. + * + * Note: "expanded" objects are flattened so as to produce a self-contained + * representation, but other sorts of toast pointers are transferred as-is. + * This is because the intended use of this function is to pass the value + * to another process within the same database server. The other process + * could not access an "expanded" object within this process's memory, but + * we assume it can dereference the same TOAST pointers this one can. + * + * The format is as follows: first, we write a 4-byte header word, which + * is either the length of a pass-by-reference datum, -1 for a + * pass-by-value datum, or -2 for a NULL. If the value is NULL, nothing + * further is written. If it is pass-by-value, sizeof(Datum) bytes + * follow. Otherwise, the number of bytes indicated by the header word + * follow. The caller is responsible for ensuring that there is enough + * storage to store the number of bytes that will be written; use + * datumEstimateSpace() to find out how many will be needed. + * *start_address is updated to point to the byte immediately following + * those written. + * ------------------------------------------------------------------------- + */ +void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address, Size *remainLen) +{ + int header; + + /* Write header word. */ + if (isnull) { + header = -2; + } else if (typByVal) { + header = -1; + } else { + header = datumGetSize(value, typByVal, typLen); + } + int rc = memcpy_s(*start_address, *remainLen, &header, sizeof(int)); + securec_check_c(rc, "", ""); + *remainLen -= sizeof(int); + *start_address += sizeof(int); + + /* If not null, write payload bytes. 
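/*
 * A minimal usage sketch, not from this patch: the header-word format for a
 * single by-value int4 datum.  The serialized image is the 4-byte header -1
 * (pass-by-value marker) followed by sizeof(Datum) payload bytes; a NULL
 * would be just the header word -2.  The function name is hypothetical.
 */
#include "postgres.h"
#include "utils/datum.h"

static Datum roundtrip_int4_datum(int32 value)
{
    Datum d = Int32GetDatum(value);
    bool isnull = false;

    Size len = datumEstimateSpace(d, isnull, true /* typByVal */, sizeof(int32));
    char *buf = (char *)palloc(len);

    /* Write side: datumSerialize advances the cursor and shrinks the remaining length. */
    char *writePtr = buf;
    Size writeRemain = len;
    datumSerialize(d, isnull, true, sizeof(int32), &writePtr, &writeRemain);

    /* Read side: consumes exactly what datumSerialize produced. */
    char *readPtr = buf;
    Size readRemain = len;
    bool restoredNull = false;
    Datum restored = datumRestore(&readPtr, &readRemain, &restoredNull);

    pfree(buf);
    Assert(!restoredNull);
    return restored;
}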
*/ + if (!isnull) { + if (typByVal) { + rc = memcpy_s(*start_address, *remainLen, &value, sizeof(Datum)); + securec_check_c(rc, "", ""); + *remainLen -= sizeof(Datum); + *start_address += sizeof(Datum); + } else { + rc = memcpy_s(*start_address, *remainLen, DatumGetPointer(value), (Size)header); + securec_check_c(rc, "", ""); + *remainLen -= header; + *start_address += header; + } + } +} + +/* ------------------------------------------------------------------------- + * datumRestore + * + * Restore a possibly-NULL datum previously serialized by datumSerialize. + * *start_address is updated according to the number of bytes consumed. + * ------------------------------------------------------------------------- + */ +Datum datumRestore(char **start_address, Size *remainLen, bool *isnull) +{ + int header; + + /* Read header word. */ + int rc = memcpy_s(&header, *remainLen, *start_address, sizeof(int)); + securec_check_c(rc, "", ""); + *remainLen -= sizeof(int); + *start_address += sizeof(int); + + /* If this datum is NULL, we can stop here. */ + if (header == -2) { + *isnull = true; + return (Datum)0; + } + + /* OK, datum is not null. */ + *isnull = false; + + /* If this datum is pass-by-value, sizeof(Datum) bytes follow. */ + if (header == -1) { + Datum val; + + rc = memcpy_s(&val, *remainLen, *start_address, sizeof(Datum)); + securec_check_c(rc, "", ""); + *remainLen -= sizeof(Datum); + *start_address += sizeof(Datum); + return val; + } + + /* Pass-by-reference case; copy indicated number of bytes. */ + Assert(header > 0); + void *d = palloc((Size)header); + rc = memcpy_s(d, *remainLen, *start_address, header); + securec_check_c(rc, "", ""); + *remainLen -= header; + *start_address += header; + return PointerGetDatum(d); +} diff --git a/src/common/backend/utils/adt/lockfuncs.cpp b/src/common/backend/utils/adt/lockfuncs.cpp index 7b5aefd36..5ccc327ae 100755 --- a/src/common/backend/utils/adt/lockfuncs.cpp +++ b/src/common/backend/utils/adt/lockfuncs.cpp @@ -429,6 +429,14 @@ Datum pg_lock_status(PG_FUNCTION_ARGS) #define SET_LOCKTAG_INT32_DB(tag, databaseOid, key1, key2) SET_LOCKTAG_ADVISORY(tag, databaseOid, key1, key2, 2) +static void PreventAdvisoryLocksInParallelMode(void) +{ + if (IsInParallelMode()) + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot use advisory locks during a parallel operation"))); +} + + #ifdef PGXC #define MAXINT8LEN 25 @@ -452,7 +460,8 @@ static bool pgxc_advisory_lock(int64 key64, int32 key1, int32 key2, bool iskeybi LockLevel locklevel, TryType locktry, Name databaseName) { LOCKTAG locktag; - Oid *coOids = NULL, *dnOids = NULL; + Oid *coOids = NULL; + Oid *dnOids = NULL; int numdnodes, numcoords; StringInfoData lock_cmd, unlock_cmd, lock_funcname, unlock_funcname, args; char str_key[MAXINT8LEN + 1]; @@ -576,6 +585,7 @@ Datum pg_advisory_lock_int8(PG_FUNCTION_ARGS) int64 key = PG_GETARG_INT64(0); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(key, 0, 0, true, ExclusiveLock, SESSION_LOCK, WAIT); @@ -599,6 +609,7 @@ Datum pg_advisory_xact_lock_int8(PG_FUNCTION_ARGS) int64 key = PG_GETARG_INT64(0); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(key, 0, 0, true, ExclusiveLock, TRANSACTION_LOCK, WAIT); @@ -621,6 +632,7 @@ Datum pg_advisory_lock_shared_int8(PG_FUNCTION_ARGS) int64 key = PG_GETARG_INT64(0); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); 
#ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(key, 0, 0, true, ShareLock, SESSION_LOCK, WAIT); @@ -644,6 +656,7 @@ Datum pg_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS) int64 key = PG_GETARG_INT64(0); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(key, 0, 0, true, ShareLock, TRANSACTION_LOCK, WAIT); @@ -669,6 +682,7 @@ Datum pg_try_advisory_lock_int8(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(key, 0, 0, true, ExclusiveLock, SESSION_LOCK, DONT_WAIT)); @@ -693,6 +707,7 @@ Datum pg_try_advisory_xact_lock_int8(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(key, 0, 0, true, ExclusiveLock, TRANSACTION_LOCK, DONT_WAIT)); @@ -716,6 +731,7 @@ Datum pg_try_advisory_lock_shared_int8(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(key, 0, 0, true, ShareLock, SESSION_LOCK, DONT_WAIT)); @@ -740,6 +756,7 @@ Datum pg_try_advisory_xact_lock_shared_int8(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(key, 0, 0, true, ShareLock, TRANSACTION_LOCK, DONT_WAIT)); @@ -763,6 +780,7 @@ Datum pg_advisory_unlock_int8(PG_FUNCTION_ARGS) LOCKTAG tag; bool res = false; + PreventAdvisoryLocksInParallelMode(); SET_LOCKTAG_INT64(tag, key); res = LockRelease(&tag, ExclusiveLock, true); @@ -781,6 +799,7 @@ Datum pg_advisory_unlock_shared_int8(PG_FUNCTION_ARGS) LOCKTAG tag; bool res = false; + PreventAdvisoryLocksInParallelMode(); SET_LOCKTAG_INT64(tag, key); res = LockRelease(&tag, ShareLock, true); @@ -797,6 +816,7 @@ Datum pg_advisory_lock_int4(PG_FUNCTION_ARGS) int32 key2 = PG_GETARG_INT32(1); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); if (key1 == XC_LOCK_FOR_BACKUP_KEY_1 && key2 == XC_LOCK_FOR_BACKUP_KEY_2 && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("Only system admin can lock the cluster."))); @@ -826,6 +846,7 @@ Datum pg_advisory_lock_sp_db_int4(PG_FUNCTION_ARGS) LOCKTAG tag; Oid database_oid = u_sess->proc_cxt.MyDatabaseId; + PreventAdvisoryLocksInParallelMode(); if (key1 == XC_LOCK_FOR_BACKUP_KEY_1 && key2 == XC_LOCK_FOR_BACKUP_KEY_2 && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("Only system admin can lock the cluster."))); @@ -858,6 +879,7 @@ Datum pg_advisory_xact_lock_int4(PG_FUNCTION_ARGS) int32 key2 = PG_GETARG_INT32(1); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); if (key1 == XC_LOCK_FOR_BACKUP_KEY_1 && key2 == XC_LOCK_FOR_BACKUP_KEY_2 && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("Only system admin can lock the cluster."))); @@ -884,6 +906,7 @@ Datum pg_advisory_lock_shared_int4(PG_FUNCTION_ARGS) int32 key2 = PG_GETARG_INT32(1); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(0, key1, key2, false, ShareLock, SESSION_LOCK, WAIT); @@ -908,6 +931,7 @@ Datum pg_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS) int32 
key2 = PG_GETARG_INT32(1); LOCKTAG tag; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { (void)pgxc_advisory_lock(0, key1, key2, false, ShareLock, TRANSACTION_LOCK, WAIT); @@ -934,6 +958,7 @@ Datum pg_try_advisory_lock_int4(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); if (key1 == XC_LOCK_FOR_BACKUP_KEY_1 && key2 == XC_LOCK_FOR_BACKUP_KEY_2 && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("Only system admin can lock the cluster."))); @@ -962,6 +987,7 @@ Datum pg_try_advisory_xact_lock_int4(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); if (key1 == XC_LOCK_FOR_BACKUP_KEY_1 && key2 == XC_LOCK_FOR_BACKUP_KEY_2 && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("Only system admin can lock the cluster."))); @@ -989,6 +1015,7 @@ Datum pg_try_advisory_lock_shared_int4(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(0, key1, key2, false, ShareLock, SESSION_LOCK, DONT_WAIT)); @@ -1014,6 +1041,7 @@ Datum pg_try_advisory_xact_lock_shared_int4(PG_FUNCTION_ARGS) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PG_RETURN_BOOL(pgxc_advisory_lock(0, key1, key2, false, ShareLock, TRANSACTION_LOCK, DONT_WAIT)); @@ -1038,6 +1066,7 @@ Datum pg_advisory_unlock_int4(PG_FUNCTION_ARGS) LOCKTAG tag; bool res = false; + PreventAdvisoryLocksInParallelMode(); SET_LOCKTAG_INT32(tag, key1, key2); res = LockRelease(&tag, ExclusiveLock, true); @@ -1059,6 +1088,7 @@ Datum pg_advisory_unlock_sp_db_int4(PG_FUNCTION_ARGS) bool res = false; Oid database_oid = u_sess->proc_cxt.MyDatabaseId; + PreventAdvisoryLocksInParallelMode(); if (database_name != NULL) { database_oid = get_database_oid(database_name->data, false); } @@ -1082,6 +1112,7 @@ Datum pg_advisory_unlock_shared_int4(PG_FUNCTION_ARGS) LOCKTAG tag; bool res = false; + PreventAdvisoryLocksInParallelMode(); SET_LOCKTAG_INT32(tag, key1, key2); res = LockRelease(&tag, ShareLock, true); @@ -1113,6 +1144,7 @@ Datum pgxc_lock_for_backup(PG_FUNCTION_ARGS) { bool lockAcquired = false; + PreventAdvisoryLocksInParallelMode(); if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("only system admin can lock the cluster for backup"))); @@ -1194,6 +1226,7 @@ Datum pgxc_unlock_for_sp_database(PG_FUNCTION_ARGS) Name databaseName = PG_GETARG_NAME(0); bool result = false; + PreventAdvisoryLocksInParallelMode(); /* try to acquire the advisory lock in exclusive mode */ result = DatumGetBool(DirectFunctionCall3(pg_advisory_unlock_sp_db_int4, t_thrd.postmaster_cxt.xc_lockForBackupKey1, @@ -1220,6 +1253,7 @@ Datum pgxc_lock_for_sp_database(PG_FUNCTION_ARGS) int prepared_xact_count; Name databaseName = PG_GETARG_NAME(0); + PreventAdvisoryLocksInParallelMode(); if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("only system admin can lock the cluster for backup"))); @@ -1302,6 +1336,7 @@ void pgxc_lock_for_utility_stmt(Node* parsetree, bool is_temp) LOCKTAG tag; LockAcquireResult res; + PreventAdvisoryLocksInParallelMode(); /* * Reload configuration if we got SIGHUP from the postmaster, since we want to fetch * latest enable_online_ddl_waitlock values. 
diff --git a/src/common/backend/utils/cache/lsyscache.cpp b/src/common/backend/utils/cache/lsyscache.cpp index 11d2eaaf5..42bf45e53 100644 --- a/src/common/backend/utils/cache/lsyscache.cpp +++ b/src/common/backend/utils/cache/lsyscache.cpp @@ -1430,6 +1430,16 @@ char func_volatile(Oid funcid) return result; } +/* + * func_parallel + * Given procedure id, return the function's proparallel flag. + */ +char func_parallel(Oid funcid) +{ + /* Now we treat all func as parallel safe */ + return PROPARALLEL_SAFE; +} + /* * get_func_proshippable * Given procedure id, return the function's proshippable flag. diff --git a/src/common/backend/utils/cache/relmapper.cpp b/src/common/backend/utils/cache/relmapper.cpp index 38c34164b..1f0d902fb 100644 --- a/src/common/backend/utils/cache/relmapper.cpp +++ b/src/common/backend/utils/cache/relmapper.cpp @@ -195,6 +195,13 @@ void RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, bool immedi (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot change relation mapping within subtransaction"))); } + + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot change relation mapping in parallel mode"))); + } + if (immediate) { /* Make it active, but only locally */ if (shared) { diff --git a/src/common/backend/utils/misc/guc.cpp b/src/common/backend/utils/misc/guc.cpp index d20bc1552..ab07a19aa 100644 --- a/src/common/backend/utils/misc/guc.cpp +++ b/src/common/backend/utils/misc/guc.cpp @@ -67,6 +67,7 @@ #include "parser/parser.h" #include "parser/scansup.h" #include "pgstat.h" +#include "postmaster/bgworker_internals.h" #include "workload/workload.h" #include "pgaudit.h" #include "instruments/instr_unique_sql.h" @@ -910,6 +911,19 @@ static const struct config_enum_entry synchronous_commit_options[] = {{"local", {"remote_apply", SYNCHRONOUS_COMMIT_REMOTE_REPLAY, false}, {NULL, 0, false}}; +static const struct config_enum_entry force_parallel_mode_options[] = { + {"off", FORCE_PARALLEL_OFF, false}, + {"on", FORCE_PARALLEL_ON, false}, + {"regress", FORCE_PARALLEL_REGRESS, false}, + {"true", FORCE_PARALLEL_ON, true}, + {"false", FORCE_PARALLEL_OFF, true}, + {"yes", FORCE_PARALLEL_ON, true}, + {"no", FORCE_PARALLEL_OFF, true}, + {"1", FORCE_PARALLEL_ON, true}, + {"0", FORCE_PARALLEL_OFF, true}, + {NULL, 0, false} +}; + static const struct config_enum_entry plan_cache_mode_options[] = { {"auto", PLAN_CACHE_MODE_AUTO, false}, {"force_generic_plan", PLAN_CACHE_MODE_FORCE_GENERIC_PLAN, false}, @@ -4549,6 +4563,20 @@ static void init_configure_names_bool() NULL, NULL }, + { + { + "parallel_leader_participation", + PGC_USERSET, + RESOURCES_ASYNCHRONOUS, + gettext_noop("Controls whether Gather and Gather Merge also run subplans."), + gettext_noop("Should gather nodes also run subplans, or just gather tuples?") + }, + &u_sess->attr.attr_sql.parallel_leader_participation, + true, + NULL, + NULL, + NULL + }, /* End-of-list marker */ { { @@ -7304,6 +7332,24 @@ static void init_configure_names_int() NULL, NULL }, + { + { + "min_parallel_table_scan_size", + PGC_USERSET, + QUERY_TUNING_COST, + gettext_noop("Sets the minimum amount of table data for a parallel scan."), + gettext_noop("If the planner estimates that it will read a number of table " + "pages too small to reach this limit, a parallel scan will not be considered."), + GUC_UNIT_BLOCKS, + }, + &u_sess->attr.attr_sql.min_parallel_table_scan_size, + (8 * 1024 * 1024) / BLCKSZ, + 0, + INT_MAX / 3, + NULL, + NULL, + NULL + }, { /* Can't be set in postgresql.conf 
*/ { @@ -9183,6 +9229,38 @@ static void init_configure_names_int() NULL, NULL }, + { + { + "max_parallel_workers", + PGC_USERSET, + RESOURCES_ASYNCHRONOUS, + gettext_noop("Sets the maximum number of parallel workers that can be active at one time."), + NULL + }, + &g_instance.attr.attr_common.max_parallel_workers, + 8, + 0, + MAX_PARALLEL_WORKER_LIMIT, + NULL, + NULL, + NULL + }, + { + { + "max_parallel_workers_per_gather", + PGC_USERSET, + RESOURCES_ASYNCHRONOUS, + gettext_noop("Sets the maximum number of parallel processes per executor node."), + NULL + }, + &g_instance.attr.attr_common.max_parallel_workers_per_gather, + 2, + 0, + MAX_PARALLEL_WORKER_LIMIT, + NULL, + NULL, + NULL + }, /* End-of-list marker */ { { @@ -9332,6 +9410,40 @@ static void init_configure_names_real() NULL }, #endif + { + { + "parallel_tuple_cost", + PGC_USERSET, + QUERY_TUNING_COST, + gettext_noop("Sets the planner's estimate of the cost of " + "passing each tuple (row) from worker to master backend."), + NULL + }, + &u_sess->attr.attr_sql.parallel_tuple_cost, + DEFAULT_PARALLEL_TUPLE_COST, + 0, + DBL_MAX, + NULL, + NULL, + NULL + }, + { + { + "parallel_setup_cost", + PGC_USERSET, + QUERY_TUNING_COST, + gettext_noop("Sets the planner's estimate of the cost of " + "starting up worker processes for parallel query."), + NULL + }, + &u_sess->attr.attr_sql.parallel_setup_cost, + DEFAULT_PARALLEL_SETUP_COST, + 0, + DBL_MAX, + NULL, + NULL, + NULL + }, { { "cursor_tuple_fraction", @@ -11735,6 +11847,21 @@ static void init_configure_names_enum() NULL }, #endif + { + { + "force_parallel_mode", + PGC_USERSET, + QUERY_TUNING_OTHER, + gettext_noop("Forces use of parallel query facilities."), + gettext_noop("If possible, run query using a parallel worker and with parallel restrictions.") + }, + &u_sess->attr.attr_sql.force_parallel_mode, + FORCE_PARALLEL_OFF, + force_parallel_mode_options, + NULL, + NULL, + NULL + }, { { "plan_cache_mode", @@ -14667,6 +14794,20 @@ int set_config_option(const char* name, const char* value, GucContext context, G } } + /* + * GUC_ACTION_SAVE changes are acceptable during a parallel operation, + * because the current worker will also pop the change. We're probably + * dealing with a function having a proconfig entry. Only the function's + * body should observe the change, and peer workers do not share in the + * execution of a function call started by this worker. + * + * Other changes might need to affect other workers, so forbid them. + */ + if (IsInParallelMode() && changeVal && action != GUC_ACTION_SAVE) { + ereport(elevel, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot set parameters during a parallel operation"))); + } + record = find_option(name, true, elevel); if (record == NULL) { ereport( @@ -15901,6 +16042,15 @@ void ExecSetVariableStmt(VariableSetStmt* stmt) char* passwd = NULL; ListCell* phead = NULL; + /* + * Workers synchronize these parameters at the start of the parallel + * operation; then, we block SET during the operation. 
+ */ + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot set parameters during a parallel operation"))); + } + switch (stmt->kind) { case VAR_SET_VALUE: case VAR_SET_CURRENT: diff --git a/src/common/backend/utils/time/snapmgr.cpp b/src/common/backend/utils/time/snapmgr.cpp index b7d53dad2..0fbca1687 100755 --- a/src/common/backend/utils/time/snapmgr.cpp +++ b/src/common/backend/utils/time/snapmgr.cpp @@ -101,6 +101,26 @@ static Snapshot CopySnapshot(Snapshot snapshot); static void FreeSnapshot(Snapshot snapshot); static void SnapshotResetXmin(void); +/* + * Snapshot fields to be serialized. + * + * Only these fields need to be sent to the cooperating backend; the + * remaining ones can (and must) set by the receiver upon restore. + */ +typedef struct SerializedSnapshotData { + TransactionId xmin; + TransactionId xmax; + uint32 xcnt; + int32 subxcnt; + bool suboverflowed; + bool takenDuringRecovery; + CommandId curcid; + GTM_Timeline timeline; + CommitSeqNo snapshotcsn; + SnapshotType snapshot_type; +} SerializedSnapshotData; + + /* * GetTransactionSnapshot * Get the appropriate snapshot for a new query in a transaction. @@ -128,6 +148,12 @@ Snapshot GetTransactionSnapshot(bool force_local_snapshot) Assert(u_sess->utils_cxt.RegisteredSnapshots == 0); Assert(u_sess->utils_cxt.FirstXactSnapshot == NULL); + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot take query snapshot during a parallel operation"))); + } + /* * In transaction-snapshot mode, the first snapshot must live until * end of xact regardless of what the caller does with it, so we must @@ -230,6 +256,15 @@ void StreamTxnContextSetMyPgXactXmin(TransactionId xmin) */ Snapshot GetLatestSnapshot(void) { + /* + * We might be able to relax this, but nothing that could otherwise work + * needs it. + */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot update SecondarySnapshot during a parallel operation"))); + } + /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if @@ -324,7 +359,7 @@ void SnapshotSetCommandId(CommandId curcid) * must take care of all the same considerations as the first-snapshot case * in GetTransactionSnapshot. */ -static void SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) +static void SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid, PGPROC *sourceproc) { /* Caller should have checked this already */ Assert(!u_sess->utils_cxt.FirstSnapshotSet); @@ -351,6 +386,28 @@ static void SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) u_sess->utils_cxt.CurrentSnapshot->timeline = sourcesnap->timeline; u_sess->utils_cxt.CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery; + /* + * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and + * TransactionXmin. There is a race condition: to make sure we are not + * causing the global xmin to go backwards, we have to test that the + * source transaction is still running, and that has to be done + * atomically. So let procarray.c do it. + * + * Note: in serializable mode, predicate.c will do this a second time. It + * doesn't seem worth contorting the logic here to avoid two calls, + * especially since it's not clear that predicate.c *must* do this. 
+ */ + if (sourceproc != NULL) { + if (!ProcArrayInstallRestoredXmin(u_sess->utils_cxt.CurrentSnapshot->xmin, sourceproc)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not import the requested snapshot"), + errdetail("The source transaction is not running anymore."))); + } else if (!ProcArrayInstallImportedXmin(u_sess->utils_cxt.CurrentSnapshot->xmin, sourcexid)) { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not import the requested snapshot"), + errdetail("The source transaction %lu is not running anymore.", sourcexid))); + } + /* * NB: curcid should NOT be copied, it's a local matter * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and @@ -523,6 +580,21 @@ void UpdateActiveSnapshotCommandId(void) Assert(u_sess->utils_cxt.ActiveSnapshot->as_snap->active_count == 1); Assert(u_sess->utils_cxt.ActiveSnapshot->as_snap->regd_count == 0); + /* + * Don't allow modification of the active snapshot during parallel + * operation. We share the snapshot to worker backends at the beginning + * of parallel operation, so any change to the snapshot can lead to + * inconsistencies. We have other defenses against + * CommandCounterIncrement, but there are a few places that call this + * directly, so we put an additional guard here. + */ + CommandId save_curcid = u_sess->utils_cxt.ActiveSnapshot->as_snap->curcid; + CommandId curcid = GetCurrentCommandId(false); + if (IsInParallelMode() && save_curcid != curcid) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot modify commandid in active snapshot during a parallel operation"))); + } + u_sess->utils_cxt.ActiveSnapshot->as_snap->curcid = GetCurrentCommandId(false); } @@ -1267,7 +1339,7 @@ void ImportSnapshot(const char* idstr) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot import a snapshot from a different database"))); /* OK, install the snapshot */ - SetTransactionSnapshot(&snapshot, src_xid); + SetTransactionSnapshot(&snapshot, src_xid, NULL); } /* @@ -1380,3 +1452,148 @@ HTAB* HistoricSnapshotGetTupleCids(void) Assert(HistoricSnapshotActive()); return u_sess->utils_cxt.tuplecid_data; } + +/* + * EstimateSnapshotSpace + * Returns the size need to store the given snapshot. + * + * We are exporting only required fields from the Snapshot, stored in + * SerializedSnapshotData. + */ +Size EstimateSnapshotSpace(Snapshot snap) +{ + Assert(snap != InvalidSnapshot); + Assert(snap->satisfies == HeapTupleSatisfiesMVCC); + + /* We allocate any XID arrays needed in the same palloc block. */ + Size size = add_size(sizeof(SerializedSnapshotData), mul_size(snap->xcnt, sizeof(TransactionId))); + if (snap->subxcnt > 0 && (!snap->suboverflowed || snap->takenDuringRecovery)) { + size = add_size(size, mul_size((Size)snap->subxcnt, sizeof(TransactionId))); + } + + return size; +} + +/* + * SerializeSnapshot + * Dumps the serialized snapshot (extracted from given snapshot) onto the + * memory location at start_address. 
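/*
 * A minimal usage sketch, not from this patch: the snapshot
 * estimate/serialize/restore cycle.  In the patch the buffer lives in the
 * parallel DSM segment and a worker then installs the result with
 * RestoreTransactionSnapshot(snap, master_pgproc); here the buffer is simply
 * palloc'd so the flow stays visible.  The function name is hypothetical.
 */
#include "postgres.h"
#include "utils/snapmgr.h"

static Snapshot roundtrip_snapshot(Snapshot snap)
{
    Size len = EstimateSnapshotSpace(snap);
    char *buf = (char *)palloc(len);

    /* Leader side: copy only the fields listed in SerializedSnapshotData. */
    SerializeSnapshot(snap, buf, len);

    /* Worker side: rebuild a copied snapshot in TopTransactionContext. */
    Snapshot restored = RestoreSnapshot(buf, len);

    pfree(buf);
    return restored;
}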
+ */ +void SerializeSnapshot(Snapshot snapshot, char *start_address, Size len) +{ + Assert(snapshot->subxcnt >= 0); + + SerializedSnapshotData *serialized_snapshot = (SerializedSnapshotData *)start_address; + int rc; + + /* Copy all required fields */ + serialized_snapshot->xmin = snapshot->xmin; + serialized_snapshot->xmax = snapshot->xmax; + serialized_snapshot->xcnt = snapshot->xcnt; + serialized_snapshot->subxcnt = snapshot->subxcnt; + serialized_snapshot->suboverflowed = snapshot->suboverflowed; + serialized_snapshot->takenDuringRecovery = snapshot->takenDuringRecovery; + serialized_snapshot->curcid = snapshot->curcid; + serialized_snapshot->timeline = snapshot->timeline; + serialized_snapshot->snapshotcsn = snapshot->snapshotcsn; + serialized_snapshot->snapshot_type = snapshot->snapshot_type; + + /* + * Ignore the SubXID array if it has overflowed, unless the snapshot was + * taken during recovey - in that case, top-level XIDs are in subxip as + * well, and we mustn't lose them. + */ + if (serialized_snapshot->suboverflowed && !snapshot->takenDuringRecovery) + serialized_snapshot->subxcnt = 0; + + /* Copy XID array */ + if (snapshot->xcnt > 0) { + rc = memcpy_s((TransactionId *)(serialized_snapshot + 1), len - 1, + snapshot->xip, snapshot->xcnt * sizeof(TransactionId)); + securec_check_c(rc, "", ""); + } + + /* + * Copy SubXID array. Don't bother to copy it if it had overflowed, + * though, because it's not used anywhere in that case. Except if it's a + * snapshot taken during recovery; all the top-level XIDs are in subxip as + * well in that case, so we mustn't lose them. + */ + if (snapshot->subxcnt > 0) { + Size subxipoff = sizeof(SerializedSnapshotData) + snapshot->xcnt * sizeof(TransactionId); + + rc = memcpy_s(((char *)serialized_snapshot + subxipoff), len - subxipoff, snapshot->subxip, + snapshot->subxcnt * sizeof(TransactionId)); + securec_check_c(rc, "", ""); + } +} + +/* + * RestoreSnapshot + * Restore a serialized snapshot from the specified address. + * + * The copy is palloc'd in TopTransactionContext and has initial refcounts set + * to 0. The returned snapshot has the copied flag set. + */ +Snapshot RestoreSnapshot(char *start_address, Size len) +{ + SerializedSnapshotData *serialized_snapshot = (SerializedSnapshotData*)start_address; + TransactionId *serialized_xids = (TransactionId*)(start_address + sizeof(SerializedSnapshotData)); + + /* We allocate any XID arrays needed in the same palloc block. */ + Size size = sizeof(SnapshotData) + serialized_snapshot->xcnt * sizeof(TransactionId) + + serialized_snapshot->subxcnt * sizeof(TransactionId); + + /* Copy all required fields */ + Snapshot snapshot = (Snapshot)MemoryContextAlloc(u_sess->top_transaction_mem_cxt, size); + snapshot->satisfies = HeapTupleSatisfiesMVCC; + snapshot->xmin = serialized_snapshot->xmin; + snapshot->xmax = serialized_snapshot->xmax; + snapshot->xip = NULL; + snapshot->xcnt = serialized_snapshot->xcnt; + snapshot->subxip = NULL; + snapshot->subxcnt = serialized_snapshot->subxcnt; + snapshot->suboverflowed = serialized_snapshot->suboverflowed; + snapshot->takenDuringRecovery = serialized_snapshot->takenDuringRecovery; + snapshot->curcid = serialized_snapshot->curcid; + snapshot->user_data = NULL; + snapshot->timeline = serialized_snapshot->timeline; + snapshot->snapshotcsn = serialized_snapshot->snapshotcsn; + snapshot->snapshot_type = serialized_snapshot->snapshot_type; + + /* Copy XIDs, if present. 
*/ + int rc; + Size remainLen = len - sizeof(SerializedSnapshotData); + if (serialized_snapshot->xcnt > 0) { + snapshot->xip = (TransactionId *)(snapshot + 1); + rc = memcpy_s(snapshot->xip, remainLen, serialized_xids, serialized_snapshot->xcnt * sizeof(TransactionId)); + remainLen -= serialized_snapshot->xcnt * sizeof(TransactionId); + securec_check_c(rc, "", ""); + } + + /* Copy SubXIDs, if present. */ + if (serialized_snapshot->subxcnt > 0) { + snapshot->subxip = snapshot->xip + serialized_snapshot->xcnt; + rc = memcpy_s(snapshot->subxip, remainLen, serialized_xids + serialized_snapshot->xcnt, + serialized_snapshot->subxcnt * sizeof(TransactionId)); + securec_check_c(rc, "", ""); + } + + /* Set the copied flag so that the caller will set refcounts correctly. */ + snapshot->regd_count = 0; + snapshot->active_count = 0; + snapshot->copied = true; + + return snapshot; +} + +/* + * Install a restored snapshot as the transaction snapshot. + * + * The second argument is of type void * so that snapmgr.h need not include + * the declaration for PGPROC. + */ +void RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc) +{ + SetTransactionSnapshot(snapshot, InvalidTransactionId, (PGPROC *)master_pgproc); +} diff --git a/src/common/pl/plpgsql/src/pl_exec.cpp b/src/common/pl/plpgsql/src/pl_exec.cpp index 082a16bfe..7f7e8b156 100755 --- a/src/common/pl/plpgsql/src/pl_exec.cpp +++ b/src/common/pl/plpgsql/src/pl_exec.cpp @@ -150,7 +150,8 @@ static void exec_eval_datum( static int exec_eval_integer(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, bool* isNull); static bool exec_eval_boolean(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, bool* isNull); static Datum exec_eval_expr(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, bool* isNull, Oid* rettype); -static int exec_run_select(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, long maxtuples, Portal* portalP); +static int exec_run_select(PLpgSQL_execstate *estate, PLpgSQL_expr *expr, long maxtuples, Portal *portalP, + bool parallelOK); static int exec_for_query(PLpgSQL_execstate* estate, PLpgSQL_stmt_forq* stmt, Portal portal, bool prefetch_ok, int dno); static ParamListInfo setup_param_list(PLpgSQL_execstate* estate, PLpgSQL_expr* expr); static void plpgsql_param_fetch(ParamListInfo params, int paramid); @@ -2164,7 +2165,7 @@ static int exec_stmt_perform(PLpgSQL_execstate* estate, PLpgSQL_stmt_perform* st if (!RecoveryInProgress()) oldTransactionId = GetTopTransactionId(); - rc = exec_run_select(estate, expr, 0, NULL); + rc = exec_run_select(estate, expr, 0, NULL, true); if (rc != SPI_OK_SELECT) { ereport(DEBUG1, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmodule(MOD_PLSQL), errmsg("exec_run_select returns %d", rc))); @@ -2708,7 +2709,7 @@ static int exec_stmt_fors(PLpgSQL_execstate* estate, PLpgSQL_stmt_fors* stmt) /* * Open the implicit cursor for the statement using exec_run_select */ - rc = exec_run_select(estate, stmt->query, 0, &portal); + rc = exec_run_select(estate, stmt->query, 0, &portal, false); if (rc != SPI_OK_SELECT) { ereport(DEBUG1, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmodule(MOD_PLSQL), errmsg("exec_run_select returns %d", rc))); @@ -3206,7 +3207,7 @@ static int exec_stmt_return(PLpgSQL_execstate* estate, PLpgSQL_stmt_return* stmt if (stmt->expr != NULL) { if (estate->retistuple) { - exec_run_select(estate, stmt->expr, 1, NULL); + exec_run_select(estate, stmt->expr, 1, NULL, true); if (estate->eval_processed > 0) { estate->retval = PointerGetDatum(estate->eval_tuptable->vals[0]); estate->rettupdesc = 
estate->eval_tuptable->tupdesc; @@ -3398,11 +3399,11 @@ static int exec_stmt_return_query(PLpgSQL_execstate* estate, PLpgSQL_stmt_return if (stmt->query != NULL) { /* static query */ - exec_run_select(estate, stmt->query, 0, &portal); + exec_run_select(estate, stmt->query, 0, &portal, true); } else { /* RETURN QUERY EXECUTE */ AssertEreport(stmt->dynquery != NULL, MOD_PLSQL, "stmt's dynamic query is required."); - portal = exec_dynquery_with_params(estate, stmt->dynquery, stmt->params, NULL, 0); + portal = exec_dynquery_with_params(estate, stmt->dynquery, stmt->params, NULL, CURSOR_OPT_PARALLEL_OK); } tupmap = convert_tuples_by_position( @@ -6222,7 +6223,7 @@ static Datum exec_eval_expr(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, bool* /* * Else do it the hard way via exec_run_select */ - rc = exec_run_select(estate, expr, 2, NULL); + rc = exec_run_select(estate, expr, 2, NULL, false); if (rc != SPI_OK_SELECT) { ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -6279,7 +6280,8 @@ static Datum exec_eval_expr(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, bool* * exec_run_select Execute a select query * ---------- */ -static int exec_run_select(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, long maxtuples, Portal* portalP) +static int exec_run_select(PLpgSQL_execstate *estate, PLpgSQL_expr *expr, long maxtuples, Portal *portalP, + bool parallelOK) { ParamListInfo paramLI; int rc; @@ -6288,7 +6290,7 @@ static int exec_run_select(PLpgSQL_execstate* estate, PLpgSQL_expr* expr, long m * On the first call for this expression generate the plan */ if (expr->plan == NULL) { - exec_prepare_plan(estate, expr, 0); + exec_prepare_plan(estate, expr, parallelOK ? CURSOR_OPT_PARALLEL_OK : 0); } /* @@ -6711,6 +6713,11 @@ static ParamListInfo setup_param_list(PLpgSQL_execstate* estate, PLpgSQL_expr* e paramLI->parserSetupArg = (void*)expr; paramLI->params_need_process = false; paramLI->numParams = estate->ndatums; + /* + * Allow parameters that aren't needed by this expression to be + * ignored. 
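+ * Only the datums named in expr->paramnos are live for this expression;
+ * exposing that set through paramMask lets later consumers (for example,
+ * code that copies or serializes the parameter list for a parallel
+ * worker) skip the rest.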
+ */ + paramLI->paramMask = expr->paramnos; /* Instantiate values for "safe" parameters of the expression */ tmpset = bms_copy(expr->paramnos); diff --git a/src/gausskernel/optimizer/commands/async.cpp b/src/gausskernel/optimizer/commands/async.cpp index 8dad9eb7e..af623b780 100755 --- a/src/gausskernel/optimizer/commands/async.cpp +++ b/src/gausskernel/optimizer/commands/async.cpp @@ -115,6 +115,7 @@ #include #include +#include "access/parallel.h" #include "access/slru.h" #include "access/transam.h" #include "access/xact.h" @@ -446,6 +447,10 @@ void Async_Notify(const char* channel, const char* payload) Notification* n = NULL; MemoryContext oldcontext; + if (IsParallelWorker()) { + elog(ERROR, "cannot send notifications from a parallel worker"); + } + if (u_sess->attr.attr_common.Trace_notify) { elog(DEBUG1, "Async_Notify(%s)", channel); } diff --git a/src/gausskernel/optimizer/commands/copy.cpp b/src/gausskernel/optimizer/commands/copy.cpp index 5c0e74c12..a53689332 100644 --- a/src/gausskernel/optimizer/commands/copy.cpp +++ b/src/gausskernel/optimizer/commands/copy.cpp @@ -983,8 +983,10 @@ uint64 DoCopy(CopyStmt* stmt, const char* queryString) Assert(rel); /* check read-only transaction */ - if (u_sess->attr.attr_common.XactReadOnly && !RELATION_IS_TEMP(rel)) + if (u_sess->attr.attr_common.XactReadOnly && !RELATION_IS_TEMP(rel)) { PreventCommandIfReadOnly("COPY FROM"); + } + PreventCommandIfParallelMode("COPY FROM"); /* set write for backend status for the thread, we will use it to check default transaction readOnly */ pgstat_set_stmt_tag(STMTTAG_WRITE); diff --git a/src/gausskernel/optimizer/commands/explain.cpp b/src/gausskernel/optimizer/commands/explain.cpp index 1b78fd1e6..f3b0eb8b1 100755 --- a/src/gausskernel/optimizer/commands/explain.cpp +++ b/src/gausskernel/optimizer/commands/explain.cpp @@ -661,7 +661,7 @@ static void ExplainOneQuery( PlannedStmt* plan = NULL; /* plan the query */ - plan = pg_plan_query(query, 0, params, true); + plan = pg_plan_query(query, CURSOR_OPT_PARALLEL_OK, params, true); /* run it (if needed) and produce output */ ExplainOnePlan(plan, into, es, queryString, params); @@ -1790,6 +1790,9 @@ static void ExplainNode( appendStringInfoString(es->str, "-> "); es->indent += 2; } + if (plan->parallel_aware) { + appendStringInfoString(es->str, "Parallel "); + } appendStringInfoString(es->str, pname); es->indent++; @@ -1805,6 +1808,9 @@ static void ExplainNode( ExplainPropertyText("Parent Relationship", relationship, es); if (plan_name != NULL) ExplainPropertyText("Subplan Name", plan_name, es); + if (plan->parallel_aware) { + ExplainPropertyText("Parallel Aware", "true", es); + } } switch (nodeTag(plan)) { @@ -2379,6 +2385,16 @@ static void ExplainNode( show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_llvm_info(planstate, es); break; + case T_Gather: { + Gather *gather = (Gather *)plan; + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); + ExplainPropertyInteger("Number of Workers", gather->num_workers, es); + if (gather->single_copy) + ExplainPropertyText("Single Copy", gather->single_copy ? 
"true" : "false", es); + break; + } case T_DfsScan: { show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); show_pushdown_qual(planstate, ancestors, es, PUSHDOWN_PREDICATE_FLAG); diff --git a/src/gausskernel/optimizer/commands/extension.cpp b/src/gausskernel/optimizer/commands/extension.cpp index 01a785876..ebed784e8 100755 --- a/src/gausskernel/optimizer/commands/extension.cpp +++ b/src/gausskernel/optimizer/commands/extension.cpp @@ -625,7 +625,7 @@ static void execute_sql_string(const char* sql, const char* filename) * We use a null string query_string here to avoid this. */ stmt_list = pg_analyze_and_rewrite(parsetree, query_string, NULL, 0); - stmt_list = pg_plan_queries(stmt_list, 0, NULL); + stmt_list = pg_plan_queries(stmt_list, CURSOR_OPT_PARALLEL_OK, NULL); foreach (lc2, stmt_list) { Node* stmt = (Node*)lfirst(lc2); diff --git a/src/gausskernel/optimizer/commands/prepare.cpp b/src/gausskernel/optimizer/commands/prepare.cpp index d37f3c3d4..a0a2f0599 100755 --- a/src/gausskernel/optimizer/commands/prepare.cpp +++ b/src/gausskernel/optimizer/commands/prepare.cpp @@ -400,6 +400,7 @@ static ParamListInfo EvaluateParams(PreparedStatement* pstmt, List* params, cons paramLI->parserSetupArg = NULL; paramLI->params_need_process = false; paramLI->numParams = num_params; + paramLI->paramMask = NULL; i = 0; foreach (l, exprstates) { diff --git a/src/gausskernel/optimizer/commands/sequence.cpp b/src/gausskernel/optimizer/commands/sequence.cpp index bf4950867..133126559 100755 --- a/src/gausskernel/optimizer/commands/sequence.cpp +++ b/src/gausskernel/optimizer/commands/sequence.cpp @@ -962,6 +962,14 @@ static int64 nextval_internal(Oid relid) /* read-only transactions may only modify temp sequences */ if (!is_use_local_seq) PreventCommandIfReadOnly("nextval()"); + + /* + * Forbid this during parallel operation because, to make it work, the + * cooperating backends would need to share the backend-local cached + * sequence information. Currently, we don't support that. + */ + PreventCommandIfParallelMode("nextval()"); + if (elm->last != elm->cached) { /* some numbers were cached */ Assert(elm->last_valid); @@ -1361,6 +1369,13 @@ static void do_setval(Oid relid, int64 next, bool iscalled) PreventCommandIfReadOnly("setval()"); #endif + /* + * Forbid this during parallel operation because, to make it work, the + * cooperating backends would need to share the backend-local cached + * sequence information. Currently, we don't support that. 
+ */ + PreventCommandIfParallelMode("setval()"); + /* lock page' buffer and read tuple */ GTM_UUID uuid; seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple, &uuid); diff --git a/src/gausskernel/optimizer/commands/variable.cpp b/src/gausskernel/optimizer/commands/variable.cpp index 62cfb3c5a..d088d757a 100755 --- a/src/gausskernel/optimizer/commands/variable.cpp +++ b/src/gausskernel/optimizer/commands/variable.cpp @@ -19,6 +19,7 @@ #include +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/pg_authid.h" @@ -502,7 +503,8 @@ const char* show_log_timezone(void) */ bool check_transaction_read_only(bool* newval, void** extra, GucSource source) { - if (*newval == false && u_sess->attr.attr_common.XactReadOnly && IsTransactionState()) { + if (*newval == false && u_sess->attr.attr_common.XactReadOnly && IsTransactionState() && + !t_thrd.bgworker_cxt.InitializingParallelWorker) { /* Can't go to r/w mode inside a r/o transaction */ if (IsSubTransaction()) { GUC_check_errcode(ERRCODE_ACTIVE_SQL_TRANSACTION); @@ -763,6 +765,28 @@ void assign_client_encoding(const char* newval, void* extra) { int encoding = *((int*)extra); + /* + * Parallel workers send data to the leader, not the client. They always + * send data using the database encoding. + */ + if (IsParallelWorker()) { + /* + * During parallel worker startup, we want to accept the leader's + * client_encoding setting so that anyone who looks at the value in + * the worker sees the same value that they would see in the leader. + */ + if (t_thrd.bgworker_cxt.InitializingParallelWorker) + return; + + /* + * A change other than during startup, for example due to a SET clause + * attached to a function definition, should be rejected, as there is + * nothing we can do inside the worker to make it take effect. + */ + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot change client_encoding during a parallel operation"))); + } + /* We do not expect an error if PrepareClientEncoding succeeded */ if (SetClientEncoding(encoding) < 0) elog(LOG, "SetClientEncoding(%d) failed", encoding); @@ -895,9 +919,11 @@ bool check_role(char** newval, void** extra, GucSource source) } /* - * Verify that session user is allowed to become this role + * Verify that session user is allowed to become this role, but skip + * this in parallel mode, where we must blindly recreate the parallel + * leader's state. 
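+ * InitializingParallelWorker is only true while a worker is replaying the
+ * leader's GUC state during startup; a SET ROLE issued later in the worker
+ * is still subject to the normal membership check.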
*/ - if (!is_member_of_role(GetSessionUserId(), roleid)) { + if (!t_thrd.bgworker_cxt.InitializingParallelWorker && !is_member_of_role(GetSessionUserId(), roleid)) { GUC_check_errcode(ERRCODE_INSUFFICIENT_PRIVILEGE); GUC_check_errmsg("permission denied to set role \"%s\"", *newval); return false; diff --git a/src/gausskernel/optimizer/path/allpaths.cpp b/src/gausskernel/optimizer/path/allpaths.cpp index 9b1fab1cc..de14a87c0 100755 --- a/src/gausskernel/optimizer/path/allpaths.cpp +++ b/src/gausskernel/optimizer/path/allpaths.cpp @@ -22,6 +22,7 @@ #include "catalog/pg_class.h" #include "catalog/pg_partition.h" #include "catalog/pg_partition_fn.h" +#include "catalog/pg_proc.h" #include "foreign/fdwapi.h" #include "nodes/nodeFuncs.h" #include "nodes/pg_list.h" @@ -65,6 +66,7 @@ static void set_rel_pathlist(PlannerInfo* root, RelOptInfo* rel, Index rti, Rang static void set_plain_rel_size(PlannerInfo* root, RelOptInfo* rel, RangeTblEntry* rte); static void set_tablesample_rel_size(PlannerInfo* root, RelOptInfo* rel, RangeTblEntry* rte); static void set_plain_rel_pathlist(PlannerInfo* root, RelOptInfo* rel, RangeTblEntry* rte); +static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_foreign_size(PlannerInfo* root, RelOptInfo* rel, RangeTblEntry* rte); static void set_foreign_pathlist(PlannerInfo* root, RelOptInfo* rel, RangeTblEntry* rte); static void set_append_rel_size(PlannerInfo* root, RelOptInfo* rel, Index rti, RangeTblEntry* rte); @@ -264,6 +266,16 @@ static void set_base_rel_sizes(PlannerInfo* root) if (rel->reloptkind != RELOPT_BASEREL) continue; + /* + * If parallelism is allowable for this query in general, see whether + * it's allowable for this rel in particular. We have to do this + * before set_rel_size, because that if this is an inheritance parent, + * set_append_rel_size will pass the consider_parallel flag down to + * inheritance children. + */ + if (root->glob->parallelModeOK) + set_rel_consider_parallel(root, rel, root->simple_rte_array[rti]); + set_rel_size(root, rel, (Index)rti, root->simple_rte_array[rti]); /* Try inlist2join optimization */ @@ -812,6 +824,7 @@ static void set_plain_rel_pathlist(PlannerInfo* root, RelOptInfo* rel, RangeTblE List* quals = NIL; bool has_vecengine_unsupport_expr = false; ListCell* lc = NULL; + int parallel_threshold = u_sess->attr.attr_sql.min_parallel_table_scan_size; #ifdef PGXC bool isrp = create_plainrel_rqpath(root, rel, rte); @@ -871,8 +884,39 @@ static void set_plain_rel_pathlist(PlannerInfo* root, RelOptInfo* rel, RangeTblE } case REL_ROW_ORIENTED: { add_path(root, rel, create_seqscan_path(root, rel, NULL)); - if (can_parallel) + if (can_parallel) { add_path(root, rel, create_seqscan_path(root, rel, NULL, u_sess->opt_cxt.query_dop)); + } + + /* Consider parallel sequential scan */ + if (rel->consider_parallel && rel->pages > parallel_threshold) { + Path *path; + int parallel_degree = 1; + + /* + * Limit the degree of parallelism logarithmically based on the size + * of the relation. This probably needs to be a good deal more + * sophisticated, but we need something here for now. 
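+ * Concretely, each additional worker requires the relation to be another
+ * factor of three larger than the current threshold, so the chosen degree
+ * grows logarithmically with table size until it reaches
+ * max_parallel_workers_per_gather. (A small standalone illustration of
+ * this rule appears after the executor Makefile hunk below.)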
+ */ + while (rel->pages > parallel_threshold * 3 && + parallel_degree < g_instance.attr.attr_common.max_parallel_workers_per_gather) { + parallel_degree++; + parallel_threshold *= 3; + if (parallel_threshold >= PG_INT32_MAX / 3) + break; + } + + /* + * Ideally we should consider postponing the gather operation until + * much later, after we've pushed joins and so on atop the parallel + * sequential scan path. But we don't have the infrastructure for + * that yet, so just do this for now. + */ + path = create_seqscan_path(root, rel, NULL, 1, parallel_degree); + path = (Path *)create_gather_path(root, rel, path, NULL, parallel_degree); + add_path(root, rel, path); + } + break; } default: { @@ -937,15 +981,123 @@ static void set_plain_rel_pathlist(PlannerInfo* root, RelOptInfo* rel, RangeTblE } } +/* + * If this relation could possibly be scanned from within a worker, then set + * the consider_parallel flag. The flag has previously been initialized to + * false, so we just bail out if it becomes clear that we can't safely set it. + */ +static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) +{ + /* Don't call this if parallelism is disallowed for the entire query. */ + Assert(root->glob->parallelModeOK); + + /* Don't call this for non-baserels. */ + Assert(rel->reloptkind == RELOPT_BASEREL); + + /* Assorted checks based on rtekind. */ + switch (rte->rtekind) { + case RTE_RELATION: + /* + * Currently, parallel workers can't access the leader's temporary + * tables. We could possibly relax this if the wrote all of its + * local buffers at the start of the query and made no changes + * thereafter (maybe we could allow hint bit changes), and if we + * taught the workers to read them. Writing a large number of + * temporary buffers could be expensive, though, and we don't have + * the rest of the necessary infrastructure right now anyway. So + * for now, bail out if we see a temporary table. + */ + if (get_rel_persistence(rte->relid) == RELPERSISTENCE_TEMP) + return; + + /* Don't support parallel for partitioned table. */ + if (rte->ispartrel) { + return; + } + + /* + * Table sampling can be pushed down to workers if the sample + * function and its arguments are safe. + */ + if (rte->tablesample != NULL) { + // TODO, try to use this: func_parallel(rte->tablesample->tsmhandler) + Oid proparallel = PROPARALLEL_SAFE; + + if (proparallel != PROPARALLEL_SAFE) + return; + if (has_parallel_hazard((Node *)rte->tablesample->args, false)) + return; + return; + } + break; + + case RTE_SUBQUERY: + /* + * Subplans currently aren't passed to workers. Even if they + * were, the subplan might be using parallelism internally, and + * we can't support nested Gather nodes at present. Finally, + * we don't have a good way of knowing whether the subplan + * involves any parallel-restricted operations. It would be + * nice to relax this restriction some day, but it's going to + * take a fair amount of work. + */ + return; + + case RTE_JOIN: + /* Shouldn't happen; we're only considering baserels here. */ + Assert(false); + return; + + case RTE_FUNCTION: + /* Check for parallel-restricted functions. */ + if (has_parallel_hazard(rte->funcexpr, false)) + return; + break; + + case RTE_VALUES: + /* + * The data for a VALUES clause is stored in the plan tree itself, + * so scanning it in a worker is fine. + */ + break; + + case RTE_CTE: + /* + * CTE tuplestores aren't shared among parallel workers, so we + * force all CTE scans to happen in the leader. 
Also, populating + * the CTE would require executing a subplan that's not available + * in the worker, might be parallel-restricted, and must get + * executed only once. + */ + return; + case RTE_REMOTE_DUMMY: + return; + } + + /* + * If there's anything in baserestrictinfo that's parallel-restricted, + * we give up on parallelizing access to this relation. We could consider + * instead postponing application of the restricted quals until we're + * above all the parallelism in the plan tree, but it's not clear that + * this would be a win in very many cases, and it might be tricky to make + * outer join clauses work correctly. + */ + if (has_parallel_hazard((Node *)rel->baserestrictinfo, false)) + return; + + /* We have a winner. */ + rel->consider_parallel = true; +} + /* * Description:add result operator over scan operator. And add * vector type scan's qual with unsupport expression in vector engine * to result operator * * Parameters: - * @in root: plannerinfo struct for current query level. - * @in rel: Per-relation information for planning/optimization. - * @in quals: filter condition + * @in root: plannerinfo struct for current query level. + * @in rel: Per-relation information for planning/optimization. + * @in quals: filter condition * * Return: void */ @@ -1147,6 +1299,9 @@ static void set_append_rel_size(PlannerInfo* root, RelOptInfo* rel, Index rti, R continue; } + /* Copy consider_parallel flag from parent. */ + childrel->consider_parallel = rel->consider_parallel; + /* * CE failed, so finish copying/modifying targetlist and join quals. * @@ -3025,6 +3180,9 @@ static void print_path(PlannerInfo* root, Path* path, int indent) case T_Unique: subpath = ((UniquePath*)path)->subpath; break; + case T_GatherPath: + subpath = ((GatherPath*)path)->subpath; + break; case T_NestLoop: join = true; break; diff --git a/src/gausskernel/optimizer/path/costsize.cpp b/src/gausskernel/optimizer/path/costsize.cpp index d38a1ee04..34907c8c1 100644 --- a/src/gausskernel/optimizer/path/costsize.cpp +++ b/src/gausskernel/optimizer/path/costsize.cpp @@ -11,6 +11,8 @@ * cpu_tuple_cost Cost of typical CPU time to process a tuple * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple * cpu_operator_cost Cost of CPU time to execute an operator or function + * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend + * parallel_setup_cost Cost of setting up shared memory for parallelism * * We expect that the kernel will typically do some amount of read-ahead * optimization; this in conjunction with seek costs means that seq_page_cost @@ -157,6 +159,7 @@ void init_plan_cost(Plan* plan) plan->pred_startup_time = -1.0; plan->pred_total_time = -1.0; plan->pred_max_memory = -1; + plan->parallel_aware = false; } static inline void get_info_from_rel( @@ -669,7 +672,7 @@ static void set_parallel_path_rows(Path* path) * 'baserel' is the relation to be scanned * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL */ -void cost_seqscan(Path* path, PlannerInfo* root, RelOptInfo* baserel, ParamPathInfo* param_info) +void cost_seqscan(Path* path, PlannerInfo* root, RelOptInfo* baserel, ParamPathInfo* param_info, int nworkers) { Cost startup_cost = 0; Cost run_cost = 0; @@ -703,6 +706,17 @@ void cost_seqscan(Path* path, PlannerInfo* root, RelOptInfo* baserel, ParamPathI cpu_per_tuple = u_sess->attr.attr_sql.cpu_tuple_cost + qpqual_cost.per_tuple; run_cost += cpu_per_tuple * RELOPTINFO_LOCAL_FIELD(root, baserel, tuples) / dop; + /* + * Primitive 
parallel cost model. Assume the leader will do half as much + * work as a regular worker, because it will also need to read the tuples + * returned by the workers when they percolate up to the gather ndoe. + * This is almost certainly not exactly the right way to model this, so + * this will probably need to be changed at some point... + */ + if (nworkers > 0) { + run_cost = run_cost / (nworkers + 0.5); + } + path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; path->stream_cost = 0; @@ -946,6 +960,36 @@ void cost_tsstorescan(Path *path, PlannerInfo *root, RelOptInfo *baserel) } } +/* + * cost_gather + * Determines and returns the cost of gather path. + * + * 'rel' is the relation to be operated upon + * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL + */ +void cost_gather(GatherPath *path, RelOptInfo *rel, ParamPathInfo *param_info) +{ + Cost startup_cost = 0; + Cost run_cost = 0; + + /* Mark the path with the correct row estimate */ + if (param_info) + path->path.rows = param_info->ppi_rows; + else + path->path.rows = rel->rows; + + startup_cost = path->subpath->startup_cost; + + run_cost = path->subpath->total_cost - path->subpath->startup_cost; + + /* Parallel setup and communication cost. */ + startup_cost += u_sess->attr.attr_sql.parallel_setup_cost; + run_cost += u_sess->attr.attr_sql.parallel_tuple_cost * path->path.rows; + + path->path.startup_cost = startup_cost; + path->path.total_cost = (startup_cost + run_cost); +} + /* * cost_index * Determines and returns the cost of scanning a relation using an index. diff --git a/src/gausskernel/optimizer/plan/createplan.cpp b/src/gausskernel/optimizer/plan/createplan.cpp index d45c1f38d..dd2f6a3a2 100644 --- a/src/gausskernel/optimizer/plan/createplan.cpp +++ b/src/gausskernel/optimizer/plan/createplan.cpp @@ -94,6 +94,7 @@ static CStoreScan* create_cstorescan_plan(PlannerInfo* root, Path* best_path, Li static DfsScan* create_dfsscan_plan(PlannerInfo* root, Path* best_path, List* tlist, List* scan_clauses, bool indexFlag = false, List* excludedCol = NIL, bool indexOnly = false); static TsStoreScan* create_tsstorescan_plan(PlannerInfo* root, Path* best_path, List* tlist, List* scan_clauses); +static Gather *create_gather_plan(PlannerInfo *root, GatherPath *best_path); static Scan* create_indexscan_plan( PlannerInfo* root, IndexPath* best_path, List* tlist, List* scan_clauses, bool indexonly); static BitmapHeapScan* create_bitmap_scan_plan( @@ -130,6 +131,7 @@ static Plan* setPartitionParam(PlannerInfo* root, Plan* plan, RelOptInfo* rel); static Plan* setBucketInfoParam(PlannerInfo* root, Plan* plan, RelOptInfo* rel); Plan* create_globalpartInterator_plan(PlannerInfo* root, PartIteratorPath* pIterpath); +static Gather *make_gather(List *qptlist, List *qpqual, int nworkers, bool single_copy, Plan *subplan); static IndexScan* make_indexscan(List* qptlist, List* qpqual, Index scanrelid, Oid indexid, List* indexqual, List* indexqualorig, List* indexorderby, List* indexorderbyorig, ScanDirection indexscandir); static IndexOnlyScan* make_indexonlyscan(List* qptlist, List* qpqual, Index scanrelid, Oid indexid, List* indexqual, @@ -385,6 +387,9 @@ static Plan* create_plan_recurse(PlannerInfo* root, Path* best_path) plan = create_stream_plan(root, (StreamPath*)best_path); break; #endif + case T_Gather: + plan = (Plan*)create_gather_plan(root, (GatherPath*)best_path); + break; default: { ereport(ERROR, (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), @@ -1811,6 +1816,34 @@ static bool 
relIsDeltaNode(PlannerInfo* root, RelOptInfo* relOptInfo) return isDelta; } +/* + * create_gather_plan + * + * Create a Gather plan for 'best_path' and (recursively) plans + * for its subpaths. + */ +static Gather *create_gather_plan(PlannerInfo *root, GatherPath *best_path) +{ + Index scan_relid = best_path->path.parent->relid; + Plan *subplan = create_plan_recurse(root, best_path->subpath); + + disuse_physical_tlist(subplan, best_path->subpath); + + Gather *gather_plan = make_gather(subplan->targetlist, NIL, + best_path->num_workers, best_path->single_copy, subplan); + + copy_path_costsize(&gather_plan->plan, &best_path->path); + +#ifdef STREAMPLAN + add_distribute_info(root, &gather_plan->plan, scan_relid, &(best_path->path), NULL); +#endif + + /* use parallel mode for parallel plans. */ + root->glob->parallelModeNeeded = true; + + return gather_plan; +} + /* * create_seqscan_plan * Returns a seqscan plan for the base relation scanned by 'best_path' @@ -5192,6 +5225,7 @@ static void copy_path_costsize(Plan* dest, Path* src) dest->plan_width = src->parent->width; dest->innerdistinct = src->innerdistinct; dest->outerdistinct = src->outerdistinct; + dest->parallel_aware = src->parallel_aware; } else { /* init the cost field directly */ init_plan_cost(dest); @@ -7522,6 +7556,22 @@ Unique* make_unique(Plan* lefttree, List* distinctList) return node; } +static Gather *make_gather(List *qptlist, List *qpqual, int nworkers, bool single_copy, Plan *subplan) +{ + Gather *node = makeNode(Gather); + Plan *plan = &node->plan; + + /* cost should be inserted by caller */ + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = subplan; + plan->righttree = NULL; + node->num_workers = nworkers; + node->single_copy = single_copy; + + return node; +} + /* * distinctList is a list of SortGroupClauses, identifying the targetlist * items that should be considered by the SetOp filter. The input path must diff --git a/src/gausskernel/optimizer/plan/planmain.cpp b/src/gausskernel/optimizer/plan/planmain.cpp index 97de26a37..5af9d9967 100755 --- a/src/gausskernel/optimizer/plan/planmain.cpp +++ b/src/gausskernel/optimizer/plan/planmain.cpp @@ -31,6 +31,7 @@ #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/planmain.h" +#include "optimizer/planner.h" #include "optimizer/randomplan.h" #include "optimizer/tlist.h" #include "utils/selfuncs.h" @@ -113,6 +114,9 @@ void query_planner(PlannerInfo* root, List* tlist, double tuple_fraction, double if (parse->jointree->fromlist == NIL) { /* We need a trivial path result */ *cheapest_path = (Path*)create_result_path((List*)parse->jointree->quals); + if (root->glob->parallelModeOK && u_sess->attr.attr_sql.force_parallel_mode != FORCE_PARALLEL_OFF) { + (*cheapest_path)->parallel_safe = !has_parallel_hazard(parse->jointree->quals, false); + } *sorted_path = NULL; /* diff --git a/src/gausskernel/optimizer/plan/planner.cpp b/src/gausskernel/optimizer/plan/planner.cpp index 415acf524..7e7718d96 100644 --- a/src/gausskernel/optimizer/plan/planner.cpp +++ b/src/gausskernel/optimizer/plan/planner.cpp @@ -19,6 +19,7 @@ #include #include +#include "access/parallel.h" #include "access/transam.h" #include "catalog/indexing.h" #include "catalog/pg_cast.h" @@ -438,6 +439,52 @@ PlannedStmt* standard_planner(Query* parse, int cursorOptions, ParamListInfo bou glob->bloomfilter.bloomfilter_index = -1; glob->bloomfilter.add_index = true; glob->estiopmem = esti_op_mem; + + /* + * Assess whether it's feasible to use parallel mode for this query. 
+ * We can't do this in a standalone backend, or if the command will + * try to modify any data, or if this is a cursor operation, or if + * GUCs are set to values that don't permit parallelism, or if + * parallel-unsafe functions are present in the query tree. + * + * For now, we don't try to use parallel mode if we're running inside + * a parallel worker. We might eventually be able to relax this + * restriction, but for now it seems best not to have parallel workers + * trying to create their own parallel workers. + * + * We can't use parallelism in serializable mode because the predicate + * locking code is not parallel-aware. It's not catastrophic if someone + * tries to run a parallel plan in serializable mode; it just won't get + * any workers and will run serially. But it seems like a good heuristic + * to assume that the same serialization level will be in effect at plan + * time and execution time, so don't generate a parallel plan if we're + * in serializable mode. + */ + glob->parallelModeOK = (cursorOptions & CURSOR_OPT_PARALLEL_OK) != 0 && IsUnderPostmaster && + parse->commandType == CMD_SELECT && !parse->hasModifyingCTE && parse->utilityStmt == NULL && + g_instance.attr.attr_common.max_parallel_workers_per_gather > 0 && !IsParallelWorker() && + !IsolationIsSerializable() && !has_parallel_hazard((Node *)parse, true); + + /* + * glob->parallelModeNeeded is normally set to false here and changed to + * true during plan creation if a Gather or Gather Merge plan is actually + * created (cf. create_gather_plan, create_gather_merge_plan). + * + * However, if force_parallel_mode = on or force_parallel_mode = regress, + * then we impose parallel mode whenever it's safe to do so, even if the + * final plan doesn't use parallelism. It's not safe to do so if the + * query contains anything parallel-unsafe; parallelModeOK will be false + * in that case. Note that parallelModeOK can't change after this point. + * Otherwise, everything in the query is either parallel-safe or + * parallel-restricted, and in either case it should be OK to impose + * parallel-mode restrictions. If that ends up breaking something, then + * either some function the user included in the query is incorrectly + * labelled as parallel-safe or parallel-restricted when in reality it's + * parallel-unsafe, or else the query planner itself has a bug. 
+ */ + glob->parallelModeNeeded = + glob->parallelModeOK && (u_sess->attr.attr_sql.force_parallel_mode != FORCE_PARALLEL_OFF); + if (IS_STREAM_PLAN) glob->vectorized = !vector_engine_preprocess_walker((Node*)parse, parse->rtable); else @@ -723,6 +770,7 @@ PlannedStmt* standard_planner(Query* parse, int cursorOptions, ParamListInfo bou result->canSetTag = parse->canSetTag; result->transientPlan = glob->transientPlan; result->dependsOnRole = glob->dependsOnRole; + result->parallelModeNeeded = glob->parallelModeNeeded; result->planTree = top_plan; result->rtable = glob->finalrtable; result->resultRelations = glob->resultRelations; diff --git a/src/gausskernel/optimizer/plan/setrefs.cpp b/src/gausskernel/optimizer/plan/setrefs.cpp index a64541b86..6a174f958 100755 --- a/src/gausskernel/optimizer/plan/setrefs.cpp +++ b/src/gausskernel/optimizer/plan/setrefs.cpp @@ -295,8 +295,9 @@ static Plan* set_plan_refs(PlannerInfo* root, Plan* plan, int rtoffset) { ListCell* l = NULL; - if (plan == NULL) + if (plan == NULL) { return NULL; + } /* * Plan-type-specific fixes @@ -572,6 +573,10 @@ static Plan* set_plan_refs(PlannerInfo* root, Plan* plan, int rtoffset) } } break; + case T_Gather: + set_upper_references(root, plan, rtoffset); + break; + case T_Hash: case T_Material: case T_VecMaterial: diff --git a/src/gausskernel/optimizer/plan/subselect.cpp b/src/gausskernel/optimizer/plan/subselect.cpp index 3ff05af07..4eab0d9de 100755 --- a/src/gausskernel/optimizer/plan/subselect.cpp +++ b/src/gausskernel/optimizer/plan/subselect.cpp @@ -2703,6 +2703,7 @@ static Bitmapset* finalize_plan(PlannerInfo* root, Plan* plan, Bitmapset* valid_ case T_Material: case T_Sort: case T_Unique: + case T_Gather: case T_SetOp: case T_Group: case T_Stream: diff --git a/src/gausskernel/optimizer/util/clauses.cpp b/src/gausskernel/optimizer/util/clauses.cpp index e23e379fc..f58b23255 100644 --- a/src/gausskernel/optimizer/util/clauses.cpp +++ b/src/gausskernel/optimizer/util/clauses.cpp @@ -89,6 +89,11 @@ typedef struct { char* prosrc; } inline_error_callback_arg; +typedef struct { + bool allow_restricted; +} has_parallel_hazard_arg; + + typedef enum { CONTAIN_FUNCTION_ID, CONTAIN_MUTABLE_FUNCTION, CONTAIN_VOLATILE_FUNTION } checkFuntionType; typedef struct { @@ -109,6 +114,9 @@ static bool expression_returns_set_rows_walker(Node* node, double* count); static bool contain_subplans_walker(Node* node, void* context); template static bool contain_specified_functions_walker(Node* node, check_function_context* context); +static bool has_parallel_hazard_walker(Node *node, has_parallel_hazard_arg *context); +static bool parallel_too_dangerous(char proparallel, has_parallel_hazard_arg *context); +static bool typeid_is_temp(Oid type_id); static bool contain_nonstrict_functions_walker(Node* node, void* context); static bool contain_leaky_functions_walker(Node* node, void* context); static Relids find_nonnullable_rels_walker(Node* node, bool top_level); @@ -1145,11 +1153,191 @@ bool exec_simple_check_mutable_function(Node* clause) } /***************************************************************************** - * Check clauses for nonstrict functions + * Check queries for parallel unsafe and/or restricted constructs *****************************************************************************/ +/* + * Check whether a node tree contains parallel hazards. 
This is used both + * on the entire query tree, to see whether the query can be parallelized at + * all, and also to evaluate whether a particular expression is safe to + * run in a parallel worker. We could separate these concerns into two + * different functions, but there's enough overlap that it doesn't seem + * worthwhile. + */ +bool has_parallel_hazard(Node *node, bool allow_restricted) +{ + has_parallel_hazard_arg context; + + context.allow_restricted = allow_restricted; + return has_parallel_hazard_walker(node, &context); +} + +static bool has_parallel_hazard_walker(Node *node, has_parallel_hazard_arg *context) +{ + if (node == NULL) + return false; + + /* + * When we're first invoked on a completely unplanned tree, we must + * recurse through Query objects to as to locate parallel-unsafe + * constructs anywhere in the tree. + * + * Later, we'll be called again for specific quals, possibly after + * some planning has been done, we may encounter SubPlan, SubLink, + * or AlternativeSubLink nodes. Currently, there's no need to recurse + * through these; they can't be unsafe, since we've already cleared + * the entire query of unsafe operations, and they're definitely + * parallel-restricted. + */ + if (IsA(node, Query)) { + Query *query = (Query *)node; + + if (query->rowMarks != NULL) + return true; + + /* Recurse into subselects */ + return query_tree_walker(query, (bool (*)())has_parallel_hazard_walker, context, 0); + } else if (IsA(node, SubPlan) || IsA(node, SubLink) || IsA(node, AlternativeSubPlan) || IsA(node, Param)) { + return true; + } + + /* This is just a notational convenience for callers. */ + if (IsA(node, RestrictInfo)) { + RestrictInfo *rinfo = (RestrictInfo *)node; + return has_parallel_hazard_walker((Node *)rinfo->clause, context); + } + + /* + * It is an error for a parallel worker to touch a temporary table in any + * way, so we can't handle nodes whose type is the rowtype of such a table. + */ + if (!context->allow_restricted) { + switch (nodeTag(node)) { + case T_Var: + case T_Const: + case T_Param: + case T_Aggref: + case T_WindowFunc: + case T_ArrayRef: + case T_FuncExpr: + case T_NamedArgExpr: + case T_OpExpr: + case T_DistinctExpr: + case T_NullIfExpr: + case T_FieldSelect: + case T_FieldStore: + case T_RelabelType: + case T_CoerceViaIO: + case T_ArrayCoerceExpr: + case T_ConvertRowtypeExpr: + case T_CaseExpr: + case T_CaseTestExpr: + case T_ArrayExpr: + case T_RowExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_CoerceToDomain: + case T_CoerceToDomainValue: + case T_SetToDefault: + if (typeid_is_temp(exprType(node))) + return true; + break; + default: + break; + } + } + + /* + * For each node that might potentially call a function, we need to + * examine the pg_proc.proparallel marking for that function to see + * whether it's safe enough for the current value of allow_restricted. 
+ */ + if (IsA(node, FuncExpr)) { + FuncExpr *expr = (FuncExpr *)node; + + if (parallel_too_dangerous(func_parallel(expr->funcid), context)) + return true; + } else if (IsA(node, OpExpr)) { + OpExpr *expr = (OpExpr *)node; + + set_opfuncid(expr); + if (parallel_too_dangerous(func_parallel(expr->opfuncid), context)) + return true; + } else if (IsA(node, DistinctExpr)) { + DistinctExpr *expr = (DistinctExpr *)node; + + set_opfuncid((OpExpr *)expr); /* rely on struct equivalence */ + if (parallel_too_dangerous(func_parallel(expr->opfuncid), context)) + return true; + } else if (IsA(node, NullIfExpr)) { + NullIfExpr *expr = (NullIfExpr *)node; + + set_opfuncid((OpExpr *)expr); /* rely on struct equivalence */ + if (parallel_too_dangerous(func_parallel(expr->opfuncid), context)) + return true; + } else if (IsA(node, ScalarArrayOpExpr)) { + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *)node; + + set_sa_opfuncid(expr); + if (parallel_too_dangerous(func_parallel(expr->opfuncid), context)) + return true; + } else if (IsA(node, CoerceViaIO)) { + CoerceViaIO *expr = (CoerceViaIO *)node; + Oid iofunc; + Oid typioparam; + bool typisvarlena; + + /* check the result type's input function */ + getTypeInputInfo(expr->resulttype, &iofunc, &typioparam); + if (parallel_too_dangerous(func_parallel(iofunc), context)) + return true; + /* check the input type's output function */ + getTypeOutputInfo(exprType((Node *)expr->arg), &iofunc, &typisvarlena); + if (parallel_too_dangerous(func_parallel(iofunc), context)) + return true; + } else if (IsA(node, ArrayCoerceExpr)) { + ArrayCoerceExpr *expr = (ArrayCoerceExpr *)node; + + if (OidIsValid(expr->elemfuncid) && parallel_too_dangerous(func_parallel(expr->elemfuncid), context)) + return true; + } else if (IsA(node, RowCompareExpr)) { + RowCompareExpr *rcexpr = (RowCompareExpr *)node; + ListCell *opid; + + foreach (opid, rcexpr->opnos) { + Oid opfuncid = get_opcode(lfirst_oid(opid)); + if (parallel_too_dangerous(func_parallel(opfuncid), context)) + return true; + } + } + + /* ... and recurse to check substructure */ + return expression_tree_walker(node, (bool (*)())has_parallel_hazard_walker, context); +} + +static bool parallel_too_dangerous(char proparallel, has_parallel_hazard_arg *context) +{ + if (context->allow_restricted) + return proparallel == PROPARALLEL_UNSAFE; + else + return proparallel != PROPARALLEL_SAFE; +} + +static bool typeid_is_temp(Oid type_id) +{ + Oid relid = get_typ_typrelid(type_id); + + if (!OidIsValid(relid)) + return false; + + return (get_rel_persistence(relid) == RELPERSISTENCE_TEMP); +} + +/* **************************************************************************** + * Check clauses for nonstrict functions + * *************************************************************************** */ /* * contain_nonstrict_functions - * Recursively search for nonstrict functions within a clause. + * Recursively search for nonstrict functions within a clause. * * Returns true if any nonstrict construct is found --- ie, anything that * could produce non-NULL output with a NULL input. 
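/*
 * A minimal self-contained illustration of the parallel_too_dangerous()
 * rule shown above, using the conventional single-character
 * pg_proc.proparallel markings. This is a sketch only: the context struct
 * is reduced to a bare bool, and the markings and driver below are
 * assumptions for the example, not taken verbatim from the patch.
 */
#include <stdbool.h>
#include <stdio.h>

#define PROPARALLEL_SAFE       's'
#define PROPARALLEL_RESTRICTED 'r'
#define PROPARALLEL_UNSAFE     'u'

static bool parallel_too_dangerous(char proparallel, bool allow_restricted)
{
    if (allow_restricted)
        return proparallel == PROPARALLEL_UNSAFE;  /* whole-query check */
    else
        return proparallel != PROPARALLEL_SAFE;    /* may it run inside a worker? */
}

int main(void)
{
    /* restricted functions may appear somewhere in a parallel query ... */
    printf("%d\n", parallel_too_dangerous(PROPARALLEL_RESTRICTED, true));  /* prints 0 */
    /* ... but not in an expression that is pushed down to a worker */
    printf("%d\n", parallel_too_dangerous(PROPARALLEL_RESTRICTED, false)); /* prints 1 */
    return 0;
}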
diff --git a/src/gausskernel/optimizer/util/optcommon.cpp b/src/gausskernel/optimizer/util/optcommon.cpp index 5153dd797..28303d6d2 100755 --- a/src/gausskernel/optimizer/util/optcommon.cpp +++ b/src/gausskernel/optimizer/util/optcommon.cpp @@ -153,6 +153,9 @@ void GetPlanNodePlainText( } } break; + case T_Gather: + *pname = *sname = *pt_options = "Gather"; + break; case T_IndexScan: *pt_operation = "INDEX"; if (((IndexScan*)plan)->scan.isPartTbl) diff --git a/src/gausskernel/optimizer/util/pathnode.cpp b/src/gausskernel/optimizer/util/pathnode.cpp index 97d119544..ef633323b 100755 --- a/src/gausskernel/optimizer/util/pathnode.cpp +++ b/src/gausskernel/optimizer/util/pathnode.cpp @@ -1098,10 +1098,12 @@ void add_path(PlannerInfo* root, RelOptInfo* parent_rel, Path* new_path) case COSTS_EQUAL: outercmp = bms_subset_compare(PATH_REQ_OUTER(new_path), PATH_REQ_OUTER(old_path)); if (keyscmp == PATHKEYS_BETTER1) { - if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET1) && new_path->rows <= old_path->rows) + if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET1) && + new_path->rows <= old_path->rows && new_path->parallel_safe >= old_path->parallel_safe) remove_old = true; /* new dominates old */ } else if (keyscmp == PATHKEYS_BETTER2) { - if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET2) && new_path->rows >= old_path->rows) + if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET2) && + new_path->rows >= old_path->rows && new_path->parallel_safe >= old_path->parallel_safe) accept_new = false; /* old dominates new */ } else { if (outercmp == BMS_EQUAL) { @@ -1120,7 +1122,11 @@ void add_path(PlannerInfo* root, RelOptInfo* parent_rel, Path* new_path) * comparison decides the startup and total * costs compare differently. */ - if (new_path->rows < old_path->rows) + if (new_path->parallel_safe > old_path->parallel_safe) { + remove_old = true; + } else if (new_path->parallel_safe < old_path->parallel_safe) { + accept_new = false; + } else if (new_path->rows < old_path->rows) remove_old = true; /* new dominates old */ else if (new_path->rows > old_path->rows) accept_new = false; /* old dominates new */ @@ -1132,9 +1138,11 @@ void add_path(PlannerInfo* root, RelOptInfo* parent_rel, Path* new_path) else accept_new = false; /* old equals or dominates new */ } - } else if (outercmp == BMS_SUBSET1 && new_path->rows <= old_path->rows) + } else if (outercmp == BMS_SUBSET1 && new_path->rows <= old_path->rows && + new_path->parallel_safe >= old_path->parallel_safe) remove_old = true; /* new dominates old */ - else if (outercmp == BMS_SUBSET2 && new_path->rows >= old_path->rows) + else if (outercmp == BMS_SUBSET2 && new_path->rows >= old_path->rows && + new_path->parallel_safe <= old_path->parallel_safe) accept_new = false; /* old dominates new */ /* else different parameterizations, keep both */ } @@ -1142,14 +1150,16 @@ void add_path(PlannerInfo* root, RelOptInfo* parent_rel, Path* new_path) case COSTS_BETTER1: if (keyscmp != PATHKEYS_BETTER2) { outercmp = bms_subset_compare(PATH_REQ_OUTER(new_path), PATH_REQ_OUTER(old_path)); - if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET1) && new_path->rows <= old_path->rows) + if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET1) && + new_path->rows <= old_path->rows && new_path->parallel_safe >= old_path->parallel_safe) remove_old = true; /* new dominates old */ } break; case COSTS_BETTER2: if (keyscmp != PATHKEYS_BETTER1) { outercmp = bms_subset_compare(PATH_REQ_OUTER(new_path), PATH_REQ_OUTER(old_path)); - if ((outercmp == BMS_EQUAL || outercmp == 
BMS_SUBSET2) && new_path->rows >= old_path->rows) + if ((outercmp == BMS_EQUAL || outercmp == BMS_SUBSET2) && + new_path->rows >= old_path->rows && new_path->parallel_safe <= old_path->parallel_safe) accept_new = false; /* old dominates new */ } break; @@ -1433,7 +1443,7 @@ static void add_parameterized_path(RelOptInfo* parent_rel, Path* new_path) * Creates a path corresponding to a sequential scan, returning the * pathnode. */ -Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_outer, int dop) +Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_outer, int dop, int nworkers) { Path* pathnode = makeNode(Path); @@ -1442,11 +1452,14 @@ Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_ou pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->pathkeys = NIL; /* seqscan has unordered result */ pathnode->dop = dop; + pathnode->parallel_aware = nworkers > 0 ? true : false; + pathnode->parallel_safe = rel->consider_parallel; #ifdef STREAMPLAN + /* We need to set locator_type for parallel query, cause we may send this value to bg worker */ + pathnode->locator_type = rel->locator_type; if (IS_STREAM_PLAN) { pathnode->distribute_keys = rel->distribute_keys; - pathnode->locator_type = rel->locator_type; /* add location information for seqscan path */ RangeTblEntry* rte = root->simple_rte_array[rel->relid]; @@ -1460,7 +1473,7 @@ Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_ou RangeTblEntry* rte = planner_rt_fetch(rel->relid, root); if (NULL == rte->tablesample) { - cost_seqscan(pathnode, root, rel, pathnode->param_info); + cost_seqscan(pathnode, root, rel, pathnode->param_info, nworkers); } else { AssertEreport(rte->rtekind == RTE_RELATION, MOD_OPT_JOIN, "Rel should be base relation"); cost_samplescan(pathnode, root, rel, pathnode->param_info); @@ -2277,6 +2290,8 @@ ResultPath* create_result_path(List* quals, Path* subpath) pathnode->path.total_cost = subpath->total_cost; pathnode->path.dop = subpath->dop; pathnode->path.stream_cost = subpath->stream_cost; + pathnode->path.parallel_aware = subpath->parallel_aware; + pathnode->path.parallel_safe = subpath->parallel_safe; #ifdef STREAMPLAN /* result path will inherit node group and distribute information from it's child node */ inherit_path_locator_info((Path*)pathnode, subpath); @@ -2707,6 +2722,37 @@ no_unique_path: /* failure exit */ return NULL; } +/* + * create_gather_path + * + * Creates a path corresponding to a gather scan, returning the + * pathnode. 
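+ * If nworkers is zero the path degenerates into a single-copy Gather:
+ * exactly one worker runs the whole subplan and the leader only collects
+ * its output, which is why the subpath's pathkeys can be kept in that case.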
+ */ +GatherPath *create_gather_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Relids required_outer, int nworkers) +{ + GatherPath *pathnode = makeNode(GatherPath); + + pathnode->path.pathtype = T_Gather; + pathnode->path.parent = rel; + pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); + pathnode->path.parallel_aware = false; + pathnode->path.pathkeys = NIL; /* Gather has unordered result */ + + pathnode->subpath = subpath; + pathnode->num_workers = nworkers; + pathnode->single_copy = false; + + if (pathnode->num_workers == 0) { + pathnode->path.pathkeys = subpath->pathkeys; + pathnode->num_workers = 1; + pathnode->single_copy = true; + } + + cost_gather(pathnode, rel, pathnode->path.param_info); + + return pathnode; +} + /* * translate_sub_tlist - get subquery column numbers represented by tlist * diff --git a/src/gausskernel/optimizer/util/relnode.cpp b/src/gausskernel/optimizer/util/relnode.cpp index cf47879eb..33be36f47 100755 --- a/src/gausskernel/optimizer/util/relnode.cpp +++ b/src/gausskernel/optimizer/util/relnode.cpp @@ -20,6 +20,7 @@ #include "nodes/nodeFuncs.h" #include "nodes/print.h" #include "parser/parse_hint.h" +#include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" @@ -169,6 +170,7 @@ RelOptInfo* build_simple_rel(PlannerInfo* root, int relid, RelOptKind reloptkind rel->partflag = PARTITION_NONE; rel->rows = 0; rel->width = 0; + rel->consider_parallel = false; /* might get changed later */ rel->encodedwidth = 0; rel->encodednum = 0; rel->reltargetlist = NIL; @@ -549,6 +551,7 @@ RelOptInfo* build_join_rel(PlannerInfo* root, Relids joinrelids, RelOptInfo* out joinrel->partflag = PARTITION_NONE; joinrel->rows = 0; joinrel->width = 0; + joinrel->consider_parallel = false; joinrel->encodedwidth = 0; joinrel->encodednum = 0; joinrel->reltargetlist = NIL; @@ -624,6 +627,25 @@ RelOptInfo* build_join_rel(PlannerInfo* root, Relids joinrelids, RelOptInfo* out */ set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, sjinfo, restrictlist); + /* + * Set the consider_parallel flag if this joinrel could potentially be + * scanned within a parallel worker. If this flag is false for either + * inner_rel or outer_rel, then it must be false for the joinrel also. + * Even if both are true, there might be parallel-restricted quals at our + * level. + * + * Note that if there are more than two rels in this relation, they could + * be divided between inner_rel and outer_rel in any arbitary way. We + * assume this doesn't matter, because we should hit all the same baserels + * and joinclauses while building up to this joinrel no matter which we + * take; therefore, we should make the same decision here however we get + * here. + */ + if (inner_rel->consider_parallel && outer_rel->consider_parallel && + !has_parallel_hazard((Node *)restrictlist, false)) { + joinrel->consider_parallel = true; + } + /* * Add the joinrel to the query's joinrel list, and store it into the * auxiliary hashtable if there is one. 
NB: GEQO requires us to append diff --git a/src/gausskernel/process/postmaster/bgworker.cpp b/src/gausskernel/process/postmaster/bgworker.cpp index 02eeaefa4..898268537 100644 --- a/src/gausskernel/process/postmaster/bgworker.cpp +++ b/src/gausskernel/process/postmaster/bgworker.cpp @@ -14,6 +14,7 @@ #include +#include "access/parallel.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -109,6 +110,10 @@ static const struct { { "autonomous_worker_main", autonomous_worker_main + }, + { + "ParallelWorkerMain", + ParallelWorkerMain } }; diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 886dc4892..64fac6422 100755 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -152,6 +152,7 @@ #include "utils/datetime.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/postinit.h" #include "utils/ps_status.h" #include "utils/plog.h" #include "utils/zfiles.h" @@ -384,8 +385,8 @@ bool PMstateIsRun(void); #define BACKEND_TYPE_TEMPBACKEND \ 0x0010 /* temp thread processing cancel signal \ or stream connection */ - -#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ +#define BACKEND_TYPE_BGWORKER 0x0020 +#define BACKEND_TYPE_ALL 0x003F /* OR of all the above */ static int CountChildren(int target); static bool CreateOptsFile(int argc, const char* argv[], const char* fullprogname); @@ -1029,8 +1030,8 @@ void SetShmemCxt(void) AV_LAUNCHER_PROCS; g_instance.shmem_cxt.MaxReserveBackendId = g_instance.attr.attr_sql.job_queue_processes + 1 + g_instance.attr.attr_storage.autovacuum_max_workers + - (thread_pool_worker_num * STREAM_RESERVE_PROC_TIMES) + - AUXILIARY_BACKENDS + + (thread_pool_worker_num * STREAM_RESERVE_PROC_TIMES) + + AUXILIARY_BACKENDS + AV_LAUNCHER_PROCS; g_instance.shmem_cxt.ThreadPoolGroupNum = thread_pool_group_num; @@ -2543,6 +2544,85 @@ static bool save_backend_variables_for_callback_thread() return save_backend_variables(&backend_save_para, &port); } +/* + * Determine how long should we let ServerLoop sleep. + * + * In normal conditions we wait at most one minute, to ensure that the other + * background tasks handled by ServerLoop get done even when no requests are + * arriving. However, if there are background workers waiting to be started, + * we don't actually sleep so that they are quickly serviced. Other exception + * cases are as shown in the code. + */ +static void DetermineSleepTime(struct timeval *timeout) +{ + TimestampTz next_wakeup = 0; + + /* + * Normal case: either there are no background workers at all, or we're in + * a shutdown sequence (during which we ignore bgworkers altogether). + */ + if (Shutdown > NoShutdown || + (!g_instance.bgworker_cxt.start_worker_needed && !g_instance.bgworker_cxt.have_crashed_worker)) { + timeout->tv_sec = PM_POLL_TIMEOUT_SECOND; + timeout->tv_usec = 0; + return; + } + + if (g_instance.bgworker_cxt.start_worker_needed) { + timeout->tv_sec = 0; + timeout->tv_usec = 0; + return; + } + + if (g_instance.bgworker_cxt.have_crashed_worker) { + slist_mutable_iter siter; + + /* + * When there are crashed bgworkers, we sleep just long enough that + * they are restarted when they request to be. Scan the list to + * determine the minimum of all wakeup times according to most recent + * crash time and requested restart interval. 
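+ * The wait computed from that minimum is still clamped to
+ * PM_POLL_TIMEOUT_SECOND further down, so the postmaster never sleeps
+ * longer than its normal poll interval while waiting for a restart time.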
+ */ + slist_foreach_modify(siter, &t_thrd.bgworker_cxt.background_worker_list) + { + RegisteredBgWorker *rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + + if (rw->rw_crashed_at == 0) { + continue; + } + + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART || rw->rw_terminate) { + ForgetBackgroundWorker(&siter); + continue; + } + + TimestampTz this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at, + 1000L * rw->rw_worker.bgw_restart_time); + if (next_wakeup == 0 || this_wakeup < next_wakeup) { + next_wakeup = this_wakeup; + } + } + } + + if (next_wakeup != 0) { + long secs; + int microsecs; + + TimestampDifference(GetCurrentTimestamp(), next_wakeup, &secs, µsecs); + timeout->tv_sec = secs; + timeout->tv_usec = microsecs; + + /* Ensure we don't exceed PM_POLL_TIMEOUT_SECOND */ + if (timeout->tv_sec > PM_POLL_TIMEOUT_SECOND) { + timeout->tv_sec = PM_POLL_TIMEOUT_SECOND; + timeout->tv_usec = 0; + } + } else { + timeout->tv_sec = PM_POLL_TIMEOUT_SECOND; + timeout->tv_usec = 0; + } +} + /* * Main idle loop of postmaster */ @@ -2651,8 +2731,7 @@ static int ServerLoop(void) /* must set timeout each time; some OSes change it! */ struct timeval timeout; - timeout.tv_sec = PM_POLL_TIMEOUT_SECOND; - timeout.tv_usec = 0; + DetermineSleepTime(&timeout); #ifdef HAVE_POLL selres = poll(ufds, nSockets, timeout.tv_sec * 1000); @@ -3807,7 +3886,8 @@ CAC_state canAcceptConnections(bool isSession) /* * Can't start backends when in startup/shutdown/inconsistent recovery - * state. + * state. bgworkers are excluded from this test; we expect + * bgworker_should_start_now() decided whether the DB state allows them. * * In state PM_WAIT_BACKUP only superusers can connect (this must be * allowed so that a superuser can end online backup mode); we return @@ -4513,10 +4593,11 @@ static void pmdie(SIGNAL_ARGS) } if (pmState == PM_RECOVERY) { + (void)SignalSomeChildren(SIGTERM, BACKEND_TYPE_BGWORKER); /* - * Only startup, bgwriter, walreceiver, and/or checkpointer - * should be active in this state; we just signaled the first - * three, and we don't want to kill checkpointer yet. + * Only startup, bgwriter, walreceiver, possibly bgworkers, + * and/or checkpointer should be active in this state; we just + * signaled the first four, and we don't want to kill checkpointer yet. */ pmState = PM_WAIT_BACKENDS; } else if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_READONLY || @@ -4527,8 +4608,8 @@ static void pmdie(SIGNAL_ARGS) g_threadPoolControler->CloseAllSessions(); g_threadPoolControler->ShutDownWorker(); } - /* shut down all backends and autovac workers */ - (void)SignalSomeChildren(SIGTERM, BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC); + /* shut down all backends and bgworkers and autovac workers */ + (void)SignalSomeChildren(SIGTERM, BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER); /* and the autovac launcher too */ if (g_instance.pid_cxt.AutoVacPID != 0) @@ -5680,6 +5761,8 @@ static bool CleanupBackgroundWorker(ThreadId pid, * CleanupBackend -- cleanup after terminated backend. * * Remove all local state associated with backend. + * + * If you change this, see also CleanupBackgroundWorker. */ static void CleanupBackend(ThreadId pid, int exitstatus) /* child's exit status. */ { @@ -5725,8 +5808,8 @@ static void CleanupBackend(ThreadId pid, int exitstatus) /* child's exit status. 
for (curr = DLGetTail(g_instance.backend_list); curr; curr = DLGetPred(curr)) { Backend* bp = (Backend*)DLE_VAL(curr); - if (bp->pid == pid && bp->dead_end) { - { + if (bp->pid == pid) { + if (bp->dead_end) { if (!ReleasePostmasterChildSlot(bp->child_slot)) { /* * Uh-oh, the child failed to clean itself up. Treat as a @@ -5739,8 +5822,7 @@ static void CleanupBackend(ThreadId pid, int exitstatus) /* child's exit status. BackendArrayRemove(bp); } - if (bp->bgworker_notify) - { + if (bp->bgworker_notify) { /* * This backend may have been slated to receive SIGUSR1 when * some background worker started or stopped. Cancel those @@ -5930,7 +6012,8 @@ static void PostmasterStateMachine(void) if (pmState == PM_WAIT_BACKENDS) { /* * PM_WAIT_BACKENDS state ends when we have no regular backends - * (including autovac workers) and no walwriter, autovac launcher or + * (including autovac workers), no bgworkers (including + * unconnected ones), and no walwriter, autovac launcher or * bgwriter. If we are doing crash recovery then we expect the * checkpointer to exit as well, otherwise not. The archiver, stats, * and syslogger processes are disregarded since they are not @@ -5939,7 +6022,8 @@ static void PostmasterStateMachine(void) * later after writing the checkpoint record, like the archiver * process. */ - if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 && g_instance.pid_cxt.StartupPID == 0 && + if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER) == 0 && + g_instance.pid_cxt.StartupPID == 0 && g_instance.pid_cxt.TwoPhaseCleanerPID == 0 && g_instance.pid_cxt.FaultMonitorPID == 0 && g_instance.pid_cxt.WalReceiverPID == 0 && g_instance.pid_cxt.WalRcvWriterPID == 0 && g_instance.pid_cxt.DataReceiverPID == 0 && g_instance.pid_cxt.DataRcvWriterPID == 0 && @@ -6162,6 +6246,10 @@ static void PostmasterStateMachine(void) */ if (g_instance.demotion > NoDemote && pmState == PM_NO_CHILDREN) { ereport(LOG, (errmsg("all server processes terminated; reinitializing"))); + + /* allow background workers to immediately restart */ + ResetBackgroundWorkerCrashTimes(); + shmem_exit(1); reset_shared(g_instance.attr.attr_network.PostPortNumber); @@ -6346,6 +6434,8 @@ static int BackendStartup(Port* port) * Unless it's a dead_end child, assign it a child slot number */ bn->child_slot = t_thrd.proc_cxt.MyPMChildSlot = childSlot; + /* Hasn't asked to be notified about any bgworkers yet */ + bn->bgworker_notify = false; pid = initialize_worker_thread(WORKER, port); t_thrd.proc_cxt.MyPMChildSlot = 0; @@ -7360,6 +7450,7 @@ static void StartAutovacuumWorker(void) /* Autovac workers are not dead_end and need a child slot */ bn->child_slot = t_thrd.proc_cxt.MyPMChildSlot = slot; + bn->bgworker_notify = false; bn->pid = initialize_util_thread(AUTOVACUUM_WORKER, bn); t_thrd.proc_cxt.MyPMChildSlot = 0; if (bn->pid > 0) { @@ -8405,6 +8496,7 @@ static void BackendArrayRemove(Backend* bn) g_instance.backend_array[i].flag = 0; g_instance.backend_array[i].cancel_key = 0; g_instance.backend_array[i].dead_end = false; + g_instance.backend_array[i].bgworker_notify = false; } #ifdef WIN32 diff --git a/src/gausskernel/process/tcop/dest.cpp b/src/gausskernel/process/tcop/dest.cpp index 7ceea9744..c8cac990d 100755 --- a/src/gausskernel/process/tcop/dest.cpp +++ b/src/gausskernel/process/tcop/dest.cpp @@ -36,6 +36,7 @@ #include "commands/matview.h" #include "executor/functions.h" #include "executor/spi.h" +#include "executor/tqueue.h" #include "executor/tstoreReceiver.h" #include "libpq/libpq.h" 
#include "libpq/pqformat.h" @@ -149,6 +150,8 @@ DestReceiver* CreateDestReceiver(CommandDest dest) case DestBatchLocalRoundRobin: case DestBatchHybrid: return createStreamDestReceiver(dest); + case DestTupleQueue: + return CreateTupleQueueDestReceiver(NULL); default: break; } @@ -187,6 +190,7 @@ void EndCommand(const char* commandTag, CommandDest dest) case DestCopyOut: case DestSQLFunction: case DestTransientRel: + case DestTupleQueue: default: break; } @@ -217,6 +221,7 @@ void EndCommand_noblock(const char* commandTag, CommandDest dest) case DestIntoRel: case DestCopyOut: case DestSQLFunction: + case DestTupleQueue: default: break; } @@ -264,6 +269,7 @@ void NullCommand(CommandDest dest) case DestCopyOut: case DestSQLFunction: case DestTransientRel: + case DestTupleQueue: default: break; } @@ -312,6 +318,7 @@ void ReadyForQuery(CommandDest dest) case DestIntoRel: case DestCopyOut: case DestSQLFunction: + case DestTupleQueue: default: break; } @@ -346,6 +353,7 @@ void ReadyForQuery_noblock(CommandDest dest, int timeout) case DestIntoRel: case DestCopyOut: case DestSQLFunction: + case DestTupleQueue: default: break; } diff --git a/src/gausskernel/process/tcop/postgres.cpp b/src/gausskernel/process/tcop/postgres.cpp index b326a4c05..d68c37353 100755 --- a/src/gausskernel/process/tcop/postgres.cpp +++ b/src/gausskernel/process/tcop/postgres.cpp @@ -36,6 +36,7 @@ #include #endif +#include "access/parallel.h" #include "access/printtup.h" #include "access/xact.h" #include "access/dfs/dfs_am.h" @@ -256,6 +257,16 @@ bool StreamTopConsumerAmI() return (t_thrd.subrole == TOP_CONSUMER); } +bool ParallelWorkerAmI() +{ + return t_thrd.role == BACKGROUND_WORKER; +} + +bool ParallelLeaderAmI() +{ + return t_thrd.subrole == BACKGROUND_LEADER; +} + void EnableDoingCommandRead() { t_thrd.postgres_cxt.DoingCommandRead = true; @@ -2275,7 +2286,7 @@ void exec_simple_query(const char* query_string, MessageType messageType, String FreeExecNodes(&single_exec_node); } - plantree_list = pg_plan_queries(querytree_list, 0, NULL); + plantree_list = pg_plan_queries(querytree_list, CURSOR_OPT_PARALLEL_OK, NULL); randomPlanInfo = get_random_plan_string(); if (was_logged != false && randomPlanInfo != NULL) { @@ -4072,6 +4083,7 @@ void exec_bind_message(StringInfo input_message) params->parserSetupArg = NULL; params->params_need_process = false; params->numParams = numParams; + params->paramMask = NULL; for (paramno = 0; paramno < numParams; paramno++) { Oid ptype = psrc->param_types[paramno]; @@ -5694,6 +5706,10 @@ void ProcessInterrupts(void) ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("canceling snapshot task"))); } } + + if (t_thrd.bgworker_cxt.ParallelMessagePending) { + HandleParallelMessages(); + } /* If we get here, do nothing (probably, t_thrd.int_cxt.QueryCancelPending was reset) */ } diff --git a/src/gausskernel/process/tcop/utility.cpp b/src/gausskernel/process/tcop/utility.cpp index a4083fe39..cc9fa61e1 100644 --- a/src/gausskernel/process/tcop/utility.cpp +++ b/src/gausskernel/process/tcop/utility.cpp @@ -308,7 +308,7 @@ bool CommandIsReadOnly(Node* parse_tree) */ static void check_xact_readonly(Node* parse_tree) { - if (!u_sess->attr.attr_common.XactReadOnly) + if (!u_sess->attr.attr_common.XactReadOnly && !IsInParallelMode()) return; /* @@ -410,12 +410,14 @@ static void check_xact_readonly(Node* parse_tree) case T_CreateSynonymStmt: case T_DropSynonymStmt: PreventCommandIfReadOnly(CreateCommandTag(parse_tree)); + PreventCommandIfParallelMode(CreateCommandTag(parse_tree)); break; case 
T_VacuumStmt: { VacuumStmt* stmt = (VacuumStmt*)parse_tree; /* on verify mode, do nothing */ if (!(stmt->options & VACOPT_VERIFY)) { PreventCommandIfReadOnly(CreateCommandTag(parse_tree)); + PreventCommandIfParallelMode(CreateCommandTag(parse_tree)); } break; } @@ -423,6 +425,7 @@ static void check_xact_readonly(Node* parse_tree) AlterRoleStmt* stmt = (AlterRoleStmt*)parse_tree; if (!(DO_NOTHING != stmt->lockstatus && t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE)) { PreventCommandIfReadOnly(CreateCommandTag(parse_tree)); + PreventCommandIfParallelMode(CreateCommandTag(parse_tree)); } break; } @@ -447,6 +450,21 @@ void PreventCommandIfReadOnly(const char* cmd_name) errmsg("cannot execute %s in a read-only transaction", cmd_name))); } +/* + * PreventCommandIfParallelMode: throw error if current (sub)transaction is + * in parallel mode. + * + * This is useful mainly to ensure consistency of the error message wording; + * most callers have checked IsInParallelMode() for themselves. + */ +void PreventCommandIfParallelMode(const char *cmdname) +{ + if (IsInParallelMode()) + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + /* translator: %s is name of a SQL command, eg CREATE */ + errmsg("cannot execute %s during a parallel operation", cmdname))); +} + /* * PreventCommandDuringRecovery: throw error if RecoveryInProgress * diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp b/src/gausskernel/process/threadpool/knl_thread.cpp index 23ea560aa..337d94e7c 100755 --- a/src/gausskernel/process/threadpool/knl_thread.cpp +++ b/src/gausskernel/process/threadpool/knl_thread.cpp @@ -1422,6 +1422,12 @@ void knl_t_bgworker_init(knl_t_bgworker_context* bgworker_cxt) bgworker_cxt->my_bgworker_entry = NULL; bgworker_cxt->is_background_worker = false; bgworker_cxt->background_worker_list = SLIST_STATIC_INIT(background_worker_list); + bgworker_cxt->ParallelMessagePending = false; + bgworker_cxt->InitializingParallelWorker = false; + bgworker_cxt->ParallelWorkerNumber = -1; + bgworker_cxt->pcxt_list = DLIST_STATIC_INIT(bgworker_cxt->pcxt_list); + bgworker_cxt->save_pgBufferUsage = NULL; + bgworker_cxt->hpm_context = NULL; } void knl_t_msqueue_init(knl_t_msqueue_context* msqueue_cxt) diff --git a/src/gausskernel/runtime/executor/Makefile b/src/gausskernel/runtime/executor/Makefile index 729cd22ed..bb72890ec 100755 --- a/src/gausskernel/runtime/executor/Makefile +++ b/src/gausskernel/runtime/executor/Makefile @@ -21,10 +21,10 @@ ifneq "$(MAKECMDGOALS)" "clean" endif OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ - execProcnode.o execQual.o execScan.o execTuples.o \ + execParallel.o execProcnode.o execQual.o execScan.o execTuples.o \ execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \ nodeBitmapAnd.o nodeBitmapOr.o \ - nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \ + nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeGather.o nodeHash.o \ nodeHashjoin.o nodeIndexscan.o nodeIndexonlyscan.o \ nodeLimit.o nodeLockRows.o \ nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \ @@ -32,7 +32,7 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \ nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \ - nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o \ + nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o tqueue.o spi.o \ nodePartIterator.o nodeStub.o execClusterResize.o 
lightProxy.o execMerge.o \ nodeExtensible.o opfusion.o opfusion_scan.o opfusion_util.o diff --git a/src/gausskernel/runtime/executor/execAmi.cpp b/src/gausskernel/runtime/executor/execAmi.cpp index e30bb139e..7c3c812ae 100755 --- a/src/gausskernel/runtime/executor/execAmi.cpp +++ b/src/gausskernel/runtime/executor/execAmi.cpp @@ -26,7 +26,7 @@ #include "executor/nodeExtensible.h" #include "executor/nodeForeignscan.h" #include "executor/nodeFunctionscan.h" -#include "executor/nodeGroup.h" +#include "executor/nodeGather.h" #include "executor/nodeGroup.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" @@ -131,6 +131,10 @@ void ExecReScanByType(PlanState* node) ExecReScanSeqScan((SeqScanState*)node); break; + case T_GatherState: + ExecReScanGather((GatherState*)node); + break; + case T_IndexScanState: ExecReScanIndexScan((IndexScanState*)node); break; @@ -492,8 +496,19 @@ bool ExecSupportsMarkRestore(NodeTag plantype) */ bool ExecSupportsBackwardScan(Plan* node) { - if (node == NULL) + if (node == NULL) { return false; + } + + /* + * Parallel-aware nodes return a subset of the tuples in each worker, + * and in general we can't expect to have enough bookkeeping state to + * know which ones we returned in this worker as opposed to some other + * worker. + */ + if (node->parallel_aware) { + return false; + } switch (nodeTag(node)) { case T_BaseResult: @@ -527,6 +542,9 @@ bool ExecSupportsBackwardScan(Plan* node) case T_CteScan: return target_list_supports_backward_scan(node->targetlist); + case T_Gather: + return false; + case T_IndexScan: return index_supports_backward_scan(((IndexScan*)node)->indexid) && target_list_supports_backward_scan(node->targetlist); diff --git a/src/gausskernel/runtime/executor/execMain.cpp b/src/gausskernel/runtime/executor/execMain.cpp index 7c314ab2b..c0993b2ea 100644 --- a/src/gausskernel/runtime/executor/execMain.cpp +++ b/src/gausskernel/runtime/executor/execMain.cpp @@ -107,8 +107,9 @@ static void CheckValidRowMarkRel(Relation rel, RowMarkType markType); static void ExecPostprocessPlan(EState *estate); static void ExecEndPlan(PlanState *planstate, EState *estate); static void ExecCollectMaterialForSubplan(EState *estate); -static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation, bool sendTuples, long numberTuples, - ScanDirection direction, DestReceiver *dest, JitExec::JitContext* mot_jit_context); +static void ExecutePlan(EState *estate, PlanState *planstate, bool use_parallel_mode, + CmdType operation, bool sendTuples, long numberTuples, ScanDirection direction, + DestReceiver *dest, JitExec::JitContext* mot_jit_context); static void ExecuteVectorizedPlan(EState *estate, PlanState *planstate, CmdType operation, bool sendTuples, long numberTuples, ScanDirection direction, DestReceiver *dest); static bool ExecCheckRTEPerms(RangeTblEntry *rte); @@ -241,8 +242,20 @@ void standard_ExecutorStart(QueryDesc *queryDesc, int eflags) /* * If the transaction is read-only, we need to check if any writes are * planned to non-temporary tables. EXPLAIN is considered read-only. + * + * Don't allow writes in parallel mode. Supporting UPDATE and DELETE + * would require (a) storing the combocid hash in shared memory, rather + * than synchronizing it just once at the start of parallelism, and (b) an + * alternative to heap_update()'s reliance on xmax for mutual exclusion. + * INSERT may have no such troubles, but we forbid it to simplify the + * checks. 
+ *
+ * We have lower-level defenses in CommandCounterIncrement and elsewhere
+ * against performing unsafe operations in parallel mode, but this gives a
+ * more user-friendly error message.
 */
- if (u_sess->attr.attr_common.XactReadOnly && !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
+ if ((u_sess->attr.attr_common.XactReadOnly || IsInParallelMode()) &&
+ !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
 ExecCheckXactReadOnly(queryDesc->plannedstmt);
 }
@@ -572,8 +585,8 @@ void standard_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long co
 if (queryDesc->planstate->vectorized) {
 ExecuteVectorizedPlan(estate, queryDesc->planstate, operation, send_tuples, count, direction, dest);
 } else {
- ExecutePlan(estate, queryDesc->planstate, operation, send_tuples,
- count, direction, dest, queryDesc->mot_jit_context);
+ ExecutePlan(estate, queryDesc->planstate, queryDesc->plannedstmt->parallelModeNeeded, operation,
+ send_tuples, count, direction, dest, queryDesc->mot_jit_context);
 }
 }
@@ -1058,6 +1071,10 @@ void ExecCheckXactReadOnly(PlannedStmt *plannedstmt)
 PreventCommandIfReadOnly(CreateCommandTag((Node *)plannedstmt));
 }
+
+ if (plannedstmt->commandType != CMD_SELECT || plannedstmt->hasModifyingCTE) {
+ PreventCommandIfParallelMode(CreateCommandTag((Node*)plannedstmt));
+ }
 }
 /* ----------------------------------------------------------------
@@ -1940,8 +1957,9 @@ static void ExecCollectMaterialForSubplan(EState *estate)
 * user can see it
 * ----------------------------------------------------------------
 */
-static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation, bool sendTuples, long numberTuples,
- ScanDirection direction, DestReceiver *dest, JitExec::JitContext* mot_jit_context)
+static void ExecutePlan(EState *estate, PlanState *planstate, bool use_parallel_mode,
+ CmdType operation, bool sendTuples, long numberTuples, ScanDirection direction,
+ DestReceiver *dest, JitExec::JitContext* mot_jit_context)
 {
 TupleTableSlot *slot = NULL;
 long current_tuple_count = 0;
@@ -1968,6 +1986,22 @@ static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation,
 */
 estate->es_direction = direction;
+ /*
+ * If a tuple count was supplied, we must force the plan to run without
+ * parallelism, because we might exit early.
+ */
+ if (numberTuples != 0) {
+ use_parallel_mode = false;
+ }
+
+ /*
+ * Enter parallel mode if the plan is to run with parallel workers and
+ * that was not just disabled above.
+ */
+ if (use_parallel_mode) {
+ EnterParallelMode();
+ }
+
 if (IS_PGXC_DATANODE) {
 /* Collect Material for Subplan first */
 ExecCollectMaterialForSubplan(estate);
@@ -2035,6 +2069,7 @@ static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation,
 * process so we just end the loop...
*/ if (TupIsNull(slot)) { + (void)ExecShutdownNode(planstate); ExecEarlyFree(planstate); break; } @@ -2103,6 +2138,10 @@ static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation, << (chunkSizeInBits - BITS_IN_MB); u_sess->instr_cxt.global_instr->SetPeakNodeMemory(planstate->plan->plan_node_id, peak_memory); } + + if (use_parallel_mode) { + ExitParallelMode(); + } } /* ---------------------------------------------------------------- diff --git a/src/gausskernel/runtime/executor/execParallel.cpp b/src/gausskernel/runtime/executor/execParallel.cpp new file mode 100644 index 000000000..4e0289a26 --- /dev/null +++ b/src/gausskernel/runtime/executor/execParallel.cpp @@ -0,0 +1,589 @@ +/* ------------------------------------------------------------------------- + * + * execParallel.c + * Support routines for parallel execution. + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * This file contains routines that are intended to support setting up, + * using, and tearing down a ParallelContext from within the PostgreSQL + * executor. The ParallelContext machinery will handle starting the + * workers and ensuring that their state generally matches that of the + * leader; see src/backend/access/transam/README.parallel for details. + * However, we must save and restore relevant executor state, such as + * any ParamListInfo associated with the query, buffer usage info, and + * the actual plan to be passed down to the worker. + * + * IDENTIFICATION + * src/backend/executor/execParallel.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/execParallel.h" +#include "executor/executor.h" +#include "executor/nodeSeqscan.h" +#include "executor/tqueue.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +#define PARALLEL_TUPLE_QUEUE_SIZE 65536 + +/* DSM structure for accumulating per-PlanState instrumentation. */ +struct SharedExecutorInstrumentation { + int instrument_options; + uint32 instrument_offset; /* offset of first Instrumentation struct */ + int num_workers; /* # of workers */ + int num_plan_nodes; /* # of plan nodes */ + int plan_node_id[FLEXIBLE_ARRAY_MEMBER]; /* array of plan node IDs */ + /* array of num_plan_nodes * num_workers Instrumentation objects follows */ +}; +#define GetInstrumentationArray(sei) (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ + (Instrumentation *)(((char *)sei) + sei->instrument_offset)) + + +/* Context object for ExecParallelEstimate. */ +typedef struct ExecParallelEstimateContext { + ParallelContext *pcxt; + int nnodes; +} ExecParallelEstimateContext; + +/* Context object for ExecParallelEstimate. */ +typedef struct ExecParallelInitializeDSMContext { + ParallelContext *pcxt; + SharedExecutorInstrumentation *instrumentation; + int nnodes; +} ExecParallelInitializeDSMContext; + +/* Helper functions that run in the parallel leader. 
*/ +static char *ExecSerializePlan(Plan *plan, EState *estate); +static bool ExecParallelEstimate(PlanState *node, ExecParallelEstimateContext *e); +static bool ExecParallelInitializeDSM(PlanState *node, ExecParallelInitializeDSMContext *d); +static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize); +static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation); + +/* Helper functions that run in the parallel worker. */ +static DestReceiver *ExecParallelGetReceiver(void *seg); + +/* + * Create a serialized representation of the plan to be sent to each worker. + */ +static char *ExecSerializePlan(Plan *plan, EState *estate) +{ + ListCell *tlist = NULL; + + /* We can't scribble on the original plan, so make a copy. */ + plan = (Plan *)copyObject(plan); + + /* + * The worker will start its own copy of the executor, and that copy will + * insert a junk filter if the toplevel node has any resjunk entries. We + * don't want that to happen, because while resjunk columns shouldn't be + * sent back to the user, here the tuples are coming back to another + * backend which may very well need them. So mutate the target list + * accordingly. This is sort of a hack; there might be better ways to do + * this... + */ + foreach (tlist, plan->targetlist) { + TargetEntry *tle = (TargetEntry *)lfirst(tlist); + + tle->resjunk = false; + } + + /* + * Create a dummy PlannedStmt. Most of the fields don't need to be valid + * for our purposes, but the worker will need at least a minimal + * PlannedStmt to start the executor. + */ + PlannedStmt *pstmt = makeNode(PlannedStmt); + pstmt->commandType = CMD_SELECT; + pstmt->queryId = 0; + pstmt->hasReturning = 0; + pstmt->hasModifyingCTE = 0; + pstmt->canSetTag = 1; + pstmt->transientPlan = 0; + pstmt->planTree = plan; + pstmt->rtable = estate->es_range_table; + pstmt->resultRelations = NIL; + pstmt->utilityStmt = NULL; + pstmt->subplans = NIL; + pstmt->rewindPlanIDs = NULL; + pstmt->rowMarks = NIL; + pstmt->nParamExec = estate->es_plannedstmt->nParamExec; + pstmt->relationOids = NIL; + pstmt->invalItems = NIL; /* workers can't replan anyway... */ + pstmt->num_plannodes = estate->es_plannedstmt->num_plannodes; + + /* Return serialized copy of our dummy PlannedStmt. */ + return nodeToString(pstmt); +} + +/* + * Ordinary plan nodes won't do anything here, but parallel-aware plan nodes + * may need some state which is shared across all parallel workers. Before + * we size the DSM, give them a chance to call shm_toc_estimate_chunk or + * shm_toc_estimate_keys on &pcxt->estimator. + * + * While we're at it, count the number of PlanState nodes in the tree, so + * we know how many SharedPlanStateInstrumentation structures we need. + */ +static bool ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) +{ + if (planstate == NULL) + return false; + + /* Count this node. */ + e->nnodes++; + + /* Call estimators for parallel-aware nodes. */ + switch (nodeTag(planstate)) { + case T_SeqScanState: + ExecSeqScanEstimate((SeqScanState *)planstate, e->pcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, (bool (*)())ExecParallelEstimate, e); +} + +/* + * Ordinary plan nodes won't do anything here, but parallel-aware plan nodes + * may need to initialize shared state in the DSM before parallel workers + * are available. 
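
The flattened plan built by ExecSerializePlan() is just the ordinary node-serializer output, which the worker later turns back into a node tree with stringToNode(). A minimal sketch of that round trip (the variable names below are illustrative; only nodeToString() and stringToNode() come from the patch):

    char *flat_plan = nodeToString(pstmt);                          /* leader: flatten the dummy PlannedStmt */
    PlannedStmt *rebuilt = (PlannedStmt *)stringToNode(flat_plan);  /* worker: rebuild an equivalent tree */
    Assert(rebuilt->planTree != NULL);                              /* same shape as the plan that was sent */
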
They can allocate the space they previous estimated using + * shm_toc_allocate, and add the keys they previously estimated using + * shm_toc_insert, in each case targeting pcxt->toc. + */ +static bool ExecParallelInitializeDSM(PlanState *planstate, ExecParallelInitializeDSMContext *d) +{ + if (planstate == NULL) + return false; + + /* If instrumentation is enabled, initialize slot for this node. */ + if (d->instrumentation != NULL) { + d->instrumentation->plan_node_id[d->nnodes] = planstate->plan->plan_node_id; + } + + /* Count this node. */ + d->nnodes++; + knl_u_parallel_context *cxt = (knl_u_parallel_context *)d->pcxt->seg; + + /* Call initializers for parallel-aware plan nodes. */ + switch (nodeTag(planstate)) { + case T_SeqScanState: + ExecSeqScanInitializeDSM((SeqScanState *)planstate, d->pcxt, cxt->pwCtx->pscan_num); + cxt->pwCtx->pscan_num++; + break; + default: + break; + } + + return planstate_tree_walker(planstate, (bool (*)())ExecParallelInitializeDSM, d); +} + +/* + * It sets up the response queues for backend workers to return tuples + * to the main backend and start the workers. + */ +static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize) +{ + /* Skip this if no workers. */ + if (pcxt->nworkers <= 0) + return NULL; + + /* Allocate memory for shared memory queue handles. */ + shm_mq_handle **responseq = (shm_mq_handle **)palloc(pcxt->nworkers * sizeof(shm_mq_handle *)); + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + /* + * If not reinitializing, allocate space from the DSM for the queues; + * otherwise, find the already allocated space. + */ + if (!reinitialize) { + cxt->pwCtx->tupleQueue = (char *)palloc0(PARALLEL_TUPLE_QUEUE_SIZE * (Size)pcxt->nworkers); + } + Assert(cxt->pwCtx->tupleQueue != NULL); + char *tqueuespace = cxt->pwCtx->tupleQueue; + + /* Create the queues, and become the receiver for each. */ + for (int i = 0; i < pcxt->nworkers; ++i) { + shm_mq *mq = shm_mq_create(tqueuespace + i * PARALLEL_TUPLE_QUEUE_SIZE, (Size)PARALLEL_TUPLE_QUEUE_SIZE); + shm_mq_set_receiver(mq, t_thrd.proc); + responseq[i] = shm_mq_attach(mq, pcxt->seg, NULL); + } + + /* Return array of handles. */ + return responseq; +} + +/* + * Re-initialize the parallel executor info such that it can be reused by + * workers. + */ +void ExecParallelReinitialize(ParallelExecutorInfo *pei) +{ + ReinitializeParallelDSM(pei->pcxt); + pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); + pei->finished = false; +} + +/* + * Sets up the required infrastructure for backend workers to perform + * execution and return results to the main backend. + */ +ParallelExecutorInfo *ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) +{ + ExecParallelEstimateContext e; + ExecParallelInitializeDSMContext d; + uint32 instrumentation_len = 0; + uint32 instrument_offset = 0; + + /* Allocate object for return value. */ + ParallelExecutorInfo *pei = (ParallelExecutorInfo *)palloc0(sizeof(ParallelExecutorInfo)); + pei->finished = false; + pei->planstate = planstate; + + /* Fix up and serialize plan to be sent to workers. */ + char *pstmt_data = ExecSerializePlan(planstate->plan, estate); + + /* Create a parallel context. */ + ParallelContext *pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers); + pei->pcxt = pcxt; + + /* Estimate space for serialized PlannedStmt. */ + Size pstmt_len = strlen(pstmt_data) + 1; + /* Estimate space for serialized ParamListInfo. 
*/ + Size param_len = EstimateParamListSpace(estate->es_param_list_info); + + /* + * Give parallel-aware nodes a chance to add to the estimates, and get + * a count of how many PlanState nodes there are. + */ + e.pcxt = pcxt; + e.nnodes = 0; + (void)ExecParallelEstimate(planstate, &e); + + /* Estimate space for instrumentation, if required. */ + if (estate->es_instrument) { + instrumentation_len = offsetof(SharedExecutorInstrumentation, plan_node_id) + sizeof(int) * e.nnodes; + instrumentation_len = MAXALIGN(instrumentation_len); + instrument_offset = instrumentation_len; + instrumentation_len += sizeof(Instrumentation) * e.nnodes * nworkers; + } + + /* Everyone's had a chance to ask for space, so now create the DSM. */ + InitializeParallelDSM(pcxt); + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + + /* + * OK, now we have a dynamic shared memory segment, and it should be big + * enough to store all of the data we estimated we would want to put into + * it, plus whatever general stuff (not specifically executor-related) the + * ParallelContext itself needs to store there. None of the space we + * asked for has been allocated or initialized yet, though, so do that. + */ + MemoryContext oldcontext = MemoryContextSwitchTo(cxt->memCtx); + + /* Store serialized PlannedStmt. */ + cxt->pwCtx->pstmt_space = (char *)palloc0(pstmt_len); + int rc = memcpy_s(cxt->pwCtx->pstmt_space, pstmt_len, pstmt_data, pstmt_len); + securec_check(rc, "", ""); + + /* Store serialized ParamListInfo. */ + cxt->pwCtx->param_space = (char *)palloc0(param_len); + cxt->pwCtx->param_len = param_len; + SerializeParamList(estate->es_param_list_info, cxt->pwCtx->param_space, param_len); + + /* Allocate space for each worker's BufferUsage; no need to initialize. */ + cxt->pwCtx->bufUsage = (BufferUsage *)palloc0(sizeof(BufferUsage) * pcxt->nworkers); + pei->buffer_usage = cxt->pwCtx->bufUsage; + + /* Set up tuple queues. */ + pei->tqueue = ExecParallelSetupTupleQueues(pcxt, false); + + /* + * If instrumentation options were supplied, allocate space for the + * data. It only gets partially initialized here; the rest happens + * during ExecParallelInitializeDSM. + */ + if (estate->es_instrument) { + cxt->pwCtx->instrumentation = (SharedExecutorInstrumentation *)palloc0(instrumentation_len); + cxt->pwCtx->instrumentation->instrument_options = estate->es_instrument; + cxt->pwCtx->instrumentation->instrument_offset = instrument_offset; + cxt->pwCtx->instrumentation->num_workers = nworkers; + cxt->pwCtx->instrumentation->num_plan_nodes = e.nnodes; + Instrumentation *instrument = GetInstrumentationArray(cxt->pwCtx->instrumentation); + for (int i = 0; i < nworkers * e.nnodes; ++i) { + InstrInit(&instrument[i], estate->es_instrument); + } + pei->instrumentation = cxt->pwCtx->instrumentation; + } + + cxt->pwCtx->pscan = (ParallelHeapScanDesc *)palloc0(sizeof(ParallelHeapScanDesc) * e.nnodes); + + /* + * Give parallel-aware nodes a chance to initialize their shared data. + * This also initializes the elements of instrumentation->ps_instrument, + * if it exists. + */ + d.pcxt = pcxt; + d.instrumentation = cxt->pwCtx->instrumentation; + d.nnodes = 0; + + /* Here we switch to old context, cause heap_beginscan_parallel need malloc memory */ + (void)MemoryContextSwitchTo(oldcontext); + (void)ExecParallelInitializeDSM(planstate, &d); + + /* + * Make sure that the world hasn't shifted under our feat. This could + * probably just be an Assert(), but let's be conservative for now. 
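
For concreteness, the instrumentation sizing above can be read as a small worked example (the node and worker counts are made up; everything else follows the struct defined earlier in this file): with e.nnodes = 3 and nworkers = 2, the shared block is a fixed header plus a 3-entry plan_node_id[] array, MAXALIGN'ed to give instrument_offset, followed by 3 * 2 Instrumentation slots.

    Size header_len = MAXALIGN(offsetof(SharedExecutorInstrumentation, plan_node_id) + 3 * sizeof(int));
    Size total_len = header_len + 3 * 2 * sizeof(Instrumentation);
    /* GetInstrumentationArray() then resolves to (char *)sei + sei->instrument_offset */
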
+ */ + if (e.nnodes != d.nnodes) { + ereport(ERROR, (errmsg("inconsistent count of PlanState nodes"))); + } + + /* OK, we're ready to rock and roll. */ + return pei; +} + +/* + * Copy instrumentation information about this node and its descendents from + * dynamic shared memory. + */ +static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) +{ + int i; + int plan_node_id = planstate->plan->plan_node_id; + + /* Find the instumentation for this node. */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) { + if (instrumentation->plan_node_id[i] == plan_node_id) { + break; + } + } + if (i >= instrumentation->num_plan_nodes) { + ereport(ERROR, (errmsg("plan node %d not found", plan_node_id))); + } + + /* Accumulate the statistics from all workers. */ + Instrumentation *instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + for (i = 0; i < instrumentation->num_workers; ++i) { + InstrAggNode(planstate->instrument, &instrument[i]); + } + + /* Also store the per-worker detail. */ + Size ibytes = instrumentation->num_workers * sizeof(Instrumentation); + planstate->worker_instrument = + (WorkerInstrumentation *)palloc(offsetof(WorkerInstrumentation, instrument) + ibytes); + planstate->worker_instrument->num_workers = instrumentation->num_workers; + int rc = memcpy_s(&planstate->worker_instrument->instrument, ibytes, instrument, ibytes); + securec_check(rc, "", ""); + + return planstate_tree_walker(planstate, (bool (*)())ExecParallelRetrieveInstrumentation, instrumentation); +} + + +/* + * Finish parallel execution. We wait for parallel workers to finish, and + * accumulate their buffer usage and instrumentation. + */ +void ExecParallelFinish(ParallelExecutorInfo *pei) +{ + if (pei->finished) + return; + + /* First, wait for the workers to finish. */ + WaitForParallelWorkersToFinish(pei->pcxt); + + /* Next, accumulate buffer usage. */ + for (int i = 0; i < pei->pcxt->nworkers; ++i) + InstrAccumParallelQuery(&pei->buffer_usage[i]); + + /* Finally, accumulate instrumentation, if any. */ + if (pei->instrumentation) { + (void)ExecParallelRetrieveInstrumentation(pei->planstate, pei->instrumentation); + } + + pei->finished = true; +} + +/* + * Clean up whatever ParallelExecutreInfo resources still exist after + * ExecParallelFinish. We separate these routines because someone might + * want to examine the contents of the DSM after ExecParallelFinish and + * before calling this routine. + */ +void ExecParallelCleanup(ParallelExecutorInfo *pei) +{ + if (pei->pcxt != NULL) { + DestroyParallelContext(pei->pcxt); + pei->pcxt = NULL; + } + pfree(pei); +} + +/* + * Create a DestReceiver to write tuples we produce to the shm_mq designated + * for that purpose. + */ +static DestReceiver *ExecParallelGetReceiver(void *seg) +{ + Assert(seg != NULL); + knl_u_parallel_context *cxt = (knl_u_parallel_context *)seg; + + char *mqspace = cxt->pwCtx->tupleQueue; + mqspace += t_thrd.bgworker_cxt.ParallelWorkerNumber * PARALLEL_TUPLE_QUEUE_SIZE; + shm_mq *mq = (shm_mq *)mqspace; + shm_mq_set_sender(mq, t_thrd.proc); + return CreateTupleQueueDestReceiver(shm_mq_attach(mq, seg, NULL)); +} + +/* + * Create a QueryDesc for the PlannedStmt we are to execute, and return it. + */ +static QueryDesc *ExecParallelGetQueryDesc(void *seg, DestReceiver *receiver, int instrument_options) +{ + knl_u_parallel_context *cxt = (knl_u_parallel_context *)seg; + + /* Reconstruct leader-supplied PlannedStmt. 
*/ + PlannedStmt *pstmt = (PlannedStmt *)stringToNode(cxt->pwCtx->pstmt_space); + + /* Reconstruct ParamListInfo. */ + ParamListInfo paramLI = RestoreParamList(cxt->pwCtx->param_space, cxt->pwCtx->param_len); + + /* + * Create a QueryDesc for the query. + * + * It's not obvious how to obtain the query string from here; and even if + * we could copying it would take more cycles than not copying it. But + * it's a bit unsatisfying to just use a dummy string here, so consider + * revising this someday. + */ + return CreateQueryDesc(pstmt, "", GetActiveSnapshot(), InvalidSnapshot, receiver, paramLI, + instrument_options); +} + +/* + * Copy instrumentation information from this node and its descendents into + * dynamic shared memory, so that the parallel leader can retrieve it. + */ +static bool ExecParallelReportInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation) +{ + int i; + int plan_node_id = planstate->plan->plan_node_id; + + InstrEndLoop(planstate->instrument); + + /* + * If we shuffled the plan_node_id values in ps_instrument into sorted + * order, we could use binary search here. This might matter someday + * if we're pushing down sufficiently large plan trees. For now, do it + * the slow, dumb way. + */ + for (i = 0; i < instrumentation->num_plan_nodes; ++i) { + if (instrumentation->plan_node_id[i] == plan_node_id) { + break; + } + } + if (i >= instrumentation->num_plan_nodes) { + ereport(ERROR, (errmsg("plan node %d not found", plan_node_id))); + } + + /* + * Add our statistics to the per-node, per-worker totals. It's possible + * that this could happen more than once if we relaunched workers. + */ + Instrumentation *instrument = GetInstrumentationArray(instrumentation); + instrument += i * instrumentation->num_workers; + Assert(IsParallelWorker()); + Assert(t_thrd.bgworker_cxt.ParallelWorkerNumber < instrumentation->num_workers); + InstrAggNode(&instrument[t_thrd.bgworker_cxt.ParallelWorkerNumber], planstate->instrument); + + return planstate_tree_walker(planstate, (bool (*)())ExecParallelReportInstrumentation, instrumentation); +} + +/* + * Initialize the PlanState and its descendents with the information + * retrieved from shared memory. This has to be done once the PlanState + * is allocated and initialized by executor; that is, after ExecutorStart(). + */ +static bool ExecParallelInitializeWorker(PlanState *planstate, void *context) +{ + if (planstate == NULL) + return false; + + /* Call initializers for parallel-aware plan nodes. */ + if (planstate->plan->parallel_aware) { + switch (nodeTag(planstate)) { + case T_SeqScanState: + ExecSeqScanInitializeWorker((SeqScanState *)planstate, context); + break; + default: + break; + } + } + + return planstate_tree_walker(planstate, (bool (*)())ExecParallelInitializeWorker, context); +} + +/* + * Main entrypoint for parallel query worker processes. + * + * We reach this function from ParallelMain, so the setup necessary to create + * a sensible parallel environment has already been done; ParallelMain worries + * about stuff like the transaction state, combo CID mappings, and GUC values, + * so we don't need to deal with any of that here. + * + * Our job is to deal with concerns specific to the executor. The parallel + * group leader will have stored a serialized PlannedStmt, and it's our job + * to execute that plan and write the resulting tuples to the appropriate + * tuple queue. 
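
The worker-side ExecParallelReportInstrumentation() above and the leader-side ExecParallelRetrieveInstrumentation() address the shared array the same way: one row per plan node (in plan_node_id[] order), one column per worker. A one-line sketch of that addressing, where i is the node's row index and w the worker number (both placeholders):

    Instrumentation *slot = GetInstrumentationArray(instrumentation) + i * instrumentation->num_workers + w;
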
Various bits of supporting information that we need in order + * to do this are also stored in the dsm_segment and can be accessed through + * the shm_toc. + */ +void ParallelQueryMain(void *seg) +{ + int instrument_options = 0; + + /* Set up DestReceiver, SharedExecutorInstrumentation, and QueryDesc. */ + knl_u_parallel_context *cxt = (knl_u_parallel_context *)seg; + DestReceiver *receiver = ExecParallelGetReceiver(seg); + SharedExecutorInstrumentation *instrumentation = cxt->pwCtx->instrumentation; + if (instrumentation != NULL) + instrument_options = instrumentation->instrument_options; + QueryDesc *queryDesc = ExecParallelGetQueryDesc(seg, receiver, instrument_options); + + /* Prepare to track buffer usage during query execution. */ + InstrStartParallelQuery(); + + /* Start up the executor, have it run the plan, and then shut it down. */ + (void)ExecutorStart(queryDesc, 0); + ExecParallelInitializeWorker(queryDesc->planstate, seg); + ExecutorRun(queryDesc, ForwardScanDirection, 0L); + ExecutorFinish(queryDesc); + + /* Report buffer usage during parallel execution. */ + BufferUsage *buffer_usage = cxt->pwCtx->bufUsage; + InstrEndParallelQuery(&buffer_usage[t_thrd.bgworker_cxt.ParallelWorkerNumber]); + + /* Report instrumentation data if any instrumentation options are set. */ + if (instrumentation != NULL) { + (void)ExecParallelReportInstrumentation(queryDesc->planstate, instrumentation); + } + + /* Must do this after capturing instrumentation. */ + ExecutorEnd(queryDesc); + + /* Cleanup. */ + FreeQueryDesc(queryDesc); + (*receiver->rDestroy)(receiver); +} + diff --git a/src/gausskernel/runtime/executor/execProcnode.cpp b/src/gausskernel/runtime/executor/execProcnode.cpp index a448f0d15..77f86b765 100755 --- a/src/gausskernel/runtime/executor/execProcnode.cpp +++ b/src/gausskernel/runtime/executor/execProcnode.cpp @@ -89,6 +89,7 @@ #include "executor/nodeExtensible.h" #include "executor/nodeForeignscan.h" #include "executor/nodeFunctionscan.h" +#include "executor/nodeGather.h" #include "executor/nodeGroup.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" @@ -116,6 +117,7 @@ #include "executor/nodeWindowAgg.h" #include "executor/nodeWorktablescan.h" #include "executor/execStream.h" +#include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/encoding.h" #include "optimizer/ml_model.h" @@ -297,6 +299,8 @@ PlanState* ExecInitNodeByType(Plan* node, EState* e_state, int e_flags) return (PlanState*)ExecInitWindowAgg((WindowAgg*)node, e_state, e_flags); case T_Unique: return (PlanState*)ExecInitUnique((Unique*)node, e_state, e_flags); + case T_Gather: + return (PlanState*)ExecInitGather((Gather*)node, e_state, e_flags); case T_Hash: return (PlanState*)ExecInitHash((Hash*)node, e_state, e_flags); case T_SetOp: @@ -635,6 +639,8 @@ TupleTableSlot* ExecProcNodeByType(PlanState* node) return ExecWindowAgg((WindowAggState*)node); case T_UniqueState: return ExecUnique((UniqueState*)node); + case T_GatherState: + return ExecGather((GatherState*)node); case T_HashState: return ExecHash(); case T_SetOpState: @@ -1085,6 +1091,9 @@ static void ExecEndNodeByType(PlanState* node) case T_TsStoreScanState: ExecEndCStoreScan((CStoreScanState*)node, false); break; + case T_GatherState: + ExecEndGather((GatherState *)node); + break; case T_IndexScanState: ExecEndIndexScan((IndexScanState*)node); break; @@ -1340,3 +1349,29 @@ void ExecEndNode(PlanState* node) } ExecEndNodeByType(node); } + +/* + * ExecShutdownNode + * + * Give execution nodes a chance to stop asynchronous 
resource consumption + * and release any resources still held. Currently, this is only used for + * parallel query, but we might want to extend it to other cases also (e.g. + * FDW). We might also want to call it sooner, as soon as it's evident that + * no more rows will be needed (e.g. when a Limit is filled) rather than only + * at the end of ExecutorRun. + */ +bool ExecShutdownNode(PlanState *node) +{ + if (node == NULL) + return false; + + switch (nodeTag(node)) { + case T_GatherState: + ExecShutdownGather((GatherState *)node); + break; + default: + break; + } + + return planstate_tree_walker(node, (bool (*)())ExecShutdownNode, NULL); +} diff --git a/src/gausskernel/runtime/executor/functions.cpp b/src/gausskernel/runtime/executor/functions.cpp index 1af5d06dd..7bfa62049 100755 --- a/src/gausskernel/runtime/executor/functions.cpp +++ b/src/gausskernel/runtime/executor/functions.cpp @@ -456,7 +456,7 @@ static List* init_execution_state(List* query_tree_list, SQLFunctionCachePtr f_c if (query_tree->commandType == CMD_UTILITY) stmt = query_tree->utilityStmt; else - stmt = (Node*)pg_plan_query(query_tree, 0, NULL); + stmt = (Node*)pg_plan_query(query_tree, f_cache->readonly_func ? CURSOR_OPT_PARALLEL_OK : 0, NULL); /* Precheck all commands for validity in a function */ if (IsA(stmt, TransactionStmt)) @@ -471,6 +471,9 @@ static List* init_execution_state(List* query_tree_list, SQLFunctionCachePtr f_c /* translator: %s is a SQL statement name */ errmsg("%s is not allowed in a non-volatile function", CreateCommandTag(stmt)))); + if (IsInParallelMode() && !CommandIsReadOnly(stmt)) + PreventCommandIfParallelMode(CreateCommandTag((Node *)stmt)); + /* OK, build the execution_state for this query */ new_es = (execution_state*)palloc(sizeof(execution_state)); if (prev_es != NULL) @@ -882,6 +885,7 @@ static void postquel_sub_params(SQLFunctionCachePtr f_cache, FunctionCallInfo fc param_li->parserSetupArg = NULL; param_li->params_need_process = false; param_li->numParams = n_args; + param_li->paramMask = NULL; f_cache->paramLI = param_li; } else { param_li = f_cache->paramLI; diff --git a/src/gausskernel/runtime/executor/instrument.cpp b/src/gausskernel/runtime/executor/instrument.cpp index 6ffb916eb..c7d5a3024 100644 --- a/src/gausskernel/runtime/executor/instrument.cpp +++ b/src/gausskernel/runtime/executor/instrument.cpp @@ -59,6 +59,7 @@ extern const char* GetStreamType(Stream* node); extern void insert_obsscaninfo( uint64 queryid, const char* rel_name, int64 file_count, double scan_data_size, double total_time, int format); +static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void BufferUsageAccumDiff(BufferUsage* dst, const BufferUsage* add, const BufferUsage* sub); static void CPUUsageGetCurrent(CPUUsage* cur); static void CPUUsageAccumDiff(CPUUsage* dst, const CPUUsage* add, const CPUUsage* sub); @@ -454,6 +455,15 @@ Instrumentation* InstrAlloc(int n, int instrument_options) return instr; } +/* Initialize an pre-allocated instrumentation structure. 
*/ +void InstrInit(Instrumentation *instr, int instrument_options) +{ + int rc = memset_s(instr, sizeof(Instrumentation), 0, sizeof(Instrumentation)); + securec_check(rc, "", ""); + instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; + instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; +} + /* Entry to a plan node */ void InstrStartNode(Instrumentation* instr) { @@ -689,11 +699,72 @@ void StreamEndLoop(StreamTime* instr) instr->tuplecount = 0; } -/* +/* aggregate instrumentation information */ +void InstrAggNode(Instrumentation *dst, Instrumentation *add) +{ + if (!dst->running && add->running) { + dst->running = true; + dst->firsttuple = add->firsttuple; + } else if (dst->running && add->running && dst->firsttuple > add->firsttuple) { + dst->firsttuple = add->firsttuple; + } + + INSTR_TIME_ADD(dst->counter, add->counter); + + dst->tuplecount += add->tuplecount; + dst->startup += add->startup; + dst->total += add->total; + dst->ntuples += add->ntuples; + dst->nloops += add->nloops; + dst->nfiltered1 += add->nfiltered1; + dst->nfiltered2 += add->nfiltered2; + + /* Add delta of buffer usage since entry to node's totals */ + if (dst->need_bufusage) + BufferUsageAdd(&dst->bufusage, &add->bufusage); +} + +/* note current values during parallel executor startup */ +void InstrStartParallelQuery(void) +{ + t_thrd.bgworker_cxt.save_pgBufferUsage = u_sess->instr_cxt.pg_buffer_usage; +} + +/* report usage after parallel executor shutdown */ +void InstrEndParallelQuery(BufferUsage *result) +{ + int rc = memset_s(result, sizeof(BufferUsage), 0, sizeof(BufferUsage)); + securec_check(rc, "", ""); + BufferUsageAccumDiff(result, u_sess->instr_cxt.pg_buffer_usage, t_thrd.bgworker_cxt.save_pgBufferUsage); +} + +/* accumulate work done by workers in leader's stats */ +void InstrAccumParallelQuery(BufferUsage *result) +{ + BufferUsageAdd(u_sess->instr_cxt.pg_buffer_usage, result); +} + +static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) +{ + dst->shared_blks_hit += add->shared_blks_hit; + dst->shared_blks_read += add->shared_blks_read; + dst->shared_blks_dirtied += add->shared_blks_dirtied; + dst->shared_blks_written += add->shared_blks_written; + dst->local_blks_hit += add->local_blks_hit; + dst->local_blks_read += add->local_blks_read; + dst->local_blks_dirtied += add->local_blks_dirtied; + dst->local_blks_written += add->local_blks_written; + dst->temp_blks_read += add->temp_blks_read; + dst->temp_blks_written += add->temp_blks_written; + INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); + INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); +} + +/* * BufferUsageAccumDiff - * calculate every element of dst like: dst += add - sub + * calculate every element of dst like: dst += add - sub */ -static void BufferUsageAccumDiff(BufferUsage* dst, const BufferUsage* add, const BufferUsage* sub) +static void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub) { dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; diff --git a/src/gausskernel/runtime/executor/nodeGather.cpp b/src/gausskernel/runtime/executor/nodeGather.cpp new file mode 100644 index 000000000..e9f6f6b8e --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeGather.cpp @@ -0,0 +1,434 @@ +/* ------------------------------------------------------------------------- + * + * nodeGather.c + * Support routines for scanning a plan via multiple workers. 
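
The three parallel-query helpers added above form a small protocol, mirrored by execParallel.cpp: each worker brackets its run with InstrStartParallelQuery()/InstrEndParallelQuery(), writing its buffer-usage delta into its slot of the shared array, and the leader folds every slot back in with InstrAccumParallelQuery() once the workers are done. A sketch of the call order (shared_bufusage, my_worker_number and nworkers are placeholders for the state set up in ExecInitParallelPlan()):

    /* worker side */
    InstrStartParallelQuery();
    /* ... ExecutorRun() ... */
    InstrEndParallelQuery(&shared_bufusage[my_worker_number]);

    /* leader side, after WaitForParallelWorkersToFinish() */
    for (int i = 0; i < nworkers; i++)
        InstrAccumParallelQuery(&shared_bufusage[i]);
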
+ * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * A Gather executor launches parallel workers to run multiple copies of a + * plan. It can also run the plan itself, if the workers are not available + * or have not started up yet. It then merges all of the results it produces + * and the results from the workers into a single output stream. Therefore, + * it will normally be used with a plan where running multiple copies of the + * same plan does not produce duplicate output, such as parallel-aware + * SeqScan. + * + * Alternatively, a Gather node can be configured to use just one worker + * and the single-copy flag can be set. In this case, the Gather node will + * run the plan in one worker and will not execute the plan itself. In + * this case, it simply returns whatever tuples were returned by the worker. + * If a worker cannot be obtained, then it will run the plan itself and + * return the results. Therefore, a plan used with a single-copy Gather + * node need not be parallel-aware. + * + * IDENTIFICATION + * src/backend/executor/nodeGather.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/xact.h" +#include "executor/execdebug.h" +#include "executor/execParallel.h" +#include "executor/nodeGather.h" +#include "executor/nodeSubplan.h" +#include "executor/tqueue.h" +#include "miscadmin.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +static TupleTableSlot *gather_getnext(GatherState *gatherstate); +static HeapTuple gather_readnext(GatherState *gatherstate); +static void ExecShutdownGatherWorkers(GatherState *node); + + +/* ---------------------------------------------------------------- + * ExecInitGather + * ---------------------------------------------------------------- + */ +GatherState *ExecInitGather(Gather *node, EState *estate, int eflags) +{ + bool hasoid = false; + + /* Gather node doesn't have innerPlan node. */ + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + GatherState *gatherstate = makeNode(GatherState); + gatherstate->ps.plan = (Plan *)node; + gatherstate->ps.state = estate; + gatherstate->need_to_scan_locally = !node->single_copy && + u_sess->attr.attr_sql.parallel_leader_participation; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &gatherstate->ps); + + /* + * initialize child expressions + */ + gatherstate->ps.targetlist = (List *)ExecInitExpr((Expr *)node->plan.targetlist, (PlanState *)gatherstate); + gatherstate->ps.qual = (List *)ExecInitExpr((Expr *)node->plan.qual, (PlanState *)gatherstate); + + /* + * tuple table initialization + */ + gatherstate->funnel_slot = ExecInitExtraTupleSlot(estate); + ExecInitResultTupleSlot(estate, &gatherstate->ps); + + /* + * now initialize outer plan + */ + Plan *outerNode = outerPlan(node); + outerPlanState(gatherstate) = ExecInitNode(outerNode, estate, eflags); + + gatherstate->ps.ps_TupFromTlist = false; + + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&gatherstate->ps); + ExecAssignProjectionInfo(&gatherstate->ps, NULL); + + /* + * Initialize funnel slot to same tuple descriptor as outer plan. 
+ */ + if (!ExecContextForcesOids(&gatherstate->ps, &hasoid)) + hasoid = false; + TupleDesc tupDesc = ExecTypeFromTL(outerNode->targetlist, hasoid); + ExecSetSlotDescriptor(gatherstate->funnel_slot, tupDesc); + + return gatherstate; +} + +/* ---------------------------------------------------------------- + * ExecGather(node) + * + * Scans the relation via multiple workers and returns + * the next qualifying tuple. + * ---------------------------------------------------------------- + */ +TupleTableSlot *ExecGather(GatherState *node) +{ + TupleTableSlot *fslot = node->funnel_slot; + int i; + TupleTableSlot *slot = NULL; + TupleTableSlot *resultSlot = NULL; + ExprDoneCond isDone; + + CHECK_FOR_INTERRUPTS(); + + /* + * Initialize the parallel context and workers on first execution. We do + * this on first execution rather than during node initialization, as it + * needs to allocate large dynamic segement, so it is better to do if it + * is really needed. + */ + if (!node->initialized) { + EState *estate = node->ps.state; + Gather *gather = (Gather *)node->ps.plan; + t_thrd.subrole = BACKGROUND_LEADER; + + /* + * Sometimes we might have to run without parallelism; but if + * parallel mode is active then we can try to fire up some workers. + */ + if (gather->num_workers > 0 && IsInParallelMode()) { + bool got_any_worker = false; + + /* Initialize the workers required to execute Gather node. */ + if (!node->pei) + node->pei = ExecInitParallelPlan(node->ps.lefttree, estate, gather->num_workers); + + /* + * Register backend workers. We might not get as many as we + * requested, or indeed any at all. + */ + ParallelContext *pcxt = node->pei->pcxt; + LaunchParallelWorkers(pcxt); + + /* Set up tuple queue readers to read the results. */ + if (pcxt->nworkers > 0) { + node->nreaders = 0; + node->reader = (TupleQueueReader **)palloc(pcxt->nworkers * sizeof(TupleQueueReader *)); + + for (i = 0; i < pcxt->nworkers; ++i) { + if (pcxt->worker[i].bgwhandle == NULL) + continue; + + shm_mq_set_handle(node->pei->tqueue[i], pcxt->worker[i].bgwhandle); + node->reader[node->nreaders++] = + CreateTupleQueueReader(node->pei->tqueue[i], fslot->tts_tupleDescriptor); + got_any_worker = true; + } + } + + /* No workers? Then never mind. */ + if (!got_any_worker) + ExecShutdownGatherWorkers(node); + } + + /* Run plan locally if no workers or not single-copy. */ + node->need_to_scan_locally = (node->reader == NULL) || + (!gather->single_copy && u_sess->attr.attr_sql.parallel_leader_participation); + node->initialized = true; + } + + /* + * Check to see if we're still projecting out tuples from a previous scan + * tuple (because there is a function-returning-set in the projection + * expressions). If so, try to project another one. + */ + if (node->ps.ps_TupFromTlist) { + resultSlot = ExecProject(node->ps.ps_ProjInfo, &isDone); + if (isDone == ExprMultipleResult) + return resultSlot; + /* Done with that source tuple... */ + node->ps.ps_TupFromTlist = false; + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. Note we can't do this + * until we're done projecting. This will also clear any previous tuple + * returned by a TupleQueueReader; to make sure we don't leave a dangling + * pointer around, clear the working slot first. + */ + (void)ExecClearTuple(node->funnel_slot); + ExprContext *econtext = node->ps.ps_ExprContext; + ResetExprContext(econtext); + + /* Get and return the next tuple, projecting if necessary. 
*/ + for (;;) { + /* + * Get next tuple, either from one of our workers, or by running the + * plan ourselves. + */ + slot = gather_getnext(node); + if (TupIsNull(slot)) + return NULL; + + /* + * form the result tuple using ExecProject(), and return it --- unless + * the projection produces an empty set, in which case we must loop + * back around for another tuple + */ + econtext->ecxt_outertuple = slot; + resultSlot = ExecProject(node->ps.ps_ProjInfo, &isDone); + + if (isDone != ExprEndResult) { + node->ps.ps_TupFromTlist = (isDone == ExprMultipleResult); + return resultSlot; + } + } + + return slot; +} + +/* ---------------------------------------------------------------- + * ExecEndGather + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void ExecEndGather(GatherState *node) +{ + ExecShutdownGather(node); + ExecFreeExprContext(&node->ps); + (void)ExecClearTuple(node->ps.ps_ResultTupleSlot); + ExecEndNode(outerPlanState(node)); +} + +/* + * Read the next tuple. We might fetch a tuple from one of the tuple queues + * using gather_readnext, or if no tuple queue contains a tuple and the + * single_copy flag is not set, we might generate one locally instead. + */ +static TupleTableSlot *gather_getnext(GatherState *gatherstate) +{ + PlanState *outerPlan = outerPlanState(gatherstate); + TupleTableSlot *fslot = gatherstate->funnel_slot; + + while (gatherstate->reader != NULL || gatherstate->need_to_scan_locally) { + CHECK_FOR_INTERRUPTS(); + + if (gatherstate->reader != NULL) { + HeapTuple tup = gather_readnext(gatherstate); + if (HeapTupleIsValid(tup)) { + (void)ExecStoreTuple(tup, /* tuple to store */ + fslot, /* slot in which to store the tuple */ + InvalidBuffer, /* buffer associated with this tuple */ + true); /* pfree this pointer if not from heap */ + return fslot; + } + } + + if (gatherstate->need_to_scan_locally) { + TupleTableSlot *outerTupleSlot = ExecProcNode(outerPlan); + + if (!TupIsNull(outerTupleSlot)) + return outerTupleSlot; + + gatherstate->need_to_scan_locally = false; + } + } + + return ExecClearTuple(fslot); +} + +/* + * Attempt to read a tuple from one of our parallel workers. + */ +static HeapTuple gather_readnext(GatherState *gatherstate) +{ + int nvisited = 0; + + for (;;) { + bool readerdone = false; + + /* Check for async events, particularly messages from workers. */ + CHECK_FOR_INTERRUPTS(); + + /* Attempt to read a tuple, but don't block if none is available. */ + TupleQueueReader *reader = gatherstate->reader[gatherstate->nextreader]; + HeapTuple tup = TupleQueueReaderNext(reader, true, &readerdone); + + /* + * If this reader is done, remove it. If all readers are done, + * clean up remaining worker state. + */ + if (readerdone) { + Assert(!tup); + DestroyTupleQueueReader(reader); + --gatherstate->nreaders; + if (gatherstate->nreaders == 0) { + ExecShutdownGatherWorkers(gatherstate); + return NULL; + } + Size remainSize = sizeof(TupleQueueReader *) * (gatherstate->nreaders - gatherstate->nextreader); + if (remainSize != 0) { + int rc = memmove_s(&gatherstate->reader[gatherstate->nextreader], remainSize, + &gatherstate->reader[gatherstate->nextreader + 1], remainSize); + securec_check(rc, "", ""); + } + if (gatherstate->nextreader >= gatherstate->nreaders) { + gatherstate->nextreader = 0; + } + continue; + } + + /* If we got a tuple, return it. */ + if (tup) + return tup; + + /* + * Advance nextreader pointer in round-robin fashion. 
Note that we + * only reach this code if we weren't able to get a tuple from the + * current worker. We used to advance the nextreader pointer after + * every tuple, but it turns out to be much more efficient to keep + * reading from the same queue until that would require blocking. + */ + gatherstate->nextreader++; + if (gatherstate->nextreader >= gatherstate->nreaders) + gatherstate->nextreader = 0; + + /* Have we visited every (surviving) TupleQueueReader? */ + nvisited++; + if (nvisited >= gatherstate->nreaders) { + /* + * If (still) running plan locally, return NULL so caller can + * generate another tuple from the local copy of the plan. + */ + if (gatherstate->need_to_scan_locally) + return NULL; + + /* Nothing to do except wait for developments. */ + (void)WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET, 0); + CHECK_FOR_INTERRUPTS(); + ResetLatch(&t_thrd.proc->procLatch); + nvisited = 0; + } + } +} + +/* ---------------------------------------------------------------- + * ExecShutdownGatherWorkers + * + * Destroy the parallel workers. Collect all the stats after + * workers are stopped, else some work done by workers won't be + * accounted. + * ---------------------------------------------------------------- + */ +static void ExecShutdownGatherWorkers(GatherState *node) +{ + /* Shut down tuple queue readers before shutting down workers. */ + if (node->reader != NULL) { + for (int i = 0; i < node->nreaders; ++i) + DestroyTupleQueueReader(node->reader[i]); + + pfree(node->reader); + node->reader = NULL; + } + + /* Now shut down the workers. */ + if (node->pei != NULL) + ExecParallelFinish(node->pei); +} + +/* ---------------------------------------------------------------- + * ExecShutdownGather + * + * Destroy the setup for parallel workers including parallel context. + * Collect all the stats after workers are stopped, else some work + * done by workers won't be accounted. + * ---------------------------------------------------------------- + */ +void ExecShutdownGather(GatherState *node) +{ + ExecShutdownGatherWorkers(node); + + /* Now destroy the parallel context. */ + if (node->pei != NULL) { + ExecParallelCleanup(node->pei); + node->pei = NULL; + } +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ +/* ---------------------------------------------------------------- + * ExecReScanGather + * + * Re-initialize the workers and rescans a relation via them. + * ---------------------------------------------------------------- + */ +void ExecReScanGather(GatherState *node) +{ + /* + * Re-initialize the parallel workers to perform rescan of relation. + * We want to gracefully shutdown all the workers so that they + * should be able to propagate any error or other information to master + * backend before dying. Parallel context will be reused for rescan. 
+ */ + ExecShutdownGatherWorkers(node); + + node->initialized = false; + + if (node->pei) + ExecParallelReinitialize(node->pei); + + ExecReScan(node->ps.lefttree); +} + diff --git a/src/gausskernel/runtime/executor/nodeSamplescan.cpp b/src/gausskernel/runtime/executor/nodeSamplescan.cpp index 6fb9e0e18..2e2b644fa 100755 --- a/src/gausskernel/runtime/executor/nodeSamplescan.cpp +++ b/src/gausskernel/runtime/executor/nodeSamplescan.cpp @@ -90,6 +90,16 @@ AbsTblScanDesc InitSampleScanDesc(ScanState* scanstate, Relation currentRelation static inline HeapTuple SampleFetchNextTuple(SeqScanState* node) { HeapScanDesc heapScanDesc = GetHeapScanDesc(node->ss_currentScanDesc); + if (heapScanDesc == NULL) { + /* + * We reach here if the scan is not parallel, or if we're executing + * a scan that was intended to be parallel serially. + * It must be a non-partitioned table. + */ + Assert(!node->isPartTbl); + heapScanDesc = (HeapScanDesc)InitSampleScanDesc(node, node->ss_currentRelation); + node->ss_currentScanDesc = (AbsTblScanDesc)heapScanDesc; + } heapScanDesc->rs_ss_accessor = node->ss_scanaccessor; /* @@ -419,7 +429,7 @@ void RowTableSample::getMaxOffset() { HeapScanDesc heapscan = NULL; AbsTblScanDesc scan = sampleScanState->ss_currentScanDesc; - bool pagemode = GetHeapScanDesc(scan)->rs_pageatatime; + bool pagemode = (GetHeapScanDesc(scan)->rs_flags) & SO_ALLOW_PAGEMODE; Page page; Assert(BlockNumberIsValid(currentBlock)); @@ -456,7 +466,7 @@ void RowTableSample::getMaxOffset() ScanValid RowTableSample::scanTup() { HeapScanDesc scan = GetHeapScanDesc(sampleScanState->ss_currentScanDesc); - bool pagemode = scan->rs_pageatatime; + bool pagemode = scan->rs_flags & SO_ALLOW_PAGEMODE; HeapTuple tuple = &(scan->rs_ctup); Snapshot snapshot = scan->rs_snapshot; ItemId itemid; diff --git a/src/gausskernel/runtime/executor/nodeSeqscan.cpp b/src/gausskernel/runtime/executor/nodeSeqscan.cpp index a7611dbac..51ad0a980 100755 --- a/src/gausskernel/runtime/executor/nodeSeqscan.cpp +++ b/src/gausskernel/runtime/executor/nodeSeqscan.cpp @@ -39,6 +39,8 @@ #include "utils/rel_gs.h" #include "nodes/execnodes.h" +static AbsTblScanDesc InitBeginScan(SeqScanState* node, Relation current_relation); + extern void StrategyGetRingPrefetchQuantityAndTrigger(BufferAccessStrategy strategy, int* quantity, int* trigger); /* ---------------------------------------------------------------- * prefetch_pages @@ -202,6 +204,18 @@ static TupleTableSlot* SeqNext(SeqScanState* node) estate = node->ps.state; direction = estate->es_direction; slot = node->ss_ScanTupleSlot; + + if (scanDesc == NULL) { + /* + * We reach here if the scan is not parallel, or if we're executing + * a scan that was intended to be parallel serially. + * It must be a non-partitioned table. + */ + Assert(!node->isPartTbl); + scanDesc = InitBeginScan(node, node->ss_currentRelation); + node->ss_currentScanDesc = scanDesc; + } + GetHeapScanDesc(scanDesc)->rs_ss_accessor = node->ss_scanaccessor; /* @@ -341,10 +355,13 @@ void InitScanRelation(SeqScanState* node, EState* estate) * open that relation and acquire appropriate lock on it. */ current_relation = ExecOpenScanRelation(estate, ((SeqScan*)node->ps.plan)->scanrelid); - if (!node->isPartTbl) { - /* add qual for redis */ - current_scan_desc = InitBeginScan(node, current_relation); + /* + * For non-partitioned table, we will do InitBeginScan later to check whether we can do + * parallel scan or not. Check ExecInitSeqScan and SeqNext for details. 
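+ * (If the scan ends up not running in parallel, SeqNext notices the missing scan
+ * descriptor and starts an ordinary serial scan instead.)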
+ * But we still need to set up the qual here; otherwise ExecScan would have no qual to apply.
+ */
+ (void)reset_scan_qual(current_relation, node);
 } else {
 plan = (SeqScan*)node->ps.plan;
@@ -507,7 +524,15 @@ SeqScanState* ExecInitSeqScan(SeqScan* node, EState* estate, int eflags)
 abs_tbl_init_parallel_seqscan(
 scanstate->ss_currentScanDesc, scanstate->ps.plan->dop, scanstate->partScanDirection);
 } else {
- scanstate->ps.stubType = PST_Scan;
+ /*
+ * For a non-partitioned table, ss_currentScanDesc may still be NULL here, because we
+ * defer starting the scan until we know whether it can run in parallel.
+ * Check InitScanRelation and SeqNext for details.
+ */
+ if (!node->isPartTbl) {
+ scanstate->ps.stubType = PST_None;
+ } else {
+ scanstate->ps.stubType = PST_Scan;
+ }
 }
 scanstate->ps.ps_TupFromTlist = false;
@@ -602,28 +627,88 @@ void ExecReScanSeqScan(SeqScanState* node)
 }
 scan = node->ss_currentScanDesc;
- if (node->isPartTbl) {
- if (PointerIsValid(node->partitions)) {
- /* end scan the prev partition first, */
- abs_tbl_endscan(scan);
- /* finally init Scan for the next partition */
- ExecInitNextPartitionForSeqScan(node);
+ if (scan != NULL) {
+ if (node->isPartTbl) {
+ if (PointerIsValid(node->partitions)) {
+ /* end scan the prev partition first, */
+ abs_tbl_endscan(scan);
- scan = node->ss_currentScanDesc;
+ /* finally init Scan for the next partition */
+ ExecInitNextPartitionForSeqScan(node);
+
+ scan = node->ss_currentScanDesc;
+ }
+ } else {
+ abs_tbl_rescan(scan, NULL);
 }
- } else {
- abs_tbl_rescan(scan, NULL);
- }
- abs_tbl_init_parallel_seqscan(scan, node->ps.plan->dop, node->partScanDirection);
+ abs_tbl_init_parallel_seqscan(scan, node->ps.plan->dop, node->partScanDirection);
+ }
 ExecScanReScan((ScanState*)node);
 }
 /* ----------------------------------------------------------------
- * ExecSeqMarkPos(node)
+ * ExecSeqScanEstimate
 *
- * Marks scan position.
+ * estimates the shared-memory space needed for this node's parallel heap scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void ExecSeqScanEstimate(SeqScanState *node, ParallelContext *pcxt)
+{
+ EState *estate = node->ps.state;
+ node->pscan_len = heap_parallelscan_estimate(estate->es_snapshot);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanInitializeDSM
+ *
+ * Set up a parallel heap scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt, int nodeid)
+{
+ EState *estate = node->ps.state;
+ knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg;
+
+ /* Can't use palloc here: ExecInitParallelPlan has already switched back to the old memory context. */
+ cxt->pwCtx->pscan[nodeid] = (ParallelHeapScanDesc)MemoryContextAllocZero(cxt->memCtx, node->pscan_len);
+ heap_parallelscan_initialize(cxt->pwCtx->pscan[nodeid], node->pscan_len, node->ss_currentRelation,
+ estate->es_snapshot);
+ cxt->pwCtx->pscan[nodeid]->plan_node_id = node->ps.plan->plan_node_id;
+ node->ss_currentScanDesc =
+ (AbsTblScanDesc)heap_beginscan_parallel(node->ss_currentRelation, cxt->pwCtx->pscan[nodeid]);
+}
+
+/* ----------------------------------------------------------------
+ * ExecSeqScanInitializeWorker
+ *
+ * Copy relevant information from TOC into planstate.
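+ * Here the "TOC" is the knl_u_parallel_context populated by the leader; we locate
+ * our ParallelHeapScanDesc by matching plan_node_id and then join the parallel scan.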
+ * ---------------------------------------------------------------- + */ +void ExecSeqScanInitializeWorker(SeqScanState *node, void *context) +{ + ParallelHeapScanDesc pscan = NULL; + knl_u_parallel_context *cxt = (knl_u_parallel_context *)context; + + for (int i = 0; i < cxt->pwCtx->pscan_num; i++) { + if (node->ps.plan->plan_node_id == cxt->pwCtx->pscan[i]->plan_node_id) { + pscan = cxt->pwCtx->pscan[i]; + break; + } + } + + if (pscan == NULL) { + ereport(ERROR, (errmsg("could not find plan info, plan node id:%d", node->ps.plan->plan_node_id))); + } + + node->ss_currentScanDesc = (AbsTblScanDesc)heap_beginscan_parallel(node->ss_currentRelation, pscan); +} + +/* ---------------------------------------------------------------- + * ExecSeqMarkPos(node) + * + * Marks scan position. * ---------------------------------------------------------------- */ void ExecSeqMarkPos(SeqScanState* node) diff --git a/src/gausskernel/runtime/executor/spi.cpp b/src/gausskernel/runtime/executor/spi.cpp index 98343c7d4..8643e1245 100755 --- a/src/gausskernel/runtime/executor/spi.cpp +++ b/src/gausskernel/runtime/executor/spi.cpp @@ -421,6 +421,18 @@ void AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid, bool stpRollback, } } +/* + * Are we executing inside a procedure (that is, a nonatomic SPI context)? + */ +bool SPI_inside_nonatomic_context(void) +{ + if (u_sess->SPI_cxt._current == NULL) + return false; /* not in any SPI context at all */ + if (u_sess->SPI_cxt._current->atomic) + return false; /* it's atomic (ie function not procedure) */ + return true; +} + /* Pushes SPI stack to allow recursive SPI calls */ void SPI_push(void) { @@ -1382,23 +1394,27 @@ static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, ParamL } /* - * If told to be read-only, we'd better check for read-only queries. This - * can't be done earlier because we need to look at the finished, planned - * queries. (In particular, we don't want to do it between GetCachedPlan - * and PortalDefineQuery, because throwing an error between those steps - * would result in leaking our plancache refcount.) + * If told to be read-only, or in parallel mode, verify that this query is + * in fact read-only. This can't be done earlier because we need to look + * at the finished, planned queries. (In particular, we don't want to do + * it between GetCachedPlan and PortalDefineQuery, because throwing an + * error between those steps would result in leaking our plancache refcount.) 
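+ * When the check fires because of parallel mode rather than read-only-ness, the
+ * statement is rejected through PreventCommandIfParallelMode instead.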
*/ - if (read_only) { + if (read_only || IsInParallelMode()) { ListCell *lc = NULL; foreach (lc, stmt_list) { Node *pstmt = (Node *)lfirst(lc); if (!CommandIsReadOnly(pstmt)) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - /* translator: %s is a SQL statement name */ - errmsg("%s is not allowed in a non-volatile function", CreateCommandTag(pstmt)), - errhint("You can change function definition."))); + if (read_only) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is a SQL statement name */ + errmsg("%s is not allowed in a non-volatile function", CreateCommandTag(pstmt)), + errhint("You can change function definition."))); + } else { + PreventCommandIfParallelMode(CreateCommandTag((Node *) pstmt)); + } } } } @@ -2153,6 +2169,10 @@ static int _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, Snapshot sn errmsg("%s is not allowed in a non-volatile function", CreateCommandTag(stmt)))); } + if (IsInParallelMode() && !CommandIsReadOnly(stmt)) { + PreventCommandIfParallelMode(CreateCommandTag((Node *) stmt)); + } + /* * If not read-only mode, advance the command counter before each * command and update the snapshot. @@ -2360,6 +2380,7 @@ static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, Datum *Values param_list_info->parserSetupArg = NULL; param_list_info->params_need_process = false; param_list_info->numParams = nargs; + param_list_info->paramMask = NULL; for (i = 0; i < nargs; i++) { ParamExternData *prm = ¶m_list_info->params[i]; diff --git a/src/gausskernel/runtime/executor/tqueue.cpp b/src/gausskernel/runtime/executor/tqueue.cpp new file mode 100644 index 000000000..96545a9c5 --- /dev/null +++ b/src/gausskernel/runtime/executor/tqueue.cpp @@ -0,0 +1,905 @@ +/* ------------------------------------------------------------------------- + * + * tqueue.c + * Use shm_mq to send & receive tuples between parallel backends + * + * A DestReceiver of type DestTupleQueue, which is a TQueueDestReceiver + * under the hood, writes tuples from the executor to a shm_mq. + * + * A TupleQueueReader reads tuples from a shm_mq and returns the tuples. 
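+ *
+ * Tuples travel through the queue in ordinary HeapTuple format. A one-byte message
+ * switches the stream between "data" and "control" mode; control messages carry
+ * tuple descriptors so that transient record typmods can be remapped by the reader.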
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/tqueue.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup.h" +#include "catalog/pg_type.h" +#include "executor/tqueue.h" +#include "funcapi.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "utils/array.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rangetypes.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +typedef enum { + TQUEUE_REMAP_NONE, /* no special processing required */ + TQUEUE_REMAP_ARRAY, /* array */ + TQUEUE_REMAP_RANGE, /* range */ + TQUEUE_REMAP_RECORD /* composite type, named or anonymous */ +} RemapClass; + +typedef struct { + int natts; + RemapClass mapping[FLEXIBLE_ARRAY_MEMBER]; +} RemapInfo; + +typedef struct { + DestReceiver pub; + shm_mq_handle *handle; + MemoryContext tmpcontext; + HTAB *recordhtab; + char mode; + TupleDesc tupledesc; + RemapInfo *remapinfo; +} TQueueDestReceiver; + +typedef struct RecordTypemodMap { + int remotetypmod; + int localtypmod; +} RecordTypemodMap; + +struct TupleQueueReader { + shm_mq_handle *queue; + char mode; + TupleDesc tupledesc; + RemapInfo *remapinfo; + HTAB *typmodmap; +}; + +#define TUPLE_QUEUE_MODE_CONTROL 'c' +#define TUPLE_QUEUE_MODE_DATA 'd' + +static void tqueueWalk(TQueueDestReceiver *tqueue, RemapClass walktype, Datum value); +static void tqueueWalkRecord(TQueueDestReceiver *tqueue, Datum value); +static void tqueueWalkArray(TQueueDestReceiver *tqueue, Datum value); +static void tqueueWalkRange(TQueueDestReceiver *tqueue, Datum value); +static void tqueueSendTypmodInfo(TQueueDestReceiver *tqueue, int typmod, TupleDesc tupledesc); +static void TupleQueueHandleControlMessage(TupleQueueReader *reader, Size nbytes, char *data); +static HeapTuple TupleQueueHandleDataMessage(TupleQueueReader *reader, Size nbytes, HeapTupleHeader data); +static HeapTuple TupleQueueRemapTuple(TupleQueueReader *reader, TupleDesc tupledesc, RemapInfo *remapinfo, + HeapTuple tuple); +static Datum TupleQueueRemap(TupleQueueReader *reader, RemapClass remapclass, Datum value); +static Datum TupleQueueRemapArray(TupleQueueReader *reader, Datum value); +static Datum TupleQueueRemapRange(TupleQueueReader *reader, Datum value); +static Datum TupleQueueRemapRecord(TupleQueueReader *reader, Datum value); +static RemapClass GetRemapClass(Oid type_id); +static RemapInfo *BuildRemapInfo(TupleDesc tupledesc); + + +/* + * Receive a tuple from a query, and send it to the designated shm_mq. + * + * Returns true if successful, false if shm_mq has been detached. + */ +static void tqueueReceiveSlot(TupleTableSlot *slot, DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *)self; + TupleDesc tupledesc = slot->tts_tupleDescriptor; + + /* + * Test to see whether the tupledesc has changed; if so, set up for the + * new tupledesc. This is a strange test both because the executor really + * shouldn't change the tupledesc, and also because it would be unsafe if + * the old tupledesc could be freed and a new one allocated at the same + * address. But since some very old code in printtup.c uses a similar + * test, we adopt it here as well. 
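+ * If the descriptor does change, we simply discard the old remap info and rebuild
+ * it for the new descriptor.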
+ */ + if (tqueue->tupledesc != tupledesc) { + if (tqueue->remapinfo != NULL) + pfree(tqueue->remapinfo); + tqueue->remapinfo = BuildRemapInfo(tupledesc); + tqueue->tupledesc = tupledesc; + } + + HeapTuple tuple = ExecMaterializeSlot(slot); + + /* + * When, because of the types being transmitted, no record typemod mapping + * can be needed, we can skip a good deal of work. + */ + if (tqueue->remapinfo != NULL) { + RemapInfo *remapinfo = tqueue->remapinfo; + MemoryContext oldcontext = NULL; + + /* Deform the tuple so we can examine it, if not done already. */ + slot_getallattrs(slot); + + /* Iterate over each attribute and search it for transient typemods. */ + Assert(slot->tts_tupleDescriptor->natts == remapinfo->natts); + for (AttrNumber i = 0; i < remapinfo->natts; ++i) { + /* Ignore nulls and types that don't need special handling. */ + if (slot->tts_isnull[i] || remapinfo->mapping[i] == TQUEUE_REMAP_NONE) + continue; + + /* Switch to temporary memory context to avoid leaking. */ + if (oldcontext == NULL) { + if (tqueue->tmpcontext == NULL) + tqueue->tmpcontext = AllocSetContextCreate(TopMemoryContext, "tqueue temporary context", + ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(tqueue->tmpcontext); + } + + /* Invoke the appropriate walker function. */ + tqueueWalk(tqueue, remapinfo->mapping[i], slot->tts_values[i]); + } + + /* If we used the temp context, reset it and restore prior context. */ + if (oldcontext != NULL) { + (void)MemoryContextSwitchTo(oldcontext); + MemoryContextReset(tqueue->tmpcontext); + } + + /* If we entered control mode, switch back to data mode. */ + if (tqueue->mode != TUPLE_QUEUE_MODE_DATA) { + tqueue->mode = TUPLE_QUEUE_MODE_DATA; + (void)shm_mq_send(tqueue->handle, sizeof(char), &tqueue->mode, false); + } + } + + /* Send the tuple itself. */ + (void)shm_mq_send(tqueue->handle, tuple->t_len, tuple->t_data, false); +} + +/* + * Invoke the appropriate walker function based on the given RemapClass. + */ +static void tqueueWalk(TQueueDestReceiver *tqueue, RemapClass walktype, Datum value) +{ + check_stack_depth(); + + switch (walktype) { + case TQUEUE_REMAP_NONE: + break; + case TQUEUE_REMAP_ARRAY: + tqueueWalkArray(tqueue, value); + break; + case TQUEUE_REMAP_RANGE: + tqueueWalkRange(tqueue, value); + break; + case TQUEUE_REMAP_RECORD: + tqueueWalkRecord(tqueue, value); + break; + } +} + +/* + * Walk a record and send control messages for transient record types + * contained therein. + */ +static void tqueueWalkRecord(TQueueDestReceiver *tqueue, Datum value) +{ + /* Extract typmod from tuple. */ + HeapTupleHeader tup = DatumGetHeapTupleHeader(value); + Oid type_id = HeapTupleHeaderGetTypeId(tup); + int32 typmod = HeapTupleHeaderGetTypMod(tup); + + /* Look up tuple descriptor in typecache. */ + TupleDesc tupledesc = lookup_rowtype_tupdesc(type_id, typmod); + + /* + * If this is a transient record time, send its TupleDesc as a control + * message. (tqueueSendTypemodInfo is smart enough to do this only once + * per typmod.) + */ + if (type_id == RECORDOID) + tqueueSendTypmodInfo(tqueue, typmod, tupledesc); + + /* + * Build the remap information for this tupledesc. We might want to think + * about keeping a cache of this information keyed by typeid and typemod, + * but let's keep it simple for now. + */ + RemapInfo *remapinfo = BuildRemapInfo(tupledesc); + + /* + * If remapping is required, deform the tuple and process each field. 
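+ * Each non-null field is fed back through tqueueWalk, so nested arrays, ranges,
+ * and records are handled recursively.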
When + * BuildRemapInfo is null, the data types are such that there can be no + * transient record types here, so we can skip all this work. + */ + if (remapinfo != NULL) { + HeapTupleData tdata; + + /* Deform the tuple so we can check each column within. */ + Datum *values = (Datum *)palloc(tupledesc->natts * sizeof(Datum)); + bool *isnull = (bool *)palloc(tupledesc->natts * sizeof(bool)); + tdata.t_len = HeapTupleHeaderGetDatumLength(tup); + ItemPointerSetInvalid(&(tdata.t_self)); + tdata.t_tableOid = InvalidOid; + tdata.t_data = tup; + heap_deform_tuple(&tdata, tupledesc, values, isnull); + + /* Recursively check each non-NULL attribute. */ + for (AttrNumber i = 0; i < tupledesc->natts; ++i) { + if (!isnull[i]) { + tqueueWalk(tqueue, remapinfo->mapping[i], values[i]); + } + } + } + + /* Release reference count acquired by lookup_rowtype_tupdesc. */ + DecrTupleDescRefCount(tupledesc); +} + +/* + * Walk a record and send control messages for transient record types + * contained therein. + */ +static void tqueueWalkArray(TQueueDestReceiver *tqueue, Datum value) +{ + ArrayType *arr = DatumGetArrayTypeP(value); + Oid type_id = ARR_ELEMTYPE(arr); + int16 typlen; + bool typbyval = false; + char typalign; + Datum *elem_values = NULL; + bool *elem_nulls = NULL; + int num_elems; + RemapClass remapclass = GetRemapClass(type_id); + + /* + * If the elements of the array don't need to be walked, we shouldn't have + * been called in the first place: GetRemapClass should have returned NULL + * when asked about this array type. + */ + Assert(remapclass != TQUEUE_REMAP_NONE); + + /* Deconstruct the array. */ + get_typlenbyvalalign(type_id, &typlen, &typbyval, &typalign); + deconstruct_array(arr, type_id, typlen, typbyval, typalign, &elem_values, &elem_nulls, &num_elems); + + /* Walk each element. */ + for (int i = 0; i < num_elems; ++i) { + if (!elem_nulls[i]) { + tqueueWalk(tqueue, remapclass, elem_values[i]); + } + } +} + +/* + * Walk a range type and send control messages for transient record types + * contained therein. + */ +static void tqueueWalkRange(TQueueDestReceiver *tqueue, Datum value) +{ + RangeType *range = DatumGetRangeType(value); + Oid type_id = RangeTypeGetOid(range); + RangeBound lower; + RangeBound upper; + bool empty = false; + + /* + * Extract the lower and upper bounds. It might be worth implementing + * some caching scheme here so that we don't look up the same typeids in + * the type cache repeatedly, but for now let's keep it simple. + */ + TypeCacheEntry *typcache = lookup_type_cache(type_id, TYPECACHE_RANGE_INFO); + if (typcache->rngelemtype == NULL) + ereport(ERROR, (errmsg("type %u is not a range type", type_id))); + range_deserialize(typcache, range, &lower, &upper, &empty); + + /* Nothing to do for an empty range. */ + if (empty) { + return; + } + + /* + * If the range bounds don't need to be walked, we shouldn't have been + * called in the first place: GetRemapClass should have returned NULL when + * asked about this range type. + */ + RemapClass remapclass = GetRemapClass(type_id); + Assert(remapclass != TQUEUE_REMAP_NONE); + + /* Walk each bound, if present. */ + if (!upper.infinite) + tqueueWalk(tqueue, remapclass, upper.val); + if (!lower.infinite) + tqueueWalk(tqueue, remapclass, lower.val); +} + +/* + * Send tuple descriptor information for a transient typemod, unless we've + * already done so previously. 
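+ * The recordhtab hash table, keyed by typmod, remembers which descriptors have
+ * already been sent over this queue.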
+ */ +static void tqueueSendTypmodInfo(TQueueDestReceiver *tqueue, int typmod, TupleDesc tupledesc) +{ + StringInfoData buf; + bool found = false; + AttrNumber i; + + /* Initialize hash table if not done yet. */ + if (tqueue->recordhtab == NULL) { + HASHCTL ctl; + + ctl.keysize = sizeof(int); + ctl.entrysize = sizeof(int); + ctl.hcxt = TopMemoryContext; + tqueue->recordhtab = hash_create("tqueue record hashtable", 100, &ctl, HASH_ELEM | HASH_CONTEXT); + } + + /* Have we already seen this record type? If not, must report it. */ + (void)hash_search(tqueue->recordhtab, &typmod, HASH_ENTER, &found); + if (found) { + return; + } + + /* If message queue is in data mode, switch to control mode. */ + if (tqueue->mode != TUPLE_QUEUE_MODE_CONTROL) { + tqueue->mode = TUPLE_QUEUE_MODE_CONTROL; + (void)shm_mq_send(tqueue->handle, sizeof(char), &tqueue->mode, false); + } + + /* Assemble a control message. */ + initStringInfo(&buf); + appendBinaryStringInfo(&buf, (char *)&typmod, sizeof(int)); + appendBinaryStringInfo(&buf, (char *)&tupledesc->natts, sizeof(int)); + appendBinaryStringInfo(&buf, (char *)&tupledesc->tdhasoid, sizeof(bool)); + for (i = 0; i < tupledesc->natts; ++i) + appendBinaryStringInfo(&buf, (char *)tupledesc->attrs[i], sizeof(FormData_pg_attribute)); + + /* Send control message. */ + (void)shm_mq_send(tqueue->handle, buf.len, buf.data, false); +} + + +/* + * Prepare to receive tuples from executor. + */ +static void tqueueStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + /* do nothing */ +} + +/* + * Clean up at end of an executor run + */ +static void tqueueShutdownReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *)self; + + if (tqueue->handle != NULL) { + shm_mq_detach(tqueue->handle); + tqueue->handle = NULL; + } +} + +/* + * Destroy receiver when done with it + */ +static void tqueueDestroyReceiver(DestReceiver *self) +{ + TQueueDestReceiver *tqueue = (TQueueDestReceiver *)self; + + if (tqueue->tmpcontext != NULL) + MemoryContextDelete(tqueue->tmpcontext); + if (tqueue->recordhtab != NULL) + hash_destroy(tqueue->recordhtab); + if (tqueue->remapinfo != NULL) + pfree(tqueue->remapinfo); + pfree(self); +} + +/* + * Create a DestReceiver that writes tuples to a tuple queue. + */ +DestReceiver *CreateTupleQueueDestReceiver(shm_mq_handle *handle) +{ + TQueueDestReceiver *self = (TQueueDestReceiver *)palloc0(sizeof(TQueueDestReceiver)); + + self->pub.receiveSlot = tqueueReceiveSlot; + self->pub.rStartup = tqueueStartupReceiver; + self->pub.rShutdown = tqueueShutdownReceiver; + self->pub.rDestroy = tqueueDestroyReceiver; + self->pub.mydest = DestTupleQueue; + self->handle = handle; + self->tmpcontext = NULL; + self->recordhtab = NULL; + self->mode = TUPLE_QUEUE_MODE_DATA; + self->remapinfo = NULL; + + return (DestReceiver *)self; +} + +/* + * Create a tuple queue reader. + */ +TupleQueueReader *CreateTupleQueueReader(shm_mq_handle *handle, TupleDesc tupledesc) +{ + TupleQueueReader *reader = (TupleQueueReader *)palloc0(sizeof(TupleQueueReader)); + + reader->queue = handle; + reader->mode = TUPLE_QUEUE_MODE_DATA; + reader->tupledesc = tupledesc; + reader->remapinfo = BuildRemapInfo(tupledesc); + + return reader; +} + +/* + * Destroy a tuple queue reader. + * + * Note: cleaning up the underlying shm_mq is the caller's responsibility. + * We won't access it here, as it may be detached already. 
+ */ +void DestroyTupleQueueReader(TupleQueueReader *reader) +{ + if (reader->queue != NULL) { + shm_mq_detach(reader->queue); + reader->queue = NULL; + } + if (reader->remapinfo != NULL) + pfree(reader->remapinfo); + pfree(reader); +} + +/* + * Fetch a tuple from a tuple queue reader. + * + * The return value is NULL if there are no remaining tuples or if + * nowait = true and no tuple is ready to return. *done, if not NULL, + * is set to true when there are no remaining tuples and otherwise to false. + * + * The returned tuple, if any, is allocated in CurrentMemoryContext. + * Note that this routine must not leak memory! (We used to allow that, + * but not any more.) + * + * Even when shm_mq_receive() returns SHM_MQ_WOULD_BLOCK, this can still + * accumulate bytes from a partially-read message, so it's useful to call + * this with nowait = true even if nothing is returned. + */ +HeapTuple TupleQueueReaderNext(TupleQueueReader *reader, bool nowait, bool *done) +{ + if (done != NULL) + *done = false; + + for (;;) { + Size nbytes; + void *data = NULL; + /* Attempt to read a message. */ + shm_mq_result result = shm_mq_receive(reader->queue, &nbytes, &data, nowait); + /* If queue is detached, set *done and return NULL. */ + if (result == SHM_MQ_DETACHED) { + if (done != NULL) + *done = true; + return NULL; + } + + /* In non-blocking mode, bail out if no message ready yet. */ + if (result == SHM_MQ_WOULD_BLOCK) + return NULL; + Assert(result == SHM_MQ_SUCCESS); + + /* + * OK, we got a message. Process it. + * + * One-byte messages are mode switch messages, so that we can switch + * between "control" and "data" mode. When in "data" mode, each + * message (unless exactly one byte) is a tuple. When in "control" + * mode, each message provides a transient-typmod-to-tupledesc mapping + * so we can interpret future tuples. + */ + if (nbytes == 1) { + /* Mode switch message. */ + reader->mode = ((char *)data)[0]; + } else if (reader->mode == TUPLE_QUEUE_MODE_DATA) { + /* Tuple data. */ + return TupleQueueHandleDataMessage(reader, nbytes, (HeapTupleHeader)data); + } else if (reader->mode == TUPLE_QUEUE_MODE_CONTROL) { + /* Control message, describing a transient record type. */ + TupleQueueHandleControlMessage(reader, nbytes, (char *)data); + } else { + ereport(ERROR, (errmsg("invalid mode: %d", (int)reader->mode))); + } + } +} + +/* + * Handle a data message - that is, a tuple - from the remote side. + */ +static HeapTuple TupleQueueHandleDataMessage(TupleQueueReader *reader, Size nbytes, HeapTupleHeader data) +{ + HeapTupleData htup; + + ItemPointerSetInvalid(&htup.t_self); + htup.t_tableOid = InvalidOid; + htup.t_len = (uint32)nbytes; + htup.t_data = data; + + return TupleQueueRemapTuple(reader, reader->tupledesc, reader->remapinfo, &htup); +} + +/* + * Remap tuple typmods per control information received from remote side. + */ +static HeapTuple TupleQueueRemapTuple(TupleQueueReader *reader, TupleDesc tupledesc, RemapInfo *remapinfo, + HeapTuple tuple) +{ + /* + * If no remapping is necessary, just copy the tuple into a single + * palloc'd chunk, as caller will expect. + */ + if (remapinfo == NULL) + return heap_copytuple(tuple); + + /* Deform tuple so we can remap record typmods for individual attrs. */ + Datum *values = (Datum *)palloc(tupledesc->natts * sizeof(Datum)); + bool *isnull = (bool *)palloc(tupledesc->natts * sizeof(bool)); + heap_deform_tuple(tuple, tupledesc, values, isnull); + Assert(tupledesc->natts == remapinfo->natts); + + /* Recursively check each non-NULL attribute. 
*/ + for (int i = 0; i < tupledesc->natts; ++i) { + if (isnull[i] || remapinfo->mapping[i] == TQUEUE_REMAP_NONE) + continue; + values[i] = TupleQueueRemap(reader, remapinfo->mapping[i], values[i]); + } + + /* Reform the modified tuple. */ + return heap_form_tuple(tupledesc, values, isnull); +} + +/* + * Remap a value based on the specified remap class. + */ +static Datum TupleQueueRemap(TupleQueueReader *reader, RemapClass remapclass, Datum value) +{ + check_stack_depth(); + + switch (remapclass) { + case TQUEUE_REMAP_NONE: + /* caller probably shouldn't have called us at all, but... */ + return value; + + case TQUEUE_REMAP_ARRAY: + return TupleQueueRemapArray(reader, value); + + case TQUEUE_REMAP_RANGE: + return TupleQueueRemapRange(reader, value); + + case TQUEUE_REMAP_RECORD: + return TupleQueueRemapRecord(reader, value); + } + + ereport(ERROR, (errmsg("unknown remap class: %d", (int)remapclass))); + return (Datum)0; +} + +/* + * Remap an array. + */ +static Datum TupleQueueRemapArray(TupleQueueReader *reader, Datum value) +{ + ArrayType *arr = DatumGetArrayTypeP(value); + Oid type_id = ARR_ELEMTYPE(arr); + int16 typlen; + bool typbyval; + char typalign; + Datum *elem_values = NULL; + bool *elem_nulls = NULL; + int num_elems; + RemapClass remapclass = GetRemapClass(type_id); + + /* + * If the elements of the array don't need to be walked, we shouldn't have + * been called in the first place: GetRemapClass should have returned NULL + * when asked about this array type. + */ + Assert(remapclass != TQUEUE_REMAP_NONE); + + /* Deconstruct the array. */ + get_typlenbyvalalign(type_id, &typlen, &typbyval, &typalign); + deconstruct_array(arr, type_id, typlen, typbyval, typalign, &elem_values, &elem_nulls, &num_elems); + + /* Remap each element. */ + for (int i = 0; i < num_elems; ++i) { + if (!elem_nulls[i]) { + elem_values[i] = TupleQueueRemap(reader, remapclass, elem_values[i]); + } + } + + /* Reconstruct and return the array. */ + arr = construct_md_array(elem_values, elem_nulls, ARR_NDIM(arr), ARR_DIMS(arr), ARR_LBOUND(arr), type_id, typlen, + typbyval, typalign); + return PointerGetDatum(arr); +} + +/* + * Remap a range type. + */ +static Datum TupleQueueRemapRange(TupleQueueReader *reader, Datum value) +{ + RangeType *range = DatumGetRangeType(value); + Oid type_id = RangeTypeGetOid(range); + RangeBound lower; + RangeBound upper; + bool empty = false; + + /* + * Extract the lower and upper bounds. As in tqueueWalkRange, some + * caching might be a good idea here. + */ + TypeCacheEntry *typcache = lookup_type_cache(type_id, TYPECACHE_RANGE_INFO); + if (typcache->rngelemtype == NULL) + ereport(ERROR, (errmsg("type %u is not a range type", type_id))); + range_deserialize(typcache, range, &lower, &upper, &empty); + + /* Nothing to do for an empty range. */ + if (empty) + return value; + + /* + * If the range bounds don't need to be walked, we shouldn't have been + * called in the first place: GetRemapClass should have returned NULL when + * asked about this range type. + */ + RemapClass remapclass = GetRemapClass(type_id); + Assert(remapclass != TQUEUE_REMAP_NONE); + + /* Remap each bound, if present. */ + if (!upper.infinite) + upper.val = TupleQueueRemap(reader, remapclass, upper.val); + if (!lower.infinite) + lower.val = TupleQueueRemap(reader, remapclass, lower.val); + + /* And reserialize. */ + range = range_serialize(typcache, &lower, &upper, empty); + return RangeTypeGetDatum(range); +} + +/* + * Remap a record. 
+ */
+static Datum TupleQueueRemapRecord(TupleQueueReader *reader, Datum value)
+{
+ HeapTupleData htup;
+
+ /* Fetch type OID and typemod. */
+ HeapTupleHeader tup = DatumGetHeapTupleHeader(value);
+ Oid type_id = HeapTupleHeaderGetTypeId(tup);
+ int typmod = HeapTupleHeaderGetTypMod(tup);
+
+ /* If transient record, replace remote typmod with local typmod. */
+ if (type_id == RECORDOID) {
+ Assert(reader->typmodmap != NULL);
+ RecordTypemodMap *mapent = (RecordTypemodMap *)hash_search(reader->typmodmap, &typmod, HASH_FIND, NULL);
+ if (mapent == NULL)
+ ereport(ERROR, (errmsg("found unrecognized remote typmod %d", typmod)));
+ typmod = mapent->localtypmod;
+ }
+
+ /*
+ * Fetch tupledesc and compute remap info. We should probably cache this
+ * so that we don't have to keep recomputing it.
+ */
+ TupleDesc tupledesc = lookup_rowtype_tupdesc(type_id, typmod);
+ RemapInfo *remapinfo = BuildRemapInfo(tupledesc);
+ DecrTupleDescRefCount(tupledesc);
+
+ /* Remap tuple. */
+ ItemPointerSetInvalid(&htup.t_self);
+ htup.t_tableOid = InvalidOid;
+ htup.t_len = HeapTupleHeaderGetDatumLength(tup);
+ htup.t_data = tup;
+ HeapTuple atup = TupleQueueRemapTuple(reader, tupledesc, remapinfo, &htup);
+ HeapTupleHeaderSetTypeId(atup->t_data, type_id);
+ HeapTupleHeaderSetTypMod(atup->t_data, typmod);
+ HeapTupleHeaderSetDatumLength(atup->t_data, htup.t_len);
+
+ /* And return the results. */
+ return HeapTupleGetDatum(atup);
+}
+
+/*
+ * Handle a control message from the tuple queue reader.
+ *
+ * Control messages are sent when the remote side is sending tuples that
+ * contain transient record types. We need to arrange to bless those
+ * record types locally and translate between remote and local typmods.
+ */
+static void TupleQueueHandleControlMessage(TupleQueueReader *reader, Size nbytes, char *data)
+{
+ int natts;
+ int remotetypmod;
+ bool hasoid = false;
+ char *buf = data;
+ Size rc = 0;
+ int i;
+ Form_pg_attribute *attrs;
+ MemoryContext oldcontext;
+ TupleDesc tupledesc;
+ RecordTypemodMap *mapent;
+ bool found;
+
+ /* Extract remote typmod. */
+ int errorno = memcpy_s(&remotetypmod, nbytes, &buf[rc], sizeof(int));
+ securec_check_c(errorno, "", "");
+ nbytes -= sizeof(int);
+ rc += sizeof(int);
+
+ /* Extract attribute count. */
+ errorno = memcpy_s(&natts, nbytes, &buf[rc], sizeof(int));
+ securec_check_c(errorno, "", "");
+ nbytes -= sizeof(int);
+ rc += sizeof(int);
+
+ /* Extract hasoid flag. */
+ errorno = memcpy_s(&hasoid, nbytes, &buf[rc], sizeof(bool));
+ securec_check_c(errorno, "", "");
+ nbytes -= sizeof(bool);
+ rc += sizeof(bool);
+
+ /* Extract attribute details. */
+ oldcontext = MemoryContextSwitchTo(t_thrd.mem_cxt.cur_transaction_mem_cxt);
+ attrs = (Form_pg_attribute *)palloc(natts * sizeof(Form_pg_attribute));
+ for (i = 0; i < natts; ++i) {
+ attrs[i] = (Form_pg_attribute)palloc(sizeof(FormData_pg_attribute));
+ errorno = memcpy_s(attrs[i], nbytes, &buf[rc], sizeof(FormData_pg_attribute));
+ securec_check_c(errorno, "", "");
+ nbytes -= sizeof(FormData_pg_attribute);
+ rc += sizeof(FormData_pg_attribute);
+ }
+ (void)MemoryContextSwitchTo(oldcontext);
+
+ /* We should have consumed the whole message; nbytes was counted down as we read. */
+ Assert(nbytes == 0);
+
+ /* Construct TupleDesc. */
+ tupledesc = CreateTupleDesc(natts, hasoid, attrs);
+ tupledesc = BlessTupleDesc(tupledesc);
+
+ /* Create map if it doesn't exist already. */
+ if (reader->typmodmap == NULL) {
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(int);
+ ctl.entrysize = sizeof(RecordTypemodMap);
+ ctl.hcxt = t_thrd.mem_cxt.cur_transaction_mem_cxt;
+ reader->typmodmap = hash_create("typmodmap hashtable", 100, &ctl, HASH_ELEM | HASH_CONTEXT);
+ }
+
+ /* Create map entry. */
+ mapent = (RecordTypemodMap *)hash_search(reader->typmodmap, &remotetypmod, HASH_ENTER, &found);
+ if (found)
+ ereport(ERROR, (errmsg("duplicate message for typmod %d", remotetypmod)));
+ mapent->localtypmod = tupledesc->tdtypmod;
+ ereport(DEBUG3, (errmsg("mapping remote typmod %d to local typmod %d", remotetypmod, tupledesc->tdtypmod)));
+}
+
+/*
+ * Build a mapping indicating what remapping class applies to each attribute
+ * described by a tupledesc.
+ */
+static RemapInfo *BuildRemapInfo(TupleDesc tupledesc)
+{
+ Size size;
+ AttrNumber i;
+ bool noop = true;
+
+ size = offsetof(RemapInfo, mapping) + sizeof(RemapClass) * tupledesc->natts;
+ RemapInfo *remapinfo = (RemapInfo *)MemoryContextAllocZero(TopMemoryContext, size);
+ remapinfo->natts = tupledesc->natts;
+ for (i = 0; i < tupledesc->natts; ++i) {
+ Form_pg_attribute attr = tupledesc->attrs[i];
+
+ if (attr->attisdropped) {
+ remapinfo->mapping[i] = TQUEUE_REMAP_NONE;
+ continue;
+ }
+
+ remapinfo->mapping[i] = GetRemapClass(attr->atttypid);
+ if (remapinfo->mapping[i] != TQUEUE_REMAP_NONE)
+ noop = false;
+ }
+
+ if (noop) {
+ pfree(remapinfo);
+ remapinfo = NULL;
+ }
+
+ return remapinfo;
+}
+
+/*
+ * Determine the remap class associated with a particular data type.
+ *
+ * Transient record types need to have the typmod applied on the sending side
+ * replaced with a value on the receiving side that has the same meaning.
+ *
+ * Arrays, range types, and all record types (including named composite types)
+ * need to be searched for transient record values buried within them.
+ * Surprisingly, a walker is required even when the indicated type is a
+ * composite type, because the actual value may be a compatible transient
+ * record type.
+ */
+static RemapClass GetRemapClass(Oid type_id)
+{
+ RemapClass forceResult = TQUEUE_REMAP_NONE;
+ RemapClass innerResult = TQUEUE_REMAP_NONE;
+
+ for (;;) {
+ /* Simple cases. */
+ if (type_id == RECORDOID) {
+ innerResult = TQUEUE_REMAP_RECORD;
+ break;
+ }
+ if (type_id == RECORDARRAYOID) {
+ innerResult = TQUEUE_REMAP_ARRAY;
+ break;
+ }
+
+ /* Otherwise, we need a syscache lookup to figure it out. */
+ HeapTuple tup = SearchSysCache1((int)TYPEOID, ObjectIdGetDatum(type_id));
+ if (!HeapTupleIsValid(tup))
+ ereport(ERROR, (errmsg("cache lookup failed for type %u", type_id)));
+ Form_pg_type typ = (Form_pg_type)GETSTRUCT(tup);
+ /* Look through domains to underlying base type. */
+ if (typ->typtype == TYPTYPE_DOMAIN) {
+ type_id = typ->typbasetype;
+ ReleaseSysCache(tup);
+ continue;
+ }
+
+ /*
+ * Look through arrays to underlying base type, but the final return
+ * value must be either TQUEUE_REMAP_ARRAY or TQUEUE_REMAP_NONE. (If
+ * this is an array of integers, for example, we don't need to walk
+ * it.)
+ */
+ if (OidIsValid(typ->typelem) && typ->typlen == -1) {
+ type_id = typ->typelem;
+ ReleaseSysCache(tup);
+ if (forceResult == TQUEUE_REMAP_NONE) {
+ forceResult = TQUEUE_REMAP_ARRAY;
+ }
+ continue;
+ }
+
+ /*
+ * Similarly, look through ranges to the underlying base type, but the
+ * final return value must be either TQUEUE_REMAP_RANGE or
+ * TQUEUE_REMAP_NONE.
+ */ + if (typ->typtype == TYPTYPE_RANGE) { + ReleaseSysCache(tup); + if (forceResult == TQUEUE_REMAP_NONE) { + forceResult = TQUEUE_REMAP_RANGE; + } + type_id = get_range_subtype(type_id); + continue; + } + + /* Walk composite types. Nothing else needs special handling. */ + if (typ->typtype == TYPTYPE_COMPOSITE) { + innerResult = TQUEUE_REMAP_RECORD; + } + ReleaseSysCache(tup); + break; + } + + if (innerResult != TQUEUE_REMAP_NONE && forceResult != TQUEUE_REMAP_NONE) { + return forceResult; + } + return innerResult; +} + diff --git a/src/gausskernel/storage/access/hbstore/hbucket_am.cpp b/src/gausskernel/storage/access/hbstore/hbucket_am.cpp index ccfc8e232..baec56740 100755 --- a/src/gausskernel/storage/access/hbstore/hbucket_am.cpp +++ b/src/gausskernel/storage/access/hbstore/hbucket_am.cpp @@ -285,7 +285,7 @@ static HeapTuple switch_and_scan_next_tbl_hbkt(HBktTblScanDesc hp_scan, ScanDire (void)reset_scan_qual(next_bkt_rel, hp_scan->scanState); next_bkt_scan = heap_beginscan(next_bkt_rel, curr_bkt_scan->rs_snapshot, curr_bkt_scan->rs_nkeys, curr_bkt_scan->rs_key, - curr_bkt_scan->rs_isRangeScanInRedis); + curr_bkt_scan->rs_flags & SO_TYPE_RANGESCAN); try_init_bucket_parallel(next_bkt_scan, hp_scan->scanState); @@ -347,8 +347,8 @@ bool hbkt_sampling_scan_nextbucket(HBktTblScanDesc hp_scan) /* Step 3. Build a HeapScan for new bucket */ next_bkt_scan = heap_beginscan_sampling(next_bkt_rel, curr_bkt_scan->rs_snapshot, curr_bkt_scan->rs_nkeys, curr_bkt_scan->rs_key, - curr_bkt_scan->rs_allow_strat, curr_bkt_scan->rs_allow_sync, - curr_bkt_scan->rs_isRangeScanInRedis); + curr_bkt_scan->rs_flags & SO_ALLOW_STRAT, curr_bkt_scan->rs_flags & SO_ALLOW_SYNC, + curr_bkt_scan->rs_flags & SO_TYPE_RANGESCAN); /* Step 4. Set the parallel scan parameter */ ScanState* sstate = hp_scan->scanState; diff --git a/src/gausskernel/storage/access/heap/heapam.cpp b/src/gausskernel/storage/access/heap/heapam.cpp index 4a8f5e457..c0f81499c 100644 --- a/src/gausskernel/storage/access/heap/heapam.cpp +++ b/src/gausskernel/storage/access/heap/heapam.cpp @@ -44,6 +44,7 @@ #include "access/heapam.h" #include "access/hio.h" #include "access/multixact.h" +#include "access/parallel.h" #include "access/relscan.h" #include "access/sysattr.h" #include "access/tableam.h" @@ -124,7 +125,9 @@ const TableAm g_HeapTblAm = {.table_endscan = (table_endscan_t)heap_endscan, .table_init_parallel_seqscan = (table_init_parallel_seqscan_t)heap_init_parallel_seqscan}; static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool is_bitmapscan, bool is_range_scan_in_redis = false, bool is_samplescan = false); + ParallelHeapScanDesc parallel_scan, uint32 flag); +static void heap_parallelscan_startblock_init(HeapScanDesc scan); +static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointer from, Buffer newbuf, HeapTuple newtup, HeapTuple old_key_tup, bool all_visible_cleared, bool new_all_visible_cleared); @@ -148,7 +151,7 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) bool allow_strat = false; bool allow_sync = false; BlockNumber nblocks; - bool is_range_scan_in_redis = scan->rs_isRangeScanInRedis; + bool is_range_scan_in_redis = scan->rs_flags & SO_TYPE_RANGESCAN; /* * Determine the number of blocks we have to scan. 
@@ -161,7 +164,9 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) * results for a non-MVCC snapshot, the caller must hold some higher-level * lock that ensures the interesting tuple(s) won't change.) */ - if (RelationIsPartitioned(scan->rs_rd)) { + if (scan->rs_parallel != NULL) { + nblocks = scan->rs_parallel->phs_nblocks; + } else if (RelationIsPartitioned(scan->rs_rd)) { /* partition table just set Initial Value, in BitmapHeapTblNext will update */ nblocks = InvalidBlockNumber; } else { @@ -189,8 +194,8 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) * During a rescan, don't make a new strategy object if we don't have to. */ if (scan->rs_nblocks > (uint32)(g_instance.attr.attr_storage.NBuffers / 4)) { - allow_strat = scan->rs_allow_strat; - allow_sync = scan->rs_allow_sync; + allow_strat = scan->rs_flags & SO_ALLOW_STRAT; + allow_sync = scan->rs_flags & SO_ALLOW_SYNC; } else allow_strat = allow_sync = false; @@ -203,7 +208,10 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) scan->rs_strategy = NULL; } - if (is_rescan) { + if (scan->rs_parallel != NULL) { + /* For parallel scan, believe whatever ParallelHeapScanDesc says. */ + scan->rs_syncscan = scan->rs_parallel->phs_syncscan; + } else if (is_rescan) { /* * If rescan, keep the previous startblock setting so that rewinding a * cursor doesn't generate surprising results. Reset the syncscan @@ -246,7 +254,7 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) * underlying bitmap index scans will be counted) or sample scans (we only * update stats for tuple fetches there). */ - if (!scan->rs_bitmapscan && !scan->rs_samplescan) { + if (!(scan->rs_flags & (SO_TYPE_BITMAPSCAN | SO_TYPE_SAMPLESCAN))) { pgstat_count_heap_scan(scan->rs_rd); } } @@ -269,7 +277,7 @@ void heapgetpage(HeapScanDesc scan, BlockNumber page) ItemId lpp; bool all_visible = false; - if (!scan->rs_isRangeScanInRedis) { + if (!(scan->rs_flags & SO_TYPE_RANGESCAN)) { Assert(page < scan->rs_nblocks); } else { Assert(page < scan->rs_nblocks + scan->rs_startblock); @@ -297,7 +305,7 @@ void heapgetpage(HeapScanDesc scan, BlockNumber page) /* We've pinned the buffer, nobody can prune this buffer, check whether snapshot is valid. */ CheckSnapshotIsValidException(scan->rs_snapshot, "heapgetpage"); - if (!scan->rs_pageatatime) { + if (!(scan->rs_flags & SO_ALLOW_PAGEMODE)) { gstrace_exit(GS_TRC_ID_heapgetpage); return; } @@ -311,7 +319,7 @@ void heapgetpage(HeapScanDesc scan, BlockNumber page) * since we use append mode and never look back holes in previous pages * anyway. */ - if (!scan->rs_isRangeScanInRedis) { + if (!(scan->rs_flags & SO_TYPE_RANGESCAN)) { heap_page_prune_opt(scan->rs_rd, buffer); } @@ -417,7 +425,7 @@ bool next_page(HeapScanDesc scan, ScanDirection dir, BlockNumber& page) page += (scan->dop - 1) * PARALLEL_SCAN_GAP; } - if (scan->rs_isRangeScanInRedis) { + if (scan->rs_flags & SO_TYPE_RANGESCAN) { /* Parallel workers start from different point. 
*/ finished = (page >= scan->rs_startblock + scan->rs_nblocks - PARALLEL_SCAN_GAP * u_sess->stream_cxt.smp_id); @@ -432,10 +440,13 @@ bool next_page(HeapScanDesc scan, ScanDirection dir, BlockNumber& page) page = scan->rs_nblocks; } page--; + } else if (scan->rs_parallel != NULL) { + page = heap_parallelscan_nextpage(scan); + finished = (page == InvalidBlockNumber); } else { page++; - if (scan->rs_isRangeScanInRedis) { + if (scan->rs_flags & SO_TYPE_RANGESCAN) { if (page >= scan->rs_startblock + scan->rs_nblocks) { page = 0; } @@ -742,7 +753,19 @@ static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey gstrace_exit(GS_TRC_ID_heapgettup); return; } - page = scan->rs_startblock; /* first page */ + + if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); + /* Other processes might have already finished the scan. */ + if (page == InvalidBlockNumber) { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + } else { + page = scan->rs_startblock; /* first page */ + } heapgetpage(scan, page); line_off = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; @@ -759,6 +782,9 @@ static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey /* page and line_off now reference the physically next tid */ lines_left = lines - line_off + 1; } else if (backward) { + /* backward parallel scan not supported */ + Assert(scan->rs_parallel == NULL); + if (!scan->rs_inited) { /* return null immediately if relation is empty */ if (scan->rs_nblocks == 0) { @@ -952,7 +978,7 @@ static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, { HeapTuple tuple = &(scan->rs_ctup); bool backward = ScanDirectionIsBackward(dir); - bool is_range_scan_in_redis = scan->rs_isRangeScanInRedis; + bool is_range_scan_in_redis = scan->rs_flags & SO_TYPE_RANGESCAN; BlockNumber page; bool finished = false; Page dp; @@ -983,7 +1009,20 @@ static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, gstrace_exit(GS_TRC_ID_heapgettup_pagemode); return; } - page = scan->rs_startblock; /* first page */ + + if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); + + /* Other processes might have already finished the scan. 
*/ + if (page == InvalidBlockNumber) { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return; + } + } else { + page = scan->rs_startblock; /* first page */ + } heapgetpage(scan, page); line_index = 0; scan->rs_inited = true; @@ -998,6 +1037,9 @@ static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, /* page and line_index now reference the next visible tid */ lines_left = lines - line_index; } else if (backward) { + /* backward parallel scan not supported */ + Assert(scan->rs_parallel == NULL); + if (!scan->rs_inited) { /* return null immediately if relation is empty */ if (scan->rs_nblocks == 0) { @@ -1558,19 +1600,32 @@ Relation heap_openrv_extended( HeapScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, bool is_range_scan_in_redis) { /* We don't allow sync buffer read if it is a range scan in redis */ - return heap_beginscan_internal( - relation, snapshot, nkeys, key, !is_range_scan_in_redis, !is_range_scan_in_redis, false, is_range_scan_in_redis); + uint32 flag; + if (is_range_scan_in_redis) { + flag = SO_TYPE_RANGESCAN; + } else { + flag = SO_ALLOW_STRAT | SO_ALLOW_SYNC; + } + return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, flag); } HeapScanDesc heap_beginscan_strat( Relation relation, Snapshot snapshot, int nkeys, ScanKey key, bool allow_strat, bool allow_sync) { - return heap_beginscan_internal(relation, snapshot, nkeys, key, allow_strat, allow_sync, false); + uint32 flag = 0; + if (allow_strat) { + flag |= SO_ALLOW_STRAT; + } + if (allow_sync) { + flag |= SO_ALLOW_SYNC; + } + return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, flag); } HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { - return heap_beginscan_internal(relation, snapshot, nkeys, key, false, false, true); + uint32 flag = SO_TYPE_BITMAPSCAN; + return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, flag); } /* @@ -1590,12 +1645,21 @@ HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, HeapScanDesc heap_beginscan_sampling(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, bool allow_strat, bool allow_sync, bool is_range_scan_in_redis) { - return heap_beginscan_internal( - relation, snapshot, nkeys, key, allow_strat, allow_sync, false, is_range_scan_in_redis, true); + uint32 flag = SO_TYPE_SAMPLESCAN; + if (allow_strat) { + flag |= SO_ALLOW_STRAT; + } + if (allow_sync) { + flag |= SO_ALLOW_SYNC; + } + if (is_range_scan_in_redis) { + flag |= SO_TYPE_RANGESCAN; + } + return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL, flag); } static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool is_bitmapscan, bool is_range_scan_in_redis, bool is_samplescan) + ParallelHeapScanDesc parallel_scan, uint32 flag) { HeapScanDesc scan; @@ -1614,7 +1678,7 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot * bitmapscan to scan tuples using GPI. Therefore, * the value of rs_rd in the scan is used to store partition-fake-relation. 
*/ - Assert(is_bitmapscan); + Assert(flag & SO_TYPE_BITMAPSCAN); } /* @@ -1628,17 +1692,16 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot scan->rs_tupdesc = RelationGetDescr(relation); scan->rs_snapshot = snapshot; scan->rs_nkeys = nkeys; - scan->rs_bitmapscan = is_bitmapscan; - scan->rs_samplescan = is_samplescan; + scan->rs_flags = flag; scan->rs_strategy = NULL; /* set in initscan */ - scan->rs_allow_strat = allow_strat; - scan->rs_allow_sync = allow_sync; - scan->rs_isRangeScanInRedis = is_range_scan_in_redis; + scan->rs_parallel = parallel_scan; /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot */ - scan->rs_pageatatime = IsMVCCSnapshot(snapshot); + if (IsMVCCSnapshot(snapshot)) { + scan->rs_flags |= SO_ALLOW_PAGEMODE; + } /* * For a seqscan in a serializable transaction, acquire a predicate lock @@ -1651,7 +1714,7 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot * covering the predicate. But in that case we still have to lock any * matching heap tuples. */ - if (!is_bitmapscan) { + if (!(flag & SO_TYPE_BITMAPSCAN)) { PredicateLockRelation(relation, snapshot); } @@ -1694,6 +1757,20 @@ void heap_rescan(HeapScanDesc scan, ScanKey key) * reinitialize scan descriptor */ initscan(scan, key, true); + + /* + * reset parallel scan, if present + */ + if (scan->rs_parallel != NULL) { + ParallelHeapScanDesc parallel_scan; + + /* + * Caller is responsible for making sure that all workers have + * finished the scan before calling this. + */ + parallel_scan = scan->rs_parallel; + pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); + } } /* ---------------- @@ -1727,6 +1804,10 @@ void heap_endscan(HeapScanDesc scan) FreeAccessStrategy(scan->rs_strategy); } + if (scan->rs_flags & SO_TEMP_SNAPSHOT) { + UnregisterSnapshot(scan->rs_snapshot); + } + pfree(scan); scan = NULL; } @@ -1788,12 +1869,159 @@ HeapTuple heapGetNextForVerify(HeapScanDesc scan, ScanDirection direction, bool& return &(scan->rs_ctup); } +/* ---------------- + * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc + * + * Sadly, this doesn't reduce to a constant, because the size required + * to serialize the snapshot can vary. + * ---------------- + */ +Size heap_parallelscan_estimate(Snapshot snapshot) +{ + return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data), EstimateSnapshotSpace(snapshot)); +} + +/* ---------------- + * heap_parallelscan_initialize - initialize ParallelHeapScanDesc + * + * Must allow as many bytes of shared memory as returned by + * heap_parallelscan_estimate. Call this just once in the leader + * process; then, individual workers attach via heap_beginscan_parallel. 
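+ * pscan_len must be the value previously returned by heap_parallelscan_estimate
+ * for the same snapshot, since the serialized snapshot is stored in the space
+ * following the descriptor's fixed-size fields.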
+ * ---------------- + */ +void heap_parallelscan_initialize(ParallelHeapScanDesc target, Size pscan_len, Relation relation, Snapshot snapshot) +{ + target->phs_relid = RelationGetRelid(relation); + target->phs_nblocks = RelationGetNumberOfBlocks(relation); + /* compare phs_syncscan initialization to similar logic in initscan */ + target->phs_syncscan = u_sess->attr.attr_storage.synchronize_seqscans && !RelationUsesLocalBuffers(relation) && + target->phs_nblocks > (uint)g_instance.attr.attr_storage.NBuffers / 4; + SpinLockInit(&target->phs_mutex); + target->phs_startblock = InvalidBlockNumber; + target->pscan_len = pscan_len; + pg_atomic_write_u64(&target->phs_nallocated, 0); + SerializeSnapshot(snapshot, target->phs_snapshot_data, + pscan_len - offsetof(ParallelHeapScanDescData, phs_snapshot_data)); +} + +/* ---------------- + * heap_beginscan_parallel - join a parallel scan + * + * Caller must hold a suitable lock on the correct relation. + * ---------------- + */ +HeapScanDesc heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan) +{ + Assert(RelationGetRelid(relation) == parallel_scan->phs_relid); + Snapshot snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data, + parallel_scan->pscan_len - offsetof(ParallelHeapScanDescData, phs_snapshot_data)); + RegisterSnapshot(snapshot); + + uint32 flag = SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_TEMP_SNAPSHOT; + return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan, flag); +} + +/* ---------------- + * heap_parallelscan_startblock_init - find and set the scan's startblock + * + * Determine where the parallel seq scan should start. This function may + * be called many times, once by each parallel worker. We must be careful + * only to set the startblock once. + * ---------------- + */ +static void heap_parallelscan_startblock_init(HeapScanDesc scan) +{ + Assert(scan->rs_parallel); + BlockNumber sync_startpage = InvalidBlockNumber; + ParallelHeapScanDesc parallel_scan = scan->rs_parallel; + +retry: + /* Grab the spinlock. */ + SpinLockAcquire(¶llel_scan->phs_mutex); + + /* + * If the scan's startblock has not yet been initialized, we must do so + * now. If this is not a synchronized scan, we just start at block 0, but + * if it is a synchronized scan, we must get the starting position from + * the synchronized scan machinery. We can't hold the spinlock while + * doing that, though, so release the spinlock, get the information we + * need, and retry. If nobody else has initialized the scan in the + * meantime, we'll fill in the value we fetched on the second time + * through. + */ + if (parallel_scan->phs_startblock == InvalidBlockNumber) { + if (!parallel_scan->phs_syncscan) + parallel_scan->phs_startblock = 0; + else if (sync_startpage != InvalidBlockNumber) + parallel_scan->phs_startblock = sync_startpage; + else { + SpinLockRelease(¶llel_scan->phs_mutex); + sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks); + goto retry; + } + } + SpinLockRelease(¶llel_scan->phs_mutex); +} + +/* ---------------- + * heap_parallelscan_nextpage - get the next page to scan + * + * Get the next page to scan. Even if there are no pages left to scan, + * another backend could have grabbed a page to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the + * first backend gets an InvalidBlockNumber return. 
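+ * Callers simply treat an InvalidBlockNumber return as "no more blocks for this
+ * backend" and finish their own part of the scan.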
+ * ---------------- + */ +static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan) +{ + Assert(scan->rs_parallel); + BlockNumber page; + ParallelHeapScanDesc parallel_scan = scan->rs_parallel; + + /* + * phs_nallocated tracks how many pages have been allocated to workers + * already. When phs_nallocated >= rs_nblocks, all blocks have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_nallocated counter will exceed rs_nblocks, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_nblocks is close + * to 2^32. + * + * The actual page to return is calculated by adding the counter to the + * starting block number, modulo nblocks. + */ + uint64 nallocated = pg_atomic_fetch_add_u64(¶llel_scan->phs_nallocated, 1); + if (nallocated >= scan->rs_nblocks) + page = InvalidBlockNumber; /* all blocks have been allocated */ + else + page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks; + + /* + * Report scan location. Normally, we report the current page number. + * When we reach the end of the scan, though, we report the starting page, + * not the ending page, just so the starting positions for later scans + * doesn't slew backwards. We only report the position at the end of the + * scan once, though: subsequent callers will report nothing. + */ + if (scan->rs_syncscan) { + if (page != InvalidBlockNumber) + ss_report_location(scan->rs_rd, page); + else if (nallocated == scan->rs_nblocks) + ss_report_location(scan->rs_rd, parallel_scan->phs_startblock); + } + + return page; +} + HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction) { /* Note: no locking manipulations needed */ HEAPDEBUG_1; /* heap_getnext( info ) */ - if (scan->rs_pageatatime) { + if (scan->rs_flags & SO_ALLOW_PAGEMODE) { heapgettup_pagemode(scan, direction, scan->rs_nkeys, scan->rs_key); } else { heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key); @@ -3276,7 +3504,7 @@ void heap_markpos(HeapScanDesc scan) /* Note: no locking manipulations needed */ if (scan->rs_ctup.t_data != NULL) { scan->rs_mctid = scan->rs_ctup.t_self; - if (scan->rs_pageatatime) { + if (scan->rs_flags & SO_ALLOW_PAGEMODE) { scan->rs_mindex = scan->rs_cindex; } } else @@ -3292,6 +3520,19 @@ void heap_markpos(HeapScanDesc scan) */ static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, CommandId cid, int options) { + /* + * Parallel operations are required to be strictly read-only in a parallel + * worker. Parallel inserts are not safe even in the leader in the + * general case, because group locking means that heavyweight locks for + * relation extension or GIN page locks will not conflict between members + * of a lock group, but we don't prohibit that case here because there are + * useful special cases that we can safely allow, such as CREATE TABLE AS. + */ + if (IsParallelWorker()) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot insert tuples in a parallel worker"))); + } + if (relation->rd_rel->relhasoids) { #ifdef NOT_USED /* this is redundant with an Assert in HeapTupleSetOid */ @@ -3797,6 +4038,16 @@ HTSU_Result heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid, Tr /* Don't allow any write/lock operator in stream. */ Assert(!StreamThreadAmI()); + /* + * Forbid this during a parallel operation, lest it allocate a combocid. 
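A self-contained illustration of the allocation arithmetic in heap_parallelscan_nextpage above (the constants are invented for the example, and UINT32_MAX stands in for InvalidBlockNumber):

    #include <stdint.h>
    #include <stdio.h>

    /* counter value handed out by fetch-and-add  ->  block number to scan */
    static uint32_t next_block(uint64_t nallocated, uint32_t startblock, uint32_t nblocks)
    {
        if (nallocated >= nblocks)
            return UINT32_MAX;                          /* no blocks left */
        return (uint32_t)((nallocated + startblock) % nblocks);
    }

    int main(void)
    {
        /* 10 blocks, synchronized scan started at block 7: prints 7 8 9 0 1 2 3 4 5 6 done done */
        for (uint64_t i = 0; i < 12; i++) {
            uint32_t blk = next_block(i, 7, 10);
            if (blk == UINT32_MAX)
                printf("done ");
            else
                printf("%u ", (unsigned)blk);
        }
        printf("\n");
        return 0;
    }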
+ * Other workers might need that combocid for visibility checks, and we + * have no provision for broadcasting it to them. + */ + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot delete tuples during a parallel operation"))); + } + block = ItemPointerGetBlockNumber(tid); buffer = ReadBuffer(relation, block); page = BufferGetPage(buffer); @@ -4242,6 +4493,16 @@ HTSU_Result heap_update(Relation relation, Relation parentRelation, ItemPointer /* Don't allow any write/lock operator in stream. */ Assert(!StreamThreadAmI()); + /* + * Forbid this during a parallel operation, lest it allocate a combocid. + * Other workers might need that combocid for visibility checks, and we + * have no provision for broadcasting it to them. + */ + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation"))); + } + /* * Fetch the list of attributes to be checked for HOT update. This is * wasted effort if we fail to update or have to put the new tuple on a @@ -5688,6 +5949,17 @@ void heap_inplace_update(Relation relation, HeapTuple tuple) uint32 newlen; errno_t rc; + /* + * For now, parallel operations are required to be strictly read-only. + * Unlike a regular update, this should never create a combo CID, so it + * might be possible to relax this restriction, but not without more + * thought and testing. It's not clear that it would be useful, anyway. + */ + if (IsInParallelMode()) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation"))); + } + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self))); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = (Page)BufferGetPage(buffer); @@ -5907,7 +6179,7 @@ void heap_restrpos(HeapScanDesc scan) */ scan->rs_inited = true; scan->rs_ctup.t_self = scan->rs_mctid; - if (scan->rs_pageatatime) { + if (scan->rs_flags & SO_ALLOW_PAGEMODE) { scan->rs_cindex = scan->rs_mindex; heapgettup_pagemode(scan, NoMovementScanDirection, @@ -6885,7 +7157,6 @@ static void heap_xlog_newpage(XLogReaderState* record) inline static void heap_xlog_allvisiblecleared(RelFileNode target_node, BlockNumber blkno) { - Relation reln = CreateFakeRelcacheEntry(target_node); Buffer vmbuffer = InvalidBuffer; @@ -7786,7 +8057,7 @@ void heap_init_parallel_seqscan(HeapScanDesc scan, int32 dop, ScanDirection dir) if (ScanDirectionIsBackward(dir)) { paral_blocks = (scan->rs_nblocks - 1) - paral_blocks; - if (scan->rs_isRangeScanInRedis) { + if (scan->rs_flags & SO_TYPE_RANGESCAN) { scan->rs_startblock = paral_blocks; } else { scan->rs_startblock += paral_blocks; @@ -7795,7 +8066,7 @@ void heap_init_parallel_seqscan(HeapScanDesc scan, int32 dop, ScanDirection dir) } /* If not range scan in redistribute, just start from 0. 
*/ - if (scan->rs_isRangeScanInRedis) { + if (scan->rs_flags & SO_TYPE_RANGESCAN) { scan->rs_startblock += paral_blocks; } else { scan->rs_startblock = paral_blocks; diff --git a/src/gausskernel/storage/access/transam/Makefile b/src/gausskernel/storage/access/transam/Makefile index 1c33520d3..349d27079 100755 --- a/src/gausskernel/storage/access/transam/Makefile +++ b/src/gausskernel/storage/access/transam/Makefile @@ -10,12 +10,12 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif ifeq ($(enable_multiple_nodes), yes) -OBJS = clog.o multixact.o rmgr.o slru.o csnlog.o transam.o twophase.o \ +OBJS = clog.o multixact.o parallel.o rmgr.o slru.o csnlog.o transam.o twophase.o \ twophase_rmgr.o varsup.o double_write.o redo_statistic.o multi_redo_api.o multi_redo_settings.o \ xact.o xlog.o xlogfuncs.o \ xloginsert.o xlogreader.o xlogutils.o cbmparsexlog.o cbmfuncs.o else -OBJS = clog.o gtm_single.o multixact.o rmgr.o slru.o csnlog.o transam.o twophase.o \ +OBJS = clog.o gtm_single.o multixact.o parallel.o rmgr.o slru.o csnlog.o transam.o twophase.o \ twophase_rmgr.o varsup.o double_write.o redo_statistic.o multi_redo_api.o multi_redo_settings.o \ xact.o xlog.o xlogfuncs.o \ xloginsert.o xlogreader.o xlogutils.o cbmparsexlog.o cbmfuncs.o diff --git a/src/gausskernel/storage/access/transam/parallel.cpp b/src/gausskernel/storage/access/transam/parallel.cpp new file mode 100644 index 000000000..a6e95823a --- /dev/null +++ b/src/gausskernel/storage/access/transam/parallel.cpp @@ -0,0 +1,1093 @@ +/* ------------------------------------------------------------------------- + * + * parallel.c + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/parallel.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/pg_enum.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "commands/async.h" +#include "executor/execParallel.h" +#include "libpq/libpq.h" +#include "libpq/pqsignal.h" +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "miscadmin.h" +#include "optimizer/planner.h" +#include "pgstat.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/sinval.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" + + +/* + * We don't want to waste a lot of memory on an error queue which, most of + * the time, will process only a handful of small messages. However, it is + * desirable to make it large enough that a typical ErrorResponse can be sent + * without blocking. That way, a worker that errors out can write the whole + * message into the queue and terminate without waiting for the user backend. + */ +#define PARALLEL_ERROR_QUEUE_SIZE 16384 + +/* + * List of internal parallel worker entry points. We need this for + * reasons explained in LookupParallelWorkerFunction(), below. + */ +static const struct { + const char *fn_name; + parallel_worker_main_type fn_addr; +} InternalParallelWorkers[] = { + { + "ParallelQueryMain", ParallelQueryMain + } +}; + +/* Private functions. 
*/ +static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg); +static void WaitForParallelWorkersToExit(ParallelContext *pcxt); +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname); +static void ParallelWorkerShutdown(int code, Datum arg); + +/* + * Establish a new parallel context. This should be done after entering + * parallel mode, and (unless there is an error) the context should be + * destroyed before exiting the current subtransaction. + */ +ParallelContext *CreateParallelContext(const char *library_name, const char *function_name, int nworkers) +{ + /* It is unsafe to create a parallel context if not in parallel mode. */ + Assert(IsInParallelMode()); + + /* Number of workers should be non-negative. */ + Assert(nworkers >= 0); + + /* We might be running in a short-lived memory context. */ + MemoryContext oldcontext = MemoryContextSwitchTo(u_sess->top_transaction_mem_cxt); + + /* Initialize a new ParallelContext. */ + ParallelContext *pcxt = (ParallelContext *)palloc0(sizeof(ParallelContext)); + pcxt->subid = GetCurrentSubTransactionId(); + pcxt->nworkers = nworkers; + pcxt->library_name = pstrdup(library_name); + pcxt->function_name = pstrdup(function_name); + pcxt->error_context_stack = t_thrd.log_cxt.error_context_stack; + dlist_push_head(&t_thrd.bgworker_cxt.pcxt_list, &pcxt->node); + + /* Restore previous memory context. */ + (void)MemoryContextSwitchTo(oldcontext); + + return pcxt; +} + +/* + * Establish the dynamic shared memory segment for a parallel context and + * copy state and other bookkeeping information that will be needed by + * parallel workers into it. + */ +void InitializeParallelDSM(ParallelContext *pcxt) +{ + int i; + Snapshot transaction_snapshot = GetTransactionSnapshot(); + Snapshot active_snapshot = GetActiveSnapshot(); + + /* + * Create DSM and initialize with new table of contents. But if the user + * didn't request any workers, then don't bother creating a dynamic shared + * memory segment; instead, just use backend-private memory. + * + * Also, if we can't create a dynamic shared memory segment because the + * maximum number of segments have already been created, then fall back to + * backend-private memory, and plan not to use any workers. We hope this + * won't happen very often, but it's better to abandon the use of + * parallelism than to fail outright. + */ + pcxt->seg = dsm_create(); + + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + MemoryContext oldcontext = MemoryContextSwitchTo(cxt->memCtx); + + /* Initialize fixed-size state in shared memory. */ + cxt->pwCtx->database_id = u_sess->proc_cxt.MyDatabaseId; + cxt->pwCtx->authenticated_user_id = GetAuthenticatedUserId(); + cxt->pwCtx->outer_user_id = GetCurrentRoleId(); + cxt->pwCtx->is_superuser = u_sess->attr.attr_common.session_auth_is_superuser; + GetUserIdAndSecContext(&cxt->pwCtx->current_user_id, &cxt->pwCtx->sec_context); + GetTempNamespaceState(&cxt->pwCtx->temp_namespace_id, &cxt->pwCtx->temp_toast_namespace_id); + cxt->pwCtx->parallel_master_pgproc = t_thrd.proc; + cxt->pwCtx->parallel_master_pid = t_thrd.proc_cxt.MyProcPid; + cxt->pwCtx->parallel_master_backend_id = t_thrd.proc_cxt.MyBackendId; + cxt->pwCtx->xact_ts = GetCurrentTransactionStartTimestamp(); + cxt->pwCtx->stmt_ts = GetCurrentStatementStartTimestamp(); + SpinLockInit(&cxt->pwCtx->mutex); + cxt->pwCtx->last_xlog_end = 0; + + /* We can skip the rest of this if we're not budgeting for any workers. 
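Putting the pieces of this file together, the leader-side lifecycle the comments above describe looks roughly like this (a sketch only: nworkers is a placeholder, error handling and the executor-specific shared-memory layout are omitted, and the launch/wait/destroy routines appear later in this file):

    EnterParallelMode();                  /* must precede CreateParallelContext */

    ParallelContext *pcxt = CreateParallelContext("postgres", "ParallelQueryMain", nworkers);
    InitializeParallelDSM(pcxt);          /* serialize snapshots, combo CIDs, xact state, ... */
    LaunchParallelWorkers(pcxt);          /* may start fewer workers than requested */

    /* ... leader does its own share of the work and gathers results ... */

    WaitForParallelWorkersToFinish(pcxt); /* absorbs worker errors and XactLastRecEnd feedback */
    DestroyParallelContext(pcxt);

    ExitParallelMode();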
*/ + if (pcxt->nworkers > 0) { + /* Serialize combo CID state. */ + cxt->pwCtx->usedComboCids = u_sess->utils_cxt.usedComboCids; + cxt->pwCtx->comboCids = u_sess->utils_cxt.comboCids; + cxt->pwCtx->sizeComboCids = u_sess->utils_cxt.sizeComboCids; + cxt->pwCtx->comboHash = u_sess->utils_cxt.comboHash; + + /* Serialize transaction snapshot and active snapshot. */ + Size tsnaplen = EstimateSnapshotSpace(transaction_snapshot); + Size asnaplen = EstimateSnapshotSpace(active_snapshot); + + cxt->pwCtx->tsnapspace = (char *)palloc0(tsnaplen); + cxt->pwCtx->tsnapspace_len = tsnaplen; + SerializeSnapshot(transaction_snapshot, cxt->pwCtx->tsnapspace, tsnaplen); + cxt->pwCtx->asnapspace = (char *)palloc0(asnaplen); + cxt->pwCtx->asnapspace_len = asnaplen; + SerializeSnapshot(active_snapshot, cxt->pwCtx->asnapspace, asnaplen); + + Size searchPathLen = strlen(u_sess->attr.attr_common.namespace_search_path); + cxt->pwCtx->namespace_search_path = (char *)palloc(searchPathLen + 1); + int rc = strcpy_s(cxt->pwCtx->namespace_search_path, searchPathLen + 1, + u_sess->attr.attr_common.namespace_search_path); + securec_check_c(rc, "", ""); + + /* Serialize transaction state. */ + cxt->pwCtx->xactIsoLevel = u_sess->utils_cxt.XactIsoLevel; + cxt->pwCtx->xactDeferrable = u_sess->attr.attr_storage.XactDeferrable; + cxt->pwCtx->topTransactionId = GetTopTransactionIdIfAny(); + cxt->pwCtx->currentTransactionId = GetCurrentTransactionIdIfAny(); + cxt->pwCtx->currentCommandId = t_thrd.xact_cxt.currentCommandId; + cxt->pwCtx->nParallelCurrentXids = t_thrd.xact_cxt.nParallelCurrentXids; + cxt->pwCtx->ParallelCurrentXids = t_thrd.xact_cxt.ParallelCurrentXids; + + /* Serialize relmapper state. */ + cxt->pwCtx->active_shared_updates = u_sess->relmap_cxt.active_shared_updates; + cxt->pwCtx->active_local_updates = u_sess->relmap_cxt.active_local_updates; + + /* Allocate space for worker information. */ + pcxt->worker = (ParallelWorkerInfo *)palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); + + /* + * Establish error queues in dynamic shared memory. + * + * These queues should be used only for transmitting ErrorResponse, + * NoticeResponse, and NotifyResponse protocol messages. Tuple data + * should be transmitted via separate (possibly larger?) queues. + */ + cxt->pwCtx->errorQueue = (char *)palloc0(mul_size(pcxt->nworkers, PARALLEL_ERROR_QUEUE_SIZE)); + for (i = 0; i < pcxt->nworkers; ++i) { + shm_mq *mq = + shm_mq_create(cxt->pwCtx->errorQueue + i * PARALLEL_ERROR_QUEUE_SIZE, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, t_thrd.proc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + + /* + * Serialize entrypoint information. It's unsafe to pass function + * pointers across processes, as the function pointer may be different + * in each process in EXEC_BACKEND builds, so we always pass library + * and function name. (We use library name "postgres" for functions + * in the core backend.) + */ + Size lnamelen = strlen(pcxt->library_name); + cxt->pwCtx->library_name = (char *)palloc(lnamelen + 1); + rc = strcpy_s(cxt->pwCtx->library_name, lnamelen + 1, pcxt->library_name); + securec_check_c(rc, "", ""); + + Size fnamelen = strlen(pcxt->function_name); + cxt->pwCtx->function_name = (char *)palloc(fnamelen + 1); + rc = strcpy_s(cxt->pwCtx->function_name, fnamelen + 1, pcxt->function_name); + securec_check_c(rc, "", ""); + } + + /* Restore previous memory context. 
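For reference, the two ends of a worker's error queue are wired up symmetrically; condensing the leader-side loop above and the worker-side attach in ParallelWorkerMain (later in this file):

    /* Leader (InitializeParallelDSM): create queue i and attach as receiver. */
    shm_mq *mq = shm_mq_create(cxt->pwCtx->errorQueue + i * PARALLEL_ERROR_QUEUE_SIZE,
                               PARALLEL_ERROR_QUEUE_SIZE);
    shm_mq_set_receiver(mq, t_thrd.proc);
    pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);

    /* Worker (ParallelWorkerMain): locate the same slot and attach as sender. */
    shm_mq *my_mq = (shm_mq *)(ctx->pwCtx->errorQueue +
                               t_thrd.bgworker_cxt.ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE);
    shm_mq_set_sender(my_mq, t_thrd.proc);
    pq_redirect_to_shm_mq(shm_mq_attach(my_mq, ctx, NULL));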
*/ + (void)MemoryContextSwitchTo(oldcontext); +} + +/* + * Reinitialize the dynamic shared memory segment for a parallel context such + * that we could launch workers for it again. + */ +void ReinitializeParallelDSM(ParallelContext *pcxt) +{ + /* Wait for any old workers to exit. */ + if (pcxt->nworkers_launched > 0) { + WaitForParallelWorkersToFinish(pcxt); + WaitForParallelWorkersToExit(pcxt); + pcxt->nworkers_launched = 0; + if (pcxt->known_attached_workers) { + pfree(pcxt->known_attached_workers); + pcxt->known_attached_workers = NULL; + pcxt->nknown_attached_workers = 0; + } + } + + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + + /* Reset a few bits of fixed parallel state to a clean state. */ + cxt->pwCtx->last_xlog_end = 0; + + /* Recreate error queues (if they exist). */ + if (pcxt->nworkers > 0) { + for (int i = 0; i < pcxt->nworkers; ++i) { + char *start = cxt->pwCtx->errorQueue + i * PARALLEL_ERROR_QUEUE_SIZE; + shm_mq *mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, t_thrd.proc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + } +} + +/* + * Launch parallel workers. + */ +void LaunchParallelWorkers(ParallelContext *pcxt) +{ + BackgroundWorker worker; + int i; + bool any_registrations_failed = false; + + /* Skip this if we have no workers. */ + if (pcxt->nworkers == 0) + return; + + /* If we do have workers, we'd better have a DSM segment. */ + Assert(pcxt->seg != NULL); + + /* We might be running in a short-lived memory context. */ + MemoryContext oldcontext = MemoryContextSwitchTo(u_sess->top_transaction_mem_cxt); + + /* Configure a worker. */ + int rc = memset_s(&worker, sizeof(worker), 0, sizeof(worker)); + securec_check(rc, "", ""); + rc = sprintf_s(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %lu", t_thrd.proc_cxt.MyProcPid); + securec_check_ss(rc, "", ""); + rc = sprintf_s(worker.bgw_type, BGW_MAXLEN, "parallel worker"); + securec_check_ss(rc, "", ""); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION | BGWORKER_CLASS_PARALLEL; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + rc = strcpy_s(worker.bgw_library_name, BGW_MAXLEN, "postgres"); + securec_check(rc, "", ""); + rc = strcpy_s(worker.bgw_function_name, BGW_MAXLEN, "ParallelWorkerMain"); + securec_check(rc, "", ""); + worker.bgw_main_arg = PointerGetDatum(pcxt->seg); + worker.bgw_notify_pid = t_thrd.proc_cxt.MyProcPid; + worker.bgw_parallel_context = pcxt->seg; + + /* + * Start workers. + * + * The caller must be able to tolerate ending up with fewer workers than + * expected, so there is no need to throw an error here if registration + * fails. It wouldn't help much anyway, because registering the worker in + * no way guarantees that it will start up and initialize successfully. + */ + for (i = 0; i < pcxt->nworkers; ++i) { + rc = memcpy_s(worker.bgw_extra, BGW_EXTRALEN, &i, sizeof(int)); + securec_check(rc, "", ""); + if (!any_registrations_failed && RegisterDynamicBackgroundWorker(&worker, &pcxt->worker[i].bgwhandle)) { + shm_mq_set_handle(pcxt->worker[i].error_mqh, pcxt->worker[i].bgwhandle); + pcxt->nworkers_launched++; + } else { + /* + * If we weren't able to register the worker, then we've bumped up + * against the max_worker_processes limit, and future + * registrations will probably fail too, so arrange to skip them. 
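The re-initialization path is used when the same parallel context must be executed again, for example when the executor needs a fresh set of workers for a rescan; a minimal sketch:

    /* Re-run the same parallel operation with new workers. */
    ReinitializeParallelDSM(pcxt);   /* waits for the previous workers, resets error queues */
    LaunchParallelWorkers(pcxt);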
+ * But we still have to execute this code for the remaining slots + * to make sure that we forget about the error queues we budgeted + * for those workers. Otherwise, we'll wait for them to start, + * but they never will. + */ + any_registrations_failed = true; + pcxt->worker[i].bgwhandle = NULL; + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + + /* + * Now that nworkers_launched has taken its final value, we can initialize + * known_attached_workers. + */ + if (pcxt->nworkers_launched > 0) { + pcxt->known_attached_workers = (bool *)palloc0(sizeof(bool) * pcxt->nworkers_launched); + pcxt->nknown_attached_workers = 0; + } + + /* Restore previous memory context. */ + (void)MemoryContextSwitchTo(oldcontext); +} + +/* + * Wait for all workers to attach to their error queues, and throw an error if + * any worker fails to do this. + * + * Callers can assume that if this function returns successfully, then the + * number of workers given by pcxt->nworkers_launched have initialized and + * attached to their error queues. Whether or not these workers are guaranteed + * to still be running depends on what code the caller asked them to run; + * this function does not guarantee that they have not exited. However, it + * does guarantee that any workers which exited must have done so cleanly and + * after successfully performing the work with which they were tasked. + * + * If this function is not called, then some of the workers that were launched + * may not have been started due to a fork() failure, or may have exited during + * early startup prior to attaching to the error queue, so nworkers_launched + * cannot be viewed as completely reliable. It will never be less than the + * number of workers which actually started, but it might be more. Any workers + * that failed to start will still be discovered by + * WaitForParallelWorkersToFinish and an error will be thrown at that time, + * provided that function is eventually reached. + * + * In general, the leader process should do as much work as possible before + * calling this function. fork() failures and other early-startup failures + * are very uncommon, and having the leader sit idle when it could be doing + * useful work is undesirable. However, if the leader needs to wait for + * all of its workers or for a specific worker, it may want to call this + * function before doing so. If not, it must make some other provision for + * the failure-to-start case, lest it wait forever. On the other hand, a + * leader which never waits for a worker that might not be started yet, or + * at least never does so prior to WaitForParallelWorkersToFinish(), need not + * call this function at all. + */ +void WaitForParallelWorkersToAttach(ParallelContext *pcxt) +{ + int i; + + /* Skip this if we have no launched workers. */ + if (pcxt->nworkers_launched == 0) + return; + + for (;;) { + /* + * This will process any parallel messages that are pending and it may + * also throw an error propagated from a worker. + */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) { + shm_mq *mq = NULL; + int rc; + ThreadId pid; + + if (pcxt->known_attached_workers[i]) + continue; + + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. 
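Because registration can fail, callers are expected to cope with getting fewer workers than requested; a sketch of the pattern the comments above describe (run_serially is a hypothetical fallback, not part of the patch):

    LaunchParallelWorkers(pcxt);
    if (pcxt->nworkers_launched == 0) {
        /* No workers could be started; do all of the work in the leader. */
        run_serially();                        /* hypothetical */
    } else {
        /* Optional: block until every launched worker has attached to its error queue. */
        WaitForParallelWorkersToAttach(pcxt);
    }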
+ */ + if (pcxt->worker[i].error_mqh == NULL) { + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + continue; + } + + BgwHandleStatus status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid); + if (status == BGWH_STARTED) { + /* Has the worker attached to the error queue? */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) != NULL) { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + } else if (status == BGWH_STOPPED) { + /* + * If the worker stopped without attaching to the error queue, + * throw an error. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } else { + /* + * Worker not yet started, so we must wait. The postmaster + * will notify us if the worker's state changes. Our latch + * might also get set for some other reason, but if so we'll + * just end up waiting for the same worker again. + */ + rc = WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET, -1); + if (rc & WL_LATCH_SET) { + ResetLatch(&t_thrd.proc->procLatch); + } + } + } + + /* If all workers are known to have started, we're done. */ + if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched) { + Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched); + break; + } + } +} + +/* + * Wait for all workers to finish computing. + * + * Even if the parallel operation seems to have completed successfully, it's + * important to call this function afterwards. We must not miss any errors + * the workers may have thrown during the parallel operation, or any that they + * may yet throw while shutting down. + * + * Also, we want to update our notion of XactLastRecEnd based on worker + * feedback. + */ +void WaitForParallelWorkersToFinish(ParallelContext *pcxt) +{ + for (;;) { + bool anyone_alive = false; + int nfinished = 0; + int i; + + /* + * This will process any parallel messages that are pending, which may + * change the outcome of the loop that follows. It may also throw an + * error propagated from a worker. + */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) { + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. If we have received a message through error_mqh from + * the worker, we know it started up cleanly, and therefore we're + * certain to be notified when it exits. + */ + if (pcxt->worker[i].error_mqh == NULL) + ++nfinished; + else if (pcxt->known_attached_workers[i]) { + anyone_alive = true; + break; + } + } + + if (!anyone_alive) { + /* If all workers are known to have finished, we're done. */ + if (nfinished >= pcxt->nworkers_launched) { + Assert(nfinished == pcxt->nworkers_launched); + break; + } + + /* + * We didn't detect any living workers, but not all workers are + * known to have exited cleanly. Either not all workers have + * launched yet, or maybe some of them failed to start or + * terminated abnormally. + */ + for (i = 0; i < pcxt->nworkers_launched; ++i) { + ThreadId pid; + + /* + * If the worker is BGWH_NOT_YET_STARTED or BGWH_STARTED, we + * should just keep waiting. If it is BGWH_STOPPED, then + * further investigation is needed. 
+ */ + if (pcxt->worker[i].error_mqh == NULL || pcxt->worker[i].bgwhandle == NULL || + GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid) != BGWH_STOPPED) + continue; + + /* + * Check whether the worker ended up stopped without ever + * attaching to the error queue. If so, the postmaster was + * unable to fork the worker or it exited without initializing + * properly. We must throw an error, since the caller may + * have been expecting the worker to do some work before + * exiting. + */ + shm_mq *mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + /* + * The worker is stopped, but is attached to the error queue. + * Unless there's a bug somewhere, this will only happen when + * the worker writes messages and terminates after the + * CHECK_FOR_INTERRUPTS() near the top of this function and + * before the call to GetBackgroundWorkerPid(). In that case, + * or latch should have been set as well and the right things + * will happen on the next pass through the loop. + */ + } + } + + (void)WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET, -1); + ResetLatch(&t_thrd.proc->procLatch); + } + + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + if (cxt->pwCtx->last_xlog_end > t_thrd.xlog_cxt.XactLastRecEnd) + t_thrd.xlog_cxt.XactLastRecEnd = cxt->pwCtx->last_xlog_end; +} + +/* + * Wait for all workers to exit. + * + * This function ensures that workers have been completely shutdown. The + * difference between WaitForParallelWorkersToFinish and this function is + * that former just ensures that last message sent by worker backend is + * received by master backend whereas this ensures the complete shutdown. + */ +static void WaitForParallelWorkersToExit(ParallelContext *pcxt) +{ + /* Wait until the workers actually die. */ + for (int i = 0; i < pcxt->nworkers_launched; ++i) { + if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL) { + continue; + } + + BgwHandleStatus status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle); + /* + * If the postmaster kicked the bucket, we have no chance of cleaning + * up safely -- we won't be able to tell when our workers are actually + * dead. This doesn't necessitate a PANIC since they will all abort + * eventually, but we can't safely continue this session. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("postmaster exited during a parallel transaction"))); + + /* Release memory. */ + pfree(pcxt->worker[i].bgwhandle); + pcxt->worker[i].bgwhandle = NULL; + } +} + +/* + * Destroy a parallel context. + * + * If expecting a clean exit, you should use WaitForParallelWorkersToFinish() + * first, before calling this function. When this function is invoked, any + * remaining workers are forcibly killed; the dynamic shared memory segment + * is unmapped; and we then wait (uninterruptibly) for the workers to exit. + */ +void DestroyParallelContext(ParallelContext *pcxt) +{ + int i; + + /* + * Be careful about order of operations here! We remove the parallel + * context from the list before we do anything else; otherwise, if an + * error occurs during a subsequent step, we might try to nuke it again + * from AtEOXact_Parallel or AtEOSubXact_Parallel. + */ + dlist_delete(&pcxt->node); + + /* Kill each worker in turn, and forget their error queues. 
*/ + if (pcxt->worker != NULL) { + for (i = 0; i < pcxt->nworkers_launched; ++i) { + if (pcxt->worker[i].error_mqh != NULL) { + TerminateBackgroundWorker(pcxt->worker[i].bgwhandle); + + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + } + + /* + * If this parallel context is actually in backend-private memory rather + * than shared memory, free that memory instead. + */ + if (pcxt->private_memory != NULL) { + pfree(pcxt->private_memory); + pcxt->private_memory = NULL; + } + + /* + * We can't finish transaction commit or abort until all of the workers + * have exited. This means, in particular, that we can't respond to + * interrupts at this stage. + */ + HOLD_INTERRUPTS(); + WaitForParallelWorkersToExit(pcxt); + RESUME_INTERRUPTS(); + + /* Free the worker array itself. */ + if (pcxt->worker != NULL) { + pfree(pcxt->worker); + pcxt->worker = NULL; + } + + /* + * If we have allocated a shared memory segment, detach it. This will + * implicitly detach the error queues, and any other shared memory queues, + * stored there. + */ + if (pcxt->seg != NULL) { + dsm_detach(&(pcxt->seg)); + pcxt->seg = NULL; + } + + /* Free memory. */ + pfree(pcxt->library_name); + pfree(pcxt->function_name); + pfree(pcxt); +} + +/* + * Are there any parallel contexts currently active? + */ +bool ParallelContextActive(void) +{ + return !dlist_is_empty(&t_thrd.bgworker_cxt.pcxt_list); +} + +/* + * Handle receipt of an interrupt indicating a parallel worker message. + * + * Note: this is called within a signal handler! All we can do is set a flag + * that will cause the next CHECK_FOR_INTERRUPTS() to invoke HandleParallelMessages(). + */ +void HandleParallelMessageInterrupt(void) +{ + InterruptPending = true; + t_thrd.bgworker_cxt.ParallelMessagePending = true; + SetLatch(&t_thrd.proc->procLatch); +} + +/* + * Handle any queued protocol messages received from parallel workers. + */ +void HandleParallelMessages(void) +{ + dlist_iter iter; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (t_thrd.bgworker_cxt.hpm_context == NULL) /* first time through? */ + t_thrd.bgworker_cxt.hpm_context = + AllocSetContextCreate(TopMemoryContext, "HandleParallelMessages", ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(t_thrd.bgworker_cxt.hpm_context); + + MemoryContext oldcontext = MemoryContextSwitchTo(t_thrd.bgworker_cxt.hpm_context); + + /* OK to process messages. Reset the flag saying there are more to do. 
*/ + t_thrd.bgworker_cxt.ParallelMessagePending = false; + + dlist_foreach(iter, &t_thrd.bgworker_cxt.pcxt_list) + { + ParallelContext *pcxt = dlist_container(ParallelContext, node, iter.cur); + if (pcxt->worker == NULL) + continue; + + for (int i = 0; i < pcxt->nworkers_launched; ++i) { + /* + * Read as many messages as we can from each worker, but stop when + * either (1) the worker's error queue goes away, which can happen + * if we receive a Terminate message from the worker; or (2) no + * more messages can be read from the worker without blocking. + */ + while (pcxt->worker[i].error_mqh != NULL) { + Size nbytes; + void *data = NULL; + + shm_mq_result res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes, &data, true); + if (res == SHM_MQ_WOULD_BLOCK) { + break; + } else if (res == SHM_MQ_SUCCESS) { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, (const char *)data, nbytes); + HandleParallelMessage(pcxt, i, &msg); + pfree(msg.data); + } else { + ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to parallel worker"))); + } + } + } + } + + (void)MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(t_thrd.bgworker_cxt.hpm_context); + + RESUME_INTERRUPTS(); +} + +/* + * Handle a single protocol message received from a single parallel worker. + */ +static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg) +{ + if (pcxt->known_attached_workers != NULL && !pcxt->known_attached_workers[i]) { + pcxt->known_attached_workers[i] = true; + pcxt->nknown_attached_workers++; + } + + char msgtype = (char)pq_getmsgbyte(msg); + + switch (msgtype) { + case 'K': /* BackendKeyData */ + { + ThreadId pid = pq_getmsgint64(msg); + + (void)pq_getmsgint64(msg); /* discard cancel key */ + pq_getmsgend(msg); + pcxt->worker[i].pid = pid; + break; + } + + case 'E': /* ErrorResponse */ + case 'N': /* NoticeResponse */ + { + ErrorData edata; + + /* Parse ErrorResponse or NoticeResponse. */ + pq_parse_errornotice(msg, &edata); + + /* Death of a worker isn't enough justification for suicide. */ + edata.elevel = Min(edata.elevel, ERROR); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel worker. Otherwise, it + * can sometimes be confusing to understand what actually + * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode + * because it causes test-result instability depending on + * whether a parallel worker is actually used or not.) + */ + if (u_sess->attr.attr_sql.force_parallel_mode != FORCE_PARALLEL_REGRESS) { + if (edata.context) { + /* 1 for '\0', 1 for '\n' */ + Size len = strlen(edata.context) + strlen("parallel worker") + 2; + edata.context = (char *)palloc(len); + int rc = sprintf_s(edata.context, len, "%s\n%s", edata.context, "parallel worker"); + securec_check_ss(rc, "", ""); + } else { + edata.context = pstrdup(_("parallel worker")); + } + } + + /* + * Context beyond that should use the error context callbacks + * that were in effect when the ParallelContext was created, + * not the current ones. + */ + ErrorContextCallback *save_error_context_stack = t_thrd.log_cxt.error_context_stack; + t_thrd.log_cxt.error_context_stack = pcxt->error_context_stack; + + /* Rethrow error or print notice. */ + ThrowErrorData(&edata); + + /* Not an error, so restore previous context stack. 
*/ + t_thrd.log_cxt.error_context_stack = save_error_context_stack; + + break; + } + + case 'A': /* NotifyResponse */ + { + /* Propagate NotifyResponse. */ + uint32 pid = pq_getmsgint(msg, 4); + const char *channel = pq_getmsgrawstring(msg); + const char *payload = pq_getmsgrawstring(msg); + pq_endmessage(msg); + + NotifyMyFrontEnd(channel, payload, pid); + + break; + } + + case 'X': /* Terminate, indicating clean exit */ + { + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + break; + } + + default: { + ereport(ERROR, + (errmsg("unrecognized message type received from parallel worker: %c (message length %d bytes)", + msgtype, msg->len))); + } + } +} + +/* + * End-of-subtransaction cleanup for parallel contexts. + * + * Currently, it's forbidden to enter or leave a subtransaction while + * parallel mode is in effect, so we could just blow away everything. But + * we may want to relax that restriction in the future, so this code + * contemplates that there may be multiple subtransaction IDs in pcxt_list. + */ +void AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId) +{ + while (!dlist_is_empty(&t_thrd.bgworker_cxt.pcxt_list)) { + ParallelContext *pcxt = dlist_head_element(ParallelContext, node, &t_thrd.bgworker_cxt.pcxt_list); + if (pcxt->subid != mySubId) + break; + if (isCommit) + ereport(WARNING, (errmsg("leaked parallel context"))); + DestroyParallelContext(pcxt); + } +} + +/* + * End-of-transaction cleanup for parallel contexts. + */ +void AtEOXact_Parallel(bool isCommit) +{ + while (!dlist_is_empty(&t_thrd.bgworker_cxt.pcxt_list)) { + ParallelContext *pcxt = dlist_head_element(ParallelContext, node, &t_thrd.bgworker_cxt.pcxt_list); + if (isCommit) + ereport(WARNING, (errmsg("leaked parallel context"))); + DestroyParallelContext(pcxt); + } +} + +/* + * Main entrypoint for parallel workers. + */ +void ParallelWorkerMain(Datum main_arg) +{ + StringInfoData msgbuf; + + knl_u_parallel_context *ctx = (knl_u_parallel_context *)DatumGetPointer(main_arg); + + /* Set flag to indicate that we're initializing a parallel worker. */ + t_thrd.bgworker_cxt.InitializingParallelWorker = true; + + /* Establish signal handlers. */ + gspqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* Determine and set our parallel worker number. */ + Assert(t_thrd.bgworker_cxt.ParallelWorkerNumber == -1); + int rc = memcpy_s(&t_thrd.bgworker_cxt.ParallelWorkerNumber, sizeof(int), + t_thrd.bgworker_cxt.my_bgworker_entry->bgw_extra, sizeof(int)); + securec_check(rc, "", ""); + + /* Set up a memory context to work in, just for cleanliness. */ + CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext, "Parallel worker", ALLOCSET_DEFAULT_SIZES); + + /* Arrange to signal the leader if we exit. */ + on_shmem_exit(ParallelWorkerShutdown, (Datum)0); + + /* + * Now we can find and attach to the error queue provided for us. That's + * good, because until we do that, any errors that happen here will not be + * reported back to the process that requested that this worker be + * launched. 
+ */ + char *error_queue_space = ctx->pwCtx->errorQueue; + shm_mq *mq = (shm_mq *)(error_queue_space + t_thrd.bgworker_cxt.ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_sender(mq, t_thrd.proc); + shm_mq_handle *mqh = shm_mq_attach(mq, ctx, NULL); + pq_redirect_to_shm_mq(mqh); + pq_set_parallel_master(ctx->pwCtx->parallel_master_pid, ctx->pwCtx->parallel_master_backend_id); + + /* + * Send a BackendKeyData message to the process that initiated parallelism + * so that it has access to our PID before it receives any other messages + * from us. Our cancel key is sent, too, since that's the way the + * protocol message is defined, but it won't actually be used for anything + * in this case. + */ + pq_beginmessage(&msgbuf, 'K'); + pq_sendint64(&msgbuf, t_thrd.proc_cxt.MyProcPid); + pq_sendint64(&msgbuf, t_thrd.proc_cxt.MyCancelKey); + pq_endmessage(&msgbuf); + + /* + * Hooray! Primary initialization is complete. Now, we need to set up our + * backend-local state to match the original backend. + */ + /* + * Restore transaction and statement start-time timestamps. This must + * happen before anything that would start a transaction, else asserts in + * xact.c will fire. + */ + SetParallelStartTimestamps(ctx->pwCtx->xact_ts, ctx->pwCtx->stmt_ts); + + /* + * Identify the entry point to be called. In theory this could result in + * loading an additional library, though most likely the entry point is in + * the core backend or in a library we just loaded. + */ + parallel_worker_main_type entrypt = + LookupParallelWorkerFunction(ctx->pwCtx->library_name, ctx->pwCtx->function_name); + + /* Restore database connection. */ + BackgroundWorkerInitializeConnectionByOid(ctx->pwCtx->database_id, ctx->pwCtx->authenticated_user_id, 0); + + /* + * Set the client encoding to the database encoding, since that is what + * the leader will expect. + */ + (void)SetClientEncoding(GetDatabaseEncoding()); + + /* Crank up a transaction state appropriate to a parallel worker. */ + StartParallelWorkerTransaction(ctx->pwCtx); + + /* Restore combo CID state. */ + u_sess->utils_cxt.usedComboCids = ctx->pwCtx->usedComboCids; + u_sess->utils_cxt.comboCids = ctx->pwCtx->comboCids; + u_sess->utils_cxt.sizeComboCids = ctx->pwCtx->sizeComboCids; + u_sess->utils_cxt.comboHash = ctx->pwCtx->comboHash; + + /* Restore namespace search path */ + u_sess->attr.attr_common.namespace_search_path = ctx->pwCtx->namespace_search_path; + + /* Restore transaction snapshot. */ + RestoreTransactionSnapshot(RestoreSnapshot(ctx->pwCtx->tsnapspace, ctx->pwCtx->tsnapspace_len), + ctx->pwCtx->parallel_master_pgproc); + /* Restore active snapshot. */ + PushActiveSnapshot(RestoreSnapshot(ctx->pwCtx->asnapspace, ctx->pwCtx->asnapspace_len)); + + /* + * We've changed which tuples we can see, and must therefore invalidate + * system caches. + */ + InvalidateSystemCaches(); + + /* + * Restore current role id. Skip verifying whether session user is + * allowed to become this role and blindly restore the leader's state for + * current role. + */ + SetCurrentRoleId(ctx->pwCtx->outer_user_id, ctx->pwCtx->is_superuser); + + /* Restore user ID and security context. */ + SetUserIdAndSecContext(ctx->pwCtx->current_user_id, ctx->pwCtx->sec_context); + + /* Restore temp-namespace state to ensure search path matches leader's. */ + SetTempNamespaceState(ctx->pwCtx->temp_namespace_id, ctx->pwCtx->temp_toast_namespace_id); + + /* Restore relmapper state. 
*/ + u_sess->relmap_cxt.active_shared_updates = ctx->pwCtx->active_shared_updates; + u_sess->relmap_cxt.active_local_updates = ctx->pwCtx->active_local_updates; + + /* + * We've initialized all of our state now; nothing should change + * hereafter. + */ + t_thrd.bgworker_cxt.InitializingParallelWorker = false; + EnterParallelMode(); + + /* + * Time to do the real work: invoke the caller-supplied code. + */ + entrypt(ctx); + + /* Must exit parallel mode to pop active snapshot. */ + ExitParallelMode(); + + /* Must pop active snapshot so snapmgr.c doesn't complain. */ + PopActiveSnapshot(); + + /* Shut down the parallel-worker transaction. */ + EndParallelWorkerTransaction(); + + /* Report success. */ + pq_putmessage('X', NULL, 0); +} + +/* + * Update shared memory with the ending location of the last WAL record we + * wrote, if it's greater than the value already stored there. + */ +void ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end) +{ + knl_u_parallel_context *ctx = (knl_u_parallel_context *)t_thrd.bgworker_cxt.my_bgworker_entry->bgw_parallel_context; + Assert(ctx->pwCtx != NULL); + SpinLockAcquire(&ctx->pwCtx->mutex); + if (ctx->pwCtx->last_xlog_end < last_xlog_end) { + ctx->pwCtx->last_xlog_end = last_xlog_end; + } + SpinLockRelease(&ctx->pwCtx->mutex); +} + +/* + * Make sure the leader tries to read from our error queue one more time. + * This guards against the case where we exit uncleanly without sending an + * ErrorResponse to the leader, for example because some code calls proc_exit + * directly. + */ +static void ParallelWorkerShutdown(int code, Datum arg) +{ + (void)SendProcSignal(t_thrd.msqueue_cxt.pq_mq_parallel_master_pid, PROCSIG_PARALLEL_MESSAGE, + t_thrd.msqueue_cxt.pq_mq_parallel_master_backend_id); +} + +/* + * Look up (and possibly load) a parallel worker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalParallelWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalParallelWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalParallelWorkers array. 
+ */ + if (strcmp(libraryname, "postgres") == 0) { + for (size_t i = 0; i < lengthof(InternalParallelWorkers); i++) { + if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0) + return InternalParallelWorkers[i].fn_addr; + } + + ereport(ERROR, (errmsg("internal function \"%s\" not found", funcname))); + } + + ereport(ERROR, (errmsg("library\"%s\" function \"%s\" not supported", libraryname, funcname))); + return NULL; +} + diff --git a/src/gausskernel/storage/access/transam/varsup.cpp b/src/gausskernel/storage/access/transam/varsup.cpp index b96a1a8ea..f33fec74f 100755 --- a/src/gausskernel/storage/access/transam/varsup.cpp +++ b/src/gausskernel/storage/access/transam/varsup.cpp @@ -117,6 +117,15 @@ TransactionId GetNewTransactionId(bool isSubXact) bool incrementXid = true; #endif + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs after that point. + */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot assign TransactionIds during a parallel operation"))); + } + /* * During bootstrap initialization, we return the special bootstrap * transaction id. diff --git a/src/gausskernel/storage/access/transam/xact.cpp b/src/gausskernel/storage/access/transam/xact.cpp index 60900c8e4..007d299da 100644 --- a/src/gausskernel/storage/access/transam/xact.cpp +++ b/src/gausskernel/storage/access/transam/xact.cpp @@ -36,6 +36,7 @@ #include "access/cstore_am.h" #include "access/cstore_rewrite.h" #include "access/multixact.h" +#include "access/parallel.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -141,6 +142,7 @@ typedef enum TBlockState { /* transaction block states */ TBLOCK_BEGIN, /* starting transaction block */ TBLOCK_INPROGRESS, /* live transaction */ + TBLOCK_PARALLEL_INPROGRESS, /* live transaction inside parallel worker */ TBLOCK_END, /* COMMIT received */ TBLOCK_ABORT, /* failed xact, awaiting ROLLBACK */ TBLOCK_ABORT_END, /* failed xact, ROLLBACK received */ @@ -189,6 +191,7 @@ struct TransactionStateData { bool prevXactReadOnly; /* entry-time xact r/o state */ bool startedInRecovery; /* did we start in recovery? */ bool didLogXid; /* has xid been included in WAL record? */ + int parallelModeLevel; /* Enter/ExitParallelMode counter */ struct TransactionStateData* parent; /* back link to parent */ /* which storage engine tables are used in current transaction for D/I/U/S statements */ @@ -225,6 +228,7 @@ static THR_LOCAL TransactionStateData TopTransactionStateData = { false, /* entry-time xact r/o state */ false, /* startedInRecovery */ false, /* didLogXid */ + 0, /* parallelModeLevel */ NULL, /* link to parent state block */ SE_TYPE_UNSPECIFIED /* storage engine used in transaction */ }; @@ -717,6 +721,15 @@ static void AssignTransactionId(TransactionState s) Assert(!TransactionIdIsValid(s->transactionId)); Assert(s->state == TRANS_INPROGRESS); + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs at this point. + */ + if (IsInParallelMode() || IsParallelWorker()) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot assign XIDs during a parallel operation"))); + } + /* * Ensure parent(s) have XIDs, so that a child always has an XID later * than its parent. 
Musn't recurse here, or we might get a stack overflow @@ -918,8 +931,16 @@ CommandId GetCurrentCommandId(bool used) #endif /* this is global to a transaction, not subtransaction-local */ - if (used) + if (used) { + /* + * Forbid setting currentCommandIdUsed in a parallel worker, because + * we have no provision for communicating this back to the master. We + * could relax this restriction when currentCommandIdUsed was already + * true at the start of the parallel operation. + */ + Assert(!IsParallelWorker()); t_thrd.xact_cxt.currentCommandIdUsed = true; + } return t_thrd.xact_cxt.currentCommandId; } @@ -1007,12 +1028,18 @@ TimestampTz GetCurrentStmtsysTimestamp(void) /* * SetCurrentStatementStartTimestamp * - * The time on the DN is obtained from the CN. If the CN does not deliver the time, - * the time of the current DN is used. + * The time on the DN is obtained from the CN. If the CN does not deliver the time, + * the time of the current DN is used. + * In a parallel worker, this should already have been provided by a call + * to SetParallelStartTimestamps(). */ void SetCurrentStatementStartTimestamp(void) { - t_thrd.xact_cxt.stmtStartTimestamp = GetCurrentTimestamp(); + if (!IsParallelWorker()) { + t_thrd.xact_cxt.stmtStartTimestamp = GetCurrentTimestamp(); + } else { + Assert(t_thrd.xact_cxt.stmtStartTimestamp != 0); + } } void SetStatementStartTimestamp(TimestampTz timestamp) @@ -1169,7 +1196,42 @@ bool TransactionStartedDuringRecovery(void) } /* - * CommandCounterIncrement + * EnterParallelMode + */ +void EnterParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + Assert(s->parallelModeLevel >= 0); + ++s->parallelModeLevel; +} + +/* + * ExitParallelMode + */ +void ExitParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + Assert(s->parallelModeLevel > 0); + Assert(s->parallelModeLevel > 1 || !ParallelContextActive()); + --s->parallelModeLevel; +} + +/* + * IsInParallelMode + * + * Are we in a parallel operation, as either the master or a worker? Check + * this to prohibit operations that change backend-local state expected to + * match across all workers. Mere caches usually don't require such a + * restriction. State modified in a strict push/pop fashion, such as the + * active snapshot stack, is often fine. + */ +bool IsInParallelMode(void) +{ + return CurrentTransactionState->parallelModeLevel != 0; +} + +/* + * CommandCounterIncrement */ void CommandCounterIncrement(void) { @@ -1180,6 +1242,16 @@ void CommandCounterIncrement(void) * overflow, and keeps no-op CommandCounterIncrement operations cheap. */ if (t_thrd.xact_cxt.currentCommandIdUsed) { + /* + * Workers synchronize transaction state at the beginning of each + * parallel operation, so we can't account for new commands after that + * point. + */ + if (IsInParallelMode() || IsParallelWorker()) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot start commands during a parallel operation"))); + } + t_thrd.xact_cxt.currentCommandId += 1; if (t_thrd.xact_cxt.currentCommandId == InvalidCommandId) { /* check for overflow */ t_thrd.xact_cxt.currentCommandId -= 1; @@ -1790,7 +1862,6 @@ static void AtSubCommit_childXids(void) * here or in the calculation of new_nChildXids.) 
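Since parallelModeLevel is a counter rather than a flag, parallel-mode sections nest; a sketch of the contract the three functions above enforce:

    Assert(!IsInParallelMode());
    EnterParallelMode();               /* level 1 */
    EnterParallelMode();               /* level 2: nesting is allowed */
    Assert(IsInParallelMode());
    ExitParallelMode();                /* back to level 1, still in parallel mode */
    Assert(IsInParallelMode());
    ExitParallelMode();                /* level 0; asserts no ParallelContext is still active */
    Assert(!IsInParallelMode());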
*/ new_maxChildXids = Min(new_nChildXids * 2, (int)(MaxAllocSize / sizeof(TransactionId))); - if (new_maxChildXids < new_nChildXids) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), @@ -2272,12 +2343,22 @@ static void StartTransaction(bool begin_on_gtm) Assert(s->prevSecContext == 0); /* - * set transaction_timestamp() (a/k/a now()). We want this to be the same - * as the first command's statement_timestamp(), so don't do a fresh - * GetCurrentTimestamp() call (which'd be expensive anyway). Also, mark - * xactStopTimestamp as unset. + * set transaction_timestamp() (a/k/a now()). Normally, we want this to + * be the same as the first command's statement_timestamp(), so don't do a + * fresh GetCurrentTimestamp() call (which'd be expensive anyway). But + * for transactions started inside procedures (i.e., nonatomic SPI + * contexts), we do need to advance the timestamp. Also, in a parallel + * worker, the timestamp should already have been provided by a call to + * SetParallelStartTimestamps. */ - t_thrd.xact_cxt.xactStartTimestamp = t_thrd.xact_cxt.stmtStartTimestamp; + if (!IsParallelWorker()) { + if (!SPI_inside_nonatomic_context()) + t_thrd.xact_cxt.xactStartTimestamp = t_thrd.xact_cxt.stmtStartTimestamp; + else + t_thrd.xact_cxt.xactStartTimestamp = GetCurrentTimestamp(); + } else { + Assert(t_thrd.xact_cxt.xactStartTimestamp != 0); + } t_thrd.xact_cxt.xactStopTimestamp = 0; s->txnKey.txnHandle = InvalidTransactionHandle; @@ -2372,6 +2453,12 @@ static void CommitTransaction(bool stpCommit) TransactionId latestXid; bool barrierLockHeld = false; bool use_old_version_gid = GTM_MODE || (t_thrd.proc->workingVersionNum <= GTM_OLD_VERSION_NUM); + bool is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + + /* Enforce parallel mode restrictions during parallel worker commit. */ + if (is_parallel_worker) { + EnterParallelMode(); + } ShowTransactionState("CommitTransaction"); @@ -2537,6 +2624,10 @@ static void CommitTransaction(bool stpCommit) * of this stuff could still throw an error, which would switch us into * the transaction-abort path. */ + /* If we might have parallel workers, clean them up now. */ + if (IsInParallelMode()) { + AtEOXact_Parallel(true); + } /* Shut down the deferred-trigger manager */ AfterTriggerEndXact(true); @@ -2662,6 +2753,7 @@ static void CommitTransaction(bool stpCommit) */ TransState oldstate = s->state; s->state = TRANS_COMMIT; + s->parallelModeLevel = 0; /* Wait data replicate */ if (!IsInitdb && !g_instance.attr.attr_storage.enable_mix_replication) { @@ -2683,7 +2775,21 @@ static void CommitTransaction(bool stpCommit) /* * Here is where we really truly local commit. */ - latestXid = RecordTransactionCommit(); + if (!is_parallel_worker) { + latestXid = RecordTransactionCommit(); + } else { + /* + * We must not mark our XID committed; the parallel master is + * responsible for that. + */ + latestXid = InvalidTransactionId; + + /* + * Make sure the master will know about any WAL we wrote before it + * commits. 
+ */ + ParallelWorkerReportLastRecEnd(t_thrd.xlog_cxt.XactLastRecEnd); + } if (TwoPhaseCommit) StmtRetrySetTransactionCommitFlag(true); @@ -2859,7 +2965,7 @@ static void CommitTransaction(bool stpCommit) AtEOXact_SPI(true, false, stpCommit); AtEOXact_on_commit_actions(true); if (!stpCommit){ - AtEOXact_Namespace(true); + AtEOXact_Namespace(true, is_parallel_worker); } AtEOXact_SMgr(); AtEOXact_Files(); @@ -2893,6 +2999,9 @@ static void CommitTransaction(bool stpCommit) s->maxChildXids = 0; s->storageEngineType = SE_TYPE_UNSPECIFIED; + t_thrd.xact_cxt.XactTopTransactionId = InvalidTransactionId; + t_thrd.xact_cxt.nParallelCurrentXids = 0; + #ifdef PGXC s->isLocalParameterUsed = false; ForgetTransactionLocalNode(); @@ -3116,6 +3225,8 @@ static void PrepareTransaction(bool stpCommit) char* nodestring = NULL; #endif + Assert(!IsInParallelMode()); + ShowTransactionState("PrepareTransaction"); /* @@ -3410,7 +3521,7 @@ static void PrepareTransaction(bool stpCommit) * since we push the search path hasn't pop yet. */ if (!stpCommit) { - AtEOXact_Namespace(true); + AtEOXact_Namespace(true, false); } AtEOXact_SMgr(); AtEOXact_Files(); @@ -3703,6 +3814,7 @@ static void AbortTransaction(bool PerfectRollback, bool stpRollback) t_thrd.xact_cxt.needRemoveTwophaseState = false; /* check the current transaction state */ + bool is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE) ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("AbortTransaction while in %s state", TransStateAsString(s->state)))); @@ -3735,6 +3847,12 @@ static void AbortTransaction(bool PerfectRollback, bool stpRollback) SetUserIdAndSecContext(s->prevUser, s->prevSecContext); u_sess->exec_cxt.is_exec_trigger_func = false; + /* If in parallel mode, clean up workers and exit parallel mode. */ + if (IsInParallelMode()) { + AtEOXact_Parallel(false); + s->parallelModeLevel = 0; + } + /* * do abort processing */ @@ -3753,9 +3871,22 @@ static void AbortTransaction(bool PerfectRollback, bool stpRollback) /* * Advertise the fact that we aborted in pg_clog (assuming that we got as - * far as assigning an XID to advertise). + * far as assigning an XID to advertise). But if we're inside a parallel + * worker, skip this; the user backend must be the one to write the abort + * record. */ - latestXid = RecordTransactionAbort(false); + if (!is_parallel_worker) { + latestXid = RecordTransactionAbort(false); + } else { + latestXid = InvalidTransactionId; + + /* + * Since the parallel master won't get our value of XactLastRecEnd in + * this case, we nudge WAL-writer ourselves in this case. See related + * comments in RecordTransactionAbort for why this matters. + */ + XLogSetAsyncXactLSN(t_thrd.xlog_cxt.XactLastRecEnd); + } t_thrd.pgxact->prepare_xid = InvalidTransactionId; @@ -3807,7 +3938,7 @@ static void AbortTransaction(bool PerfectRollback, bool stpRollback) AtEOXact_SPI(false, stpRollback, false); AtEOXact_on_commit_actions(false); if (!stpRollback) { - AtEOXact_Namespace(false); + AtEOXact_Namespace(false, is_parallel_worker); } AtEOXact_SMgr(); AtEOXact_Files(); @@ -3942,9 +4073,10 @@ void StartTransactionCommand(bool stpRollback) } break; - /* These cases are invalid. */ + /* These cases are invalid. 
*/ case TBLOCK_STARTED: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_END: case TBLOCK_SUBRELEASE: @@ -3981,9 +4113,12 @@ void CommitTransactionCommand(bool stpCommit) /* * This shouldn't happen, because it means the previous * StartTransactionCommand didn't set the STARTED state - * appropriately. + * appropriately, while TBLOCK_PARALLEL_INPROGRESS + * should be ended by EndParallelWorkerTranaction(), + * not this function. */ case TBLOCK_DEFAULT: + case TBLOCK_PARALLEL_INPROGRESS: ereport(FATAL, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("CommitTransactionCommand: unexpected state %s", BlockStateAsString(s->blockState)))); @@ -4299,6 +4434,7 @@ void AbortCurrentTransaction(bool stpRollback) * ABORT state. We will stay in ABORT until we get a ROLLBACK. */ case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: AbortTransaction(false, stpRollback); s->blockState = TBLOCK_ABORT; /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */ @@ -4795,6 +4931,7 @@ void BeginTransactionBlock(void) /* Already a transaction block in progress. */ case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBINPROGRESS: case TBLOCK_ABORT: case TBLOCK_SUBABORT: @@ -5023,6 +5160,15 @@ bool EndTransactionBlock(void) result = true; break; + /* + * The user issued a COMMIT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot commit during a parallel operation"))); + break; + /* These cases are invalid. */ case TBLOCK_DEFAULT: case TBLOCK_BEGIN: @@ -5125,6 +5271,15 @@ void UserAbortTransactionBlock(void) s->blockState = TBLOCK_ABORT_PENDING; break; + /* + * The user issued an ABORT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot abort during a parallel operation"))); + break; + /* These cases are invalid. */ case TBLOCK_DEFAULT: case TBLOCK_BEGIN: @@ -5160,6 +5315,18 @@ void DefineSavepoint(const char* name) { TransactionState s = CurrentTransactionState; + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot define savepoints during a parallel operation"))); + } + switch (s->blockState) { case TBLOCK_INPROGRESS: case TBLOCK_SUBINPROGRESS: @@ -5180,6 +5347,7 @@ void DefineSavepoint(const char* name) case TBLOCK_DEFAULT: case TBLOCK_STARTED: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_END: case TBLOCK_SUBRELEASE: @@ -5230,6 +5398,18 @@ void ReleaseSavepoint(List* options) ListCell* cell = NULL; char* name = NULL; + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) 
+ */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot release savepoints during a parallel operation"))); + } + switch (s->blockState) { /* * We can't rollback to a savepoint if there is no savepoint @@ -5250,6 +5430,7 @@ void ReleaseSavepoint(List* options) case TBLOCK_DEFAULT: case TBLOCK_STARTED: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_END: case TBLOCK_SUBRELEASE: @@ -5325,6 +5506,19 @@ void RollbackToSavepoint(List* options) ListCell* cell = NULL; char* name = NULL; + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot rollback to savepoints during a parallel operation"))); + } + + switch (s->blockState) { /* * We can't rollback to a savepoint if there is no savepoint @@ -5344,6 +5538,7 @@ void RollbackToSavepoint(List* options) case TBLOCK_DEFAULT: case TBLOCK_STARTED: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_END: case TBLOCK_SUBRELEASE: @@ -5435,6 +5630,21 @@ void BeginInternalSubTransaction(const char* name) { TransactionState s = CurrentTransactionState; + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. We might be able to make an exception for the type of + * subtransaction established by this function, which is typically used in + * contexts where we're going to release or roll back the subtransaction + * before proceeding further, so that no enduring change to the + * transaction state occurs. For now, however, we prohibit this case along + * with all the others. + */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot start subtransactions during a parallel operation"))); + } + switch (s->blockState) { case TBLOCK_STARTED: case TBLOCK_INPROGRESS: @@ -5456,6 +5666,7 @@ void BeginInternalSubTransaction(const char* name) /* These cases are invalid. */ case TBLOCK_DEFAULT: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_SUBRELEASE: case TBLOCK_SUBCOMMIT: @@ -5487,6 +5698,18 @@ void ReleaseCurrentSubTransaction(void) { TransactionState s = CurrentTransactionState; + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for commit of subtransactions after that + * point. This should not happen anyway. Code calling this would + * typically have called BeginInternalSubTransaction() first, failing + * there. 
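All of these guards rest on a simple parallel-mode counter kept in the transaction state. A minimal sketch of that bookkeeping, mirroring the EnterParallelMode()/ExitParallelMode()/IsInParallelMode() functions this patch declares in xact.h (the real bodies live in xact.cpp and may differ in detail):

void EnterParallelModeSketch(void)
{
    TransactionState s = CurrentTransactionState;

    Assert(s->parallelModeLevel >= 0);
    ++s->parallelModeLevel;
}

void ExitParallelModeSketch(void)
{
    TransactionState s = CurrentTransactionState;

    Assert(s->parallelModeLevel > 0);
    --s->parallelModeLevel;
}

bool IsInParallelModeSketch(void)
{
    return CurrentTransactionState->parallelModeLevel != 0;
}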
+ */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit subtransactions during a parallel operation"))); + } + if (s->blockState != TBLOCK_SUBINPROGRESS) { ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), @@ -5520,6 +5743,7 @@ void RollbackAndReleaseCurrentSubTransaction(void) case TBLOCK_DEFAULT: case TBLOCK_STARTED: case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBBEGIN: case TBLOCK_INPROGRESS: case TBLOCK_END: @@ -5620,6 +5844,7 @@ void AbortOutOfAnyTransaction(bool reserve_topxact_abort) break; case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: AbortTransaction(); if (reserve_topxact_abort) { s->blockState = TBLOCK_ABORT; @@ -5740,6 +5965,7 @@ char TransactionBlockStatusCode(void) case TBLOCK_BEGIN: case TBLOCK_SUBBEGIN: case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: case TBLOCK_SUBINPROGRESS: case TBLOCK_END: case TBLOCK_SUBRELEASE: @@ -5854,6 +6080,12 @@ static void CommitSubTransaction(bool stpCommit) ereport(WARNING, (errmsg("CommitSubTransaction while in %s state", TransStateAsString(s->state)))); } + /* If in parallel mode, clean up workers and exit parallel mode. */ + if (IsInParallelMode()) { + AtEOSubXact_Parallel(true, s->subTransactionId); + s->parallelModeLevel = 0; + } + /* Pre-commit processing goes here -- nothing to do at the moment */ s->state = TRANS_COMMIT; @@ -6085,6 +6317,12 @@ static void AbortSubTransaction(bool stpRollback) SetUserIdAndSecContext(s->prevUser, s->prevSecContext); u_sess->exec_cxt.is_exec_trigger_func = false; + /* Exit from parallel mode, if necessary. */ + if (IsInParallelMode()) { + AtEOSubXact_Parallel(false, s->subTransactionId); + s->parallelModeLevel = 0; + } + /* * We can skip all this stuff if the subxact failed before creating a * ResourceOwner... @@ -6316,9 +6554,56 @@ static void PopTransaction(void) pfree(s); } +/* + * SetParallelStartTimestamps + * + * In a parallel worker, we should inherit the parent transaction's + * timestamps rather than setting our own. The parallel worker + * infrastructure must call this to provide those values before + * calling StartTransaction() or SetCurrentStatementStartTimestamp(). + */ +void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts) +{ + Assert(IsParallelWorker()); + t_thrd.xact_cxt.xactStartTimestamp = xact_ts; + t_thrd.xact_cxt.stmtStartTimestamp = stmt_ts; +} + +/* + * StartParallelWorkerTransaction + * Start a parallel worker transaction, restoring the relevant + * transaction state serialized by SerializeTransactionState. + */ +void StartParallelWorkerTransaction(ParallelInfoContext *cxt) +{ + Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT); + StartTransaction(true); + + u_sess->utils_cxt.XactIsoLevel = cxt->xactIsoLevel; + u_sess->attr.attr_storage.XactDeferrable = cxt->xactDeferrable; + TopTransactionStateData.transactionId = cxt->topTransactionId; + CurrentTransactionState->transactionId = cxt->currentTransactionId; + t_thrd.xact_cxt.currentCommandId = cxt->currentCommandId; + t_thrd.xact_cxt.nParallelCurrentXids = cxt->nParallelCurrentXids; + t_thrd.xact_cxt.ParallelCurrentXids = cxt->ParallelCurrentXids; + + CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS; +} + +/* + * EndParallelWorkerTransaction + * End a parallel worker transaction. 
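Taken together, the entry points above give a worker its transaction environment. A hedged sketch of the order a worker might call them in; the real sequence lives in ParallelWorkerMain() and also restores GUCs, snapshots, user identity and more:

static void WorkerTransactionBootstrapSketch(ParallelInfoContext *cxt)
{
    /* Timestamps must be inherited from the leader before StartTransaction(). */
    SetParallelStartTimestamps(cxt->xact_ts, cxt->stmt_ts);

    /* Enter a TBLOCK_PARALLEL_INPROGRESS transaction mirroring the leader's state. */
    StartParallelWorkerTransaction(cxt);

    /* ... restore snapshot, combo CIDs and remaining state, then run the subplan ... */

    /* Commit locally; the worker writes no commit record, the leader does that. */
    EndParallelWorkerTransaction();
}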
+ */ +void EndParallelWorkerTransaction(void) +{ + Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS); + CommitTransaction(); + CurrentTransactionState->blockState = TBLOCK_DEFAULT; +} + /* * ShowTransactionState - * Debug support + * Debug support */ static void ShowTransactionState(const char* str) { @@ -6384,6 +6669,8 @@ static const char* BlockStateAsString(TBlockState blockState) return "BEGIN"; case TBLOCK_INPROGRESS: return "INPROGRESS"; + case TBLOCK_PARALLEL_INPROGRESS: + return "PARALLEL_INPROGRESS"; case TBLOCK_END: return "END"; case TBLOCK_ABORT: diff --git a/src/gausskernel/storage/buffer/localbuf.cpp b/src/gausskernel/storage/buffer/localbuf.cpp index 4169f98b8..8bc222a81 100644 --- a/src/gausskernel/storage/buffer/localbuf.cpp +++ b/src/gausskernel/storage/buffer/localbuf.cpp @@ -18,6 +18,7 @@ #include "knl/knl_variable.h" #include "catalog/catalog.h" +#include "access/parallel.h" #include "access/double_write.h" #include "executor/instrument.h" #include "storage/buf_internals.h" @@ -435,6 +436,19 @@ static void InitLocalBuffers(void) HASHCTL info; int i; + /* + * Parallel workers can't access data in temporary tables, because they + * have no visibility into the local buffers of their leader. This is a + * convenient, low-cost place to provide a backstop check for that. Note + * that we don't wish to prevent a parallel worker from accessing catalog + * metadata about a temp table, so checks at higher levels would be + * inappropriate. + */ + if (IsParallelWorker()) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot access temporary tables during a parallel operation"))); + } + /* Allocate and zero buffer headers and auxiliary arrays */ u_sess->storage_cxt.LocalBufferDescriptors = (BufferDesc*)MemoryContextAllocZero(u_sess->top_mem_cxt, (unsigned int)nbufs * sizeof(BufferDesc)); diff --git a/src/gausskernel/storage/ipc/Makefile b/src/gausskernel/storage/ipc/Makefile index 4b09ca2ce..8bdc7c560 100644 --- a/src/gausskernel/storage/ipc/Makefile +++ b/src/gausskernel/storage/ipc/Makefile @@ -17,6 +17,6 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif OBJS = ipc.o ipci.o pmsignal.o procarray.o procsignal.o shmem.o shmqueue.o \ - sinval.o sinvaladt.o standby.o shm_mq.o shm_toc.o + sinval.o sinvaladt.o standby.o shm_mq.o shm_toc.o dsm.o include $(top_srcdir)/src/gausskernel/common.mk \ No newline at end of file diff --git a/src/gausskernel/storage/ipc/dsm.cpp b/src/gausskernel/storage/ipc/dsm.cpp new file mode 100644 index 000000000..9571d82bd --- /dev/null +++ b/src/gausskernel/storage/ipc/dsm.cpp @@ -0,0 +1,63 @@ +/* ------------------------------------------------------------------------- + * + * dsm.c + * manage dynamic shared memory segments + * + * This file provides a set of services to make programming with dynamic + * shared memory segments more convenient. Unlike the low-level + * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments + * created using this module will be cleaned up automatically. Mappings + * will be removed when the resource owner under which they were created + * is cleaned up, unless dsm_pin_mapping() is used, in which case they + * have session lifespan. Segments will be removed when there are no + * remaining mappings, or at postmaster shutdown in any case. After a + * hard postmaster crash, remaining segments will be removed, if they + * still exist, at the next postmaster startup. 
+ * + * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/gausskernel/storage/ipc/dsm.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "storage/dsm.h" +#include "knl/knl_session.h" +#include "utils/memutils.h" +#include "postmaster/bgworker_internals.h" + +void dsm_detach(void **seg) +{ + Assert(*seg != NULL); + knl_u_parallel_context *ctx = (knl_u_parallel_context *)*seg; + MemoryContextDelete(ctx->memCtx); + ctx->memCtx = NULL; + ctx->pwCtx = NULL; + ctx->used = false; +} + +void *dsm_create(void) +{ + for (int i = 0; i < DSM_MAX_ITEM_PER_QUERY; i++) { + if (u_sess->parallel_ctx[i].used == false) { + u_sess->parallel_ctx[i].memCtx = AllocSetContextCreate(u_sess->top_mem_cxt, "parallel query", + ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT); + + MemoryContext oldContext = MemoryContextSwitchTo(u_sess->parallel_ctx[i].memCtx); + u_sess->parallel_ctx[i].pwCtx = (ParallelInfoContext *)palloc0(sizeof(ParallelInfoContext)); + (void)MemoryContextSwitchTo(oldContext); + + u_sess->parallel_ctx[i].used = true; + return &(u_sess->parallel_ctx[i]); + } + } + + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("too many dynamic shared memory segments"))); + return NULL; +} + diff --git a/src/gausskernel/storage/ipc/procarray.cpp b/src/gausskernel/storage/ipc/procarray.cpp index 0877e6a19..4e97936cf 100644 --- a/src/gausskernel/storage/ipc/procarray.cpp +++ b/src/gausskernel/storage/ipc/procarray.cpp @@ -819,6 +819,45 @@ void ProcArrayInitRecovery(TransactionId initializedUptoXID) TransactionIdRetreat(t_thrd.storage_cxt.latestObservedXid); } +/* + * ProcArrayInstallRestoredXmin -- install restored xmin into MyPgXact->xmin + * + * This is like ProcArrayInstallImportedXmin, but we have a pointer to the + * PGPROC of the transaction from which we imported the snapshot, rather than + * an XID. + * + * Returns TRUE if successful, FALSE if source xact is no longer running. + */ +bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) +{ + bool result = false; + + Assert(TransactionIdIsNormal(xmin)); + Assert(proc != NULL); + + /* Get lock so source xact can't end while we're doing this */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + volatile PGXACT *pgxact = &g_instance.proc_base->allPgXact[proc->pgprocno]; + + /* + * Be certain that the referenced PGPROC has an advertised xmin which is + * no later than the one we're installing, so that the system-wide xmin + * can't go backwards. Also, make sure it's running in the same database, + * so that the per-database xmin cannot go backwards. + */ + TransactionId xid = pgxact->xmin; /* fetch just once */ + if (proc->databaseId == u_sess->proc_cxt.MyDatabaseId && TransactionIdIsNormal(xid) && + TransactionIdPrecedesOrEquals(xid, xmin)) { + t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = xmin; + result = true; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + /* * GetRunningTransactionData -- returns information about running transactions. 
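The dsm_create()/dsm_detach() pair above replaces true dynamic shared memory with per-session slots. A usage sketch based only on those definitions; the field touched here is just an example:

static void ParallelSlotUsageSketch(void)
{
    void *seg = dsm_create();   /* claims a free u_sess->parallel_ctx[] slot */
    knl_u_parallel_context *ctx = (knl_u_parallel_context *)seg;

    /* Per-query shared state is allocated inside the slot's memory context. */
    MemoryContext oldcxt = MemoryContextSwitchTo(ctx->memCtx);
    ctx->pwCtx->pscan_num = 0;
    (void)MemoryContextSwitchTo(oldcxt);

    /* Deleting the context releases everything and frees the slot for reuse. */
    dsm_detach(&seg);
}

Because the slots live in u_sess rather than in a real shared segment, the limit of DSM_MAX_ITEM_PER_QUERY concurrent parallel contexts per session is enforced directly in dsm_create().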
* @@ -2658,8 +2697,8 @@ int CountDBBackends(Oid databaseid) int pgprocno = arrayP->pgprocnos[index]; volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno]; - if (proc->pid == 0) - continue; /* do not count prepared xacts */ + if (proc->pid == 0 || t_thrd.bgworker_cxt.is_background_worker) + continue; /* do not count prepared xacts and background workers */ if (!OidIsValid(databaseid) || proc->databaseId == databaseid) count++; @@ -2721,8 +2760,8 @@ int CountUserBackends(Oid roleid) int pgprocno = arrayP->pgprocnos[index]; volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno]; - if (proc->pid == 0) - continue; /* do not count prepared xacts */ + if (proc->pid == 0 || t_thrd.bgworker_cxt.is_background_worker) + continue; /* do not count prepared xacts and background workers */ if (proc->roleId == roleid) count++; diff --git a/src/gausskernel/storage/ipc/procsignal.cpp b/src/gausskernel/storage/ipc/procsignal.cpp index 3308fe14e..4b16f0aeb 100644 --- a/src/gausskernel/storage/ipc/procsignal.cpp +++ b/src/gausskernel/storage/ipc/procsignal.cpp @@ -19,6 +19,7 @@ #include #include +#include "access/parallel.h" #include "commands/async.h" #include "miscadmin.h" #include "storage/latch.h" @@ -272,6 +273,9 @@ void procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT)) HandleNotifyInterrupt(); + if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE)) + HandleParallelMessageInterrupt(); + #ifdef PGXC if (CheckProcSignal(PROCSIG_PGXCPOOL_RELOAD)) diff --git a/src/gausskernel/storage/ipc/shm_mq.cpp b/src/gausskernel/storage/ipc/shm_mq.cpp index 7c731336b..9daa949d5 100644 --- a/src/gausskernel/storage/ipc/shm_mq.cpp +++ b/src/gausskernel/storage/ipc/shm_mq.cpp @@ -127,7 +127,7 @@ struct shm_mq { */ struct shm_mq_handle { shm_mq *mqh_queue; - char *mqh_segment; + void *mqh_segment; BackgroundWorkerHandle *mqh_handle; char *mqh_buffer; Size mqh_buflen; @@ -265,7 +265,7 @@ PGPROC *shm_mq_get_sender(shm_mq *mq) * counterpart won't get stuck waiting for us to fill or drain the queue * after we've already lost interest. */ -shm_mq_handle *shm_mq_attach(shm_mq *mq, char *seg, BackgroundWorkerHandle *handle) +shm_mq_handle *shm_mq_attach(shm_mq *mq, void *seg, BackgroundWorkerHandle *handle) { shm_mq_handle *mqh = (shm_mq_handle*)palloc(sizeof(shm_mq_handle)); diff --git a/src/gausskernel/storage/lmgr/lock.cpp b/src/gausskernel/storage/lmgr/lock.cpp index 614681944..cb6f363ea 100755 --- a/src/gausskernel/storage/lmgr/lock.cpp +++ b/src/gausskernel/storage/lmgr/lock.cpp @@ -1196,6 +1196,36 @@ bool IsInSameTransaction(PGPROC *proc1, PGPROC *proc2) : u_sess->stream_cxt.global_obj->inNodeGroup(proc1->pid, proc2->pid); } +/* + * When a query runs in parallel mode, the parallel leader and worker threads hold different + * Procs, but we treat them as one transaction. + */ +static bool IsInSameParallelQuery(PGPROC *proc1, PGPROC *proc2) +{ + if (!IsInParallelMode()) { + return false; + } + + /* Which proc is me?
*/ + PGPROC *otherProc = NULL; + if (proc1 == t_thrd.proc) { + otherProc = proc2; + } else if (proc2 == t_thrd.proc) { + otherProc = proc1; + } else { + return false; + } + + if (ParallelWorkerAmI()) { + /* I'm worker, so check whether other proc is my master or not */ + return t_thrd.msqueue_cxt.pq_mq_parallel_master_pid == otherProc->pid; + } else if (ParallelLeaderAmI()) { + /* I'm leader, so check whether other proc is a worker of mine or not */ + return GetBackgroundWorkerTypeByPid(otherProc->pid) != NULL; + } + return false; +} + /* * LockCheckConflicts -- test whether requested lock conflicts * with those already granted @@ -1246,12 +1276,13 @@ int LockCheckConflicts(LockMethod lockMethodTable, LOCKMODE lockmode, LOCK *lock * thread is in one transaction, but these threads use differnt procs. * We need treat these procs as one proc */ - if (StreamTopConsumerAmI() || StreamThreadAmI()) { + if (StreamTopConsumerAmI() || StreamThreadAmI() || ParallelWorkerAmI() || ParallelLeaderAmI()) { SHM_QUEUE *otherProcLocks = &(lock->procLocks); PROCLOCK *otherProcLock = (PROCLOCK *)SHMQueueNext(otherProcLocks, otherProcLocks, offsetof(PROCLOCK, lockLink)); while (otherProcLock != NULL) { - if (IsInSameTransaction(otherProcLock->tag.myProc, proc)) { + if (IsInSameParallelQuery(otherProcLock->tag.myProc, proc) || + IsInSameTransaction(otherProcLock->tag.myProc, proc)) { if (otherProcLock->holdMask & LOCKBIT_ON((unsigned int)i)) ++myHolding; } diff --git a/src/gausskernel/storage/lmgr/predicate.cpp b/src/gausskernel/storage/lmgr/predicate.cpp index 5b9832923..421b0159e 100755 --- a/src/gausskernel/storage/lmgr/predicate.cpp +++ b/src/gausskernel/storage/lmgr/predicate.cpp @@ -1425,6 +1425,16 @@ static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, Transac Assert(!RecoveryInProgress()); + /* + * Since all parts of a serializable transaction must use the same + * snapshot, it is too late to establish one after a parallel operation + * has begun. 
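One level up, the PROCSIG_PARALLEL_MESSAGE branch added to procsignal_sigusr1_handler() dispatches to HandleParallelMessageInterrupt(). A plausible minimal form of that handler; the exact t_thrd field spellings are assumptions, apart from ParallelMessagePending, which this patch adds to knl_t_bgworker_context:

static void HandleParallelMessageInterruptSketch(void)
{
    t_thrd.int_cxt.InterruptPending = true;
    t_thrd.bgworker_cxt.ParallelMessagePending = true;
    SetLatch(&t_thrd.proc->procLatch);
}

The heavy lifting happens later, outside the signal handler, when HandleParallelMessages() drains the workers' error queues.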
+ */ + if (IsInParallelMode()) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot establish serializable snapshot during a parallel operation"))); + } + proc = t_thrd.proc; Assert(proc != NULL); GET_VXID_FROM_PGPROC(vxid, *proc); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 86b9bc49b..aac08d89b 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -80,6 +80,7 @@ extern void bucketClosePartition(Partition bucket); /* struct definition appears in relscan.h */ typedef struct HeapScanDescData* HeapScanDesc; +typedef struct ParallelHeapScanDescData *ParallelHeapScanDesc; /* * HeapScanIsValid @@ -101,6 +102,11 @@ extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); +extern Size heap_parallelscan_estimate(Snapshot snapshot); +extern void heap_parallelscan_initialize(ParallelHeapScanDesc target, Size pscan_len, Relation relation, + Snapshot snapshot); +extern HeapScanDesc heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan); + extern void heap_init_parallel_seqscan(HeapScanDesc scan, int32 dop, ScanDirection dir); extern HeapTuple heapGetNextForVerify(HeapScanDesc scan, ScanDirection direction, bool& isValidRelationPage); diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h new file mode 100644 index 000000000..13b3f52f8 --- /dev/null +++ b/src/include/access/parallel.h @@ -0,0 +1,68 @@ +/* ------------------------------------------------------------------------- + * + * parallel.h + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/parallel.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef PARALLEL_H +#define PARALLEL_H + +#include "access/xlogdefs.h" +#include "lib/ilist.h" +#include "postmaster/bgworker.h" +#include "storage/shm_mq.h" + +typedef void (*parallel_worker_main_type)(void *seg); + +typedef struct ParallelWorkerInfo { + BackgroundWorkerHandle *bgwhandle; + shm_mq_handle *error_mqh; + ThreadId pid; +} ParallelWorkerInfo; + +typedef struct ParallelContext { + dlist_node node; + SubTransactionId subid; + int nworkers; + int nworkers_launched; + char *library_name; + char *function_name; + ErrorContextCallback *error_context_stack; + void *seg; + void *private_memory; + ParallelWorkerInfo *worker; + int nknown_attached_workers; + bool *known_attached_workers; +} ParallelContext; + +typedef struct ParallelWorkerContext { + void *seg; +} ParallelWorkerContext; + +#define IsParallelWorker() (t_thrd.bgworker_cxt.ParallelWorkerNumber >= 0) + +extern ParallelContext *CreateParallelContext(const char *library_name, const char *function_name, int nworkers); +extern void InitializeParallelDSM(ParallelContext *pcxt); +extern void ReinitializeParallelDSM(ParallelContext *pcxt); +extern void LaunchParallelWorkers(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); +extern void DestroyParallelContext(ParallelContext *pcxt); +extern bool ParallelContextActive(void); + +extern void HandleParallelMessageInterrupt(void); +extern void HandleParallelMessages(void); +extern void AtEOXact_Parallel(bool isCommit); +extern void 
AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId); +extern void ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end); + +extern void ParallelWorkerMain(Datum main_arg); + +#endif /* PARALLEL_H */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index a88a34e56..3ae8208c5 100755 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -21,6 +21,27 @@ #define PARALLEL_SCAN_GAP 100 +/* + * Shared state for parallel heap scan. + * + * Each backend participating in a parallel heap scan has its own + * HeapScanDesc in backend-private memory, and those objects all contain + * a pointer to this structure. The information here must be sufficient + * to properly initialize each new HeapScanDesc as workers join the scan, + * and it must act as a font of block numbers for those workers. + */ +typedef struct ParallelHeapScanDescData { + int plan_node_id; /* used to identify speicific plan */ + Oid phs_relid; /* OID of relation to scan */ + bool phs_syncscan; /* report location to syncscan logic? */ + BlockNumber phs_nblocks; /* # blocks in relation at start of scan */ + slock_t phs_mutex; /* mutual exclusion for setting startblock */ + BlockNumber phs_startblock; /* starting block number */ + pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to workers so far. */ + uint32 pscan_len; /* total size of this struct, including phs_snapshot_data */ + char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; +} ParallelHeapScanDescData; + /* ---------------------------------------------------------------- * Scan State Information * ---------------------------------------------------------------- @@ -62,18 +83,17 @@ typedef struct HeapScanDescData { Snapshot rs_snapshot; /* snapshot to see */ int rs_nkeys; /* number of scan keys */ ScanKey rs_key; /* array of scan key descriptors */ - bool rs_bitmapscan; /* true if this is really a bitmap scan */ - bool rs_samplescan; /* true if this is really a sample scan */ - bool rs_pageatatime; /* verify visibility page-at-a-time? */ - bool rs_allow_strat; /* allow or disallow use of access strategy */ - bool rs_allow_sync; /* allow or disallow use of syncscan */ + /* + * Information about type and behaviour of the scan, a bitmask of members + * of the ScanOptions enum (see tableam.h). + */ + uint32 rs_flags; /* state set up at initscan time */ BlockNumber rs_nblocks; /* number of blocks to scan */ BlockNumber rs_startblock; /* block # to start at */ BufferAccessStrategy rs_strategy; /* access strategy for reads */ bool rs_syncscan; /* report location to syncscan logic? */ - bool rs_isRangeScanInRedis; /* if it is a range scan in redistribution */ /* scan current state */ bool rs_inited; /* false = scan not init'd yet */ @@ -82,6 +102,7 @@ typedef struct HeapScanDescData { Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */ + ParallelHeapScanDesc rs_parallel; /* parallel scan information */ /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8c42d02ba..8a53128e2 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -28,6 +28,26 @@ #include "utils/snapshot.h" #include "nodes/execnodes.h" +/* + * Bitmask values for the flags argument to the scan_begin callback. 
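The ParallelHeapScanDescData above is what the new heapam.h entry points operate on. A sketch of how a scan gets shared; where the descriptor is placed (here a caller-supplied shared memory context) and snapshot serialization are simplified:

static HeapScanDesc BeginSharedScanSketch(Relation rel, Snapshot snapshot, MemoryContext shared_cxt)
{
    Size len = heap_parallelscan_estimate(snapshot);
    ParallelHeapScanDesc pscan = (ParallelHeapScanDesc)MemoryContextAllocZero(shared_cxt, len);

    /* The leader fills in relid, block count, start block and the snapshot data. */
    heap_parallelscan_initialize(pscan, len, rel, snapshot);

    /* The leader and each worker then build a private HeapScanDesc on top of it. */
    return heap_beginscan_parallel(rel, pscan);
}

Block handout then goes through the phs_nallocated counter, so participants claim disjoint block ranges without any further coordination.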
+ */ +typedef enum ScanOptions { + /* true if this is really a bitmap scan */ + SO_TYPE_BITMAPSCAN = 1 << 1, + /* true if this is really a sample scan */ + SO_TYPE_SAMPLESCAN = 1 << 2, + /* if it is a range scan in redistribution */ + SO_TYPE_RANGESCAN = 1 << 3, + /* verify visibility page-at-a-time? */ + SO_ALLOW_PAGEMODE = 1 << 4, + /* allow or disallow use of access strategy */ + SO_ALLOW_STRAT = 1 << 5, + /* allow or disallow use of syncscan */ + SO_ALLOW_SYNC = 1 << 6, + /* unregister snapshot at scan end? */ + SO_TEMP_SNAPSHOT = 1 << 7 +} ScanOptions; + extern bool reset_scan_qual(Relation currHeapRel, ScanState * node); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index be5e6b94c..c263ead49 100755 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -328,6 +328,9 @@ extern void BeginInternalSubTransaction(const char* name); extern void ReleaseCurrentSubTransaction(void); extern void RollbackAndReleaseCurrentSubTransaction(void); extern bool IsSubTransaction(void); +extern void StartParallelWorkerTransaction(ParallelInfoContext *cxt); +extern void EndParallelWorkerTransaction(void); +extern void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts); extern void SetCurrentTransactionId(TransactionId tid); extern bool IsTransactionBlock(void); extern bool IsTransactionOrTransactionBlock(void); @@ -390,4 +393,8 @@ extern bool IsMixedEngineUsed(); extern void SetCurrentTransactionStorageEngine(StorageEngineType storageEngineType); extern void CallXactCallbacks(XactEvent event); +extern void EnterParallelMode(void); +extern void ExitParallelMode(void); +extern bool IsInParallelMode(void); + #endif /* XACT_H */ diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h index ddd32daff..2da5f5a3a 100755 --- a/src/include/catalog/namespace.h +++ b/src/include/catalog/namespace.h @@ -157,6 +157,8 @@ extern bool isTempOrToastNamespace(Oid namespaceId); extern bool isAnyTempNamespace(Oid namespaceId); extern bool isOtherTempNamespace(Oid namespaceId); extern Oid GetTempToastNamespace(void); +extern void GetTempNamespaceState(Oid *tempNamespaceId, Oid *tempToastNamespaceId); +extern void SetTempNamespaceState(Oid tempNamespaceId, Oid tempToastNamespaceId); extern void ResetTempTableNamespace(void); extern OverrideSearchPath *GetOverrideSearchPath(MemoryContext context); @@ -173,7 +175,7 @@ extern Oid FindDefaultConversionProc(int4 for_encoding, int4 to_encoding); /* initialization & transaction cleanup code */ extern void InitializeSearchPath(void); -extern void AtEOXact_Namespace(bool isCommit); +extern void AtEOXact_Namespace(bool isCommit, bool parallel); extern void AtEOSubXact_Namespace(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8ee216c4c..97b53c8d0 100755 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -413,6 +413,15 @@ typedef FormData_pg_proc *Form_pg_proc; #define PROVOLATILE_STABLE 's' /* does not change within a scan */ #define PROVOLATILE_VOLATILE 'v' /* can change even within a scan */ +/* + * Symbolic values for proparallel column: these indicate whether a function + * can be safely be run in a parallel backend, during parallelism but + * necessarily in the master, or only in non-parallel mode. 
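Tying the xact.h and access/parallel.h additions together, a leader drives a parallel operation roughly as follows. The "postgres" library name, the ParallelQueryMain entry point (declared in execParallel.h) and the worker count are illustrative; for queries the real driver is ExecInitParallelPlan():

static void LeaderLifecycleSketch(void)
{
    EnterParallelMode();

    ParallelContext *pcxt = CreateParallelContext("postgres", "ParallelQueryMain", 2);
    InitializeParallelDSM(pcxt);            /* size and fill the shared state */
    LaunchParallelWorkers(pcxt);

    /* ... run the leader's copy of the plan and drain the tuple queues ... */

    WaitForParallelWorkersToFinish(pcxt);
    DestroyParallelContext(pcxt);

    ExitParallelMode();
}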
+ */ +#define PROPARALLEL_SAFE 's' /* can run in worker or master */ +#define PROPARALLEL_RESTRICTED 'r' /* can run in parallel master only */ +#define PROPARALLEL_UNSAFE 'u' /* banned while in parallel mode */ + /* * Symbolic values for proargmodes column. Note that these must agree with * the FunctionParameterMode enum in parsenodes.h; we declare them here to diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h new file mode 100644 index 000000000..947d57067 --- /dev/null +++ b/src/include/executor/execParallel.h @@ -0,0 +1,38 @@ +/* -------------------------------------------------------------------- + * execParallel.h + * POSTGRES parallel execution interface + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/executor/execParallel.h + * -------------------------------------------------------------------- + */ + +#ifndef EXECPARALLEL_H +#define EXECPARALLEL_H + +#include "access/parallel.h" +#include "nodes/execnodes.h" +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" + +typedef struct SharedExecutorInstrumentation SharedExecutorInstrumentation; + +typedef struct ParallelExecutorInfo { + PlanState *planstate; + ParallelContext *pcxt; + BufferUsage *buffer_usage; + SharedExecutorInstrumentation *instrumentation; + shm_mq_handle **tqueue; + bool finished; +} ParallelExecutorInfo; + +extern ParallelExecutorInfo *ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers); +extern void ExecParallelFinish(ParallelExecutorInfo *pei); +extern void ExecParallelCleanup(ParallelExecutorInfo *pei); +extern void ExecParallelReinitialize(ParallelExecutorInfo *pei); + +extern void ParallelQueryMain(void *seg); +#endif /* EXECPARALLEL_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index e329385e0..2a0aa45d6 100755 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -247,6 +247,7 @@ extern PlanState* ExecInitNode(Plan* node, EState* estate, int eflags); extern TupleTableSlot* ExecProcNode(PlanState* node); extern Node* MultiExecProcNode(PlanState* node); extern void ExecEndNode(PlanState* node); +extern bool ExecShutdownNode(PlanState *node); extern long ExecGetPlanMemCost(Plan* node); diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 938b97848..c160d39f9 100755 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -320,6 +320,11 @@ typedef struct Instrumentation { RecursiveInfo recursiveInfo; } Instrumentation; +typedef struct WorkerInstrumentation { + int num_workers; /* # of structures that follow */ + Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} WorkerInstrumentation; + /* instrumentation data */ typedef struct InstrStreamPlanData { /* whether the plannode is valid */ @@ -1000,9 +1005,14 @@ typedef struct size_info { extern OperatorProfileTable g_operator_table; extern Instrumentation* InstrAlloc(int n, int instrument_options); +extern void InstrInit(Instrumentation *instr, int instrument_options); extern void InstrStartNode(Instrumentation* instr); extern void InstrStopNode(Instrumentation* instr, double nTuples); extern void InstrEndLoop(Instrumentation* instr); +extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); +extern void InstrStartParallelQuery(void); +extern void InstrEndParallelQuery(BufferUsage *result); +extern void 
InstrAccumParallelQuery(BufferUsage *result); extern void StreamEndLoop(StreamTime* instr); extern void AddControlMemoryContext(Instrumentation* instr, MemoryContext context); extern void CalculateContextSize(MemoryContext ctx, int64* memorySize); diff --git a/src/include/executor/nodeGather.h b/src/include/executor/nodeGather.h new file mode 100644 index 000000000..d161df59d --- /dev/null +++ b/src/include/executor/nodeGather.h @@ -0,0 +1,25 @@ +/* ------------------------------------------------------------------------- + * + * nodeGather.h + * prototypes for nodeGather.c + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeGather.h + * + * ------------------------------------------------------------------------- + */ +#ifndef NODEGATHER_H +#define NODEGATHER_H + +#include "nodes/execnodes.h" + +extern GatherState *ExecInitGather(Gather *node, EState *estate, int eflags); +extern TupleTableSlot *ExecGather(GatherState *node); +extern void ExecEndGather(GatherState *node); +extern void ExecShutdownGather(GatherState *node); +extern void ExecReScanGather(GatherState *node); + +#endif /* NODEGATHER_H */ diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h index fa5ddf20f..2860fdb88 100644 --- a/src/include/executor/nodeSeqscan.h +++ b/src/include/executor/nodeSeqscan.h @@ -14,6 +14,7 @@ #ifndef NODESEQSCAN_H #define NODESEQSCAN_H +#include "access/parallel.h" #include "nodes/execnodes.h" extern SeqScanState* ExecInitSeqScan(SeqScan* node, EState* estate, int eflags); @@ -25,4 +26,9 @@ extern void ExecReScanSeqScan(SeqScanState* node); extern void InitScanRelation(SeqScanState* node, EState* estate); +/* parallel scan support */ +extern void ExecSeqScanEstimate(SeqScanState *node, ParallelContext *pcxt); +extern void ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt, int nodeid); +extern void ExecSeqScanInitializeWorker(SeqScanState *node, void *context); + #endif /* NODESEQSCAN_H */ diff --git a/src/include/executor/spi.h b/src/include/executor/spi.h index 962cc56b8..722336ef9 100755 --- a/src/include/executor/spi.h +++ b/src/include/executor/spi.h @@ -133,6 +133,7 @@ extern void SPICleanup(void); extern void AtEOXact_SPI(bool isCommit, bool stpRollback, bool stpCommit); extern void AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid, bool stpRollback, bool stpCommit); +extern bool SPI_inside_nonatomic_context(void); extern DestReceiver* createAnalyzeSPIDestReceiver(CommandDest dest); /* SPI execution helpers */ extern void spi_exec_with_callback(CommandDest dest, const char* src, bool read_only, long tcount, bool direct_call, diff --git a/src/include/executor/tqueue.h b/src/include/executor/tqueue.h new file mode 100644 index 000000000..e09840bb8 --- /dev/null +++ b/src/include/executor/tqueue.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * tqueue.h + * Use shm_mq to send & receive tuples between parallel backends + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/tqueue.h + * + *------------------------------------------------------------------------- + */ + +#ifndef TQUEUE_H +#define TQUEUE_H + +#include "storage/shm_mq.h" +#include "tcop/dest.h" + +/* Use this to send tuples to a shm_mq. 
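In practice the two ends of a tuple queue pair up as sketched below, based on the declarations that follow; where the shm_mq_handle values come from (the per-worker queues set up by execParallel.cpp) is assumed:

/* Worker side: point the executor's DestReceiver at the queue. */
static DestReceiver *WorkerDestSketch(shm_mq_handle *mqh)
{
    return CreateTupleQueueDestReceiver(mqh);
}

/* Leader side: pull tuples until the worker detaches. */
static void DrainQueueSketch(shm_mq_handle *mqh, TupleDesc tupdesc)
{
    TupleQueueReader *reader = CreateTupleQueueReader(mqh, tupdesc);
    bool done = false;

    while (!done) {
        HeapTuple tup = TupleQueueReaderNext(reader, false, &done);
        if (tup != NULL) {
            /* ... store the tuple into a slot and hand it to the caller ... */
        }
    }
    DestroyTupleQueueReader(reader);
}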
*/ +extern DestReceiver *CreateTupleQueueDestReceiver(shm_mq_handle *handle); + +/* Use these to receive tuples from a shm_mq. */ +typedef struct TupleQueueReader TupleQueueReader; +extern TupleQueueReader *CreateTupleQueueReader(shm_mq_handle *handle, TupleDesc tupledesc); +extern void DestroyTupleQueueReader(TupleQueueReader *funnel); +extern HeapTuple TupleQueueReaderNext(TupleQueueReader *, bool nowait, bool *done); + +#endif /* TQUEUE_H */ diff --git a/src/include/gs_thread.h b/src/include/gs_thread.h index dc0bfaf1b..47cdecbd4 100755 --- a/src/include/gs_thread.h +++ b/src/include/gs_thread.h @@ -107,6 +107,7 @@ typedef enum knl_thread_role { WAL_STANDBY_SENDER, /* Am I cascading WAL to another standby ? */ WAL_DB_SENDER, TOP_CONSUMER, + BACKGROUND_LEADER, } knl_thread_role; /* diff --git a/src/include/knl/knl_guc/knl_instance_attr_common.h b/src/include/knl/knl_guc/knl_instance_attr_common.h index a4b64a2aa..914de7f28 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_common.h +++ b/src/include/knl/knl_guc/knl_instance_attr_common.h @@ -76,6 +76,10 @@ typedef struct knl_instance_attr_common { bool enable_alarm; char* Alarm_component; char* MOTConfigFileName; + + int max_worker_processes; + int max_parallel_workers; + int max_parallel_workers_per_gather; } knl_instance_attr_common; #endif /* SRC_INCLUDE_KNL_KNL_INSTANCE_ATTR_COMMON_H_ */ diff --git a/src/include/knl/knl_guc/knl_session_attr_sql.h b/src/include/knl/knl_guc/knl_session_attr_sql.h index 6527cc665..212f853e5 100644 --- a/src/include/knl/knl_guc/knl_session_attr_sql.h +++ b/src/include/knl/knl_guc/knl_session_attr_sql.h @@ -154,6 +154,7 @@ typedef struct knl_session_attr_sql { int acce_min_datasize_per_thread; int max_cn_temp_file_size; int default_statistics_target; + int min_parallel_table_scan_size; /* Memory Limit user could set in session */ int FencedUDFMemoryLimit; int64 g_default_expthresh; @@ -163,6 +164,8 @@ typedef struct knl_session_attr_sql { double allocate_mem_cost; double cpu_index_tuple_cost; double cpu_operator_cost; + double parallel_tuple_cost; + double parallel_setup_cost; double stream_multiple; double cursor_tuple_fraction; double Geqo_selection_bias; @@ -198,8 +201,10 @@ typedef struct knl_session_attr_sql { bool enable_opfusion; bool enable_beta_opfusion; bool enable_beta_nestloop_fusion; + bool parallel_leader_participation; int opfusion_debug_mode; int single_shard_stmt; + int force_parallel_mode; } knl_session_attr_sql; #endif /* SRC_INCLUDE_KNL_KNL_SESSION_ATTR_SQL */ diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h index 130161cc3..1e5e1ae45 100644 --- a/src/include/knl/knl_session.h +++ b/src/include/knl/knl_session.h @@ -45,6 +45,7 @@ #include #include "c.h" +#include "access/heapam.h" #include "datatype/timestamp.h" #include "gs_thread.h" #include "knl/knl_guc.h" @@ -59,10 +60,14 @@ #include "storage/backendid.h" #include "storage/s_lock.h" #include "storage/shmem.h" +#include "storage/predicate.h" +#include "postmaster/bgworker.h" +#include "storage/dsm.h" #include "utils/palloc.h" typedef void (*pg_on_exit_callback)(int code, Datum arg); + /* all session level attribute which expose to user. 
*/ typedef struct knl_session_attr { knl_session_attr_sql attr_sql; @@ -2036,6 +2041,63 @@ typedef struct knl_u_ext_fdw_context { pg_on_exit_callback fdwExitFunc; /* Exit callback, will be called when session exit */ } knl_u_ext_fdw_context; +/* Info need to pass from leader to worker */ +typedef struct ParallelInfoContext { + Oid database_id; + Oid authenticated_user_id; + Oid current_user_id; + Oid outer_user_id; + Oid temp_namespace_id; + Oid temp_toast_namespace_id; + int sec_context; + bool is_superuser; + void *parallel_master_pgproc; /* PGPROC */ + ThreadId parallel_master_pid; + BackendId parallel_master_backend_id; + TimestampTz xact_ts; + TimestampTz stmt_ts; + char *pstmt_space; + char *param_space; + Size param_len; + int pscan_num; + ParallelHeapScanDesc *pscan; + int usedComboCids; + int sizeComboCids; + HTAB *comboHash; + struct ComboCidKeyData *comboCids; + char *tsnapspace; + Size tsnapspace_len; + char *asnapspace; + Size asnapspace_len; + struct RelMapFile *active_shared_updates; + struct RelMapFile *active_local_updates; + char *errorQueue; + int xactIsoLevel; + bool xactDeferrable; + TransactionId topTransactionId; + TransactionId currentTransactionId; + CommandId currentCommandId; + int nParallelCurrentXids; + TransactionId *ParallelCurrentXids; + char *library_name; + char *function_name; + BufferUsage *bufUsage; + char *tupleQueue; + struct SharedExecutorInstrumentation *instrumentation; + char *namespace_search_path; + + /* Mutex protects remaining fields. */ + slock_t mutex; + /* Maximum XactLastRecEnd of any worker. */ + XLogRecPtr last_xlog_end; +} ParallelInfoContext; + +typedef struct knl_u_parallel_context { + ParallelInfoContext *pwCtx; + MemoryContext memCtx; + bool used; +} knl_u_parallel_context; + enum knl_session_status { KNL_SESS_FAKE, KNL_SESS_UNINIT, @@ -2131,6 +2193,9 @@ typedef struct knl_session_context { /* external FDW */ knl_u_ext_fdw_context ext_fdw_ctx[MAX_TYPE_FDW]; + + /* parallel query context */ + knl_u_parallel_context parallel_ctx[DSM_MAX_ITEM_PER_QUERY]; } knl_session_context; extern knl_session_context* create_session_context(MemoryContext parent, uint64 id); diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index a1325f9be..e9f58d0cb 100644 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -346,6 +346,33 @@ typedef struct knl_t_xact_context { struct SERIALIZABLEXACT* MySerializableXact; bool MyXactDidWrite; + /* + * When running as a parallel worker, we place only a single + * TransactionStateData on the parallel worker's state stack, and the XID + * reflected there will be that of the *innermost* currently-active + * subtransaction in the backend that initiated parallelism. However, + * GetTopTransactionId and TransactionIdIsCurrentTransactionId + * need to return the same answers in the parallel worker as they would have + * in the user backend, so we need some additional bookkeeping. + * + * XactTopTransactionId stores the XID of our toplevel transaction, which + * will be the same as TopTransactionState.transactionId in an ordinary + * backend; but in a parallel backend, which does not have the entire + * transaction state, it will instead be copied from the backend that started + * the parallel operation. + * + * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary + * backend, but in a parallel backend, nParallelCurrentXids will contain the + * number of XIDs that need to be considered current, and ParallelCurrentXids + * will contain the XIDs themselves. 
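Most of these fields are filled in by the leader while it builds the per-query parallel context. A small, hedged sample of that step; the accessor spellings below are assumptions, and the full list (snapshots, combo CIDs, GUC state, serialized plan, queues) is much longer:

static void FillParallelInfoSketch(ParallelInfoContext *cxt)
{
    cxt->database_id = u_sess->proc_cxt.MyDatabaseId;
    GetUserIdAndSecContext(&cxt->current_user_id, &cxt->sec_context);
    cxt->parallel_master_pid = t_thrd.proc_cxt.MyProcPid;
    cxt->xact_ts = GetCurrentTransactionStartTimestamp();
    cxt->stmt_ts = GetCurrentStatementStartTimestamp();

    SpinLockInit(&cxt->mutex);
    cxt->last_xlog_end = 0;
}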
This includes all XIDs that were current + * or sub-committed in the parent at the time the parallel operation began. + * The XIDs are stored sorted in numerical order (not logical order) to make + * lookups as fast as possible. + */ + TransactionId XactTopTransactionId; + int nParallelCurrentXids; + TransactionId *ParallelCurrentXids; + #ifdef PGXC bool useLocalSnapshot; /* @@ -2724,6 +2751,23 @@ typedef struct knl_t_bgworker_context { * The postmaster's list of registered background workers, in private memory. */ slist_head background_worker_list; + + /* Is there a parallel message pending which we need to receive? */ + volatile bool ParallelMessagePending; + /* Are we initializing a parallel worker? */ + bool InitializingParallelWorker; + /* + * Our parallel worker number. We initialize this to -1, meaning that we are + * not a parallel worker. In parallel workers, it will be set to a value >= 0 + * and < the number of workers before any user code is invoked; each parallel + * worker will get a different parallel worker number. + */ + int ParallelWorkerNumber; + /* List of active parallel contexts. */ + dlist_head pcxt_list; + + BufferUsage *save_pgBufferUsage; + MemoryContext hpm_context; } knl_t_bgworker_context; struct shm_mq; diff --git a/src/include/libpq/pqmq.h b/src/include/libpq/pqmq.h index 2a749790e..a760249c3 100644 --- a/src/include/libpq/pqmq.h +++ b/src/include/libpq/pqmq.h @@ -18,7 +18,7 @@ extern void pq_redirect_to_shm_mq(shm_mq_handle* mqh); extern void pq_stop_redirect_to_shm_mq(void); -extern void pq_set_parallel_master(pid_t pid, BackendId backend_id); +extern void pq_set_parallel_master(ThreadId pid, BackendId backend_id); extern void pq_parse_errornotice(StringInfo str, ErrorData* edata); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 4b0a3c9be..cbf3ab312 100755 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -207,6 +207,7 @@ extern bool stack_is_too_deep(void); /* in tcop/utility.c */ extern void PreventCommandIfReadOnly(const char* cmdname); +extern void PreventCommandIfParallelMode(const char *cmdname); extern void PreventCommandDuringRecovery(const char* cmdname); extern int trace_recovery(int trace_level); @@ -410,6 +411,9 @@ extern void EarlyBindingTLSVariables(void); extern bool StreamThreadAmI(); extern void StreamTopConsumerReset(); extern bool StreamTopConsumerAmI(); +extern bool ParallelWorkerAmI(); +extern bool ParallelLeaderAmI(); + /* * converts the 64 bits unsigned integer between host byte order and network byte order. diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5a755f491..25485356a 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1192,6 +1192,7 @@ typedef struct PlanState { * top-level plan */ Instrumentation* instrument; /* Optional runtime stats for this node */ + WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ /* * Common structural data for all Plan types. These links to subsidiary @@ -1546,6 +1547,7 @@ typedef struct ScanState { bool isSampleScan; /* identify is it table sample scan or not. */ SampleScanParams sampleScanInfo; /* TABLESAMPLE params include type/seed/repeatable. 
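Because ParallelCurrentXids is kept sorted in plain numerical order, the worker-side check behind TransactionIdIsCurrentTransactionId() can be a straight binary search over that array; a sketch, with the real lookup living in xact.cpp:

static bool XidIsCurrentInWorkerSketch(TransactionId xid)
{
    int low = 0;
    int high = t_thrd.xact_cxt.nParallelCurrentXids - 1;

    while (low <= high) {
        int middle = low + (high - low) / 2;
        TransactionId probe = t_thrd.xact_cxt.ParallelCurrentXids[middle];

        if (probe == xid)
            return true;
        else if (probe < xid)
            low = middle + 1;
        else
            high = middle - 1;
    }
    return false;
}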
*/ ExecScanAccessMtd ScanNextMtd; + Size pscan_len; /* size of parallel heap scan descriptor */ } ScanState; /* @@ -2285,6 +2287,24 @@ typedef struct UniqueState { MemoryContext tempContext; /* short-term context for comparisons */ } UniqueState; +/* ---------------- + * GatherState information + * + * Gather nodes launch 1 or more parallel workers, run a subplan + * in those workers, and collect the results. + * ---------------- + */ +typedef struct GatherState { + PlanState ps; /* its first field is NodeTag */ + bool initialized; + struct ParallelExecutorInfo *pei; + int nreaders; + int nextreader; + struct TupleQueueReader **reader; + TupleTableSlot *funnel_slot; + bool need_to_scan_locally; +} GatherState; + /* ---------------- * HashState information * ---------------- diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index bc9ca374a..6043b6bf0 100755 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -56,4 +56,6 @@ extern bool is_func_distinct_unshippable(Oid funcid); extern bool lockNextvalWalker(Node* node, void* context); +struct PlanState; +extern bool planstate_tree_walker(struct PlanState *planstate, bool (*walker)(), void *context); #endif /* NODEFUNCS_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 1c900fe0d..1aab61316 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -74,6 +74,7 @@ typedef enum NodeTag { T_Agg, T_WindowAgg, T_Unique, + T_Gather, T_Hash, T_SetOp, T_LockRows, @@ -151,6 +152,7 @@ typedef enum NodeTag { T_AggState, T_WindowAggState, T_UniqueState, + T_GatherState, T_HashState, T_SetOpState, T_LockRowsState, @@ -289,6 +291,7 @@ typedef enum NodeTag { T_ResultPath, T_MaterialPath, T_UniquePath, + T_GatherPath, T_PartIteratorPath, T_EquivalenceClass, T_EquivalenceMember, diff --git a/src/include/nodes/params.h b/src/include/nodes/params.h index 9d17ddafc..a0f035dc9 100755 --- a/src/include/nodes/params.h +++ b/src/include/nodes/params.h @@ -79,6 +79,7 @@ typedef struct ParamListInfoData { void* parserSetupArg; int numParams; /* number of ParamExternDatas following */ bool params_need_process; + struct Bitmapset *paramMask; /* if non-NULL, can ignore omitted params */ ParamExternData params[FLEXIBLE_ARRAY_MEMBER]; } ParamListInfoData; @@ -112,5 +113,8 @@ enum { CURSOR_ISOPEN = 1, CURSOR_FOUND, CURSOR_NOTFOUND, CURSOR_ROWCOUNT }; /* Functions found in src/backend/nodes/params.c */ extern ParamListInfo copyParamList(ParamListInfo from); +extern Size EstimateParamListSpace(ParamListInfo paramLI); +extern void SerializeParamList(ParamListInfo paramLI, char *start_address, Size len); +extern ParamListInfo RestoreParamList(char *start_address, Size len); #endif /* PARAMS_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 2da4118aa..fc2bf3e19 100755 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2539,6 +2539,7 @@ typedef struct SecLabelStmt { #define CURSOR_OPT_FAST_PLAN 0x0020 /* prefer fast-start plan */ #define CURSOR_OPT_GENERIC_PLAN 0x0040 /* force use of generic plan */ #define CURSOR_OPT_CUSTOM_PLAN 0x0080 /* force use of custom plan */ +#define CURSOR_OPT_PARALLEL_OK 0x0100 /* parallel mode OK */ typedef struct DeclareCursorStmt { NodeTag type; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index f62ecc5aa..5f4fca027 100755 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -85,6 +85,8 @@ typedef struct PlannedStmt { bool dependsOnRole; /* is 
plan specific to current role? */ + bool parallelModeNeeded; /* parallel mode required to execute? */ + Plan* planTree; /* tree of Plan nodes */ List* rtable; /* list of RangeTblEntry nodes */ @@ -242,6 +244,11 @@ typedef struct Plan { int plan_width; /* average row width in bytes */ int dop; /* degree of parallelism of current plan */ + /* + * information needed for parallel query + */ + bool parallel_aware; /* engage parallel-aware logic? */ + /* * machine learning model estimations */ @@ -1144,6 +1151,16 @@ typedef struct Unique { Oid* uniqOperators; /* equality operators to compare with */ } Unique; +/* ------------ + * gather node + * ------------ + */ +typedef struct Gather { + Plan plan; + int num_workers; + bool single_copy; +} Gather; + /* ---------------- * hash build node * diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index b9ab9a64f..67fb1b0f0 100755 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -157,6 +157,10 @@ typedef struct PlannerGlobal { bool dependsOnRole; /* is plan specific to current role? */ + bool parallelModeOK; /* parallel mode potentially OK? */ + + bool parallelModeNeeded; /* parallel mode actually required? */ + /* Added post-release, will be in a saner place in 9.3: */ int nParamExec; /* number of PARAM_EXEC Params used */ bool insideRecursion; /* For sql on hdfs, internal flag. */ @@ -548,6 +552,8 @@ typedef struct RelOptInfo { int encodedwidth; /* estimated avg width of encoded columns in result tuples */ AttrNumber encodednum; /* number of encoded column */ + bool consider_parallel; /* consider parallel paths? */ + /* materialization information */ List* reltargetlist; /* Vars to be output by scan of relation */ List* distribute_keys; /* distribute key */ @@ -878,6 +884,9 @@ typedef struct Path { RelOptInfo* parent; /* the relation this path can build */ ParamPathInfo* param_info; /* parameterization info, or NULL if none */ + bool parallel_aware; /* engage parallel-aware logic? */ + bool parallel_safe; /* OK to use as part of parallel plan? */ + /* estimated size/costs for path (see costsize.c for more info) */ double rows; /* estimated number of global result tuples */ double multiple; @@ -1173,6 +1182,18 @@ typedef struct UniquePath { OpMemInfo mem_info; /* Memory info for hashagg or sort */ } UniquePath; +/* + * GatherPath runs several copies of a plan in parallel and collects the + * results. The parallel leader may also execute the plan, unless the + * single_copy flag is set. + */ +typedef struct GatherPath { + Path path; + Path *subpath; /* path for each worker */ + int num_workers; /* number of workers sought to help */ + bool single_copy; /* don't execute path more than once */ +} GatherPath; + /* * All join-type paths share these fields. 
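The Gather plan node and the GatherState above meet in ExecGather(). A hedged sketch of its first call, pieced together from the structures in this patch; setting up the TupleQueueReaders from pei->tqueue and the single_copy/local-scan fallback are omitted:

static void GatherStartWorkersSketch(GatherState *node, EState *estate)
{
    Gather *gather = (Gather *)node->ps.plan;

    if (!node->initialized) {
        /* Build shared executor state and start the background workers. */
        node->pei = ExecInitParallelPlan(outerPlanState(node), estate, gather->num_workers);
        LaunchParallelWorkers(node->pei->pcxt);
        node->nreaders = node->pei->pcxt->nworkers_launched;
        node->nextreader = 0;
        node->initialized = true;
    }
}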
*/ diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index ce19295b1..a7fbc402e 100755 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -65,6 +65,7 @@ extern bool contain_subplans(Node* clause); extern bool contain_mutable_functions(Node* clause); extern bool contain_volatile_functions(Node* clause); extern bool contain_specified_function(Node* clause, Oid funcid); +extern bool has_parallel_hazard(Node *node, bool allow_restricted); extern bool contain_nonstrict_functions(Node* clause, bool check_agg = false); extern bool contain_leaky_functions(Node* clause); extern bool exec_simple_check_mutable_function(Node* clause); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 080a5b57a..72116787c 100755 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -51,6 +51,8 @@ #define LOCAL_RECEIVE_KDATA_COST 1.3 /* The receive cost for local stream */ #define DEFAULT_SMP_THREAD_COST 1000 /* The cost for add a new thread */ #define DEFAULT_STREAM_MULTIPLE 1.0 +#define DEFAULT_PARALLEL_TUPLE_COST 0.1 +#define DEFAULT_PARALLEL_SETUP_COST 1000.0 #define DEFAULT_EFFECTIVE_CACHE_SIZE 16384 /* measured in pages */ @@ -80,7 +82,8 @@ extern void cost_update(Path* path, bool vectorized, Cost input_cost, double tup extern double clamp_row_est(double nrows); extern double index_pages_fetched( double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo* root, bool ispartitionedindex); -extern void cost_seqscan(Path* path, PlannerInfo* root, RelOptInfo* baserel, ParamPathInfo* param_info); +extern void cost_seqscan(Path* path, PlannerInfo* root, RelOptInfo* baserel, + ParamPathInfo* param_info, int nworkers = 0); extern void cost_samplescan(Path* path, PlannerInfo* root, RelOptInfo* baserel, ParamPathInfo* param_info); extern void cost_cstorescan(Path* path, PlannerInfo* root, RelOptInfo* baserel); extern void cost_dfsscan(Path* path, PlannerInfo* root, RelOptInfo* baserel); @@ -130,6 +133,7 @@ extern void final_cost_hashjoin(PlannerInfo* root, HashPath* path, JoinCostWorks extern void cost_rescan(PlannerInfo* root, Path* path, Cost* rescan_startup_cost, /* output parameters */ Cost* rescan_total_cost, OpMemInfo* mem_info); extern Cost cost_rescan_material(double rows, int width, OpMemInfo* mem_info, bool vectorized, int dop); +extern void cost_gather(GatherPath *path, RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_subplan(PlannerInfo* root, SubPlan* subplan, Plan* plan); extern void cost_qual_eval(QualCost* cost, List* quals, PlannerInfo* root); extern void cost_qual_eval_node(QualCost* cost, Node* qual, PlannerInfo* root); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 9dcdef176..cc8f78415 100755 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -50,7 +50,8 @@ extern void add_path(PlannerInfo* root, RelOptInfo* parent_rel, Path* new_path); extern bool add_path_precheck( RelOptInfo* parent_rel, Cost startup_cost, Cost total_cost, List* pathkeys, Relids required_outer); -extern Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_outer, int dop = 1); +extern Path* create_seqscan_path(PlannerInfo* root, RelOptInfo* rel, Relids required_outer, + int dop = 1, int nworkers = 0); extern Path* create_cstorescan_path(PlannerInfo* root, RelOptInfo* rel, int dop = 1); extern Path *create_tsstorescan_path(PlannerInfo* root, RelOptInfo* rel, int dop = 1); extern IndexPath* 
create_index_path(PlannerInfo* root, IndexOptInfo* index, List* indexclauses, List* indexclausecols, @@ -73,6 +74,8 @@ extern MergeAppendPath* create_merge_append_path( extern ResultPath* create_result_path(List* quals, Path* subpath = NULL); extern MaterialPath* create_material_path(Path* subpath, bool materialize_all = false); extern UniquePath* create_unique_path(PlannerInfo* root, RelOptInfo* rel, Path* subpath, SpecialJoinInfo* sjinfo); +extern GatherPath *create_gather_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Relids required_outer, + int nworkers); extern Path* create_subqueryscan_path(PlannerInfo* root, RelOptInfo* rel, List* pathkeys, Relids required_outer); extern Path* create_functionscan_path(PlannerInfo* root, RelOptInfo* rel); extern Path* create_valuesscan_path(PlannerInfo* root, RelOptInfo* rel); diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 8fc4dd034..4b3e7894a 100755 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -44,6 +44,13 @@ typedef struct { bool has_denserank; } DenseRank_context; +/* possible values for force_parallel_mode */ +typedef enum { + FORCE_PARALLEL_OFF, + FORCE_PARALLEL_ON, + FORCE_PARALLEL_REGRESS +} ForceParallelMode; + extern ExecNodes* getExecNodesByGroupName(const char* gname); extern PlannedStmt* planner(Query* parse, int cursorOptions, ParamListInfo boundParams); extern PlannedStmt* standard_planner(Query* parse, int cursorOptions, ParamListInfo boundParams); diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index 6d529e0ff..0634a9f18 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -96,6 +96,7 @@ typedef struct BackgroundWorker { Datum bgw_main_arg; char bgw_extra[BGW_EXTRALEN]; ThreadId bgw_notify_pid; /* SIGUSR1 this backend on start/stop */ + void *bgw_parallel_context; } BackgroundWorker; typedef enum BgwHandleStatus { diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 5070a7996..a20a4c0bc 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -140,6 +140,8 @@ extern void ClosePostmasterPorts(bool am_syslogger); extern int MaxLivePostmasterChildren(void); +extern bool PostmasterMarkPIDForWorkerNotify(ThreadId pid); + extern Size CBMShmemSize(void); extern void CBMShmemInit(void); diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h new file mode 100644 index 000000000..f07111d78 --- /dev/null +++ b/src/include/storage/dsm.h @@ -0,0 +1,48 @@ +/* ------------------------------------------------------------------------- + * + * dsm.h + * manage dynamic shared memory segments + * + * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/dsm.h + * + * ------------------------------------------------------------------------- + */ +#ifndef DSM_H +#define DSM_H + +#define DSM_MAX_ITEM_PER_QUERY 8 + +/* Startup and shutdown functions. */ +#define dsm_cleanup_using_control_segment(oldControlHandle) +#define dsm_postmaster_startup(shmemHeader) +#define dsm_backend_shutdown +#define dsm_detach_all +#define dsm_set_control_handle(dsmHandle) + +/* Functions that create or remove mappings. */ +extern void *dsm_create(void); +#define dsm_attach(dsmHandle) +extern void dsm_detach(void **seg); + +/* Resource management functions. 
*/ +#define dsm_pin_mapping(dsmSegment) +#define dsm_unpin_mapping(dsmSegment) +#define dsm_pin_segment(dsmSegment) +#define dsm_unpin_segment(dsmHandle) +#define dsm_find_mapping(dsmHandle) + +/* Informational functions. */ +#define dsm_segment_address(dsmSegment) +#define dsm_segment_map_length(dsmSegment) +#define dsm_segment_handle(dsmSegment) + +/* Cleanup hooks. */ +#define on_dsm_detach(dsmSegment, callbackFunc, arg) +#define cancel_on_dsm_detach(dsmSegment, callbackFunc, arg) +#define reset_on_dsm_detach + +#endif /* DSM_H */ \ No newline at end of file diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index bcd291932..858ce5d3e 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -55,6 +55,8 @@ extern Snapshot GetLocalSnapshotData(Snapshot snapshot); extern void ReleaseSnapshotData(Snapshot snapshot); extern bool ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid); +extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); + extern void set_proc_csn_and_check(const char* func, CommitSeqNo csn_min, SnapshotType snapshot_type); extern RunningTransactions GetRunningTransactionData(void); diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h index 27f98af44..8d00ede30 100644 --- a/src/include/storage/shm_mq.h +++ b/src/include/storage/shm_mq.h @@ -53,7 +53,7 @@ extern PGPROC *shm_mq_get_receiver(shm_mq *); extern PGPROC *shm_mq_get_sender(shm_mq *); /* Set up backend-local queue state. */ -extern shm_mq_handle *shm_mq_attach(shm_mq *mq, char *seg, +extern shm_mq_handle *shm_mq_attach(shm_mq *mq, void *seg, BackgroundWorkerHandle *handle); /* Associate worker handle with shm_mq. */ diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h index 464324e08..653ea1d2d 100755 --- a/src/include/tcop/dest.h +++ b/src/include/tcop/dest.h @@ -108,7 +108,8 @@ typedef enum { DestBatchLocalRedistribute, /* results send to consumer thread in a local redistribute way */ DestBatchLocalRoundRobin, /* results send to consumer thread in a local roundrobin way */ - DestBatchHybrid + DestBatchHybrid, + DestTupleQueue /* results sent to tuple queue */ } CommandDest; diff --git a/src/include/utils/datum.h b/src/include/utils/datum.h index 087f904d0..c818045df 100644 --- a/src/include/utils/datum.h +++ b/src/include/utils/datum.h @@ -45,4 +45,12 @@ extern void datumFree(Datum value, bool typByVal, int typLen); */ extern bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen); +/* + * Serialize and restore datums so that we can transfer them to parallel + * workers. 
+ */ +extern Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen); +extern void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address, Size *remainLen); +extern Datum datumRestore(char **start_address, Size *remainLen, bool *isnull); + #endif /* DATUM_H */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 2c83af000..99a8d2562 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -79,6 +79,7 @@ extern Oid get_func_variadictype(Oid funcid); extern bool get_func_retset(Oid funcid); extern bool func_strict(Oid funcid); extern char func_volatile(Oid funcid); +extern char func_parallel(Oid funcid); extern bool get_func_proshippable(Oid funcid); extern bool get_func_leakproof(Oid funcid); extern float4 get_func_cost(Oid funcid); diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 0fa155283..2b2b2fd90 100755 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -14,6 +14,7 @@ #define SNAPMGR_H #include "utils/resowner.h" +#include "utils/snapshot.h" extern Snapshot GetTransactionSnapshot(bool force_local_snapshot = false); extern Snapshot GetLatestSnapshot(void); @@ -59,4 +60,10 @@ extern struct HTAB* HistoricSnapshotGetTupleCids(void); extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB* tuplecids); extern void TeardownHistoricSnapshot(bool is_error); extern bool HistoricSnapshotActive(void); + +extern Size EstimateSnapshotSpace(Snapshot snapshot); +extern void SerializeSnapshot(Snapshot snapshot, char *start_address, Size len); +extern Snapshot RestoreSnapshot(char *start_address, Size len); +extern void RestoreTransactionSnapshot(Snapshot snapshot, void *master_pgproc); + #endif /* SNAPMGR_H */ diff --git a/src/test/regress/expected/bypass_simplequery_support.out b/src/test/regress/expected/bypass_simplequery_support.out index 900262ef4..68771fa29 100644 --- a/src/test/regress/expected/bypass_simplequery_support.out +++ b/src/test/regress/expected/bypass_simplequery_support.out @@ -7,6 +7,7 @@ set enable_seqscan=off; set opfusion_debug_mode = 'log'; set log_min_messages=debug; set logging_module = 'on(OPFUSION)'; +set max_parallel_workers_per_gather=0; -- create table drop table if exists test_bypass_sq1; NOTICE: table "test_bypass_sq1" does not exist, skipping diff --git a/src/test/regress/sql/bypass_simplequery_support.sql b/src/test/regress/sql/bypass_simplequery_support.sql index 156ba7c64..7922d127b 100644 --- a/src/test/regress/sql/bypass_simplequery_support.sql +++ b/src/test/regress/sql/bypass_simplequery_support.sql @@ -7,6 +7,7 @@ set enable_seqscan=off; set opfusion_debug_mode = 'log'; set log_min_messages=debug; set logging_module = 'on(OPFUSION)'; +set max_parallel_workers_per_gather=0; -- create table drop table if exists test_bypass_sq1;
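Editorial note, not part of the patch: the utils/datum.h hunk above adds datumEstimateSpace()/datumSerialize()/datumRestore() so parameter values can be shipped to parallel workers. The sketch below is illustrative only. It assumes the new remainLen argument tracks the unconsumed portion of the buffer and that both functions advance *start_address as they go; the helper names serialize_one_datum and restore_one_datum are invented for this example and do not exist in the patch.

    /* Illustrative sketch; not part of this patch. */
    #include "postgres.h"
    #include "utils/datum.h"

    /* Leader side: flatten one datum into a palloc'd buffer. */
    static char *serialize_one_datum(Datum value, bool isnull, bool typByVal, int typLen, Size *len_out)
    {
        Size len = datumEstimateSpace(value, isnull, typByVal, typLen);
        char *buf = (char *)palloc(len);
        char *cursor = buf;
        Size remain = len;

        /* assumed to advance cursor and decrement remain as bytes are written */
        datumSerialize(value, isnull, typByVal, typLen, &cursor, &remain);

        *len_out = len;
        return buf;
    }

    /* Worker side: rebuild the datum from the transferred bytes. */
    static Datum restore_one_datum(char *buf, Size len, bool *isnull)
    {
        char *cursor = buf;
        Size remain = len;

        return datumRestore(&cursor, &remain, isnull);
    }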