diff --git a/GNUmakefile.in b/GNUmakefile.in index b5b099d37..13996e225 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -95,6 +95,7 @@ install: $(MAKE) -C contrib/hstore $@ $(MAKE) -C contrib/dblink $@ $(MAKE) -C contrib/ndpplugin $@ + @if test -d contrib/spq_plugin; then $(MAKE) -C contrib/spq_plugin $@; fi @if test -d contrib/dolphin; then $(MAKE) -C contrib/dolphin $@; fi @if test -d contrib/whale; then $(MAKE) -C contrib/whale $@; fi diff --git a/build/script/aarch64_opengauss_list b/build/script/aarch64_opengauss_list index cc3c38841..8865f3ba9 100644 --- a/build/script/aarch64_opengauss_list +++ b/build/script/aarch64_opengauss_list @@ -71,6 +71,8 @@ ./share/postgresql/extension/security_plugin--1.0.sql ./share/postgresql/extension/ndpplugin.control ./share/postgresql/extension/ndpplugin--1.0.sql +./share/postgresql/extension/spqplugin.control +./share/postgresql/extension/spqplugin--1.0.sql ./share/postgresql/extension/dolphin.control ./share/postgresql/extension/dolphin--3.0.sql ./share/postgresql/extension/dolphin--1.0--1.1.sql @@ -772,6 +774,7 @@ ./lib/postgresql/utf8_and_big5.so ./lib/postgresql/mppdb_decoding.so ./lib/postgresql/ndpplugin.so +./lib/postgresql/spqplugin.so ./lib/postgresql/pg_plugin ./lib/postgresql/proc_srclib ./lib/postgresql/security_plugin.so diff --git a/build/script/x86_64_opengauss_list b/build/script/x86_64_opengauss_list index 4083c32f6..d490d1e6e 100644 --- a/build/script/x86_64_opengauss_list +++ b/build/script/x86_64_opengauss_list @@ -71,6 +71,8 @@ ./share/postgresql/extension/security_plugin--1.0.sql ./share/postgresql/extension/ndpplugin.control ./share/postgresql/extension/ndpplugin--1.0.sql +./share/postgresql/extension/spqplugin.control +./share/postgresql/extension/spqplugin--1.0.sql ./share/postgresql/extension/dolphin.control ./share/postgresql/extension/dolphin--3.0.sql ./share/postgresql/extension/dolphin--1.0--1.1.sql @@ -772,6 +774,7 @@ 
./lib/postgresql/utf8_and_big5.so ./lib/postgresql/mppdb_decoding.so ./lib/postgresql/ndpplugin.so +./lib/postgresql/spqplugin.so ./lib/postgresql/pg_plugin ./lib/postgresql/proc_srclib ./lib/postgresql/security_plugin.so diff --git a/cmake/src/build_options.cmake b/cmake/src/build_options.cmake index b357cd106..8c78e36c9 100755 --- a/cmake/src/build_options.cmake +++ b/cmake/src/build_options.cmake @@ -71,6 +71,7 @@ option(USE_TASSL "build with tassl, the old is --with-tassl" OFF)#ON option(ENABLE_THREAD_SAFETY "enable thread safety, the old is --enable-thread-safety" ON) #The following are basically no need to configure, because these libraries are necessary or must not be used in mppdb +option(USE_SPQ "enable spq optimizer" OFF) option(USE_BONJOUR "enable bonjour, the old is --with-bonjour" OFF) option(USE_LDAP "build with ldap, the old is --with-ldap" OFF)#ON option(USE_ETCD "build with etcd libs, new option for old mppdb, after 8.1 close it" OFF) @@ -158,7 +159,7 @@ set(PROTECT_OPTIONS -fwrapv -std=c++14 -fnon-call-exceptions ${OPTIMIZE_LEVEL}) set(WARNING_OPTIONS -Wall -Wendif-labels -Wformat-security) set(OPTIMIZE_OPTIONS -pipe -pthread -fno-aggressive-loop-optimizations -fno-expensive-optimizations -fno-omit-frame-pointer -fno-strict-aliasing -freg-struct-return) set(CHECK_OPTIONS -Wmissing-format-attribute -Wno-attributes -Wno-unused-but-set-variable -Wno-write-strings -Wpointer-arith) -set(MACRO_OPTIONS -D_GLIBCXX_USE_CXX11_ABI=0 -DENABLE_GSTRACE -D_GNU_SOURCE -DPGXC -D_POSIX_PTHREAD_SEMANTICS -D_REENTRANT -DSTREAMPLAN -D_THREAD_SAFE ${DB_COMMON_DEFINE}) +set(MACRO_OPTIONS -D_GLIBCXX_USE_CXX11_ABI=0 -DENABLE_GSTRACE -D_GNU_SOURCE -DPGXC -D_POSIX_PTHREAD_SEMANTICS -D_REENTRANT -DSTREAMPLAN -D_THREAD_SAFE -DUSE_SPQ ${DB_COMMON_DEFINE}) # Set MAX_ALLOC_SEGNUM size in extreme_rto if(${WAL_SEGSIZE} LESS 256) @@ -225,6 +226,10 @@ if("${ENABLE_LCOV}" STREQUAL "ON") set(TEST_LINK_OPTIONS -lgcov -L${LCOV_LIB_PATH}) endif() +if(${USE_SPQ}) + set(GAUSSDB_CONFIGURE 
"${GAUSSDB_CONFIGURE} -DUSE_SPQ") +endif() + if(${USE_LDAP}) set(HAVE_LIBLDAP 1) set(LIBS "${LIBS} -lldap") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b6070c017..8f8db20bf 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -23,6 +23,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/log_fdw ${CMAKE_CURRENT_SOURCE_DIR}/gc_fdw ${CMAKE_CURRENT_SOURCE_DIR}/ndpplugin + ${CMAKE_CURRENT_SOURCE_DIR}/spq_plugin ) add_subdirectory(hstore) @@ -44,4 +45,6 @@ if("${ENABLE_MULTIPLE_NODES}" STREQUAL "OFF") add_subdirectory(gc_fdw) endif() add_subdirectory(ndpplugin) - +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/spq_plugin) + add_subdirectory(spq_plugin) +endif() diff --git a/contrib/Makefile b/contrib/Makefile index f1f57913c..3ac21ab3d 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -56,7 +56,8 @@ SUBDIRS = \ unaccent \ vacuumlo \ security_plugin \ - ndpplugin + ndpplugin \ + spq_plugin ifeq ($(with_openssl),yes) SUBDIRS += sslinfo diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 4e04f5b01..c55db74a3 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -1238,6 +1238,10 @@ ifeq ($(enable_libnet), yes) override CFLAGS := $(CFLAGS) -DUSE_LIBNET endif +override CPPFLAGS := $(CPPFLAGS) -DUSE_SPQ +ifneq ($(wildcard $(top_builddir)/contrib/spq_plugin/.),) +LIBS += -lxerces-c +endif ########################################################################## # diff --git a/src/common/backend/catalog/index.cpp b/src/common/backend/catalog/index.cpp index 017328aed..ecde047aa 100644 --- a/src/common/backend/catalog/index.cpp +++ b/src/common/backend/catalog/index.cpp @@ -90,11 +90,21 @@ #include "utils/rel.h" #include "pgxc/redistrib.h" +#ifdef USE_SPQ +#include "access/spq_btbuild.h" +#endif + #ifdef ENABLE_MOT #include "foreign/fdwapi.h" #endif - +#ifdef USE_SPQ +void spq_validate_index_heapscan(Relation heapRelation, + Relation indexRelation, + IndexInfo* indexInfo, + Snapshot snapshot, + v_i_state* state); 
+#endif /* non-export function prototypes */ static bool relationHasPrimaryKey(Relation rel); static TupleDesc ConstructTupleDescriptor(Relation heapRelation, IndexInfo* indexInfo, List* indexColNames, @@ -4823,7 +4833,12 @@ void validate_index(Oid heapId, Oid indexId, Snapshot snapshot, bool isPart) /* * Now scan the heap and "merge" it with the index */ - tableam_index_validate_scan(heapRelation, indexRelation, indexInfo, snapshot, &state); +#ifdef USE_SPQ + if (enable_spq_btbuild_cic(indexRelation)) { + spq_validate_index_heapscan(heapRelation, indexRelation, indexInfo, snapshot, &state); + } else +#endif + tableam_index_validate_scan(heapRelation, indexRelation, indexInfo, snapshot, &state); /* Done with tuplesort object */ tuplesort_end(state.tuplesort); @@ -7494,3 +7509,155 @@ void cbi_set_enable_clean(Relation rel) heap_close(pg_class, RowExclusiveLock); } + + +#ifdef USE_SPQ +/* + * spq_validate_index_heapscan - second table scan for concurrent index build + * + * This has much code in common with validate_index_heapscan, scan use SPI module. + * expressions and predicates index not supported. 
+ */ +void spq_validate_index_heapscan( + Relation heapRelation, Relation indexRelation, IndexInfo* indexInfo, Snapshot snapshot, v_i_state* state) +{ + bool in_index[MaxHeapTuplesPerPage]; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + + /* state variables for the merge */ + ItemPointer indexcursor = NULL; + bool tuplesort_empty = false; + + SPIPlanPtr plan; + Portal portal; + + bool old_enable_spq = u_sess->attr.attr_spq.gauss_enable_spq; + bool old_spq_enable_index_scan = u_sess->attr.attr_spq.spq_optimizer_enable_indexscan; + bool old_spq_enable_indexonly_scan = u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan; + + /* + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + StringInfo sql = makeStringInfo(); + + /* generate sql */ + { + StringInfo attrs = makeStringInfo(); /* attrs in SELECT clause */ + TupleDesc tupdes = RelationGetDescr(indexRelation); + int natts = tupdes->natts; + Assert(natts > 0); + Form_pg_attribute lastattr = TupleDescAttr(tupdes, natts-1); + + for (int i = 0; i < natts -1; i++) { + Form_pg_attribute att = TupleDescAttr(tupdes, i); + appendStringInfo(attrs, "%s, ", NameStr(att->attname)); + } + appendStringInfo(attrs, "%s", NameStr(lastattr->attname)); + appendStringInfo(sql, "select ctid %s from %s order by ctid", attrs->data, + RelationGetRelationName(heapRelation)); + } + + u_sess->attr.attr_spq.gauss_enable_spq = true; + u_sess->attr.attr_spq.spq_optimizer_enable_indexscan = false; + u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan = false; + + SPI_connect(); + + if ((plan = SPI_prepare(sql->data, 0, NULL)) == NULL) + ereport(ERROR, + (errcode(ERRCODE_SPI_PREPARE_FAILURE), + errmsg("SPI_prepare(\"%s\") failed: %s", sql->data, SPI_result_code_string(SPI_result)))); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL) + ereport(ERROR, + (errcode(ERRCODE_SPI_CURSOR_OPEN_FAILURE), + errmsg("SPI_cursor_open(\"%s\") failed: %s", sql->data, 
SPI_result_code_string(SPI_result)))); + + u_sess->attr.attr_spq.gauss_enable_spq = old_enable_spq; + u_sess->attr.attr_spq.spq_optimizer_enable_indexscan = old_spq_enable_index_scan; + u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan = old_spq_enable_indexonly_scan; + + SPI_cursor_fetch(portal, true, SPQ_BATCH_SIZE); + while (SPI_processed > 0) { + uint64 i; + for (i = 0; i < SPI_processed; i++) { + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offnum; + Datum values[INDEX_MAX_KEYS + 1]; + bool nulls[INDEX_MAX_KEYS + 1]; + ItemPointer heapcursor; + ItemPointerData rootTuple; + HeapTuple tup = SPI_tuptable->vals[i]; + + heap_deform_tuple(tup, SPI_tuptable->tupdesc, values, nulls); + + /* ctid for current heap tuple */ + heapcursor = (ItemPointer)values[0]; + rootTuple = *heapcursor; + root_blkno = ItemPointerGetBlockNumber(heapcursor); + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + if (HeapTupleIsHeapOnly(tup)) { + root_offnum = root_offsets[root_offnum - 1]; + Assert(OffsetNumberIsValid(root_offnum)); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + CHECK_FOR_INTERRUPTS(); + + /* + * "merge" by skipping through the index tuples until we find or pass + * the current root tuple. + */ + while (!tuplesort_empty && + (!indexcursor || + ItemPointerCompare(indexcursor, &rootTuple) < 0)) { + Datum ts_val; + bool ts_isnull = false; + + if (indexcursor) { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; + pfree(indexcursor); + indexcursor = NULL; + } + + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, + &ts_val, &ts_isnull); + Assert(tuplesort_empty || !ts_isnull); + indexcursor = (ItemPointer)DatumGetPointer(ts_val); + } + + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. 
+ */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) { + (void)index_insert(indexRelation, + values + 1, + nulls + 1, + &rootTuple, + heapRelation, + indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO); + + state->tups_inserted += 1; + } + } + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, SPQ_BATCH_SIZE); + } + + SPI_cursor_close(portal); + SPI_freeplan(plan); + SPI_finish(); + + /* These may have been pointing to the now-gone estate */ + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NIL; +} +#endif diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 2340f2cda..4daab2096 100644 --- a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -183,7 +183,11 @@ static PlannedStmt* _copyPlannedStmt(const PlannedStmt* from) COPY_SCALAR_FIELD(multi_node_hint); COPY_SCALAR_FIELD(uniqueSQLId); COPY_SCALAR_FIELD(cause_type); - +#ifdef USE_SPQ + COPY_SCALAR_FIELD(spq_session_id); + COPY_SCALAR_FIELD(current_id); + COPY_SCALAR_FIELD(is_spq_optmized); +#endif /* * Not copy ng_queryMem to avoid memory leak in CachedPlan context, * and dywlm_client_manager always calls CalculateQueryMemMain to generate it. 
@@ -690,6 +694,27 @@ static SeqScan* _copySeqScan(const SeqScan* from) return newnode; } +#ifdef USE_SPQ +/* + * _copySpqSeqScan + */ +static SpqSeqScan* _copySpqSeqScan(const SpqSeqScan* from) +{ + SpqSeqScan* newnode = makeNode(SpqSeqScan); + + /* + * copy node superclass fields + */ + CopyScanFields((const Scan*)from, &newnode->scan); + + newnode->isFullTableScan = from->isFullTableScan; + newnode->isAdaptiveScan = from->isAdaptiveScan; + newnode->isDirectRead = from->isDirectRead; + + return newnode; +} +#endif + /* * _copyIndexScan */ @@ -1207,6 +1232,10 @@ static void CopyJoinFields(const Join* from, Join* newnode) COPY_SCALAR_FIELD(optimizable); COPY_NODE_FIELD(nulleqqual); COPY_SCALAR_FIELD(skewoptimize); +#ifdef USE_SPQ + COPY_SCALAR_FIELD(prefetch_inner); + COPY_SCALAR_FIELD(is_set_op_join); +#endif } /* @@ -1361,6 +1390,10 @@ static Material* _copyMaterial(const Material* from) CopyPlanFields((const Plan*)from, (Plan*)newnode); COPY_SCALAR_FIELD(materialize_all); CopyMemInfoFields(&from->mem_info, &newnode->mem_info); +#ifdef USE_SPQ + COPY_SCALAR_FIELD(spq_strict); + COPY_SCALAR_FIELD(spq_shield_child_from_rescans); +#endif return newnode; } @@ -1457,6 +1490,9 @@ static Agg* _copyAgg(const Agg* from) CopyPlanFields((const Plan*)from, (Plan*)newnode); COPY_SCALAR_FIELD(aggstrategy); +#ifdef USE_SPQ + COPY_SCALAR_FIELD(aggsplittype); +#endif COPY_SCALAR_FIELD(numCols); if (from->numCols > 0) { COPY_POINTER_FIELD(grpColIdx, from->numCols * sizeof(AttrNumber)); @@ -1846,6 +1882,9 @@ static VecAgg* _copyVecAgg(const VecAgg* from) CopyPlanFields((const Plan*)from, (Plan*)newnode); COPY_SCALAR_FIELD(aggstrategy); +#ifdef USE_SPQ + COPY_SCALAR_FIELD(aggsplittype); +#endif COPY_SCALAR_FIELD(numCols); if (from->numCols > 0) { COPY_POINTER_FIELD(grpColIdx, from->numCols * sizeof(AttrNumber)); @@ -2236,7 +2275,9 @@ static Stream* _copyStream(const Stream* from) COPY_SCALAR_FIELD(stream_level); COPY_NODE_FIELD(origin_consumer_nodes); 
COPY_SCALAR_FIELD(is_recursive_local); - +#ifdef USE_SPQ + COPY_SCALAR_FIELD(streamID); +#endif return newnode; } @@ -2597,6 +2638,9 @@ static Aggref* _copyAggref(const Aggref* from) COPY_SCALAR_FIELD(agghas_collectfn); COPY_SCALAR_FIELD(aggstage); #endif /* PGXC */ +#ifdef USE_SPQ + COPY_SCALAR_FIELD(aggsplittype); +#endif COPY_SCALAR_FIELD(aggcollid); COPY_SCALAR_FIELD(inputcollid); COPY_NODE_FIELD(aggdirectargs); @@ -4834,6 +4878,11 @@ static Query* _copyQuery(const Query* from) COPY_SCALAR_FIELD(isReplace); } COPY_NODE_FIELD(indexhintList); +#ifdef USE_SPQ + if (t_thrd.proc->workingVersionNum >= SPQ_VERSION_NUM) { + COPY_SCALAR_FIELD(is_support_spq); + } +#endif newnode->rightRefState = CopyRightRefState(from->rightRefState); @@ -7399,7 +7448,103 @@ static AutoIncrement *_copyAutoIncrement(const AutoIncrement *from) COPY_SCALAR_FIELD(autoincout_funcid); return newnode; } +#ifdef USE_SPQ +static Motion* _copyMotion(const Motion *from) +{ + Motion *newnode = makeNode(Motion); + + /* + * copy node superclass fields + */ + CopyPlanFields((Plan *) from, (Plan *) newnode); + + COPY_SCALAR_FIELD(sendSorted); + COPY_SCALAR_FIELD(motionID); + + COPY_SCALAR_FIELD(motionType); + + COPY_NODE_FIELD(hashExprs); + if (from->hashExprs) { + COPY_POINTER_FIELD(hashFuncs, list_length(from->hashExprs) * sizeof(Oid)); + } + + COPY_SCALAR_FIELD(numSortCols); + if (from->numSortCols > 0) { + COPY_POINTER_FIELD(sortColIdx, from->numSortCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numSortCols * sizeof(Oid)); + COPY_POINTER_FIELD(collations, from->numSortCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numSortCols * sizeof(bool)); + } + + COPY_SCALAR_FIELD(segidColIdx); + COPY_SCALAR_FIELD(numHashSegments); + + return newnode; +} +static Result *_copyResult(const Result *from) +{ + Result *newnode = makeNode(Result); + + /* + * copy node superclass fields + */ + CopyPlanFields((const Plan *) from, (Plan *) newnode); + + /* + * copy remainder of node + 
*/ + COPY_NODE_FIELD(resconstantqual); + + COPY_SCALAR_FIELD(numHashFilterCols); + if (from->numHashFilterCols > 0) { + COPY_POINTER_FIELD(hashFilterColIdx, from->numHashFilterCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(hashFilterFuncs, from->numHashFilterCols * sizeof(Oid)); + } + + return newnode; +} +/* + * _copyAssertOp + */ +static AssertOp *_copyAssertOp(const AssertOp *from) +{ + AssertOp *newnode = makeNode(AssertOp); + /* + * copy node superclass fields + */ + CopyPlanFields((Plan *) from, (Plan *) newnode); + + COPY_SCALAR_FIELD(errcode); + COPY_NODE_FIELD(errmessage); + + return newnode; +} + +static ShareInputScan *_copyShareInputScan(const ShareInputScan *from) +{ + ShareInputScan *newnode = makeNode(ShareInputScan); + + /* copy node superclass fields */ + CopyPlanFields((Plan *) from, (Plan *) newnode); + COPY_SCALAR_FIELD(cross_slice); + COPY_SCALAR_FIELD(share_id); + COPY_SCALAR_FIELD(producer_slice_id); + COPY_SCALAR_FIELD(this_slice_id); + COPY_SCALAR_FIELD(nconsumers); + + return newnode; +} + +static Sequence *_copySequence(const Sequence *from) +{ + Sequence *newnode = makeNode(Sequence); + CopyPlanFields((Plan *) from, (Plan *) newnode); + COPY_NODE_FIELD(subplans); + + return newnode; +} +#endif static CondInfo *_copyCondInfo(const CondInfo *from) { CondInfo* newnode = makeNode(CondInfo); @@ -7567,6 +7712,23 @@ void* copyObject(const void* from) case T_SeqScan: retval = _copySeqScan((SeqScan*)from); break; +#ifdef USE_SPQ + case T_SpqSeqScan: + retval = _copySpqSeqScan((SpqSeqScan*)from); + break; + case T_AssertOp: + retval = _copyAssertOp((AssertOp*)from); + break; + case T_ShareInputScan: + retval = _copyShareInputScan((ShareInputScan*)from); + break; + case T_Sequence: + retval = _copySequence((Sequence*)from); + break; + case T_Result: + retval = _copyResult((Result*)from); + break; +#endif case T_IndexScan: retval = _copyIndexScan((IndexScan*)from); break; @@ -8845,6 +9007,11 @@ void* copyObject(const void* from) case 
T_CharsetClause: retval = _copyCharsetClause((CharsetClause *)from); break; +#ifdef USE_SPQ + case T_Motion: + retval = _copyMotion((Motion*)from); + break; +#endif default: ereport(ERROR, (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), errmsg("copyObject: unrecognized node type: %d", (int)nodeTag(from)))); diff --git a/src/common/backend/nodes/equalfuncs.cpp b/src/common/backend/nodes/equalfuncs.cpp index d4da22f7f..b34481f7a 100644 --- a/src/common/backend/nodes/equalfuncs.cpp +++ b/src/common/backend/nodes/equalfuncs.cpp @@ -223,6 +223,9 @@ static bool _equalAggref(const Aggref* a, const Aggref* b) COMPARE_SCALAR_FIELD(agghas_collectfn); COMPARE_SCALAR_FIELD(aggstage); #endif /* PGXC */ +#ifdef USE_SPQ + COMPARE_SCALAR_FIELD(aggsplittype); +#endif COMPARE_SCALAR_FIELD(aggcollid); COMPARE_SCALAR_FIELD(inputcollid); COMPARE_NODE_FIELD(aggdirectargs); diff --git a/src/common/backend/nodes/nodes.cpp b/src/common/backend/nodes/nodes.cpp index 7cc67c77a..0603bfd82 100755 --- a/src/common/backend/nodes/nodes.cpp +++ b/src/common/backend/nodes/nodes.cpp @@ -47,6 +47,12 @@ static const TagStr g_tagStrArr[] = {{T_Invalid, "Invalid"}, {T_BitmapOr, "BitmapOr"}, {T_Scan, "Scan"}, {T_SeqScan, "SeqScan"}, +#ifdef USE_SPQ + {T_SpqSeqScan, "SpqSeqScan"}, + {T_AssertOp, "AssertOp"}, + {T_ShareInputScan, "ShareInputScan"}, + {T_Sequence, "Sequence"}, +#endif {T_IndexScan, "IndexScan"}, {T_IndexOnlyScan, "IndexOnlyScan"}, {T_BitmapIndexScan, "BitmapIndexScan"}, @@ -115,6 +121,12 @@ static const TagStr g_tagStrArr[] = {{T_Invalid, "Invalid"}, {T_BitmapOrState, "BitmapOrState"}, {T_ScanState, "ScanState"}, {T_SeqScanState, "SeqScanState"}, +#ifdef USE_SPQ + {T_SpqSeqScanState, "SpqSeqScanState"}, + {T_AssertOpState, "AssertOpState"}, + {T_ShareInputScanState, "ShareInputScanState"}, + {T_SequenceState, "SequenceState"}, +#endif {T_IndexScanState, "IndexScanState"}, {T_IndexOnlyScanState, "IndexOnlyScanState"}, {T_BitmapIndexScanState, "BitmapIndexScanState"}, diff --git 
a/src/common/backend/nodes/outfuncs.cpp b/src/common/backend/nodes/outfuncs.cpp index 3e4f8ea50..568599c6a 100755 --- a/src/common/backend/nodes/outfuncs.cpp +++ b/src/common/backend/nodes/outfuncs.cpp @@ -578,7 +578,7 @@ static void _outPlannedStmt(StringInfo str, PlannedStmt* node) WRITE_INT_FIELD(gather_count); WRITE_INT_FIELD(num_nodes); - if (t_thrd.proc->workingVersionNum < 92097 || node->num_streams > 0) { + if (t_thrd.proc->workingVersionNum < 92097 || node->num_streams > 0 || IS_SPQ_RUNNING) { for (int i = 0; i < node->num_nodes; i++) { /* Write the field name only one time and just append the value of each field */ appendStringInfo(str, " :nodesDefinition[%d]", i); @@ -641,6 +641,11 @@ static void _outPlannedStmt(StringInfo str, PlannedStmt* node) if (t_thrd.proc->workingVersionNum >= SLOW_SQL_VERSION_NUM) { WRITE_UINT_FIELD(cause_type); } +#ifdef USE_SPQ + WRITE_UINT64_FIELD(spq_session_id); + WRITE_INT_FIELD(current_id); + WRITE_BOOL_FIELD(is_spq_optmized); +#endif } /* @@ -784,6 +789,10 @@ static void _outJoinPlanInfo(StringInfo str, Join* node) WRITE_BOOL_FIELD(optimizable); WRITE_NODE_FIELD(nulleqqual); WRITE_UINT_FIELD(skewoptimize); +#ifdef USE_SPQ + WRITE_BOOL_FIELD(prefetch_inner); + WRITE_BOOL_FIELD(is_set_op_join); +#endif } static void _outPlan(StringInfo str, Plan* node) @@ -1075,6 +1084,45 @@ static void _outSeqScan(StringInfo str, SeqScan* node) _outScanInfo(str, (Scan*)node); } +#ifdef USE_SPQ +static void _outSpqSeqScan(StringInfo str, SpqSeqScan* node) +{ + WRITE_NODE_TYPE("SPQSEQSCAN"); + + _outScanInfo(str, (Scan*)node); + WRITE_BOOL_FIELD(isFullTableScan); + WRITE_BOOL_FIELD(isAdaptiveScan); + WRITE_BOOL_FIELD(isDirectRead); +} + +static void _outAssertOp(StringInfo str, const AssertOp *node) +{ + WRITE_NODE_TYPE("ASSERTOP"); + _outPlanInfo(str, (Plan *) node); + WRITE_INT_FIELD(errcode); + WRITE_NODE_FIELD(errmessage); +} + +static void _outShareInputScan(StringInfo str, const ShareInputScan *node) +{ + 
WRITE_NODE_TYPE("SHAREINPUTSCAN"); + + WRITE_BOOL_FIELD(cross_slice); + WRITE_INT_FIELD(share_id); + WRITE_INT_FIELD(producer_slice_id); + WRITE_INT_FIELD(this_slice_id); + WRITE_INT_FIELD(nconsumers); + + _outPlanInfo(str, (Plan *) node); +} + +static void _outSequence(StringInfo str, const Sequence *node) +{ + WRITE_NODE_TYPE("SEQUENCE"); + _outPlanInfo(str, (Plan *)node); + WRITE_NODE_FIELD(subplans); +} +#endif template static void _outCommonIndexScanPart(StringInfo str, T* node) @@ -1143,6 +1191,9 @@ static void _outStream(StringInfo str, Stream* node) WRITE_INT_FIELD(stream_level); WRITE_NODE_FIELD(origin_consumer_nodes); WRITE_BOOL_FIELD(is_recursive_local); +#ifdef USE_SPQ + WRITE_INT_FIELD(streamID); +#endif } /* @@ -1673,7 +1724,7 @@ static void _outHashJoin(StringInfo str, HashJoin* node) WRITE_BOOL_FIELD(isSonicHash); out_mem_info(str, &node->mem_info); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { WRITE_NODE_FIELD(hash_collations); } #endif @@ -1702,6 +1753,9 @@ static void _outVecHashAgg(StringInfo str, VecAgg* node) _outPlanInfo(str, (Plan*)node); WRITE_ENUM_FIELD(aggstrategy, AggStrategy); +#ifdef USE_SPQ + WRITE_ENUM_FIELD(aggsplittype, AggSplit); +#endif WRITE_INT_FIELD(numCols); appendStringInfo(str, " :grpColIdx"); @@ -1736,6 +1790,9 @@ static void _outAgg(StringInfo str, Agg* node) _outPlanInfo(str, (Plan*)node); WRITE_ENUM_FIELD(aggstrategy, AggStrategy); +#ifdef USE_SPQ + WRITE_ENUM_FIELD(aggsplittype, AggSplit); +#endif WRITE_INT_FIELD(numCols); appendStringInfo(str, " :grpColIdx"); @@ -1745,7 +1802,7 @@ static void _outAgg(StringInfo str, Agg* node) WRITE_GRPOP_FIELD(grpOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { 
WRITE_GRPOP_FIELD(grp_collations, numCols); } #endif @@ -1824,7 +1881,7 @@ static void _outGroup(StringInfo str, Group* node) WRITE_GRPOP_FIELD(grpOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { WRITE_GRPOP_FIELD(grp_collations, numCols); } #endif @@ -1858,6 +1915,10 @@ static void _outMaterial(StringInfo str, Material* node) _outPlanInfo(str, (Plan*)node); WRITE_BOOL_FIELD(materialize_all); out_mem_info(str, &node->mem_info); +#ifdef USE_SPQ + WRITE_BOOL_FIELD(spq_strict); + WRITE_BOOL_FIELD(spq_shield_child_from_rescans); +#endif } static void _outSimpleSort(StringInfo str, SimpleSort* node) @@ -1995,7 +2056,7 @@ static void _outUnique(StringInfo str, Unique* node) WRITE_GRPOP_FIELD(uniqOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { WRITE_GRPOP_FIELD(uniq_collations, numCols); } #endif @@ -2054,7 +2115,7 @@ static void _outSetOp(StringInfo str, SetOp* node) WRITE_GRPOP_FIELD(dupOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { WRITE_GRPOP_FIELD(dup_collations, numCols); } #endif @@ -2408,6 +2469,9 @@ static void _outAggref(StringInfo str, Aggref* node) WRITE_BOOL_FIELD(agghas_collectfn); WRITE_INT_FIELD(aggstage); #endif /* PGXC */ +#ifdef USE_SPQ + WRITE_ENUM_FIELD(aggsplittype, AggSplit); +#endif WRITE_OID_FIELD(aggcollid); WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(aggdirectargs); @@ -3474,7 +3538,9 @@ static void _outRelOptInfo(StringInfo str, RelOptInfo* node) WRITE_BITMAPSET_FIELD(lateral_relids); WRITE_NODE_FIELD(indexlist); #ifndef ENABLE_MULTIPLE_NODES - 
WRITE_NODE_FIELD(statlist); + if (!IS_SPQ_RUNNING) { + WRITE_NODE_FIELD(statlist); + } #endif WRITE_FLOAT_FIELD(pages, "%.0f"); WRITE_FLOAT_FIELD(tuples, "%.0f"); @@ -4770,6 +4836,11 @@ static void _outQuery(StringInfo str, Query* node) if (t_thrd.proc->workingVersionNum >= INDEX_HINT_VERSION_NUM) { WRITE_NODE_FIELD(indexhintList); } +#ifdef USE_SPQ + if (t_thrd.proc->workingVersionNum >= SPQ_VERSION_NUM) { + WRITE_BOOL_FIELD(is_support_spq); + } +#endif } static void _outWithCheckOption(StringInfo str, const WithCheckOption* node) @@ -6102,6 +6173,9 @@ static void _outNode(StringInfo str, const void* obj) case T_Plan: _outPlan(str, (Plan*)obj); break; +#ifdef USE_SPQ + case T_Result: +#endif case T_BaseResult: _outResult(str, (BaseResult*)obj); break; @@ -6144,6 +6218,20 @@ static void _outNode(StringInfo str, const void* obj) case T_SeqScan: _outSeqScan(str, (SeqScan*)obj); break; +#ifdef USE_SPQ + case T_SpqSeqScan: + _outSpqSeqScan(str, (SpqSeqScan*)obj); + break; + case T_AssertOp: + _outAssertOp(str, (AssertOp*)obj); + break; + case T_ShareInputScan: + _outShareInputScan(str, (ShareInputScan*)obj); + break; + case T_Sequence: + _outSequence(str, (Sequence*)obj); + break; +#endif #ifdef PGXC case T_RemoteQuery: _outRemoteQuery(str, (RemoteQuery*)obj); diff --git a/src/common/backend/nodes/readfuncs.cpp b/src/common/backend/nodes/readfuncs.cpp index 6b846ca50..f3842a92f 100755 --- a/src/common/backend/nodes/readfuncs.cpp +++ b/src/common/backend/nodes/readfuncs.cpp @@ -736,6 +736,12 @@ THR_LOCAL bool skip_read_extern_fields = false; READ_DONE(); \ } while (0) + +#ifdef USE_SPQ +#define READ_STREAM_ID() READ_INT_FIELD(streamID) +#else +#define READ_STREAM_ID() {} +#endif /* * function for _readStream and _readVecStream. 
*/ @@ -761,6 +767,7 @@ THR_LOCAL bool skip_read_extern_fields = false; READ_INT_FIELD(stream_level); \ READ_NODE_FIELD(origin_consumer_nodes); \ READ_BOOL_FIELD(is_recursive_local); \ + READ_STREAM_ID(); \ \ READ_DONE(); \ } while (0) @@ -1648,6 +1655,13 @@ static Query* _readQuery(void) IF_EXIST(indexhintList) { READ_NODE_FIELD(indexhintList); } +#ifdef USE_SPQ + if (t_thrd.proc->workingVersionNum >= SPQ_VERSION_NUM) { + IF_EXIST(is_support_spq) { + READ_BOOL_FIELD(is_support_spq); + } + } +#endif READ_DONE(); } @@ -2219,6 +2233,9 @@ static Aggref* _readAggref(void) READ_BOOL_FIELD(agghas_collectfn); READ_INT_FIELD(aggstage); #endif /* PGXC */ +#ifdef USE_SPQ + READ_ENUM_FIELD(aggsplittype, AggSplit); +#endif READ_OID_FIELD(aggcollid); READ_OID_FIELD(inputcollid); IF_EXIST(aggdirectargs) @@ -3491,11 +3508,14 @@ static Agg* _readAgg(Agg* local_node) _readPlan(&local_node->plan); READ_ENUM_FIELD(aggstrategy, AggStrategy); +#ifdef USE_SPQ + READ_ENUM_FIELD(aggsplittype, AggSplit); +#endif READ_INT_FIELD(numCols); READ_ATTR_ARRAY(grpColIdx, numCols); READ_OPERATOROID_ARRAY(grpOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { READ_OPERATOROID_ARRAY(grp_collations, numCols); } #endif @@ -4081,7 +4101,7 @@ static Unique* _readUnique(Unique* local_node) READ_ATTR_ARRAY(uniqColIdx, numCols); READ_OPERATOROID_ARRAY(uniqOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { READ_OPERATOROID_ARRAY(uniq_collations, numCols); } #endif @@ -4397,6 +4417,10 @@ static Material* _readMaterial(Material* local_node) _readPlan(&local_node->plan); READ_BOOL_FIELD(materialize_all); read_mem_info(&local_node->mem_info); +#ifdef USE_SPQ + READ_BOOL_FIELD(spq_strict); + 
READ_BOOL_FIELD(spq_shield_child_from_rescans); +#endif READ_DONE(); } @@ -4459,7 +4483,7 @@ static Group* _readGroup(Group* local_node) READ_ATTR_ARRAY(grpColIdx, numCols); READ_OPERATOROID_ARRAY(grpOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { READ_OPERATOROID_ARRAY(grp_collations, numCols); } #endif @@ -4483,6 +4507,10 @@ static Join* _readJoin(Join* local_node) READ_BOOL_FIELD(optimizable); READ_NODE_FIELD(nulleqqual); READ_UINT_FIELD(skewoptimize); +#ifdef USE_SPQ + READ_BOOL_FIELD(prefetch_inner); + READ_BOOL_FIELD(is_set_op_join); +#endif READ_DONE(); } @@ -4521,7 +4549,7 @@ static HashJoin* _readHashJoin(HashJoin* local_node) read_mem_info(&local_node->mem_info); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { READ_NODE_FIELD(hash_collations); } #endif @@ -4619,7 +4647,7 @@ static PlannedStmt* _readPlannedStmt(void) } READ_INT_FIELD(num_nodes); - if (t_thrd.proc->workingVersionNum < 92097 || local_node->num_streams > 0) { + if (t_thrd.proc->workingVersionNum < 92097 || local_node->num_streams > 0 || IS_SPQ_RUNNING) { local_node->nodesDefinition = (NodeDefinition*)palloc0(sizeof(NodeDefinition) * local_node->num_nodes); for (int i = 0; i < local_node->num_nodes; i++) { READ_OID_FIELD(nodesDefinition[i].nodeoid); @@ -4680,6 +4708,11 @@ static PlannedStmt* _readPlannedStmt(void) IF_EXIST(cause_type) { READ_UINT_FIELD(cause_type); } +#ifdef USE_SPQ + READ_UINT64_FIELD(spq_session_id); + READ_INT_FIELD(current_id); + READ_BOOL_FIELD(is_spq_optmized); +#endif READ_DONE(); } @@ -4727,6 +4760,53 @@ static Scan* _readSeqScan(void) READ_END(); } +#ifdef USE_SPQ +static SpqSeqScan* _readSpqSeqScan(void) +{ + READ_LOCALS_NO_FIELDS(SpqSeqScan); + READ_TEMP_LOCALS(); + + 
_readScan(&local_node->scan); + READ_BOOL_FIELD(isFullTableScan); + READ_BOOL_FIELD(isAdaptiveScan); + READ_BOOL_FIELD(isDirectRead); + + READ_END(); +} +/* + * _readAssertOp + */ +static AssertOp* _readAssertOp(void) +{ + READ_LOCALS(AssertOp); + _readPlan(&local_node->plan); + READ_INT_FIELD(errcode); + READ_NODE_FIELD(errmessage); + READ_END(); +} + +static ShareInputScan* _readShareInputScan(void) +{ + READ_LOCALS(ShareInputScan); + READ_BOOL_FIELD(cross_slice); + READ_INT_FIELD(share_id); + READ_INT_FIELD(producer_slice_id); + READ_INT_FIELD(this_slice_id); + READ_INT_FIELD(nconsumers); + _readPlan(&local_node->scan.plan); + + READ_END(); +} + +static Sequence* _readSequence(void) +{ + READ_LOCALS(Sequence); + _readPlan(&local_node->plan); + READ_NODE_FIELD(subplans); + + READ_END(); +} +#endif static SetOp* _readSetOp(SetOp* local_node) { @@ -4742,7 +4822,7 @@ static SetOp* _readSetOp(SetOp* local_node) READ_ATTR_ARRAY(dupColIdx, numCols); READ_OPERATOROID_ARRAY(dupOperators, numCols); #ifndef ENABLE_MULTIPLE_NODES - if (t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { + if (!IS_SPQ_RUNNING && t_thrd.proc->workingVersionNum >= CHARACTER_SET_VERSION_NUM) { READ_OPERATOROID_ARRAY(dup_collations, numCols); } #endif @@ -5214,6 +5294,9 @@ static VecAgg* _readVecAgg(VecAgg* local_node) _readPlan(&local_node->plan); READ_ENUM_FIELD(aggstrategy, AggStrategy); +#ifdef USE_SPQ + READ_ENUM_FIELD(aggsplittype, AggSplit); +#endif READ_INT_FIELD(numCols); READ_ATTR_ARRAY(grpColIdx, numCols); READ_OID_ARRAY(grpOperators, numCols); @@ -6363,6 +6446,16 @@ Node* parseNodeString(void) return_value = _readNestLoop(); } else if (MATCH("SEQSCAN", 7)) { return_value = _readSeqScan(); +#ifdef USE_SPQ + } else if (MATCH("SPQSEQSCAN", 10)) { + return_value = _readSpqSeqScan(); + } else if (MATCH("ASSERTOP", 8)) { + return_value = _readAssertOp(); + } else if (MATCH("SHAREINPUTSCAN", 14)) { + return_value = _readShareInputScan(); + } else if (MATCH("SEQUENCE", 8)) { + 
return_value = _readSequence(); +#endif } else if (MATCH("BITMAPHEAPSCAN", 14)) { return_value = _readBitmapHeapScan(NULL); } else if (MATCH("BITMAPINDEXSCAN", 15)) { diff --git a/src/common/backend/parser/analyze.cpp b/src/common/backend/parser/analyze.cpp index 58bf5899b..86ad0a18e 100644 --- a/src/common/backend/parser/analyze.cpp +++ b/src/common/backend/parser/analyze.cpp @@ -2776,6 +2776,9 @@ static Query* transformSelectStmt(ParseState* pstate, SelectStmt* stmt, bool isF ListCell* l = NULL; qry->commandType = CMD_SELECT; +#ifdef USE_SPQ + qry->is_support_spq = true; +#endif if (stmt->startWithClause != NULL) { pstate->p_addStartInfo = true; diff --git a/src/common/backend/parser/parse_target.cpp b/src/common/backend/parser/parse_target.cpp index 4371d5244..adc23c112 100644 --- a/src/common/backend/parser/parse_target.cpp +++ b/src/common/backend/parser/parse_target.cpp @@ -1633,6 +1633,9 @@ static int FigureColnameInternal(Node* node, char** name) case ANY_SUBLINK: case ROWCOMPARE_SUBLINK: case CTE_SUBLINK: +#ifdef USE_SPQ + case NOT_EXISTS_SUBLINK: +#endif break; } break; diff --git a/src/common/backend/pgxc_single/pool/execRemote.cpp b/src/common/backend/pgxc_single/pool/execRemote.cpp index 7037b5727..45732b90e 100755 --- a/src/common/backend/pgxc_single/pool/execRemote.cpp +++ b/src/common/backend/pgxc_single/pool/execRemote.cpp @@ -100,6 +100,9 @@ #include "utils/elog.h" #include "utils/globalplancore.h" #include "executor/node/nodeModifyTable.h" +#ifdef USE_SPQ +#include "libpq/libpq-int.h" +#endif #ifndef MIN #define MIN(A, B) (((B) < (A)) ? 
(B) : (A)) @@ -238,6 +241,1045 @@ void setSocketError(const char* msg, const char* node_name) #define CONN_SCTP_ERR_6 "1049 Stream closed by remote" #define CONN_SCTP_ERR_7 "1059 Wait poll unknow error" +#ifdef USE_SPQ + +void CopyDataRowTupleToSlot(RemoteQueryState* combiner, TupleTableSlot* slot); +static void HandleCopyOutComplete(RemoteQueryState* combiner); +static void HandleCommandComplete( + RemoteQueryState* combiner, const char* msg_body, size_t len, PGXCNodeHandle* conn, bool isdummy); +static bool HandleRowDescription(RemoteQueryState* combiner, char* msg_body); +static void HandleDataRow( + RemoteQueryState* combiner, char* msg_body, size_t len, Oid nodeoid, const char* remoteNodeName); +static void HandleAnalyzeTotalRow(RemoteQueryState* combiner, const char* msg_body, size_t len); +static void HandleCopyIn(RemoteQueryState* combiner); +static void HandleCopyOut(RemoteQueryState* combiner); +static void HandleCopyDataRow(RemoteQueryState* combiner, char* msg_body, size_t len); +static void HandleError(RemoteQueryState* combiner, char* msg_body, size_t len); +static void HandleNotice(RemoteQueryState* combiner, char* msg_body, size_t len); +static void HandleDatanodeCommandId(RemoteQueryState* combiner, const char* msg_body, size_t len); +static TupleTableSlot* ExecSPQRemoteQuery(PlanState* state); + +inline static uint64 GetSPQQueryidFromRemoteQuery(RemoteQueryState* node) +{ + if (node->ss.ps.state != NULL && node->ss.ps.state->es_plannedstmt != NULL) { + return node->ss.ps.state->es_plannedstmt->queryId; + } else { + return 0; + } +} + +static TupleTableSlot* ExecSPQRemoteQuery(PlanState* state) +{ + RemoteQueryState* node = castNode(RemoteQueryState, state); + return ExecScan(&(node->ss), (ExecScanAccessMtd)RemoteQueryNext, (ExecScanRecheckMtd)RemoteQueryRecheck); +} +int getStreamSocketError(const char* str) +{ + if (pg_strncasecmp(str, CONN_SCTP_ERR_1, strlen(CONN_SCTP_ERR_1)) == 0) + return ERRCODE_SCTP_MEMORY_ALLOC; + else if 
(pg_strncasecmp(str, CONN_SCTP_ERR_2, strlen(CONN_SCTP_ERR_2)) == 0) + return ERRCODE_SCTP_NO_DATA_IN_BUFFER; + else if (pg_strncasecmp(str, CONN_SCTP_ERR_3, strlen(CONN_SCTP_ERR_3)) == 0) + return ERRCODE_SCTP_RELEASE_MEMORY_CLOSE; + else if (pg_strncasecmp(str, CONN_SCTP_ERR_4, strlen(CONN_SCTP_ERR_4)) == 0) + return ERRCODE_SCTP_TCP_DISCONNECT; + else if (pg_strncasecmp(str, CONN_SCTP_ERR_5, strlen(CONN_SCTP_ERR_5)) == 0) + return ERRCODE_SCTP_DISCONNECT; + else if (pg_strncasecmp(str, CONN_SCTP_ERR_6, strlen(CONN_SCTP_ERR_6)) == 0) + return ERRCODE_SCTP_REMOTE_CLOSE; + else if (pg_strncasecmp(str, CONN_SCTP_ERR_7, strlen(CONN_SCTP_ERR_7)) == 0) + return ERRCODE_SCTP_WAIT_POLL_UNKNOW; + else + return ERRCODE_CONNECTION_FAILURE; +} + +char* getSocketError(int* error_code) +{ + Assert(error_code != NULL); + + /* Set error code by checking socket error message. */ + if (pg_strncasecmp(t_thrd.pgxc_cxt.socket_buffer, CONN_RESET_BY_PEER, strlen(CONN_RESET_BY_PEER)) == 0) { + *error_code = IS_PGXC_DATANODE ? ERRCODE_STREAM_CONNECTION_RESET_BY_PEER : ERRCODE_CONNECTION_RESET_BY_PEER; + } else if (pg_strncasecmp(t_thrd.pgxc_cxt.socket_buffer, CONN_TIMED_OUT, strlen(CONN_TIMED_OUT)) == 0) { + *error_code = ERRCODE_CONNECTION_TIMED_OUT; + } else if (pg_strncasecmp(t_thrd.pgxc_cxt.socket_buffer, CONN_REMOTE_CLOSE, strlen(CONN_REMOTE_CLOSE)) == 0) { + *error_code = IS_PGXC_DATANODE ? 
ERRCODE_STREAM_REMOTE_CLOSE_SOCKET : ERRCODE_CONNECTION_FAILURE; + } else { + *error_code = ERRCODE_CONNECTION_FAILURE; + } + + return t_thrd.pgxc_cxt.socket_buffer; +} +bool SpqFetchTuple(RemoteQueryState* combiner, TupleTableSlot* slot, ParallelFunctionState* parallelfunctionstate) +{ + bool have_tuple = false; + + /* If we have message in the buffer, consume it */ + if (combiner->currentRow.msg) { + CopyDataRowTupleToSlot(combiner, slot); + have_tuple = true; + } + + /* + * If this is ordered fetch we can not know what is the node + * to handle next, so sorter will choose next itself and set it as + * currentRow to have it consumed on the next call to FetchTuple. + * Otherwise allow to prefetch next tuple. + */ + if (((RemoteQuery*)combiner->ss.ps.plan) != NULL && ((RemoteQuery*)combiner->ss.ps.plan)->sort) { + return have_tuple; + } + /* + * If we are fetching no sorted results we can not have both + * currentRow and buffered rows. When connection is buffered currentRow + * is moved to buffer, and then it is cleaned after buffering is completed. + * Afterwards rows will be taken from the buffer bypassing + * currentRow until buffer is empty, and only after that data are read + * from a connection. The message should be allocated in the same memory context as + * that of the slot. We are not sure of that in the call to + * ExecStoreDataRowTuple below. If one fixes this memory issue, please + * consider using CopyDataRowTupleToSlot() for the same. 
+ */ + if (RowStoreLen(combiner->row_store) > 0) { + RemoteDataRowData dataRow; + RowStoreFetch(combiner->row_store, &dataRow); + NetWorkTimeDeserializeStart(t_thrd.pgxc_cxt.GlobalNetInstr); + ExecStoreDataRowTuple(dataRow.msg, dataRow.msglen, dataRow.msgnode, slot, true); + NetWorkTimeDeserializeEnd(t_thrd.pgxc_cxt.GlobalNetInstr); + return true; + } + + while (combiner->conn_count > 0) { + int res; + PGXCNodeHandle* conn = combiner->connections[combiner->current_conn]; + + /* Going to use a connection, buffer it if needed */ + if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != NULL && conn->combiner != combiner) { + BufferConnection(conn); + } + + /* + * If current connection is idle it means portal on the Datanode is suspended. + * If we have a tuple do not hurry to request more rows, + * leave connection clean for other RemoteQueries. + * If we do not have, request more and try to get it. + */ + if (conn->state == DN_CONNECTION_STATE_IDLE) { + /* + * Keep connection clean. 
+ */ + if (have_tuple) { + return true; + } else { + if (pgxc_node_send_execute(conn, combiner->cursor, 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to fetch from %s[%u]", conn->remoteNodeName, conn->nodeoid))); + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to fetch from %s[%u]", conn->remoteNodeName, conn->nodeoid))); + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to fetch from %s[%u]", conn->remoteNodeName, conn->nodeoid))); + conn->combiner = combiner; + } + } + + /* read messages */ + res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) { + /* incomplete message, read more */ + if (pgxc_node_receive(1, &conn, NULL)) { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to fetch from Datanode %u", conn->nodeoid))); + } + continue; + } else if (res == RESPONSE_COMPLETE) { + /* Make last connection current */ + combiner->conn_count = combiner->conn_count - 1; + if (combiner->current_conn >= combiner->conn_count) { + combiner->current_conn = 0; + } else { + combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count]; + } + } else if (res == RESPONSE_SUSPENDED) { + /* Make next connection current */ + combiner->current_conn = combiner->conn_count + 1; + if (combiner->current_conn >= combiner->conn_count) { + combiner->current_conn = 0; + } + } else if (res == RESPONSE_DATAROW && have_tuple) { + /* We already have a tuple and received another one, leave it tillnext fetch. 
*/ + return true; + } + + if (combiner->currentRow.msg) { + CopyDataRowTupleToSlot(combiner, slot); + have_tuple = true; + } + } + + /* report end of data to the caller */ + if (!have_tuple) { + ExecClearTuple(slot); + } + + return have_tuple; +} +static void +HandleDatanodeGxid(PGXCNodeHandle* conn, const char* msg_body, size_t len) +{ + uint32 n32; + TransactionId gxid = InvalidTransactionId; + + Assert(msg_body != NULL); + Assert(len >= 2); + + /* Get High half part */ + errno_t rc = 0; + rc = memcpy_s(&n32, sizeof(uint32), &msg_body[0], sizeof(uint32)); + securec_check(rc, "\0", "\0"); + gxid += ((uint64)ntohl(n32)) << 32; + /* Get low half part */ + rc = memcpy_s(&n32, sizeof(uint32), &msg_body[0] + sizeof(uint32), sizeof(uint32)); + securec_check(rc, "\0", "\0"); + gxid += ntohl(n32); + conn->remote_top_txid = gxid; +} + +static void HandleLocalCsnMin(PGXCNodeHandle* conn, const char* msg_body, size_t len) +{ + Assert(msg_body != NULL); + Assert(len >= 2); + + uint32 n32; + errno_t rc; + CommitSeqNo csn_min = 0; + /* Get High half part */ + rc = memcpy_s(&n32, sizeof(uint32), &msg_body[0], sizeof(uint32)); + securec_check(rc, "", ""); + csn_min += ((uint64)ntohl(n32)) << 32; + /* Get low half part */ + rc = memcpy_s(&n32, sizeof(uint32), &msg_body[0] + sizeof(uint32), sizeof(uint32)); + securec_check(rc, "", ""); + csn_min += ntohl(n32); + if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) { + ereport(LOG, (errmodule(MOD_TRANS_SNAPSHOT), + errmsg("[CsnMinSync] local csn min : %lu from node %u", csn_min, conn->nodeoid))); + } + if (csn_min < t_thrd.xact_cxt.ShmemVariableCache->local_csn_min) { + t_thrd.xact_cxt.ShmemVariableCache->local_csn_min = csn_min; + } +} + +static void HandleMaxCSN(RemoteQueryState* combiner, const char* msg, int msg_len) +{ + Assert(msg_len == sizeof(int64) + sizeof(int8)); + combiner->maxCSN = ntohl64(*((CommitSeqNo *)msg)); + combiner->hadrMainStandby = *(bool*)(msg + sizeof(int64)); +} + +int spq_handle_response(PGXCNodeHandle* conn, 
RemoteQueryState* combiner, bool isdummy) +{ + char* msg = NULL; + int msg_len; + char msg_type; + bool suspended = false; + bool error_flag = false; + int node_idx; + int32 cur_smp_id; + + node_idx = conn->nodeIdx; + + for (;;) { + Assert(conn->state != DN_CONNECTION_STATE_IDLE); + + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + * + * If not in GPC mode, should receive datanode messages but not interrupt immediately in loop while. + */ + if (t_thrd.proc_cxt.proc_exit_inprogress && ENABLE_CN_GPC) { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + ereport(DEBUG2, + (errmsg("DN_CONNECTION_STATE_ERROR_FATAL0 is set for connection to node %s[%u] when proc_exit_inprogress", + conn->remoteNodeName, conn->nodeoid))); + } + + /* don't read from from the connection if there is a fatal error */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) { + ereport(DEBUG2, + (errmsg("handle_response0 returned with DN_CONNECTION_STATE_ERROR_FATAL for connection to node %s[%u] ", + conn->remoteNodeName, conn->nodeoid))); + return RESPONSE_COMPLETE; + } + + /* No data available, read one more time or exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) { + /* + * For FATAL error, no need to read once more, because openGauss thread(DN) will exit + * immediately after sending error message without sending 'Z'(ready for query). 
+ */ + if (combiner != NULL && combiner->is_fatal_error) { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + conn->combiner = NULL; + + return RESPONSE_COMPLETE; + } + + if (error_flag) { + /* incomplete message, if last message type is ERROR,read once more */ + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to receive message from %s[%u]", + conn->remoteNodeName, conn->nodeoid))); + error_flag = false; + continue; + } else { + return RESPONSE_EOF; + } + } + /* no need to check conn's combiner when abort transaction */ + Assert(t_thrd.xact_cxt.bInAbortTransaction || conn->combiner == combiner || conn->combiner == NULL); + + msg_type = get_message(conn, &msg_len, &msg); + LIBCOMM_DEBUG_LOG("handle_response to node:%s[nid:%d,sid:%d] with msg:%c", + conn->remoteNodeName, + conn->gsock.idx, + conn->gsock.sid, + msg_type); + + switch (msg_type) { + case '\0': /* Not enough data in the buffer */ + return RESPONSE_EOF; + case 'c': /* CopyToCommandComplete */ + HandleCopyOutComplete(combiner); + break; + case 'C': /* CommandComplete */ + HandleCommandComplete(combiner, msg, msg_len, conn, isdummy); + break; + case 'T': /* RowDescription */ +#ifdef DN_CONNECTION_DEBUG + Assert(!conn->have_row_desc); + conn->have_row_desc = true; +#endif + if (HandleRowDescription(combiner, msg)) + return RESPONSE_TUPDESC; + break; + case 'D': /* DataRow */ + case 'B': /* DataBatch */ +#ifdef DN_CONNECTION_DEBUG + Assert(conn->have_row_desc); +#endif + HandleDataRow(combiner, msg, msg_len, conn->nodeoid, conn->remoteNodeName); + return RESPONSE_DATAROW; + case 'P': /* AnalyzeTotalRow */ + HandleAnalyzeTotalRow(combiner, msg, msg_len); + return RESPONSE_ANALYZE_ROWCNT; + break; + case 'U': /* Stream instrumentation data */ + /* receive data from the CN of the compute pool in first thread + * of the smp. 
+ */ + cur_smp_id = -1; + if (u_sess->instr_cxt.global_instr) + u_sess->instr_cxt.global_instr->deserialize(node_idx, msg, msg_len, false, cur_smp_id); + break; + case 'u': /* OBS runtime instrumentation data */ + if (u_sess->instr_cxt.obs_instr) + u_sess->instr_cxt.obs_instr->deserialize(msg, msg_len); + break; + case 'V': /* Track data for developer-define profiling */ + /* receive data from the CN of the compute pool in first thread + * of the smp. + */ + if (0 != u_sess->stream_cxt.smp_id && IS_PGXC_DATANODE) + break; + + if (u_sess->instr_cxt.global_instr) + u_sess->instr_cxt.global_instr->deserializeTrack(node_idx, msg, msg_len); + break; + case 's': /* PortalSuspended */ + suspended = true; + break; + case '1': /* ParseComplete */ + case '2': /* BindComplete */ + case '3': /* CloseComplete */ + case 'n': /* NoData */ + /* simple notifications, continue reading */ + break; + case 'G': /* CopyInResponse */ + conn->state = DN_CONNECTION_STATE_COPY_IN; + HandleCopyIn(combiner); + /* Done, return to caller to let it know the data can be passed in */ + return RESPONSE_COPY; + case 'H': /* CopyOutResponse */ + conn->state = DN_CONNECTION_STATE_COPY_OUT; + HandleCopyOut(combiner); + return RESPONSE_COPY; + case 'd': /* CopyOutDataRow */ + conn->state = DN_CONNECTION_STATE_COPY_OUT; + HandleCopyDataRow(combiner, msg, msg_len); + break; + case 'E': /* ErrorResponse */ + HandleError(combiner, msg, msg_len); + add_error_message(conn, "%s", combiner->errorMessage); + error_flag = true; + /* + * Do not return with an error, we still need to consume Z, + * ready-for-query + */ + break; + case 'N': /* NoticeResponse */ + HandleNotice(combiner, msg, msg_len); + break; + case 'A': /* NotificationResponse */ + case 'S': /* SetCommandComplete */ + /* + * Ignore these to prevent multiple messages, one from each + * node. Coordinator will send one for DDL anyway + */ + break; + case 'Z': /* ReadyForQuery */ + { + /* + * Return result depends on previous connection state. 
+ * If it was PORTAL_SUSPENDED Coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + int result = suspended ? RESPONSE_SUSPENDED : RESPONSE_COMPLETE; + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; +#ifdef DN_CONNECTION_DEBUG + conn->have_row_desc = false; +#endif + return result; + } + case 'M': /* Command Id */ + HandleDatanodeCommandId(combiner, msg, msg_len); + break; + case 'm': /* Commiting */ + conn->state = DN_CONNECTION_STATE_IDLE; + combiner->request_type = REQUEST_TYPE_COMMITING; + return RESPONSE_COMPLETE; + case 'b': + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + case 'y': + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_SEQUENCE_OK; + case 'O': /* PlanIdComplete */ + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_PLANID_OK; + case 'g': /* DN top xid */ + HandleDatanodeGxid(conn, msg, msg_len); + break; + case 'L': /* DN local csn min */ + HandleLocalCsnMin(conn, msg, msg_len); + return RESPONSE_COMPLETE; + case 'z': /* pbe for ddl */ + break; + case 'J': + conn->state = DN_CONNECTION_STATE_IDLE; + HandleMaxCSN(combiner, msg, msg_len); + return RESPONSE_MAXCSN_RECEIVED; + case 'I': /* EmptyQuery */ + default: + /* sync lost? 
*/ + elog(WARNING, "Received unsupported message type: %c", msg_type); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + /* stop reading */ + return RESPONSE_COMPLETE; + } + } + /* never happen, but keep compiler quiet */ + return RESPONSE_EOF; +} +static void ExecInitPlanState(PlanState* plan_state, EState* estate, RemoteQuery* node, RemoteQueryState* remotestate) +{ + plan_state->state = estate; + plan_state->plan = (Plan*)node; + plan_state->qual = (List*)ExecInitExpr((Expr*)node->scan.plan.qual, (PlanState*)remotestate); + plan_state->targetlist = (List*)ExecInitExpr((Expr*)node->scan.plan.targetlist, (PlanState*)remotestate); + ExecAssignExprContext(estate, plan_state); + ExecInitResultTupleSlot(estate, &remotestate->ss.ps); + plan_state->ps_vec_TupFromTlist = false; + ExecAssignResultTypeFromTL(&remotestate->ss.ps); +} + +RemoteQueryState* ExecInitSpqRemoteQuery(RemoteQuery* node, EState* estate, int eflags, bool row_plan) +{ + RemoteQueryState* spqRemoteState = NULL; + + /* RemoteQuery node is the leaf node in the plan tree, just like seqscan */ + Assert(innerPlan(node) == NULL); + //Assert(node->is_simple == false); + + spqRemoteState = CreateResponseCombiner(0, node->combine_type); + spqRemoteState->position = node->position; + + ExecInitPlanState(&spqRemoteState->ss.ps, estate, node, spqRemoteState); + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK))); + + /* Extract the eflags bits that are relevant for tuplestorestate */ + spqRemoteState->eflags = (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD)); + /* We anyways have to support REWIND for ReScan */ + spqRemoteState->eflags |= EXEC_FLAG_REWIND; + spqRemoteState->ss.ps.ExecProcNode = ExecSPQRemoteQuery; + + spqRemoteState->eof_underlying = false; + spqRemoteState->tuplestorestate = NULL; + spqRemoteState->switch_connection = NULL; + spqRemoteState->refresh_handles = false; + spqRemoteState->nodeidxinfo = NULL; + spqRemoteState->serializedPlan = NULL; + + 
ExecInitScanTupleSlot(estate, &spqRemoteState->ss); + ExecAssignScanType(&spqRemoteState->ss, ExecTypeFromTL(node->base_tlist, false)); + + /* + * If there are parameters supplied, get them into a form to be sent to the + * Datanodes with bind message. We should not have had done this before. + */ + SetDataRowForExtParams(estate->es_param_list_info, spqRemoteState); + //if (false == node->is_simple || true == node->rq_need_proj) + //ExecAssignScanProjectionInfo(&spqRemoteState->ss); + + //if (node->rq_save_command_id) { + /* Save command id to be used in some special cases */ + //remotestate->rqs_cmd_id = GetCurrentCommandId(false); + //} + // todo only pgxc_FQS_create_remote_plan, orca may diff + if (node->is_simple /* || PLAN_ROUTER == node->position */) { + /* u_sess->instr_cxt.thread_instr in CN do not init nodes which exec on DN */ + ThreadInstrumentation* oldInstr = u_sess->instr_cxt.thread_instr; + u_sess->instr_cxt.thread_instr = NULL; + + u_sess->exec_cxt.under_stream_runtime = true; + /* For explain command and sessionId generation of import or export execution. */ + if (outerPlan(node)) + outerPlanState(spqRemoteState) = ExecInitNode(outerPlan(node), estate, eflags); + + u_sess->instr_cxt.thread_instr = oldInstr; + } + /* Add relations ref count for FQS Query. 
*/ + //RelationIncrementReferenceCountForFQS(node); + + if (node->is_simple /* || node->poll_multi_channel */) { + /* receive logic different from pgxc way */ + spqRemoteState->fetchTuple = FetchTupleByMultiChannel; + } else { + spqRemoteState->fetchTuple = FetchTuple; + } + + if (row_plan) { + estate->es_remotequerystates = lcons(spqRemoteState, estate->es_remotequerystates); + } + + spqRemoteState->parallel_function_state = NULL; + + return spqRemoteState; +} +// copy void InitMultinodeExecutor(bool is_force) +PGXCNodeHandle* InitSPQMultinodeExecutor(Oid nodeoid, char* nodename) +{ + PGXCNodeHandle *result = (PGXCNodeHandle *)palloc0(sizeof(PGXCNodeHandle)); + result->sock = NO_SOCKET; + init_pgxc_handle(result); + result->nodeoid = nodeoid; + result->remoteNodeName = nodename; + result->remote_node_type = VDATANODE; + return result; +} +void spq_release_conn(RemoteQueryState* planstate) +{ + if (planstate == NULL) { + return; + } + for (int i = 0; i < planstate->node_count ; i++) { + if (planstate->nodeCons != NULL && planstate->nodeCons[i] != NULL) { + PGXCNodeClose(planstate->nodeCons[i]); + planstate->nodeCons[i] = NULL; + } + if (planstate->spq_connections_info != NULL && planstate->spq_connections_info[i] != NULL) { + PGXCNodeHandle *handle = planstate->spq_connections_info[i]; + pfree_ext(handle->inBuffer); + pfree_ext(handle->outBuffer); + pfree_ext(handle->error); + pfree_ext(handle); + planstate->spq_connections_info[i] = NULL; + } + } + pfree_ext(planstate->spq_connections_info); + pfree_ext(planstate->nodeCons); + planstate->spq_connections_info = NULL; + planstate->nodeCons = NULL; + +} +PGXCNodeHandle** spq_get_exec_connections( + RemoteQueryState* planstate, ExecNodes* exec_nodes, RemoteQueryExecType exec_type) +{ + int dn_conn_count; + PlannedStmt* planstmt = planstate->ss.ps.state->es_plannedstmt; + + /* Set datanode list and DN number */ + /* Set Coordinator list and Coordinator number */ + // QD count + dn_conn_count = planstmt->num_nodes; + 
PGXCNodeHandle** connections = (PGXCNodeHandle **)palloc(dn_conn_count * sizeof(PGXCNodeHandle *)); + planstate->spq_connections_info = (PGXCNodeHandle **)palloc(dn_conn_count * sizeof(PGXCNodeHandle *)); + planstate->nodeCons = (PGconn **)palloc0(sizeof(PGconn *) * dn_conn_count); + planstate->node_count = dn_conn_count; + + Oid *dnNode = (Oid *)palloc0(sizeof(Oid) * dn_conn_count); + PGconn **nodeCons = planstate->nodeCons; + char **connectionStrs = (char **)palloc0(sizeof(char *) * dn_conn_count); + + auto spq_release = [&](char* err_msg) { + for (int i = 0; i < dn_conn_count; i++) { + pfree_ext(connectionStrs[i]); + } + pfree_ext(dnNode); + pfree_ext(connectionStrs); + if (err_msg != NULL) { + pfree_ext(connections); + connections = NULL; + spq_release_conn(planstate); + ereport(ERROR, (errmsg("PQconnectdbParallel error: %s", err_msg))); + } + return; + }; + for (int j = 0; j < dn_conn_count; ++j) { + connectionStrs[j] = (char *)palloc0(INITIAL_EXPBUFFER_SIZE * 4); + NodeDefinition* node = &planstmt->nodesDefinition[j]; + sprintf_s(connectionStrs[j], INITIAL_EXPBUFFER_SIZE * 4, + "host=%s port=%d dbname=%s user=%s application_name=coordinator1 connect_timeout=600 rw_timeout=600 \ + options='-c remotetype=coordinator -c DateStyle=iso,mdy -c timezone=prc -c geqo=on -c intervalstyle=postgres \ + -c lc_monetary=en_US.UTF-8 -c lc_numeric=en_US.UTF-8 -c lc_time=en_US.UTF-8 -c omit_encoding_error=off' \ + prototype=1 keepalives_idle=600 keepalives_interval=30 keepalives_count=20 \ + remote_nodename=%s backend_version=%u enable_ce=1", + node->nodehost.data, node->nodeport, u_sess->proc_cxt.MyProcPort->database_name, + u_sess->proc_cxt.MyProcPort->user_name, node->nodename.data, GRAND_VERSION_NUM); + dnNode[j] = node->nodeoid; + connections[j] = InitSPQMultinodeExecutor(node->nodeoid, node->nodename.data); + planstate->spq_connections_info[j] = connections[j]; + connections[j]->nodeIdx = j; + } + + PQconnectdbParallel(connectionStrs, dn_conn_count, nodeCons, dnNode); + 
+ + //ListCell *node_list_item = NULL; + for (int i = 0; i < dn_conn_count; i++) { + + if (nodeCons[i] && (CONNECTION_OK == nodeCons[i]->status)) { + pgxc_node_init(connections[i], nodeCons[i]->sock); + } else { + char firstError[INITIAL_EXPBUFFER_SIZE] = {0}; + errno_t ss_rc = EOK; + if (nodeCons[i] == NULL) { + ss_rc = strcpy_s(firstError, INITIAL_EXPBUFFER_SIZE, "out of memory"); + } else if (nodeCons[i]->errorMessage.data != NULL) { + if (strlen(nodeCons[i]->errorMessage.data) >= INITIAL_EXPBUFFER_SIZE) { + nodeCons[i]->errorMessage.data[INITIAL_EXPBUFFER_SIZE - 1] = '\0'; + } + ss_rc = strcpy_s(firstError, INITIAL_EXPBUFFER_SIZE, nodeCons[i]->errorMessage.data); + } else { + ss_rc = strcpy_s(firstError, INITIAL_EXPBUFFER_SIZE, "unknown error"); + } + spq_release(firstError); + } + } + spq_release(NULL); + return connections; +} +void spq_do_query(RemoteQueryState* node) +{ + RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan; + bool is_read_only = step->read_only; + bool need_stream_sync = false; + + Snapshot snapshot = GetActiveSnapshot(); + PGXCNodeHandle** connections = NULL; + int i; + int regular_conn_count = 0; + bool need_tran_block = false; + PlannedStmt* planstmt = node->ss.ps.state->es_plannedstmt; + NameData nodename = {{0}}; + + /* RecoveryInProgress */ + + if (node->conn_count == 0) + node->connections = NULL; + + planstmt->queryId = u_sess->debug_query_id; + planstmt->spq_session_id = u_sess->debug_query_id; + planstmt->current_id = step->streamID; + node->queryId = generate_unique_id64(>_queryId); + + connections = spq_get_exec_connections(node, step->exec_nodes, step->exec_type); + + Assert(node->spq_connections_info != NULL); + Assert(connections != NULL); + Assert(step->exec_type == EXEC_ON_DATANODES); + + regular_conn_count = node->node_count; + + pfree_ext(node->switch_connection); + pfree_ext(node->nodeidxinfo); + node->switch_connection = (bool*)palloc0(regular_conn_count * sizeof(bool)); + node->nodeidxinfo = 
(NodeIdxInfo*)palloc0(regular_conn_count * sizeof(NodeIdxInfo)); + for (int k = 0; k < regular_conn_count; k++) { + node->nodeidxinfo[k].nodeidx = connections[k]->nodeIdx; + node->nodeidxinfo[k].nodeoid = connections[k]->nodeoid; + } + + Assert(is_read_only); + if (is_read_only) + need_tran_block = false; + + elog(DEBUG1, + "regular_conn_count = %d, need_tran_block = %s", regular_conn_count, need_tran_block ? "true" : "false"); + + // Do not generate gxid for read only query. + // + //if (is_read_only) { + // gxid = GetCurrentTransactionIdIfAny(); + //} + +#ifdef STREAMPLAN + char *compressedPlan = NULL; + int cLen = 0; + + if (step->is_simple) { + + StringInfoData str_remoteplan; + initStringInfo(&str_remoteplan); + elog(DEBUG5, + "Node Id %d, Thread ID:%lu, queryId: %lu, query: %s", + u_sess->pgxc_cxt.PGXCNodeId, + gs_thread_self(), + planstmt->queryId, + t_thrd.postgres_cxt.debug_query_string ? t_thrd.postgres_cxt.debug_query_string : ""); + + planstmt->query_string = + const_cast(t_thrd.postgres_cxt.debug_query_string ? t_thrd.postgres_cxt.debug_query_string : ""); + /* Flag 'Z' to indicate it's serialized plan */ + /* todo: SerializePlan DISTRIBUTED_FEATURE_NOT_SUPPORTED */ + SpqSerializePlan(step->scan.plan.lefttree, planstmt, &str_remoteplan, step->num_stream, step->num_gather, true, node->queryId); + node->serializedPlan = str_remoteplan.data; + + /* Compress the 'Z' plan here. */ + char *tmpQuery = node->serializedPlan; + /* Skip the msgType 'Z'. */ + tmpQuery++; + /* todo: CompressSerializedPlan DISTRIBUTED_FEATURE_NOT_SUPPORTED */ + compressedPlan = CompressSerializedPlan(tmpQuery, &cLen); + u_sess->instr_cxt.plan_size = cLen; + + need_stream_sync = step->num_stream > 0 ? true : false; + } +#endif + + /* + * Send begin statement to all datanodes for RW transaction parallel. 
+ * Current it should be RO transaction + */ + if (need_stream_sync) { + pgxc_node_send_queryid_with_sync(connections, regular_conn_count, node->queryId); + } + for (i = 0; i < regular_conn_count; i++) { + if (!pgxc_start_command_on_connection(connections[i], node, snapshot, compressedPlan, cLen)) { + Oid nodeid = connections[i]->nodeoid; + pfree_ext(connections); + spq_release_conn(node); + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to send command to Datanodes %s[%u]", + get_pgxc_nodename_noexcept(nodeid, &nodename), nodeid))); + } + connections[i]->combiner = node; + + // If send command to Datanodes successfully, outEnd must be 0. + // So the outBuffer can be freed here and reset to default buffer size 16K. + // + Assert(connections[i]->outEnd == 0); + ResetHandleOutBuffer(connections[i]); + } + + do_query_for_first_tuple(node, false, regular_conn_count, connections, NULL, NIL); + + /* reset */ + if (step->is_simple) { + pfree_ext(node->serializedPlan); + pfree_ext(compressedPlan); + } +} +static TupleTableSlot* SpqRemoteQueryNext(ScanState* scan_node) +{ + TupleTableSlot* scanslot = scan_node->ss_ScanTupleSlot; + RemoteQueryState* node = (RemoteQueryState*)scan_node; + + /* + * Initialize tuples processed to 0, to make sure we don't re-use the + * values from the earlier iteration of RemoteQueryNext(). + */ + node->rqs_processed = 0; + if (!node->query_Done) { + spq_do_query(node); + node->query_Done = true; + } + + //Assert(rq->spool_no_data == true); + //if (rq->spool_no_data == true) { + /* for simple remotequery, we just pass the data, no need to spool */ + + //} + node->fetchTuple(node, scanslot, NULL); + + /* When finish remote query already, should better reset the flag. 
*/ + if (TupIsNull(scanslot)) + node->need_error_check = false; + + /* report error if any */ + pgxc_node_report_error(node); + + return scanslot; +} +static void ReleaseTupStore(RemoteQueryState* node) +{ + if (node->tuplestorestate != NULL) { + tuplestore_end(node->tuplestorestate); + } +} +static void CloseNodeCursors(RemoteQueryState* node, PGXCNodeHandle** cur_handles, int nCount) +{ + if (node->cursor) { + close_node_cursors(cur_handles, nCount, node->cursor); + /* + * node->cursor now points to the string array attached in hash table of Portals. + * it can't be freed here. + */ + node->cursor = NULL; + } + + if (node->update_cursor) { + close_node_cursors(cur_handles, nCount, node->update_cursor); + /* + * different from cursor, it can be freed here. + */ + pfree_ext(node->update_cursor); + } +} +/* close active cursors during end remote query */ +static void CloseActiveCursors(RemoteQueryState* node) +{ + bool noFree = true; + PGXCNodeAllHandles* all_handles = NULL; + + if (node->cursor || node->update_cursor) { + PGXCNodeHandle** cur_handles = node->cursor_connections; + int nCount = node->cursor_count; + int i; + for (i = 0; i < nCount; i++) { + if (node->cursor_connections == NULL || (!IS_VALID_CONNECTION(node->cursor_connections[i]))) { + all_handles = get_exec_connections(node, NULL, EXEC_ON_DATANODES); + noFree = false; + break; + } + } + + if (all_handles != NULL) { + cur_handles = all_handles->datanode_handles; + nCount = all_handles->dn_conn_count; + } + + /* not clear ?? 
*/ + CloseNodeCursors(node, cur_handles, nCount); + + if (!noFree) { + pfree_pgxc_all_handles(all_handles); + } + } +} + +void RelationDecrementReferenceCountForFQS(const RemoteQuery* node) +{ + if (!node->isFQS || node->relationOids == NULL) { + return; + } + + ListCell *lc = NULL; + foreach(lc, node->relationOids) { + Oid oid = lfirst_oid(lc); + RelationDecrementReferenceCount(oid); + } +} + +void ExecEndSpqRemoteQuery(RemoteQueryState* node, bool pre_end) +{ + RemoteQuery* remote_query = (RemoteQuery*)node->ss.ps.plan; + + if (pre_end == false) { + RowStoreReset(node->row_store); + } + + /* Pack all un-completed connections together and recorrect node->conn_count */ + if (node->conn_count > 0 && remote_query->sort != NULL) { + node->conn_count = PackConnections(node); + } + + node->current_conn = 0; + while (node->current_conn < node->conn_count) { + int res; + PGXCNodeHandle* conn = node->connections[node->current_conn]; + + /* throw away message */ + pfree_ext(node->currentRow.msg); + + if (conn == NULL) { + node->conn_count--; + if (node->current_conn < node->conn_count) { + node->connections[node->current_conn] = node->connections[node->conn_count]; + } + continue; + } + + /* no data is expected */ + if (conn->state == DN_CONNECTION_STATE_IDLE || conn->state == DN_CONNECTION_STATE_ERROR_FATAL) { + if (node->current_conn < --node->conn_count) { + node->connections[node->current_conn] = node->connections[node->conn_count]; + } + continue; + } + + /* incomlete messages */ + res = handle_response(conn, node); + if (res == RESPONSE_EOF) { + node->current_conn++; + } + } + + /* + * Send stop signal to DNs when we already get the tuples + * we need but the DNs are still running. + * Especially for query with limit or likewise. + */ + if (node->conn_count > 0) { + if (u_sess->debug_query_id == 0) { + /* + * when cn send stop signal to dn, + * need to check queryid preventing signal wrong query. + * so if queryid is 0, get query from RemoteQuery Node. 
+ */ + u_sess->debug_query_id = GetSPQQueryidFromRemoteQuery(node); + } + stop_query(); + } + + while (node->conn_count > 0) { + int i = 0; + if (pgxc_node_receive(node->conn_count, node->connections, NULL)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to read response from Datanodes when ending query"))); + + while (i < node->conn_count) { + /* throw away message */ + pfree_ext(node->currentRow.msg); + int result = handle_response(node->connections[i], node); + switch (result) { + case RESPONSE_EOF: /* have something to read, keep receiving */ + i++; + break; + default: + if (node->connections[i]->state == DN_CONNECTION_STATE_IDLE || + node->connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL) { + node->conn_count--; + if (i < node->conn_count) + node->connections[i] = node->connections[node->conn_count]; + } + break; + } + } + } + + /* pre_end true for explain performance receiving data before print plan. */ + if (pre_end) { + return; + } + if (node->tuplestorestate != NULL) { + ExecClearTuple(node->ss.ss_ScanTupleSlot); + } + /* Release tuplestore resources */ + ReleaseTupStore(node); + /* If there are active cursors close them */ + CloseActiveCursors(node); + /* Clean up parameters if they were set */ + if (node->paramval_data) { + pfree_ext(node->paramval_data); + node->paramval_data = NULL; + node->paramval_len = 0; + } + + /* Free the param types if they are newly allocated */ + if (node->rqs_param_types && node->rqs_param_types != ((RemoteQuery*)node->ss.ps.plan)->rq_param_types) { + pfree_ext(node->rqs_param_types); + node->rqs_param_types = NULL; + node->rqs_num_params = 0; + } + + if (node->ss.ss_currentRelation) + ExecCloseScanRelation(node->ss.ss_currentRelation); + + if (node->parallel_function_state != NULL) { + FreeParallelFunctionState(node->parallel_function_state); + node->parallel_function_state = NULL; + } + + +#ifdef STREAMPLAN + PlanState* outer_planstate = outerPlanState(node); +#endif + 
CloseCombiner(node); + node = NULL; + +#ifdef STREAMPLAN + if ((IS_PGXC_COORDINATOR && remote_query->is_simple) || + (IS_PGXC_DATANODE && remote_query->is_simple && remote_query->rte_ref) || + IS_SPQ_COORDINATOR || + (IS_PGXC_DATANODE && remote_query->is_simple && + (remote_query->position == PLAN_ROUTER || remote_query->position == SCAN_GATHER))) + ExecEndNode(outer_planstate); +#endif + + /* Add relations's ref count for FQS Query. */ + RelationDecrementReferenceCountForFQS(remote_query); + + /* + * Free nodelist if there is en_expr: + * If there is en_expr, nodelist is useless now, for it is generated by en_expr in get_exec_connnection. + * Else, nodelist contains datanodes generated during planning, which is keeped in the plansource + * and should not be set NIL. + */ + ExecNodes* exec_nodes = remote_query->exec_nodes; + if (exec_nodes != NULL && exec_nodes->en_expr && exec_nodes->nodelist_is_nil) { + exec_nodes->primarynodelist = NIL; + exec_nodes->nodeList = NIL; + } + +} +#else int getStreamSocketError(const char* str) { Assert(false); @@ -251,6 +1293,7 @@ char* getSocketError(int* err_code) DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return NULL; } +#endif /* * @Description: Check if need check the other error message(s) when receiving @@ -744,9 +1787,9 @@ static void HandleDataRow( return; /* Check messages from DN. 
*/ - if (IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) { #ifdef USE_ASSERT_CHECKING - if (strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0) { + if (IS_SPQ_COORDINATOR || strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0) { CheckMessages(0, 0, msg_body, len, false); msg_body += REMOTE_CHECKMSG_LEN; len -= REMOTE_CHECKMSG_LEN; @@ -754,7 +1797,7 @@ static void HandleDataRow( #else if (unlikely(anls_opt_is_on(ANLS_STREAM_DATA_CHECK) && - strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0)) { + (IS_SPQ_COORDINATOR || strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0))) { CheckMessages(0, 0, msg_body, len, false); msg_body += REMOTE_CHECKMSG_LEN; len -= REMOTE_CHECKMSG_LEN; @@ -1227,6 +2270,9 @@ bool validate_combiner(RemoteQueryState* combiner) void CloseCombiner(RemoteQueryState* combiner) { if (combiner != NULL) { +#ifdef USE_SPQ + spq_release_conn(combiner); +#endif if (combiner->connections) pfree_ext(combiner->connections); if (combiner->tuple_desc) { @@ -1405,6 +2451,10 @@ void CopyDataRowTupleToSlot(RemoteQueryState* combiner, TupleTableSlot* slot) bool FetchTuple(RemoteQueryState* combiner, TupleTableSlot* slot, ParallelFunctionState* parallelfunctionstate) { +#ifdef USE_SPQ + return SpqFetchTuple(combiner, slot, parallelfunctionstate); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -1886,6 +2936,10 @@ int light_handle_response(PGXCNodeHandle* conn, lightProxyMsgCtl* msgctl, lightP int handle_response(PGXCNodeHandle* conn, RemoteQueryState* combiner, bool isdummy) { +#ifdef USE_SPQ + return spq_handle_response(conn, combiner, isdummy); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -3744,6 +4798,9 @@ bool DataNodeCopyEnd(PGXCNodeHandle* handle, bool is_error) RemoteQueryState* ExecInitRemoteQuery(RemoteQuery* node, EState* estate, int eflags, bool row_plan) { 
+#ifdef USE_SPQ + return ExecInitSpqRemoteQuery(node, estate, eflags, row_plan); +#endif #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -4233,9 +5290,13 @@ bool pgxc_start_command_on_connection( if (ENABLE_WORKLOAD_CONTROL && u_sess->attr.attr_resource.resource_track_level == RESOURCE_TRACK_OPERATOR && pgxc_node_send_threadid(connection, t_thrd.proc_cxt.MyProcPid)) return false; - +#ifdef USE_SPQ + if (pgxc_node_send_queryid(connection, remotestate->queryId)) + return false; +#else if (pgxc_node_send_queryid(connection, u_sess->debug_query_id)) return false; +#endif // Instrumentation/Unique SQL: send unique sql id to DN node if (is_unique_sql_enabled() && pgxc_node_send_unique_sql_id(connection)) @@ -4299,7 +5360,11 @@ bool pgxc_start_command_on_connection( return false; } } else { - if (pgxc_node_send_query(connection, step->sql_statement, false, false, trigger_ship) != 0) + char* query = step->sql_statement; + if (step->is_simple) { + query = remotestate->serializedPlan; + } + if (pgxc_node_send_query(connection, query, false, false, trigger_ship, false, compressedPlan, cLen) != 0) return false; } return true; @@ -5505,6 +6570,9 @@ static bool RemoteQueryRecheck(RemoteQueryState* node, TupleTableSlot* slot) */ static TupleTableSlot* RemoteQueryNext(ScanState* scan_node) { +#ifdef USE_SPQ + return SpqRemoteQueryNext(scan_node); +#endif PlanState* outerNode = NULL; RemoteQueryState* node = (RemoteQueryState*)scan_node; @@ -5752,6 +6820,10 @@ inline static uint64 GetQueryidFromRemoteQuery(RemoteQueryState* node) void ExecEndRemoteQuery(RemoteQueryState* step, bool pre_end) { + +#ifdef USE_SPQ + return ExecEndSpqRemoteQuery(step, pre_end); +#endif #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -6060,7 +7132,7 @@ void SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState* rq_state) */ void ExecRemoteQueryReScan(RemoteQueryState* node, ExprContext* exprCtxt) { -#ifndef 
ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return; diff --git a/src/common/backend/pgxc_single/pool/pgxcnode.cpp b/src/common/backend/pgxc_single/pool/pgxcnode.cpp index ce3142266..7c3bd98ce 100644 --- a/src/common/backend/pgxc_single/pool/pgxcnode.cpp +++ b/src/common/backend/pgxc_single/pool/pgxcnode.cpp @@ -134,13 +134,310 @@ valid_pgxc_handle(PGXCNodeHandle *pgxc_handle) (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("invalid input/output buffer in node handle"))); } - +#ifdef USE_SPQ +void init_spq_handle(PGXCNodeHandle *pgxc_handle) +{ + /* + * Socket descriptor is small non-negative integer, + * Indicate the handle is not initialized yet + */ + pgxc_handle->sock = NO_SOCKET; + pgxc_handle->state = DN_CONNECTION_STATE_IDLE; + + /* Initialise buffers */ + pgxc_handle->error = NULL; + pgxc_handle->outSize = 16 * 1024; + pgxc_handle->outBuffer = (char *)palloc0(pgxc_handle->outSize); + pgxc_handle->inSize = 16 * 1024; + pgxc_handle->inBuffer = (char *)palloc0(pgxc_handle->inSize); + pgxc_handle->combiner = NULL; + pgxc_handle->stream = NULL; + pgxc_handle->inStart = 0; + pgxc_handle->inEnd = 0; + pgxc_handle->inCursor = 0; + pgxc_handle->outEnd = 0; + pgxc_handle->outNum = 0; + + /* for sctp connection */ + pgxc_handle->listenPort = -1; + pgxc_handle->tcpCtlPort = -1; + + pgxc_handle->remoteNodeName = NULL; + pgxc_handle->nodeIdx = -1; + + pgxc_handle->is_logic_conn = false; + pgxc_handle->gsock = GS_INVALID_GSOCK; + + pgxc_handle->pg_conn = NULL; + + valid_pgxc_handle(pgxc_handle); +} +void pgxc_spqnode_init(PGXCNodeHandle *handle, int sock) +{ + handle->sock = sock; + handle->transaction_status = 'I'; + handle->state = DN_CONNECTION_STATE_IDLE; + handle->combiner = NULL; + handle->stream = NULL; +#ifdef DN_CONNECTION_DEBUG + handle->have_row_desc = false; +#endif + pfree_ext(handle->error); + handle->outEnd = 0; + handle->inStart = 0; + handle->inEnd = 0; + handle->inCursor = 0; 
+ + handle->is_logic_conn = false; + handle->gsock = GS_INVALID_GSOCK; + + handle->pg_conn = NULL; +} + +bool spq_node_receive(const int conn_count, PGXCNodeHandle **connections, struct timeval *timeout) +{ + int i; + bool logic_conn_ret = NO_ERROR_OCCURED; + bool physic_conn_ret = NO_ERROR_OCCURED; + int physic_conn_count = 0; + int logic_conn_count = 0; + int time_out = -1; + PGXCNodeHandle *tmp_connect = NULL; + + pgxc_palloc_net_ctl(conn_count); + + /* + * as the connection between CN and CN is still physical. + * we need to seperate physical and logic connection and call different function + */ + if (g_instance.attr.attr_storage.comm_cn_dn_logic_conn) { + for (i = 0; i < conn_count; i++) { + /* + * for physical channel always put them in the front of connections list + * so just swith the connection between physical conn and the first logic conn + */ + if (!connections[i]->is_logic_conn) { + tmp_connect = connections[physic_conn_count]; + connections[physic_conn_count] = connections[i]; + connections[i] = tmp_connect; + physic_conn_count++; + } else { + logic_conn_count++; + } + } + } else { + physic_conn_count = conn_count; + } + + if (physic_conn_count != 0) { + physic_conn_ret = datanode_receive_from_physic_conn(physic_conn_count, connections, timeout); + } + + if (logic_conn_count != 0) { + if (timeout != NULL) + time_out = (int)timeout->tv_sec; + + /* + * as the physical conn has been put in the front + * so the first logic conn is place in connections[physic_conn_count] + */ + logic_conn_ret = + datanode_receive_from_logic_conn(logic_conn_count, &connections[physic_conn_count], NULL, time_out); + } + + /* anyone is ERROR_OCCURED, return ERROR_OCCURED. 
*/ + return logic_conn_ret || physic_conn_ret; +} +static bool IsExistMsgBuffered(const int connCount, PGXCNodeHandle **connections) +{ + bool isMsgBuffered = false; + for (int i = 0; i < connCount; i++) { + if (HAS_MESSAGE_BUFFERED(connections[i])) { + isMsgBuffered = true; + break; + } + } + return isMsgBuffered; +} + +static bool IsFinishedSend(int idx, PGXCNodeHandle **connections) +{ + if (connections[idx]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[idx])) { + return true; + } + return false; +} +int spq_node_send_query(PGXCNodeHandle *handle, const char *query, bool isPush, bool isCreateSchemaPush, + bool trigger_ship, bool check_gtm_mode, const char *compressedPlan, int cLen) +{ + int strLen; + int msgLen; + char *tmpQuery = (char *)query; + errno_t ss_rc = 0; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + if (unlikely(query == NULL)) { + ereport(ERROR, + (errmodule(MOD_EXECUTOR), + errcode(ERRCODE_UNEXPECTED_NULL_VALUE), + errmsg("Input query should not be NULL when send."))); + } + + /* The message which includes compressed plan is typed "Z". */ + if ('Z' == tmpQuery[0]) { + int nodeId = 1; + int oLen = 0; + int cLen_n32 = 0; + + if (IS_PGXC_COORDINATOR) + nodeId = PGXCNodeGetNodeId(handle->nodeoid, PGXC_NODE_DATANODE); // start from 0 + else + /* DWS DN sends ... to CN of the compute pool if run here. */ + nodeId = u_sess->pgxc_cxt.PGXCNodeId; + + oLen = strlen(tmpQuery); + // Four integer values(msgLen, nodeId, oLen, cLen) and the length of compressed plan. 
+ msgLen = (4 * sizeof(int)) + cLen; + + // msgType + msgLen + ensure_out_buffer_capacity(1 + msgLen, handle); + + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'Z'; + msgLen = htonl(msgLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msgLen, sizeof(int)); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + nodeId = htonl(nodeId); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &nodeId, sizeof(int)); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + oLen = htonl(oLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &oLen, sizeof(int)); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + cLen_n32 = htonl(cLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &cLen_n32, sizeof(int)); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, compressedPlan, cLen); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += cLen; + } + /* + * @hdfs + * the message which includes sql sentances + information is typed "h" + */ + else if ('h' == tmpQuery[0]) { + int nodeId = 1; + + if (IS_PGXC_COORDINATOR) + nodeId = PGXCNodeGetNodeId(handle->nodeoid, PGXC_NODE_DATANODE); // start from 0 + else + /* DWS DN sends ... to CN of the compute pool if run here. 
*/ + nodeId = u_sess->pgxc_cxt.PGXCNodeId; + strLen = strlen(query) + 1; + /* size + strlen */ + msgLen = 4 + strLen + 4; // the extra 4 byte is for pgxc node id + + /* msgType + msgLen */ + ensure_out_buffer_capacity(1 + msgLen, handle); + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'h'; + tmpQuery++; + strLen--; + msgLen--; + msgLen = htonl(msgLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msgLen, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + nodeId = htonl(nodeId); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &nodeId, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, tmpQuery, strLen); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += strLen; + } else { + if (check_gtm_mode && (t_thrd.proc->workingVersionNum >= 92012)) + pgxc_node_send_gtm_mode(handle); + strLen = strlen(query) + 1; + /* size + strlen */ + msgLen = 4 + strLen; + + /* msgType + msgLen, If trigger is being shipped to DN. 
*/ + if (trigger_ship) { + ensure_out_buffer_capacity(2 + msgLen, handle); + handle->outBuffer[handle->outEnd++] = 'a'; + } else { + ensure_out_buffer_capacity(1 + msgLen, handle); + } + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'Q'; + msgLen = htonl(msgLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msgLen, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, tmpQuery, strLen); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += strLen; + } + + if (isPush || isCreateSchemaPush) + pgxc_node_send_popschema(handle, isPush); + + handle->state = DN_CONNECTION_STATE_QUERY; + + return pgxc_node_flush(handle); +} +/* + * Send the Command ID down to the PGXC node + */ +int spq_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) +{ + int msglen = CMD_ID_MSG_LEN; + int i32; + + /* No need to send command ID if its sending flag is not enabled. + * Ignore sending flag if acceleration_with_compute_pool is enabled. 
+ */ + if (!IsSendCommandId() && !u_sess->attr.attr_sql.acceleration_with_compute_pool) + return 0; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + errno_t ss_rc = 0; + /* msgType + msgLen */ + ensure_out_buffer_capacity(1 + msglen, handle); + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'M'; + msglen = htonl(msglen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msglen, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + i32 = htonl(cid); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &i32, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + + return 0; +} +#endif /* * Initialize PGXCNodeHandle struct */ void init_pgxc_handle(PGXCNodeHandle *pgxc_handle) { +#ifdef USE_SPQ + return init_spq_handle(pgxc_handle); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -1388,15 +1685,16 @@ PGXCNodeConnected(NODE_CONNECTION *conn) */ void pgxc_node_free(PGXCNodeHandle *handle) { -#ifndef ENABLE_MULTIPLE_NODES +#if (!defined ENABLE_MULTIPLE_NODES) && (!defined USE_SPQ) Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return; #else - - close(handle->sock); - handle->sock = NO_SOCKET; - + if (handle->sock >= 0) { + close(handle->sock); + handle->sock = NO_SOCKET; + } + handle->gsock = GS_INVALID_GSOCK; #endif } @@ -1458,6 +1756,10 @@ pgxc_node_all_free(void) void pgxc_node_init(PGXCNodeHandle *handle, int sock) { +#ifdef USE_SPQ + return pgxc_spqnode_init(handle,sock); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -1895,6 +2197,10 @@ pgxc_node_receive(const int conn_count, PGXCNodeHandle **connections, struct timeval *timeout, bool ignoreTimeoutWarning) { +#ifdef USE_SPQ + return spq_node_receive(conn_count, connections, timeout); +#endif + #ifndef ENABLE_MULTIPLE_NODES 
Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -2150,7 +2456,198 @@ retry: pgstat_reset_waitStatePhase(oldStatus, oldPhase); return NO_ERROR_OCCURED; } - +#ifdef USE_SPQ +static bool datanode_read_data_from_logic_conn( + PGXCNodeHandle **connections, StreamNetCtl *ctl, int target, int nfds, int waitNodeId, int timeout) +{ + int i, retval, error_code; + int *datamarks = NULL; + int *poll2conn = NULL; + gsocket *gs_sock = NULL; + + /* + * for logic connection between cn and dn, we do not have StreamNetCtl + * so null pointer in this case. we just use local variable list to save the + * datamarks and gs_sock + * + */ + if (ctl != NULL) { + datamarks = ctl->layer.sctpLayer.datamarks; + poll2conn = ctl->layer.sctpLayer.poll2conn; + gs_sock = ctl->layer.sctpLayer.gs_sock; + } else { + datamarks = t_thrd.pgxc_cxt.pgxc_net_ctl->datamarks; + poll2conn = t_thrd.pgxc_cxt.pgxc_net_ctl->poll2conn; + gs_sock = t_thrd.pgxc_cxt.pgxc_net_ctl->gs_sock; + } + + /* for CN pgstat use nodeoid and for DN pgstat use nodeIdx. */ + WaitStatePhase oldPhase = pgstat_report_waitstatus_phase(PHASE_NONE, true); + WaitState oldStatus = pgstat_report_waitstatus_comm(STATE_WAIT_NODE, + IS_PGXC_COORDINATOR ? connections[target]->nodeoid : connections[target]->nodeIdx, + nfds, + waitNodeId, + global_node_definition ? 
global_node_definition->num_nodes : -1); + +retry: + + NetWorkTimePollStart(t_thrd.pgxc_cxt.GlobalNetInstr); + + retval = gs_wait_poll(gs_sock, nfds, datamarks, timeout, false); + + NetWorkTimePollEnd(t_thrd.pgxc_cxt.GlobalNetInstr); + + /* no data but wake up, check interrupts then retry */ + if (retval == 0) { + goto retry; + } else if (retval == -2) { + /* logical connection error because remote */ + for (i = 0; i < nfds; i++) { + if (datamarks[i] == -1) { + break; + } + } + + if (i < nfds) { + PGXCNodeHandle *conn = connections[poll2conn[i]]; + error_code = getStreamSocketError(gs_comm_strerror()); + ereport(WARNING, + (errcode(error_code), + errmsg("Failed to read response from node, remote:%s, detail:%s.", + conn->remoteNodeName, + gs_comm_strerror()))); + + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(conn, + "Logic connecion closed by remote,remote %s[%u], detail:%s.", + conn->remoteNodeName, + conn->nodeoid, + gs_comm_strerror()); + + pgstat_reset_waitStatePhase(oldStatus, oldPhase); + return ERROR_OCCURED; + } else { + error_code = getStreamSocketError(gs_comm_strerror()); + ereport(WARNING, + (errcode(error_code), errmsg("Failed to read response from Datanode, detail:%s", gs_comm_strerror()))); + + pgstat_reset_waitStatePhase(oldStatus, oldPhase); + return ERROR_OCCURED; + } + } else if (retval == -1) { + /* local logical connection error */ + error_code = getStreamSocketError(gs_comm_strerror()); + ereport(WARNING, + (errcode(error_code), errmsg("Failed to read response from Datanodes, detail:%s", gs_comm_strerror()))); + + pgstat_reset_waitStatePhase(oldStatus, oldPhase); + return ERROR_OCCURED; + } + + /* read data */ + for (i = 0; i < nfds; i++) { + if (datamarks[i] > 0) { + PGXCNodeHandle *conn = connections[poll2conn[i]]; + int read_status = pgxc_node_read_data_from_logic_conn(conn, true); + if (read_status == EOF || read_status < 0) { + /* Can not read - no more actions, just discard connection */ + conn->state = 
DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(conn, + "Failed to recv on logic connection, remote %s[%u], detail:%s.", + conn->remoteNodeName, + conn->nodeoid, + gs_comm_strerror()); + + /* Should we read from the other connections before returning? */ + error_code = getStreamSocketError(gs_comm_strerror()); + ereport(WARNING, + (errcode(error_code), + errmsg("Failed to read response from node, remote:%s, detail:%s.", + conn->remoteNodeName, + gs_comm_strerror()))); + + pgstat_reset_waitStatePhase(oldStatus, oldPhase); + return ERROR_OCCURED; + } + } + } + + pgstat_reset_waitStatePhase(oldStatus, oldPhase); + return NO_ERROR_OCCURED; +} + +bool datanode_receive_from_logic_conn( + const int conn_count, PGXCNodeHandle **connections, StreamNetCtl *ctl, int timeout) +{ + int i, nfds = 0; + bool is_msg_buffered = false; + int *datamarks = NULL; + int *poll2conn = NULL; + gsocket *gs_sock = NULL; + int waitNodeId = -1; + + if (conn_count == 0) + return NO_ERROR_OCCURED; + + /* + * for logic connection between cn and dn, we do not have StreamNetCtl + * so null pointer in this case. 
we just use local variable list to save the + * datamarks and gs_sock + * + */ + if (ctl != NULL) { + datamarks = ctl->layer.sctpLayer.datamarks; + poll2conn = ctl->layer.sctpLayer.poll2conn; + gs_sock = ctl->layer.sctpLayer.gs_sock; + } else { + datamarks = t_thrd.pgxc_cxt.pgxc_net_ctl->datamarks; + poll2conn = t_thrd.pgxc_cxt.pgxc_net_ctl->poll2conn; + gs_sock = t_thrd.pgxc_cxt.pgxc_net_ctl->gs_sock; + } + + int target = 0; + + is_msg_buffered = IsExistMsgBuffered(conn_count, connections); + + nfds = 0; + for (i = 0; i < conn_count; i++) { + /* If connection finished sending do not wait input from it */ + if (IsFinishedSend(i, connections)) { + continue; + } + + /* prepare select params */ + if (connections[i]->gsock.type != GSOCK_INVALID) { + target = i; + datamarks[nfds] = 0; + gs_sock[nfds] = connections[i]->gsock; + poll2conn[nfds] = i; + ++nfds; + } else { + /* flag as bad, it will be removed from the list */ + connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; + elog(WARNING, + "pgxc_node_stream_receive set DN_CONNECTION_STATE_ERROR_FATAL for node %s", + connections[i]->remoteNodeName); + } + } + + /* + * Return if we do not have connections to receive input + */ + if (nfds == 0) { + if (is_msg_buffered) + return NO_ERROR_OCCURED; + return ERROR_OCCURED; + } + + if (IS_PGXC_DATANODE && connections[target]->stream != NULL) { + waitNodeId = connections[target]->stream->ss.ps.plan->plan_node_id; + } + + return datanode_read_data_from_logic_conn(connections, ctl, target, nfds, waitNodeId, timeout); +} +#else /* * @Description: For logic connection mode. 
Wait while at least one of specified connections @@ -2171,7 +2668,7 @@ datanode_receive_from_logic_conn(const int conn_count, DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return false; } - +#endif /* * Get one character from the connection buffer and advance cursor */ @@ -2237,7 +2734,7 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out) */ char get_message(PGXCNodeHandle *conn, int *len, char **msg) { -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return '\0'; @@ -2257,7 +2754,7 @@ char get_message(PGXCNodeHandle *conn, int *len, char **msg) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("message len is too short"))); } - if (unlikely(conn->inCursor > (INT_MAX - *len))) { + if (unlikely(conn->inCursor > (size_t)(INT_MAX - *len))){ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("conn cursor overflow"))); } if ((((size_t)*len) > MaxAllocSize)) { @@ -2697,6 +3194,9 @@ ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle) void ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle) { +#ifdef USE_SPQ + return enlargeBufferSize(bytes_needed, handle->outEnd, &handle->outSize, &handle->outBuffer); +#endif #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -3535,6 +4035,26 @@ pgxc_node_send_plan_with_params(PGXCNodeHandle *handle, const char *query, int pgxc_node_flush(PGXCNodeHandle *handle) { +#ifdef USE_SPQ + while (handle->outEnd) { + if (send_some(handle, handle->outEnd) < 0) { + elog(LOG, + "send some data to %s[%u] failed, remote_host[%s], remote_port[%s].", + handle->remoteNodeName, + handle->nodeoid, + u_sess->proc_cxt.MyProcPort->remote_host, + u_sess->proc_cxt.MyProcPort->remote_port); + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(handle, + "failed to send data to %s[%u], detail:%s", + handle->remoteNodeName, + handle->nodeoid, + 
gs_comm_strerror()); + return EOF; + } + } + return 0; +#endif #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -3606,6 +4126,25 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) int pgxc_node_send_queryid(PGXCNodeHandle *handle, uint64 queryid) { +#ifdef USE_SPQ + int msglen = 12; + errno_t ss_rc = 0; + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + /* msgType + msgLen */ + ensure_out_buffer_capacity(1 + msglen, handle); + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'q'; + msglen = htonl(msglen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msglen, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &queryid, sizeof(uint64)); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += sizeof(uint64); + return 0; +#endif Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return 0; @@ -3620,6 +4159,109 @@ pgxc_node_send_queryid(PGXCNodeHandle *handle, uint64 queryid) int pgxc_node_send_unique_sql_id(PGXCNodeHandle *handle) { +#ifdef USE_SPQ + Assert(is_unique_sql_enabled()); + if (handle == NULL) { + return 1; + } + + uint64 unique_sql_id = u_sess->unique_sql_cxt.unique_sql_id; + ereport(DEBUG1, + (errmodule(MOD_INSTR), + errmsg("[UniqueSQL] unique id: %lu, send unique sql ID to %s", unique_sql_id, handle->remoteNodeName))); + + const int N32_BIT = 32; + /* + * whole msg content + * 'i' + 4 + 1 + 8 + 4 + 4 + * 'i' + msg_len + 'q'+ unique_sql_id + user oid + cn id + qid.procId + qid.queryId + qid.stamp + */ + int msg_len = sizeof(uint32) + sizeof(char) + sizeof(uint64) + sizeof(uint32) + sizeof(uint32); + if (t_thrd.proc->workingVersionNum >= SLOW_QUERY_VERSION) + msg_len += (sizeof(Oid) + sizeof(uint64) + sizeof(int64)); + + int rc; + int n32; + uint64 n64 = 0; + + WLMGeneralParam *g_wlm_params = 
&u_sess->wlm_cxt->wlm_params; + + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + ensure_out_buffer_capacity(1 + msg_len, handle); + Assert(handle->outBuffer != NULL); + /* instrumentation */ + handle->outBuffer[handle->outEnd++] = 'i'; + + /* message length, not including 'i' */ + msg_len = htonl(msg_len); + rc = memcpy_s(handle->outBuffer + handle->outEnd, sizeof(int), &msg_len, sizeof(uint32)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(uint32); + + handle->outBuffer[handle->outEnd++] = 'q'; + + /* cn id */ + n32 = htonl(u_sess->unique_sql_cxt.unique_sql_cn_id); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, sizeof(Oid)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(Oid); + + /* user id */ + n32 = htonl(u_sess->unique_sql_cxt.unique_sql_user_id); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, sizeof(Oid)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(Oid); + + /* unique sql id */ + /* high order half first */ + n32 = (uint32)(unique_sql_id >> N32_BIT); + n32 = htonl(n32); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, sizeof(uint32)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(uint32); + + /* low order half */ + n32 = (uint32)unique_sql_id; + n32 = htonl(n32); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, sizeof(uint32)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(uint32); + + if (t_thrd.proc->workingVersionNum >= SLOW_QUERY_VERSION) { + /* qid.procId */ + n32 = htonl(g_wlm_params->qid.procId); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, sizeof(Oid)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(Oid); + + /* qid.queryId */ + n64 = htonl64(g_wlm_params->qid.queryId); + rc = memcpy_s(handle->outBuffer + handle->outEnd, 
handle->outSize - handle->outEnd, &n64, sizeof(uint64)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(uint64); + + /* qid.stamp */ + n64 = htonl64(g_wlm_params->qid.stamp); + rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n64, sizeof(uint64)); + securec_check(rc, "\0", "\0"); + handle->outEnd += sizeof(uint64); + } + + if (unique_sql_id == START_TRX_UNIQUE_SQL_ID && t_thrd.postgres_cxt.debug_query_string != NULL) { + char* mask_string = maskPassword(t_thrd.postgres_cxt.debug_query_string); + if (mask_string == NULL) + mask_string = (char*)t_thrd.postgres_cxt.debug_query_string; + ereport(LOG, (errmodule(MOD_INSTR), errmsg("[UniqueSQL] send 'START TRANSACTION' to DN - (%s)", mask_string))); + + if (mask_string != t_thrd.postgres_cxt.debug_query_string) + pfree(mask_string); + } + + return 0; + +#endif Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return 0; @@ -3737,6 +4379,11 @@ int pgxc_node_send_query(PGXCNodeHandle * handle, const char *query, bool isPush, bool isCreateSchemaPush, bool trigger_ship, bool check_gtm_mode, const char* compressedPlan, int cLen) { +#ifdef USE_SPQ + return spq_node_send_query(handle, query, isPush, isCreateSchemaPush, trigger_ship, + check_gtm_mode, compressedPlan, cLen); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -3878,6 +4525,10 @@ pgxc_node_notify_commit(PGXCNodeHandle * handle) int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) { +#ifdef USE_SPQ + return spq_node_send_cmd_id(handle, cid); +#endif + #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); @@ -3931,6 +4582,137 @@ pgxc_node_send_wlm_cgroup(PGXCNodeHandle *handle) int pgxc_node_dywlm_send_params_for_jobs(PGXCNodeHandle *handle, int tag, const char *keystr) { +#ifdef USE_SPQ + int cg_len = 0; /* cgroup length */ + int sr_len = 0; /* session_respool length*/ + int ng_len = 0; /* node group length*/ + int msgLen; + errno_t 
ss_rc = 0; + WLMGeneralParam *g_wlm_params = &u_sess->wlm_cxt->wlm_params; + uint32 n32; + int procId = 0; + uint64 queryId = 0; + int flags[2] = {0}; + int dop = g_wlm_params->dopvalue; + int64 stamp = g_wlm_params->qid.stamp; + int io_priority_value = g_wlm_params->io_priority; + int iops_limit_value = g_wlm_params->iops_limits; + char *cgroup = GSCGROUP_INVALID_GROUP; + + /* + * for autovacuum work threads + * autovac_iops_limits = -1 means use default iops_limits + * autovac_iops_limits >= 0 means use seft-defined iops_limits + */ + if (IsAutoVacuumWorkerProcess() && 0 <= u_sess->attr.attr_resource.autovac_iops_limits) + iops_limit_value = u_sess->attr.attr_resource.autovac_iops_limits; + + if (!t_thrd.wlm_cxt.parctl_state.simple) + cgroup = u_sess->wlm_cxt->control_group; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + ss_rc = memcpy_s(&flags[0], sizeof(char), &g_wlm_params->cpuctrl, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + ss_rc = memcpy_s((char *)&flags[0] + sizeof(char), sizeof(char), &g_wlm_params->memtrack, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + ss_rc = memcpy_s((char *)&flags[0] + 2 * sizeof(char), sizeof(char), &g_wlm_params->iostate, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + ss_rc = memcpy_s((char *)&flags[0] + 3 * sizeof(char), sizeof(char), &g_wlm_params->iotrack, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + ss_rc = memcpy_s(&flags[1], sizeof(char), &g_wlm_params->iocontrol, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + ss_rc = memcpy_s((char *)&flags[1] + sizeof(char), sizeof(char), &t_thrd.wlm_cxt.parctl_state.simple, sizeof(char)); + securec_check(ss_rc, "\0", "\0"); + + /* msgType + msgLen */ + cg_len = strlen(cgroup) + 1; + sr_len = strlen(g_wlm_params->rpdata.rpname) + 1; + ng_len = strlen(g_wlm_params->ngroup) + 1; + + /* size + strlen */ + msgLen = 4 + cg_len + sr_len + ng_len + 4 * 8 + 8; // the extra 8 byte is 
for wlm qid + + /* msgType + msgLen */ + ensure_out_buffer_capacity(1 + msgLen, handle); + Assert(handle->outBuffer != NULL); + handle->outBuffer[handle->outEnd++] = 'W'; + msgLen = htonl(msgLen); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &msgLen, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + + procId = htonl(g_wlm_params->qid.procId); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &procId, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + queryId = htonl64(g_wlm_params->qid.queryId); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &queryId, 8); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 8; + + for (int i = 0; i < (int)(sizeof(flags) / sizeof(int)); i++) { + flags[i] = htonl(flags[i]); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &flags[i], 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + } + + dop = htonl(dop); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &dop, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + io_priority_value = htonl(io_priority_value); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &io_priority_value, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + iops_limit_value = htonl(iops_limit_value); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &iops_limit_value, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + + /* High order half first */ +#ifdef INT64_IS_BUSTED + /* don't try a right shift of 32 on a 32-bit word */ + n32 = (stamp < 0) ? 
-1 : 0; +#else + n32 = (uint32)((uint64)stamp >> 32); +#endif + n32 = htonl(n32); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + + /* Now the low order half */ + n32 = (uint32)stamp; + n32 = htonl(n32); + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, &n32, 4); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += 4; + + ss_rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, cgroup, cg_len); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += cg_len; + + ss_rc = memcpy_s( + handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, g_wlm_params->rpdata.rpname, sr_len); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += sr_len; + + ss_rc = + memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd, g_wlm_params->ngroup, ng_len); + securec_check(ss_rc, "\0", "\0"); + handle->outEnd += ng_len; + + return 0; +#endif Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return 0; @@ -4016,6 +4798,12 @@ pgxc_node_send_pgfdw(PGXCNodeHandle *handle, int tag, const char *keystr, int le int pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot, int max_push_sqls) { +#ifdef USE_SPQ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + return 0; +#endif #ifndef ENABLE_MULTIPLE_NODES Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); diff --git a/src/common/backend/pgxc_single/pool/poolmgr.cpp b/src/common/backend/pgxc_single/pool/poolmgr.cpp index 41a49323b..a0a1b099b 100644 --- a/src/common/backend/pgxc_single/pool/poolmgr.cpp +++ b/src/common/backend/pgxc_single/pool/poolmgr.cpp @@ -3487,6 +3487,28 @@ int* StreamConnectNodes(List* datanodelist, int consumerDop, int distriType, Nod */ int* StreamConnectNodes(libcommaddrinfo** addrArray, int connNum) { +#ifdef USE_SPQ + libcommaddrinfo *nodeAddr = NULL; + + Assert(connNum > 0); + 
+ int re = -1; + WaitState oldStatus = pgstat_report_waitstatus(STATE_STREAM_WAIT_CONNECT_NODES); + re = gs_connect(addrArray, connNum, -1); + pgstat_report_waitstatus(oldStatus); + if (re > 0) { + int error_index = re - 1; + nodeAddr = addrArray[error_index]; + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("Failed to connect %s, detail:%s", nodeAddr->nodename, gs_comm_strerror()))); + } else if (re < 0) { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("Failed to connect Nodes, detail:%s", gs_comm_strerror()))); + } + + return 0; +#endif Assert(false); DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return NULL; diff --git a/src/common/backend/utils/adt/numeric.cpp b/src/common/backend/utils/adt/numeric.cpp index e6ebf51bd..159e955d8 100644 --- a/src/common/backend/utils/adt/numeric.cpp +++ b/src/common/backend/utils/adt/numeric.cpp @@ -216,7 +216,6 @@ static char* get_str_from_var_sci(NumericVar* var, int rscale); static void apply_typmod(NumericVar* var, int32 typmod); static int32 numericvar_to_int32(const NumericVar* var, bool can_ignore = false); -static double numeric_to_double_no_overflow(Numeric num); static double numericvar_to_double_no_overflow(NumericVar* var); static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup); @@ -5397,7 +5396,7 @@ void int64_to_numericvar(int64 val, NumericVar* var) /* * Convert numeric to float8; if out of range, return +/- HUGE_VAL */ -static double numeric_to_double_no_overflow(Numeric num) +double numeric_to_double_no_overflow(Numeric num) { char* tmp = NULL; double val; @@ -20508,4 +20507,4 @@ void numeric_aggfn_info_change(Oid aggfn_oid, Oid *transfn_oid, Oid *transtype, { numeric_transfn_info_change(aggfn_oid, transfn_oid, transtype); numeric_finalfn_info_change(aggfn_oid, finalfn_oid); -} \ No newline at end of file +} diff --git a/src/common/backend/utils/adt/ruleutils.cpp b/src/common/backend/utils/adt/ruleutils.cpp index 3409f3513..8030a9eac 100644 --- 
a/src/common/backend/utils/adt/ruleutils.cpp +++ b/src/common/backend/utils/adt/ruleutils.cpp @@ -5204,6 +5204,10 @@ static void set_deparse_planstate(deparse_namespace* dpns, PlanState* ps) */ if (IsA(ps, AppendState)) dpns->outer_planstate = ((AppendState*)ps)->appendplans[0]; +#ifdef USE_SPQ + else if (IsA(ps, SequenceState)) + dpns->outer_planstate = ((SequenceState *) ps)->subplans[1]; +#endif else if (IsA(ps, VecAppendState)) dpns->outer_planstate = ((VecAppendState*)ps)->appendplans[0]; else if (IsA(ps, MergeAppendState)) @@ -5231,6 +5235,10 @@ static void set_deparse_planstate(deparse_namespace* dpns, PlanState* ps) */ if (IsA(ps, SubqueryScanState)) dpns->inner_planstate = ((SubqueryScanState*)ps)->subplan; +#ifdef USE_SPQ + else if (IsA(ps, SequenceState)) + dpns->inner_planstate = ((SequenceState *) ps)->subplans[0]; +#endif else if (IsA(ps, VecSubqueryScanState)) dpns->inner_planstate = ((VecSubqueryScanState*)ps)->subplan; else if (IsA(ps, CteScanState)) diff --git a/src/common/backend/utils/adt/selfuncs.cpp b/src/common/backend/utils/adt/selfuncs.cpp index f232bc7ce..a624f7b2a 100755 --- a/src/common/backend/utils/adt/selfuncs.cpp +++ b/src/common/backend/utils/adt/selfuncs.cpp @@ -183,7 +183,6 @@ static void convert_bytea_to_scalar( static double convert_one_string_to_scalar(const char* value, int rangelo, int rangehi); static double convert_one_bytea_to_scalar(unsigned char* value, int valuelen, int rangelo, int rangehi); static char* convert_string_datum(Datum value, Oid typid); -static double convert_timevalue_to_scalar(Datum value, Oid typid); static void examine_simple_variable(PlannerInfo* root, Var* var, VariableStatData* vardata); static bool get_variable_range(PlannerInfo* root, VariableStatData* vardata, Oid sortop, Datum* min, Datum* max); static bool get_actual_variable_range(PlannerInfo* root, VariableStatData* vardata, Oid sortop, Datum* min, Datum* max); @@ -4375,7 +4374,7 @@ static double convert_one_bytea_to_scalar(unsigned char* 
value, int valuelen, in /* * Do convert_to_scalar()'s work for any timevalue data type. */ -static double convert_timevalue_to_scalar(Datum value, Oid typid) +double convert_timevalue_to_scalar(Datum value, Oid typid) { switch (typid) { case TIMESTAMPOID: diff --git a/src/common/backend/utils/cache/lsyscache.cpp b/src/common/backend/utils/cache/lsyscache.cpp index 1b4fd50c3..77ee10f32 100644 --- a/src/common/backend/utils/cache/lsyscache.cpp +++ b/src/common/backend/utils/cache/lsyscache.cpp @@ -84,6 +84,14 @@ #include "optimizer/func_dependency.h" #endif +#ifdef USE_SPQ +#include "access/sysattr.h" +#include "funcapi.h" +#include "catalog/pg_inherits_fn.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_trigger.h" +#endif /* ---------- AMOP CACHES ---------- */ @@ -5509,3 +5517,920 @@ char get_typecategory(Oid typid) ReleaseSysCache(tuple); return result; } + +#ifdef USE_SPQ +/* + * get_check_constraint_relid + * Given check constraint id, return the check constraint's relation oid + */ +Oid get_check_constraint_relid(Oid oidCheckconstraint) +{ + HeapTuple tp; + + tp = SearchSysCache(CONSTROID, ObjectIdGetDatum(oidCheckconstraint), 0, 0, 0); + if (HeapTupleIsValid(tp)) { + Form_pg_constraint contup = (Form_pg_constraint)GETSTRUCT(tp); + Oid result; + + result = contup->conrelid; + ReleaseSysCache(tp); + return result; + } else + return InvalidOid; +} +/* + * get_check_constraint_oids + * Extract all check constraint oid for a given relation. 
+ */ +List *get_check_constraint_oids(Oid oidRel) +{ + List *plConstraints = NIL; + HeapTuple htup; + Relation conrel; + ScanKeyData scankey; + SysScanDesc sscan; + + /* + * lookup constraints for relation from the catalog table + * + * SELECT * FROM pg_constraint WHERE conrelid = :1 + */ + conrel = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&scankey, Anum_pg_constraint_conrelid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(oidRel)); + sscan = systable_beginscan(conrel, InvalidOid, false, NULL, 1, &scankey); + + while (HeapTupleIsValid(htup = systable_getnext(sscan))) { + Form_pg_constraint contuple = (Form_pg_constraint)GETSTRUCT(htup); + + // only consider check constraints + if (CONSTRAINT_CHECK != contuple->contype || !contuple->convalidated) { + continue; + } + + plConstraints = lappend_oid(plConstraints, HeapTupleGetOid(htup)); + } + + systable_endscan(sscan); + heap_close(conrel, AccessShareLock); + + return plConstraints; +} +/* + * get_check_constraint_expr_tree + * returns the expression node tree representing the check constraint + * with the given oidConstraint. + * + * Note: returns a palloc'd expression node tree, or NULL if no such constraint. 
+ */ +Node *get_check_constraint_expr_tree(Oid oidCheckconstraint) +{ + HeapTuple tp; + Node *result = NULL; + + tp = SearchSysCache(CONSTROID, ObjectIdGetDatum(oidCheckconstraint), 0, 0, 0); + if (HeapTupleIsValid(tp)) { + Datum conbin; + bool isnull; + + conbin = SysCacheGetAttr(CONSTROID, tp, Anum_pg_constraint_conbin, &isnull); + if (!isnull) + result = (Node *)stringToNode(TextDatumGetCString(conbin)); + + ReleaseSysCache(tp); + } + return result; +} +/* + * operator_exists + * Is there an operator with the given oid + */ +bool operator_exists(Oid oid) +{ + return SearchSysCacheExists(OPEROID, oid, 0, 0, 0); +} +/* + * relation_exists + * Is there a relation with the given oid + */ +bool relation_exists(Oid oid) +{ + return SearchSysCacheExists(RELOID, oid, 0, 0, 0); +} +/* + * get_agg_transtype + * Given aggregate id, return the aggregate transition function's result type. + */ +Oid get_agg_transtype(Oid aggid) +{ + HeapTuple tp; + Oid result; + + tp = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(aggid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for aggregate %u", aggid); + + result = ((Form_pg_aggregate)GETSTRUCT(tp))->aggtranstype; + ReleaseSysCache(tp); + return result; +} +/* + * is_agg_partial_capable + * Given aggregate id, check if it can be used in 2-phase aggregation. + * + * It must have a combine function, and if the transition type is 'internal', + * also serial/deserial functions. 
+ */ +bool is_agg_partial_capable(Oid aggid) +{ + HeapTuple aggTuple; + Form_pg_aggregate aggform; + bool result = true; + + aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(aggid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", aggid); + aggform = (Form_pg_aggregate)GETSTRUCT(aggTuple); + + ReleaseSysCache(aggTuple); + + return result; +} + +/* + * is_ordered_agg + * Given aggregate id, check if it is an ordered aggregate + */ +bool is_agg_ordered(Oid aggid) +{ + HeapTuple aggTuple; + char aggkind; + bool isnull = false; + + aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(aggid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", aggid); + + aggkind = DatumGetChar(SysCacheGetAttr(AGGFNOID, aggTuple, Anum_pg_aggregate_aggkind, &isnull)); + Assert(!isnull); + + ReleaseSysCache(aggTuple); + + return AGGKIND_IS_ORDERED_SET(aggkind); +} +/* + * get_cast_func + * finds the cast function between the given source and destination type, + * and records its oid and properties in the output parameters. + * Returns true if a cast exists, false otherwise. 
+ */ +bool get_cast_func(Oid oidSrc, Oid oidDest, bool *is_binary_coercible, Oid *oidCastFunc, CoercionPathType *pathtype) +{ + if (IsBinaryCoercible(oidSrc, oidDest)) { + *is_binary_coercible = true; + *oidCastFunc = 0; + return true; + } + + *is_binary_coercible = false; + + *pathtype = find_coercion_pathway(oidDest, oidSrc, COERCION_IMPLICIT, oidCastFunc); + if (*pathtype == COERCION_PATH_RELABELTYPE) + *is_binary_coercible = true; + if (*pathtype != COERCION_PATH_NONE) + return true; + return false; +} +/* + * type_exists + * Is there a type with the given oid + */ +bool type_exists(Oid oid) +{ + return SearchSysCacheExists(TYPEOID, oid, 0, 0, 0); +} +/* + * get_comparison_type + * Retrieve comparison type + */ +CmpType get_comparison_type(Oid oidOp) +{ + OpBtreeInterpretation *opBti; + List *opBtis; + + opBtis = get_op_btree_interpretation(oidOp); + + if (opBtis == NIL) { + /* The operator does not belong to any B-tree operator family */ + return CmptOther; + } + + /* + * XXX: Arbitrarily use the first found operator family. Usually + * there is only one, but e.g. if someone has created a reverse ordering + * family that sorts in descending order, it is ambiguous whether a + * < operator stands for the less than operator of the ascending opfamily, + * or the greater than operator for the descending opfamily. 
+ */ + opBti = (OpBtreeInterpretation*)linitial(opBtis); + + switch (opBti->strategy) { + case BTLessStrategyNumber: + return CmptLT; + case BTLessEqualStrategyNumber: + return CmptLEq; + case BTEqualStrategyNumber: + return CmptEq; + case BTGreaterEqualStrategyNumber: + return CmptGEq; + case BTGreaterStrategyNumber: + return CmptGT; + case ROWCOMPARE_NE: + return CmptNEq; + default: + elog(ERROR, "unknown B-tree strategy: %d", opBti->strategy); + return CmptOther; + } +} +/* + * index_exists + * Is there an index with the given oid + */ +bool index_exists(Oid oid) +{ + return SearchSysCacheExists(INDEXRELID, oid, 0, 0, 0); +} +bool aggregate_exists(Oid oid) +{ + return SearchSysCacheExists(AGGFNOID, oid, 0, 0, 0); +} +/* + * get_aggregate + * Get oid of aggregate with given name and argument type + */ +Oid get_aggregate(const char *aggname, Oid oidType) +{ + CatCList *catlist; + int i; + Oid oidResult; + + // lookup pg_proc for functions with the given name and arg type + catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum((char *)aggname)); + + oidResult = InvalidOid; + for (i = 0; i < catlist->n_members; i++) { + HeapTuple htup = t_thrd.lsc_cxt.FetchTupleFromCatCList(catlist, i); + Oid oidProc = HeapTupleGetOid(htup); + Form_pg_proc proctuple = (Form_pg_proc)GETSTRUCT(htup); + + // skip functions with the wrong number of type of arguments + if (1 != proctuple->pronargs || oidType != proctuple->proargtypes.values[0]) { + continue; + } + + if (SearchSysCacheExists(AGGFNOID, ObjectIdGetDatum(oidProc), 0, 0, 0)) { + oidResult = oidProc; + break; + } + } + + ReleaseSysCacheList(catlist); + + return oidResult; +} +/* + * function_exists + * Is there a function with the given oid + */ +bool function_exists(Oid oid) +{ + return SearchSysCacheExists(PROCOID, oid, 0, 0, 0); +} +/* + * check_constraint_exists + * Is there a check constraint with the given oid + */ +bool check_constraint_exists(Oid oidCheckconstraint) +{ + return SearchSysCacheExists1(CONSTROID, 
ObjectIdGetDatum(oidCheckconstraint)); +} +/* + * get_check_constraint_name + * returns the name of the check constraint with the given oidConstraint. + * + * Note: returns a palloc'd copy of the string, or NULL if no such constraint. + */ +char *get_check_constraint_name(Oid oidCheckconstraint) +{ + return get_constraint_name(oidCheckconstraint); +} +/* + * get_comparison_operator + * Retrieve comparison operator between given types + */ +Oid get_comparison_operator(Oid oidLeft, Oid oidRight, CmpType cmpt) +{ + int16 opstrat; + HeapTuple ht; + Oid result = InvalidOid; + Relation pg_amop; + ScanKeyData scankey[4]; + SysScanDesc sscan; + + switch (cmpt) { + case CmptLT: + opstrat = BTLessStrategyNumber; + break; + case CmptLEq: + opstrat = BTLessEqualStrategyNumber; + break; + case CmptEq: + opstrat = BTEqualStrategyNumber; + break; + case CmptGEq: + opstrat = BTGreaterEqualStrategyNumber; + break; + case CmptGT: + opstrat = BTGreaterStrategyNumber; + break; + default: + return InvalidOid; + } + + pg_amop = heap_open(AccessMethodOperatorRelationId, AccessShareLock); + + /* + * SELECT * FROM pg_amop + * WHERE amoplefttype = :1 and amoprighttype = :2 and amopmethod = :3 and amopstrategy = :4 + */ + ScanKeyInit(&scankey[0], Anum_pg_amop_amoplefttype, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(oidLeft)); + ScanKeyInit(&scankey[1], Anum_pg_amop_amoprighttype, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(oidRight)); + ScanKeyInit(&scankey[2], Anum_pg_amop_amopmethod, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(BTREE_AM_OID)); + ScanKeyInit(&scankey[3], Anum_pg_amop_amopstrategy, BTEqualStrategyNumber, F_INT2EQ, Int16GetDatum(opstrat)); + + /* XXX: There is no index for this, so this is slow! */ + sscan = systable_beginscan(pg_amop, InvalidOid, false, NULL, 4, scankey); + + /* XXX: There can be multiple results. 
Arbitrarily use the first one */ + while (HeapTupleIsValid(ht = systable_getnext(sscan))) { + Form_pg_amop amoptup = (Form_pg_amop)GETSTRUCT(ht); + + result = amoptup->amopopr; + break; + } + + systable_endscan(sscan); + heap_close(pg_amop, AccessShareLock); + + return result; +} +/* + * pfree_ptr_array + * Free an array of pointers, after freeing each individual element + */ +void pfree_ptr_array(char **ptrarray, int nelements) +{ + int i; + if (NULL == ptrarray) + return; + + for (i = 0; i < nelements; i++) { + if (NULL != ptrarray[i]) { + pfree(ptrarray[i]); + } + } + pfree(ptrarray); +} + +/* + * get_func_output_arg_types + * Given procedure id, return the function's output argument types + */ +List *get_func_output_arg_types(Oid funcid) +{ + HeapTuple tp; + int numargs; + Oid *argtypes = NULL; + char **argnames = NULL; + char *argmodes = NULL; + List *l_argtypes = NIL; + int i; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + + numargs = get_func_arg_info(tp, &argtypes, &argnames, &argmodes); + + if (NULL == argmodes) { + pfree_ptr_array(argnames, numargs); + if (NULL != argtypes) { + pfree(argtypes); + } + ReleaseSysCache(tp); + return NULL; + } + + for (i = 0; i < numargs; i++) { + Oid argtype = argtypes[i]; + char argmode = argmodes[i]; + + if (PROARGMODE_INOUT == argmode || PROARGMODE_OUT == argmode || PROARGMODE_TABLE == argmode) { + l_argtypes = lappend_oid(l_argtypes, argtype); + } + } + + pfree_ptr_array(argnames, numargs); + pfree(argtypes); + pfree(argmodes); + + ReleaseSysCache(tp); + return l_argtypes; +} +/* + * func_data_access + * Given procedure id, return the function's data access flag. 
+ */ +char func_data_access(Oid funcid) +{ + return PRODATAACCESS_NONE; +} + +/* + * func_exec_location + * Given procedure id, return the function's proexeclocation field + */ +char func_exec_location(Oid funcid) +{ + return PRODATAACCESS_ANY; +} + +/* + * get_func_arg_types + * Given procedure id, return all the function's argument types + */ +List *get_func_arg_types(Oid funcid) +{ + HeapTuple tp; + Form_pg_proc procstruct; + oidvector *args; + List *result = NIL; + + tp = SearchSysCache(PROCOID, ObjectIdGetDatum(funcid), 0, 0, 0); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + + procstruct = (Form_pg_proc)GETSTRUCT(tp); + args = &procstruct->proargtypes; + for (int i = 0; i < args->dim1; i++) { + result = lappend_oid(result, args->values[i]); + } + + ReleaseSysCache(tp); + return result; +} +/* + * get_type_name + * returns the name of the type with the given oid + * + * Note: returns a palloc'd copy of the string, or NULL if no such type. + */ +char *get_type_name(Oid oid) +{ + HeapTuple tp; + + tp = SearchSysCache(TYPEOID, ObjectIdGetDatum(oid), 0, 0, 0); + if (HeapTupleIsValid(tp)) { + Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp); + char *result; + + result = pstrdup(NameStr(typtup->typname)); + ReleaseSysCache(tp); + return result; + } else + return NULL; +} +/* + * get_att_stats + * Get attribute statistics. Return a copy of the HeapTuple object, or NULL + * if no stats found for attribute + * + */ +HeapTuple get_att_stats(Oid relid, AttrNumber attrnum) +{ + HeapTuple result; + + char stakind = STARELKIND_CLASS; + + /* + * This is used by ORCA, and ORCA doesn't know that there are two different kinds of stats, + * the inherited stats and the non-inherited. Use the inherited stats, i.e. stats that + * cover all the child tables, too, if available. 
+ */ + result = SearchSysCacheCopy4(STATRELATTINH, + ObjectIdGetDatum(relid), + CharGetDatum(stakind), + Int16GetDatum(attrnum), + BoolGetDatum(true)); + if (!result) + result = SearchSysCacheCopy4(STATRELATTINH, + ObjectIdGetDatum(relid), + CharGetDatum(stakind), + Int16GetDatum(attrnum), + BoolGetDatum(false)); + + return result; +} +/* + * has_subclass_slow + * + * Performs the exhaustive check whether a relation has a subclass. This is + * different from has_subclass(), in that the latter can return true if a relation. + * *might* have a subclass. See comments in has_subclass() for more details. + */ +bool has_subclass_slow(Oid relationId) +{ + ScanKeyData scankey; + Relation rel; + SysScanDesc sscan; + bool result; + + if (!has_subclass(relationId)) { + return false; + } + + rel = heap_open(InheritsRelationId, AccessShareLock); + + ScanKeyInit(&scankey, Anum_pg_inherits_inhparent, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId)); + + /* no index on inhparent */ + sscan = systable_beginscan(rel, InvalidOid, false, NULL, 1, &scankey); + + result = (systable_getnext(sscan) != NULL); + + systable_endscan(sscan); + + heap_close(rel, AccessShareLock); + + return result; +} +/* + * get_index_opfamilies + * Get the oid of operator families for the index keys + */ +List *get_index_opfamilies(Oid oidIndex) +{ + HeapTuple htup; + List *opfam_oids; + bool isnull = false; + int indnkeyatts; + Datum indclassDatum; + oidvector *indclass; + + htup = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(oidIndex)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "Index %u not found", oidIndex); + + /* + * use SysCacheGetAttr() to retrieve number of index attributes, and the oid + * vector of indclass + */ + indnkeyatts = DatumGetInt16(SysCacheGetAttr(INDEXRELID, htup, Anum_pg_index_indnkeyatts, &isnull)); + Assert(!isnull); + + indclassDatum = SysCacheGetAttr(INDEXRELID, htup, Anum_pg_index_indclass, &isnull); + if (isnull) + return NIL; + indclass = (oidvector 
*)DatumGetPointer(indclassDatum); + + opfam_oids = NIL; + for (int i = 0; i < indnkeyatts; i++) { + Oid oidOpClass = indclass->values[i]; + Oid opfam = get_opclass_family(oidOpClass); + + opfam_oids = lappend_oid(opfam_oids, opfam); + } + + ReleaseSysCache(htup); + return opfam_oids; +} +/* GPDB_12_MERGE_FIXME: only used by ORCA. Fix the callers to check + * Relation->relkind == RELKIND_PARTITIONED_TABLE instead. They should + * have the relcache entry at hand anyway. + */ +bool relation_is_partitioned(Oid relid) +{ + HeapTuple tuple; + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); + + if (HeapTupleIsValid(tuple)) { + ReleaseSysCache(tuple); + return true; + } else + return false; +} +bool index_is_partitioned(Oid relid) +{ + HeapTuple tuple; + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); + Form_pg_class pg_class_tuple = (Form_pg_class)GETSTRUCT(tuple); + ReleaseSysCache(tuple); + return pg_class_tuple->relkind == RELKIND_GLOBAL_INDEX;//todo RELKIND_PARTITIONED_INDEX; +} +/* + * get_operator_opfamilies + * Get the oid of operator families the given operator belongs to + * + * MPP calls this. + */ +List *get_operator_opfamilies(Oid opno) +{ + List *opfam_oids; + CatCList *catlist; + int i; + + opfam_oids = NIL; + + /* SELECT * FROM pg_amop WHERE amopopr = :1 */ + catlist = SearchSysCacheList1(AMOPOPID, ObjectIdGetDatum(opno)); + for (i = 0; i < catlist->n_members; i++) { + HeapTuple htup = t_thrd.lsc_cxt.FetchTupleFromCatCList(catlist, i); + Form_pg_amop amop_tuple = (Form_pg_amop)GETSTRUCT(htup); + + opfam_oids = lappend_oid(opfam_oids, amop_tuple->amopfamily); + } + + ReleaseSysCacheList(catlist); + + return opfam_oids; +} + +Oid get_compatible_hash_opfamily(Oid opno) +{ + Oid result = InvalidOid; + CatCList *catlist; + int i; + + /* + * Search pg_amop to see if the target operator is registered as the "=" + * operator of any hash opfamily. 
If the operator is registered in + * multiple opfamilies, assume we can use any one. + */ + catlist = SearchSysCacheList1(AMOPOPID, ObjectIdGetDatum(opno)); + + for (i = 0; i < catlist->n_members; i++) { + HeapTuple tuple = t_thrd.lsc_cxt.FetchTupleFromCatCList(catlist, i); + Form_pg_amop aform = (Form_pg_amop)GETSTRUCT(tuple); + + if (aform->amopmethod == HASH_AM_OID && aform->amopstrategy == HTEqualStrategyNumber) { + result = aform->amopfamily; + break; + } + } + + ReleaseSysCacheList(catlist); + + return result; +} +Oid get_compatible_legacy_hash_opfamily(Oid opno) +{ + return InvalidOid; +} +/* ---------- TRIGGER CACHE ---------- */ + +/* + * child_triggers + * Return true if the table is partitioned and any of the child partitions + * have a trigger of the given type. + */ +bool child_triggers(Oid relationId, int32 triggerType) +{ + /* GPDB_12_MERGE_FIXME */ + return false; +} + +/* + * get_trigger_type + * Given trigger id, return the trigger's type + */ +int32 get_trigger_type(Oid triggerid) +{ + Relation rel; + HeapTuple tp; + int32 result = -1; + ScanKeyData scankey; + SysScanDesc sscan; + + /* ObjectIdAttributeNumber as other systable_beginscan called */ + ScanKeyInit(&scankey, ObjectIdAttributeNumber, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(triggerid)); + rel = heap_open(TriggerRelationId, AccessShareLock); + sscan = systable_beginscan(rel, TriggerOidIndexId, true, NULL, 1, &scankey); + + tp = systable_getnext(sscan); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for trigger %u", triggerid); + + result = ((Form_pg_trigger)GETSTRUCT(tp))->tgtype; + + systable_endscan(sscan); + heap_close(rel, AccessShareLock); + + return result; +} + +/* + * trigger_enabled + * Given trigger id, return the trigger's enabled flag + */ +bool trigger_enabled(Oid triggerid) +{ + Relation rel; + HeapTuple tp; + bool result = false; + ScanKeyData scankey; + SysScanDesc sscan; + + /* ObjectIdAttributeNumber as other systable_beginscan called */ + 
ScanKeyInit(&scankey, ObjectIdAttributeNumber, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(triggerid)); + rel = heap_open(TriggerRelationId, AccessShareLock); + sscan = systable_beginscan(rel, TriggerOidIndexId, true, NULL, 1, &scankey); + + tp = systable_getnext(sscan); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for trigger %u", triggerid); + + char tgenabled = ((Form_pg_trigger)GETSTRUCT(tp))->tgenabled; + switch (tgenabled) { + case TRIGGER_FIRES_ON_ORIGIN: + /* fallthrough */ + /* + * FIXME: we should probably return false when + * SessionReplicationRole isn't SESSION_REPLICATION_ROLE_ORIGIN, + * but does that means we'll also have to flush ORCA's metadata + * cache on every assignment of session_replication_role? + */ + case TRIGGER_FIRES_ALWAYS: + result = true; + break; + case TRIGGER_FIRES_ON_REPLICA: + case TRIGGER_DISABLED: + result = false; + break; + default: + elog(ERROR, "Unknown trigger type: %c", tgenabled); + } + + systable_endscan(sscan); + heap_close(rel, AccessShareLock); + + return result; +} + +/* Does table have update triggers? */ +bool has_update_triggers(Oid relid) +{ + Relation relation; + bool result = false; + + /* Assume the caller already holds a suitable lock. */ + relation = heap_open(relid, NoLock); + + if (relation->rd_rel->relhastriggers) { + bool found = false; + + if (relation->trigdesc == NULL) + RelationBuildTriggers(relation); + + if (relation->trigdesc) { + for (int i = 0; i < relation->trigdesc->numtriggers && !found; i++) { + Trigger trigger = relation->trigdesc->triggers[i]; + found = trigger_enabled(trigger.tgoid) && + (get_trigger_type(trigger.tgoid) & TRIGGER_TYPE_UPDATE) == TRIGGER_TYPE_UPDATE; + if (found) + break; + } + } + + /* GPDB_96_MERGE_FIXME: Why is this not allowed? 
*/ + if (found || child_triggers(relation->rd_id, TRIGGER_TYPE_UPDATE)) + result = true; + } + heap_close(relation, NoLock); + + return result; +} + +bool spq_get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags) +{ + Form_pg_statistic stats = (Form_pg_statistic)GETSTRUCT(statstuple); + int i; + Datum val; + bool isnull; + ArrayType *statarray; + Oid arrayelemtype; + int narrayelem; + HeapTuple typeTuple; + Form_pg_type typeForm; + + /* initialize *sslot properly */ + memset(sslot, 0, sizeof(AttStatsSlot)); + + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) { + if ((&stats->stakind1)[i] == reqkind && (reqop == InvalidOid || (&stats->staop1)[i] == reqop)) + break; + } + if (i >= STATISTIC_NUM_SLOTS) + return false; /* not there */ + + sslot->staop = (&stats->staop1)[i]; + + /* + * XXX Hopefully-temporary hack: if stacoll isn't set, inject the default + * collation. This won't matter for non-collation-aware datatypes. For + * those that are, this covers cases where stacoll has not been set. In + * the short term we need this because some code paths involving type NAME + * do not pass any collation to prefix_selectivity and related functions. + * Even when that's been fixed, it's likely that some add-on typanalyze + * functions won't get the word right away about filling stacoll during + * ANALYZE, so we'll probably need this for awhile. + */ + + if (flags & ATTSTATSSLOT_VALUES) { + val = SysCacheGetAttr(STATRELATTINH, statstuple, Anum_pg_statistic_stavalues1 + i, &isnull); + if (isnull) + elog(ERROR, "stavalues is null"); + + /* + * Detoast the array if needed, and in any case make a copy that's + * under control of this AttStatsSlot. + */ + statarray = DatumGetArrayTypePCopy(val); + + /* + * Extract the actual array element type, and pass it back in case the + * caller needs it. 
+ */ + sslot->valuetype = arrayelemtype = ARR_ELEMTYPE(statarray); + + /* Need info about element type */ + typeTuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(arrayelemtype)); + if (!HeapTupleIsValid(typeTuple)) + elog(ERROR, "cache lookup failed for type %u", arrayelemtype); + typeForm = (Form_pg_type)GETSTRUCT(typeTuple); + + /* Deconstruct array into Datum elements; NULLs not expected */ + deconstruct_array(statarray, arrayelemtype, typeForm->typlen, typeForm->typbyval, typeForm->typalign, + &sslot->values, NULL, &sslot->nvalues); + + /* + * If the element type is pass-by-reference, we now have a bunch of + * Datums that are pointers into the statarray, so we need to keep + * that until free_attstatsslot. Otherwise, all the useful info is in + * sslot->values[], so we can free the array object immediately. + */ + if (!typeForm->typbyval) + sslot->values_arr = statarray; + else + pfree(statarray); + + ReleaseSysCache(typeTuple); + } + + if (flags & ATTSTATSSLOT_NUMBERS) { + val = SysCacheGetAttr(STATRELATTINH, statstuple, Anum_pg_statistic_stanumbers1 + i, &isnull); + if (isnull) + elog(ERROR, "stanumbers is null"); + + /* + * Detoast the array if needed, and in any case make a copy that's + * under control of this AttStatsSlot. 
+ */ + statarray = DatumGetArrayTypePCopy(val); + narrayelem = ARR_DIMS(statarray)[0]; + + /* Give caller a pointer directly into the statarray */ + sslot->numbers = (float4 *)ARR_DATA_PTR(statarray); + sslot->nnumbers = narrayelem; + + /* We'll free the statarray in free_attstatsslot */ + sslot->numbers_arr = statarray; + } + + return true; +} + +void spq_free_attstatsslot(AttStatsSlot *sslot) +{ + /* The values[] array was separately palloc'd by deconstruct_array */ + if (sslot->values) + pfree(sslot->values); + /* The numbers[] array points into numbers_arr, do not pfree it */ + /* Free the detoasted array objects, if any */ + if (sslot->values_arr) + pfree(sslot->values_arr); + if (sslot->numbers_arr) + pfree(sslot->numbers_arr); +} +#endif diff --git a/src/common/backend/utils/cache/plancache.cpp b/src/common/backend/utils/cache/plancache.cpp index efaca5e77..9c9d98993 100644 --- a/src/common/backend/utils/cache/plancache.cpp +++ b/src/common/backend/utils/cache/plancache.cpp @@ -1891,6 +1891,15 @@ CachedPlan* GetWiseCachedPlan(CachedPlanSource* plansource, /* Decide whether to use a custom plan */ customplan = ChooseCustomPlan(plansource, boundParams); + +#ifdef USE_SPQ + ListCell* qlc = NULL; + foreach (qlc, plansource->query_list) { + Query* query = castNode(Query, lfirst(qlc)); + query->is_support_spq = true; + } +#endif + if (!customplan) { if (ChooseAdaptivePlan(plansource, boundParams)) { plan = GetAdaptGenericPlan(plansource, boundParams, &qlist, &customplan); @@ -2044,6 +2053,14 @@ CachedPlan* GetCachedPlan(CachedPlanSource* plansource, ParamListInfo boundParam /* Decide whether to use a custom plan */ customplan = ChooseCustomPlan(plansource, boundParams); + +#ifdef USE_SPQ + ListCell* qlc = NULL; + foreach (qlc, plansource->query_list) { + Query* query = castNode(Query, lfirst(qlc)); + query->is_support_spq = false; + } +#endif if (!customplan) { if (CheckCachedPlan(plansource, plansource->gplan)) { diff --git 
a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index 8d365f937..94c3cd33c 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -75,12 +75,13 @@ bool will_shutdown = false; * NEXT | 92899 | ? | ? * ********************************************/ -const uint32 GRAND_VERSION_NUM = 92914; +const uint32 GRAND_VERSION_NUM = 92915; /******************************************** * 2.VERSION NUM FOR EACH FEATURE * Please write indescending order. ********************************************/ +const uint32 SPQ_VERSION_NUM = 92915; const uint32 PARTITION_ACCESS_EXCLUSIVE_LOCK_UPGRADE_VERSION = 92913; const uint32 PAGE_DIST_VERSION_NUM = 92912; const uint32 NODE_REFORM_INFO_VERSION_NUM = 92911; diff --git a/src/common/backend/utils/misc/guc/guc_network.cpp b/src/common/backend/utils/misc/guc/guc_network.cpp index 14104f334..89f1513b5 100755 --- a/src/common/backend/utils/misc/guc/guc_network.cpp +++ b/src/common/backend/utils/misc/guc/guc_network.cpp @@ -237,7 +237,11 @@ static void InitNetworkConfigureNamesBool() struct config_bool localConfigureNamesBool[] = { {{"enable_stateless_pooler_reuse", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Pooler stateless reuse mode."), NULL, @@ -250,7 +254,11 @@ static void InitNetworkConfigureNamesBool() // Stream communication {{"comm_tcp_mode", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CLIENT_CONN, gettext_noop("Whether use tcp commucation mode for stream"), NULL, @@ -262,7 +270,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_debug_mode", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Whether use libcomm debug mode for print debug information"), NULL, @@ -278,7 +290,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_stat_mode", PGC_USERSET, +#ifdef USE_SPQ + 
NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Whether use libcomm stat mode for print stat data"), NULL, @@ -294,7 +310,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_timer_mode", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Whether use libcomm timer debug mode for print timer data"), NULL, @@ -310,7 +330,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_no_delay", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Whether set NO_DELAY option for libcomm socket"), NULL, @@ -322,7 +346,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"enable_force_reuse_connections", PGC_BACKEND, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Session force reuse pooler connections."), NULL, @@ -334,7 +362,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_client_bind", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Whether client use bind function"), NULL, @@ -346,7 +378,11 @@ static void InitNetworkConfigureNamesBool() NULL}, {{"comm_ssl", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SECURITY, gettext_noop("Enables libcomm SSL connections."), NULL}, @@ -464,7 +500,11 @@ static void InitNetworkConfigureNamesInt() show_unix_socket_permissions}, {{"pooler_maximum_idle_time", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Maximum idle time of the pooler links."), NULL, @@ -479,7 +519,11 @@ static void InitNetworkConfigureNamesInt() {{"minimum_pool_size", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Initial pool size."), gettext_noop("If number of active connections decreased below this value, " @@ -494,7 +538,11 @@ static void 
InitNetworkConfigureNamesInt() // Stream communication {{"comm_sctp_port", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the STCP port the server listens on."), NULL}, @@ -508,7 +556,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_control_port", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the stream control port the server listens on."), NULL}, @@ -522,7 +574,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_quota_size", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the stream quota size in kB."), NULL, @@ -537,7 +593,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_usable_memory", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the total usable memory for communication(in kB)."), NULL, @@ -552,7 +612,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_memory_pool", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the memory pool size for communication(in kB)."), NULL, @@ -567,7 +631,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_memory_pool_percent", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Sets the percent of comm_memory_pool for dynamic workload."), NULL}, @@ -581,7 +649,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_ackchk_time", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif QUERY_TUNING, gettext_noop("Send ack check package to stream sender periodically."), NULL, @@ -692,7 +764,11 @@ static void InitNetworkConfigureNamesInt() #endif {{"comm_max_receiver", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Maximum number of 
internal receiver threads."), NULL}, @@ -719,7 +795,11 @@ static void InitNetworkConfigureNamesInt() NULL}, {{"comm_sender_buffer_size", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("The libcomm sender's buffer size in every interaction between DN and CN, " "or DN and DN, unit(KB)"), @@ -733,7 +813,11 @@ static void InitNetworkConfigureNamesInt() NULL}, {{"max_pool_size", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Max pool size."), gettext_noop("If number of active connections reaches this value, " @@ -747,7 +831,11 @@ static void InitNetworkConfigureNamesInt() NULL}, {{"pooler_port", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif COORDINATORS, gettext_noop("Legacy port of the Pool Manager. Now it is used for cn HA port for build and replication " "under thread pool mode."), @@ -762,7 +850,11 @@ static void InitNetworkConfigureNamesInt() {{"pooler_timeout", PGC_SIGHUP, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Timeout of the Pool Communication with Other Nodes."), NULL, @@ -777,7 +869,11 @@ static void InitNetworkConfigureNamesInt() {{"pooler_connect_max_loops", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Max retries of the Pooler Connecting to Other Nodes."), NULL}, @@ -791,7 +887,11 @@ static void InitNetworkConfigureNamesInt() {{"pooler_connect_interval_time", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Indicates the interval for each retry."), NULL, @@ -806,7 +906,11 @@ static void InitNetworkConfigureNamesInt() {{"pooler_connect_timeout", PGC_SIGHUP, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Timeout of the Pooler Connecting to Other Nodes."), NULL, @@ -821,7 +925,11 @@ static void InitNetworkConfigureNamesInt() 
{{"pooler_cancel_timeout", PGC_SIGHUP, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Timeout of the Pooler Cancel Connections to Other Nodes."), NULL, @@ -835,7 +943,11 @@ static void InitNetworkConfigureNamesInt() NULL}, {{"max_coordinators", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DATA_NODES, gettext_noop("Maximum number of Coordinators in the cluster."), gettext_noop("It is not possible to create more Coordinators in the cluster than " @@ -849,7 +961,11 @@ static void InitNetworkConfigureNamesInt() show_max_coordnode}, {{"comm_max_datanode", PGC_USERSET, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif DEVELOPER_OPTIONS, gettext_noop("Currently number of Datanodes."), NULL}, @@ -863,7 +979,11 @@ static void InitNetworkConfigureNamesInt() {{"comm_max_stream", PGC_POSTMASTER, +#ifdef USE_SPQ + NODE_ALL, +#else NODE_DISTRIBUTE, +#endif CONN_AUTH_SETTINGS, gettext_noop("Maximum number of streams."), NULL}, diff --git a/src/common/backend/utils/mmgr/portalmem.cpp b/src/common/backend/utils/mmgr/portalmem.cpp index 53780e8da..f170f8eba 100755 --- a/src/common/backend/utils/mmgr/portalmem.cpp +++ b/src/common/backend/utils/mmgr/portalmem.cpp @@ -105,7 +105,7 @@ typedef struct portalhashent { inline void ReleaseStreamGroup(Portal portal) { #ifndef ENABLE_MULTIPLE_NODES - if (!StreamThreadAmI()) { + if (!IS_SPQ_RUNNING && !StreamThreadAmI()) { portal->streamInfo.AttachToSession(); StreamNodeGroup::ReleaseStreamGroup(true); portal->streamInfo.Reset(); @@ -542,6 +542,14 @@ void PortalDrop(Portal portal, bool isTopCommit) u_sess->exec_cxt.isFlashBack = false; +#ifdef USE_SPQ + QueryDesc* queryDesc = portal->queryDesc; + if (!IS_SPQ_RUNNING && queryDesc != NULL && (queryDesc->plannedstmt) != NULL && + queryDesc->plannedstmt->is_spq_optmized) { + t_thrd.spq_ctx.spq_role = ROLE_QUERY_COORDINTOR; + } +#endif /* USE_SPQ */ + /* * Allow portalcmds.c to clean up the state it knows about, in 
particular * shutting down the executor if still active. This step potentially runs @@ -613,7 +621,7 @@ void PortalDrop(Portal portal, bool isTopCommit) */ #ifndef ENABLE_MULTIPLE_NODES /* autonomous transactions procedure out param portal cleaned by its parent session */ - if (portal->holdStore && !portal->isAutoOutParam) { + if (portal->holdStore && (IS_SPQ_RUNNING || !portal->isAutoOutParam)) { #else if (portal->holdStore) { #endif diff --git a/src/common/backend/utils/sort/tuplestore.cpp b/src/common/backend/utils/sort/tuplestore.cpp index dc580eae9..f498300d1 100644 --- a/src/common/backend/utils/sort/tuplestore.cpp +++ b/src/common/backend/utils/sort/tuplestore.cpp @@ -63,6 +63,9 @@ #include "utils/memprot.h" #include "utils/memutils.h" #include "utils/resowner.h" +#ifdef USE_SPQ +#include "executor/node/nodeShareInputScan.h" +#endif /* * Possible states of a Tuplestore object. These denote the states that @@ -94,6 +97,14 @@ typedef struct { off_t offset; /* byte offset in file */ } TSReadPointer; +#ifdef USE_SPQ +typedef enum { + TSHARE_NOT_SHARED, + TSHARE_WRITER, + TSHARE_READER +} TSSharedStatus; +#endif + /* * Private state of a Tuplestore operation. 
*/ @@ -180,6 +191,12 @@ struct Tuplestorestate { int planId; /* id of plan that used this state */ int dop; /* parallel num of the plan */ bool isMemCtl; /* whether context is under memory control */ +#ifdef USE_SPQ + TSSharedStatus share_status; + bool frozen; + SharedFileSet *fileset; + char *shared_filename; +#endif }; #define COPYTUP(state, tup) ((*(state)->copytup)(state, tup)) @@ -531,6 +548,14 @@ void tuplestore_end(Tuplestorestate* state) pfree_ext(state->memtuples[i]); pfree_ext(state->memtuples); } + +#ifdef USE_SPQ + if (state->share_status == TSHARE_WRITER) + BufFileDeleteShared(state->fileset, state->shared_filename); + if (state->shared_filename) + pfree(state->shared_filename); +#endif + pfree_ext(state->readptrs); pfree_ext(state); } @@ -1514,3 +1539,102 @@ int tuplestore_get_memtupcount(Tuplestorestate* state) { return state->memtupcount; } + +#ifdef USE_SPQ +/* + * tuplestore_make_shared + * + * Make a tuplestore available for sharing later. This must be called + * immediately after tuplestore_begin_heap(). + */ +void tuplestore_make_shared(Tuplestorestate *state, SharedFileSet *fileset, const char *filename) +{ + ResourceOwner oldowner; + + Assert(state->status == TSS_INMEM); + Assert(state->share_status == TSHARE_NOT_SHARED); + state->share_status = TSHARE_WRITER; + state->fileset = fileset; + state->shared_filename = pstrdup(filename); + + /* + * Switch to tape-based operation, like in tuplestore_puttuple_common(). + * We could delay this until tuplestore_freeze(), but we know we'll have + * to write everything to the file anyway, so let's not waste memory + * buffering the tuples in the meanwhile. 
+ */ + PrepareTempTablespaces(); + + /* associate the file with the store's resource owner */ + oldowner = t_thrd.utils_cxt.CurrentResourceOwner; + t_thrd.utils_cxt.CurrentResourceOwner = state->resowner; + + state->myfile = BufFileCreateShared(fileset, filename); + t_thrd.utils_cxt.CurrentResourceOwner = oldowner; + + /* + * For now, be conservative and always use trailing length words for + * cross-process tuplestores. It's important that the writer and the + * reader processes agree on this, and forcing it to true is the + * simplest way to achieve that. + */ + state->backward = true; + state->status = TSS_WRITEFILE; +} + +static void writetup_forbidden(Tuplestorestate *state, void *tup) +{ + elog(ERROR, "cannot write to tuplestore, it is already frozen"); +} + +/* + * tuplestore_freeze + * + * Flush the current buffer to disk, and forbid further inserts. This + * prepares the tuplestore for reading from a different process. + */ +void tuplestore_freeze(Tuplestorestate *state) +{ + Assert(state->share_status == TSHARE_WRITER); + Assert(!state->frozen); + dumptuples(state); + BufFileExportShared(state->myfile); + state->frozen = true; +} + +/* + * tuplestore_open_shared + * + * Open a shared tuplestore that has been populated in another process + * for reading. 
+ */ +Tuplestorestate *tuplestore_open_shared(SharedFileSet *fileset, const char *filename) +{ + Tuplestorestate *state; + int eflags; + + eflags = EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND; + + state = tuplestore_begin_common(eflags, + false /* interXact, ignored because we open existing files */, + 10 /* no need for memory buffers */); + + state->backward = true; + + state->copytup = copytup_heap; + state->writetup = writetup_forbidden; + state->readtup = readtup_heap; + + state->myfile = BufFileOpenShared(fileset, filename); + state->readptrs[0].file = 0; + state->readptrs[0].offset = 0L; + state->status = TSS_READFILE; + + state->share_status = TSHARE_READER; + state->frozen = false; + state->fileset = fileset; + state->shared_filename = pstrdup(filename); + + return state; +} +#endif diff --git a/src/gausskernel/CMakeLists.txt b/src/gausskernel/CMakeLists.txt index 21b9e9e44..c17872f3f 100755 --- a/src/gausskernel/CMakeLists.txt +++ b/src/gausskernel/CMakeLists.txt @@ -352,6 +352,13 @@ list(APPEND gaussdb_LINK_DIRS ${BOOST_LIB_PATH} ) +list(FIND MACRO_OPTIONS "-DUSE_SPQ" RET_SPQ) +if(NOT ${RET_SPQ} EQUAL -1) + if(EXISTS ${PROJECT_OPENGS_DIR}/contrib/spq_plugin) + list(APPEND gaussdb_LINK_LIBS -lxerces-c) + endif() +endif() + if(NOT "${ENABLE_LITE_MODE}" STREQUAL "ON") list(APPEND gaussdb_LINK_DIRS ${LIBOBS_LIB_PATH} diff --git a/src/gausskernel/cbb/extension/foreign/foreign.cpp b/src/gausskernel/cbb/extension/foreign/foreign.cpp index 0b7753f19..c4b8ca339 100644 --- a/src/gausskernel/cbb/extension/foreign/foreign.cpp +++ b/src/gausskernel/cbb/extension/foreign/foreign.cpp @@ -1769,3 +1769,23 @@ void AdvanceFDWUpperPlan(FDWUpperRelCxt* ufdwCxt, UpperRelationKind stage, Plan* ufdwCxt->currentRel->fdwroutine->GetForeignUpperPaths(ufdwCxt, stage, localPlan); } + +#ifdef USE_SPQ +bool rel_is_external_table(Oid relid) +{ + Form_pg_foreign_table tableform; + HeapTuple tp; + bool result; + + tp = SearchSysCache1(FOREIGNTABLEREL, ObjectIdGetDatum(relid)); + if 
(!HeapTupleIsValid(tp)) + return false; + tableform = (Form_pg_foreign_table) GETSTRUCT(tp); + + result = (tableform->ftserver == get_foreign_server_oid(GS_EXTTABLE_SERVER_NAME, false)); + + ReleaseSysCache(tp); + + return result; +} +#endif diff --git a/src/gausskernel/cbb/workload/statctl.cpp b/src/gausskernel/cbb/workload/statctl.cpp index f14ff9b3b..b32d14f56 100644 --- a/src/gausskernel/cbb/workload/statctl.cpp +++ b/src/gausskernel/cbb/workload/statctl.cpp @@ -6850,6 +6850,7 @@ void WLMInitQueryPlan(QueryDesc* queryDesc, bool isQueryDesc) #else if (!StreamThreadAmI() && #endif + !IS_SPQ_EXECUTOR && u_sess->attr.attr_resource.enable_resource_track && u_sess->exec_cxt.need_track_resource && t_thrd.shemem_ptr_cxt.mySessionMemoryEntry != NULL && t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->query_plan == NULL && isQueryDesc) { diff --git a/src/gausskernel/optimizer/commands/explain.cpp b/src/gausskernel/optimizer/commands/explain.cpp index b4dc7966b..e265fee94 100755 --- a/src/gausskernel/optimizer/commands/explain.cpp +++ b/src/gausskernel/optimizer/commands/explain.cpp @@ -360,12 +360,12 @@ void ExplainQuery( es.costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es.buffers = defGetBoolean(opt); -#ifdef ENABLE_MULTIPLE_NODES - else if (strcmp(opt->defname, "nodes") == 0) +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + else if (strcmp(opt->defname, "nodes") == 0 && IS_SPQ_RUNNING) es.nodes = defGetBoolean(opt); - else if (strcmp(opt->defname, "num_nodes") == 0) + else if (strcmp(opt->defname, "num_nodes") == 0 && IS_SPQ_RUNNING) es.num_nodes = defGetBoolean(opt); - else if (pg_strcasecmp(opt->defname, "detail") == 0) + else if (pg_strcasecmp(opt->defname, "detail") == 0 && IS_SPQ_RUNNING) es.detail = defGetBoolean(opt); #endif /* ENABLE_MULTIPLE_NODES */ else if (strcmp(opt->defname, "timing") == 0) { @@ -527,8 +527,8 @@ void ExplainQuery( } } -#ifdef ENABLE_MULTIPLE_NODES - if (u_sess->instr_cxt.global_instr != NULL) { +#if 
defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (IS_SPQ_RUNNING && u_sess->instr_cxt.global_instr != NULL) { delete u_sess->instr_cxt.global_instr; u_sess->instr_cxt.thread_instr = NULL; u_sess->instr_cxt.global_instr = NULL; @@ -1011,13 +1011,15 @@ void ExplainOnePlan( #ifdef ENABLE_MULTIPLE_NODES if (IS_PGXC_COORDINATOR && #else - if (StreamTopConsumerAmI() && + if ((IS_SPQ_COORDINATOR || StreamTopConsumerAmI()) && #endif queryDesc->plannedstmt->is_stream_plan == true && check_stream_support() && instrument_option != 0 && u_sess->instr_cxt.global_instr == NULL && queryDesc->plannedstmt->num_nodes != 0) { int dop = queryDesc->plannedstmt->query_dop; - +#ifdef USE_SPQ + MemoryContext oldContext = MemoryContextSwitchTo(u_sess->spq_cxt.spq_worker_context); +#endif u_sess->instr_cxt.global_instr = StreamInstrumentation::InitOnCn(queryDesc, dop); // u_sess->instr_cxt.thread_instr in CN @@ -1028,6 +1030,9 @@ void ExplainOnePlan( AutoContextSwitch cxtGuard(SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_OPTIMIZER)); u_sess->instr_cxt.obs_instr = New(CurrentMemoryContext) OBSInstrumentation(); } +#ifdef USE_SPQ + MemoryContextSwitchTo(oldContext); +#endif } #endif @@ -1868,6 +1873,14 @@ static void ExplainNodePartition(const Plan* plan, ExplainState* es) flag = 1; } break; +#ifdef USE_SPQ + case T_SpqSeqScan: + if (((SpqSeqScan*)plan->lefttree)->scan.pruningInfo->expr != NULL) { + appendStringInfo(es->str, "Iterations: %s", "PART"); + flag = 1; + } + break; +#endif case T_IndexScan: if (((IndexScan*)plan->lefttree)->scan.pruningInfo->expr != NULL) { appendStringInfo(es->str, "Iterations: %s", "PART"); @@ -1932,6 +1945,9 @@ static bool GetSubPartitionIterations(const Plan* plan, const ExplainState* es, /* fallthrough */ } case T_SeqScan: +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif case T_IndexScan: case T_IndexOnlyScan: case T_BitmapIndexScan: @@ -1958,20 +1974,20 @@ static bool GetSubPartitionIterations(const Plan* plan, const ExplainState* es, #ifndef 
ENABLE_MULTIPLE_NODES static void PredAppendInfo(Plan* plan, StringInfoData buf, ExplainState* es) { - if (plan->pred_total_time >= 0) { + if (es->planinfo->m_planInfo->m_pred_time && plan->pred_total_time >= 0) { initStringInfo(&buf); appendStringInfo(&buf, "%.0f", plan->pred_total_time); es->planinfo->m_planInfo->put(PREDICT_TIME, PointerGetDatum(cstring_to_text(buf.data))); pfree_ext(buf.data); } -if (plan->pred_rows >= 0) { + if (es->planinfo->m_planInfo->m_pred_row && plan->pred_rows >= 0) { es->planinfo->m_planInfo->put(PREDICT_ROWS, DirectFunctionCall1(dround, Float8GetDatum(plan->pred_rows))); } -if (plan->pred_max_memory >= 0) { + if (es->planinfo->m_planInfo->m_pred_mem && plan->pred_max_memory >= 0) { es->planinfo->m_planInfo->put(PREDICT_MEMORY, DirectFunctionCall1(pg_size_pretty, Int64GetDatum(plan->pred_max_memory))); @@ -2032,6 +2048,9 @@ static void ExplainNode( StringInfo tmpName = nullptr; bool from_datanode = false; bool old_dn_flag = false; +#ifdef USE_SPQ + int current_id_bak = es->current_id; +#endif /* For plan_table column */ char* pt_operation = NULL; @@ -2108,6 +2127,9 @@ static void ExplainNode( switch (nodeTag(plan)) { case T_SeqScan: +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif case T_CStoreScan: #ifdef ENABLE_MULTIPLE_NODES case T_TsStoreScan: @@ -2134,9 +2156,35 @@ static void ExplainNode( if (((Scan*)plan)->scanrelid > 0) ExplainScanTarget((Scan*)plan, es); break; +#ifdef USE_SPQ + case T_Stream: { + es->current_id = ((Stream*)plan)->streamID; + } + break; + + case T_ShareInputScan: { + ShareInputScan *sisc = (ShareInputScan *) plan; + int slice_id = es->current_id; + + if (es->format == EXPLAIN_FORMAT_TEXT) { + appendStringInfo(es->str, " (%s; slice%d; share%d; producer:%d)", + (sisc->is_producer ? "Producer" : "Consumer"), + slice_id, sisc->share_id, sisc->producer_slice_id); + } else { + ExplainPropertyText("Identity", (sisc->is_producer ? 
"Producer" : "Consumer"), es); + ExplainPropertyInteger("Producer ID", sisc->producer_slice_id, es); + ExplainPropertyInteger("Share ID", sisc->share_id, es); + ExplainPropertyInteger("Slice ID", slice_id, es); + } + } + break; +#endif #ifdef PGXC case T_RemoteQuery: case T_VecRemoteQuery: +#ifdef USE_SPQ + es->current_id = ((RemoteQuery*)plan)->streamID; +#endif /* Emit node execution list */ ExplainExecNodes(((RemoteQuery*)plan)->exec_nodes, es); #ifdef STREAMPLAN @@ -2244,6 +2292,11 @@ static void ExplainNode( case JOIN_RIGHT_ANTI_FULL: jointype = pt_options = "Right Anti Full"; break; +#ifdef USE_SPQ + case JOIN_LASJ_NOTIN: + jointype = "Left Anti Semi (Not-In)"; + break; +#endif default: jointype = pt_options = "?\?\?"; break; @@ -2693,6 +2746,9 @@ static void ExplainNode( show_startwith_dfx((StartWithOpState*)planstate, es); break; case T_SeqScan: +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif show_tablesample(plan, planstate, ancestors, es); if (!((SeqScan*)plan)->scanBatchMode) { show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); @@ -2927,7 +2983,17 @@ static void ExplainNode( } } break; - +#ifdef USE_SPQ + case T_AssertOp: + show_upper_qual(plan->qual, "Assert Cond", planstate, ancestors, es); + break; + case T_ShareInputScan: + show_upper_qual(plan->qual, "Shared Scan", planstate, ancestors, es); + break; + case T_Sequence: + show_upper_qual(plan->qual, "Sequence", planstate, ancestors, es); + break; +#endif default: break; } @@ -2990,8 +3056,15 @@ static void ExplainNode( #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (int j = 0; j < dop; j++) { outerCycles = 0.0; innerCycles = 0.0; @@ -3066,6 +3139,9 @@ static void ExplainNode( #ifdef 
ENABLE_MULTIPLE_NODES case T_TsStoreScan: #endif /* ENABLE_MULTIPLE_NODES */ +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -3203,6 +3279,11 @@ runnext: case T_ExtensiblePlan: ExplainExtensibleChildren((ExtensiblePlanState*)planstate, ancestors, es); break; +#ifdef USE_SPQ + case T_Sequence: + ExplainMemberNodes(((Sequence*)plan)->subplans, ((SequenceState*)planstate)->subplans, ancestors, es); + break; +#endif default: break; } @@ -3263,6 +3344,9 @@ runnext: /* 4. set cost and cardinality */ es->planinfo->m_planTableData->set_plan_table_cost_card(plan->plan_node_id, plan->total_cost, plan->plan_rows); } +#ifdef USE_SPQ + es->current_id = current_id_bak; +#endif } /* @@ -4062,7 +4146,13 @@ static void show_peak_memory(ExplainState* es, int plan_size) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(j, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", j); + } #endif if (instr != NULL) { if (!is_execute) @@ -4155,8 +4245,17 @@ static void show_dn_executor_time(ExplainState* es, int plan_node_id, ExecutorTi char* min_node_name = PGXCNodeGetNodeNameFromId(min_idx, PGXC_NODE_DATANODE); char* max_node_name = PGXCNodeGetNodeNameFromId(max_idx, PGXC_NODE_DATANODE); #else - char* min_node_name = g_instance.exec_cxt.nodeName; - char* max_node_name = g_instance.exec_cxt.nodeName; + char* min_node_name = NULL; + char* max_node_name = NULL; + if (!IS_SPQ_RUNNING) { + min_node_name = g_instance.exec_cxt.nodeName; + max_node_name = g_instance.exec_cxt.nodeName; + } else { + min_node_name = (char*)palloc0(SPQNODENAMELEN); + max_node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(min_node_name, "%d", min_idx); + sprintf(max_node_name, "%d", max_idx); + } #endif 
appendStringInfo(es->planinfo->m_query_summary->info_str, "Datanode executor %s time [%s, %s]: [%.3f ms,%.3f ms]\n", symbol_time, min_node_name, max_node_name, min_time, max_time); @@ -4197,8 +4296,15 @@ static void show_sort_info(SortState* sortstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + sortMethodId = instr->sorthashinfo.sortMethodId; spaceTypeId = instr->sorthashinfo.spaceTypeId; sortMethod = sortmessage[sortMethodId].sortName; @@ -4426,8 +4532,15 @@ static void show_llvm_info(const PlanState* planstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + if (t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL && es->planinfo->m_runtimeinfo) { es->planinfo->m_runtimeinfo->put(i, 0, LLVM_OPTIMIZATION, true); es->planinfo->m_datanodeInfo->set_plan_name(); @@ -4456,8 +4569,15 @@ static void show_llvm_info(const PlanState* planstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + ExplainPropertyText("DN Name", node_name, es); ExplainPropertyText("LLVM", "LLVM Optimized", es); ExplainCloseGroup("Plan", NULL, true, 
es); @@ -4533,8 +4653,15 @@ static void show_detail_filenum_info(const PlanState* planstate, ExplainState* e #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (j = 0; j < dop; j++) { instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id, j); if (instr != NULL && instr->nloops > 0) { @@ -4695,8 +4822,15 @@ static void show_detail_execute_info(const PlanState* planstate, ExplainState* e #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (j = 0; j < dop; j++) { instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id, j); if (instr != NULL && instr->nloops > 0) { @@ -4829,8 +4963,15 @@ static void show_hash_info(HashState* hashstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + append_datanode_name(es, node_name, 1, 0); spacePeakKb = (instr->sorthashinfo.spacePeak + 1023) / 1024; @@ -5154,8 +5295,15 @@ static void show_vechash_info(VecHashJoinState* hashstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = 
g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + spaceUsed = (instr->sorthashinfo.spaceUsed + 1023) / 1024; spillSize = (instr->sorthashinfo.spill_size + 1023) / 1024; file_num = instr->sorthashinfo.hash_FileNum; @@ -5418,8 +5566,15 @@ static void show_recursive_info(RecursiveUnionState* rustate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id); if (instr == NULL) { continue; @@ -5449,8 +5604,15 @@ static void show_recursive_info(RecursiveUnionState* rustate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + appendStringInfoSpaces(es->planinfo->m_recursiveInfo->info_str, 16); appendStringInfo(es->planinfo->m_recursiveInfo->info_str, "%s return tuples: %lu\n", @@ -5483,8 +5645,15 @@ static void show_datanode_buffers(ExplainState* es, PlanState* planstate) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); 
+ } #endif + append_datanode_name(es, node_name, dop, j); if (t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL) { @@ -6428,9 +6597,16 @@ static StreamTime* get_instrument( #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(j, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", j); + } #endif + instrument = &trackpoint->track_time; if (*first_time) { @@ -6511,8 +6687,15 @@ static void show_track_time_without_plannodeid(ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* nodename = PGXCNodeGetNodeNameFromId(0, PGXC_NODE_COORDINATOR); #else - char* nodename = g_instance.exec_cxt.nodeName; + char* nodename = NULL; + if (!IS_SPQ_RUNNING) { + nodename = g_instance.exec_cxt.nodeName; + } else { + nodename = (char*)palloc0(SPQNODENAMELEN); + sprintf(nodename, "%d", i); + } #endif + appendStringInfoSpaces(str, 6); appendStringInfo(str, " %s:", nodename); @@ -6660,8 +6843,15 @@ static void show_track_time_info(ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* nodename = PGXCNodeGetNodeNameFromId(0, PGXC_NODE_COORDINATOR); #else - char* nodename = g_instance.exec_cxt.nodeName; + char* nodename = NULL; + if (!IS_SPQ_RUNNING) { + nodename = g_instance.exec_cxt.nodeName; + } else { + nodename = (char*)palloc0(SPQNODENAMELEN); + sprintf(nodename, "%d", i); + } #endif + appendStringInfoSpaces(str, 6); appendStringInfo(str, " %s:", nodename); if (instrument->need_timer) { @@ -7074,8 +7264,15 @@ static void show_stream_send_time(ExplainState* es, const PlanState* planstate) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = 
(char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (int j = 0; j < dop; j++) { Instrumentation* instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id, j); if (instr != NULL && instr->stream_senddata.loops == true) { @@ -7138,9 +7335,18 @@ static void show_datanode_time(ExplainState* es, PlanState* planstate) char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); ThreadInstrumentation* threadinstr = u_sess->instr_cxt.global_instr->getThreadInstrumentationCN(i); #else - char* node_name = g_instance.exec_cxt.nodeName; - ThreadInstrumentation* threadinstr = u_sess->instr_cxt.global_instr->getThreadInstrumentationDN(1, 0); + char* node_name = NULL; + ThreadInstrumentation* threadinstr = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + threadinstr = u_sess->instr_cxt.global_instr->getThreadInstrumentationDN(1, 0); + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + threadinstr = u_sess->instr_cxt.global_instr->getThreadInstrumentationCN(i); + } #endif + if (threadinstr == NULL) continue; executed = threadinstr->m_instrArray[0].instr.isExecute; @@ -7907,8 +8113,15 @@ static void show_storage_filter_info(PlanState* planstate, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (j = 0; j < dop; j++) { instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id, j); if (instr == NULL) @@ -7986,8 +8199,15 @@ static void show_modifytable_merge_info(const PlanState* planstate, ExplainState #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* 
node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + append_datanode_name(es, node_name, 1, 0); if (t_thrd.explain_cxt.explain_perf_mode != EXPLAIN_NORMAL && es->planinfo->m_runtimeinfo) { @@ -8978,8 +9198,15 @@ static void showStreamnetwork(Stream* stream, ExplainState* es) #ifdef ENABLE_MULTIPLE_NODES char* node_name = PGXCNodeGetNodeNameFromId(i, PGXC_NODE_DATANODE); #else - char* node_name = g_instance.exec_cxt.nodeName; + char* node_name = NULL; + if (!IS_SPQ_RUNNING) { + node_name = g_instance.exec_cxt.nodeName; + } else { + node_name = (char*)palloc0(SPQNODENAMELEN); + sprintf(node_name, "%d", i); + } #endif + for (int j = 0; j < dop; j++) { instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, plan->plan_node_id, j); if (instr == NULL) diff --git a/src/gausskernel/optimizer/commands/indexcmds.cpp b/src/gausskernel/optimizer/commands/indexcmds.cpp index 1d43e62ea..ed07e2490 100644 --- a/src/gausskernel/optimizer/commands/indexcmds.cpp +++ b/src/gausskernel/optimizer/commands/indexcmds.cpp @@ -73,6 +73,10 @@ #include "securec.h" +#ifdef USE_SPQ +#include "access/spq_btbuild.h" +#endif + /* non-export function prototypes */ void CheckPredicate(Expr* predicate); Oid GetIndexOpClass(List* opclass, Oid attrType, const char* accessMethodName, Oid accessMethodId); @@ -2020,7 +2024,13 @@ ObjectAddress DefineIndex(Oid relationId, IndexStmt* stmt, Oid indexRelationId, * Index can now be marked valid -- update its pg_index entry */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_VALID); - +#ifdef USE_SPQ + Relation indexRelation; + indexRelation = index_open(indexRelationId, RowExclusiveLock); + if (enable_spq_btbuild(indexRelation)) + spq_btbuild_update_pg_class(rel, indexRelation); + index_close(indexRelation, NoLock); +#endif /* * The pg_index update will cause backends 
(including this one) to update * relcache entries for the index itself, but we should also send a diff --git a/src/gausskernel/optimizer/commands/portalcmds.cpp b/src/gausskernel/optimizer/commands/portalcmds.cpp index a83d2427b..672a64f35 100644 --- a/src/gausskernel/optimizer/commands/portalcmds.cpp +++ b/src/gausskernel/optimizer/commands/portalcmds.cpp @@ -289,7 +289,7 @@ void PortalCleanup(Portal portal) t_thrd.utils_cxt.CurrentResourceOwner = portal->resowner; ExecutorFinish(queryDesc); ExecutorEnd(queryDesc); -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) /* * estate is under the queryDesc, and stream threads use it. * we should wait all stream threads exit to cleanup queryDesc. @@ -299,6 +299,12 @@ void PortalCleanup(Portal portal) StreamNodeGroup::ReleaseStreamGroup(true); portal->streamInfo.Reset(); } +#else + if (t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !StreamThreadAmI()) { + portal->streamInfo.AttachToSession(); + StreamNodeGroup::ReleaseStreamGroup(true); + portal->streamInfo.Reset(); + } #endif FreeQueryDesc(queryDesc); } diff --git a/src/gausskernel/optimizer/commands/tablecmds.cpp b/src/gausskernel/optimizer/commands/tablecmds.cpp index b24bcf71a..a2f6d820b 100755 --- a/src/gausskernel/optimizer/commands/tablecmds.cpp +++ b/src/gausskernel/optimizer/commands/tablecmds.cpp @@ -28094,6 +28094,21 @@ static void ATExecSplitPartition(Relation partTableRel, AlterTableCmd* cmd) ATUnusableGlobalIndex(partTableRel); } } +#ifdef USE_SPQ +void spq_btbuild_update_pg_class(Relation heap, Relation index) +{ + List *options = NIL; + DefElem *opt; + opt = makeNode(DefElem); + opt->type = T_DefElem; + opt->defnamespace = NULL; + opt->defname = "spq_build"; + opt->defaction = DEFELEM_SET; + opt->arg = (Node *)makeString("finish"); + options = lappend(options, opt); + ATExecSetRelOptions(index, options, AT_SetRelOptions, ShareUpdateExclusiveLock); +} +#endif void CheckSrcListSubPartitionForSplit(Relation rel, Oid partOid, 
Oid subPartOid) { diff --git a/src/gausskernel/optimizer/plan/createplan.cpp b/src/gausskernel/optimizer/plan/createplan.cpp index b598f88b6..d63f85e2b 100755 --- a/src/gausskernel/optimizer/plan/createplan.cpp +++ b/src/gausskernel/optimizer/plan/createplan.cpp @@ -9250,6 +9250,12 @@ bool is_projection_capable_plan(Plan* plan) case T_MergeAppend: case T_RecursiveUnion: case T_Stream: +#ifdef USE_SPQ + case T_Motion: + case T_ShareInputScan: + case T_Sequence: + case T_PartitionSelector: +#endif return false; case T_PartIterator: @@ -10697,3 +10703,10 @@ bool is_projection_capable_path(Path *path) } return true; } + +#ifdef USE_SPQ +List* spq_make_null_eq_clause(List* joinqual, List** otherqual, List* nullinfo) +{ + return make_null_eq_clause(joinqual, otherqual, nullinfo); +} +#endif \ No newline at end of file diff --git a/src/gausskernel/optimizer/plan/planner.cpp b/src/gausskernel/optimizer/plan/planner.cpp index ad34b3abc..1433c864c 100755 --- a/src/gausskernel/optimizer/plan/planner.cpp +++ b/src/gausskernel/optimizer/plan/planner.cpp @@ -94,6 +94,9 @@ /* Hook for plugins to get control in planner() */ THR_LOCAL ndp_pushdown_hook_type ndp_pushdown_hook = NULL; +#ifdef USE_SPQ +THR_LOCAL spq_planner_hook_type spq_planner_hook = NULL; +#endif #ifndef MIN #define MIN(A, B) ((B) < (A) ? (B) : (A)) @@ -377,6 +380,12 @@ PlannedStmt* planner(Query* parse, int cursorOptions, ParamListInfo boundParams) instr_time starttime; double totaltime = 0; +#ifdef USE_SPQ + if (spq_planner_hook) { + return (*spq_planner_hook) (parse, cursorOptions, boundParams); + } +#endif + INSTR_TIME_SET_CURRENT(starttime); #ifdef PGXC @@ -3525,12 +3534,14 @@ static Plan* grouping_planner(PlannerInfo* root, double tuple_fraction) wflists, &needSecondLevelAgg, collectiveGroupExpr); -#ifdef ENABLE_MULTIPLE_NODES - /* - * grouping_tlist was modified by build_groupingsets_plan, - * we have to change tlist at the same time. 
- */ - tlist = grouping_tlist; +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (IS_SPQ_RUNNING) { + /* + * grouping_tlist was modified by build_groupingsets_plan, + * we have to change tlist at the same time. + */ + tlist = grouping_tlist; + } #endif /* Delete eq class expr after grouping */ delete_eq_member(root, tlist, collectiveGroupExpr); diff --git a/src/gausskernel/optimizer/plan/setrefs.cpp b/src/gausskernel/optimizer/plan/setrefs.cpp index cdda66282..f2e2b7431 100644 --- a/src/gausskernel/optimizer/plan/setrefs.cpp +++ b/src/gausskernel/optimizer/plan/setrefs.cpp @@ -41,6 +41,10 @@ #include "optimizer/streamplan.h" #include "optimizer/stream_remove.h" +#ifdef USE_SPQ +#include "optimizer/planmem_walker.h" +#endif + typedef struct { Index varno; /* RT index of Var */ AttrNumber varattno; /* attr number of Var */ @@ -309,6 +313,9 @@ static Plan* set_plan_refs(PlannerInfo* root, Plan* plan, int rtoffset) */ switch (nodeTag(plan)) { case T_SeqScan: +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif #ifdef ENABLE_MULTIPLE_NODES case T_TsStoreScan: #endif /* ENABLE_MULTIPLE_NODES */ @@ -647,6 +654,20 @@ static Plan* set_plan_refs(PlannerInfo* root, Plan* plan, int rtoffset) /* resconstantqual can't contain any subplan variable refs */ splan->resconstantqual = fix_scan_expr(root, splan->resconstantqual, rtoffset); } break; +#ifdef USE_SPQ + case T_Result: { + Result* splan = (Result*)plan; + + if (splan->plan.lefttree != NULL) + set_upper_references(root, plan, rtoffset); + else { + splan->plan.targetlist = fix_scan_list(root, splan->plan.targetlist, rtoffset); + splan->plan.qual = fix_scan_list(root, splan->plan.qual, rtoffset); + } + /* resconstantqual can't contain any subplan variable refs */ + splan->resconstantqual = fix_scan_expr(root, splan->resconstantqual, rtoffset); + } break; +#endif case T_ProjectSet: set_upper_references(root, plan, rtoffset); break; @@ -2791,3 +2812,47 @@ static void set_foreignscan_references(PlannerInfo *root, ForeignScan *fscan,
in fix_dfs_private_item(root, rtoffset, item); } } } + +#ifdef USE_SPQ +typedef struct { + PlannerInfo *root; + plan_tree_base_prefix base; +} spq_extract_plan_dependencies_context; + +static bool spq_extract_plan_dependencies_walker(Node *node, spq_extract_plan_dependencies_context *context) +{ + if (node == NULL) + return false; + /* Extract function dependencies and check for regclass Consts */ + fix_expr_common(context->root, node); + + return plan_tree_walker(node, (MethodWalker)spq_extract_plan_dependencies_walker, (void *)context); +} + +/* + * spq_extract_plan_dependencies() + * Given a fully built Plan tree, extract their dependencies just as + * set_plan_references() would have done. + * + * This is used to extract dependencies from a plan that has been created + * by ORCA (set_plan_references() does this usually, but ORCA doesn't use + * it). This adds the new entries directly to PlannerGlobal.relationOids + * and invalItems. + * + * Note: This recurses into SubPlans. You better still call this for + * every subplan in an overall plan, to make sure you capture dependencies + * from subplans that are not referenced from the main plan, because + * changes to the relations in eliminated subplans might require + * re-planning, too. (XXX: it would be better to not recurse into SubPlans + * here, as that's a waste of time.)
+ */ +void spq_extract_plan_dependencies(PlannerInfo *root, Plan *plan) +{ + spq_extract_plan_dependencies_context context; + + context.base.node = (Node *)(root->glob); + context.root = root; + + (void)spq_extract_plan_dependencies_walker((Node *)plan, &context); +} +#endif \ No newline at end of file diff --git a/src/gausskernel/optimizer/plan/streamplan.cpp b/src/gausskernel/optimizer/plan/streamplan.cpp index 15d2e1c80..354819530 100644 --- a/src/gausskernel/optimizer/plan/streamplan.cpp +++ b/src/gausskernel/optimizer/plan/streamplan.cpp @@ -1672,11 +1672,15 @@ Plan* create_local_redistribute(PlannerInfo* root, Plan* lefttree, List* redistr */ uint2* get_bucketmap_by_execnode(ExecNodes* exec_node, PlannedStmt* plannedstmt, int *bucketCnt) { +#ifndef USE_SPQ if (exec_node == NULL) { return NULL; } - +#endif int nodeLen = list_length(exec_node->nodeList); +#ifdef USE_SPQ + nodeLen = plannedstmt->num_nodes; +#endif if (nodeLen == 0) { return NULL; } diff --git a/src/gausskernel/optimizer/plan/streamplan_single.cpp b/src/gausskernel/optimizer/plan/streamplan_single.cpp index cc0925fad..2abfd1e75 100644 --- a/src/gausskernel/optimizer/plan/streamplan_single.cpp +++ b/src/gausskernel/optimizer/plan/streamplan_single.cpp @@ -89,6 +89,14 @@ void set_default_stream() u_sess->stream_cxt.global_obj == NULL); u_sess->opt_cxt.is_stream_support = u_sess->opt_cxt.is_stream; } +#ifdef USE_SPQ + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + u_sess->opt_cxt.is_stream_support = true; + } + if (t_thrd.spq_ctx.spq_role == ROLE_QUERY_COORDINTOR) { + u_sess->opt_cxt.is_stream = u_sess->attr.attr_sql.enable_stream_operator; + } +#endif } int2vector* get_baserel_distributekey_no(Oid relid) @@ -205,13 +213,346 @@ ExecNodes* stream_merge_exec_nodes(Plan* lefttree, Plan* righttree, bool push_no return lefttree->exec_nodes; } +#ifdef USE_SPQ +char* SpqCompressSerializedPlan(const char* plan_string, int* cLen) +{ + char* compressedPlan = NULL; + int oLen = strlen(plan_string) + 1; + 
compressedPlan = (char*)palloc0(LZ4_COMPRESSBOUND(oLen)); + *cLen = LZ4_compress_default(plan_string, compressedPlan, oLen, LZ4_compressBound(oLen)); + validate_LZ4_compress_result(*cLen, MOD_OPT, "compress serialized plan"); + return compressedPlan; +} +// Decompress the serialized plan with LZ4 compression algorithm. +// +char* SpqDecompressSerializedPlan(const char* comp_plan_string, int cLen, int oLen) +{ + char* serializedPlan = (char*)palloc0(oLen); + int returnLen = LZ4_decompress_safe(comp_plan_string, serializedPlan, cLen, oLen); + + if (returnLen < 0) { + ereport(ERROR, + (errmodule(MOD_OPT), + errcode(ERRCODE_DATA_CORRUPTED), + errmsg("LZ4 decompressing serialize plan failed, decompressing result %d", returnLen))); + } + + if (returnLen != oLen) { + ereport(ERROR, + (errmodule(MOD_OPT), + errcode(ERRCODE_OPTIMIZER_INCONSISTENT_STATE), + errmsg("LZ4 decompressing serialize plan failed, returnLen not equal with oLen."))); + } + if (strlen(serializedPlan) + 1 != (uint32)oLen) { + ereport(ERROR, + (errmodule(MOD_OPT), + errcode(ERRCODE_OPTIMIZER_INCONSISTENT_STATE), + errmsg("LZ4 decompressing serialize plan failed, length of serializedPlan not euqal with oLen."))); + } + + return serializedPlan; +} +static bool IsModifyTable(PlannedStmt* planned_stmt, Plan* node) +{ + /* + * fix plan on enable_force_vector_engine = on + * ->Vector Streaming (type: GATHER) + * ->Vector Adapter + * -> ModifyTable + */ + if (IsA(node, RowToVec) || IsA(node, VecToRow)) + node = node->lefttree; + + if (IsA(node, ModifyTable) || IsA(node, VecModifyTable) || + (CMD_SELECT != planned_stmt->commandType && IsModifyTableForDfsTable(node))) + return true; + + return false; +} +/* + * @Description: get all referenced subplans from current plan. 
+ * + * @in result_plan: current plan + * @in/out context: the context the current plan reference all subplan info + */ +void set_node_ref_subplan_walker(Plan* result_plan, set_node_ref_subplan_context* context) +{ + if (NULL == result_plan) + return; + + ListCell* lc = NULL; + List* subplan_list = check_subplan_list(result_plan); /* find all subplan exprs from main plan */ + + foreach (lc, subplan_list) { + Node* pnode = (Node*)lfirst(lc); + SubPlan* subplan = NULL; + Plan* plan = NULL; + + if (IsA(pnode, SubPlan)) { + subplan = (SubPlan*)lfirst(lc); + /* this is for the case that initplan hidden in testexpr of subplan */ + subplan_list = list_concat(subplan_list, check_subplan_expr(subplan->testexpr)); + } else { + AssertEreport(IsA(pnode, Param), MOD_OPT, "The current node is not a param node"); + Param* param = (Param*)pnode; + ListCell* lc2 = NULL; + foreach (lc2, context->org_initPlan) { + subplan = (SubPlan*)lfirst(lc2); + if (list_member_int(subplan->setParam, param->paramid)) + break; + } + if (subplan == NULL || lc2 == NULL) + continue; + } + + plan = (Plan*)list_nth(context->org_subplans, subplan->plan_id - 1); + set_node_ref_subplan_walker(plan, context); + + /* We should serialize the subplans only if current node reference the subplan */ + context->subplan_plan_ids = lappend_int(context->subplan_plan_ids, subplan->plan_id); + } + + switch (nodeTag(result_plan)) { + case T_Append: + case T_VecAppend: { + Append* append = (Append*)result_plan; + ListCell* lc3 = NULL; + foreach (lc3, append->appendplans) { + Plan* plan = (Plan*)lfirst(lc3); + set_node_ref_subplan_walker(plan, context); + } + } break; + case T_ModifyTable: + case T_VecModifyTable: { + ModifyTable* mt = (ModifyTable*)result_plan; + ListCell* lc4 = NULL; + foreach (lc4, mt->plans) { + Plan* plan = (Plan*)lfirst(lc4); + set_node_ref_subplan_walker(plan, context); + } + } break; + case T_SubqueryScan: + case T_VecSubqueryScan: { + SubqueryScan* ss = (SubqueryScan*)result_plan; + if 
(ss->subplan) + set_node_ref_subplan_walker(ss->subplan, context); + } break; + case T_MergeAppend: { + MergeAppend* ma = (MergeAppend*)result_plan; + ListCell* lc5 = NULL; + foreach (lc5, ma->mergeplans) { + Plan* plan = (Plan*)lfirst(lc5); + set_node_ref_subplan_walker(plan, context); + } + } break; + case T_BitmapAnd: + case T_CStoreIndexAnd: { + BitmapAnd* ba = (BitmapAnd*)result_plan; + ListCell* lc6 = NULL; + foreach (lc6, ba->bitmapplans) { + Plan* plan = (Plan*)lfirst(lc6); + set_node_ref_subplan_walker(plan, context); + } + } break; + + case T_BitmapOr: + case T_CStoreIndexOr: { + BitmapOr* bo = (BitmapOr*)result_plan; + ListCell* lc7 = NULL; + foreach (lc7, bo->bitmapplans) { + Plan* plan = (Plan*)lfirst(lc7); + set_node_ref_subplan_walker(plan, context); + } + } break; + case T_ExtensiblePlan: { + ListCell* lc8 = NULL; + foreach(lc8, ((ExtensiblePlan*)result_plan)->extensible_plans) { + set_node_ref_subplan_walker((Plan*)lfirst(lc8), context); + } + } break; +#ifdef USE_SPQ + case T_Sequence: { + Sequence* sequence = (Sequence*)result_plan; + ListCell* lc9 = NULL; + foreach(lc9, sequence->subplans) { + Plan* plan = (Plan*)lfirst(lc9); + set_node_ref_subplan_walker(plan, context); + } + } break; +#endif + default: { + if (result_plan->lefttree) + set_node_ref_subplan_walker(result_plan->lefttree, context); + + if (result_plan->righttree) + set_node_ref_subplan_walker(result_plan->righttree, context); + } break; + } + + return; +} +static void set_node_ref_subplan(Plan* plan, PlannedStmt* planned_stmt, PlannedStmt* ship_planned_stmt) +{ + List *ret_subplans = NIL, *ret_initPlans = NIL; + ListCell *lc1 = NULL, *lc2 = NULL; + set_node_ref_subplan_context context; + + /* Construct context members. */ + context.org_subplans = planned_stmt->subplans; + context.org_initPlan = planned_stmt->initPlan; + context.subplan_plan_ids = NIL; + + set_node_ref_subplan_walker(plan, &context); + + /* Set non-referenced subplans of the current plan as NULL. 
*/ + foreach (lc2, planned_stmt->subplans) { + if (NIL == context.subplan_plan_ids) + ret_subplans = lappend(ret_subplans, NULL); + else { + Plan* tmp_plan = NULL; + + foreach (lc1, context.subplan_plan_ids) { + int planid = lfirst_int(lc1); + tmp_plan = (Plan*)list_nth(planned_stmt->subplans, planid - 1); + + if (((Plan*)lfirst(lc2))->plan_node_id == tmp_plan->plan_node_id) + break; + } + + if (NULL == lc1) + ret_subplans = lappend(ret_subplans, NULL); + else + ret_subplans = lappend(ret_subplans, tmp_plan); + } + } + + /* Set non-referenced initPlans of the current plan as NULL. */ + foreach (lc2, planned_stmt->initPlan) { + if (NIL == context.subplan_plan_ids) + ret_initPlans = lappend(ret_initPlans, NULL); + else { + SubPlan* subplan = (SubPlan*)lfirst(lc2); + if (list_member_int(context.subplan_plan_ids, subplan->plan_id)) + ret_initPlans = lappend(ret_initPlans, subplan); + else + ret_initPlans = lappend(ret_initPlans, NULL); + } + } + + ship_planned_stmt->subplans = ret_subplans; + ship_planned_stmt->initPlan = ret_initPlans; +} + +/* + * Serialized the plan tree to string + */ +void SpqSerializePlan(Plan* node, PlannedStmt* planned_stmt, StringInfoData* str, + int num_stream, int num_gather, bool push_subplan, uint64 queryId) +{ + PlannedStmt* ShipPlannedStmt = NULL; + ShipPlannedStmt = makeNode(PlannedStmt); + + if (planned_stmt->commandType != CMD_SELECT && IsModifyTable(planned_stmt, node)) { + ShipPlannedStmt->commandType = planned_stmt->commandType; + ShipPlannedStmt->hasReturning = planned_stmt->hasReturning; + } else { + ShipPlannedStmt->commandType = CMD_SELECT; + ShipPlannedStmt->hasReturning = false; + } + + ShipPlannedStmt->queryId = queryId; + ShipPlannedStmt->spq_session_id = planned_stmt->spq_session_id; + ShipPlannedStmt->current_id = planned_stmt->current_id; + ShipPlannedStmt->hasModifyingCTE = planned_stmt->hasModifyingCTE; + ShipPlannedStmt->canSetTag = planned_stmt->canSetTag; + ShipPlannedStmt->transientPlan = 
planned_stmt->transientPlan; + ShipPlannedStmt->dependsOnRole = planned_stmt->dependsOnRole; + ShipPlannedStmt->planTree = node; + ShipPlannedStmt->rtable = planned_stmt->rtable; + /* data redistribution for DFS table. */ + ShipPlannedStmt->dataDestRelIndex = planned_stmt->dataDestRelIndex; + ShipPlannedStmt->MaxBloomFilterNum = planned_stmt->MaxBloomFilterNum; + ShipPlannedStmt->query_mem[0] = planned_stmt->query_mem[0]; + ShipPlannedStmt->assigned_query_mem[0] = planned_stmt->assigned_query_mem[0]; + ShipPlannedStmt->assigned_query_mem[1] = planned_stmt->assigned_query_mem[1]; + + /* + * Currently, when delete/update operator applied Dfs table, the append + * plan node will be pushed down, so set ShipPlannedStmt->resultRelations is + * planned_stmt->resultRelations. + */ + if (IsModifyTable(planned_stmt, node)) + ShipPlannedStmt->resultRelations = planned_stmt->resultRelations; + else + ShipPlannedStmt->resultRelations = NIL; + + ShipPlannedStmt->utilityStmt = planned_stmt->utilityStmt; + + /* If have subplan, we should set non-referenced subplans of the current plan as NULL, so we don't serialized them + */ + if (push_subplan) { + bool with_recursive = ContainRecursiveUnionSubplan(planned_stmt); + if (planned_stmt->subplans && !with_recursive) { + set_node_ref_subplan(node, planned_stmt, ShipPlannedStmt); + } else { + ShipPlannedStmt->subplans = planned_stmt->subplans; + ShipPlannedStmt->initPlan = planned_stmt->initPlan; + } + } + + ShipPlannedStmt->subplan_ids = planned_stmt->subplan_ids; + ShipPlannedStmt->rewindPlanIDs = planned_stmt->rewindPlanIDs; + ShipPlannedStmt->rowMarks = planned_stmt->rowMarks; + ShipPlannedStmt->relationOids = planned_stmt->relationOids; + ShipPlannedStmt->invalItems = planned_stmt->invalItems; + ShipPlannedStmt->nParamExec = planned_stmt->nParamExec; + ShipPlannedStmt->num_streams = num_stream; + ShipPlannedStmt->gather_count = num_gather; + ShipPlannedStmt->num_nodes = planned_stmt->num_nodes; + ShipPlannedStmt->nodesDefinition = 
planned_stmt->nodesDefinition; + /* We don't send instrument option to datanode for un-stream plan. + * For un-stream plan, we can not finalize node id and parent node id for result plan. + */ + /* IS_PGXC_DATANODE means in DWS DN, in_compute_pool means in CN of the compute pool. */ + if (IS_STREAM_PLAN || IS_PGXC_DATANODE || planned_stmt->in_compute_pool) { + ShipPlannedStmt->instrument_option = planned_stmt->instrument_option; + } else + ShipPlannedStmt->instrument_option = 0; + + ShipPlannedStmt->num_plannodes = planned_stmt->num_plannodes; + ShipPlannedStmt->query_string = planned_stmt->query_string; + ShipPlannedStmt->in_compute_pool = planned_stmt->in_compute_pool; + ShipPlannedStmt->has_obsrel = planned_stmt->has_obsrel; + ShipPlannedStmt->num_bucketmaps = planned_stmt->num_bucketmaps; + ShipPlannedStmt->query_dop = planned_stmt->query_dop; + + appendStringInfoChar(str, FLAG_SERIALIZED_PLAN); // Flag to indicate it's serialized plan + for (int i = 0; i < ShipPlannedStmt->num_bucketmaps; i++) { + ShipPlannedStmt->bucketMap[i] = planned_stmt->bucketMap[i]; + ShipPlannedStmt->bucketCnt[i] = planned_stmt->bucketCnt[i]; + } + + /* not ship planB */ + ShipPlannedStmt->ng_num = planned_stmt->ng_num; + ShipPlannedStmt->ng_queryMem = planned_stmt->ng_queryMem; + + appendStringInfoString(str, nodeToString(ShipPlannedStmt)); +} +#endif + char* CompressSerializedPlan(const char* plan_string, int* cLen) { +#ifdef USE_SPQ + return SpqCompressSerializedPlan(plan_string, cLen); +#endif DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return NULL; } char* DecompressSerializedPlan(const char* comp_plan_string, int cLen, int oLen) { +#ifdef USE_SPQ + return SpqDecompressSerializedPlan(comp_plan_string, cLen, oLen); +#endif DISTRIBUTED_FEATURE_NOT_SUPPORTED(); return NULL; } diff --git a/src/gausskernel/optimizer/plan/streamplan_utils.cpp b/src/gausskernel/optimizer/plan/streamplan_utils.cpp index ff939c07d..8e2499a43 100755 --- a/src/gausskernel/optimizer/plan/streamplan_utils.cpp +++ 
b/src/gausskernel/optimizer/plan/streamplan_utils.cpp @@ -1652,6 +1652,19 @@ void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, *parent_node_id = save_parent_id; } } break; +#ifdef USE_SPQ + case T_Sequence: { + Sequence* sequence = (Sequence*)result_plan; + ListCell* lc = NULL; + foreach(lc, sequence->subplans) { + Plan* plan = (Plan*)lfirst(lc); + finalize_node_id(plan, plan_node_id, parent_node_id, num_streams, num_plannodes, total_num_streams, + max_push_sql_num, gather_count, subplans, subroots, initplans, subplan_ids, false, + is_under_ctescan, is_data_node_exec, is_read_only, node_group_info_context); + *parent_node_id = save_parent_id; + } + } break; +#endif case T_CteScan: { if (STREAM_RECURSIVECTE_SUPPORTED) { CteScan* cte_plan = (CteScan*)result_plan; @@ -1864,7 +1877,7 @@ void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, * (3) For initplan, if main query is from cn, just gather the subplan * (4) vectorize subplan and set plan references */ - if (IS_STREAM_PLAN) { + if ((IS_SPQ_COORDINATOR && list_length(subplans) > 0 ) || (!IS_SPQ_RUNNING && IS_STREAM_PLAN)) { /* Block pushing down Random()/GS_ENCRYPT_AES128() in Replicated plan temporarily */ if (is_replicated_plan(result_plan) && is_execute_on_multinodes(result_plan)) { List* nodelist = check_random_expr(result_plan); @@ -1922,7 +1935,7 @@ void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, subplan_ids[subplan->plan_id] = subplan_ids[0]; if (!has_finalized) { -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) /* * subplan on dn and main plan on cn. In such case, we only * support initplan, and gather the result to cn. @@ -1930,6 +1943,7 @@ void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, * single no need to consider this situation, because subplan * and the node contains subplan will not parallel. 
*/ + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { if (is_execute_on_coordinator(result_plan) || (is_execute_on_allnodes(result_plan) && !is_data_node_exec)) { Plan* child_plan = NULL; @@ -1999,8 +2013,13 @@ void finalize_node_id(Plan* result_plan, int* plan_node_id, int* parent_node_id, /* Push only nodelist but not entire exec_nodes here. */ pushdown_execnodes(plan, result_plan->exec_nodes, false, true); } + } #endif +#ifdef USE_SPQ + if (check_stream_support() && t_thrd.spq_ctx.spq_role == ROLE_UTILITY) { +#else if (check_stream_support()) { +#endif PlannerInfo* subroot = NULL; Plan* child_root = NULL; ListCell* lr = NULL; diff --git a/src/gausskernel/optimizer/plan/subselect.cpp b/src/gausskernel/optimizer/plan/subselect.cpp index 8c69659ae..74a3cfcb2 100644 --- a/src/gausskernel/optimizer/plan/subselect.cpp +++ b/src/gausskernel/optimizer/plan/subselect.cpp @@ -3309,6 +3309,9 @@ static Bitmapset* finalize_plan(PlannerInfo* root, Plan* plan, Bitmapset* valid_ case T_Stream: case T_PartIterator: case T_StartWithOp: +#ifdef USE_SPQ + case T_ShareInputScan: +#endif break; default: diff --git a/src/gausskernel/optimizer/prep/prepjointree.cpp b/src/gausskernel/optimizer/prep/prepjointree.cpp index 18838537d..9ec5043e9 100755 --- a/src/gausskernel/optimizer/prep/prepjointree.cpp +++ b/src/gausskernel/optimizer/prep/prepjointree.cpp @@ -1358,6 +1358,12 @@ static Node* pull_up_simple_subquery(PlannerInfo* root, Node* jtnode, RangeTblEn case RTE_RESULT: case RTE_REMOTE_DUMMY: /* these can't contain any lateral references */ +#ifdef USE_SPQ + case RTE_NAMEDTUPLESTORE: + case RTE_TABLEFUNC: /* TableFunc(.., column list) */ + case RTE_VOID: /* SPQ: deleted RTE */ + case RTE_TABLEFUNCTION: /* SPQ: Functions over multiset input */ +#endif break; } } @@ -3867,4 +3873,4 @@ static bool is_safe_pull_up_sublink_having(PlannerInfo* root) bms_free(level_up_varnos); list_free_ext(sublinkList); return true; -} \ No newline at end of file +} diff --git 
a/src/gausskernel/optimizer/rewrite/rewriteManip.cpp b/src/gausskernel/optimizer/rewrite/rewriteManip.cpp index 9f3eb324a..6123167c2 100644 --- a/src/gausskernel/optimizer/rewrite/rewriteManip.cpp +++ b/src/gausskernel/optimizer/rewrite/rewriteManip.cpp @@ -688,6 +688,9 @@ static Relids adjust_relid_set(Relids relids, int oldrelid, int newrelid) typedef struct { int delta_sublevels_up; int min_sublevels_up; +#ifdef USE_SPQ + bool ignore_min_sublevels_up; +#endif } IncrementVarSublevelsUp_context; static bool IncrementVarSublevelsUp_walker(Node* node, IncrementVarSublevelsUp_context* context) @@ -1396,3 +1399,21 @@ Node* ReplaceVarsFromTargetList(Node* node, int target_varno, int sublevels_up, return replace_rte_variables( node, target_varno, sublevels_up, ReplaceVarsFromTargetList_callback, (void*)&context, outer_hasSubLinks); } + +#ifdef USE_SPQ +void SpqIncrementVarSublevelsUpInTransformGroupedWindows(Node *node, int delta_sublevels_up, int min_sublevels_up) +{ + IncrementVarSublevelsUp_context context; + + context.delta_sublevels_up = delta_sublevels_up; + context.min_sublevels_up = min_sublevels_up; + context.ignore_min_sublevels_up = false; + + /* + * Must be prepared to start with a Query or a bare expression tree; if + * it's a Query, we don't want to increment sublevels_up. + */ + (void)query_or_expression_tree_walker + (node, (bool (*)())IncrementVarSublevelsUp_walker, (void *)&context, QTW_EXAMINE_RTES); +} +#endif \ No newline at end of file diff --git a/src/gausskernel/optimizer/util/clauses.cpp b/src/gausskernel/optimizer/util/clauses.cpp index 940a9e132..dd2e0bb1f 100644 --- a/src/gausskernel/optimizer/util/clauses.cpp +++ b/src/gausskernel/optimizer/util/clauses.cpp @@ -5468,3 +5468,109 @@ List *get_quals_lists(Node *jtnode) return quallist; } +#ifdef USE_SPQ + +/* + * fold_constants + * + * Recurses into query tree and folds all constant expressions. 
+ */ +Query *fold_constants(PlannerInfo *root, Query *q, ParamListInfo boundParams, Size max_size) +{ + eval_const_expressions_context context; + + context.root = root; + context.boundParams = boundParams; + context.active_fns = NIL; /* nothing being recursively simplified */ + context.case_val = NULL; /* no CASE being examined */ + context.estimate = false; /* safe transformations only */ + context.recurse_queries = true; /* recurse into query structures */ + context.recurse_sublink_testexpr = false; /* do not recurse into sublink test expressions */ + + context.max_size = max_size; + + return (Query *) query_or_expression_tree_mutator( + (Node *) q, + (Node* (*)(Node*, void*)) eval_const_expressions_mutator, + &context,0); +} + +/* + * flatten_join_alias_var_optimizer + * Replace Vars that reference JOIN outputs with references to the original + * relation variables instead. + */ +Query * flatten_join_alias_var_optimizer(Query *query, int queryLevel) +{ + Query *queryNew = (Query *) copyObject(query); + + /* + * Flatten join alias for expression in + * 1. targetlist + * 2. returningList + * 3. having qual + * 4. scatterClause + * 5. limit offset + * 6. limit count + * + * We flatten the above expressions since these entries may be moved during the query + * normalization step before algebrization. In contrast, the planner flattens alias + * inside quals to allow predicates involving such vars to be pushed down. + * + * Here we ignore the flattening of quals due to the following reasons: + * 1. we assume that the function will be called before Query->DXL translation: + * 2. the quals never gets moved from old query to the new top-level query in the + * query normalization phase before algebrization. In other words, the quals hang of + * the same query structure that is now the new derived table. + * 3. the algebrizer can resolve the abiquity of join aliases in quals since we maintain + * all combinations of to DXL-ColId during Query->DXL translation. 
+ * + */ + + return queryNew; +} + +Expr *transform_array_Const_to_ArrayExpr(Const *c) +{ + Oid elemtype; + int16 elemlen; + bool elembyval; + char elemalign; + int nelems; + Datum *elems; + bool *nulls; + ArrayType *ac; + ArrayExpr *aexpr; + int i; + + Assert(IsA(c, Const)); + + /* Does it look like the right kind of an array Const? */ + if (c->constisnull) + return (Expr *)c; /* NULL const */ + + elemtype = get_element_type(c->consttype); + if (elemtype == InvalidOid) + return (Expr *)c; /* not an array */ + + ac = DatumGetArrayTypeP(c->constvalue); + nelems = ArrayGetNItems(ARR_NDIM(ac), ARR_DIMS(ac)); + + /* All set, extract the elements, and an ArrayExpr to hold them. */ + get_typlenbyvalalign(elemtype, &elemlen, &elembyval, &elemalign); + deconstruct_array(ac, elemtype, elemlen, elembyval, elemalign, &elems, &nulls, &nelems); + + aexpr = makeNode(ArrayExpr); + aexpr->array_typeid = c->consttype; + aexpr->element_typeid = elemtype; + aexpr->multidims = false; + aexpr->location = c->location; + + for (i = 0; i < nelems; i++) { + aexpr->elements = + lappend(aexpr->elements, makeConst(elemtype, -1, c->constcollid, elemlen, elems[i], nulls[i], elembyval)); + } + + return (Expr *)aexpr; +} +#endif diff --git a/src/gausskernel/optimizer/util/learn/encoding.cpp b/src/gausskernel/optimizer/util/learn/encoding.cpp index d5307d283..0a539cfe3 100644 --- a/src/gausskernel/optimizer/util/learn/encoding.cpp +++ b/src/gausskernel/optimizer/util/learn/encoding.cpp @@ -48,7 +48,11 @@ typedef struct { char* strategy; } OperationInfo; +#ifdef USE_SPQ +const unsigned int G_MAX_OPERATION_NUMBER = 66; +#else const unsigned int G_MAX_OPERATION_NUMBER = 65; +#endif const OperationInfo G_OPERATION_INFO_TABLE[G_MAX_OPERATION_NUMBER] = { {T_BaseResult, TEXT_OPTNAME_RESULT, ""}, @@ -71,6 +75,9 @@ const OperationInfo G_OPERATION_INFO_TABLE[G_MAX_OPERATION_NUMBER] = { {T_VecHashJoin, TEXT_OPTNAME_JOIN, TEXT_STRATEGY_JOIN_HASH}, {T_CStoreScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_SEQ}, 
{T_SeqScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_SEQ}, +#ifdef USE_SPQ + {T_SpqSeqScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_SEQ}, +#endif {T_IndexScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_INDEX}, {T_CStoreIndexScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_INDEX}, {T_IndexOnlyScan, TEXT_OPTNAME_SCAN, TEXT_STRATEGY_SCAN_INDEX_ONLY}, diff --git a/src/gausskernel/optimizer/util/optcommon.cpp b/src/gausskernel/optimizer/util/optcommon.cpp index 6f93ad369..95041f77e 100755 --- a/src/gausskernel/optimizer/util/optcommon.cpp +++ b/src/gausskernel/optimizer/util/optcommon.cpp @@ -31,6 +31,11 @@ void GetPlanNodePlainText( RemoteQuery* rq = NULL; char* extensible_name = NULL; switch (nodeTag(plan)) { +#ifdef USE_SPQ + case T_Result: + *pname = *sname = *pt_operation = "SPQ Result"; + break; +#endif case T_BaseResult: *pname = *sname = *pt_operation = "Result"; break; @@ -125,6 +130,33 @@ void GetPlanNodePlainText( } } break; +#ifdef USE_SPQ + case T_SpqSeqScan: + *pt_operation = "TABLE ACCESS"; + if (!((Scan*)plan)->tablesample) { + if (((Scan*)plan)->isPartTbl) { + *pname = *sname = *pt_options = "Partitioned Seq Scan"; + } else { + *pname = *sname = *pt_options = "Spq Seq Scan"; + } + } else { + if (((Scan*)plan)->isPartTbl) { + *pname = *sname = *pt_options = "Partitioned Sample Scan"; + } else { + *pname = *sname = *pt_options = "Spq Sample Scan"; + } + } + break; + case T_AssertOp: + *pname = *sname = "Assert"; + break; + case T_ShareInputScan: + *pname = *sname = *pt_options = "ShareInputScan"; + break; + case T_Sequence: + *pname = *sname = *pt_options = "Sequence"; + break; +#endif case T_CStoreScan: *pt_operation = "TABLE ACCESS"; if (!((Scan*)plan)->tablesample) { diff --git a/src/gausskernel/optimizer/util/plancat.cpp b/src/gausskernel/optimizer/util/plancat.cpp index 900c01264..2c6a401d9 100755 --- a/src/gausskernel/optimizer/util/plancat.cpp +++ b/src/gausskernel/optimizer/util/plancat.cpp @@ -61,6 +61,9 @@ #ifdef PGXC #include "pgxc/pgxc.h" #endif +#ifdef 
USE_SPQ +#include "catalog/pg_inherits_fn.h" +#endif #define ESTIMATE_PARTITION_NUMBER 10 #define ESTIMATE_PARTITION_NUMBER_THRESHOLD 5 @@ -2081,3 +2084,41 @@ PlannerInfo *get_cte_root(PlannerInfo *root, int levelsup, char *ctename) } return cteroot; } + +#ifdef USE_SPQ +double spq_estimate_partitioned_numtuples(Relation rel) +{ + List *inheritors; + ListCell *lc; + double totaltuples; + + if (rel->rd_rel->reltuples > 0) + return rel->rd_rel->reltuples; + + inheritors = find_all_inheritors(RelationGetRelid(rel), AccessShareLock, NULL); + totaltuples = 0; + foreach (lc, inheritors) { + Oid childid = lfirst_oid(lc); + Relation childrel; + double childtuples; + + if (childid != RelationGetRelid(rel)) + childrel = try_table_open(childid, NoLock); + else + childrel = rel; + + childtuples = childrel->rd_rel->reltuples; + + if (childtuples == 0 && rel_is_external_table(RelationGetRelid(childrel))) { +#define DEFAULT_EXTERNAL_TABLE_TUPLES 1000000 + childtuples = DEFAULT_EXTERNAL_TABLE_TUPLES; + } + totaltuples += childtuples; + + if (childrel != rel) + heap_close(childrel, NoLock); + } + return totaltuples; +} + +#endif diff --git a/src/gausskernel/optimizer/util/planmem_walker.cpp b/src/gausskernel/optimizer/util/planmem_walker.cpp index f4a04c88b..2ac4ab804 100644 --- a/src/gausskernel/optimizer/util/planmem_walker.cpp +++ b/src/gausskernel/optimizer/util/planmem_walker.cpp @@ -37,6 +37,9 @@ #include "nodes/plannodes.h" #include "optimizer/pgxcplan.h" #include "optimizer/planmem_walker.h" +#ifdef USE_SPQ +#include "catalog/pg_collation.h" +#endif extern void check_stack_depth(void); static bool walk_scan_node_fields(Scan* scan, MethodWalker walker, void* context); @@ -197,7 +200,9 @@ bool plan_tree_walker(Node* node, MethodWalker walker, void* context) case T_Plan: case T_ProjectSet: return walk_plan_node_fields((Plan*)node, walker, context); - +#ifdef USE_SPQ + case T_Result: +#endif case T_BaseResult: case T_VecResult: if (walk_plan_node_fields((Plan*)node, walker, 
context)) @@ -257,6 +262,9 @@ bool plan_tree_walker(Node* node, MethodWalker walker, void* context) break; case T_SeqScan: +#ifdef USE_SPQ + case T_SpqSeqScan: +#endif case T_FunctionScan: case T_ValuesScan: case T_CteScan: @@ -394,6 +402,10 @@ bool plan_tree_walker(Node* node, MethodWalker walker, void* context) return true; if (p2walker((Node*)((HashJoin*)node)->hashclauses, context)) return true; +#ifdef USE_SPQ + if (p2walker((Node *)((HashJoin *)node)->hashqualclauses, context)) + return true; +#endif break; case T_VecToRow: @@ -455,6 +467,33 @@ bool plan_tree_walker(Node* node, MethodWalker walker, void* context) break; +#ifdef USE_SPQ + case T_Motion: + if (walk_plan_node_fields((Plan *) node, walker, context)) + return true; + + if (p2walker((Node *) ((Motion *)node)->hashExprs, context)) + return true; + + break; + + case T_AssertOp: + if (walk_plan_node_fields((Plan *) node, walker, context)) + return true; + break; + + case T_ShareInputScan: + if (walk_plan_node_fields((Plan *) node, walker, context)) + return true; + break; + + case T_Sequence: + if (walk_plan_node_fields((Plan *) node, walker, context)) + return true; + if (p2walker((Node *) ((Sequence *) node)->subplans, context)) + return true; + break; +#endif case T_VecModifyTable: case T_ModifyTable: { ModifyTable* modifytable = (ModifyTable*)node; @@ -608,3 +647,315 @@ Plan* plan_tree_base_subplan_get_plan(plan_tree_base_prefix* base, SubPlan* subp return NULL; } +#ifdef USE_SPQ +/* + * These are helpers to retrieve nodes from plans. + */ +typedef struct extract_context { + //plan_tree_base_prefix base; /* Required prefix for plan_tree_walker/mutator */ + MethodPlanWalkerContext ctx; + bool descendIntoSubqueries; + NodeTag nodeTag; + List *nodes; +} extract_context; + +static bool extract_nodes_walker(Node *node, extract_context *context); +static bool extract_nodes_expression_walker(Node *node, extract_context *context); +/* Rewrite the plan associated with a SubPlan node in a mutator. 
(This is used by + * framework, not by users of the framework.) + */ +void plan_tree_base_subplan_put_plan(plan_tree_base_prefix *base, SubPlan *subplan, Plan *plan) +{ + Assert(base); + if (IsA(base->node, PlannedStmt)) { + exec_subplan_put_plan((PlannedStmt*)base->node, subplan, plan); + return; + } else if (IsA(base->node, PlannerInfo)) { + planner_subplan_put_plan((PlannerInfo*)base->node, subplan, plan); + return; + } + Assert(false && "Must provide relevant base info."); +} +List *extract_nodes_plan(Plan *pl, int nodeTag, bool descendIntoSubqueries) +{ + extract_context context; + errno_t rc = 0; + rc = memset_s(&context, sizeof(extract_context), 0, sizeof(extract_context)); + securec_check_c(rc, "\0", "\0"); + Assert(pl); + context.nodeTag = (NodeTag)nodeTag; + context.descendIntoSubqueries = descendIntoSubqueries; + extract_nodes_walker((Node *)pl, &context); + return context.nodes; +} +static bool extract_nodes_walker(Node *node, extract_context *context) +{ + if (node == NULL) + return false; + if (nodeTag(node) == context->nodeTag) { + context->nodes = lappend(context->nodes, node); + } + if (nodeTag(node) == T_SubPlan) { + SubPlan *subplan = (SubPlan *)node; + + /* + * SubPlan has both of expressions and subquery. In case the caller wants + * non-subquery version, still we need to walk through its expressions. + * NB: Since we're not going to descend into SUBPLANs anyway (see below), + * look at the SUBPLAN node here, even if descendIntoSubqueries is false + * lest we miss some nodes there. + */ + if (extract_nodes_walker((Node *)subplan->testexpr, context)) + return true; + if (expression_tree_walker((Node *)subplan->args, (MethodWalker)extract_nodes_walker, context)) + return true; + + /* + * Do not descend into subplans. + * Even if descendIntoSubqueries indicates the caller wants to descend into + * subqueries, SubPlan seems special; Some partitioning code assumes this + * should return immediately without descending. See MPP-17168. 
+ */ + return false; + } + if (nodeTag(node) == T_SubqueryScan && !context->descendIntoSubqueries) { + /* Do not descend into subquery scans. */ + return false; + } + + return plan_tree_walker(node, (MethodWalker)extract_nodes_walker, (void *)context); +} +/** + * Extract nodes with specific tag. + * Same as above, but starts off a scalar expression node rather than a PlannedStmt + * + */ +List *extract_nodes_expression(Node *node, int nodeTag, bool descendIntoSubqueries) +{ + extract_context context; + errno_t rc = 0; + rc = memset_s(&context, sizeof(extract_context), 0, sizeof(extract_context)); + securec_check_c(rc, "\0", "\0"); + Assert(node); + context.nodeTag = (NodeTag)nodeTag; + context.descendIntoSubqueries = descendIntoSubqueries; + extract_nodes_expression_walker(node, &context); + + return context.nodes; +} + +static bool extract_nodes_expression_walker(Node *node, extract_context *context) +{ + if (NULL == node) { + return false; + } + + if (nodeTag(node) == context->nodeTag) { + context->nodes = lappend(context->nodes, node); + } + + if (nodeTag(node) == T_Query && context->descendIntoSubqueries) { + Query *query = (Query *)node; + if (expression_tree_walker((Node *)query->targetList, (MethodWalker)extract_nodes_expression_walker, (void *)context)) { + return true; + } + + if (query->jointree != NULL && + expression_tree_walker(query->jointree->quals, (MethodWalker)extract_nodes_expression_walker, (void *)context)) { + return true; + } + + return expression_tree_walker(query->havingQual, (MethodWalker)extract_nodes_expression_walker, (void *)context); + } + + return expression_tree_walker(node, (MethodWalker)extract_nodes_expression_walker, (void *)context); +} +typedef struct find_nodes_context { + List *nodeTags; + int foundNode; +} find_nodes_context; + +static bool find_nodes_walker(Node *node, find_nodes_context *context); + +/** + * Looks for nodes that belong to the given list. 
+ * Returns the index of the first such node that it encounters, or -1 if none + */ +int find_nodes(Node *node, List *nodeTags) +{ + find_nodes_context context; + Assert(NULL != node); + context.nodeTags = nodeTags; + context.foundNode = -1; + find_nodes_walker(node, &context); + + return context.foundNode; +} + +static bool find_nodes_walker(Node *node, find_nodes_context *context) +{ + if (NULL == node) { + return false; + } + + if (IsA(node, Query)) { + /* Recurse into subselects */ + return query_tree_walker((Query *)node, (bool (*)())find_nodes_walker, (void *)context, 0 /* flags */); + } + + ListCell *lc; + int i = 0; + foreach (lc, context->nodeTags) { + NodeTag nodeTag = (NodeTag)lfirst_int(lc); + if (nodeTag(node) == nodeTag) { + context->foundNode = i; + return true; + } + + i++; + } + + return expression_tree_walker(node, (MethodWalker)find_nodes_walker, (void *)context); +} +/** + * GPDB_91_MERGE_FIXME: collation + * Look for nodes with non-default collation; return 1 if any exist, -1 + * otherwise. 
+ */ +typedef struct check_collation_context { + int foundNonDefaultCollation; +} check_collation_context; + +static bool check_collation_walker(Node *node, check_collation_context *context); + +int check_collation(Node *node) +{ + check_collation_context context; + Assert(NULL != node); + context.foundNonDefaultCollation = -1; + check_collation_walker(node, &context); + + return context.foundNonDefaultCollation; +} + + +static void check_collation_in_list(List *colllist, check_collation_context *context) +{ + ListCell *lc; + foreach (lc, colllist) { + Oid coll = lfirst_oid(lc); + if (InvalidOid != coll && DEFAULT_COLLATION_OID != coll) { + context->foundNonDefaultCollation = 1; + break; + } + } +} + +static bool check_collation_walker(Node *node, check_collation_context *context) +{ + Oid collation, inputCollation, type; + + if (NULL == node) { + return false; + } + + if (IsA(node, Query)) { + /* Recurse into subselects */ + return query_tree_walker((Query *)node, (bool (*)())check_collation_walker, (void *)context, 0 /* flags */); + } + + switch (nodeTag(node)) { + case T_Var: + case T_Const: + case T_OpExpr: + type = exprType((node)); + collation = exprCollation(node); + if (type == NAMEOID || type == NAMEARRAYOID) { + if (collation != C_COLLATION_OID) + context->foundNonDefaultCollation = 1; + } else if (InvalidOid != collation && DEFAULT_COLLATION_OID != collation) { + context->foundNonDefaultCollation = 1; + } + break; + case T_ScalarArrayOpExpr: + case T_DistinctExpr: + case T_BoolExpr: + case T_BooleanTest: + case T_CaseExpr: + case T_CaseTestExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_FuncExpr: + case T_Aggref: + case T_WindowFunc: + case T_NullTest: + case T_NullIfExpr: + case T_RelabelType: + case T_CoerceToDomain: + case T_CoerceViaIO: + case T_ArrayCoerceExpr: + case T_SubLink: + case T_ArrayExpr: + //case T_SubscriptingRef: + case T_RowExpr: + case T_RowCompareExpr: + case T_FieldSelect: + case T_FieldStore: + case 
T_CoerceToDomainValue: + case T_CurrentOfExpr: + case T_NamedArgExpr: + case T_ConvertRowtypeExpr: + case T_CollateExpr: + //case T_TableValueExpr: + case T_XmlExpr: + case T_SetToDefault: + case T_PlaceHolderVar: + case T_Param: + case T_SubPlan: + case T_AlternativeSubPlan: + case T_GroupingFunc: + //case T_DMLActionExpr: + collation = exprCollation(node); + inputCollation = exprInputCollation(node); + if ((InvalidOid != collation && DEFAULT_COLLATION_OID != collation) || + (InvalidOid != inputCollation && DEFAULT_COLLATION_OID != inputCollation)) { + context->foundNonDefaultCollation = 1; + } + break; + case T_CollateClause: + /* unsupported */ + context->foundNonDefaultCollation = 1; + break; + case T_ColumnDef: + collation = ((ColumnDef *)node)->collOid; + if (InvalidOid != collation && DEFAULT_COLLATION_OID != collation) { + context->foundNonDefaultCollation = 1; + } + break; + case T_IndexElem: + if (NIL != ((IndexElem *)node)->collation) { + context->foundNonDefaultCollation = 1; + } + break; + case T_RangeTblEntry: + Assert(false); + break; + case T_CommonTableExpr: + check_collation_in_list(((CommonTableExpr *)node)->ctecolcollations, context); + break; + case T_SetOperationStmt: + check_collation_in_list(((SetOperationStmt *)node)->colCollations, context); + break; + default: + /* make compiler happy */ + break; + } + + if (context->foundNonDefaultCollation == 1) { + /* end recursion */ + return true; + } else { + return expression_tree_walker(node, (bool (*)())check_collation_walker, (void *)context); + } +} +#endif diff --git a/src/gausskernel/optimizer/util/tlist.cpp b/src/gausskernel/optimizer/util/tlist.cpp index fea63d6a2..6006c2d9f 100644 --- a/src/gausskernel/optimizer/util/tlist.cpp +++ b/src/gausskernel/optimizer/util/tlist.cpp @@ -1386,4 +1386,157 @@ add_sp_items_to_pathtarget(PathTarget *target, List *items) add_sp_item_to_pathtarget(target, item); } -} \ No newline at end of file +} + +#ifdef USE_SPQ +typedef struct maxSortGroupRef_context { 
+ Index maxsgr; + bool include_orderedagg; +} maxSortGroupRef_context; +/* + * tlist_members + * Finds all members of the given tlist whose expression is + * equal() to the given expression. Result is NIL if no such member. + * Note: We do not make a copy of the tlist entries that match. + * The caller is responsible for cleaning up the memory allocated + * to the List returned. + */ +List* tlist_members(Node *node, List *targetlist) +{ + List *tlist = NIL; + ListCell *temp = NULL; + + foreach (temp, targetlist) { + TargetEntry *tlentry = (TargetEntry *) lfirst(temp); + + Assert(IsA(tlentry, TargetEntry)); + + if (equal(node, tlentry->expr)) { + tlist = lappend(tlist, tlentry); + } + } + + return tlist; +} + +static void get_sortgroupclauses_tles_recurse(List *clauses, List *targetList, List **tles, List **sortops, + List **eqops) +{ + ListCell *lc; + ListCell *lc_sortop; + ListCell *lc_eqop; + List *sub_grouping_tles = NIL; + List *sub_grouping_sortops = NIL; + List *sub_grouping_eqops = NIL; + + foreach (lc, clauses) { + Node *node = (Node *)lfirst(lc); + + if (node == NULL) + continue; + + if (IsA(node, SortGroupClause)) { + SortGroupClause *sgc = (SortGroupClause *)node; + TargetEntry *tle = get_sortgroupclause_tle(sgc, targetList); + + if (!list_member(*tles, tle)) { + *tles = lappend(*tles, tle); + *sortops = lappend_oid(*sortops, sgc->sortop); + *eqops = lappend_oid(*eqops, sgc->eqop); + } + } else if (IsA(node, List)) { + get_sortgroupclauses_tles_recurse((List *)node, targetList, tles, sortops, eqops); + } else + elog(ERROR, "unrecognized node type in list of sort/group clauses: %d", (int)nodeTag(node)); + } + + /* + * Put SortGroupClauses before GroupingClauses. 
+ */ + forthree(lc, sub_grouping_tles, lc_sortop, sub_grouping_sortops, lc_eqop, sub_grouping_eqops) + { + if (!list_member(*tles, lfirst(lc))) { + *tles = lappend(*tles, lfirst(lc)); + *sortops = lappend_oid(*sortops, lfirst_oid(lc_sortop)); + *eqops = lappend_oid(*eqops, lfirst_oid(lc_eqop)); + } + } +} + +void get_sortgroupclauses_tles(List *clauses, List *targetList, List **tles, List **sortops, List **eqops) +{ + *tles = NIL; + *sortops = NIL; + *eqops = NIL; + + get_sortgroupclauses_tles_recurse(clauses, targetList, tles, sortops, eqops); +} + +bool maxSortGroupRef_walker(Node *node, maxSortGroupRef_context *cxt) +{ + if (node == NULL) + return false; + + if (IsA(node, TargetEntry)) { + TargetEntry *tle = (TargetEntry *)node; + if (tle->ressortgroupref > cxt->maxsgr) + cxt->maxsgr = tle->ressortgroupref; + + return maxSortGroupRef_walker((Node *)tle->expr, cxt); + } + + /* Aggref nodes don't nest, so we can treat them here without recurring + * further. + */ + + if (IsA(node, Aggref)) { + Aggref *ref = (Aggref *)node; + + if (cxt->include_orderedagg) { + ListCell *lc; + + foreach (lc, ref->aggorder) { + SortGroupClause *sort = (SortGroupClause *)lfirst(lc); + Assert(IsA(sort, SortGroupClause)); + Assert(sort->tleSortGroupRef != 0); + if (sort->tleSortGroupRef > cxt->maxsgr) + cxt->maxsgr = sort->tleSortGroupRef; + } + } + return false; + } + + return expression_tree_walker(node, (bool (*)())maxSortGroupRef_walker, cxt); +} + +/* + * Return the largest sortgroupref value in use in the given + * target list. + * + * If include_orderedagg is false, consider only the top-level + * entries in the target list, i.e., those that might be occur + * in a groupClause, distinctClause, or sortClause of the Query + * node that immediately contains the target list. + * + * If include_orderedagg is true, also consider AggOrder entries + * embedded in Aggref nodes within the target list. 
Though + * such entries will only occur in the aggregation sub_tlist + * (input) they affect sortgroupref numbering for both sub_tlist + * and tlist (aggregate). + */ +Index maxSortGroupRef(List *targetlist, bool include_orderedagg) +{ + maxSortGroupRef_context context; + context.maxsgr = 0; + context.include_orderedagg = include_orderedagg; + + if (targetlist != NIL) { + if (!IsA(targetlist, List) || !IsA(linitial(targetlist), TargetEntry)) + elog(ERROR, "non-targetlist argument supplied"); + + maxSortGroupRef_walker((Node *)targetlist, &context); + } + + return context.maxsgr; +} +#endif diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index e8014153f..0a7d80096 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -2952,9 +2952,13 @@ int PostmasterMain(int argc, char* argv[]) InitCommLogicResource(); } +#ifdef USE_SPQ + if (ENABLE_DSS) { +#else if ((!IS_SINGLE_NODE) && ((IS_PGXC_DATANODE && !dummyStandbyMode && !isRestoreMode) || (IS_PGXC_COORDINATOR && g_instance.attr.attr_storage.comm_cn_dn_logic_conn && !isRestoreMode))) { +#endif status = init_stream_comm(); if (status != STATUS_OK) ereport(FATAL, (errmsg("Init libcomm for stream failed, maybe listen port already in use"))); @@ -13733,6 +13737,12 @@ int GaussDbThreadMain(knl_thread_arg* arg) MemoryContextInit(); knl_thread_init(thread_role); +#ifdef USE_SPQ + if (arg->spq_role == ROLE_QUERY_EXECUTOR) { + t_thrd.spq_ctx.spq_role = ROLE_QUERY_EXECUTOR; + } +#endif + MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_DEFAULT)); t_thrd.fake_session = create_session_context(t_thrd.top_mem_cxt, 0); t_thrd.fake_session->status = KNL_SESS_FAKE; @@ -14546,6 +14556,13 @@ ThreadId initialize_util_thread(knl_thread_role role, void* payload) } thr_argv->m_thd_arg.role = role; thr_argv->m_thd_arg.payload = payload; +#ifdef USE_SPQ + if (IS_SPQ_EXECUTOR) { + 
thr_argv->m_thd_arg.spq_role = ROLE_QUERY_EXECUTOR; + } else { + thr_argv->m_thd_arg.spq_role = ROLE_UTILITY; + } +#endif Port port; ThreadId pid; errno_t rc; @@ -15112,4 +15129,4 @@ void SSOndemandProcExitIfStayWaitBackends() GetPMState(pmState), WAIT_PMSTATE_UPDATE_TRIES))); proc_exit(1); } -} \ No newline at end of file +} diff --git a/src/gausskernel/process/stream/execStream.cpp b/src/gausskernel/process/stream/execStream.cpp index 801bda111..9bc498350 100755 --- a/src/gausskernel/process/stream/execStream.cpp +++ b/src/gausskernel/process/stream/execStream.cpp @@ -89,6 +89,31 @@ bool IsThreadProcessStreamRecursive() return true; } +#ifdef USE_SPQ +bool IsThreadSkipDirectResult(StreamState* node) +{ + if (node == NULL || node->consumer == NULL) { + return false; + } + if (IS_SPQ_EXECUTOR) { + Plan* plan = node->ss.ps.plan; + Assert(IsA(plan, Stream)); + Stream* stream = (Stream*) plan; + if (node->type == STREAM_GATHER && stream->smpDesc.distriType == REMOTE_DIRECT_DISTRIBUTE) { + const char* nodeName = GetConfigOption("pgxc_node_name", false, false); + if (!(strcmp(node->consumer->getExpectProducerNodeName(), nodeName) == 0)) { + return true; + } + if (u_sess->stream_cxt.smp_id != 0) { + return true; + } + } + return false; + } + return false; +} +#endif + /* * @Description: Check if Stream node is dummy * @@ -143,6 +168,11 @@ const char* GetStreamTypeRedistribute(Stream* node) } break; } +#ifdef USE_SPQ + case REMOTE_ROUNDROBIN: + stream_tag = "ROUNDROBIN"; + break; +#endif default: { if (isRangeListRedis) { @@ -253,7 +283,14 @@ const char* GetStreamType(Stream* node) appendStringInfo(type, "%sStreaming(type: %s%s%s)", vector_tag, stream_tag, dop_tag, ng_tag); } break; - +#ifdef USE_SPQ + case STREAM_GATHER: { + if(node->smpDesc.distriType == REMOTE_DIRECT_DISTRIBUTE) { + stream_tag = "DIRECT DISTRIBUTE"; + } + appendStringInfo(type, "%sStreaming(type: %s%s%s)", vector_tag, stream_tag, dop_tag, ng_tag); + } break; +#endif case STREAM_REDISTRIBUTE: { 
stream_tag = GetStreamTypeRedistribute(node); appendStringInfo(type, "%sStreaming(type: %s%s%s)", vector_tag, stream_tag, dop_tag, ng_tag); @@ -568,7 +605,7 @@ static void InitStream(StreamFlowCtl* ctl, StreamTransType transType) List* consumer_nodeList = NIL; List* producer_nodeList = NIL; -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) if (!isLocalStream) { ereport(ERROR, (errmsg("Single Node should only has local stream operator."))); } @@ -683,6 +720,13 @@ static void InitStream(StreamFlowCtl* ctl, StreamTransType transType) /* Set smp identifier. */ key.smpIdentifier = i; +#ifdef USE_SPQ + if (IS_SPQ_EXECUTOR) { + consumer->setPstmt(pstmt); + } else { + consumer->setPstmt(NULL); + } +#endif consumer->init(key, execNodes, streamNode->smpDesc, transType, sharedContext); consumerSMPList = lappend(consumerSMPList, consumer); @@ -698,6 +742,9 @@ static void InitStream(StreamFlowCtl* ctl, StreamTransType transType) /* 2. Start the setup the Producer part */ consumerNum = list_length(streamNode->consumer_nodes->nodeList); +#ifdef USE_SPQ + consumerNum = pstmt->num_nodes; +#endif /* Set connection number of producer. */ if (STREAM_IS_LOCAL_NODE(streamNode->smpDesc.distriType)) @@ -824,6 +871,19 @@ static void InitStreamFlow(StreamFlowCtl* ctl) InitStreamFlow(ctl); } } break; +#ifdef USE_SPQ + case T_Sequence: { + Sequence* sequence = (Sequence*)oldPlan; + ListCell* lc = NULL; + foreach(lc, sequence->subplans) { + Plan* subplan = (Plan*)lfirst(lc); + ctl->plan = subplan; + /* Set parent info as checkInfo for every sub plan. */ + SetCheckInfo(&ctl->checkInfo, oldPlan); + InitStreamFlow(ctl); + } + } break; +#endif case T_ModifyTable: case T_VecModifyTable: { ModifyTable* mt = (ModifyTable*)oldPlan; @@ -1128,6 +1188,9 @@ void SetupStreamRuntime(StreamState* node) /* Set consumer object in streamState. 
*/ node->consumer = consumer; +#ifdef USE_SPQ + node->skip_direct_distribute_result = IsThreadSkipDirectResult(node); +#endif RegisterStreamSnapshots(); } @@ -1165,7 +1228,7 @@ static void StartupStreamThread(StreamState* node) /* Set up Stream thread in parallel */ void StartUpStreamInParallel(PlannedStmt* pstmt, EState* estate) { - if (!IS_PGXC_DATANODE || pstmt->num_streams <= 0) { + if (IS_SPQ_COORDINATOR || !IS_PGXC_DATANODE || pstmt->num_streams <= 0) { return; } @@ -2099,7 +2162,7 @@ StreamState* BuildStreamRuntime(Stream* node, EState* estate, int eflags) #ifdef ENABLE_MULTIPLE_NODES if (IS_PGXC_COORDINATOR) { #else - if (StreamTopConsumerAmI()) { + if ((IS_SPQ_COORDINATOR) || (!IS_SPQ_RUNNING && StreamTopConsumerAmI())) { #endif if (innerPlan(node)) innerPlanState(stream_state) = ExecInitNode(innerPlan(node), estate, eflags); @@ -2112,8 +2175,9 @@ StreamState* BuildStreamRuntime(Stream* node, EState* estate, int eflags) } /* Stream runtime only set up on datanode. */ - if (IS_PGXC_DATANODE) + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { SetupStreamRuntime(stream_state); + } return stream_state; } @@ -2183,6 +2247,10 @@ static TupleTableSlot* ExecStream(PlanState* state) } } +#ifdef USE_SPQ + t_thrd.spq_ctx.skip_direct_distribute_result = node->skip_direct_distribute_result; +#endif + node->receive_message = true; if (node->StreamScan(node)) { @@ -2208,7 +2276,7 @@ static TupleTableSlot* ExecStream(PlanState* state) void ExecEarlyDeinitConsumer(PlanState* node) { /* A Coordinator has no stream thread, so do not bother about that */ - if (IS_PGXC_COORDINATOR) + if (IS_PGXC_COORDINATOR || IS_SPQ_COORDINATOR) return; /* Exit if skip early deinit consumer */ diff --git a/src/gausskernel/process/stream/streamConsumer.cpp b/src/gausskernel/process/stream/streamConsumer.cpp index 85d801d56..c90bc3c98 100755 --- a/src/gausskernel/process/stream/streamConsumer.cpp +++ b/src/gausskernel/process/stream/streamConsumer.cpp @@ -74,8 +74,14 @@ void 
StreamConsumer::init(StreamKey key, List* execProducerNodes, ParallelDesc d bool localNodeOnly = STREAM_IS_LOCAL_NODE(desc.distriType); if (localNodeOnly) producerNum = desc.producerDop; - else + else { producerNum = desc.producerDop * list_length(execProducerNodes); +#ifdef USE_SPQ + if (IS_SPQ_EXECUTOR && m_plan) { + producerNum = desc.producerDop * m_plan->num_nodes; + } +#endif + } Assert(producerNum > 0); @@ -111,6 +117,23 @@ void StreamConsumer::init(StreamKey key, List* execProducerNodes, ParallelDesc d } } copyLock.unLock(); +#endif +#ifdef USE_SPQ + if (IS_SPQ_EXECUTOR && m_plan != nullptr) { + for (int j = 0; j < desc.producerDop; j++) { + for(;i < m_plan->num_nodes; i++) { + if (localNodeOnly) { + continue; + } + rc = strncpy_s(&m_expectProducer[i].nodeName[0], + NAMEDATALEN, + m_plan->nodesDefinition[i].nodename.data, + strlen(m_plan->nodesDefinition[i].nodename.data) + 1); + securec_check(rc, "\0", "\0"); + m_expectProducer[i].nodeIdx = i; + } + } + } #endif for (i = 0; i < producerNum; i++) { int nodeNameLen = 0; @@ -308,6 +331,23 @@ int StreamConsumer::getNodeIdx(const char* nodename) return -1; } +#ifdef USE_SPQ +/* + * @Description: Get expectProducer nodeName + * + * @return: nodeName + */ +char* StreamConsumer::getExpectProducerNodeName() +{ + return m_expectProducer->nodeName; +} + +void StreamConsumer::setPstmt(PlannedStmt* p_stmt) +{ + m_plan = p_stmt; +} +#endif + /* * @Description: Find un connect producer * @@ -405,8 +445,11 @@ void StreamConsumer::waitProducerReady() getFirstUnconnectedProducerNodeIdx(), m_connNum - m_currentProducerNum, m_key.planNodeId, +#ifdef USE_SPQ + m_plan ? m_plan->num_nodes : -1); +#else global_node_definition ? 
global_node_definition->num_nodes : -1); - +#endif if (ntimes == 300) { if (t_thrd.int_cxt.QueryCancelPending) { ereport(WARNING, (errmodule(MOD_STREAM), diff --git a/src/gausskernel/process/stream/streamMain.cpp b/src/gausskernel/process/stream/streamMain.cpp index b4206856f..312027c26 100755 --- a/src/gausskernel/process/stream/streamMain.cpp +++ b/src/gausskernel/process/stream/streamMain.cpp @@ -75,6 +75,12 @@ int StreamMain() InitStreamThread(); +#ifdef USE_SPQ + t_thrd.spq_ctx.spq_session_id = u_sess->stream_cxt.producer_obj->m_plan->spq_session_id; + t_thrd.spq_ctx.current_id = u_sess->stream_cxt.producer_obj->getStream()->streamID; + t_thrd.spq_ctx.skip_direct_distribute_result = false; +#endif + SetProcessingMode(NormalProcessing); on_proc_exit(StreamQuitAndClean, 0); @@ -506,6 +512,8 @@ static void execute_stream_plan(StreamProducer* producer) if (dest >= DestTupleBroadCast) SetStreamReceiverParams(receiver, producer, portal); + producer->setEcontext(GetPerTupleExprContext(portal->queryDesc->estate)); + /* * Run the portal to completion, and then drop it (and the receiver). 
*/ diff --git a/src/gausskernel/process/stream/streamProducer.cpp b/src/gausskernel/process/stream/streamProducer.cpp index d7f476f84..7a59aad27 100755 --- a/src/gausskernel/process/stream/streamProducer.cpp +++ b/src/gausskernel/process/stream/streamProducer.cpp @@ -140,6 +140,9 @@ StreamProducer::StreamProducer( m_dest = DestNone; m_channelCalVecFun = NULL; m_channelCalFun = NULL; + m_hasExprKey = false; + m_exprkeystate = NULL; + m_econtext = NULL; initStringInfo(&m_tupleBuffer); initStringInfo(&m_tupleBufferWithCheck); @@ -325,6 +328,10 @@ void StreamProducer::setDistributeInfo() nodeLen = list_length(m_consumerNodes->nodeList); } + if (IS_SPQ_RUNNING) { + nodeLen = m_plan->num_nodes; + } + Assert(nodeLen > 0); m_disQuickLocator = (uint2**)palloc0(nodeLen * sizeof(uint2*)); @@ -343,7 +350,13 @@ void StreamProducer::setDistributeInfo() m_disQuickLocator[i][j] = i + j * nodeLen; } +#ifdef USE_SPQ + if (m_parallel_desc.distriType != REMOTE_DIRECT_DISTRIBUTE) { + setDistributeIdx(); + } +#else setDistributeIdx(); +#endif if (((Plan*)m_streamNode)->vec_output == false) BindingRedisFunction(); @@ -359,7 +372,9 @@ void StreamProducer::setDistributeInfo() void StreamProducer::initStreamKey() { int nodeLen = list_length(m_consumerNodes->nodeList); - +#ifdef USE_SPQ + nodeLen = m_plan->num_nodes; +#endif for (int i = 0; i < m_connNum; i++) { StreamCOMM* scomm = (StreamCOMM*)m_transport[i]; @@ -491,7 +506,13 @@ void StreamProducer::setDistributeIdx() ListCell* cell = NULL; foreach (cell, m_distributeKey) { - Var* distriVar = (Var*)lfirst(cell); + Node* node = (Node*)lfirst(cell); + if (!IsA(node, Var)) { + m_hasExprKey = true; + m_exprkeystate = ExecInitExprList(m_streamNode->distribute_keys, NULL); + break; + } + Var* distriVar = (Var*)node; m_distributeIdx[i++] = distriVar->varattno - 1; ereport(DEBUG2, (errmodule(MOD_STREAM), errmsg("[StreamProducer] node id is: %d, distributeIdx[%d] is: %d", m_streamNode->scan.plan.plan_node_id, i - 1, m_distributeIdx[i - 1]))); @@ 
-566,7 +587,11 @@ void StreamProducer::BindingRedisFunction() Oid dataType; m_hashFun = (hashFun*)palloc0(sizeof(hashFun) * len); for (int i = 0; i < len; i++) { - dataType = m_desc->attrs[m_distributeIdx[i]].atttypid; + if (m_hasExprKey) { + dataType = ((ExprState*)list_nth(m_exprkeystate, i))->resultType; + } else { + dataType = m_desc->attrs[m_distributeIdx[i]].atttypid; + } switch (dataType) { case INT8OID: m_hashFun[i] = &computeHashT; @@ -1249,6 +1274,50 @@ void StreamProducer::redistributeTupleChannel(TupleTableSlot* tuple) hashValue, m_parallel_desc.consumerDop, list_length(m_consumerNodes->nodeList)); } +template +void StreamProducer::redistributeTupleChannelWithExpr(TupleTableSlot* tuple) +{ + /* + * For dn gather case, we do not need to compute hash value. + * we only has one execute datanode in consumer list. + * So, send and receive channel will always be channel 0. + */ + if (distrType == REMOTE_DIRECT_DISTRIBUTE) { + return; + } + + Datum data; + MemoryContext oldContext; + ListCell *cell; + bool isNull = false; + bool allIsNULL = true; + uint64 hashValue = 0; + int i = 0; + m_econtext->ecxt_outertuple = tuple; + + oldContext = MemoryContextSwitchTo(m_econtext->ecxt_per_tuple_memory); + /* foreach key exprs */ + foreach(cell, m_exprkeystate) { + ExprState *state = (ExprState*)lfirst(cell); + data = ExecEvalExpr(state, m_econtext, &isNull, NULL); + if (!isNull) { + if (!allIsNULL) { + hashValue = (hashValue << 1) | ((hashValue & 0x80000000) ? 
1 : 0); + hashValue ^= m_hashFun[i](data); + } else { + hashValue = m_hashFun[i](data); + allIsNULL = false; + } + } + ++i; + } + + MemoryContextSwitchTo(oldContext); + + m_locator[0] = ChannelLocalizer( + hashValue, m_parallel_desc.consumerDop, list_length(m_consumerNodes->nodeList)); +} + template void StreamProducer::redistributeTupleChannelForSlice(TupleTableSlot* tuple) { @@ -1307,16 +1376,30 @@ void StreamProducer::DispatchBatchRedistrFunctionByRedisType() case PARALLEL_NONE: #ifdef ENABLE_MULTIPLE_NODES case REMOTE_DISTRIBUTE: - m_channelCalVecFun = (list_length(m_consumerNodes->nodeList) == 1) ? - &StreamProducer::redistributeBatchChannel : - &StreamProducer::redistributeBatchChannel; + if (m_hasExprKey) { + m_channelCalFun = ((list_length(m_consumerNodes->nodeList) == 1) ? + &StreamProducer::redistributeTupleChannelWithExpr : + &StreamProducer::redistributeTupleChannelWithExpr); + } else { + m_channelCalFun = ((list_length(m_consumerNodes->nodeList) == 1) ? + &StreamProducer::redistributeTupleChannel : + &StreamProducer::redistributeTupleChannel); + } break; case REMOTE_SPLIT_DISTRIBUTE: - m_channelCalVecFun = &StreamProducer::redistributeBatchChannel; + if (m_hasExprKey) { + m_channelCalFun = &StreamProducer::redistributeTupleChannelWithExpr; + } else { + m_channelCalFun = &StreamProducer::redistributeTupleChannel; + } break; #endif case LOCAL_DISTRIBUTE: - m_channelCalVecFun = &StreamProducer::redistributeBatchChannel; + if (m_hasExprKey) { + m_channelCalFun = &StreamProducer::redistributeTupleChannelWithExpr; + } else { + m_channelCalFun = &StreamProducer::redistributeTupleChannel; + } break; default: @@ -1334,19 +1417,37 @@ void StreamProducer::DispatchRowRedistrFunctionByRedisType() { switch (m_parallel_desc.distriType) { case PARALLEL_NONE: -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) case REMOTE_DISTRIBUTE: - m_channelCalFun = ((list_length(m_consumerNodes->nodeList) == 1) ? 
- &StreamProducer::redistributeTupleChannel : - &StreamProducer::redistributeTupleChannel); + if (m_hasExprKey) { + m_channelCalFun = ((list_length(m_consumerNodes->nodeList) == 1) ? + &StreamProducer::redistributeTupleChannelWithExpr : + &StreamProducer::redistributeTupleChannelWithExpr); + } else { + m_channelCalFun = ((list_length(m_consumerNodes->nodeList) == 1) ? + &StreamProducer::redistributeTupleChannel : + &StreamProducer::redistributeTupleChannel); + } break; case REMOTE_SPLIT_DISTRIBUTE: - m_channelCalFun = &StreamProducer::redistributeTupleChannel; + if (m_hasExprKey) { + m_channelCalFun = &StreamProducer::redistributeTupleChannelWithExpr; + } else { + m_channelCalFun = &StreamProducer::redistributeTupleChannel; + } + break; + case REMOTE_DIRECT_DISTRIBUTE: + // REMOTE_DIRECT_DISTRIBUTE will not calculate distribute key + m_channelCalFun = &StreamProducer::redistributeTupleChannel; break; #endif case LOCAL_DISTRIBUTE: - m_channelCalFun = &StreamProducer::redistributeTupleChannel; + if (m_hasExprKey) { + m_channelCalFun = &StreamProducer::redistributeTupleChannelWithExpr; + } else { + m_channelCalFun = &StreamProducer::redistributeTupleChannel; + } break; default: @@ -1358,7 +1459,7 @@ void StreamProducer::DispatchBatchRedistrFunctionForSlice() { switch (m_parallel_desc.distriType) { case PARALLEL_NONE: -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) case REMOTE_DISTRIBUTE: m_channelCalVecFun = ((list_length(m_consumerNodes->nodeList) == 1) ? 
&StreamProducer::redistributeBatchChannelForSlice : @@ -1829,6 +1930,651 @@ void StreamProducer::initSharedContext() } #ifndef ENABLE_MULTIPLE_NODES +#ifdef USE_SPQ +void StreamProducer::serializeStream(VectorBatch* batch, int index) +{ + uint32 tempBufferSize = m_bitNullLen + m_bitNumericLen; + uint8* bitNull = (uint8*)m_tempBuffer - 1; + uint8* bitNumericFlag = (uint8*)m_tempBuffer + m_bitNullLen - 1; + + uint8 bitMaskNull = HIGHBIT; + uint8 bitMaskNumeric; + int dataLen; + Form_pg_attribute attr; + char* writeBuffer = NULL; + Datum columnVal; + int32 numericIdx = -1; + char* string = NULL; + errno_t rc; + + /* reset m_tupleBuffer and m_tempBuffer */ + resetStringInfo(&m_tupleBuffer); + rc = memset_s(m_tempBuffer, tempBufferSize, '\0', tempBufferSize); + securec_check(rc, "\0", "\0"); + + m_tupleBuffer.cursor = 'B'; + /* + * the first tempBufferSize Bits of m_tupleBuffer.data will be assigned at the end of this function + * when null flag and numeric flag are finally determinded. + */ + m_tupleBuffer.len = tempBufferSize; + + for (int i = 0; i < batch->m_cols; i++) { + if (unlikely(bitMaskNull == HIGHBIT)) { + /* Get null flag if i % 8 equals to 0. */ + bitNull++; + bitMaskNull = 1; + } else { + /* Get next null flag for next column */ + bitMaskNull <<= 1; + } + + if (NOT_NULL(batch->m_arr[i].m_flag[index])) { + attr = &(m_desc->attrs[i]); + /* Set null flag for index column */ + *bitNull |= bitMaskNull; + + columnVal = batch->m_arr[i].m_vals[index]; + /* can be stored by value directly */ + switch (m_colsType[i]) { + case VALUE_TYPE: + enlargeStringInfo(&m_tupleBuffer, attr->attlen); + writeBuffer = m_tupleBuffer.data + m_tupleBuffer.len; + store_att_byval(writeBuffer, columnVal, attr->attlen); + m_tupleBuffer.len += attr->attlen; + m_tupleBuffer.data[m_tupleBuffer.len] = '\0'; + break; + case NUMERIC_TYPE: + dataLen = VARSIZE_ANY(columnVal); + Assert(dataLen > 0); + numericIdx++; + /* + * initialize numeric flag if numericIdx % 4 equals to 0. 
+ * we use 01 to denote values less that 0xFF; + * 10 to denote values less that 0xFFFF; + * 11 to denote values less than 0xFFFFFFFF; + * 00 to denote other values. + */ + if (numericIdx % 4 == 0) { + bitNumericFlag++; + } + + if (!VARATT_IS_SHORT(columnVal) && NUMERIC_IS_BI64((Numeric)columnVal)) { + uint64 numericVal = (uint64)NUMERIC_64VALUE((Numeric)columnVal); + if (unlikely(numericVal <= 0xFF)) { + /* numeric_8_compress */ + bitMaskNumeric = 1 << (uint32)(2 * (numericIdx % 4)); // 0x01 + /* set numeric size flag */ + *bitNumericFlag |= bitMaskNumeric; + enlargeStringInfo(&m_tupleBuffer, 2); + writeBuffer = m_tupleBuffer.data + m_tupleBuffer.len; + *(uint8*)(writeBuffer) = NUMERIC_BI_SCALE((Numeric)columnVal); + *(uint8*)(writeBuffer + 1) = (uint8)numericVal; + m_tupleBuffer.len += 2; + continue; + } else if (numericVal <= 0xFFFF) { + /* numeric_16_compress */ + bitMaskNumeric = 2 << (uint32)(2 * (numericIdx % 4)); // 0x10 + *bitNumericFlag |= bitMaskNumeric; + enlargeStringInfo(&m_tupleBuffer, 3); + writeBuffer = m_tupleBuffer.data + m_tupleBuffer.len; + *(uint8*)(writeBuffer) = NUMERIC_BI_SCALE((Numeric)columnVal); + *(uint16*)(writeBuffer + 1) = (uint16)numericVal; + m_tupleBuffer.len += 3; + continue; + } else if (numericVal <= 0xFFFFFFFF) { + /* numeric_32_compress */ + bitMaskNumeric = 3 << (uint32)(2 * (numericIdx % 4)); // 0x11 + *bitNumericFlag |= bitMaskNumeric; + enlargeStringInfo(&m_tupleBuffer, 5); + writeBuffer = m_tupleBuffer.data + m_tupleBuffer.len; + *(uint8*)(writeBuffer) = NUMERIC_BI_SCALE((Numeric)columnVal); + *(uint32*)(writeBuffer + 1) = (uint32)numericVal; + m_tupleBuffer.len += 5; + continue; + } + } + /* other numeric value lager than MAX INT32, bitMaskNumeric equals to 0x00 */ + appendBinaryStringInfo(&m_tupleBuffer, DatumGetPointer(columnVal), dataLen); + break; + case VARLENA_TYPE: + dataLen = VARSIZE_ANY(columnVal); + Assert(dataLen > 0); + appendBinaryStringInfo(&m_tupleBuffer, DatumGetPointer(columnVal), dataLen); + break; + 
case CSTRING_TYPE: + string = VARDATA_ANY(columnVal); + dataLen = strlen(string) + 1; + appendBinaryStringInfo(&m_tupleBuffer, string, dataLen); + break; + case TID_TYPE: + enlargeStringInfo(&m_tupleBuffer, 8); + writeBuffer = m_tupleBuffer.data + m_tupleBuffer.len; + store_att_byval(writeBuffer, columnVal, 8); + m_tupleBuffer.len += 8; + m_tupleBuffer.data[m_tupleBuffer.len] = '\0'; + break; + case NAME_TYPE: + string = ((Name)columnVal)->data; + dataLen = strlen(string) + 1; + columnVal = PointerGetDatum((char*)columnVal); + appendBinaryStringInfo(&m_tupleBuffer, DatumGetPointer(columnVal), dataLen); + break; + case FIXED_TYPE: + /* extract fixed length variable. */ + columnVal = PointerGetDatum((char*)columnVal + VARHDRSZ_SHORT); + appendBinaryStringInfo(&m_tupleBuffer, DatumGetPointer(columnVal), attr->attlen); + break; + default: + Assert(false); + ereport(ERROR, + (errcode(ERRCODE_UNEXPECTED_NODE_STATE), + errmodule(MOD_STREAM), + (errmsg("unrecognize data type %u.", m_colsType[i])))); + break; + } + } + } + + /* copy null flags nd numeric flags into m_tupleBuffer */ + rc = memcpy_s(m_tupleBuffer.data, tempBufferSize, m_tempBuffer, tempBufferSize); + securec_check(rc, "\0", "\0"); +} + +void StreamProducer::SetDest(bool is_vec_plan) +{ + switch (m_streamType) { + case STREAM_BROADCAST: + switch (m_parallel_desc.distriType) { + case LOCAL_BROADCAST: + if (is_vec_plan) + m_dest = DestBatchLocalBroadCast; + else + m_dest = DestTupleLocalBroadCast; + break; + case REMOTE_BROADCAST: + if (is_vec_plan) + m_dest = DestBatchBroadCast; + else + m_dest = DestTupleBroadCast; + break; + default: + break; + } + case STREAM_GATHER: + if (m_parallel_desc.distriType == REMOTE_DIRECT_DISTRIBUTE) { + if (is_vec_plan) { + m_dest = DestBatchRedistribute; + } + else{ + m_dest = DestTupleRedistribute; + } + setDistributeInfo(); + } + break; + case STREAM_REDISTRIBUTE: + switch (m_parallel_desc.distriType) { + case LOCAL_BROADCAST: + if (is_vec_plan) + m_dest = 
DestBatchLocalBroadCast; + else + m_dest = DestTupleLocalBroadCast; + + break; + + case LOCAL_DISTRIBUTE: + if (is_vec_plan) + m_dest = DestBatchLocalRedistribute; + else + m_dest = DestTupleLocalRedistribute; + + setDistributeInfo(); + break; + + case LOCAL_ROUNDROBIN: + if (is_vec_plan) + m_dest = DestBatchLocalRoundRobin; + else + m_dest = DestTupleLocalRoundRobin; + break; + case REMOTE_ROUNDROBIN: + if (is_vec_plan) + m_dest = DestBatchRoundRobin; + else + m_dest = DestTupleRoundRobin; + break; + case PARALLEL_NONE: + case REMOTE_DISTRIBUTE: + case REMOTE_SPLIT_DISTRIBUTE: + if (is_vec_plan) + m_dest = DestBatchRedistribute; + else + m_dest = DestTupleRedistribute; + + setDistributeInfo(); + break; + default: + break; + } + + break; + + case STREAM_HYBRID: { + if (m_streamNode->distribute_keys != NIL) + setDistributeInfo(); + + if (is_vec_plan) + m_dest = DestBatchHybrid; + else + m_dest = DestTupleHybrid; + } break; + default: + break; + } + return; +} + +void StreamProducer::reportNotice() +{ + m_nodeGroup->saveProducerEdata(); + + /* for NOTICE message, report it through 0 channel only one time. */ + if (STREAM_IS_LOCAL_NODE(m_parallel_desc.distriType) && !m_isDummy) { + if (m_sharedContextInit) { + stream_send_message_to_consumer(); + } else { + gs_memory_disconnect(m_sharedContext, m_nth); + } + } else { + if (m_transport != NULL) { + if (netSwitchDest(0)) { + stream_send_message_to_consumer(); + netStatusSave(0); + } + } + } +} + +void StreamProducer::redistributeStream(VectorBatch* batch) +{ + Assert(batch != NULL); + + /* calc the location for each tuple in batch. */ + (this->*m_channelCalVecFun)(batch); + + for (int i = 0; i < batch->m_rows; i++) { + StreamTimeSerilizeStart(t_thrd.pgxc_cxt.GlobalNetInstr); + serializeStream(batch, i); + StreamTimeSerilizeEnd(t_thrd.pgxc_cxt.GlobalNetInstr); + sendByteStream(m_locator[i]); + } +} + +void StreamProducer::redistributeStream(TupleTableSlot* tuple, DestReceiver* self) +{ + /* calc the location for tuple. 
*/ + (this->*m_channelCalFun)(tuple); + + assembleStreamMessage(tuple, self, &m_tupleBuffer); + + sendByteStream(m_locator[0]); + + /* reset the buffer. */ + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::broadCastStream(VectorBatch* batch) +{ + int i; + + Assert(m_originConsumerNodeList == NIL); + + assembleStreamBatchMessage(BCT_NOCOMP, batch, &m_tupleBuffer); + + m_broadcastSize += m_tupleBuffer.len; + t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->broadcastSize = + Max(m_broadcastSize, t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->broadcastSize); + + if (m_broadcastSize / (1 << 20) >= WARNING_BROADCAST_SIZE) { + t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->warning |= (1 << WLM_WARN_BROADCAST_LARGE); + } + + /* When it's simple query, m_wlmParams.ptr is NULL */ + if (m_wlmParams.ptr) { + WLMDNodeInfo* info = (WLMDNodeInfo*)m_wlmParams.ptr; + /* Check if the broadcast size exceeds the threshold */ + if (info->geninfo.broadcastThreshold > 0 && m_broadcastSize > info->geninfo.broadcastThreshold) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("Broadcast size exceeds the threshold: BroadcastSize=%ld, ThresholdSize=%ld, PlanId=%d", + m_broadcastSize, + info->geninfo.broadcastThreshold, + m_streamNode->scan.plan.plan_node_id))); + } + + for (i = 0; i < m_connNum; i++) + sendByteStream(i); + + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::broadCastStream(TupleTableSlot* tuple, DestReceiver* self) +{ + int i; + /* assemble tuple message. 
*/ + assembleStreamMessage(tuple, self, &m_tupleBuffer); + + m_broadcastSize += m_tupleBuffer.len; + t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->broadcastSize = + Max(m_broadcastSize, t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->broadcastSize); + + if (m_broadcastSize / (1 << 20) >= WARNING_BROADCAST_SIZE) { + t_thrd.shemem_ptr_cxt.mySessionMemoryEntry->warning |= (1 << WLM_WARN_BROADCAST_LARGE); + } + + /* When it's simple query, m_wlmParams.ptr is NULL */ + if (m_wlmParams.ptr) { + WLMDNodeInfo* info = (WLMDNodeInfo*)m_wlmParams.ptr; + /* Check if the broadcast size exceeds the threshold */ + if (info->geninfo.broadcastThreshold > 0 && m_broadcastSize > info->geninfo.broadcastThreshold) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("Broadcast size exceeds the threshold: BroadcastSize=%ld, ThresholdSize=%ld, PlanId=%d", + m_broadcastSize, + info->geninfo.broadcastThreshold, + m_streamNode->scan.plan.plan_node_id))); + } + + /* + * If original Cosumer node list is not null, we need send data to target datanode + * with refs from its original exec_node lists. + * + * Only for recursive union execution + */ + if (unlikely(m_originConsumerNodeList != NIL)) { + for (i = 0; i < m_connNum; i++) { + if (!list_member_int(m_originConsumerNodeList, i)) { + continue; + } + + sendByteStream(i); + } + } else { + for (i = 0; i < m_connNum; i++) + sendByteStream(i); + } + + /* reset tuple buffer. 
*/ + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::broadCastStreamCompress(VectorBatch* batch) +{ + int i; + + Assert(m_originConsumerNodeList == NIL); + + assembleStreamBatchMessage(BCT_LZ4, batch, &m_tupleBuffer); + + for (i = 0; i < m_connNum; i++) + sendByteStream(i); + + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::roundRobinStream(TupleTableSlot* tuple, DestReceiver* self) +{ + assembleStreamMessage(tuple, self, &m_tupleBuffer); + + sendByteStream(m_roundRobinIdx); + + m_roundRobinIdx++; + m_roundRobinIdx = m_roundRobinIdx % m_connNum; + + /* reset tuple buffer. */ + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::roundRobinStream(VectorBatch* batch) +{ + roundRobinBatch(batch); +} + +template +void StreamProducer::roundRobinBatch(VectorBatch* batch) +{ + assembleStreamBatchMessage(ctype, batch, &m_tupleBuffer); + + sendByteStream(m_roundRobinIdx); + + m_roundRobinIdx++; + m_roundRobinIdx = m_roundRobinIdx % m_connNum; + + /* reset tuple buffer. 
*/ + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::hybridStream(TupleTableSlot* tuple, DestReceiver* self) +{ + StreamSkew* sskew = (StreamSkew*)m_skewState; + + assembleStreamMessage(tuple, self, &m_tupleBuffer); + + switch (sskew->chooseStreamType(tuple)) { + case STREAM_REDISTRIBUTE: { + (this->*m_channelCalFun)(tuple); + sendByteStream(m_locator[0]); + break; + } + case STREAM_BROADCAST: { + for (int i = 0; i < m_connNum; i++) + sendByteStream(i); + break; + } + case STREAM_ROUNDROBIN: { + sendByteStream(m_roundRobinIdx); + + m_roundRobinIdx++; + if (m_roundRobinIdx == m_connNum) + m_roundRobinIdx = 0; + + break; + } + case STREAM_LOCAL: { + Assert(sskew->m_localNodeId != -1); + sendByteStream(sskew->m_localNodeId); + break; + } + case STREAM_NONE: { + break; + } + default: + ereport(ERROR, (errcode(ERRCODE_UNEXPECTED_NODE_STATE), errmsg("Invalid stream type for data skew."))); + } + + resetStringInfo(&m_tupleBuffer); +} + +void StreamProducer::hybridStream(VectorBatch* batch, DestReceiver* self) +{ + StreamSkew* sskew = (StreamSkew*)m_skewState; + errno_t rc = EOK; + + rc = memset_s(m_skewMatch, sizeof(int) * BatchMaxSize, 0, sizeof(int) * BatchMaxSize); + securec_check(rc, "\0", "\0"); + + sskew->chooseVecStreamType(batch, m_skewMatch); + + if (m_channelCalVecFun != NULL) + (this->*m_channelCalVecFun)(batch); + + for (int i = 0; i < batch->m_rows; i++) { + m_tupleBuffer.cursor = 'B'; + + if (m_streamNode->jitted_serialize) { + typedef void (*serialize_func)(VectorBatch* batch, StringInfo tuplebuf, int idx); + (void)((serialize_func)(m_streamNode->jitted_serialize))(batch, &m_tupleBuffer, i); + } else { + serializeStream(batch, i); + } + + switch (m_skewMatch[i]) { + case STREAM_REDISTRIBUTE: { + sendByteStream(m_locator[i]); + break; + } + case STREAM_BROADCAST: { + for (int i = 0; i < m_connNum; i++) + sendByteStream(i); + break; + } + case STREAM_ROUNDROBIN: { + sendByteStream(m_roundRobinIdx); + m_roundRobinIdx++; + if (m_roundRobinIdx == 
m_connNum) + m_roundRobinIdx = 0; + + break; + } + case STREAM_LOCAL: { + Assert(sskew->m_localNodeId != -1); + sendByteStream(sskew->m_localNodeId); + break; + } + case STREAM_NONE: { + break; + } + default: + ereport( + ERROR, (errcode(ERRCODE_UNEXPECTED_NODE_STATE), errmsg("Invalid stream type for data skew.\n"))); + } + + resetStringInfo(&m_tupleBuffer); + } +} + +void StreamProducer::sendByteStream(int nthChannel) +{ + if (netSwitchDest(nthChannel)) { + t_thrd.int_cxt.StreamConnectionLost = false; + +#ifdef USE_ASSERT_CHECKING + AddCheckInfo(nthChannel); +#else + if (anls_opt_is_on(ANLS_STREAM_DATA_CHECK)) { + AddCheckInfo(nthChannel); + } else { + m_transport[nthChannel]->send(m_tupleBuffer.cursor, m_tupleBuffer.data, m_tupleBuffer.len); + } +#endif + + /* Stop query when cancel happend */ + if (t_thrd.int_cxt.QueryCancelPending) { + t_thrd.int_cxt.QueryCancelPending = false; + + /* Must close all connection, */ + /* otherwise error message can insert into data! */ + for (int i = 0; i < m_connNum; i++) + m_transport[i]->release(); + + u_sess->exec_cxt.executorStopFlag = true; + } + + if (t_thrd.int_cxt.StreamConnectionLost) { + t_thrd.int_cxt.StreamConnectionLost = false; + + m_transport[nthChannel]->release(); + + bool allInValid = true; + for (int i = 0; i < m_connNum; i++) { + if (m_transport[i]->isClosed() == false) { + allInValid = false; + break; + } + } + + if (allInValid) + u_sess->exec_cxt.executorStopFlag = true; + } + + netStatusSave(nthChannel); + } +} + +void StreamProducer::connectConsumer(libcomm_addrinfo** consumerAddr, int& count, int totalNum) +{ + int consumerNum = 0; + NodeDefinition* nodesDef = NULL; + int startCount = count; + errno_t rc = EOK; + int i = 0, j = 0; + CommStreamKey key = {0}; + /* only broadcast stream support parallel send mode in libcomm */ + bool parallel_send_mode = false; + + if (IS_PGXC_DATANODE && ContainRecursiveUnionSubplan(m_plan)) { + parallel_send_mode = false; + } else { + parallel_send_mode = 
(m_streamNode->type == STREAM_BROADCAST) ? true : false; + } + consumerNum = m_plan->num_nodes; + nodesDef = (NodeDefinition*)palloc0(sizeof(NodeDefinition) * consumerNum); + + for (i = 0; i < consumerNum; i++) { + rc = memcpy_s(&nodesDef[i], sizeof(NodeDefinition), &m_plan->nodesDefinition[i], sizeof(NodeDefinition)); + securec_check(rc, "\0", "\0"); + nodesDef[i].nodeid = i; + } + + key.queryId = m_key.queryId; + key.planNodeId = m_key.planNodeId; + key.producerSmpId = m_key.smpIdentifier; + + for (i = 0; i < m_streamNode->smpDesc.consumerDop; i++) { + /* The local stream's consumer number is 1. */ + for (j = 0; j < consumerNum; j++) { + Assert(count < totalNum); + int nodeNameLen = strlen(nodesDef[j].nodename.data); + int nodehostLen = strlen(nodesDef[j].nodehost.data); + consumerAddr[count] = (libcomm_addrinfo*)palloc0(sizeof(libcomm_addrinfo)); + consumerAddr[count]->host = (char*)palloc0(NAMEDATALEN); + consumerAddr[count]->ctrl_port = nodesDef[j].nodectlport; + consumerAddr[count]->listen_port = nodesDef[j].nodesctpport; + consumerAddr[count]->nodeIdx = nodesDef[j].nodeid; + rc = strncpy_s(consumerAddr[count]->host, NAMEDATALEN, nodesDef[j].nodehost.data, nodehostLen + 1); + securec_check(rc, "\0", "\0"); + rc = strncpy_s(consumerAddr[count]->nodename, NAMEDATALEN, nodesDef[j].nodename.data, nodeNameLen + 1); + securec_check(rc, "\0", "\0"); + /* use ai_next buile address info list */ + if (count > startCount) + consumerAddr[count - 1]->addr_list_next = consumerAddr[count]; + /* set flag for parallel send mode */ + consumerAddr[count]->parallel_send_mode = parallel_send_mode; + + consumerAddr[count]->streamKey.queryId = m_key.queryId; + consumerAddr[count]->streamKey.planNodeId = m_key.planNodeId; + consumerAddr[count]->streamKey.producerSmpId = m_key.smpIdentifier; + consumerAddr[count]->streamKey.consumerSmpId = i; + + count++; + } + } + + if (parallel_send_mode) { + Assert(startCount < totalNum); + /* set flag for the head of address info list */ + 
consumerAddr[startCount]->addr_list_size = count - startCount; + } + + m_transport = (StreamTransport**)MemoryContextAllocZero(m_memoryCxt, m_connNum * sizeof(StreamTransport*)); + for (i = 0; i < m_connNum; i++) { + Assert(i + startCount < totalNum); + m_transport[i] = New(m_memoryCxt) StreamCOMM(consumerAddr[i + startCount], true); + } + + pfree_ext(nodesDef); +} +#else /* * @Description: Get the destnation of producer * @@ -1964,6 +2710,7 @@ void StreamProducer::connectConsumer(libcomm_addrinfo** consumerAddr, int& count return; } #endif +#endif static int GetListConsumerNodeIdx(ExecBoundary* enBoundary, Const** values, int distLen) { @@ -2061,4 +2808,7 @@ uint2 GetTargetConsumerNodeIdx(ExecBoundary* enBoundary, Const** distValues, int return (uint2)idx; } - +void StreamProducer::setEcontext(ExprContext* econtext) +{ + m_econtext = econtext; +} \ No newline at end of file diff --git a/src/gausskernel/process/tcop/dest.cpp b/src/gausskernel/process/tcop/dest.cpp index c6366759c..efd2f6223 100644 --- a/src/gausskernel/process/tcop/dest.cpp +++ b/src/gausskernel/process/tcop/dest.cpp @@ -153,6 +153,10 @@ DestReceiver* CreateDestReceiver(CommandDest dest) case DestTupleLocalRedistribute: case DestTupleLocalRoundRobin: case DestTupleHybrid: +#ifdef USE_SPQ + case DestTupleRoundRobin: + case DestBatchRoundRobin: +#endif case DestBatchBroadCast: case DestBatchLocalBroadCast: case DestBatchRedistribute: diff --git a/src/gausskernel/process/tcop/postgres.cpp b/src/gausskernel/process/tcop/postgres.cpp index 2ccce0002..4ed813d8d 100755 --- a/src/gausskernel/process/tcop/postgres.cpp +++ b/src/gausskernel/process/tcop/postgres.cpp @@ -289,7 +289,7 @@ static void drop_unnamed_stmt(void); static void SigHupHandler(SIGNAL_ARGS); static void ForceModifyInitialPwd(const char* query_string, List* parsetree_list); static void ForceModifyExpiredPwd(const char* queryString, const List* parsetreeList); -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || 
defined(USE_SPQ) static void InitGlobalNodeDefinition(PlannedStmt* planstmt); #endif static int getSingleNodeIdx_internal(ExecNodes* exec_nodes, ParamListInfo params); @@ -646,7 +646,7 @@ int SocketBackend(StringInfo inBuf) ereport( FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; -#ifdef ENABLE_MULTIPLE_NODES /* PGXC_DATANODE */ +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) /* PGXC_DATANODE */ case 'q': /* Query ID */ case 'r': /* Plan ID with sync */ case 'M': /* Command ID */ @@ -3130,7 +3130,7 @@ static void exec_simple_query(const char* query_string, MessageType messageType, } } -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) /* * exec_plan_with_params * @@ -7305,11 +7305,13 @@ void process_postgres_switches(int argc, char* argv[], GucContext ctx, const cha (errcode(ERRCODE_SYNTAX_ERROR), errmsg("-c %s requires a value", optCtxt.optarg))); } #ifndef ENABLE_MULTIPLE_NODES +#ifndef USE_SPQ /* Only support 'internaltool' and 'application' for remotetype in single-node mode */ if (strcmp(name, "remotetype") == 0 && strcmp(value, "application") != 0 && strcmp(value, "internaltool") != 0) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("Invalid remote type:%s", value))); } +#endif #endif if (CheckReplUuid(name, value, needCheckUuid)) { gotUuidOpt = true; @@ -7473,7 +7475,7 @@ void ReloadPoolerWithoutTransaction() } } -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) /* * @Description: Initialize or refresh global node definition * @@ -8799,6 +8801,9 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam t_thrd.postgres_cxt.whereToSendOutput = saved_whereToSendOutput; firstchar = ReadCommand(&input_message); +#ifdef USE_SPQ + t_thrd.spq_ctx.spq_role = ROLE_UTILITY; +#endif /* update our elapsed time statistics. 
*/ timeInfoRecordStart(); _local_tmp_opt1.enter(); @@ -8906,13 +8911,14 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam _local_tmp_opt1.exit(); switch (firstchar) { -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) case 'Z': // exeute plan directly. { char* plan_string = NULL; PlannedStmt* planstmt = NULL; int oLen_msg = 0; int cLen_msg = 0; + t_thrd.spq_ctx.spq_role = ROLE_QUERY_EXECUTOR; /* Set top consumer at the very beginning. */ StreamTopConsumerIam(); @@ -8958,6 +8964,8 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam pfree(plan_string); InitGlobalNodeDefinition(planstmt); + t_thrd.spq_ctx.spq_session_id = planstmt->spq_session_id; + t_thrd.spq_ctx.current_id = planstmt->current_id; statement_init_metric_context(); exec_simple_plan(planstmt); @@ -9090,7 +9098,7 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam u_sess->debug_query_id = 0; send_ready_for_query = true; } break; -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) case 'O': /* In pooler stateless resue mode reset connection params */ { const char* query_string = NULL; @@ -9361,7 +9369,7 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam } break; case 'S': { - rc = memcpy_s(&u_sess->globalSessionId.sessionId, sizeof(uint64), + errno_t rc = memcpy_s(&u_sess->globalSessionId.sessionId, sizeof(uint64), pq_getmsgbytes(&input_message, sizeof(uint64)), sizeof(uint64)); securec_check(rc, "\0", "\0"); u_sess->globalSessionId.nodeId = (uint32)pq_getmsgint(&input_message, sizeof(uint32)); @@ -9891,7 +9899,7 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam * is still sending data. 
*/ break; -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) case 'M': /* Command ID */ { CommandId cid = (CommandId)pq_getmsgint(&input_message, 4); @@ -9923,17 +9931,19 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam case 'r': /* query id with sync */ { +#ifndef USE_SPQ /* We only process 'r' message on PGCX_DATANODE. */ if (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type '%c'.", firstchar))); +#endif /* Set top consumer at the very beginning. */ StreamTopConsumerIam(); /* Set the query id we were passed down */ - rc = memcpy_s(&u_sess->debug_query_id, + errno_t rc = memcpy_s(&u_sess->debug_query_id, sizeof(uint64), pq_getmsgbytes(&input_message, sizeof(uint64)), sizeof(uint64)); @@ -9952,7 +9962,8 @@ int PostgresMain(int argc, char* argv[], const char* dbname, const char* usernam pq_putemptymessage('O'); /* PlanIdComplete */ pq_flush(); } break; - +#endif +#ifdef ENABLE_MULTIPLE_NODES case 'g': /* gxid */ { errno_t rc = EOK; diff --git a/src/gausskernel/process/tcop/pquery.cpp b/src/gausskernel/process/tcop/pquery.cpp index 9827b9f40..435865134 100644 --- a/src/gausskernel/process/tcop/pquery.cpp +++ b/src/gausskernel/process/tcop/pquery.cpp @@ -909,10 +909,12 @@ void PortalSetResultFormat(Portal portal, int nFormats, int16* formats) int i; #ifndef ENABLE_MULTIPLE_NODES +#ifndef USE_SPQ if (StreamTopConsumerAmI()) { portal->streamInfo.RecordSessionInfo(); u_sess->stream_cxt.global_obj->m_portal = portal; } +#endif #endif /* Do nothing if portal won't return tuples */ @@ -1071,6 +1073,13 @@ bool PortalRun( increase_rp_number(); } +#ifdef USE_SPQ + if (!IS_SPQ_RUNNING && queryDesc != NULL && (queryDesc->plannedstmt) != NULL && + queryDesc->plannedstmt->is_spq_optmized) { + t_thrd.spq_ctx.spq_role = ROLE_QUERY_COORDINTOR; + } +#endif /* USE_SPQ */ + /* * Set up global portal context pointers. 
* @@ -1419,7 +1428,8 @@ static uint64 PortalRunSelect(Portal portal, bool forward, long count, DestRecei * <> means that * we are on DWS CN. */ - if (IS_PGXC_COORDINATOR && !StreamTopConsumerAmI() && queryDesc->plannedstmt->has_obsrel && + if ((IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) && + !StreamTopConsumerAmI() && queryDesc->plannedstmt->has_obsrel && u_sess->instr_cxt.obs_instr) { u_sess->instr_cxt.obs_instr->insertData(queryDesc->plannedstmt->queryId); } @@ -2304,4 +2314,4 @@ static void DoPortalRewind(Portal portal) portal->atEnd = false; portal->portalPos = 0; portal->posOverflow = false; -} +} \ No newline at end of file diff --git a/src/gausskernel/process/threadpool/knl_session.cpp b/src/gausskernel/process/threadpool/knl_session.cpp index 5ee8e2a32..6369903b2 100755 --- a/src/gausskernel/process/threadpool/knl_session.cpp +++ b/src/gausskernel/process/threadpool/knl_session.cpp @@ -1383,6 +1383,30 @@ void knl_u_mot_init(knl_u_mot_context* mot_cxt) } #endif +#ifdef USE_SPQ +static void knl_u_spq_init(knl_u_spq_context* spq_cxt) +{ + Assert(spq_cxt != NULL); + spq_cxt->dxl_memory_manager = NULL; + spq_cxt->pmpXerces = NULL; + spq_cxt->pmpDXL = NULL; + spq_cxt->m_ulpInitDXL = 0; + spq_cxt->m_ulpShutdownDXL = 0; + spq_cxt->m_pstrmap = NULL; + spq_cxt->m_pxmlszmap = NULL; + spq_cxt->m_mp = NULL; + spq_cxt->m_memory_pool_mgr = NULL; + spq_cxt->m_worker_pool_manager = NULL; + spq_cxt->m_pcache = NULL; + spq_cxt->m_ullCacheQuota = 0; + spq_cxt->spq_node_all_configs_size = 0; + spq_cxt->spq_node_configs_size = 0; + spq_cxt->spq_worker_context = NULL; + spq_cxt->spq_max_tuple_chunk_size = 0; + spq_cxt->s_tupSerMemCtxt = NULL; +} +#endif + static void knl_u_clientConnTime_init(knl_u_clientConnTime_context* clientConnTime_cxt) { Assert(clientConnTime_cxt != NULL); @@ -1490,6 +1514,9 @@ void knl_session_init(knl_session_context* sess_cxt) knl_u_ledger_init(&sess_cxt->ledger_cxt); #ifdef ENABLE_MOT knl_u_mot_init(&sess_cxt->mot_cxt); +#endif +#ifdef USE_SPQ + 
knl_u_spq_init(&sess_cxt->spq_cxt); #endif knl_u_libsw_init(&sess_cxt->libsw_cxt); KnlURepOriginInit(&sess_cxt->reporigin_cxt); diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp b/src/gausskernel/process/threadpool/knl_thread.cpp index 8ef2161e5..7b5f3355e 100755 --- a/src/gausskernel/process/threadpool/knl_thread.cpp +++ b/src/gausskernel/process/threadpool/knl_thread.cpp @@ -1797,6 +1797,13 @@ void KnlLscContextInit(knl_t_lsc_context *lsc_cxt) ResourceOwnerCreate(NULL, "InitLocalSysCache", THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_DEFAULT)); } +#ifdef USE_SPQ +static void knlTSPQCxtInit(knl_t_spq_context *spqCxt) +{ + spqCxt->spq_role = ROLE_UTILITY; +} +#endif + void knl_thread_init(knl_thread_role role) { t_thrd.role = role; @@ -1936,6 +1943,9 @@ void knl_thread_init(knl_thread_role role) KnlDcfContextInit(&t_thrd.dcf_cxt); knl_t_page_compression_init(&t_thrd.page_compression_cxt); knl_t_rc_init(&t_thrd.rc_cxt); +#ifdef USE_SPQ + knlTSPQCxtInit(&t_thrd.spq_ctx); +#endif } __attribute__ ((__used__)) knl_thrd_context *GetCurrentThread() diff --git a/src/gausskernel/runtime/executor/Makefile b/src/gausskernel/runtime/executor/Makefile index 386a34d9d..4c3ad668d 100644 --- a/src/gausskernel/runtime/executor/Makefile +++ b/src/gausskernel/runtime/executor/Makefile @@ -49,7 +49,8 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o \ nodePartIterator.o nodeStub.o execClusterResize.o lightProxy.o execMerge.o \ nodeExtensible.o route.o nodeTrainModel.o db4ai_common.o spiDbesql.o \ - nodeProjectSet.o nodeSortGroup.o + nodeProjectSet.o nodeSortGroup.o nodeAssertOp.o nodeSequence.o \ + nodeShareInputScan.o nodeSpqSeqscan.o override CPPFLAGS += -D__STDC_FORMAT_MACROS diff --git a/src/gausskernel/runtime/executor/execAmi.cpp b/src/gausskernel/runtime/executor/execAmi.cpp index 2d6d52b84..dc033b704 100755 --- a/src/gausskernel/runtime/executor/execAmi.cpp +++ 
b/src/gausskernel/runtime/executor/execAmi.cpp @@ -64,6 +64,12 @@ #ifdef PGXC #include "pgxc/execRemote.h" #endif +#ifdef USE_SPQ +#include "executor/node/nodeSpqSeqscan.h" +#include "executor/node/nodeAssertOp.h" +#include "executor/node/nodeShareInputScan.h" +#include "executor/node/nodeSequence.h" +#endif static bool target_list_supports_backward_scan(List* targetlist); static bool index_supports_backward_scan(Oid indexid); @@ -142,6 +148,28 @@ void ExecReScanByType(PlanState* node) ExecReScanSeqScan((SeqScanState*)node); break; +#ifdef USE_SPQ + case T_SpqSeqScanState: + if (spqscan_rescan_hook) { + spqscan_rescan_hook((SpqSeqScanState*)node); + } else { + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("spqscan hook spqscan_rescan_hook uninited."))); + } + break; + + case T_AssertOpState: + ExecReScanAssertOp((AssertOpState *) node); + break; + + case T_ShareInputScanState: + ExecReScanShareInputScan((ShareInputScanState *) node); + break; + + case T_SequenceState: + ExecReScanSequence((SequenceState *) node); + break; +#endif + case T_IndexScanState: ExecReScanIndexScan((IndexScanState*)node); break; @@ -365,6 +393,12 @@ void ExecMarkPos(PlanState* node) ExecSeqMarkPos((SeqScanState*)node); break; +#ifdef USE_SPQ + case T_SpqSeqScanState: + ExecSpqSeqMarkPos((SpqSeqScanState*)node); + break; +#endif + case T_IndexScanState: ExecIndexMarkPos((IndexScanState*)node); break; @@ -420,6 +454,12 @@ void ExecRestrPos(PlanState* node) ExecSeqRestrPos((SeqScanState*)node); break; +#ifdef USE_SPQ + case T_SpqSeqScanState: + ExecSpqSeqRestrPos((SpqSeqScanState*)node); + break; +#endif + case T_IndexScanState: ExecIndexRestrPos((IndexScanState*)node); break; @@ -480,6 +520,10 @@ bool ExecSupportsMarkRestore(Path *pathnode) case T_Sort: return true; +#ifdef USE_SPQ + case T_ShareInputScan: +#endif + case T_BaseResult: /* @@ -565,6 +609,11 @@ bool ExecSupportsBackwardScan(Plan* node) case T_ExtensiblePlan: return ((ExtensiblePlan *)node)->flags &
EXTENSIBLEPATH_SUPPORT_BACKWARD_SCAN; +#ifdef USE_SPQ + case T_ShareInputScan: + return true; +#endif + case T_Material: case T_Sort: /* these don't evaluate tlist */ @@ -648,6 +697,9 @@ bool ExecMaterializesOutput(NodeTag plantype) case T_CteScan: case T_WorkTableScan: case T_Sort: +#ifdef USE_SPQ + case T_ShareInputScan: +#endif return true; default: diff --git a/src/gausskernel/runtime/executor/execMain.cpp b/src/gausskernel/runtime/executor/execMain.cpp index 63db5fc48..f733fc4d5 100755 --- a/src/gausskernel/runtime/executor/execMain.cpp +++ b/src/gausskernel/runtime/executor/execMain.cpp @@ -274,7 +274,9 @@ void standard_ExecutorStart(QueryDesc *queryDesc, int eflags) } #ifndef ENABLE_MULTIPLE_NODES - (void)InitStreamObject(queryDesc->plannedstmt); + if (!IS_SPQ_COORDINATOR) { + (void)InitStreamObject(queryDesc->plannedstmt); + } #endif if (StreamTopConsumerAmI() && queryDesc->instrument_options != 0 && IS_PGXC_DATANODE) { @@ -320,6 +322,10 @@ void standard_ExecutorStart(QueryDesc *queryDesc, int eflags) (ParamExecData *)palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData)); } +#ifdef USE_SPQ + estate->es_sharenode = nullptr; +#endif + /* * If non-read-only query, set the command ID to mark output tuples with */ @@ -1500,7 +1506,8 @@ void InitPlan(QueryDesc *queryDesc, int eflags) #ifdef ENABLE_MULTIPLE_NODES (IS_PGXC_COORDINATOR && list_nth_int(plannedstmt->subplan_ids, i - 1) != 0) || #else - (StreamTopConsumerAmI() && list_nth_int(plannedstmt->subplan_ids, i - 1) != 0) || + (!IS_SPQ_RUNNING && StreamTopConsumerAmI() && list_nth_int(plannedstmt->subplan_ids, i - 1) != 0) || + (IS_SPQ_COORDINATOR && list_nth_int(plannedstmt->subplan_ids, i - 1) != 0) || #endif plannedstmt->planTree->plan_node_id == list_nth_int(plannedstmt->subplan_ids, i - 1))) { estate->es_under_subplan = true; @@ -2192,7 +2199,7 @@ static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation, */ estate->es_direction = direction; - if (IS_PGXC_DATANODE) { + 
if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { /* Collect Material for Subplan first */ ExecCollectMaterialForSubplan(estate); @@ -2297,8 +2304,8 @@ static void ExecutePlan(EState *estate, PlanState *planstate, CmdType operation, slot = ExecFilterJunk(estate->es_junkFilter, slot); } -#ifdef ENABLE_MULTIPLE_NDOES - if (stream_instrument) { +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY && stream_instrument) { t_thrd.pgxc_cxt.GlobalNetInstr = planstate->instrument; } #endif diff --git a/src/gausskernel/runtime/executor/execProcnode.cpp b/src/gausskernel/runtime/executor/execProcnode.cpp index ec44a8ace..7e94c8717 100755 --- a/src/gausskernel/runtime/executor/execProcnode.cpp +++ b/src/gausskernel/runtime/executor/execProcnode.cpp @@ -160,6 +160,12 @@ #include "gstrace/gstrace_infra.h" #include "gstrace/executer_gstrace.h" #include "executor/node/nodeTrainModel.h" +#ifdef USE_SPQ +#include "executor/node/nodeSpqSeqscan.h" +#include "executor/node/nodeAssertOp.h" +#include "executor/node/nodeShareInputScan.h" +#include "executor/node/nodeSequence.h" +#endif #define NODENAMELEN 64 static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); @@ -253,6 +259,9 @@ static inline bool BmHeapScanNodeIsStub(BitmapHeapScanState* bm_heap_scan) PlanState* ExecInitNodeByType(Plan* node, EState* estate, int eflags) { switch (nodeTag(node)) { +#ifdef USE_SPQ + case T_Result: +#endif case T_BaseResult: return (PlanState*)ExecInitResult((BaseResult*)node, estate, eflags); case T_ProjectSet: @@ -273,6 +282,20 @@ PlanState* ExecInitNodeByType(Plan* node, EState* estate, int eflags) return (PlanState*)ExecInitBitmapOr((BitmapOr*)node, estate, eflags); case T_SeqScan: return (PlanState*)ExecInitSeqScan((SeqScan*)node, estate, eflags); +#ifdef USE_SPQ + case T_SpqSeqScan: + if (init_spqscan_hook) { + return (PlanState*)init_spqscan_hook((SpqSeqScan*)node, estate, eflags); + } else
{ + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("spqscan hook init_spqscan_hook uninited."))); + } + case T_AssertOp: + return (PlanState *) ExecInitAssertOp((AssertOp *) node, estate, eflags); + case T_ShareInputScan: + return (PlanState *)ExecInitShareInputScan((ShareInputScan *)node, estate, eflags); + case T_Sequence: + return (PlanState *)ExecInitSequence((Sequence *)node, estate, eflags); +#endif case T_IndexScan: return (PlanState*)ExecInitIndexScan((IndexScan*)node, estate, eflags); case T_IndexOnlyScan: @@ -556,7 +579,23 @@ PlanState* ExecInitNode(Plan* node, EState* estate, int e_flags) result->instrument = InstrAlloc(1, estate->es_instrument); } #else - if (u_sess->instr_cxt.global_instr != NULL && u_sess->instr_cxt.thread_instr && node->plan_node_id > 0 && + if (IS_SPQ_RUNNING) { + if (u_sess->instr_cxt.global_instr != NULL && u_sess->instr_cxt.thread_instr && node->plan_node_id > 0 && + IS_SPQ_COORDINATOR && StreamTopConsumerAmI()) { + /* on compute pool */ + result->instrument = u_sess->instr_cxt.thread_instr->allocInstrSlot( + node->plan_node_id, node->parent_node_id, result->plan, estate); + } else if (u_sess->instr_cxt.global_instr != NULL && u_sess->instr_cxt.thread_instr && node->plan_node_id > 0 && + (IS_SPQ_EXECUTOR || + IS_SPQ_COORDINATOR && node->exec_type == EXEC_ON_COORDS)) { + /* plannode(exec on cn)or dn */ + result->instrument = u_sess->instr_cxt.thread_instr->allocInstrSlot( + node->plan_node_id, node->parent_node_id, result->plan, estate); + } else { + result->instrument = InstrAlloc(1, estate->es_instrument); + } + } + else if (u_sess->instr_cxt.global_instr != NULL && u_sess->instr_cxt.thread_instr && node->plan_node_id > 0 && (!StreamTopConsumerAmI() || u_sess->instr_cxt.global_instr->get_planIdOffsetArray()[node->plan_node_id - 1] == 0)) { result->instrument = u_sess->instr_cxt.thread_instr->allocInstrSlot( @@ -1047,6 +1086,27 @@ static void ExecEndNodeByType(PlanState* node) case T_SeqScanState: 
ExecEndSeqScan((SeqScanState*)node); break; +#ifdef USE_SPQ + case T_SpqSeqScanState: + if (end_spqscan_hook) { + end_spqscan_hook((SpqSeqScanState*)node); + } else { + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("spqscan hook end_spqscan_hook uninited."))); + } + break; + + case T_AssertOpState: + ExecEndAssertOp((AssertOpState *) node); + break; + + case T_ShareInputScanState: + ExecEndShareInputScan((ShareInputScanState *) node); + break; + + case T_SequenceState: + ExecEndSequence((SequenceState *) node); + break; +#endif case T_CStoreScanState: ExecEndCStoreScan((CStoreScanState*)node, false); break; diff --git a/src/gausskernel/runtime/executor/execQual.cpp b/src/gausskernel/runtime/executor/execQual.cpp index 25edc6283..032f0395f 100644 --- a/src/gausskernel/runtime/executor/execQual.cpp +++ b/src/gausskernel/runtime/executor/execQual.cpp @@ -1978,7 +1978,7 @@ static TupleDesc get_cached_rowtype(Oid type_id, int32 typmod, TupleDesc* cache_ /* * Callback function to release a tupdesc refcount at expression tree shutdown */ -static void ShutdownTupleDescRef(Datum arg) +void ShutdownTupleDescRef(Datum arg) { TupleDesc* cache_field = (TupleDesc*)DatumGetPointer(arg); @@ -7232,3 +7232,31 @@ void ExecCopyDataToDatum(PLpgSQL_datum** datums, int dno, Cursor_Data* source_cu cursor_var->value = Int32GetDatum(source_cursor->row_count); cursor_var->isnull = source_cursor->null_open; } + +#ifdef USE_SPQ +bool IsJoinExprNull(List *joinExpr, ExprContext *econtext) +{ + ListCell *lc; + bool joinkeys_null = true; + + Assert(joinExpr != nullptr); + + foreach(lc, joinExpr) { + ExprState *keyexpr = (ExprState *) lfirst(lc); + bool isNull = false; + + /* + * Evaluate the current join attribute value of the tuple + */ + ExecEvalExpr(keyexpr, econtext, &isNull, NULL); + + if (!isNull) { + /* Found at least one non-null join expression, we're done */ + joinkeys_null = false; + break; + } + } + + return joinkeys_null; +} +#endif diff --git
a/src/gausskernel/runtime/executor/execTuples.cpp b/src/gausskernel/runtime/executor/execTuples.cpp index c55f82625..5534048c6 100644 --- a/src/gausskernel/runtime/executor/execTuples.cpp +++ b/src/gausskernel/runtime/executor/execTuples.cpp @@ -442,7 +442,7 @@ TupleTableSlot* ExecClearTuple(TupleTableSlot* slot) /* return: slot passed slot slot->tts_flags &= ~TTS_FLAG_SHOULDFREE; slot->tts_flags &= ~TTS_FLAG_SHOULDFREEMIN; -#ifdef ENABLE_MULTIPLE_NODES +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) if (TTS_SHOULDFREE_ROW(slot)) { pfree_ext(slot->tts_dataRow); } diff --git a/src/gausskernel/runtime/executor/instrument.cpp b/src/gausskernel/runtime/executor/instrument.cpp index f912091a5..aec6b1ab7 100644 --- a/src/gausskernel/runtime/executor/instrument.cpp +++ b/src/gausskernel/runtime/executor/instrument.cpp @@ -483,20 +483,22 @@ Instrumentation* InstrAlloc(int n, int instrument_options) void InstrStartNode(Instrumentation* instr) { - if ( -#ifndef ENABLE_MULTIPLE_NODES - !u_sess->attr.attr_common.enable_seqscan_fusion && +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) + if (!u_sess->attr.attr_common.enable_seqscan_fusion && !instr->first_time) { +#else + if ((t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !u_sess->attr.attr_common.enable_seqscan_fusion && !instr->first_time) + || (t_thrd.spq_ctx.spq_role != ROLE_UTILITY && !instr->first_time)) { #endif - !instr->first_time) { instr->enter_time = GetCurrentTimestamp(); instr->first_time = true; } - if ( -#ifndef ENABLE_MULTIPLE_NODES - !u_sess->attr.attr_common.enable_seqscan_fusion && +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) + if (!u_sess->attr.attr_common.enable_seqscan_fusion && !instr->first_time) { +#else + if ((t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !u_sess->attr.attr_common.enable_seqscan_fusion && !instr->first_time) + || (t_thrd.spq_ctx.spq_role != ROLE_UTILITY && !instr->first_time)) { #endif - !instr->first_time) { CPUUsageGetCurrent(&instr->cpuusage_start); } @@ 
-526,7 +528,7 @@ void AddControlMemoryContext(Instrumentation* instr, MemoryContext context) return; MemoryContext old_context = NULL; - if (IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) { Assert(u_sess->instr_cxt.global_instr->getInstrDataContext() != NULL); old_context = MemoryContextSwitchTo(u_sess->instr_cxt.global_instr->getInstrDataContext()); } else { @@ -563,8 +565,11 @@ void InstrStopNode(Instrumentation* instr, double n_tuples, bool containMemory) INSTR_TIME_SET_ZERO(instr->starttime); } -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) if (!u_sess->attr.attr_common.enable_seqscan_fusion) +#else + if ((t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !u_sess->attr.attr_common.enable_seqscan_fusion) + || t_thrd.spq_ctx.spq_role != ROLE_UTILITY) #endif CPUUsageGetCurrent(&cpu_usage); @@ -573,8 +578,11 @@ void InstrStopNode(Instrumentation* instr, double n_tuples, bool containMemory) BufferUsageAccumDiff(&instr->bufusage, u_sess->instr_cxt.pg_buffer_usage, &instr->bufusage_start); } -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) if (!u_sess->attr.attr_common.enable_seqscan_fusion) +#else + if ((t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !u_sess->attr.attr_common.enable_seqscan_fusion) + || t_thrd.spq_ctx.spq_role != ROLE_UTILITY) #endif CPUUsageAccumDiff(&instr->cpuusage, &cpu_usage, &instr->cpuusage_start); @@ -584,11 +592,12 @@ void InstrStopNode(Instrumentation* instr, double n_tuples, bool containMemory) instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); } - if ( -#ifndef ENABLE_MULTIPLE_NODES - !u_sess->attr.attr_common.enable_seqscan_fusion && +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) + if (!u_sess->attr.attr_common.enable_seqscan_fusion && containMemory) { +#else + if ((t_thrd.spq_ctx.spq_role == ROLE_UTILITY && !u_sess->attr.attr_common.enable_seqscan_fusion && containMemory) + || (t_thrd.spq_ctx.spq_role != ROLE_UTILITY && 
containMemory)) { #endif - containMemory) { int64 memory_size = 0; int64 control_memory_size = 0; /* calculate the memory context size of this Node */ @@ -863,7 +872,8 @@ Instrumentation* ThreadInstrumentation::allocInstrSlot(int plan_node_id, int par * if allocInstrSlot exec on CN or on compute pool, switch context to m_instrDataContext * else switch context to streamRuntimeContext */ - if (IS_PGXC_COORDINATOR && u_sess->instr_cxt.global_instr) + if ((IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) && + u_sess->instr_cxt.global_instr) tmp_context = u_sess->instr_cxt.global_instr->getInstrDataContext(); else tmp_context = MemoryContextOriginal((char*)u_sess->instr_cxt.global_instr); @@ -994,6 +1004,36 @@ Instrumentation* ThreadInstrumentation::allocInstrSlot(int plan_node_id, int par break; } break; +#ifdef USE_SPQ + case T_SpqSeqScan: + if (!((Scan*)plan)->tablesample) { + if (((Scan*)plan)->isPartTbl) { + pname = "Partitioned Seq Scan"; + } else { + pname = "Spq Seq Scan"; + } + } else { + if (((Scan*)plan)->isPartTbl) { + pname = "Partitioned Sample Scan"; + } else { + pname = "Spq Sample Scan"; + } + } + plan_type = IO_OP; + break; + case T_AssertOp: + pname = "Assert"; + plan_type = UTILITY_OP; + break; + case T_ShareInputScan: + pname = "ShareInputScan"; + plan_type = UTILITY_OP; + break; + case T_Sequence: + pname = "Sequence"; + plan_type = UTILITY_OP; + break; +#endif case T_SeqScan: if (!((Scan*)plan)->tablesample) { if (((Scan*)plan)->isPartTbl) { @@ -1505,7 +1545,7 @@ StreamInstrumentation::StreamInstrumentation(int size, int num_streams, int gath { m_query_id = u_sess->debug_query_id; MemoryContext oldcontext = NULL; - if (IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR){ m_instrDataContext = AllocSetContextCreate(CurrentMemoryContext, "InstrDataContext", ALLOCSET_DEFAULT_MINSIZE, @@ -1516,21 +1556,26 @@ StreamInstrumentation::StreamInstrumentation(int size, int num_streams, int gath m_instrDataContext = NULL; } - if 
(IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR){ /* adopt for a lot gather operator */ m_threadInstrArrayLen = m_nodes_num * ((m_num_streams + m_gather_count) * m_query_dop) + 1; /* including thr top consumer list and cn */ m_streamInfo = (ThreadInstrInfo*)palloc0(sizeof(ThreadInstrInfo) * (m_num_streams + m_gather_count + 1)); } else { -#ifdef ENABLE_MULTIPLE_NODES - /* - * in DN, m_gather_count is 1 in general(gather operator) - * in compute pool, m_gather_count = 3 actually. - */ - m_threadInstrArrayLen = (m_num_streams + m_gather_count) * m_query_dop; - /* including the top consumer list. */ - m_streamInfo = (ThreadInstrInfo*)palloc0(sizeof(ThreadInstrInfo) * (m_num_streams + m_gather_count)); +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + /* + * in DN, m_gather_count is 1 in general(gather operator) + * in compute pool, m_gather_count = 3 actually. + */ + m_threadInstrArrayLen = (m_num_streams + m_gather_count) * m_query_dop; + /* including the top consumer list. 
*/ + m_streamInfo = (ThreadInstrInfo*)palloc0(sizeof(ThreadInstrInfo) * (m_num_streams + m_gather_count)); + } else { + m_threadInstrArrayLen = (m_num_streams + m_gather_count + 1) * m_query_dop; + m_streamInfo = (ThreadInstrInfo*)palloc0(sizeof(ThreadInstrInfo) * (m_num_streams + m_gather_count + 1)); + } #else /* single node need a lot gather operator */ m_threadInstrArrayLen = (m_num_streams + m_gather_count + 1) * m_query_dop; @@ -1557,7 +1602,7 @@ StreamInstrumentation::StreamInstrumentation(int size, int num_streams, int gath m_threadInstrArray[i] = NULL; } - if (IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) { MemoryContextSwitchTo(oldcontext); } @@ -1570,7 +1615,7 @@ StreamInstrumentation::~StreamInstrumentation() if (CPUMon::m_has_perf) CPUMon::Shutdown(); - if (IS_PGXC_COORDINATOR) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) { Assert(m_instrDataContext != NULL); if (m_instrDataContext) { @@ -1814,6 +1859,17 @@ void StreamInstrumentation::getStreamInfo( getStreamInfo(plan, planned_stmt, dop, info, offset); } } break; +#ifdef USE_SPQ + case T_Sequence: { + Sequence* sequence = (Sequence*)result_plan; + ListCell* lc = NULL; + foreach (lc, sequence->subplans) { + Plan* plan = (Plan*)lfirst(lc); + getStreamInfo(plan, planned_stmt, dop, info, offset); + } + break; + } +#endif /* USE_SPQ */ default: if (result_plan->lefttree) @@ -1981,6 +2037,7 @@ void StreamInstrumentation::deserialize(int idx, char* msg, size_t len, bool ope query_id = ntohl64(query_id); msg += 8; +#ifndef USE_SPQ if (!operator_statitics) { Assert(query_id == (uint64)u_sess->debug_query_id); if (query_id != (uint64)u_sess->debug_query_id) { @@ -1991,6 +2048,7 @@ void StreamInstrumentation::deserialize(int idx, char* msg, size_t len, bool ope query_id))); } } +#endif rc = memcpy_s(&node_id, sizeof(int), msg, sizeof(int)); securec_check(rc, "\0", "\0"); @@ -2022,7 +2080,7 @@ void StreamInstrumentation::deserialize(int idx, char* msg, size_t len, bool ope int 
dn_num_streams = DN_NUM_STREAMS_IN_CN(m_num_streams, m_gather_count, m_query_dop); int slot = 1 + idx * dn_num_streams + offset + smp_id; /* adopt for compute pool */ - if (IS_PGXC_DATANODE) { + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { int plan_id_offset = m_planIdOffsetArray[node_id - 1] == 0 ? -1 : (m_planIdOffsetArray[node_id - 1] * m_query_dop); slot = plan_id_offset + smp_id; @@ -2030,7 +2088,7 @@ void StreamInstrumentation::deserialize(int idx, char* msg, size_t len, bool ope /* allocate threadinstrumentation in CN if receive from DN. */ if (m_threadInstrArray[slot] == NULL) { - if (IS_PGXC_DATANODE) + if (!IS_SPQ_EXECUTOR || IS_PGXC_DATANODE) tmp_context = MemoryContextOriginal((char*)u_sess->instr_cxt.global_instr); else tmp_context = u_sess->instr_cxt.global_instr->getInstrDataContext(); @@ -2193,7 +2251,7 @@ void StreamInstrumentation::deserializeTrack(int idx, char* msg, size_t len) int dn_num_streams = DN_NUM_STREAMS_IN_CN(m_num_streams, m_gather_count, m_query_dop); int slot = 1 + idx * dn_num_streams + offset + smp_id; /* adopt for compute pool */ - if (IS_PGXC_DATANODE) { + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { offset = m_planIdOffsetArray[segment_id - 1] == 0 ? 
-1 : (m_planIdOffsetArray[segment_id - 1] * m_query_dop); slot = offset; } @@ -2203,7 +2261,7 @@ void StreamInstrumentation::deserializeTrack(int idx, char* msg, size_t len) } if (m_threadInstrArray[slot] == NULL) { - if (IS_PGXC_DATANODE) + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) tmp_context = MemoryContextOriginal((char*)u_sess->instr_cxt.global_instr); else tmp_context = u_sess->instr_cxt.global_instr->getInstrDataContext(); @@ -2372,22 +2430,26 @@ void StreamInstrumentation::TrackEndTime(int plan_node_id, int track_id) /* run in CN only m_planIdOffsetArray[planNodeId-1] > 0 , planNodeId RUN IN DN */ bool StreamInstrumentation::isFromDataNode(int plan_node_id) { -#ifdef ENABLE_MULTIPLE_NODES - if (u_sess->instr_cxt.global_instr && m_planIdOffsetArray[plan_node_id - 1] > 0) { - int num_streams = u_sess->instr_cxt.global_instr->getInstruThreadNum(); - int query_dop = u_sess->instr_cxt.global_instr->get_query_dop(); - int dn_num_threads = DN_NUM_STREAMS_IN_CN(num_streams, m_gather_count, query_dop); - int offset = (m_planIdOffsetArray[plan_node_id - 1] - 1) * query_dop; +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + if (u_sess->instr_cxt.global_instr && m_planIdOffsetArray[plan_node_id - 1] > 0) { + int num_streams = u_sess->instr_cxt.global_instr->getInstruThreadNum(); + int query_dop = u_sess->instr_cxt.global_instr->get_query_dop(); + int dn_num_threads = DN_NUM_STREAMS_IN_CN(num_streams, m_gather_count, query_dop); + int offset = (m_planIdOffsetArray[plan_node_id - 1] - 1) * query_dop; - for (int i = 0; i < u_sess->instr_cxt.global_instr->getInstruNodeNum(); i++) { - /* avoid for activesql */ - ThreadInstrumentation* thread_instr = m_threadInstrArray[1 + i * dn_num_threads + offset]; - if (thread_instr != NULL && thread_instr->m_instrArrayMap[plan_node_id - 1] != -1) - return true; + for (int i = 0; i < u_sess->instr_cxt.global_instr->getInstruNodeNum(); i++) { + /* avoid for activesql */ + 
ThreadInstrumentation* thread_instr = m_threadInstrArray[1 + i * dn_num_threads + offset]; + if (thread_instr != NULL && thread_instr->m_instrArrayMap[plan_node_id - 1] != -1) + return true; + } + return false; } return false; + } else { + return true; } - return false; #else return true; #endif @@ -2396,7 +2458,7 @@ bool StreamInstrumentation::isFromDataNode(int plan_node_id) /* set NetWork in DN */ void StreamInstrumentation::SetNetWork(int plan_node_id, int64 buf_len) { - if (IS_PGXC_DATANODE) { + if (!IS_SPQ_EXECUTOR || IS_PGXC_DATANODE){ ThreadInstrumentation* thread_instr = m_threadInstrArray[m_planIdOffsetArray[plan_node_id - 1] * m_query_dop + u_sess->stream_cxt.smp_id]; @@ -2523,7 +2585,7 @@ void StreamInstrumentation::aggregate(int plannode_num) */ void StreamInstrumentation::SetStreamSend(int plan_node_id, bool send) { - if (IS_PGXC_DATANODE) { + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { ThreadInstrumentation* thread_instr = m_threadInstrArray[m_planIdOffsetArray[plan_node_id - 1] * m_query_dop + u_sess->stream_cxt.smp_id]; @@ -2787,7 +2849,8 @@ void OBSInstrumentation::serializeSend() { ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__))); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) return; StringInfoData buf; @@ -2795,7 +2858,8 @@ void OBSInstrumentation::serializeSend() LWLockAcquire(OBSRuntimeLock, LW_EXCLUSIVE); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { ereport(DEBUG1, (errmodule(MOD_ACCELERATE), errmsg("u_sess->instr_cxt.obs_instr is deleted in top consumer thread."))); LWLockRelease(OBSRuntimeLock); @@ -2821,7 +2885,8 @@ void 
OBSInstrumentation::deserialize(char* msg, size_t len) { ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__))); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) return; errno_t rc = EOK; @@ -2832,7 +2897,8 @@ void OBSInstrumentation::deserialize(char* msg, size_t len) LWLockAcquire(OBSRuntimeLock, LW_EXCLUSIVE); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { ereport(DEBUG1, (errmodule(MOD_ACCELERATE), errmsg("u_sess->instr_cxt.obs_instr is deleted in top consumer thread."))); LWLockRelease(OBSRuntimeLock); @@ -2894,7 +2960,8 @@ void OBSInstrumentation::save(const char* relname, int file_scanned, int64 data_ { ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__))); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && u_sess->instr_cxt.p_OBS_instr_valid == NULL) return; errno_t rc = EOK; @@ -2905,7 +2972,8 @@ void OBSInstrumentation::save(const char* relname, int file_scanned, int64 data_ LWLockAcquire(OBSRuntimeLock, LW_EXCLUSIVE); - if (IS_PGXC_DATANODE && !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { + if ((!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) && + !StreamTopConsumerAmI() && *u_sess->instr_cxt.p_OBS_instr_valid == false) { ereport(DEBUG1, (errmodule(MOD_ACCELERATE), errmsg("u_sess->instr_cxt.obs_instr is deleted in top consumer thread."))); LWLockRelease(OBSRuntimeLock); @@ -2945,7 +3013,10 @@ void OBSInstrumentation::insertData(uint64 queryid) ListCell* lc = NULL; OBSRuntimeInfo* info = NULL; +#ifndef USE_SPQ 
Assert(IS_PGXC_COORDINATOR); +#endif + Assert(!StreamTopConsumerAmI()); LWLockAcquire(OBSRuntimeLock, LW_EXCLUSIVE); @@ -3122,7 +3193,8 @@ void ExplainCreateDNodeInfoOnDN( p_dnode_info->execute_on_datanode = on_dn; p_dnode_info->userid = GetUserId(); - if (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR || + IS_SINGLE_NODE) { MemoryContext old_context; old_context = MemoryContextSwitchTo(g_instance.wlm_cxt->oper_resource_track_mcxt); int plan_len = strlen(plan_name) + 1; @@ -3134,7 +3206,8 @@ void ExplainCreateDNodeInfoOnDN( p_dnode_info->estimated_rows = estimated_rows; } } else { - if (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR || + IS_SINGLE_NODE) { ereport(LOG, (errmsg("Realtime Trace Error: The new information has the same hash key as the existed record in " "the hash table, which is not expected."))); @@ -3686,7 +3759,8 @@ void* ExplainGetSessionStatistics(int* num) return NULL; } - if (!(IS_PGXC_COORDINATOR || IS_SINGLE_NODE)) { + if (!((IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) || + IS_SINGLE_NODE)) { ereport(WARNING, (errmsg("This view is not allowed on datanode."))); return NULL; } @@ -3753,71 +3827,73 @@ void* ExplainGetSessionStatistics(int* num) LWLockRelease(GetMainLWLockByIndex(FirstOperatorRealTLock + j)); *num = i; -#ifdef ENABLE_MULTIPLE_NODES - char keystr[NAMEDATALEN] = {0}; - int retry_count = 0; - PGXCNodeAllHandles* pgxc_handles = NULL; +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (IS_SPQ_RUNNING) { + char keystr[NAMEDATALEN] = {0}; + int retry_count = 0; + PGXCNodeAllHandles* pgxc_handles = NULL; -retry: - pgxc_handles = WLMRemoteInfoCollectorStart(); + retry: + pgxc_handles = WLMRemoteInfoCollectorStart(); - if (pgxc_handles == NULL) { - pfree_ext(stat_array); - *num = 0; - ereport(LOG, (errmsg("remote collector failed, reason: connect error."))); - return NULL; - } - - for (i = 0; i < *num; ++i) { - stat_element = stat_array + i; - - /* 
Get real time info from each data nodes */ - if (stat_element->execute_on_datanode) { - rc = snprintf_s(keystr, - NAMEDATALEN, - NAMEDATALEN - 1, - "%lu,%lu,%d", - stat_element->tid, - stat_element->query_id, - stat_element->plan_node_id); - securec_check_ss(rc, "\0", "\0"); - - int ret = WLMRemoteInfoSender(pgxc_handles, keystr, WLM_COLLECT_OPERATOR_RUNTIME); - if (ret != 0) { - ++retry_count; - release_pgxc_handles(pgxc_handles); - ereport(WARNING, (errmsg("send failed, retry_count: %d", retry_count))); - pg_usleep(3 * USECS_PER_SEC); - - if (retry_count >= 3) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("Remote Sender: Failed to send command to datanode"))); - - goto retry; - } - - initGenralInfo(stat_element); - size_info temp_info; - rc = memset_s(&temp_info, sizeof(size_info), 0, sizeof(size_info)); - securec_check(rc, "\0", "\0"); - temp_info.plan_node_name = stat_element->plan_node_name; - temp_info.min_cpu_time = -1; - temp_info.min_peak_memory = -1; - temp_info.min_spill_size = -1; - temp_info.dn_count = 0; - temp_info.startup_time = -1; - - /* Fetch session statistics from each datanode */ - WLMRemoteInfoReceiver(pgxc_handles, &temp_info, sizeof(size_info), OperatorStrategyFunc4SessionInfo); - if (temp_info.has_data) - getFinalInfo(stat_element, temp_info); - else - stat_element->status = true; + if (pgxc_handles == NULL) { + pfree_ext(stat_array); + *num = 0; + ereport(LOG, (errmsg("remote collector failed, reason: connect error."))); + return NULL; } - } - WLMRemoteInfoCollectorFinish(pgxc_handles); + for (i = 0; i < *num; ++i) { + stat_element = stat_array + i; + + /* Get real time info from each data nodes */ + if (stat_element->execute_on_datanode) { + rc = snprintf_s(keystr, + NAMEDATALEN, + NAMEDATALEN - 1, + "%lu,%lu,%d", + stat_element->tid, + stat_element->query_id, + stat_element->plan_node_id); + securec_check_ss(rc, "\0", "\0"); + + int ret = WLMRemoteInfoSender(pgxc_handles, keystr, WLM_COLLECT_OPERATOR_RUNTIME); + if 
(ret != 0) { + ++retry_count; + release_pgxc_handles(pgxc_handles); + ereport(WARNING, (errmsg("send failed, retry_count: %d", retry_count))); + pg_usleep(3 * USECS_PER_SEC); + + if (retry_count >= 3) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("Remote Sender: Failed to send command to datanode"))); + + goto retry; + } + + initGenralInfo(stat_element); + size_info temp_info; + rc = memset_s(&temp_info, sizeof(size_info), 0, sizeof(size_info)); + securec_check(rc, "\0", "\0"); + temp_info.plan_node_name = stat_element->plan_node_name; + temp_info.min_cpu_time = -1; + temp_info.min_peak_memory = -1; + temp_info.min_spill_size = -1; + temp_info.dn_count = 0; + temp_info.startup_time = -1; + + /* Fetch session statistics from each datanode */ + WLMRemoteInfoReceiver(pgxc_handles, &temp_info, sizeof(size_info), OperatorStrategyFunc4SessionInfo); + if (temp_info.has_data) + getFinalInfo(stat_element, temp_info); + else + stat_element->status = true; + } + } + + WLMRemoteInfoCollectorFinish(pgxc_handles); + } #endif return stat_array; } @@ -3850,7 +3926,8 @@ void ExplainSetSessionInfo(int plan_node_id, Instrumentation* instr, bool on_dat uint32 hash_code = GetHashPlanCode(&qid, sizeof(Qpid)); - if ((IS_PGXC_COORDINATOR && !IsConnFromCoord()) || IS_SINGLE_NODE) { + if (((IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR) && + !IsConnFromCoord()) || IS_SINGLE_NODE) { LockOperHistHashPartition(hash_code, LW_EXCLUSIVE); ExplainDNodeInfo* p_detail = @@ -3889,12 +3966,16 @@ void ExplainSetSessionInfo(int plan_node_id, Instrumentation* instr, bool on_dat p_detail->can_record_to_table = u_sess->instr_cxt.can_record_to_table; p_detail->status = Operator_Normal; UnLockOperHistHashPartition(hash_code); -#ifndef ENABLE_MULTIPLE_NODES +#if !defined(ENABLE_MULTIPLE_NODES) && !defined(USE_SPQ) return; +#else + if (t_thrd.spq_ctx.spq_role == ROLE_UTILITY) { + return; + } #endif } - if (IS_PGXC_DATANODE) { + if (IS_SPQ_COORDINATOR || IS_PGXC_DATANODE) { 
LockOperHistHashPartition(hash_code, LW_EXCLUSIVE); ExplainDNodeInfo* p_detail = (ExplainDNodeInfo*)hash_search(g_operator_table.collected_info_hashtbl, &qid, HASH_ENTER, &has_found); @@ -3950,7 +4031,7 @@ void* ExplainGetSessionInfo(const Qpid* qid, int removed, int* num) return NULL; } - if (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) { + if (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR || IS_SINGLE_NODE) { TimestampTz current_time = GetCurrentTimestamp(); for (j = 0; j < NUM_OPERATOR_HISTORY_PARTITIONS; j++) @@ -4044,75 +4125,77 @@ void* ExplainGetSessionInfo(const Qpid* qid, int removed, int* num) for (j = NUM_OPERATOR_HISTORY_PARTITIONS; --j >= 0;) LWLockRelease(GetMainLWLockByIndex(FirstOperatorHistLock + j)); -#ifdef ENABLE_MULTIPLE_NODES - int retry_count = 0; - int i; - PGXCNodeAllHandles* pgxc_handles = NULL; - char keystr[NAMEDATALEN]; +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + int retry_count = 0; + int i; + PGXCNodeAllHandles* pgxc_handles = NULL; + char keystr[NAMEDATALEN]; - retry: - pgxc_handles = WLMRemoteInfoCollectorStart(); + retry: + pgxc_handles = WLMRemoteInfoCollectorStart(); - if (pgxc_handles == NULL) { - pfree_ext(stat_array); - *num = 0; - return NULL; - } - - for (i = 0; i < *num; ++i) { - if (i >= record_pos && i <= un_record_pos) { - continue; + if (pgxc_handles == NULL) { + pfree_ext(stat_array); + *num = 0; + return NULL; } - stat_element = stat_array + i; - - if (stat_element->execute_on_datanode) { - rc = snprintf_s(keystr, - NAMEDATALEN, - NAMEDATALEN - 1, - "%lu,%lu,%d,%d", - stat_element->tid, - stat_element->query_id, - stat_element->plan_node_id, - stat_element->remove); - securec_check_ss(rc, "\0", "\0"); - - int ret = WLMRemoteInfoSender(pgxc_handles, keystr, WLM_COLLECT_OPERATOR_SESSION); - - if (ret != 0) { - ++retry_count; - release_pgxc_handles(pgxc_handles); - ereport(WARNING, (errmsg("send failed, retry_count: %d", retry_count))); - - pg_usleep(3 * 
USECS_PER_SEC); - - if (retry_count >= 3) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("Remote Sender: Failed to send command to datanode"))); - goto retry; + for (i = 0; i < *num; ++i) { + if (i >= record_pos && i <= un_record_pos) { + continue; } - initGenralInfo(stat_element); - size_info temp_info; - rc = memset_s(&temp_info, sizeof(size_info), 0, sizeof(size_info)); - securec_check(rc, "\0", "\0"); - temp_info.plan_node_name = stat_element->plan_node_name; - temp_info.min_cpu_time = -1; - temp_info.min_peak_memory = -1; - temp_info.min_spill_size = -1; - temp_info.dn_count = 0; - temp_info.startup_time = -1; + stat_element = stat_array + i; - /* Fetch session statistics from each datanode */ - WLMRemoteInfoReceiver(pgxc_handles, &temp_info, sizeof(size_info), OperatorStrategyFunc4SessionInfo); + if (stat_element->execute_on_datanode) { + rc = snprintf_s(keystr, + NAMEDATALEN, + NAMEDATALEN - 1, + "%lu,%lu,%d,%d", + stat_element->tid, + stat_element->query_id, + stat_element->plan_node_id, + stat_element->remove); + securec_check_ss(rc, "\0", "\0"); - if (temp_info.has_data) - getFinalInfo(stat_element, temp_info); + int ret = WLMRemoteInfoSender(pgxc_handles, keystr, WLM_COLLECT_OPERATOR_SESSION); + + if (ret != 0) { + ++retry_count; + release_pgxc_handles(pgxc_handles); + ereport(WARNING, (errmsg("send failed, retry_count: %d", retry_count))); + + pg_usleep(3 * USECS_PER_SEC); + + if (retry_count >= 3) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("Remote Sender: Failed to send command to datanode"))); + goto retry; + } + + initGenralInfo(stat_element); + size_info temp_info; + rc = memset_s(&temp_info, sizeof(size_info), 0, sizeof(size_info)); + securec_check(rc, "\0", "\0"); + temp_info.plan_node_name = stat_element->plan_node_name; + temp_info.min_cpu_time = -1; + temp_info.min_peak_memory = -1; + temp_info.min_spill_size = -1; + temp_info.dn_count = 0; + temp_info.startup_time = -1; + + /* Fetch session statistics 
from each datanode */ + WLMRemoteInfoReceiver(pgxc_handles, &temp_info, sizeof(size_info), OperatorStrategyFunc4SessionInfo); + + if (temp_info.has_data) + getFinalInfo(stat_element, temp_info); + } } - } - WLMRemoteInfoCollectorFinish(pgxc_handles); + WLMRemoteInfoCollectorFinish(pgxc_handles); + } #endif *num = record_pos; return stat_array; @@ -4154,7 +4237,7 @@ void releaseExplainTable() p_dnode_info->status = Operator_Invalid; } - if (IS_PGXC_DATANODE) { + if (!IS_SPQ_COORDINATOR && IS_PGXC_DATANODE) { hash_search(g_operator_table.collected_info_hashtbl, &qid, HASH_REMOVE, NULL); } @@ -4168,8 +4251,9 @@ void releaseExplainTable() ExplainDNodeInfo* p_dnode_info = (ExplainDNodeInfo*)hash_search(g_operator_table.explain_info_hashtbl, &qid, HASH_FIND, &found); - if (found && (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && p_dnode_info != NULL && p_dnode_info->plan_name != NULL) { - pfree_ext(p_dnode_info->plan_name); + if (found && (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && + p_dnode_info != NULL && p_dnode_info->plan_name != NULL) { + pfree_ext(p_dnode_info->plan_name); } hash_search(g_operator_table.explain_info_hashtbl, &qid, HASH_REMOVE, NULL); @@ -4198,7 +4282,8 @@ void removeExplainInfo(int plan_node_id) ExplainDNodeInfo* p_dnode_info = (ExplainDNodeInfo*)hash_search(g_operator_table.explain_info_hashtbl, &qid, HASH_FIND, &found); - if (found && (IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && p_dnode_info != NULL && p_dnode_info->plan_name != NULL) { + if (found && (IS_SPQ_COORDINATOR || IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && + p_dnode_info != NULL && p_dnode_info->plan_name != NULL) { pfree_ext(p_dnode_info->plan_name); } diff --git a/src/gausskernel/runtime/executor/nodeAgg.cpp b/src/gausskernel/runtime/executor/nodeAgg.cpp index e18091255..7d421bfb5 100644 --- a/src/gausskernel/runtime/executor/nodeAgg.cpp +++ b/src/gausskernel/runtime/executor/nodeAgg.cpp @@ -1762,6 +1762,12 @@ static TupleTableSlot* agg_retrieve_direct(AggState* aggstate) 
/* If we are grouping, we should produce no tuples too */ if (node->aggstrategy != AGG_PLAIN) return NULL; +#ifdef USE_SPQ + if (IS_SPQ_EXECUTOR) { + if (t_thrd.spq_ctx.skip_direct_distribute_result) + return NULL; + } +#endif } } } @@ -2221,6 +2227,9 @@ AggState* ExecInitAgg(Agg* node, EState* estate, int eflags) aggstate->ss.ps.plan = (Plan*)node; aggstate->ss.ps.state = estate; +#ifdef USE_SPQ + aggstate->aggsplittype = node->aggsplittype; +#endif aggstate->aggs = NIL; aggstate->numaggs = 0; aggstate->maxsets = 0; @@ -3881,6 +3890,15 @@ static void exec_lookups_agg(AggState *aggstate, Agg *node, EState *estate) } } #endif /* ENABLE_MULTIPLE_NODES */ +#ifdef USE_SPQ + /* Final function only required if we're finalizing the aggregates */ + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + if (DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplittype)) + peraggstate->finalfn_oid = finalfn_oid = InvalidOid; + else + peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + } +#endif /* SPQ */ #endif /* PGXC */ /* Check that aggregate owner has permission to call component fns */ { diff --git a/src/gausskernel/runtime/executor/nodeAssertOp.cpp b/src/gausskernel/runtime/executor/nodeAssertOp.cpp new file mode 100644 index 000000000..3de4dc17b --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeAssertOp.cpp @@ -0,0 +1,118 @@ +/*------------------------------------------------------------------------- + * + * nodeAssertOp.cpp + * Implementation of nodeAssertOp. + * + * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. 
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/gausskernel/runtime/executor/nodeAssertOp.cpp + * + *------------------------------------------------------------------------- + */ +#ifdef USE_SPQ +#include "postgres.h" +#include "miscadmin.h" + +#include "commands/tablecmds.h" +#include "executor/executor.h" +#include "executor/instrument.h" +#include "executor/node/nodeAssertOp.h" + +/* + * Estimated Memory Usage of AssertOp Node. + **/ +void ExecAssertOpExplainEnd(PlanState *planstate, struct StringInfoData *buf) +{ +} + +TupleTableSlot* ExecAssertOp(PlanState *state) +{ + AssertOpState *node = castNode(AssertOpState, state); + List* qual = node->ps.qual; + ExprContext* econtext = node->ps.ps_ExprContext; + ProjectionInfo* proj_info = node->ps.ps_ProjInfo; + AssertOp* plannode = (AssertOp *) node->ps.plan; + StringInfoData errorString; + PlanState *outerNode = outerPlanState(node); + TupleTableSlot *slot = ExecProcNode(outerNode); + if (TupIsNull(slot)) { + return NULL; + } + ResetExprContext(econtext); + econtext->ecxt_outertuple = slot; + initStringInfo(&errorString); + if (!ExecQual(qual, econtext, false)) { + Value *valErrorMessage = (Value *) list_nth(plannode->errmessage, 0); + + Assert(NULL != valErrorMessage && IsA(valErrorMessage, String) && + 0 < strlen(strVal(valErrorMessage))); + + appendStringInfo(&errorString, "%s\n", strVal(valErrorMessage)); + ereport(ERROR, + (errcode(plannode->errcode), + errmsg("one or more assertions failed"), + errdetail("%s", errorString.data))); + } + pfree(errorString.data); + ResetExprContext(econtext); + return ExecProject(proj_info, NULL); +} + +/** + * Init AssertOp, which sets the ProjectInfo and + * the Constraints to evaluate. 
+ * */ +AssertOpState* ExecInitAssertOp(AssertOp *node, EState *estate, int eflags) +{ + AssertOpState *assertOpState; + TupleDesc tupDesc; + Plan *outerPlan; + /* Check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + Assert(outerPlan(node) != NULL); + assertOpState = makeNode(AssertOpState); + assertOpState->ps.plan = (Plan *) node; + assertOpState->ps.state = estate; + assertOpState->ps.ExecProcNode = ExecAssertOp; + ExecInitResultTupleSlot(estate, &assertOpState->ps); + /* Create expression evaluation context */ + ExecAssignExprContext(estate, &assertOpState->ps); + assertOpState->ps.targetlist = (List*)ExecInitExpr((Expr*)node->plan.targetlist, (PlanState*)assertOpState); + assertOpState->ps.qual = (List*)ExecInitExpr((Expr*)node->plan.qual, (PlanState*)assertOpState); + /* + * Initialize outer plan + */ + outerPlan = outerPlan(node); + outerPlanState(assertOpState) = ExecInitNode(outerPlan, estate, eflags); + /* + * Initialize result type and projection. + */ + ExecAssignResultTypeFromTL(&assertOpState->ps); + tupDesc = ExecTypeFromTL(node->plan.targetlist, false); + ExecAssignProjectionInfo(&assertOpState->ps, tupDesc); + + return assertOpState; +} + +/* Rescan AssertOp */ +void ExecReScanAssertOp(AssertOpState *node) +{ + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (node->ps.lefttree && + node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); +} + +/* Release Resources Requested by AssertOp node. 
*/ +void ExecEndAssertOp(AssertOpState *node) +{ + ExecFreeExprContext(&node->ps); + ExecEndNode(outerPlanState(node)); +} +#endif /* USE_SPQ */ diff --git a/src/gausskernel/runtime/executor/nodeHash.cpp b/src/gausskernel/runtime/executor/nodeHash.cpp index db0550fdc..23d9dbafd 100644 --- a/src/gausskernel/runtime/executor/nodeHash.cpp +++ b/src/gausskernel/runtime/executor/nodeHash.cpp @@ -126,7 +126,12 @@ Node* MultiExecHash(HashState* node) break; /* We have to compute the hash value */ econtext->ecxt_innertuple = slot; +#ifdef USE_SPQ + bool hashkeys_null = false; + if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, hashtable->keepNulls, &hashvalue, &hashkeys_null)) { +#else if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, hashtable->keepNulls, &hashvalue)) { +#endif int bucketNumber; bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue); @@ -145,6 +150,15 @@ Node* MultiExecHash(HashState* node) } hashtable->totalTuples += 1; } +#ifdef USE_SPQ + if (hashkeys_null) { + node->hs_hashkeys_null = true; + if (node->hs_quit_if_hashkeys_null) { + ExecEndNode(outerNode); + return NULL; + } + } +#endif } (void)pgstat_report_waitstatus(oldStatus); @@ -1348,6 +1362,86 @@ void ExecHashTableInsert(HashJoinTable hashtable, TupleTableSlot *slot, uint32 h * because it contains a null attribute, and hence it should be discarded * immediately. (If keep_nulls is true then FALSE is never returned.) 
*/ +#ifdef USE_SPQ +bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext* econtext, List* hashkeys, bool outer_tuple, + bool keep_nulls, uint32* hashvalue, bool *hashkeys_null) +{ + if (!IS_SPQ_RUNNING) { + return ExecHashGetHashValue(hashtable, econtext, hashkeys, outer_tuple, keep_nulls, hashvalue); + } + uint32 hashkey = 0; + FmgrInfo* hashfunctions = NULL; + ListCell* hk = NULL; + int i = 0; + MemoryContext oldContext; + + Assert(hashkeys_null); + *hashkeys_null = true; + + /* + * We reset the eval context each time to reclaim any memory leaked in the + * hashkey expressions. + */ + ResetExprContext(econtext); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + if (outer_tuple) + hashfunctions = hashtable->outer_hashfunctions; + else + hashfunctions = hashtable->inner_hashfunctions; + + foreach (hk, hashkeys) { + ExprState* keyexpr = (ExprState*)lfirst(hk); + Datum keyval; + bool isNull = false; + + /* rotate hashkey left 1 bit at each step */ + hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); + + /* + * Get the join attribute value of the tuple + */ + keyval = ExecEvalExpr(keyexpr, econtext, &isNull, NULL); + + /* + * If the attribute is NULL, and the join operator is strict, then + * this tuple cannot pass the join qual so we can reject it + * immediately (unless we're scanning the outside of an outer join, in + * which case we must not reject it). Otherwise we act like the + * hashcode of NULL is zero (this will support operators that act like + * IS NOT DISTINCT, though not any more-random behavior). We treat + * the hash support function as strict even if the operator is not. + * + * Note: currently, all hashjoinable operators must be strict since + * the hash index AM assumes that. However, it takes so little extra + * code here to allow non-strict that we may as well do it. 
+ */ + if (isNull) { + if (hashtable->hashStrict[i] && !keep_nulls) { + MemoryContextSwitchTo(oldContext); + return false; /* cannot match */ + } + /* else, leave hashkey unmodified, equivalent to hashcode 0 */ + } else { + /* Compute the hash function */ + uint32 hkey; + + hkey = DatumGetUInt32(FunctionCall1(&hashfunctions[i], keyval)); + hashkey ^= hkey; + + *hashkeys_null = false; + } + + i++; + } + + MemoryContextSwitchTo(oldContext); + hashkey = DatumGetUInt32(hash_uint32(hashkey)); + *hashvalue = hashkey; + return true; +} +#endif bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext* econtext, List* hashkeys, bool outer_tuple, bool keep_nulls, uint32* hashvalue) { diff --git a/src/gausskernel/runtime/executor/nodeHashjoin.cpp b/src/gausskernel/runtime/executor/nodeHashjoin.cpp index c48a5e211..80fee43e5 100755 --- a/src/gausskernel/runtime/executor/nodeHashjoin.cpp +++ b/src/gausskernel/runtime/executor/nodeHashjoin.cpp @@ -123,6 +123,12 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) * First time through: build hash table for inner relation. 
*/ Assert(hashtable == NULL); +#ifdef USE_SPQ + if (node->prefetch_inner) { + node->hj_FirstOuterTupleSlot = NULL; + goto CREATE_HASH_TABLE; + } +#endif /* * If the outer relation is completely empty, and it's not * right/full join, we can quit without building the hash @@ -176,7 +182,11 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) node->hj_OuterNotEmpty = true; } else node->hj_FirstOuterTupleSlot = NULL; - +#ifdef USE_SPQ +CREATE_HASH_TABLE: + bool keepNulls = false; + keepNulls = HJ_FILL_INNER(node) || hashNode->hs_keepnull; +#endif /* * create the hash table, sometimes we should keep nulls */ @@ -184,17 +194,22 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) /* enable_memory_limit */ oldcxt = MemoryContextSwitchTo(hashNode->ps.nodeContext); } - +#ifdef USE_SPQ + hashtable = ExecHashTableCreate((Hash*)hashNode->ps.plan, node->hj_HashOperators, + keepNulls, node->hj_hashCollations); +#else hashtable = ExecHashTableCreate((Hash*)hashNode->ps.plan, node->hj_HashOperators, HJ_FILL_INNER(node) || node->js.nulleqqual != NIL, node->hj_hashCollations); - +#endif if (oldcxt) { /* enable_memory_limit */ MemoryContextSwitchTo(oldcxt); } node->hj_HashTable = hashtable; - +#ifdef USE_SPQ + hashNode->hs_quit_if_hashkeys_null = (node->js.jointype == JOIN_LASJ_NOTIN); +#endif /* * execute the Hash node, to build the hash table */ @@ -210,7 +225,10 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) EARLY_FREE_LOG(elog(LOG, "Early Free: Hash Table for HashJoin" " is built at node %d, memory used %d MB.", (node->js.ps.plan)->plan_node_id, getSessionMemoryUsageMB())); - +#ifdef USE_SPQ + if (node->js.jointype == JOIN_LASJ_NOTIN && hashNode->hs_hashkeys_null) + return NULL; +#endif /* * If the inner relation is completely empty, and we're not * doing a left outer join, we can quit without scanning the @@ -229,7 +247,9 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) return NULL; } - +#ifdef USE_SPQ + node->hj_InnerEmpty = (hashtable->totalTuples == 
0); +#endif /* * need to remember whether nbatch has increased since we * began scanning the outer relation @@ -303,6 +323,14 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) /* fall through */ case HJ_SCAN_BUCKET: +#ifdef USE_SPQ + if (node->js.jointype == JOIN_LASJ_NOTIN && !node->hj_InnerEmpty && + IsJoinExprNull(node->hj_OuterHashKeys, econtext)) { + node->hj_MatchedOuter = true; + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } +#endif /* * Scan the selected hash bucket for matches to current outer */ @@ -346,7 +374,12 @@ static TupleTableSlot* ExecHashJoin(PlanState* state) HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); /* Anti join: we never return a matched tuple */ +#ifdef USE_SPQ + if (jointype == JOIN_ANTI || jointype == JOIN_LEFT_ANTI_FULL || + jointype == JOIN_LASJ_NOTIN) { +#else if (jointype == JOIN_ANTI || jointype == JOIN_LEFT_ANTI_FULL) { +#endif node->hj_JoinState = HJ_NEED_NEW_OUTER; continue; } @@ -566,6 +599,20 @@ HashJoinState* ExecInitHashJoin(HashJoin* node, EState* estate, int eflags) hjstate->hashclauses = (List*)ExecInitExprByRecursion((Expr*)node->hashclauses, (PlanState*)hjstate); } +#ifdef USE_SPQ + if (JOIN_LASJ_NOTIN == node->join.jointype && node->hashqualclauses != nullptr) { + hjstate->hj_nonequijoin = true; + } else { + hjstate->hj_nonequijoin = false; + } + + hjstate->prefetch_inner = node->join.prefetch_inner; + + if (node->join.is_set_op_join) { + hjstate->hj_nonequijoin = true; + } +#endif + /* * initialize child nodes * @@ -579,6 +626,10 @@ HashJoinState* ExecInitHashJoin(HashJoin* node, EState* estate, int eflags) outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); innerPlanState(hjstate) = ExecInitNode((Plan*)hashNode, estate, eflags); +#ifdef USE_SPQ + ((HashState *)innerPlanState(hjstate))->hs_keepnull = hjstate->hj_nonequijoin; +#endif + /* * tuple table initialization */ @@ -596,6 +647,9 @@ HashJoinState* ExecInitHashJoin(HashJoin* node, EState* estate, int eflags) case 
JOIN_LEFT: case JOIN_ANTI: case JOIN_LEFT_ANTI_FULL: +#ifdef USE_SPQ + case JOIN_LASJ_NOTIN: +#endif hjstate->hj_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(hjstate))); break; case JOIN_RIGHT: @@ -768,12 +822,26 @@ static TupleTableSlot* ExecHashJoinOuterGetTuple(PlanState* outerNode, HashJoinS ExprContext* econtext = hjstate->js.ps.ps_ExprContext; econtext->ecxt_outertuple = slot; +#ifdef USE_SPQ + bool hashkeys_null = false; + bool keep_nulls = (IS_SPQ_RUNNING) ? + (HJ_FILL_OUTER(hjstate) || hjstate->hj_nonequijoin) : + (HJ_FILL_OUTER(hjstate) || hjstate->js.nulleqqual != NIL); + if (ExecHashGetHashValue(hashtable, + econtext, + hjstate->hj_OuterHashKeys, + true, /* outer tuple */ + keep_nulls, + hashvalue, + &hashkeys_null)) { +#else if (ExecHashGetHashValue(hashtable, econtext, hjstate->hj_OuterHashKeys, true, /* outer tuple */ HJ_FILL_OUTER(hjstate) || hjstate->js.nulleqqual != NIL, /* compute null ? */ hashvalue)) { +#endif /* remember outer relation is not empty for possible rescan */ hjstate->hj_OuterNotEmpty = true; diff --git a/src/gausskernel/runtime/executor/nodeMaterial.cpp b/src/gausskernel/runtime/executor/nodeMaterial.cpp index f3c2dd595..6617cd8a8 100644 --- a/src/gausskernel/runtime/executor/nodeMaterial.cpp +++ b/src/gausskernel/runtime/executor/nodeMaterial.cpp @@ -294,6 +294,12 @@ MaterialState* ExecInitMaterial(Material* node, EState* estate, int eflags) int64 operator_mem = SET_NODEMEM(((Plan*)node)->operatorMemKB[0], ((Plan*)node)->dop); AllocSetContext* set = (AllocSetContext*)(estate->es_query_cxt); set->maxSpaceSize = operator_mem * 1024L + SELF_GENRIC_MEMCTX_LIMITATION; +#ifdef USE_SPQ + if (node->spq_strict) + eflags |= EXEC_FLAG_REWIND; + if (node->spq_shield_child_from_rescans || IsA(outerPlan((Plan *) node), Stream)) + eflags |= EXEC_FLAG_REWIND; +#endif /* * We must have a tuplestore buffering the subplan output to do backward @@ -369,7 +375,7 @@ MaterialState* ExecInitMaterial(Material* node, 
EState* estate, int eflags) * so that it can be run before main plan in ExecutePlan. */ if (IS_PGXC_DATANODE && estate->es_under_subplan && IsA(left_tree, Stream) && - ((Stream*)left_tree)->type == STREAM_BROADCAST) + ((Stream*)left_tree)->type == STREAM_BROADCAST && !IS_SPQ_RUNNING) estate->es_material_of_subplan = lappend(estate->es_material_of_subplan, (PlanState*)mat_state); return mat_state; diff --git a/src/gausskernel/runtime/executor/nodeMergejoin.cpp b/src/gausskernel/runtime/executor/nodeMergejoin.cpp index e7d4714c8..bf0370575 100644 --- a/src/gausskernel/runtime/executor/nodeMergejoin.cpp +++ b/src/gausskernel/runtime/executor/nodeMergejoin.cpp @@ -1514,6 +1514,13 @@ MergeJoinState* ExecInitMergeJoin(MergeJoin* node, EState* estate, int eflags) errmsg("FULL JOIN is only supported with merge-joinable join conditions."), errhint("Try other join methods like nestloop or hashjoin."))); break; +#ifdef USE_SPQ + case JOIN_LASJ_NOTIN: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("LASJ NOTIN JOIN is not supported with merge-joinable join conditions."))); + break; +#endif default: ereport(ERROR, (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), diff --git a/src/gausskernel/runtime/executor/nodeNestloop.cpp b/src/gausskernel/runtime/executor/nodeNestloop.cpp index 551d78272..dd1114412 100644 --- a/src/gausskernel/runtime/executor/nodeNestloop.cpp +++ b/src/gausskernel/runtime/executor/nodeNestloop.cpp @@ -224,9 +224,15 @@ static TupleTableSlot* ExecNestLoop(PlanState* state) ENL1_printf("no inner tuple, need new outer tuple"); node->nl_NeedNewOuter = true; +#ifdef USE_SPQ + node->nl_innerSideScanned = true; + if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || node->js.jointype == JOIN_ANTI || + node->js.jointype == JOIN_LEFT_ANTI_FULL || node->js.jointype == JOIN_LASJ_NOTIN)) { +#else if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || node->js.jointype == JOIN_ANTI || node->js.jointype == JOIN_LEFT_ANTI_FULL)) { +#endif 
/* * We are doing an outer join and there were no join matches * for this outer tuple. Generate a fake join tuple with @@ -263,6 +269,18 @@ static TupleTableSlot* ExecNestLoop(PlanState* state) continue; } +#ifdef USE_SPQ + if ((node->js.jointype == JOIN_LASJ_NOTIN) && (!node->nl_innerSideScanned) && + (node->nl_InnerJoinKeys && IsJoinExprNull(node->nl_InnerJoinKeys, econtext))) { + /* + * If LASJ_NOTIN and a null was found on the inner side, all tuples + * We'll read no more from either inner or outer subtree. To keep our + * in outer sider will be treated as "not in" tuples in inner side. + */ + ENL1_printf("found null tuple on the inner side, clean out"); + return NULL; + } +#endif /* * at this point we have a new pair of inner and outer tuples so we * test the inner and outer tuples to see if they satisfy the node's @@ -277,7 +295,12 @@ static TupleTableSlot* ExecNestLoop(PlanState* state) node->nl_MatchedOuter = true; /* In an antijoin, we never return a matched tuple */ +#ifdef USE_SPQ + if (node->js.jointype == JOIN_ANTI || node->js.jointype == JOIN_LEFT_ANTI_FULL || + node->js.jointype == JOIN_LASJ_NOTIN) { +#else if (node->js.jointype == JOIN_ANTI || node->js.jointype == JOIN_LEFT_ANTI_FULL) { +#endif node->nl_NeedNewOuter = true; continue; /* return to top of loop */ } @@ -323,6 +346,110 @@ static TupleTableSlot* ExecNestLoop(PlanState* state) } } +#ifdef USE_SPQ +/* ---------------------------------------------------------------- + * ExtractFuncExprArgs + * + * Extract the arguments of a FuncExpr or an OpExpr and append them into two + * given lists: + * - lclauses for the left side of the expression, + * - rclauses for the right side + * + * This function is only used for LASJ. Once we find a NULL from inner side, we + * can skip the join and just return an empty set as result. This is only true + * if the equality operator is strict, that is, if a tuple from inner side is + * NULL then the equality operator returns NULL. 
+ * + * If the number of arguments is not two, we just return leaving lclauses and + * rclauses remaining NULL. In this case, the LASJ join would be actually + * performed. + * ---------------------------------------------------------------- + */ +static void ExtractFuncExprArgs(Expr *clause, List **lclauses, List **rclauses) +{ + if (IsA(clause, OpExpr)) { + OpExpr *opexpr = (OpExpr *)clause; + + if (list_length(opexpr->args) != 2) + return; + + if (!op_strict(opexpr->opno)) + return; + + *lclauses = lappend(*lclauses, linitial(opexpr->args)); + *rclauses = lappend(*rclauses, lsecond(opexpr->args)); + } else if (IsA(clause, FuncExpr)) { + FuncExpr *fexpr = (FuncExpr *)clause; + + if (list_length(fexpr->args) != 2) + return; + + if (!func_strict(fexpr->funcid)) + return; + + *lclauses = lappend(*lclauses, linitial(fexpr->args)); + *rclauses = lappend(*rclauses, lsecond(fexpr->args)); + } else + elog(ERROR, "unexpected join qual in JOIN_LASJ_NOTIN join"); +} + +/* ---------------------------------------------------------------- + * SplitJoinQualExpr + * + * Deconstruct the join clauses into outer and inner argument values, so + * that we can evaluate those subexpressions separately. Note: for constant + * expression we don't need to split (MPP-21294). However, if constant expressions + * have peer splittable expressions we *do* split those. + * + * This is used for NOTIN joins, as we need to look for NULLs on both + * inner and outer side. + * + * XXX: This would be more appropriate in the planner. 
+ * ---------------------------------------------------------------- + */ +static void SplitJoinQualExpr(List *joinqual, List **inner_join_keys_p, List **outer_join_keys_p) +{ + List *lclauses = NIL; + List *rclauses = NIL; + ListCell *lc; + + foreach(lc, joinqual) { + Expr *expr = (Expr *)lfirst(lc); + + switch (expr->type) { + case T_FuncExpr: + case T_OpExpr: + ExtractFuncExprArgs(expr, &lclauses, &rclauses); + break; + + case T_BoolExpr: + { + BoolExpr *bexpr = (BoolExpr *)expr; + ListCell *argslc; + + foreach(argslc, bexpr->args) { + ExtractFuncExprArgs((Expr *)lfirst(argslc), &lclauses, &rclauses); + } + } + break; + + case T_Const: + /* + * Constant expressions do not need to be splitted into left and + * right as they don't need to be considered for NULL value special + * cases + */ + break; + + default: + elog(ERROR, "unexpected expression type in NestLoopJoin qual"); + } + } + + *inner_join_keys_p = rclauses; + *outer_join_keys_p = lclauses; +} +#endif /* ---------------------------------------------------------------- * ExecInitNestLoop * ---------------------------------------------------------------- @@ -341,6 +468,9 @@ NestLoopState* ExecInitNestLoop(NestLoop* node, EState* estate, int eflags) nlstate->js.ps.plan = (Plan*)node; nlstate->js.ps.state = estate; nlstate->nl_MaterialAll = node->materialAll; +#ifdef USE_SPQ + nlstate->prefetch_inner = node->join.prefetch_inner; +#endif nlstate->js.ps.ExecProcNode = ExecNestLoop; /* @@ -362,8 +492,43 @@ NestLoopState* ExecInitNestLoop(NestLoop* node, EState* estate, int eflags) nlstate->js.ps.targetlist = (List*)ExecInitExprByRecursion((Expr*)node->join.plan.targetlist, (PlanState*)nlstate); nlstate->js.ps.qual = (List*)ExecInitExprByRecursion((Expr*)node->join.plan.qual, (PlanState*)nlstate); nlstate->js.jointype = node->join.jointype; +#ifdef USE_SPQ + if (node->join.jointype == JOIN_LASJ_NOTIN) { + List *inner_join_keys; + List *outer_join_keys; + ListCell *lc; + + /* not initialized yet */ + 
Assert(nlstate->nl_InnerJoinKeys == nullptr); + Assert(nlstate->nl_OuterJoinKeys == nullptr); + + SplitJoinQualExpr(node->join.joinqual, + &inner_join_keys, + &outer_join_keys); + foreach(lc, inner_join_keys) { + Expr *expr = (Expr *)lfirst(lc); + ExprState *exprstate; + + exprstate = ExecInitExpr(expr, (PlanState *)nlstate); + + nlstate->nl_InnerJoinKeys = lappend(nlstate->nl_InnerJoinKeys, exprstate); + } + foreach(lc, outer_join_keys) { + Expr *expr = (Expr *)lfirst(lc); + ExprState *exprstate; + + exprstate = ExecInitExpr(expr, (PlanState *)nlstate); + + nlstate->nl_OuterJoinKeys = lappend(nlstate->nl_OuterJoinKeys, exprstate); + } + nlstate->js.joinqual = (List*)ExecInitExpr((Expr*)node->join.joinqual, (PlanState*)nlstate); + } else { nlstate->js.joinqual = (List*)ExecInitExprByRecursion((Expr*)node->join.joinqual, (PlanState*)nlstate); } +#else + nlstate->js.joinqual = (List*)ExecInitExprByRecursion((Expr*)node->join.joinqual, (PlanState*)nlstate); +#endif + } Assert(node->join.nulleqqual == NIL); /* @@ -396,6 +561,9 @@ NestLoopState* ExecInitNestLoop(NestLoop* node, EState* estate, int eflags) case JOIN_LEFT: case JOIN_ANTI: case JOIN_LEFT_ANTI_FULL: +#ifdef USE_SPQ + case JOIN_LASJ_NOTIN: +#endif nlstate->nl_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(nlstate))); break; default: @@ -486,4 +654,7 @@ void ExecReScanNestLoop(NestLoopState* node) node->js.ps.ps_vec_TupFromTlist = false; node->nl_NeedNewOuter = true; node->nl_MatchedOuter = false; +#ifdef USE_SPQ + node->nl_innerSideScanned = false; +#endif } diff --git a/src/gausskernel/runtime/executor/nodeSequence.cpp b/src/gausskernel/runtime/executor/nodeSequence.cpp new file mode 100644 index 000000000..135b1feb4 --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeSequence.cpp @@ -0,0 +1,155 @@ +/* + * nodeSequence.cpp + * Routines to handle Sequence node. + * + * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. 
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/gausskernel/runtime/executor/nodeSequence.cpp + * + * Sequence node contains a list of subplans, which will be processed in the + * order of left-to-right. Result tuples from the last subplan will be outputted + * as the results of the Sequence node. + * + * Sequence does not make use of its left and right subtrees, and instead it + * maintains a list of subplans explicitly. + */ +#ifdef USE_SPQ +#include "postgres.h" + +#include "executor/node/nodeSequence.h" +#include "executor/executor.h" +#include "miscadmin.h" + +SequenceState *ExecInitSequence(Sequence *node, EState *estate, int eflags) +{ + SequenceState *sequenceState; + ListCell *lc; + int no = 0; + int numSubplans; + + /* Check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* Sequence should not contain 'qual'. */ + Assert(node->plan.qual == NIL); + + sequenceState = makeNode(SequenceState); + sequenceState->ps.plan = (Plan *)node; + sequenceState->ps.state = estate; + sequenceState->ps.ExecProcNode = ExecSequence; + + numSubplans = list_length(node->subplans); + Assert(numSubplans >= 1); + sequenceState->subplans = (PlanState **)palloc0(numSubplans * sizeof(PlanState *)); + sequenceState->numSubplans = numSubplans; + + /* Initialize subplans */ + foreach (lc, node->subplans) { + Plan *subplan = (Plan *)lfirst(lc); + Assert(subplan != NULL); + Assert(no < numSubplans); + + sequenceState->subplans[no] = ExecInitNode(subplan, estate, eflags); + no++; + } + + sequenceState->initState = true; + + /* Sequence does not need projection. */ + sequenceState->ps.ps_ProjInfo = NULL; + + /* + * Initialize result type. We will pass through the last child slot. 
+ */ + ExecInitResultTupleSlot(estate, &sequenceState->ps); + ExecAssignResultTypeFromTL(&sequenceState->ps); + + return sequenceState; +} + +/* + * completeSubplan + * Execute a given subplan to completion. + * + * The outputs from the given subplan will be discarded. + */ +static void completeSubplan(PlanState *subplan) +{ + while (ExecProcNode(subplan) != NULL) { + } +} + +TupleTableSlot *ExecSequence(PlanState *state) +{ + SequenceState *node = castNode(SequenceState, state); + PlanState *lastPlan = NULL; + TupleTableSlot *result = NULL; + int no = 0; + + /* + * If no subplan has been executed yet, execute them here, except for + * the last subplan. + */ + if (node->initState) { + for(no = 0; no < node->numSubplans - 1; no++) { + completeSubplan(node->subplans[no]); + + CHECK_FOR_INTERRUPTS(); + } + + node->initState = false; + } + + Assert(!node->initState); + + lastPlan = node->subplans[node->numSubplans - 1]; + result = ExecProcNode(lastPlan); + + /* + * Return the tuple as returned by the subplan as-is. We do + * NOT make use of the result slot that was set up in + * ExecInitSequence, because there's no reason to. + */ + return result; +} + +void ExecEndSequence(SequenceState *node) +{ + int no = 0; + + /* shutdown subplans */ + for (no = 0; no < node->numSubplans; no++) { + Assert(node->subplans[no] != NULL); + ExecEndNode(node->subplans[no]); + } +} + +void ExecReScanSequence(SequenceState *node) +{ + int i = 0; + + for (i = 0; i < node->numSubplans; i++) { + PlanState *subnode = node->subplans[i]; + + /* + * ExecReScan doesn't know about my subplans, so I have to do + * changed-parameter signaling myself. + */ + if (node->ps.chgParam != NULL) { + UpdateChangedParamSet(subnode, node->ps.chgParam); + } + + /* + * Always rescan the inputs immediately, to ensure we can pass down + * any outer tuple that might be used in index quals. 
+ */ + ExecReScan(subnode); + } + + node->initState = true; +} + +#endif /* USE_SPQ */ diff --git a/src/gausskernel/runtime/executor/nodeShareInputScan.cpp b/src/gausskernel/runtime/executor/nodeShareInputScan.cpp new file mode 100644 index 000000000..fdaa2d401 --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeShareInputScan.cpp @@ -0,0 +1,703 @@ +/* ------------------------------------------------------------------------- + * + * nodeShareInputScan.cpp + * + * A Share Input Scan node is used to share the result of an operation in + * two different branches in the plan tree. + * + * These come in two variants: local, and cross-slice. + * + * Local shares + * ------------ + * + * In local mode, all the consumers are in the same slice as the producer. + * In that case, there's no need to communicate across processes, so we + * rely entirely on data structures in backend-private memory to track the + * state. + * + * In local mode, there is no difference between producer and consumer + * nodes. In ExecInitShareInputScan(), the producer node stores the + * PlanState of the shared child node where all the nodes can find it. + * The first ExecShareInputScan() call initializes the store. + * + * A local-mode ShareInputScan is quite similar to PostgreSQL's CteScan, + * but there are some implementation differences. CteScan uses a special + * PARAM_EXEC entry to hold the shared state, while ShareInputScan uses + * an entry in es_sharenode instead. + * + * Cross-slice shares + * ------------------ + * + * A cross-slice share works basically the same as a local one, except + * that the producing slice makes the underlying tuplestore available to + * other processes, by forcing it to be written to a file on disk. The + * first ExecShareInputScan() call in the producing slice materializes + * the whole tuplestore, and advertises that it's ready in shared memory. + * Consumer slices wait for that before trying to read the store. 
+ *
+ * The producer and the consumers communicate the status of the scan using
+ * shared memory. There's a hash table in shared memory, containing a
+ * 'shareinput_Xslice_state' struct for each shared scan. The producer uses
+ * a condition variable to wake up consumers when the tuplestore is fully
+ * materialized, and the consumers use the same condition variable to inform
+ * the producer when they're done reading it. The producer slice keeps the
+ * underlying tuplestore open, until all the consumers have finished.
+ *
+ * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd.
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *     src/gausskernel/runtime/executor/nodeShareInputScan.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifdef USE_SPQ
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "executor/executor.h"
+#include "executor/node/nodeShareInputScan.h"
+#include "miscadmin.h"
+#include "storage/lock/lwlock.h"
+#include "storage/lwlocknames.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+#include "utils/tuplestore.h"
+#include "lib/ilist.h"
+
+/*
+ * The SharedFileSet deletes any remaining files when the reference count
+ * reaches zero, but we don't rely on that mechanism. All the files are
+ * held in the same SharedFileSet, so it cannot be recycled until all
+ * ShareInputScans in the system have finished, which might never happen if
+ * new queries are started continuously. The shareinput_Xslice_state entries
+ * are reference counted separately, and we clean up the files backing each
+ * individual ShareInputScan whenever its reference count reaches zero.
+ */ +static SharedFileSet *shareinput_Xslice_fileset; + +typedef struct { + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; +} ConditionVariable; + +/* + * In a cross-slice ShareinputScan, the producer and consumer processes + * communicate using shared memory. There's a hash table containing one + * 'shareinput_share_state' for each in-progress shared input scan. + * + * The hash table itself,, and the fields within every entry, are protected + * by ShareInputScanLock. (Although some operations get away without the + * lock, when the field is atomic and/or there's only one possible writer.) + * + * All producers and consumers that participate in a shared scan hold + * a reference to the 'shareinput_Xslice_state' entry of the scan, for + * the whole lifecycle of the node from ExecInitShareInputScan() to + * ExecEndShareInputScan(). The entry in the hash table is created by + * the first participant that initializes, which is not necessarily the + * producer! When the last participant releases the entry, it is removed + * from the hash table. + */ +typedef struct shareinput_tag { + uint64 session_id; + int32 share_id; + int32 dop_id; +} shareinput_tag; + +typedef struct shareinput_Xslice_state { + shareinput_tag tag; /* hash key */ + + int refcount; /* reference count of this entry */ + bool ready; /* is the input fully materialized and ready to be read? */ + int ndone; /* # of consumers that have finished the scan */ + + /* + * ready_done_cv is used for signaling when the scan becomes "ready", and + * when it becomes "done". The producer wakes up everyone waiting on this + * condition variable when it sets ready = true. Also, when the last + * consumer finishes the scan (ndone reaches nconsumers), it wakes up the + * producer using this same condition variable. + */ + ConditionVariable ready_done_cv; +} shareinput_Xslice_state; + +/* + * 'shareinput_reference' represents a reference or "lease" to an entry + * in the shared memory hash table. 
It is used for garbage collection of + * the entries, on transaction abort. + * + */ +typedef struct shareinput_Xslice_reference { + int share_id; + shareinput_Xslice_state *xslice_state; + + ResourceOwner owner; + + dlist_node node; +} shareinput_Xslice_reference; + +/* + * For local (i.e. intra-slice) variants, we use a 'shareinput_local_state' + * to track the status. It is analogous to 'shareinput_share_state' used for + * cross-slice scans, but we don't need to keep it in shared memory. These + * are held in estate->es_sharenode, indexed by share_id. + */ +typedef struct shareinput_local_state { + bool ready; + bool closed; + int ndone; + int nsharers; + + /* + * This points to the child node that's being shared. Set by + * ExecInitShareInputScan() of the instance that has the child. + */ + PlanState *childState; + + /* Tuplestore that holds the result */ + Tuplestorestate *ts_state; +} shareinput_local_state; + +static shareinput_Xslice_reference *get_shareinput_reference(int share_id); +static void release_shareinput_reference(shareinput_Xslice_reference *ref); +static void shareinput_create_bufname_prefix(char *p, int size, int share_id, int dop_id); + +static void shareinput_writer_notifyready(shareinput_Xslice_reference *ref); +static void shareinput_reader_waitready(shareinput_Xslice_reference *ref); +static void shareinput_reader_notifydone(shareinput_Xslice_reference *ref, int nconsumers); +static void shareinput_writer_waitdone(shareinput_Xslice_reference *ref, int nconsumers); + + +/* + * init_tuplestore_state + * Initialize the tuplestore state for the Shared node if the state + * is not initialized. 
+ */ +static void init_tuplestore_state(ShareInputScanState *node) +{ + EState *estate = node->ss.ps.state; + ShareInputScan *sisc = (ShareInputScan *)node->ss.ps.plan; + shareinput_local_state *local_state = node->local_state; + Tuplestorestate *ts; + int tsptrno; + TupleTableSlot *outerslot; + + Assert(!node->isready); + Assert(node->ts_state == NULL); + Assert(node->ts_pos == -1); + + if (!node->ref) + elog(ERROR, "cannot execute ShareInputScan that was not initialized"); + + if (!local_state->ready) { + if (t_thrd.spq_ctx.current_id == sisc->producer_slice_id || estate->es_plannedstmt->num_streams == 1) { + char rwfile_prefix[100]; + + ts = tuplestore_begin_heap(true, /* randomAccess */ + false, /* interXact */ + 10); /* maxKBytes FIXME */ + + shareinput_create_bufname_prefix(rwfile_prefix, sizeof(rwfile_prefix), sisc->share_id, + u_sess->stream_cxt.smp_id); + + tuplestore_make_shared(ts, get_shareinput_fileset(), rwfile_prefix); + + for (;;) { + outerslot = ExecProcNode(local_state->childState); + if (TupIsNull(outerslot)) + break; + tuplestore_puttupleslot(ts, outerslot); + } + + tuplestore_freeze(ts); + shareinput_writer_notifyready(node->ref); + + tuplestore_rescan(ts); + } else { + /* + * We are a consumer slice. Wait for the producer to create the + * tuplestore. 
+ */ + char rwfile_prefix[100]; + + shareinput_reader_waitready(node->ref); + + shareinput_create_bufname_prefix(rwfile_prefix, sizeof(rwfile_prefix), sisc->share_id, + u_sess->stream_cxt.smp_id); + ts = tuplestore_open_shared(get_shareinput_fileset(), rwfile_prefix); + } + local_state->ts_state = ts; + local_state->ready = true; + tsptrno = 0; + } else { + /* Another local reader */ + ts = local_state->ts_state; + tsptrno = tuplestore_alloc_read_pointer(ts, (EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND)); + + tuplestore_select_read_pointer(ts, tsptrno); + tuplestore_rescan(ts); + } + + node->ts_state = ts; + node->ts_pos = tsptrno; + + node->isready = true; +} + + +/* ------------------------------------------------------------------ + * ExecShareInputScan + * Retrieve a tuple from the ShareInputScan + * ------------------------------------------------------------------ + */ +TupleTableSlot *ExecShareInputScan(PlanState *pstate) +{ + ShareInputScanState *node = castNode(ShareInputScanState, pstate); + ShareInputScan *sisc = (ShareInputScan *)pstate->plan; + EState *estate; + ScanDirection dir; + bool forward; + TupleTableSlot *slot; + + /* + * get state info from node + */ + estate = pstate->state; + dir = estate->es_direction; + forward = ScanDirectionIsForward(dir); + + if (sisc->this_slice_id != t_thrd.spq_ctx.current_id && estate->es_plannedstmt->num_streams != 1) + elog(ERROR, "cannot execute alien Share Input Scan"); + + /* if first time call, need to initialize the tuplestore state. 
*/ + if (!node->isready) + init_tuplestore_state(node); + + slot = node->ss.ps.ps_ResultTupleSlot; + + Assert(!node->local_state->closed); + + tuplestore_select_read_pointer(node->ts_state, node->ts_pos); + while (1) { + bool gotOK; + + gotOK = tuplestore_gettupleslot(node->ts_state, forward, false, slot); + + if (!gotOK) + return NULL; + + return slot; + } + + Assert(!"should not be here"); + return NULL; +} + +/* ------------------------------------------------------------------ + * ExecInitShareInputScan + * ------------------------------------------------------------------ + */ +ShareInputScanState *ExecInitShareInputScan(ShareInputScan *node, EState *estate, int eflags) +{ + ShareInputScanState *sisstate; + Plan *outerPlan; + PlanState *childState; + shareinput_local_state *local_state; + + Assert(innerPlan(node) == NULL); + + /* create state data structure */ + sisstate = makeNode(ShareInputScanState); + sisstate->ss.ps.plan = (Plan *)node; + sisstate->ss.ps.state = estate; + sisstate->ss.ps.ExecProcNode = ExecShareInputScan; + sisstate->ts_state = NULL; + sisstate->ts_pos = -1; + + /* + * init child node. + * if outerPlan is NULL, this is no-op (so that the ShareInput node will be + * only init-ed once). + */ + + /* + * initialize child nodes + * + * Like a Material node, we shield the child node from the need to support + * BACKWARD, or MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlan = outerPlan(node); + childState = ExecInitNode(outerPlan, estate, eflags); + outerPlanState(sisstate) = childState; + + Assert(node->scan.plan.qual == NULL); + sisstate->ss.ps.qual = NULL; + + /* Misc initialization + * + * Create expression context + */ + ExecAssignExprContext(estate, &sisstate->ss.ps); + + /* + * Initialize result slot and type. 
+ */ + ExecInitResultTupleSlot(estate, &sisstate->ss.ps); + ExecAssignResultTypeFromTL(&sisstate->ss.ps); + + sisstate->ss.ps.ps_ProjInfo = NULL; + + /* + * When doing EXPLAIN only, we won't actually execute anything, so don't + * bother initializing the state. This isn't merely an optimization: + * closing a cross-slice ShareInputScan waits for the consumers to finish, + * but if we don't execute anything, it will hang forever. + * + * We could also exit here immediately if this is an "alien" node, i.e. + * a node that doesn't execute in this slice, but we can't easily + * detect that here. + */ + if ((eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return sisstate; + + /* expand the list if necessary */ + while (list_length(estate->es_sharenode) <= node->share_id) { + local_state = (shareinput_local_state *)palloc0(sizeof(shareinput_local_state)); + local_state->ready = false; + + estate->es_sharenode = lappend(estate->es_sharenode, local_state); + } + + local_state = (shareinput_local_state *)list_nth(estate->es_sharenode, node->share_id); + + /* + * only the consumer ShareInputScan nodes executed in current + * slice are counted, since only consumers would increase + * "ndone" in local_state, and compare "ndone" with "nsharers" + * to judge whether to notify producer. 
+ */ + if (t_thrd.spq_ctx.current_id == node->this_slice_id && t_thrd.spq_ctx.current_id != node->producer_slice_id) + local_state->nsharers++; + + if (childState) + local_state->childState = childState; + sisstate->local_state = local_state; + + /* Get a lease on the shared state */ + sisstate->ref = get_shareinput_reference(node->share_id); + + return sisstate; +} + +/* ------------------------------------------------------------------ + * ExecEndShareInputScan + * ------------------------------------------------------------------ + */ +void ExecEndShareInputScan(ShareInputScanState *node) +{ + EState *estate = node->ss.ps.state; + ShareInputScan *sisc = (ShareInputScan *)node->ss.ps.plan; + shareinput_local_state *local_state = node->local_state; + + /* clean up tuple table */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->ref) { + if (sisc->this_slice_id == t_thrd.spq_ctx.current_id || estate->es_plannedstmt->num_streams == 1) { + /* + * The producer needs to wait for all the consumers to finish. + * Consumers signal the producer that they're done reading, + * but are free to exit immediately after that. + */ + if (t_thrd.spq_ctx.current_id == sisc->producer_slice_id) { + if (!local_state->ready) + init_tuplestore_state(node); + shareinput_writer_waitdone(node->ref, sisc->nconsumers); + } else { + if (!local_state->closed) { + shareinput_reader_notifydone(node->ref, sisc->nconsumers); + local_state->closed = true; + } + } + } + release_shareinput_reference(node->ref); + node->ref = NULL; + } + + if (local_state && local_state->ts_state) { + tuplestore_end(local_state->ts_state); + local_state->ts_state = NULL; + } + + /* + * shutdown subplan. 
First scanner of underlying share input will + * do the shutdown, all other scanners are no-op because outerPlanState + * is NULL + */ + ExecEndNode(outerPlanState(node)); +} + +/* ------------------------------------------------------------------ + * ExecReScanShareInputScan + * ------------------------------------------------------------------ + */ +void ExecReScanShareInputScan(ShareInputScanState *node) +{ + /* On first call, initialize the tuplestore state */ + if (!node->isready) + init_tuplestore_state(node); + + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + Assert(node->ts_pos != -1); + + tuplestore_select_read_pointer(node->ts_state, node->ts_pos); + tuplestore_rescan(node->ts_state); +} + +/* ************************************************************************ + * IPC, for cross-slice variants. + * ************************************************************************ */ +/* + * When creating a tuplestore file that will be accessed by + * multiple processes, shareinput_create_bufname_prefix() is used to + * construct the name for it. + */ +static void shareinput_create_bufname_prefix(char *p, int size, int share_id, int dop_id) +{ + snprintf(p, size, "SIRW_%lu_%d_%d", t_thrd.spq_ctx.spq_session_id, share_id, u_sess->stream_cxt.smp_id); +} + +#define MaxBackends 1 +/* + * Initialization of the shared hash table for cross-slice communication. + * + * XXX: Use MaxBackends to size it, on the assumption that max_connections + * will scale accordingly to query complexity. This is quite fuzzy, you could + * create a query with tons of cross-slice ShareInputScans but only a few + * slice, but that ought to be rare enough in practice. This isn't a hard + * limit anyway, the hash table will use up any "slop" in shared memory if + * needed. 
+ */ +#define N_SHAREINPUT_SLOTS() (MaxBackends * 5) + +Size ShareInputShmemSize(void) +{ + Size size; + + size = hash_estimate_size(N_SHAREINPUT_SLOTS(), sizeof(shareinput_Xslice_state)); + + return size; +} + +void ShareInputShmemInit(void) +{ + Size size = ShareInputShmemSize(); + bool found = false; + + shareinput_Xslice_fileset = (SharedFileSet *)ShmemInitStruct("ShareInputScan", size, &found); + + if (!found || t_thrd.shemem_ptr_cxt.shareinput_Xslice_hash == nullptr) { + HASHCTL info; + + info.keysize = sizeof(shareinput_tag); + + info.entrysize = sizeof(shareinput_Xslice_state); + + t_thrd.shemem_ptr_cxt.shareinput_Xslice_hash = ShmemInitHash("ShareInputScan notifications", + N_SHAREINPUT_SLOTS(), N_SHAREINPUT_SLOTS(), &info, HASH_ELEM | HASH_BLOBS); + } +} + +/* + * Get reference to the SharedFileSet used to hold all the tuplestore files. + * + * This is exported so that it can also be used by the INITPLAN function + * tuplestores. + */ +SharedFileSet *get_shareinput_fileset(void) +{ + LWLockAcquire(ShareInputScanLock, LW_EXCLUSIVE); + + if (shareinput_Xslice_fileset->refcnt == 0) + SharedFileSetInit(shareinput_Xslice_fileset); + else + SharedFileSetAttach(shareinput_Xslice_fileset); + + LWLockRelease(ShareInputScanLock); + + return shareinput_Xslice_fileset; +} + +/* + * Get a reference to slot in shared memory for this shared scan. + * + * If the slot doesn't exist yet, it is created and initialized into + * "not ready" state. + * + * The reference is tracked by the current ResourceOwner, and will be + * automatically released on abort. 
+ */ +static shareinput_Xslice_reference *get_shareinput_reference(int share_id) +{ + shareinput_tag tag; + shareinput_Xslice_state *xslice_state; + bool found; + shareinput_Xslice_reference *ref; + + ref = (shareinput_Xslice_reference *)palloc0(sizeof(shareinput_Xslice_reference)); + + LWLockAcquire(ShareInputScanLock, LW_EXCLUSIVE); + + tag.session_id = t_thrd.spq_ctx.spq_session_id; + tag.share_id = share_id; + tag.dop_id = u_sess->stream_cxt.smp_id; + xslice_state = (shareinput_Xslice_state *)hash_search(t_thrd.shemem_ptr_cxt.shareinput_Xslice_hash, &tag, + HASH_ENTER_NULL, &found); + if (!found) { + if (xslice_state == NULL) { + pfree(ref); + ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of cross-slice ShareInputScan slots"))); + } + + xslice_state->refcount = 0; + xslice_state->ready = false; + xslice_state->ndone = 0; + + pthread_mutex_init(&xslice_state->ready_done_cv.m_mutex, NULL); + pthread_cond_init(&xslice_state->ready_done_cv.m_cond, NULL); + } + + xslice_state->refcount++; + + ref->share_id = share_id; + ref->xslice_state = xslice_state; + ref->owner = t_thrd.utils_cxt.CurrentResourceOwner; + + LWLockRelease(ShareInputScanLock); + + return ref; +} + +/* + * Release reference to a shared scan. + * + * The reference count in the shared memory slot is decreased, and if + * it reaches zero, it is destroyed. + */ +static void release_shareinput_reference(shareinput_Xslice_reference *ref) +{ + shareinput_Xslice_state *state = ref->xslice_state; + + LWLockAcquire(ShareInputScanLock, LW_EXCLUSIVE); + + if (state->refcount == 1) { + bool found; + + (void)hash_search(t_thrd.shemem_ptr_cxt.shareinput_Xslice_hash, &state->tag, HASH_REMOVE, &found); + Assert(found); + } else + state->refcount--; + + // dlist_delete(&ref->node); + + LWLockRelease(ShareInputScanLock); + + pfree(ref); +} + +/* + * shareinput_reader_waitready + * + * Called by the reader (consumer) to wait for the writer (producer) to produce + * all the tuples and write them to disk. 
+ * + * This is a blocking operation. + */ +static void shareinput_reader_waitready(shareinput_Xslice_reference *ref) +{ + shareinput_Xslice_state *state = ref->xslice_state; + + pthread_mutex_lock(&state->ready_done_cv.m_mutex); + if (!state->ready) { + pthread_cond_wait(&state->ready_done_cv.m_cond, &state->ready_done_cv.m_mutex); + } + pthread_mutex_unlock(&state->ready_done_cv.m_mutex); + + /* it's ready now */ +} + +/* + * shareinput_writer_notifyready + * + * Called by the writer (producer) once it is done producing all tuples and + * writing them to disk. It notifies all the readers (consumers) that tuples + * are ready to be read from disk. + */ +static void shareinput_writer_notifyready(shareinput_Xslice_reference *ref) +{ + shareinput_Xslice_state *state = ref->xslice_state; + + /* we're the only writer, so no need to acquire the lock. */ + Assert(!state->ready); + pthread_mutex_lock(&state->ready_done_cv.m_mutex); + state->ready = true; + + pthread_cond_broadcast(&state->ready_done_cv.m_cond); + pthread_mutex_unlock(&state->ready_done_cv.m_mutex); +} + +/* + * shareinput_reader_notifydone + * + * Called by the reader (consumer) to notify the writer (producer) that + * it is done reading tuples from disk. + * + * This is a non-blocking operation. + */ +static void shareinput_reader_notifydone(shareinput_Xslice_reference *ref, int nconsumers) +{ + shareinput_Xslice_state *state = ref->xslice_state; + int ndone; + + pthread_mutex_lock(&state->ready_done_cv.m_mutex); + state->ndone++; + ndone = state->ndone; + + /* If we were the last consumer, wake up the producer. */ + if (ndone >= nconsumers) + pthread_cond_broadcast(&state->ready_done_cv.m_cond); + pthread_mutex_unlock(&state->ready_done_cv.m_mutex); +} + +/* + * shareinput_writer_waitdone + * + * Called by the writer (producer) to wait for the "done" notification from + * all readers (consumers). + * + * This is a blocking operation. 
+ */ +static void shareinput_writer_waitdone(shareinput_Xslice_reference *ref, int nconsumers) +{ + shareinput_Xslice_state *state = ref->xslice_state; + + if (!state->ready) + elog(ERROR, "shareinput_writer_waitdone() called without creating the tuplestore"); + + int ndone; + pthread_mutex_lock(&state->ready_done_cv.m_mutex); + ndone = state->ndone; + if (ndone < nconsumers) { + pthread_cond_wait(&state->ready_done_cv.m_cond, &state->ready_done_cv.m_mutex); + } + pthread_mutex_unlock(&state->ready_done_cv.m_mutex); + if (ndone > nconsumers) + elog(WARNING, "%d sharers of ShareInputScan reported to be done, but only %d were expected", ndone, nconsumers); + + /* it's all done now */ +} +#endif /* USE_SPQ */ diff --git a/src/gausskernel/runtime/executor/nodeSpqSeqscan.cpp b/src/gausskernel/runtime/executor/nodeSpqSeqscan.cpp new file mode 100644 index 000000000..afeb3e863 --- /dev/null +++ b/src/gausskernel/runtime/executor/nodeSpqSeqscan.cpp @@ -0,0 +1,54 @@ +/* ------------------------------------------------------------------------- +* +* nodeSpqSeqscan.cpp +* Support routines for sequential scans of relations. +* +* Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. +* +* +* IDENTIFICATION +* src/gausskernel/runtime/executor/nodeSpqSeqscan.cpp +* +* ------------------------------------------------------------------------- +* +* INTERFACE ROUTINES +* ExecSeqScan sequentially scans a relation. +* ExecSeqNext retrieve next tuple in sequential order. +* ExecInitSeqScan creates and initializes a seqscan node. +* ExecEndSeqScan releases any storage allocated. 
+* ExecReScanSeqScan rescans the relation +* ExecSeqMarkPos marks scan position +* ExecSeqRestrPos restores scan position + */ +#ifdef USE_SPQ +#include "executor/node/nodeSeqscan.h" +#include "executor/node/nodeSpqSeqscan.h" + +THR_LOCAL init_spqscan_hook_type init_spqscan_hook = nullptr; +THR_LOCAL exec_spqscan_hook_type exec_spqscan_hook = nullptr; +THR_LOCAL end_spqscan_hook_type end_spqscan_hook = nullptr; +THR_LOCAL spqscan_rescan_hook_type spqscan_rescan_hook = nullptr; + +/* ---------------------------------------------------------------- + * ExecSeqMarkPos(node) + * + * Marks scan position. + * ---------------------------------------------------------------- + */ +void ExecSpqSeqMarkPos(SpqSeqScanState* node) +{ + ExecSeqMarkPos((SeqScanState*)node); +} + +/* ---------------------------------------------------------------- + * ExecSeqRestrPos + * + * Restores scan position. + * ---------------------------------------------------------------- + */ +void ExecSpqSeqRestrPos(SpqSeqScanState* node) +{ + ExecSeqRestrPos((SeqScanState*)node); +} + +#endif diff --git a/src/gausskernel/runtime/executor/nodeStub.cpp b/src/gausskernel/runtime/executor/nodeStub.cpp index a25d50122..c8dd97490 100644 --- a/src/gausskernel/runtime/executor/nodeStub.cpp +++ b/src/gausskernel/runtime/executor/nodeStub.cpp @@ -24,6 +24,10 @@ #include "executor/node/nodeBitmapHeapscan.h" #include "executor/node/nodeTidscan.h" +#ifdef USE_SPQ +#include "executor/node/nodeSpqSeqscan.h" +#endif + extern char* nodeTagToString(NodeTag type); PlanState* ExecInitNodeStubNorm(Plan* node, EState* estate, int eflags) @@ -151,6 +155,16 @@ void ExecEndNodeStubScan(PlanState* node) case T_SeqScan: ExecEndSeqScan((SeqScanState*)node); break; +#ifdef USE_SPQ + case T_SpqSeqScan: { + if (end_spqscan_hook) { + end_spqscan_hook((SpqSeqScanState *)node); + } else { + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("spqscan hook init_spqscan_hook uninited."))); + } + break; + } +#endif case T_IndexScan: 
ExecEndIndexScan((IndexScanState*)node); break; diff --git a/src/gausskernel/storage/access/common/printtup.cpp b/src/gausskernel/storage/access/common/printtup.cpp index d47d0434c..fd8371c29 100644 --- a/src/gausskernel/storage/access/common/printtup.cpp +++ b/src/gausskernel/storage/access/common/printtup.cpp @@ -65,6 +65,10 @@ static void printLocalBroadCastBatch(VectorBatch *batch, DestReceiver *self); static void printRedistributeBatch(VectorBatch *batch, DestReceiver *self); static void printLocalRedistributeBatch(VectorBatch *batch, DestReceiver *self); static void printLocalRoundRobinBatch(VectorBatch *batch, DestReceiver *self); +#ifdef USE_SPQ +static void printRoundRobinTuple(TupleTableSlot *tuple, DestReceiver *self); +static void printRoundRobinBatch(VectorBatch *batch, DestReceiver *self); +#endif static void printHybridBatch(VectorBatch *batch, DestReceiver *self); static void finalizeLocalStream(DestReceiver *self); @@ -116,7 +120,14 @@ DestReceiver *createStreamDestReceiver(CommandDest dest) case DestTupleHybrid: self->pub.receiveSlot = printHybridTuple; break; - +#ifdef USE_SPQ + case DestTupleRoundRobin: + self->pub.receiveSlot = printRoundRobinTuple; + break; + case DestBatchRoundRobin: + self->pub.sendBatch = printRoundRobinBatch; + break; +#endif case DestBatchBroadCast: self->pub.sendBatch = printBroadCastBatchCompress; break; @@ -233,6 +244,20 @@ static void printLocalRoundRobinTuple(TupleTableSlot *tuple, DestReceiver *self) rec->arg->localRoundRobinStream(tuple); } +#ifdef USE_SPQ +/* + * @Description: Send a tuple by roundrobin + * + * @param[IN] tuple: tuple to send. + * @param[IN] dest: dest receiver. + * @return void + */ +static void printRoundRobinTuple(TupleTableSlot *tuple, DestReceiver *self) +{ + streamReceiver *rec = (streamReceiver *)self; + rec->arg->roundRobinStream(tuple, self); +} +#endif /* * @Description: Send a tuple in hybrid ways, some data with special values * shoule be sent in special way. 
@@ -312,6 +337,20 @@ static void printLocalRoundRobinBatch(VectorBatch *batch, DestReceiver *self) rec->arg->localRoundRobinStream(batch); } +/* + * @Description: Send a batch by roundrobin + * + * @param[IN] batch: batch to send. + * @param[IN] dest: dest receiver. + * @return void + */ +#ifdef USE_SPQ +static void printRoundRobinBatch(VectorBatch *batch, DestReceiver *self) +{ + streamReceiver *rec = (streamReceiver *)self; + rec->arg->roundRobinStream(batch); +} +#endif /* * @Description: Send a batch in hybrid ways, some data with special values * shoule be sent in special way. @@ -1487,7 +1526,7 @@ inline void AddCheckInfo(StringInfo buf) bool is_check_added = false; /* add check info for datanode and coordinator */ - if (IsConnFromCoord()) { + if (IS_SPQ_EXECUTOR || IsConnFromCoord()) { #ifdef USE_ASSERT_CHECKING initStringInfo(&buf_check); AddCheckMessage(&buf_check, buf, false); diff --git a/src/gausskernel/storage/access/common/reloptions.cpp b/src/gausskernel/storage/access/common/reloptions.cpp index afdcf9690..a01b8b633 100644 --- a/src/gausskernel/storage/access/common/reloptions.cpp +++ b/src/gausskernel/storage/access/common/reloptions.cpp @@ -85,6 +85,10 @@ static void ValidateStrOptEncryptAlgo(const char *val); static void ValidateStrOptDekCipher(const char *val); static void ValidateStrOptCmkId(const char *val); +#ifdef USE_SPQ +static void CheckSpqBTBuildOption(const char *val); +#endif + static relopt_bool boolRelOpts[] = { {{"autovacuum_enabled", "Enables autovacuum in this relation", RELOPT_KIND_HEAP | RELOPT_KIND_TOAST}, true}, {{"user_catalog_table", @@ -518,6 +522,15 @@ static relopt_string stringRelOpts[] = { validateWithCheckOption, NULL }, +#ifdef USE_SPQ + { + { "spq_build", "Btree index build using PX", RELOPT_KIND_BTREE }, + 0, + true, + CheckSpqBTBuildOption, + NULL + }, +#endif { {"view_sql_security", "View has SQL SECURITY OPTION defined (INVOKER or DEFINER).", RELOPT_KIND_VIEW}, 0, @@ -2006,6 +2019,10 @@ bytea 
*default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
         { "check_option", RELOPT_TYPE_STRING, offsetof(StdRdOptions, check_option_offset)},
         { "view_sql_security", RELOPT_TYPE_STRING, offsetof(StdRdOptions, view_security_option_offset)},
         { "collate", RELOPT_TYPE_INT, offsetof(StdRdOptions, collate)},
+#ifdef USE_SPQ
+        /* SPQ index B-Tree build: btree index build use spq */
+        {"spq_build", RELOPT_TYPE_STRING, offsetof(StdRdOptions, spq_bt_build_offset)},
+#endif
         { "deduplication", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, deduplication)}
     };
@@ -3078,3 +3095,21 @@ void CheckCompressOption(TableCreateSupport *tableCreateSupport)
                 errmsg("Algorithm PGZSTD current not support ustore.")));
     }
 }
+
+#ifdef USE_SPQ
+/*
+ * before check spq reloption, make sure guc params of spq_enable_btbuild is on
+ */
+void CheckSpqBTBuildOption(const char *val)
+{
+    if (!u_sess->attr.attr_spq.spq_enable_btbuild) {
+        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+            errmsg("spq_build is not supported, please set gauss_enable_spq_btbuild=on")));
+    }
+
+    if (val == NULL || (strcmp(val, "on") != 0 && strcmp(val, "off") != 0 && strcmp(val, "finish") != 0)) {
+        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid value for \"spq_build\" option"),
+            errdetail("Valid values are \"on\", \"off\", and \"finish\".")));
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/gausskernel/storage/access/heap/heapam.cpp b/src/gausskernel/storage/access/heap/heapam.cpp
index 05a5338b8..f497b78c7 100755
--- a/src/gausskernel/storage/access/heap/heapam.cpp
+++ b/src/gausskernel/storage/access/heap/heapam.cpp
@@ -10328,3 +10328,29 @@ HeapTuple heapam_index_fetch_tuple(IndexScanDesc scan, bool *all_dead, bool* has
     return NULL;
 }
+
+#ifdef USE_SPQ
+/* ----------------
+ * try_table_open - open a heap relation by relation OID
+ *
+ * As above, but return NULL for relation-not-found
+ * ----------------
+ */
+Relation try_table_open(Oid relationId, LOCKMODE lockmode)
+{
+ Relation r; + + r = try_relation_open(relationId, lockmode); + + if (!RelationIsValid(r)) + return NULL; + + if (r->rd_rel->relkind == RELKIND_INDEX) + ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is a composite type", RelationGetRelationName(r)))); + + return r; +} +#endif diff --git a/src/gausskernel/storage/access/nbtree/Makefile b/src/gausskernel/storage/access/nbtree/Makefile index 403d721b4..f4ca77b79 100644 --- a/src/gausskernel/storage/access/nbtree/Makefile +++ b/src/gausskernel/storage/access/nbtree/Makefile @@ -10,6 +10,6 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif OBJS = nbtcompare.o nbtdedup.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ - nbtutils.o nbtsort.o nbtxlog.o + nbtutils.o nbtsort.o nbtxlog.o spq_btbuild.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/nbtree/nbtree.cpp b/src/gausskernel/storage/access/nbtree/nbtree.cpp index 400dd5fea..3171da388 100644 --- a/src/gausskernel/storage/access/nbtree/nbtree.cpp +++ b/src/gausskernel/storage/access/nbtree/nbtree.cpp @@ -89,6 +89,11 @@ Datum btbuild(PG_FUNCTION_ARGS) Relation heap = (Relation)PG_GETARG_POINTER(0); Relation index = (Relation)PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *)PG_GETARG_POINTER(2); +#ifdef USE_SPQ + if (enable_spq_btbuild(index)) { + return spqbtbuild(heap, index, indexInfo); + } +#endif IndexBuildResult *result = btbuild_internal(heap, index, indexInfo); PG_RETURN_POINTER(result); } diff --git a/src/gausskernel/storage/access/nbtree/nbtsort.cpp b/src/gausskernel/storage/access/nbtree/nbtsort.cpp index f4cb88274..c6ef7953f 100644 --- a/src/gausskernel/storage/access/nbtree/nbtsort.cpp +++ b/src/gausskernel/storage/access/nbtree/nbtsort.cpp @@ -245,6 +245,16 @@ void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) 
_bt_load(&wstate, btspool, btspool2); } +#ifdef USE_SPQ +/* + * Read tuples and load them into btree + */ +void spq_load(BTWriteState wstate) +{ + _bt_load(&wstate, NULL, NULL); +} +#endif + /* * Internal routines. * @@ -947,6 +957,17 @@ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) pfree(dstate); } else { +#ifdef USE_SPQ + if (enable_spq_btbuild(wstate->index)) { + while ((itup = spq_consume(wstate->spqleader)) != NULL) { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + _bt_buildadd(wstate, state, itup, 0); + } + } else { +#endif /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { /* When we see first tuple, create first index page */ @@ -955,6 +976,9 @@ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_buildadd(wstate, state, itup, 0); } +#ifdef USE_SPQ + } +#endif } /* Close down final pages and write the metapage */ diff --git a/src/gausskernel/storage/access/nbtree/spq_btbuild.cpp b/src/gausskernel/storage/access/nbtree/spq_btbuild.cpp new file mode 100644 index 000000000..968e03776 --- /dev/null +++ b/src/gausskernel/storage/access/nbtree/spq_btbuild.cpp @@ -0,0 +1,658 @@ +/* ------------------------------------------------------------------------- + * + * spq_btbuild.cpp + * Build btree using SPQ. 
+ * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, openGauss Contributors + * + * + * IDENTIFICATION + * src/gausskernel/storage/access/nbtree/spq_btbuild.cpp + * + * ------------------------------------------------------------------------- + */ + +#ifdef USE_SPQ +#include "access/nbtree.h" +#include "utils/snapmgr.h" +#include "postmaster/bgworker.h" +#include "executor/spi.h" +#include "utils/portal.h" +#include "commands/tablecmds.h" +#include "executor/executor.h" + +void spq_scansort(BTBuildState *buildstate, Relation heap, Relation index, bool isconcurrent); + +void spq_build_main(const BgWorkerContext *bwc); + +SPQSharedContext *spq_init_shared(Relation heap, Relation index, bool isconcurrent); + +SPQWorkerState *spq_worker_init(Relation heap, Relation index); + +void spq_worker_produce(SPQWorkerState *workstate); + +void spq_init_buffer(IndexTupleBuffer *buffer); + +bool index_form_tuple_buffer(SPQWorkerState *workstate, IndexTupleBuffer *buffer, uint64 begin); + +void spq_worker_finish(SPQWorkerState *workstate); + +void spq_leafbuild(SPQLeaderState *spqleader); + +void spq_leader_finish(SPQLeaderState *spqleader); + +void condition_time_wait(SPQSharedContext *shared); + +void condition_signal(SPQSharedContext *shared); + +bool enable_spq_btbuild(Relation rel) +{ + if ((rel)->rd_options && (rel)->rd_rel->relkind == RELKIND_INDEX && (rel)->rd_rel->relam == BTREE_AM_OID) { + if (((StdRdOptions *)(rel)->rd_options)->spq_bt_build_offset != 0) { + return (strcmp((char *)(rel)->rd_options + ((StdRdOptions *)(rel)->rd_options)->spq_bt_build_offset, + "on") == 0); + } else { + return false; + } + } else { + return false; + } +} +bool enable_spq_btbuild_cic(Relation rel) +{ + return u_sess->attr.attr_spq.spq_enable_btbuild_cic && enable_spq_btbuild(rel); +} + +IndexTuple index_form_tuple_allocated(TupleDesc tuple_descriptor, Datum 
*values, bool *isnull, char *start, Size free, + Size *used) +{ + char *tp = NULL; /* tuple pointer */ + IndexTuple tuple = NULL; /* return tuple */ + Size size, data_size, hoff; + int i; + unsigned short infomask = 0; + bool hasnull = false; + uint16 tupmask = 0; + int attributeNum = tuple_descriptor->natts; + + Size (*computedatasize_tuple)(TupleDesc tuple_desc, Datum * values, const bool *isnull); + void (*filltuple)(TupleDesc tuple_desc, Datum * values, const bool *isnull, char *data, Size data_size, + uint16 *infomask, bits8 *bit); + + computedatasize_tuple = &heap_compute_data_size; + filltuple = &heap_fill_tuple; + +#ifdef TOAST_INDEX_HACK + Datum untoasted_values[INDEX_MAX_KEYS]; + bool untoasted_free[INDEX_MAX_KEYS]; +#endif + + if (attributeNum > INDEX_MAX_KEYS) + ereport(ERROR, (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("number of index columns (%d) exceeds limit (%d)", attributeNum, INDEX_MAX_KEYS))); + +#ifdef TOAST_INDEX_HACK + uint32 toastTarget = TOAST_INDEX_TARGET; + if (tuple_descriptor->tdTableAmType == TAM_USTORE) { + toastTarget = UTOAST_INDEX_TARGET; + } + for (i = 0; i < attributeNum; i++) { + Form_pg_attribute att = tuple_descriptor->attrs[i]; + + untoasted_values[i] = values[i]; + untoasted_free[i] = false; + + /* Do nothing if value is NULL or not of varlena type */ + if (isnull[i] || att->attlen != -1) + continue; + + /* + * If value is stored EXTERNAL, must fetch it so we are not depending + * on outside storage. This should be improved someday. + */ + Pointer val = DatumGetPointer(values[i]); + checkHugeToastPointer((varlena *)val); + if (VARATT_IS_EXTERNAL(val)) { + untoasted_values[i] = PointerGetDatum(heap_tuple_fetch_attr((struct varlena *)DatumGetPointer(values[i]))); + untoasted_free[i] = true; + } + + /* + * If value is above size target, and is of a compressible datatype, + * try to compress it in-line. 
+ */ + if (!VARATT_IS_EXTENDED(DatumGetPointer(untoasted_values[i])) && + VARSIZE(DatumGetPointer(untoasted_values[i])) > toastTarget && + (att->attstorage == 'x' || att->attstorage == 'm')) { + Datum cvalue = toast_compress_datum(untoasted_values[i]); + if (DatumGetPointer(cvalue) != NULL) { + /* successful compression */ + if (untoasted_free[i]) + pfree(DatumGetPointer(untoasted_values[i])); + untoasted_values[i] = cvalue; + untoasted_free[i] = true; + } + } + } +#endif + + for (i = 0; i < attributeNum; i++) { + if (isnull[i]) { + hasnull = true; + break; + } + } + + if (hasnull) + infomask |= INDEX_NULL_MASK; + + hoff = IndexInfoFindDataOffset(infomask); +#ifdef TOAST_INDEX_HACK + data_size = computedatasize_tuple(tuple_descriptor, untoasted_values, isnull); +#else + data_size = computedatasize_tuple(tuple_descriptor, values, isnull); +#endif + size = hoff + data_size; + size = MAXALIGN(size); /* be conservative */ + + *used = size; + + if (size > free) + return NULL; + + tp = start; + tuple = (IndexTuple)tp; + + filltuple(tuple_descriptor, +#ifdef TOAST_INDEX_HACK + untoasted_values, +#else + values, +#endif + isnull, (char *)tp + hoff, data_size, &tupmask, (hasnull ? (bits8 *)tp + sizeof(IndexTupleData) : NULL)); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < attributeNum; i++) { + if (untoasted_free[i]) + pfree(DatumGetPointer(untoasted_values[i])); + } +#endif + + /* + * We do this because heap_fill_tuple wants to initialize a "tupmask" + * which is used for HeapTuples, but we want an indextuple infomask. The + * only relevant info is the "has variable attributes" field. We have + * already set the hasnull bit above. + */ + if (tupmask & HEAP_HASVARWIDTH) + infomask |= INDEX_VAR_MASK; + + /* + * Here we make sure that the size will fit in the field reserved for it + * in t_info. 
+ */ + if ((size & INDEX_SIZE_MASK) != size) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row requires %lu bytes, maximum size is %lu", + (unsigned long)size, (unsigned long)INDEX_SIZE_MASK))); + + infomask |= size; + + /* + * initialize metadata + */ + tuple->t_info = infomask; + return tuple; +} +/* + * spq based btree build + */ +Datum spqbtbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result = NULL; + BTBuildState buildstate; + double *allPartTuples = NULL; + + /* + * check if it is expr index + */ + if (indexInfo->ii_Expressions) + ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("spq btree build does not support expr index."))); + + buildstate.isUnique = indexInfo->ii_Unique; + buildstate.haveDead = false; + buildstate.heapRel = heap; + buildstate.spool = NULL; + buildstate.spool2 = NULL; + buildstate.indtuples = 0; + buildstate.btleader = NULL; + buildstate.spqleader = NULL; + +#ifdef BTREE_BUILD_STATS + if (u_sess->attr.attr_resource.log_btree_build_stats) { + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + /* We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. 
*/ + if (RelationGetNumberOfBlocks(index) != 0) { + ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" already contains data", RelationGetRelationName(index)))); + } + + /* + * scan & sort using spi + */ + spq_scansort(&buildstate, heap, index, indexInfo->ii_Concurrent); + + spq_leafbuild(buildstate.spqleader); + + spq_leader_finish(buildstate.spqleader); + + // Return statistics + result = (IndexBuildResult *)palloc(sizeof(IndexBuildResult)); + result->heap_tuples = buildstate.spqleader->processed; + result->index_tuples = buildstate.indtuples; + result->all_part_tuples = allPartTuples; + + if (!indexInfo->ii_Concurrent) + spq_btbuild_update_pg_class(heap, index); + + PG_RETURN_POINTER(result); +} + +void spq_leader_finish(SPQLeaderState *spqleader) +{ + /* Shutdown worker processes */ + BgworkerListSyncQuit(); + /* Free last reference to MVCC snapshot, if one was used */ + if (IsMVCCSnapshot(spqleader->snapshot)) { + PopActiveSnapshot(); + UnregisterSnapshot(spqleader->snapshot); + } +} + +/* scan & sort using spi */ +void spq_scansort(BTBuildState *buildstate, Relation heap, Relation index, bool isconcurrent) +{ + SPQLeaderState *spqleader; + + SPQSharedContext *shared; + + shared = spq_init_shared(heap, index, isconcurrent); + + /* Launch workers */ + LaunchBackgroundWorkers(1, shared, spq_build_main, NULL); + + buildstate->spqleader = spqleader = (SPQLeaderState *)palloc0(sizeof(SPQLeaderState)); + + spqleader->heap = heap; + spqleader->index = index; + spqleader->shared = shared; + spqleader->buffer = NULL; + spqleader->processed = 0; + spqleader->snapshot = shared->snapshot; +} + +/* SPQSharedContext initialization */ +SPQSharedContext *spq_init_shared(Relation heap, Relation index, bool isconcurrent) +{ + Size sharedsize; + Snapshot snapshot; + SPQSharedContext *shared; + + if (!isconcurrent) + snapshot = SnapshotAny; + else + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* Calculate shared size */ + sharedsize = 
sizeof(SPQSharedContext);
+
+    sharedsize += SPQ_SHARED_SIZE;
+
+    shared = (SPQSharedContext *)MemoryContextAllocZero(INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sharedsize);
+
+    shared->heaprelid = RelationGetRelid(heap);
+    shared->indexrelid = RelationGetRelid(index);
+    shared->isunique = false;
+    shared->isconcurrent = isconcurrent;
+    shared->dop = SET_DOP(u_sess->opt_cxt.query_dop);
+    /* Initialize mutable state */
+    SpinLockInit(&shared->mutex);
+    pthread_mutex_init(&shared->m_mutex, NULL);
+    { pthread_condattr_t cattr; pthread_condattr_init(&cattr); (void)pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); pthread_cond_init(&shared->m_cond, &cattr); pthread_condattr_destroy(&cattr); } /* cond must use CLOCK_MONOTONIC: condition_time_wait builds its abstime from clock_gettime(CLOCK_MONOTONIC) */
+
+    shared->bufferidx = 0;
+    shared->done = false;
+    shared->consumer = -1;
+    shared->producer = -1;
+    shared->snapshot = snapshot;
+    return shared;
+}
+
+/* SPQ btree build worker thread */
+void spq_build_main(const BgWorkerContext *bwc)
+{
+    SPQSharedContext *shared = (SPQSharedContext *)bwc->bgshared;
+    SPQWorkerState *worker;
+
+    LOCKMODE heapLockmode = NoLock;
+    LOCKMODE indexLockmode = NoLock;
+
+    if (!shared->isconcurrent) {
+        heapLockmode = ShareLock;
+        indexLockmode = AccessExclusiveLock;
+    } else {
+        heapLockmode = ShareUpdateExclusiveLock;
+        indexLockmode = RowExclusiveLock;
+    }
+
+    /* Open relations within worker.
*/ + Relation heap = heap_open(shared->heaprelid, heapLockmode); + Relation index = index_open(shared->indexrelid, indexLockmode); + + /* add snapshot for spi */ + PushActiveSnapshot(shared->snapshot); + u_sess->opt_cxt.query_dop = shared->dop; + worker = spq_worker_init(heap, index); + worker->shared = shared; + spq_worker_produce(worker); + spq_worker_finish(worker); + +#ifdef BTREE_BUILD_STATS + if (u_sess->attr.attr_resource.log_btree_build_stats) { + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + index_close(index, indexLockmode); + heap_close(heap, heapLockmode); + return; +} + +SPQWorkerState *spq_worker_init(Relation heap, Relation index) +{ + SPQWorkerState *worker = (SPQWorkerState *)palloc0(sizeof(SPQWorkerState)); + worker->all_fetched = true; + worker->heap = heap; + worker->index = index; + worker->sql = makeStringInfo(); + + bool old_enable_spq = u_sess->attr.attr_spq.gauss_enable_spq; + bool old_spq_enable_index_scan = u_sess->attr.attr_spq.spq_optimizer_enable_indexscan; + bool old_spq_enable_indexonly_scan = u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan; + + /* generate sql */ + { + StringInfo attrs = makeStringInfo(); /* attrs in SELECT clause */ + StringInfo sortattrs = makeStringInfo(); /* attrs in ORDER BY clause */ + TupleDesc tupdes = RelationGetDescr(index); + int natts = tupdes->natts; + ScanKey scankey = _bt_mkscankey_nodata(index); + Assert(natts > 0); + + for (int i = 0; i < natts; i++, scankey++) { + Form_pg_attribute att = TupleDescAttr(tupdes, i); + appendStringInfo(attrs, ", %s", NameStr(att->attname)); + + appendStringInfo(sortattrs, "%s %s %s", NameStr(att->attname), + ((scankey->sk_flags & SK_BT_DESC) != 0) ? "desc" : "", + ((scankey->sk_flags & SK_BT_NULLS_FIRST) != 0) ? 
"nulls first" : "nulls last"); + if (i != natts - 1) + appendStringInfo(sortattrs, ", "); + } + appendStringInfo(worker->sql, "select ctid %s from %s order by %s, ctid", attrs->data, RelationGetRelationName(heap), + sortattrs->data); + + elog(INFO, "sql: %s", worker->sql->data); + } + + u_sess->attr.attr_spq.gauss_enable_spq = true; + u_sess->attr.attr_spq.spq_optimizer_enable_indexscan = false; + u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan = false; + + SPI_connect(); + + if ((worker->plan = SPI_prepare(worker->sql->data, 0, NULL)) == NULL) + ereport(ERROR, + (errcode(ERRCODE_SPI_PREPARE_FAILURE), + errmsg("SPI_prepare(\"%s\") failed: %s", worker->sql->data, SPI_result_code_string(SPI_result)))); + + if ((worker->portal = SPI_cursor_open(NULL, worker->plan, NULL, NULL, true)) == NULL) + ereport(ERROR, + (errcode(ERRCODE_SPI_CURSOR_OPEN_FAILURE), + errmsg("SPI_cursor_open(\"%s\") failed: %s", worker->sql->data, SPI_result_code_string(SPI_result)))); + + u_sess->attr.attr_spq.gauss_enable_spq = old_enable_spq; + u_sess->attr.attr_spq.spq_optimizer_enable_indexscan = old_spq_enable_index_scan; + u_sess->attr.attr_spq.spq_optimizer_enable_indexonlyscan = old_spq_enable_indexonly_scan; + + return worker; +} + +void spq_worker_produce(SPQWorkerState *worker) +{ + SPQSharedContext *shared = worker->shared; + IndexTupleBuffer *buffer = NULL; + + for (;;) { + { + while (true) { + SpinLockAcquire(&shared->mutex); + if (shared->bufferidx < SPQ_QUEUE_SIZE) { + int nextidx = GET_IDX(shared->producer); + shared->producer = nextidx; + SpinLockRelease(&shared->mutex); + buffer = GET_BUFFER(shared, nextidx); + elog(DEBUG3, "spq btbuild worker get buffer ok %d", nextidx); + break; + } + SpinLockRelease(&shared->mutex); + condition_time_wait(shared); + } + } + + Assert(buffer); + spq_init_buffer(buffer); + + { + if (!worker->all_fetched) { + /* save itup from last SPI_cursor_fetch at worker->processed */ + if (!index_form_tuple_buffer(worker, buffer, worker->processed)) 
{ + goto produce_buffer; + } else { + SPI_freetuptable(SPI_tuptable); + } + } + + /* fetch SPQ_BATCH_SIZE nums of tuples from portal */ + SPI_cursor_fetch(worker->portal, true, SPQ_BATCH_SIZE); + + if (SPI_processed == 0 && worker->all_fetched) { + SpinLockAcquire(&shared->mutex); + shared->done = true; + SpinLockRelease(&shared->mutex); + condition_signal(shared); + return; + } + + /* reset worker status, save itup from the beginning */ + worker->processed = 0; + worker->all_fetched = true; + if (!index_form_tuple_buffer(worker, buffer, 0)) { + goto produce_buffer; + } else { + SPI_freetuptable(SPI_tuptable); + } + + produce_buffer: + SpinLockAcquire(&shared->mutex); + shared->bufferidx++; + SpinLockRelease(&shared->mutex); + condition_signal(shared); + } + } +} + +void spq_init_buffer(IndexTupleBuffer *buffer) +{ + buffer->queue_size = 0; + buffer->offset = 0; + buffer->idx = 0; +} + +bool index_form_tuple_buffer(SPQWorkerState *worker, IndexTupleBuffer *buffer, uint64 begin) +{ + for (uint64 i = begin; i < SPI_processed; i++) { + Datum values[INDEX_MAX_KEYS + 1]; + bool nulls[INDEX_MAX_KEYS + 1]; + IndexTuple ituple; + ItemPointer ip; + Size used; + HeapTuple tup = SPI_tuptable->vals[i]; + heap_deform_tuple(tup, SPI_tuptable->tupdesc, values, nulls); + + ip = (ItemPointer)values[0]; + ituple = + index_form_tuple_allocated(RelationGetDescr(worker->index), values + 1, nulls + 1, + GET_BUFFER_MEM(buffer) + buffer->offset, SPQ_MEM_SIZE - buffer->offset, &used); + + if (ituple == NULL) { + worker->processed = i; + worker->all_fetched = false; + return false; + } + + SPQSharedContext *shared = worker->shared; + + elog(DEBUG5, "spq btbuild worker, put index tuple, buffer %d, offset %d, citd (%u, %u)", shared->producer, + buffer->offset, ItemPointerGetBlockNumber(ip), ItemPointerGetOffsetNumber(ip)); + + ituple->t_tid = *ip; + buffer->addr[buffer->queue_size++] = buffer->offset; + buffer->offset += used; + } + return true; +} + +void spq_worker_finish(SPQWorkerState 
*worker) +{ + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(worker->portal); + SPI_freeplan(worker->plan); + SPI_finish(); +} + +void spq_leafbuild(SPQLeaderState *spqleader) +{ + BTWriteState wstate; + +#ifdef BTREE_BUILD_STATS + if (u_sess->attr.attr_resource.log_btree_build_stats) { + ShowUsage("BTREE BUILD (Spool) STATISTICS"); + ResetUsage(); + } +#endif /* BTREE_BUILD_STATS */ + + wstate.spqleader = spqleader; + wstate.heap = spqleader->heap; + wstate.index = spqleader->index; + wstate.inskey = _bt_mkscankey(wstate.index, NULL); + wstate.inskey->allequalimage = btree_allequalimage(wstate.index, true); + + /* + * We need to log index creation in WAL iff WAL archiving/streaming is + * enabled UNLESS the index isn't WAL-logged anyway. + */ + wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); + + /* reserve the metapage */ + wstate.btws_pages_alloced = BTREE_METAPAGE + 1; + wstate.btws_pages_written = 0; + wstate.btws_zeropage = NULL; /* until needed */ + + spq_load(wstate); + + if (wstate.btws_zeropage != NULL) { + pfree(wstate.btws_zeropage); + wstate.btws_zeropage = NULL; + } +} + +IndexTuple spq_consume(SPQLeaderState *spqleader) +{ + SPQSharedContext *shared = spqleader->shared; + IndexTupleBuffer *buffer = spqleader->buffer; + + for (;;) { + if (buffer && buffer->idx < buffer->queue_size) { + Size offset = buffer->addr[buffer->idx++]; + IndexTuple itup = (IndexTuple)(GET_BUFFER_MEM(buffer) + offset); + elog(DEBUG5, "spq btbuild leader, get index tuple, buffer %d, offset %lu, citd (%u, %u)", shared->consumer, + offset, ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid)); + return itup; + } + + /* notify producer */ + if (buffer) { + SpinLockAcquire(&shared->mutex); + shared->bufferidx--; + SpinLockRelease(&shared->mutex); + condition_signal(shared); + spqleader->processed++; + } + + /* get buffer */ + { + while (true) { + SpinLockAcquire(&shared->mutex); + + if (shared->bufferidx > 0) { + int next = 
GET_IDX(shared->consumer); + shared->consumer = next; + SpinLockRelease(&shared->mutex); + elog(DEBUG3, "spq btbuild leader get buffer %d", next); + spqleader->buffer = buffer = GET_BUFFER(shared, next); + break; + } + + if (shared->done) { + SpinLockRelease(&shared->mutex); + return NULL; + } + SpinLockRelease(&shared->mutex); + condition_time_wait(shared); + } + } + } +} + +void condition_time_wait(SPQSharedContext *shared) +{ + struct timespec time_to_wait; + (void)pthread_mutex_lock(&shared->m_mutex); + (void)clock_gettime(CLOCK_MONOTONIC, &time_to_wait); + + time_to_wait.tv_nsec += 100 * NANOSECONDS_PER_MILLISECOND; + if (time_to_wait.tv_nsec >= NANOSECONDS_PER_SECOND) { + time_to_wait.tv_nsec -= NANOSECONDS_PER_SECOND; + time_to_wait.tv_sec += 1; + } + int res = pthread_cond_timedwait(&shared->m_cond, &shared->m_mutex, &time_to_wait); + if (res != 0 && res != ETIMEDOUT) { + ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("spq btbuild error."))); + } + (void)pthread_mutex_unlock(&shared->m_mutex); +} + +void condition_signal(SPQSharedContext *shared) +{ + (void)pthread_mutex_lock(&shared->m_mutex); + (void)pthread_cond_signal(&shared->m_cond); + (void)pthread_mutex_unlock(&shared->m_mutex); +} +#endif \ No newline at end of file diff --git a/src/gausskernel/storage/ipc/ipci.cpp b/src/gausskernel/storage/ipc/ipci.cpp index 86c90b166..c42be81fb 100644 --- a/src/gausskernel/storage/ipc/ipci.cpp +++ b/src/gausskernel/storage/ipc/ipci.cpp @@ -84,6 +84,10 @@ #include "storage/cfs/cfs_buffers.h" #include "ddes/dms/ss_txnstatus.h" +#ifdef USE_SPQ +#include "executor/node/nodeShareInputScan.h" +#endif + /* we use semaphore not LWLOCK, because when thread InitGucConfig, it does not get a t_thrd.proc */ pthread_mutex_t gLocaleMutex = PTHREAD_MUTEX_INITIALIZER; @@ -149,6 +153,9 @@ Size ComputeTotalSizeOfShmem() size = add_size(size, SInvalShmemSize()); size = add_size(size, PMSignalShmemSize()); size = add_size(size, ProcSignalShmemSize()); +#ifdef USE_SPQ + size 
= add_size(size, ShareInputShmemSize()); +#endif size = add_size(size, CheckpointerShmemSize()); size = add_size(size, PageWriterShmemSize()); size = add_size(size, AutoVacuumShmemSize()); @@ -354,7 +361,9 @@ void CreateSharedMemoryAndSemaphores(bool makePrivate, int port) */ PMSignalShmemInit(); ProcSignalShmemInit(); - +#ifdef USE_SPQ + ShareInputShmemInit(); +#endif { CheckpointerShmemInit(); CBMShmemInit(); diff --git a/src/gausskernel/storage/ipc/procsignal.cpp b/src/gausskernel/storage/ipc/procsignal.cpp index 0d4bf4edd..1879d85dc 100755 --- a/src/gausskernel/storage/ipc/procsignal.cpp +++ b/src/gausskernel/storage/ipc/procsignal.cpp @@ -286,7 +286,7 @@ void procsignal_sigusr1_handler(SIGNAL_ARGS) WLMCheckSigRecvData(); if (CheckProcSignal(PROCSIG_SPACE_LIMIT)) WLMCheckSpaceLimit(); -#ifndef ENABLE_MULTIPLE_NODES +#if (!defined ENABLE_MULTIPLE_NODES) && (!defined USE_SPQ) if (CheckProcSignal(PROCSIG_STREAM_STOP_CHECK)) StreamMarkStop(); #endif diff --git a/src/gausskernel/storage/lmgr/lwlocknames.txt b/src/gausskernel/storage/lmgr/lwlocknames.txt index 06c70bce8..c2f253d39 100755 --- a/src/gausskernel/storage/lmgr/lwlocknames.txt +++ b/src/gausskernel/storage/lmgr/lwlocknames.txt @@ -143,4 +143,5 @@ OndemandXLogFileHandleLock 133 ExrtoSnapshotLock 134 RedoTruncateLock 135 -ExrtoRecycleResidualUndoLock 137 \ No newline at end of file +ExrtoRecycleResidualUndoLock 137 +ShareInputScanLock 138 diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 296b63346..dea6a8444 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -482,4 +482,7 @@ extern bool ResolveCminCmaxDuringDecoding( struct HTAB* tuplecid_data, Snapshot snapshot, HeapTuple htup, Buffer buffer, CommandId* cmin, CommandId* cmax); extern TableScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, uint32 flags, ParallelHeapScanDesc parallel_scan, RangeScanInRedis rangeScanInRedis = {false, 0, 0}); +#ifdef USE_SPQ 
+extern Relation try_table_open(Oid relationId, LOCKMODE lockmode); +#endif #endif /* HEAPAM_H */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 86c611596..c34fad5c0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -27,6 +27,10 @@ #include "access/relscan.h" #include "nodes/execnodes.h" +#ifdef USE_SPQ +#include "access/spq_btbuild.h" +#endif + /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -1104,6 +1108,9 @@ typedef struct BTWriteState { BlockNumber btws_pages_alloced; /* # pages allocated */ BlockNumber btws_pages_written; /* # pages written out */ Page btws_zeropage; /* workspace for filling zeroes */ +#ifdef USE_SPQ + SPQLeaderState *spqleader; /* spq btbuild leader */ +#endif } BTWriteState; typedef struct BTOrderedIndexListElement { @@ -1242,6 +1249,10 @@ typedef struct { * BTBuildState. Workers have their own spool and spool2, though.) */ BTLeader *btleader; +#ifdef USE_SPQ + /* spq btbuild leader */ + SPQLeaderState *spqleader; +#endif } BTBuildState; /* @@ -1414,6 +1425,9 @@ extern void btree_check_third_page(Relation rel, Relation heap, bool need_heapti extern int btree_num_keep_atts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright); extern bool btree_allequalimage(Relation rel, bool debugmessage); +#ifdef USE_SPQ +extern void spq_load(BTWriteState wstate); +#endif /* * prototypes for functions in nbtxlog.c diff --git a/src/include/access/spq_btbuild.h b/src/include/access/spq_btbuild.h new file mode 100644 index 000000000..99ddf3fc0 --- /dev/null +++ b/src/include/access/spq_btbuild.h @@ -0,0 +1,103 @@ +/* ------------------------------------------------------------------------- + * + * spq_btbuild.h + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spq_btbuild.h + * + * 
-------------------------------------------------------------------------
+ */
+
+#ifdef USE_SPQ
+#ifndef SPQ_BTBUILD_H
+#define SPQ_BTBUILD_H
+
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "executor/spi.h"
+#include "utils/portal.h"
+
+#define NANOSECONDS_PER_MILLISECOND 1000000L
+#define NANOSECONDS_PER_SECOND 1000000000L
+
+#define SPQ_BATCH_SIZE (u_sess->attr.attr_spq.spq_batch_size)
+#define SPQ_MEM_SIZE (u_sess->attr.attr_spq.spq_mem_size)
+#define SPQ_QUEUE_SIZE (u_sess->attr.attr_spq.spq_queue_size)
+
+#define ENABLE_SPQ (u_sess->attr.attr_spq.spq_enable_btbuild)
+
+#define GET_IDX(i) (((i) + 1) % SPQ_QUEUE_SIZE)
+
+#define SPQ_BUFFER_SIZE \
+    (sizeof(IndexTupleBuffer) + sizeof(Size) * SPQ_BATCH_SIZE + sizeof(char) * SPQ_MEM_SIZE * (INDEX_MAX_KEYS + 1))
+
+#define SPQ_SHARED_SIZE (SPQ_BUFFER_SIZE * SPQ_QUEUE_SIZE)
+
+#define GET_BUFFER(SPQ_SHARED, INDEX) ((IndexTupleBuffer *)((char *)(SPQ_SHARED)->addr + SPQ_BUFFER_SIZE * (INDEX)))
+
+#define GET_BUFFER_MEM(ITUPLE) ((char *)(ITUPLE)->addr + sizeof(Size) * SPQ_BATCH_SIZE)
+
+typedef struct SPQSharedContext {
+    /*
+     * These fields are not modified during the sort. They primarily exist
+     * for the benefit of worker processes that need to create BTSpool state
+     * corresponding to that used by the leader.
+ */ + Oid heaprelid; + Oid indexrelid; + bool isunique; + bool isconcurrent; + + slock_t mutex; + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; + + volatile bool done; /* flag if all tuples have been fetched */ + volatile int bufferidx; /* buffer index */ + volatile int consumer; /* buffer consume */ + volatile int producer; /* buffer produce */ + char addr[0]; /* varlen */ + Snapshot snapshot; + int dop; +} SPQSharedContext; + +typedef struct IndexTupleBuffer { + volatile int queue_size; /* the number of index tuples in this buffer */ + volatile int idx; /* current tuple index in this buffer */ + volatile int offset; /* memory offset in this buffer */ + Size addr[0]; /* varlen offset: ituple + mem */ +} IndexTupleBuffer; + +typedef struct SPQLeaderState { + Relation heap; + Relation index; + IndexTupleBuffer *buffer; + SPQSharedContext *shared; + double processed; + Snapshot snapshot; +} SPQLeaderState; + +typedef struct SPQWorkerState { + SPIPlanPtr plan; + Portal portal; + StringInfo sql; + + Relation heap; + Relation index; + SPQSharedContext *shared; + uint64 processed; /* location from the last produce buffer */ + bool all_fetched; /* flag if worker has managed all tuples */ +} SPQWorkerState; + +Datum spqbtbuild(Relation heap, Relation index, IndexInfo *indexInfo); + +IndexTuple spq_consume(SPQLeaderState *spqleader); + +bool enable_spq_btbuild(Relation rel); + +bool enable_spq_btbuild_cic(Relation rel); + +#endif // SPQ_BTBUILD_H +#endif diff --git a/src/include/catalog/pg_foreign_server.h b/src/include/catalog/pg_foreign_server.h index 6196981ff..27b6dfa42 100644 --- a/src/include/catalog/pg_foreign_server.h +++ b/src/include/catalog/pg_foreign_server.h @@ -62,5 +62,9 @@ typedef FormData_pg_foreign_server *Form_pg_foreign_server; #define Anum_pg_foreign_server_srvacl 6 #define Anum_pg_foreign_server_srvoptions 7 +#ifdef USE_SPQ +#define GS_EXTTABLE_SERVER_NAME "gs_exttable_server" +#endif + #endif /* PG_FOREIGN_SERVER_H */ diff --git 
a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 303ff921b..344940d82 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -471,6 +471,11 @@ typedef FormData_pg_proc *Form_pg_proc; #define PROARGMODE_VARIADIC 'v' #define PROARGMODE_TABLE 't' +#ifdef USE_SPQ +#define PRODATAACCESS_NONE 'n' +#define PRODATAACCESS_ANY 'a' +#endif + #define PROC_LIB_PATH "$libdir/" #define PORC_PLUGIN_LIB_PATH "$libdir/pg_plugin/" #define PORC_SRC_LIB_PATH "$libdir/proc_srclib/" diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 64330128c..7b6fef835 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -158,6 +158,7 @@ typedef struct PlanTableEntry { #define OPTIONSLEN 256 #define OBJECTLEN 31 #define PROJECTIONLEN 4001 +#define SPQNODENAMELEN 256 /* plan_table_data column defination. */ typedef struct PlanTableData { @@ -473,6 +474,9 @@ typedef struct ExplainState { bool is_explain_gplan; char* opt_model_name; ExplainFRSqlState es_frs; /* explain state for remote sql of foreign scan. 
*/ +#ifdef USE_SPQ + int current_id; +#endif } ExplainState; /* Hook for plugins to get control in explain_get_index_name() */ diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index b0a0339d3..b985b5b57 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -236,4 +236,7 @@ extern void SetPartionIndexType(IndexStmt* stmt, Relation rel, bool is_alter_tab extern bool ConstraintSatisfyAutoIncrement(HeapTuple tuple, TupleDesc desc, AttrNumber attrnum, char contype); extern void CheckRelAutoIncrementIndex(Oid relid, LOCKMODE lockmode); extern void RebuildDependViewForProc(Oid proc_oid); +#ifdef USE_SPQ +extern void spq_btbuild_update_pg_class(Relation heap, Relation index); +#endif #endif /* TABLECMDS_H */ diff --git a/src/include/distributelayer/streamConsumer.h b/src/include/distributelayer/streamConsumer.h index efb0a8e48..3fd717c49 100644 --- a/src/include/distributelayer/streamConsumer.h +++ b/src/include/distributelayer/streamConsumer.h @@ -74,6 +74,12 @@ public: /* Get nodeIdx of producer by nodename. */ int getNodeIdx(const char* nodename); +#ifdef USE_SPQ + /* Get expectProducer nodeName */ + char* getExpectProducerNodeName(); + void setPstmt(PlannedStmt* p_stmt); +#endif + /* Get shared context for local stream. */ inline StreamSharedContext* getSharedContext() { @@ -96,6 +102,9 @@ private: void updateTransportInfo(StreamValue* val); private: +#ifdef USE_SPQ + PlannedStmt* m_plan; +#endif /* Current producer number. */ int m_currentProducerNum; diff --git a/src/include/distributelayer/streamProducer.h b/src/include/distributelayer/streamProducer.h index d789d8a44..85c105452 100644 --- a/src/include/distributelayer/streamProducer.h +++ b/src/include/distributelayer/streamProducer.h @@ -106,6 +106,11 @@ public: /* Send tuple with Roundrobin. */ void roundRobinStream(TupleTableSlot* tuple, DestReceiver* self); +#ifdef USE_SPQ + /* Send batch with Roundrobin. 
*/ + void roundRobinStream(VectorBatch* batch); +#endif + /* Local roundrobin the tuple through memory. */ void localRoundRobinStream(TupleTableSlot* tuple); @@ -271,6 +276,9 @@ public: m_threadInit = flag; } + /* save expr context to producer. */ + void setEcontext(ExprContext* econtext); + void setUniqueSQLKey(uint64 unique_sql_id, Oid unique_user_id, uint32 unique_cn_id); void setGlobalSessionId(GlobalSessionId* globalSessionId); void getGlobalSessionId(GlobalSessionId* globalSessionId); @@ -352,6 +360,9 @@ private: template void redistributeTupleChannel(TupleTableSlot* tuple); + template + void redistributeTupleChannelWithExpr(TupleTableSlot* tuple); + /* Choose which channel to send by hash value. */ template inline int ChannelLocalizer(ScalarValue hashValue, int Dop, int nodeSize); @@ -508,6 +519,10 @@ private: /* global session id */ GlobalSessionId m_globalSessionId; + + bool m_hasExprKey; + List* m_exprkeystate; + ExprContext* m_econtext; }; extern THR_LOCAL StreamProducer* streamProducer; diff --git a/src/include/executor/exec/execStream.h b/src/include/executor/exec/execStream.h index af2131941..09f049dac 100644 --- a/src/include/executor/exec/execStream.h +++ b/src/include/executor/exec/execStream.h @@ -115,6 +115,9 @@ typedef struct StreamState { int64* spill_size; void* sortstate; /* merge sort for stream */ bool receive_message; /* The stream consumer has receive message from then producer */ +#ifdef USE_SPQ + bool skip_direct_distribute_result; +#endif } StreamState; extern StreamState* ExecInitStream(Stream* node, EState* estate, int eflags); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index bfcb01803..491d58541 100755 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -841,4 +841,7 @@ private: bool m_smpEnabled; }; +#ifdef USE_SPQ +extern bool IsJoinExprNull(List *joinExpr, ExprContext *econtext); +#endif #endif /* EXECUTOR_H */ diff --git a/src/include/executor/instrument.h 
b/src/include/executor/instrument.h index 16cc2a6d4..69da0ccd9 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -724,11 +724,15 @@ public: /* get ThreadInstrumentation */ ThreadInstrumentation *getThreadInstrumentation(int idx, int planNodeId, int smpId) { - ThreadInstrumentation *threadInstr = -#ifdef ENABLE_MULTIPLE_NODES - getThreadInstrumentationCN(idx, planNodeId, smpId); + ThreadInstrumentation *threadInstr = NULL; +#if defined(ENABLE_MULTIPLE_NODES) || defined(USE_SPQ) + if (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) { + threadInstr = getThreadInstrumentationCN(idx, planNodeId, smpId); + } else { + threadInstr = getThreadInstrumentationDN(planNodeId, smpId); + } #else - getThreadInstrumentationDN(planNodeId, smpId); + threadInstr = getThreadInstrumentationDN(planNodeId, smpId); #endif /* ENABLE_MULTIPLE_NODES */ return threadInstr; } diff --git a/src/include/executor/node/nodeAssertOp.h b/src/include/executor/node/nodeAssertOp.h new file mode 100644 index 000000000..851371d6b --- /dev/null +++ b/src/include/executor/node/nodeAssertOp.h @@ -0,0 +1,28 @@ +/* ------------------------------------------------------------------------- + * + * nodeAssertOp.h + * + * + * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. 
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/node/nodeAssertOp.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef NODEASSERTOP_H +#define NODEASSERTOP_H + +#ifdef USE_SPQ +#include "nodes/execnodes.h" + +extern void ExecAssertOpExplainEnd(PlanState *planstate, struct StringInfoData *buf); +extern TupleTableSlot* ExecAssertOp(PlanState *node); +extern AssertOpState* ExecInitAssertOp(AssertOp *node, EState *estate, int eflags); +extern void ExecEndAssertOp(AssertOpState *node); +extern void ExecReScanAssertOp(AssertOpState *node); +#endif /* USE_SPQ */ + +#endif /* NODEASSERTOP_H */ diff --git a/src/include/executor/node/nodeHash.h b/src/include/executor/node/nodeHash.h index 94768b570..2206a058c 100644 --- a/src/include/executor/node/nodeHash.h +++ b/src/include/executor/node/nodeHash.h @@ -29,6 +29,10 @@ extern HashJoinTable ExecHashTableCreate(Hash* node, List* hashOperators, bool k extern void ExecHashTableDestroy(HashJoinTable hashtable); extern void ExecHashTableInsert(HashJoinTable hashtable, TupleTableSlot* slot, uint32 hashvalue, int planid, int dop, Instrumentation* instrument = NULL); +#ifdef USE_SPQ +extern bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext* econtext, List* hashkeys, bool outer_tuple, + bool keep_nulls, uint32* hashvalue, bool *hashkeys_null); +#endif extern bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext* econtext, List* hashkeys, bool outer_tuple, bool keep_nulls, uint32* hashvalue); extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable, uint32 hashvalue, int* bucketno, int* batchno); diff --git a/src/include/executor/node/nodeSequence.h b/src/include/executor/node/nodeSequence.h new file mode 100644 index 000000000..60109d941 --- /dev/null +++ b/src/include/executor/node/nodeSequence.h @@ -0,0 +1,29 @@ 
+/*------------------------------------------------------------------------- + * + * nodeSequence.h + * header file for nodeSequence.cpp. + * + * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 2012 - 2022, EMC/Greenplum + * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. + * + * + * IDENTIFICATION + * src/include/executor/node/nodeSequence.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODESEQUENCE_H +#define NODESEQUENCE_H + +#ifdef USE_SPQ +#include "executor/tuptable.h" +#include "nodes/execnodes.h" + +extern SequenceState *ExecInitSequence(Sequence *node, EState *estate, int eflags); +extern TupleTableSlot *ExecSequence(PlanState *pstate); +extern void ExecReScanSequence(SequenceState *node); +extern void ExecEndSequence(SequenceState *node); +#endif /* USE_SPQ */ + +#endif diff --git a/src/include/executor/node/nodeShareInputScan.h b/src/include/executor/node/nodeShareInputScan.h new file mode 100644 index 000000000..0e9a6c900 --- /dev/null +++ b/src/include/executor/node/nodeShareInputScan.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * nodeShareInputScan.h + * + * Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 2012-2021 VMware, Inc. or its affiliates. 
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/include/executor/node/nodeShareInputScan.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODESHAREINPUTSCAN_H +#define NODESHAREINPUTSCAN_H + +#ifdef USE_SPQ +#include "nodes/execnodes.h" +#include "storage/sharedfileset.h" + +extern ShareInputScanState *ExecInitShareInputScan(ShareInputScan *node, EState *estate, int eflags); +extern void ExecEndShareInputScan(ShareInputScanState *node); +extern void ExecReScanShareInputScan(ShareInputScanState *node); +extern TupleTableSlot *ExecShareInputScan(PlanState *pstate); + +extern Size ShareInputShmemSize(void); +extern void ShareInputShmemInit(void); + +extern SharedFileSet *get_shareinput_fileset(void); + +extern void tuplestore_make_shared(Tuplestorestate *state, SharedFileSet *fileset, const char *filename); +extern void tuplestore_freeze(Tuplestorestate *state); +extern Tuplestorestate *tuplestore_open_shared(SharedFileSet *fileset, const char *filename); +#endif /* USE_SPQ */ + +#endif /* NODESHAREINPUTSCAN_H */ diff --git a/src/include/executor/node/nodeSpqSeqscan.h b/src/include/executor/node/nodeSpqSeqscan.h new file mode 100644 index 000000000..c3cff808c --- /dev/null +++ b/src/include/executor/node/nodeSpqSeqscan.h @@ -0,0 +1,32 @@ +/* ------------------------------------------------------------------------- +* +* nodeSpqSeqscan.h +* +* Portions Copyright (c) 2023 Huawei Technologies Co.,Ltd. 
+* +* src/include/executor/node/nodeSpqSeqscan.h +* +* ------------------------------------------------------------------------- + */ +#ifdef USE_SPQ +#ifndef NODESPQSEQSCAN_H +#define NODESPQSEQSCAN_H + +#include "nodes/execnodes.h" + +typedef SpqSeqScanState* (*init_spqscan_hook_type)(SpqSeqScan* node, EState* estate, int eflags); +typedef TupleTableSlot* (*exec_spqscan_hook_type)(PlanState* node); +typedef void (*end_spqscan_hook_type)(SpqSeqScanState* node); +typedef void (*spqscan_rescan_hook_type)(SpqSeqScanState* node); + +extern THR_LOCAL init_spqscan_hook_type init_spqscan_hook; +extern THR_LOCAL exec_spqscan_hook_type exec_spqscan_hook; +extern THR_LOCAL end_spqscan_hook_type end_spqscan_hook; +extern THR_LOCAL spqscan_rescan_hook_type spqscan_rescan_hook; + +// unchanged function compare with seqscan +extern void ExecSpqSeqMarkPos(SpqSeqScanState* node); +extern void ExecSpqSeqRestrPos(SpqSeqScanState* node); + +#endif // NODESPQSEQSCAN_H +#endif diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h index 8b83e9a20..0e005163a 100644 --- a/src/include/executor/tuptable.h +++ b/src/include/executor/tuptable.h @@ -249,5 +249,12 @@ extern void heap_slot_getsomeattrs(TupleTableSlot* slot, int attnum); extern bool heap_slot_attisnull(TupleTableSlot* slot, int attnum); extern void heap_slot_formbatch(TupleTableSlot* slot, struct VectorBatch* batch, int cur_rows, int attnum); +#ifdef USE_SPQ +extern Datum slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull); +extern void slot_getsomeattrs(TupleTableSlot *slot, int attnum); +extern void slot_getallattrs(TupleTableSlot *slot); +extern Datum heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc); +#endif + #endif /* !FRONTEND_PARSER */ #endif /* TUPTABLE_H */ diff --git a/src/include/foreign/foreign.h b/src/include/foreign/foreign.h index 01bea24e9..0d72c42f0 100644 --- a/src/include/foreign/foreign.h +++ b/src/include/foreign/foreign.h @@ -444,4 +444,7 @@ extern void 
AdvanceFDWUpperPlan(FDWUpperRelCxt* ufdwCxt, UpperRelationKind stage (strncmp(passwd, ENCRYPT_STR_PREFIX, strlen(ENCRYPT_STR_PREFIX)) == 0 && \ strlen(passwd) >= MIN_ENCRYPTED_PASSWORD_LENGTH) +#ifdef USE_SPQ +bool rel_is_external_table(Oid relid); +#endif #endif /* FOREIGN_H */ diff --git a/src/include/gs_thread.h b/src/include/gs_thread.h index 8427048be..2a901a5e0 100755 --- a/src/include/gs_thread.h +++ b/src/include/gs_thread.h @@ -154,6 +154,15 @@ typedef enum knl_thread_role { SW_SENDER } knl_thread_role; +#ifdef USE_SPQ +typedef enum { + ROLE_UTILITY = 0, /* Operating as a simple database engine */ + ROLE_QUERY_COORDINTOR, /* Operating as the parallel query dispatcher */ + ROLE_QUERY_EXECUTOR, /* Operating as a parallel query executor */ + ROLE_UNDEFINED /* Should never see this role in use */ +} SpqRole; +#endif + /* * It is an 64bit identifier in Linux x64 system. There are many legacy * code assumes the original pid is 32 bit where we replace with threadId. @@ -167,6 +176,9 @@ typedef struct knl_thread_arg { char* save_para; void* payload; void* t_thrd; +#ifdef USE_SPQ + SpqRole spq_role; +#endif union { struct syslog_thread { int syslog_handle; diff --git a/src/include/knl/knl_guc.h b/src/include/knl/knl_guc.h index 5a604b561..8dcc9ab67 100644 --- a/src/include/knl/knl_guc.h +++ b/src/include/knl/knl_guc.h @@ -51,5 +51,8 @@ #include "knl_guc/knl_instance_attr_resource.h" #include "knl_guc/knl_session_attr_common.h" #include "knl_guc/knl_instance_attr_common.h" +#ifdef USE_SPQ +#include "knl_guc/knl_session_attr_spq.h" +#endif #endif /* SRC_INCLUDE_KNL_KNL_GUC_H_ */ diff --git a/src/include/knl/knl_guc/knl_session_attr_spq.h b/src/include/knl/knl_guc/knl_session_attr_spq.h new file mode 100644 index 000000000..5b4a6954b --- /dev/null +++ b/src/include/knl/knl_guc/knl_session_attr_spq.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. 
+ * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * knl_session_attr_spq.h + * Data struct to store all knl_session_attr_spq GUC variables. + * + * When anyone try to added variable in this file, which means add a guc + * variable, there are several rules needed to obey: + * + * add variable to struct 'knl_@level@_attr_@group@' + * + * @level@: + * 1. instance: the level of guc variable is PGC_POSTMASTER. + * 2. session: the other level of guc variable. + * + * @group@: sql, storage, security, network, memory, resource, common, spq + * select the group according to the type of guc variable. 
+ * + * + * IDENTIFICATION + * src/include/knl/knl_guc/knl_session_attr_spq.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef SRC_INCLUDE_KNL_KNL_SESSION_ATTR_SPQ_H_ +#define SRC_INCLUDE_KNL_KNL_SESSION_ATTR_SPQ_H_ + +#include "knl/knl_guc/knl_guc_common.h" +#ifdef PGXC +#include "pgxc/nodemgr.h" +#endif + +struct NodeDefinition; + +typedef struct knl_session_attr_spq { + /* Optimizer related gucs */ + bool gauss_enable_spq; + bool spq_optimizer_log; + int spq_optimizer_minidump; + int spq_optimizer_cost_model; + bool spq_optimizer_metadata_caching; + int spq_optimizer_mdcache_size; + bool spq_optimizer_use_gauss_allocators; + + /* Optimizer debugging GUCs */ + bool spq_optimizer_print_query; + bool spq_optimizer_print_plan; + bool spq_optimizer_print_xform; + bool spq_optimizer_print_xform_results; + bool spq_optimizer_print_memo_after_exploration; + bool spq_optimizer_print_memo_after_implementation; + bool spq_optimizer_print_memo_after_optimization; + bool spq_optimizer_print_job_scheduler; + bool spq_optimizer_print_expression_properties; + bool spq_optimizer_print_group_properties; + bool spq_optimizer_print_optimization_context; + bool spq_optimizer_print_optimization_stats; + + bool spq_optimizer_print_optimization_cost; + + /* array of xforms disable flags */ +#define OPTIMIZER_XFORMS_COUNT 400 /* number of transformation rules */ + bool spq_optimizer_xforms[OPTIMIZER_XFORMS_COUNT]; + + /* GUCs to tell Optimizer to enable a physical operator */ + bool spq_optimizer_enable_nljoin; + bool spq_optimizer_enable_indexjoin; + bool spq_optimizer_enable_motions_masteronly_queries; + bool spq_optimizer_enable_motions; + bool spq_optimizer_enable_motion_broadcast; + bool spq_optimizer_enable_motion_gather; + bool spq_optimizer_enable_motion_redistribute; + bool spq_optimizer_discard_redistribute_hashjoin; + bool spq_optimizer_enable_sort; + bool spq_optimizer_enable_materialize; + bool 
spq_optimizer_enable_partition_propagation; + bool spq_optimizer_enable_partition_selection; + bool spq_optimizer_enable_outerjoin_rewrite; + bool spq_optimizer_enable_multiple_distinct_aggs; + bool spq_optimizer_enable_direct_dispatch; + bool spq_optimizer_enable_hashjoin_redistribute_broadcast_children; + bool spq_optimizer_enable_broadcast_nestloop_outer_child; + bool spq_optimizer_enable_streaming_material; + bool spq_optimizer_enable_gather_on_segment_for_dml; + bool spq_optimizer_enable_assert_maxonerow; + bool spq_optimizer_enable_constant_expression_evaluation; + bool spq_optimizer_enable_bitmapscan; + bool spq_optimizer_enable_outerjoin_to_unionall_rewrite; + bool spq_optimizer_enable_ctas; + bool spq_optimizer_enable_partial_index; + bool spq_optimizer_enable_dml; + bool spq_optimizer_enable_dml_triggers; + bool spq_optimizer_enable_dml_constraints; + bool spq_optimizer_enable_master_only_queries; + bool spq_optimizer_enable_hashjoin; + bool spq_optimizer_enable_dynamictablescan; + bool spq_optimizer_enable_indexscan; + bool spq_optimizer_enable_indexonlyscan; + bool spq_optimizer_enable_tablescan; + bool spq_optimizer_enable_seqsharescan; + bool spq_optimizer_enable_shareindexscan; + bool spq_optimizer_enable_hashagg; + bool spq_optimizer_enable_groupagg; + bool spq_optimizer_expand_fulljoin; + bool spq_optimizer_enable_mergejoin; + bool spq_optimizer_prune_unused_columns; + bool spq_optimizer_enable_redistribute_nestloop_loj_inner_child; + bool spq_optimizer_force_comprehensive_join_implementation; + bool spq_optimizer_enable_replicated_table; + + /* Optimizer plan enumeration related GUCs */ + bool spq_optimizer_enumerate_plans; + bool spq_optimizer_sample_plans; + int spq_optimizer_plan_id; + int spq_optimizer_samples_number; + + /* Cardinality estimation related GUCs used by the Optimizer */ + bool spq_optimizer_extract_dxl_stats; + bool spq_optimizer_extract_dxl_stats_all_nodes; + bool spq_optimizer_print_missing_stats; + double 
spq_optimizer_damping_factor_filter; + double spq_optimizer_damping_factor_join; + double spq_optimizer_damping_factor_groupby; + bool spq_optimizer_dpe_stats; + bool spq_optimizer_enable_derive_stats_all_groups; + + /* Costing related GUCs used by the Optimizer */ + int spq_optimizer_segments; + int spq_optimizer_penalize_broadcast_threshold; + double spq_optimizer_cost_threshold; + double spq_optimizer_nestloop_factor; + double spq_optimizer_sort_factor; + double spq_optimizer_share_tablescan_factor; + double spq_optimizer_share_indexscan_factor; + double spq_optimizer_hashjoin_spilling_mem_threshold; + double spq_optimizer_hashjoin_inner_cost_factor; + + /* Optimizer hints */ + int spq_optimizer_join_arity_for_associativity_commutativity; + int spq_optimizer_array_expansion_threshold; + int spq_optimizer_join_order_threshold; + int spq_optimizer_join_order; + int spq_optimizer_cte_inlining_bound; + int spq_optimizer_push_group_by_below_setop_threshold; + int spq_optimizer_xform_bind_threshold; + int spq_optimizer_skew_factor; + bool spq_optimizer_force_multistage_agg; + bool spq_optimizer_force_three_stage_scalar_dqa; + bool spq_optimizer_force_expanded_distinct_aggs; + bool spq_optimizer_force_agg_skew_avoidance; + bool spq_optimizer_penalize_skew; + bool spq_optimizer_prune_computed_columns; + bool spq_optimizer_push_requirements_from_consumer_to_producer; + bool spq_optimizer_enforce_subplans; + bool spq_optimizer_use_external_constant_expression_evaluation_for_ints; + bool spq_optimizer_apply_left_outer_to_union_all_disregarding_stats; + bool spq_optimizer_remove_superfluous_order; + bool spq_optimizer_remove_order_below_dml; + bool spq_optimizer_multilevel_partitioning; + bool spq_optimizer_parallel_union; + bool spq_optimizer_array_constraints; + bool spq_optimizer_cte_inlining; + bool spq_optimizer_enable_space_pruning; + bool spq_optimizer_enable_associativity; + bool spq_optimizer_enable_eageragg; + bool spq_optimizer_enable_orderedagg; + bool 
spq_optimizer_enable_range_predicate_dpe; + + bool spq_enable_pre_optimizer_check; + bool spq_enable_result_hash_filter; + + bool spq_debug_print_full_dtm; + bool spq_debug_cancel_print; + bool spq_print_direct_dispatch_info; + bool spq_log_dispatch_stats; + + int spq_scan_unit_size; + int spq_scan_unit_bit; + char *gauss_cluster_map; + + /* enable spq btbuild */ + bool spq_enable_btbuild; + bool spq_enable_btbuild_cic; + int spq_batch_size; + int spq_mem_size; + int spq_queue_size; +} knl_session_attr_spq; + +/* TODO SPQ Thread Role*/ +typedef struct knl_t_spq_context { + SpqRole spq_role; + uint64 spq_session_id; + int current_id; + bool skip_direct_distribute_result; + int num_nodes; + NodeDefinition* nodesDefinition; +} knl_t_spq_context; +#endif /* SRC_INCLUDE_KNL_KNL_SESSION_ATTR_SPQ_H_ */ diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h index 5d2d7887e..09e586d42 100644 --- a/src/include/knl/knl_session.h +++ b/src/include/knl/knl_session.h @@ -82,6 +82,9 @@ typedef struct knl_session_attr { knl_session_attr_memory attr_memory; knl_session_attr_resource attr_resource; knl_session_attr_common attr_common; +#ifdef USE_SPQ + knl_session_attr_spq attr_spq; +#endif } knl_session_attr; typedef struct knl_u_stream_context { @@ -2660,6 +2663,54 @@ typedef struct knl_u_mot_context { } knl_u_mot_context; #endif + +#ifdef USE_SPQ +namespace spqdxl { + class CDXLMemoryManager; + class CDXLTokens; +} + +namespace spqos { + class CMemoryPool; + class CMemoryPoolManager; + class CWorkerPoolManager; + template class CCache; +} + +namespace spqmd { + class IMDCacheObject; +} + +namespace spqopt { + class CMDKey; +} + + +typedef struct knl_u_spq_context { + /* dxl information */ + spqdxl::CDXLMemoryManager* dxl_memory_manager; + spqos::CMemoryPool* pmpXerces; + spqos::CMemoryPool* pmpDXL; + uintptr_t m_ulpInitDXL; + uintptr_t m_ulpShutdownDXL; + void *m_pstrmap; + void *m_pxmlszmap; + spqos::CMemoryPool* m_mp; + spqdxl::CDXLMemoryManager* 
m_dxl_memory_manager; + /* memory pool manager */ + spqos::CMemoryPoolManager* m_memory_pool_mgr; + /* worker pool manager */ + spqos::CWorkerPoolManager* m_worker_pool_manager; + /* mdcache */ + spqos::CCache *m_pcache; + uint64 m_ullCacheQuota; + int spq_node_all_configs_size; + int spq_node_configs_size; + MemoryContext spq_worker_context; + MemoryContext s_tupSerMemCtxt; + int32 spq_max_tuple_chunk_size; +} knl_u_spq_context; +#endif + typedef struct knl_u_gtt_context { bool gtt_cleaner_exit_registered; HTAB* gtt_storage_local_hash; @@ -2910,6 +2961,10 @@ typedef struct knl_session_context { knl_u_mot_context mot_cxt; #endif +#ifdef USE_SPQ + knl_u_spq_context spq_cxt; +#endif + /* instrumentation */ knl_u_unique_sql_context unique_sql_cxt; knl_u_user_login_context user_login_cxt; diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index b3d5cf3a9..e6ebc3d29 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -939,6 +939,10 @@ typedef struct knl_t_shemem_ptr_context { struct HTAB* undoGroupLinkMap; /* Maintain an image of DCF paxos index file */ struct DCFData *dcfData; +#ifdef USE_SPQ + /* shared memory hash table holding 'shareinput_Xslice_state' entries */ + HTAB *shareinput_Xslice_hash; +#endif } knl_t_shemem_ptr_context; typedef struct knl_t_cstore_context { @@ -3559,6 +3563,9 @@ typedef struct knl_thrd_context { knl_t_dms_context dms_cxt; knl_t_ondemand_xlog_copy_context ondemand_xlog_copy_cxt; knl_t_rc_context rc_cxt; +#ifdef USE_SPQ + knl_t_spq_context spq_ctx; +#endif knl_t_dms_auxiliary_context dms_aux_cxt; } knl_thrd_context; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 591f26fd9..538af8477 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -136,6 +136,7 @@ extern const uint32 INDEX_HINT_VERSION_NUM; extern const uint32 CREATE_TABLE_AS_VERSION_NUM; extern const uint32 GB18030_2022_VERSION_NUM; extern const uint32 
PARTITION_ACCESS_EXCLUSIVE_LOCK_UPGRADE_VERSION; +extern const uint32 SPQ_VERSION_NUM; extern void register_backend_version(uint32 backend_version); extern bool contain_backend_version(uint32 version_number); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index d4fa660ae..19a2a1784 100755 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -722,6 +722,9 @@ typedef struct EState { bool have_current_xact_date; /* Check whether dirty reads exist in the cursor rollback scenario. */ int128 first_autoinc; /* autoinc has increased during this execution */ int result_rel_index; /* which result_rel_info to be excuted when multiple-relation modified. */ +#ifdef USE_SPQ + List *es_sharenode; +#endif } EState; /* @@ -1834,6 +1837,43 @@ typedef struct ScanState { */ typedef ScanState SeqScanState; +#ifdef USE_SPQ +/* + * SpqSeqScanState + */ +typedef struct SpqSeqScanState { + SeqScanState ss; + void* pageManager; + void* blockManager; +} SpqSeqScanState; +typedef struct AssertOpState { + PlanState ps; +} AssertOpState; + +/* ---------------- + * State of each scanner of the ShareInput node + * ---------------- + */ +typedef struct ShareInputScanState { + ScanState ss; + Tuplestorestate *ts_state; + int ts_pos; + struct shareinput_local_state *local_state; + struct shareinput_Xslice_reference *ref; + bool isready; +} ShareInputScanState; + +typedef struct SequenceState { + PlanState ps; + PlanState **subplans; + int numSubplans; + + /* + * True if no subplan has been executed. + */ + bool initState; +} SequenceState; +#endif /* * These structs store information about index quals that don't have simple * constant right-hand sides. 
See comments for ExecIndexBuildScanKeys() @@ -2204,6 +2244,12 @@ typedef struct NestLoopState { bool nl_MatchedOuter; bool nl_MaterialAll; TupleTableSlot* nl_NullInnerTupleSlot; +#ifdef USE_SPQ + List *nl_InnerJoinKeys; /* list of ExprState nodes */ + List *nl_OuterJoinKeys; /* list of ExprState nodes */ + bool nl_innerSideScanned; /* set to true once we've scanned all inner tuples the first time */ + bool prefetch_inner; +#endif } NestLoopState; /* ---------------- @@ -2381,6 +2427,12 @@ typedef struct HashJoinState { bool hj_streamBothSides; bool hj_rebuildHashtable; List* hj_hashCollations; /* list of collations OIDs */ +#ifdef USE_SPQ + bool hj_nonequijoin; /* set true if force hash table to keep nulls */ + bool hj_InnerEmpty; /* set to true if inner side is empty */ + bool prefetch_inner; + bool is_set_op_join; +#endif } HashJoinState; /* ---------------------------------------------------------------- @@ -2480,6 +2532,9 @@ typedef struct AggState { AggStatePerAgg peragg; /* per-Aggref information */ MemoryContext* aggcontexts; /* memory context for long-lived data */ ExprContext* tmpcontext; /* econtext for input expressions */ +#ifdef USE_SPQ + AggSplit aggsplittype; /* agg-splitting mode, see nodes.h */ +#endif AggStatePerAgg curperagg; /* identifies currently active aggregate */ bool input_done; /* indicates end of input */ bool agg_done; /* indicates completion of Agg scan */ @@ -2628,7 +2683,11 @@ typedef struct HashState { List* hashkeys; /* list of ExprState nodes */ int32 local_work_mem; /* work_mem local for this hash join */ int64 spill_size; - +#ifdef USE_SPQ + bool hs_keepnull; /* Keep nulls */ + bool hs_quit_if_hashkeys_null; /* quit building hash table if hashkeys are all null */ + bool hs_hashkeys_null; /* found an instance wherein hashkeys are all null */ +#endif /* hashkeys is same as parent's hj_InnerHashKeys */ } HashState; diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index f6d307350..7cb091f06 100755 --- 
a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -45,6 +45,10 @@ typedef enum NodeTag { * TAGS FOR PLAN NODES (plannodes.h) */ T_Plan = 100, +#ifdef USE_SPQ + T_Plan_Start, + T_Result, +#endif T_BaseResult, T_ProjectSet, T_ModifyTable, @@ -57,6 +61,9 @@ typedef enum NodeTag { T_BitmapOr, T_Scan, T_SeqScan, +#ifdef USE_SPQ + T_SpqSeqScan, +#endif T_IndexScan, T_IndexOnlyScan, T_BitmapIndexScan, @@ -114,6 +121,18 @@ typedef enum NodeTag { T_CreateAppWorkloadGroupMappingStmt, T_AlterAppWorkloadGroupMappingStmt, T_DropAppWorkloadGroupMappingStmt, +#endif +#ifdef USE_SPQ + T_Sequence, + T_DynamicSeqScan, + T_DynamicBitmapHeapScan, + T_Motion, + T_ShareInputScan, + T_SplitUpdate, + T_AssertOp, + T_PartitionSelector, + T_PartitionPruneInfo, + T_Plan_End, #endif /* these aren't subclasses of Plan: */ T_NestLoopParam, @@ -159,6 +178,12 @@ typedef enum NodeTag { T_BitmapOrState, T_ScanState, T_SeqScanState, +#ifdef USE_SPQ + T_SpqSeqScanState, + T_AssertOpState, + T_ShareInputScanState, + T_SequenceState, +#endif T_IndexScanState, T_IndexOnlyScanState, T_BitmapIndexScanState, @@ -572,6 +597,9 @@ typedef enum NodeTag { T_Constraint, T_DefElem, T_RangeTblEntry, +#ifdef USE_SPQ + T_RangeTblFunction, +#endif T_WithCheckOption, T_TableSampleClause, T_TimeCapsuleClause, @@ -829,6 +857,9 @@ typedef enum NodeTag { T_CondInterval, T_IndexCI, T_RelCI, +#ifdef USE_SPQ + T_GpPolicy, +#endif T_CentroidPoint, T_UserSetElem, T_UserVar, @@ -1066,8 +1097,12 @@ typedef enum JoinType { JOIN_RIGHT_ANTI, /* Right Anti join */ JOIN_LEFT_ANTI_FULL, /* unmatched LHS tuples */ - JOIN_RIGHT_ANTI_FULL /* unmatched RHS tuples */ - + JOIN_RIGHT_ANTI_FULL, /* unmatched RHS tuples */ +#ifdef USE_SPQ + JOIN_LASJ_NOTIN, /* Left Anti Semi Join with Not-In semantics: */ + /* If any NULL values are produced by inner side, */ + /* return no join results. Otherwise, same as LASJ */ +#endif /* * We might need additional join types someday. 
*/ @@ -1118,5 +1153,38 @@ struct TdigestData { double valuetoc; CentroidPoint nodes[0]; }; - +#ifdef USE_SPQ +#define AGGSPLITOP_COMBINE 0x01 /* substitute combinefn for transfn */ +#define AGGSPLITOP_SKIPFINAL 0x02 /* skip finalfn, return state as-is */ +#define AGGSPLITOP_SERIALIZE 0x04 /* apply serializefn to output */ +#define AGGSPLITOP_DESERIALIZE 0x08 /* apply deserializefn to input */ + +#define AGGSPLITOP_DEDUPLICATED 0x100 + +/* Supported operating modes (i.e., useful combinations of these options): */ +typedef enum AggSplit { + /* Basic, non-split aggregation: */ + AGGSTAGE_NORMAL = 0, + /* Initial phase of partial aggregation, with serialization: */ + AGGSTAGE_PARTIAL = AGGSPLITOP_SKIPFINAL | AGGSPLITOP_SERIALIZE, + /* Final phase of partial aggregation, with deserialization: */ + AGGSTAGE_FINAL = AGGSPLITOP_COMBINE | AGGSPLITOP_DESERIALIZE, + + /* + * The inputs have already been deduplicated for DISTINCT. + * This is internal to the planner, it is never set on Aggrefs, and is + * stripped away from Aggs in setrefs.c. 
+ */ + AGGSTAGE_DEDUPLICATED = AGGSPLITOP_DEDUPLICATED, + + AGGSTAGE_INTERMEDIATE = AGGSPLITOP_SKIPFINAL | AGGSPLITOP_SERIALIZE | AGGSPLITOP_COMBINE | AGGSPLITOP_DESERIALIZE, +} AggSplit; + + +/* Test whether an AggSplit value selects each primitive option: */ +#define DO_AGGSPLIT_COMBINE(as) (((as) & AGGSPLITOP_COMBINE) != 0) +#define DO_AGGSPLIT_SKIPFINAL(as) (((as) & AGGSPLITOP_SKIPFINAL) != 0) +#define DO_AGGSPLIT_SERIALIZE(as) (((as) & AGGSPLITOP_SERIALIZE) != 0) +#define DO_AGGSPLIT_DESERIALIZE(as) (((as) & AGGSPLITOP_DESERIALIZE) != 0) +#endif #endif /* NODES_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 0309b7bec..ba77f9015 100755 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -207,9 +207,15 @@ typedef enum RTEKind { #ifdef PGXC RTE_REMOTE_DUMMY, /* RTEs created by remote plan reduction */ #endif /* PGXC */ - RTE_RESULT /* RTE represents an empty FROM clause; such + RTE_RESULT, /* RTE represents an empty FROM clause; such * RTEs are added by the planner, they're not * present during parsing or rewriting */ +#ifdef USE_SPQ + RTE_NAMEDTUPLESTORE, + RTE_TABLEFUNC, /* TableFunc(.., column list) */ + RTE_VOID, /* CDB: deleted RTE */ + RTE_TABLEFUNCTION /* CDB: Functions over multiset input */ +#endif } RTEKind; typedef struct RangeTblEntry { @@ -374,6 +380,9 @@ typedef struct RangeTblEntry { * Select * from table_name subpartition (subpartition_name); * or delete from table_name partition (partition_name, ...) 
*/ +#ifdef USE_SPQ + bool forceDistRandom; +#endif } RangeTblEntry; /* @@ -2387,6 +2396,24 @@ typedef struct GetDiagStmt { List *condNum; } GetDiagStmt; +#ifdef USE_SPQ +typedef struct RangeTblFunction { + NodeTag type; + Node *funcexpr; /* expression tree for func call */ + int funccolcount; /* number of columns it contributes to RTE */ + /* These fields record the contents of a column definition list, if any: */ + List *funccolnames; /* column names (list of String) */ + List *funccoltypes; /* OID list of column type OIDS */ + List *funccoltypmods; /* integer list of column typmods */ + List *funccolcollations; /* OID list of column collation OIDS */ + + bytea *funcuserdata; /* describe function user data. assume bytea */ + + /* This is set during planning for use by the executor: */ + Bitmapset *funcparams; /* PARAM_EXEC Param IDs affecting this func */ +} RangeTblFunction; +#endif + extern inline NodeTag transform_node_tag(Node* raw_parse_tree) { if (!raw_parse_tree) { diff --git a/src/include/nodes/parsenodes_common.h b/src/include/nodes/parsenodes_common.h index 0d90e9f78..d860b8ba8 100644 --- a/src/include/nodes/parsenodes_common.h +++ b/src/include/nodes/parsenodes_common.h @@ -1957,7 +1957,13 @@ typedef struct RightRefState { /* **************************************************************************** * Query Tree * *************************************************************************** */ - +#ifdef USE_SPQ +typedef uint8 ParentStmtType; +#define PARENTSTMTTYPE_NONE 0 +#define PARENTSTMTTYPE_CTAS 1 +#define PARENTSTMTTYPE_COPY 2 +#define PARENTSTMTTYPE_REFRESH_MATVIEW 3 +#endif /* * Query - * Parse analysis turns all statements into a Query tree @@ -2090,6 +2096,12 @@ typedef struct Query { RightRefState* rightRefState; List* withCheckOptions; /* a list of WithCheckOption's */ List* indexhintList; /* a list of b mode index hint members */ + +#ifdef USE_SPQ + void* intoPolicy; + ParentStmtType parentStmtType; + bool is_support_spq; +#endif } Query; 
/* ---------------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index abe766205..1fc9d4c3c 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -194,6 +194,11 @@ typedef struct PlannedStmt { uint64 uniqueSQLId; uint32 cause_type; /* Possible Slow SQL Risks in the Plan. */ +#ifdef USE_SPQ + uint64 spq_session_id; + int current_id; + bool is_spq_optmized; +#endif } PlannedStmt; typedef struct NodeGroupInfoContext { @@ -635,6 +640,18 @@ typedef struct Scan { */ typedef Scan SeqScan; +#ifdef USE_SPQ +/* ---------------- + * Spq scan node + * ---------------- + */ +typedef struct SpqSeqScan { + SeqScan scan; + bool isFullTableScan; + bool isAdaptiveScan; + bool isDirectRead; +} SpqSeqScan; +#endif /* * ========== * Column Store Scan nodes @@ -1059,6 +1076,10 @@ typedef struct Join { List* nulleqqual; uint32 skewoptimize; +#ifdef USE_SPQ + bool prefetch_inner; /* to avoid deadlock in spq */ + bool is_set_op_join; +#endif } Join; /* ---------------- @@ -1107,6 +1128,9 @@ typedef struct MergeJoin { Oid* mergeCollations; /* per-clause OIDs of collations */ int* mergeStrategies; /* per-clause ordering (ASC or DESC) */ bool* mergeNullsFirst; /* per-clause nulls ordering */ +#ifdef USE_SPQ + bool unique_outer; /*CDB-OLAP true => outer is unique in merge key */ +#endif } MergeJoin; typedef struct VecMergeJoin : public MergeJoin { @@ -1125,6 +1149,9 @@ typedef struct HashJoin { OpMemInfo mem_info; /* Memory info for inner hash table */ double joinRows; List* hash_collations; +#ifdef USE_SPQ + List *hashqualclauses; +#endif } HashJoin; /* ---------------- @@ -1135,6 +1162,10 @@ typedef struct Material { Plan plan; bool materialize_all; /* if all data should be materialized at the first time */ OpMemInfo mem_info; /* Memory info for material */ +#ifdef USE_SPQ + bool spq_strict; + bool spq_shield_child_from_rescans; +#endif } Material; typedef struct VecMaterial : public Material { @@ -1233,6 +1264,9 @@ typedef enum 
SAggMethod { typedef struct Agg { Plan plan; AggStrategy aggstrategy; +#ifdef USE_SPQ + AggSplit aggsplittype; /* agg-splitting mode, see nodes.h */ +#endif int numCols; /* number of grouping columns */ AttrNumber* grpColIdx; /* their indexes in the target list */ Oid* grpOperators; /* equality operators to compare with */ @@ -1535,5 +1569,201 @@ typedef struct TrainModel { MemoryContext cxt; // to store models } TrainModel; +#ifdef USE_SPQ +/* ---------------- + * Result node - + * If no outer plan, evaluate a variable-free targetlist. + * If outer plan, return tuples from outer plan (after a level of + * projection as shown by targetlist). + * + * If resconstantqual isn't NULL, it represents a one-time qualification + * test (i.e., one that doesn't depend on any variables from the outer plan, + * so needs to be evaluated only once). + * + * If numHashFilterCols is non-zero, we compute a mpphash value based + * on the columns listed in hashFilterColIdx for each input row. If the + * target segment based on the hash doesn't match the current execution + * segment, the row is discarded. 
+ * ---------------- + */ +typedef struct Result { + Plan plan; + Node *resconstantqual; + + int numHashFilterCols; + AttrNumber *hashFilterColIdx; + Oid *hashFilterFuncs; +} Result; + +/* ------------------------- + * motion node structs + * ------------------------- + */ +typedef enum MotionType { + MOTIONTYPE_GATHER, /* Send tuples from N senders to one receiver */ + MOTIONTYPE_GATHER_SINGLE, /* Execute subplan on N nodes, but only send the tuples from one */ + MOTIONTYPE_HASH, /* Use hashing to select a worker_idx destination */ + MOTIONTYPE_BROADCAST, /* Send tuples from one sender to a fixed set of worker_idxes */ + MOTIONTYPE_EXPLICIT, /* Send tuples to the segment explicitly specified in their segid column */ + MOTIONTYPE_OUTER_QUERY /* Gather or Broadcast to outer query's slice, don't know which one yet */ +} MotionType; + +/* + * Motion Node + */ +typedef struct Motion { + Plan plan; + + MotionType motionType; + bool sendSorted; /* if true, output should be sorted */ + int motionID; /* required by AMS */ + + /* For Hash */ + List *hashExprs; /* list of hash expressions */ + Oid *hashFuncs; /* corresponding hash functions */ + int numHashSegments; /* the module number of the hash function */ + + /* For Explicit */ + AttrNumber segidColIdx; /* index of the segid column in the target list */ + + /* The following field is only used when sendSorted == true */ + int numSortCols; /* number of sort-key columns */ + AttrNumber *sortColIdx; /* their indexes in the target list */ + Oid *sortOperators; /* OIDs of operators to sort them by */ + Oid *collations; /* OIDs of collations */ + bool *nullsFirst; /* NULLS FIRST/LAST directions */ + + /* sender slice info */ + //PlanSlice *senderSliceInfo; +} Motion; + +/* + * Sequence node + * Execute a list of subplans in the order of left-to-right, and return + * the results of the last subplan. 
+ */ +typedef struct Sequence { + Plan plan; + List *subplans; +} Sequence; + +/* + * PartitionPruneInfo - Details required to allow the executor to prune + * partitions. + * + * Here we store mapping details to allow translation of a partitioned table's + * index as returned by the partition pruning code into subplan indexes for + * plan types which support arbitrary numbers of subplans, such as Append. + * We also store various details to tell the executor when it should be + * performing partition pruning. + * + * Each PartitionedRelPruneInfo describes the partitioning rules for a single + * partitioned table (a/k/a level of partitioning). Since a partitioning + * hierarchy could contain multiple levels, we represent it by a List of + * PartitionedRelPruneInfos, where the first entry represents the topmost + * partitioned table and additional entries represent non-leaf child + * partitions, ordered such that parents appear before their children. + * Then, since an Append-type node could have multiple partitioning + * hierarchies among its children, we have an unordered List of those Lists. + * + * prune_infos List of Lists containing PartitionedRelPruneInfo nodes, + * one sublist per run-time-prunable partition hierarchy + * appearing in the parent plan node's subplans. + * other_subplans Indexes of any subplans that are not accounted for + * by any of the PartitionedRelPruneInfo nodes in + * "prune_infos". These subplans must not be pruned. + */ +typedef struct PartitionPruneInfo { + NodeTag type; + List *prune_infos; + Bitmapset *other_subplans; +} PartitionPruneInfo; + +/* ---------------- + * PartitionSelector node + * + * PartitionSelector performs partition pruning based on rows seen on + * the "other" side of a join. It performs partition pruning similar to + * run-time partition pruning in an Append node, but it is performed based + * on the rows seen, instead of executor params. 
The set of surviving + * partitions is made available to the Append node, by storing it in a + * special executor param, identified by 'paramid' field. + * ---------------- + */ +typedef struct PartitionSelector { + Plan plan; + + struct PartitionPruneInfo *part_prune_info; + int32 paramid; /* result is stored here */ +} PartitionSelector; + +/* ---------------- + * shareinputscan node + * ---------------- + */ +typedef struct ShareInputScan { + Scan scan; + + bool cross_slice; + int share_id; + + /* + * Slice that produces the tuplestore for this shared scan. + * + * As a special case, in a plan that has only one slice, this may be left + * to -1. The executor node ignores this when there is only one slice. + */ + int producer_slice_id; + + /* + * Slice id that this ShareInputScan node runs in. If it's + * different from current slice ID, this ShareInputScan is "alien" + * to the current slice and doesn't need to be executed at all (in + * this slice). It is used to skip IPC in alien nodes. + * + * Like producer_slice_id, this can be left to -1 if there is only one + * slice in the plan tree. + */ + int this_slice_id; + + /* Number of consumer slices participating, not including the producer. */ + int nconsumers; + + /* Discard the scan output? True for ORCA CTE producer, false otherwise. */ + bool discard_output; + + bool is_producer; +} ShareInputScan; + +/* + * SplitUpdate Node + * + */ +typedef struct SplitUpdate { + Plan plan; + AttrNumber actionColIdx; /* index of action column into the target list */ + AttrNumber tupleoidColIdx; /* index of tuple oid column into the target list */ + AttrNumber ctidColIdx; + List *insertColIdx; /* list of columns to INSERT into the target list */ + List *deleteColIdx; /* list of columns to DELETE into the target list */ + + /* + * Fields for calculating the target segment id. + * + * If the targetlist contains a 'gp_segment_id' field, these fields are + * used to compute the target segment id, for INSERT-action rows. 
+ */ + int numHashAttrs; + AttrNumber *hashAttnos; + Oid *hashFuncs; /* corresponding hash functions */ + int numHashSegments; /* # of segs to use in hash computation */ +} SplitUpdate; + +typedef struct AssertOp { + Plan plan; + int errcode; /* SQL error code */ + List *errmessage; /* error message */ +} AssertOp; +#endif /* USE_SPQ */ #endif /* PLANNODES_H */ diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 7cfb51f0a..e4160768e 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -285,6 +285,9 @@ typedef struct Aggref { bool agghas_collectfn; /* is collection function available */ int8 aggstage; /* in which stage this aggref is in */ #endif /* PGXC */ +#ifdef USE_SPQ + AggSplit aggsplittype; /* expected agg-splitting mode of parent Agg */ +#endif List* aggdirectargs; /* direct arguments, if an ordered-set agg */ List* args; /* arguments and sort expressions */ List* aggorder; /* ORDER BY (list of SortGroupClause) */ @@ -352,6 +355,9 @@ typedef struct WindowFunc { bool winstar; /* TRUE if argument list was really '*' */ bool winagg; /* is function a simple aggregate? */ int location; /* token location, or -1 if unknown */ +#ifdef USE_SPQ + bool windistinct; /* TRUE if it's agg(DISTINCT ...) */ +#endif } WindowFunc; /* @@ -600,6 +606,9 @@ typedef enum SubLinkType { ROWCOMPARE_SUBLINK, EXPR_SUBLINK, ARRAY_SUBLINK, +#ifdef USE_SPQ + NOT_EXISTS_SUBLINK, /* spq uses NOT_EXIST_SUBLINK to implement correlated left anti semijoin. */ +#endif CTE_SUBLINK /* for SubPlans only */ } SubLinkType; @@ -678,6 +687,10 @@ typedef struct SubPlan { /* Estimated execution costs: */ Cost startup_cost; /* one-time setup cost */ Cost per_call_cost; /* cost for each subplan evaluation */ +#ifdef USE_SPQ + bool is_initplan; /* SPQ: Is the subplan implemented as an initplan? */ + bool is_multirow; /* SPQ: May the subplan return more than one row? 
*/ +#endif } SubPlan; /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index a435dfb4c..b24120376 100755 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -25,6 +25,47 @@ #include "optimizer/bucketinfo.h" +#ifdef USE_SPQ +/* + * ApplyShareInputContext is used in different stages of ShareInputScan + * processing. This is mostly used as working area during the stages, but + * some information is also carried through multiple stages. + */ +typedef struct ApplyShareInputContextPerShare { + int producer_slice_id; + Bitmapset *participant_slices; +} ApplyShareInputContextPerShare; + +struct PlanSlice; +struct Plan; + +typedef struct ApplyShareInputContext { + /* curr_rtable is used by all stages when traversing into subqueries */ + List *curr_rtable; + + /* + * Populated in dag_to_tree() (or collect_shareinput_producers() for ORCA), + * used in replace_shareinput_targetlists() + */ + Plan **shared_plans; + int shared_input_count; + + /* + * State for replace_shareinput_targetlists() + */ + int *share_refcounts; + int share_refcounts_sz; /* allocated sized of 'share_refcounts' */ + + /* + * State for apply_sharinput_xslice() walkers. 
+ */ + PlanSlice *slices; /* root->glob->slices */ + List *motStack; /* stack of motionIds leading to current node */ + ApplyShareInputContextPerShare *shared_inputs; /* one for each share */ + Bitmapset *qdShares; /* share_ids that are referenced from QD slices */ +} ApplyShareInputContext; +#endif + /* * Determines if query has to be launched * on Coordinators only (SEQUENCE DDL), @@ -219,6 +260,9 @@ typedef struct PlannerGlobal { /* There is a counter attempt to get name for sublinks */ int sublink_counter; +#ifdef USE_SPQ + ApplyShareInputContext share; /* workspace for GPDB plan sharing */ +#endif } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index a185da953..3ab6fc7a9 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -50,6 +50,11 @@ typedef struct { List* active_fns; Node* case_val; bool estimate; +#ifdef USE_SPQ + bool recurse_queries; /* recurse into query structures */ + bool recurse_sublink_testexpr; /* recurse into sublink test expressions */ + Size max_size; /* max constant binary size in bytes, 0: no restrictions */ +#endif } eval_const_expressions_context; typedef enum { UNIQUE_CONSTRAINT, NOT_NULL_CONSTRAINT } constraintType; @@ -157,5 +162,10 @@ extern List* get_quals_lists(Node *jtnode); extern bool isTableofType(Oid typeOid, Oid* base_oid, Oid* indexbyType); extern Expr* simplify_function(Oid funcid, Oid result_type, int32 result_typmod, Oid result_collid, Oid input_collid, List** args_p, bool process_args, bool allow_non_const, eval_const_expressions_context* context); - + +#ifdef USE_SPQ +extern Query *fold_constants(PlannerInfo *root, Query *q, ParamListInfo boundParams, Size max_size); +extern Query *flatten_join_alias_var_optimizer(Query *query, int queryLevel); +extern Expr *transform_array_Const_to_ArrayExpr(Const *c); +#endif #endif /* CLAUSES_H */ diff --git 
a/src/include/optimizer/pgxc_plan_remote.h b/src/include/optimizer/pgxc_plan_remote.h index f36d51d58..a327f92e6 100644 --- a/src/include/optimizer/pgxc_plan_remote.h +++ b/src/include/optimizer/pgxc_plan_remote.h @@ -144,6 +144,9 @@ typedef struct { * in some scenarios, e.g. assignment of relationOids in fix_expr_common. */ List* relationOids; /* contain OIDs of relations the plan depends on */ +#ifdef USE_SPQ + int streamID; /* required by AMS */ +#endif } RemoteQuery; extern Plan* create_remote_mergeinto_plan(PlannerInfo* root, Plan* topplan, CmdType cmdtyp, MergeAction* action); diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 9e74112a4..d36ffcfe0 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -55,4 +55,8 @@ extern bool HasStoredGeneratedColumns(const PlannerInfo *root, Index rti); extern PlannerInfo *get_cte_root(PlannerInfo *root, int levelsup, char *ctename); +#ifdef USE_SPQ +extern double spq_estimate_partitioned_numtuples(Relation rel); +#endif + #endif /* PLANCAT_H */ diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 93ca09fde..817203195 100755 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -210,4 +210,9 @@ extern List* find_all_internal_tableOids(Oid parentOid); extern bool check_agg_optimizable(Aggref* aggref, int16* strategy); extern void check_hashjoinable(RestrictInfo* restrictinfo); +#ifdef USE_SPQ +extern void spq_extract_plan_dependencies(PlannerInfo *root, Plan *plan); +extern List* spq_make_null_eq_clause(List* joinqual, List** otherqual, List* nullinfo); +#endif + #endif /* PLANMAIN_H */ diff --git a/src/include/optimizer/planmem_walker.h b/src/include/optimizer/planmem_walker.h index fbcfb07c9..9ebe89998 100644 --- a/src/include/optimizer/planmem_walker.h +++ b/src/include/optimizer/planmem_walker.h @@ -173,4 +173,29 @@ extern bool plan_tree_walker(Node* node, MethodWalker walker, void* context); 
extern bool IsBlockedJoinNode(Plan* node); +#ifdef USE_SPQ +extern void plan_tree_base_subplan_put_plan(plan_tree_base_prefix *base, SubPlan *subplan, Plan *plan); +/* + * Rewrite the Plan associated with a SubPlan node during planning. + */ +static inline void planner_subplan_put_plan(struct PlannerInfo *root, SubPlan *subplan, Plan *plan) +{ + ListCell *cell = list_nth_cell(root->glob->subplans, subplan->plan_id - 1); + cell->data.ptr_value = plan; +} + +/* + * Rewrite the Plan associated with a SubPlan node in a completed PlannedStmt. + */ +static inline void exec_subplan_put_plan(struct PlannedStmt *plannedstmt, SubPlan *subplan, Plan *plan) +{ + ListCell *cell = list_nth_cell(plannedstmt->subplans, subplan->plan_id - 1); + cell->data.ptr_value = plan; +} +extern List *extract_nodes_plan(Plan *pl, int nodeTag, bool descendIntoSubqueries); +extern List *extract_nodes_expression(Node *node, int nodeTag, bool descendIntoSubqueries); +extern int find_nodes(Node *node, List *nodeTags); +extern int check_collation(Node *node); +#endif + #endif /* PLANWALKER_H */ diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index e5c7a8f4d..667b057d8 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -56,6 +56,10 @@ extern PlannedStmt* standard_planner(Query* parse, int cursorOptions, ParamListI typedef void (*planner_hook_type) (Query* parse, int cursorOptions, ParamListInfo boundParams); typedef void (*ndp_pushdown_hook_type) (Query* querytree, PlannedStmt *stmt); extern THR_LOCAL PGDLLIMPORT ndp_pushdown_hook_type ndp_pushdown_hook; +#ifdef USE_SPQ +typedef PlannedStmt *(*spq_planner_hook_type) (Query* parse, int cursorOptions, ParamListInfo boundParams); +extern THR_LOCAL PGDLLIMPORT spq_planner_hook_type spq_planner_hook; +#endif extern Plan* subquery_planner(PlannerGlobal* glob, Query* parse, PlannerInfo* parent_root, bool hasRecursion, double tuple_fraction, PlannerInfo** subroot, int options = 
SUBQUERY_NORMAL, ItstDisKey* diskeys = NULL, diff --git a/src/include/optimizer/stream_cost.h b/src/include/optimizer/stream_cost.h index a480ee7fe..f91b4d0a6 100644 --- a/src/include/optimizer/stream_cost.h +++ b/src/include/optimizer/stream_cost.h @@ -41,6 +41,9 @@ typedef enum { REMOTE_BROADCAST, /* Broadcast data to all nodes. */ REMOTE_SPLIT_BROADCAST, /* Broadcast data to all parallel threads all nodes. */ REMOTE_HYBRID, /* Hybrid send data. */ +#ifdef USE_SPQ + REMOTE_ROUNDROBIN, +#endif LOCAL_DISTRIBUTE, /* Distribute data to all threads at local node. */ LOCAL_BROADCAST, /* Broadcast data to all threads at local node. */ LOCAL_ROUNDROBIN /* Roundrobin data to all threads at local node. */ @@ -82,6 +85,9 @@ typedef struct Stream { * used for recursive sql execution that under recursive-union operator. */ ExecNodes* origin_consumer_nodes; bool is_recursive_local; /* LOCAL GATHER for recursive */ +#ifdef USE_SPQ + int streamID; +#endif } Stream; extern void compute_stream_cost(StreamType type, char locator_type, double subrows, double subgblrows, @@ -95,4 +101,4 @@ extern List* get_max_cost_distkey_for_nulldistkey( extern void parallel_stream_info_print(ParallelDesc* smpDesc, StreamType type); extern List* make_distkey_for_append(PlannerInfo* root, Plan* subPlan); -#endif /* STREAM_COST_H */ \ No newline at end of file +#endif /* STREAM_COST_H */ diff --git a/src/include/optimizer/stream_util.h b/src/include/optimizer/stream_util.h index d01fc7b87..6ad712e29 100644 --- a/src/include/optimizer/stream_util.h +++ b/src/include/optimizer/stream_util.h @@ -83,4 +83,8 @@ extern Plan* update_plan_refs(PlannerInfo* root, Plan* plan, Index* fromRTI, Ind extern void set_node_ref_subplan_walker(Plan* result_plan, set_node_ref_subplan_context* context); extern void StreamPlanWalker(PlannedStmt *pstmt, Plan *plan, bool *need); extern void mark_distribute_setop_remotequery(PlannerInfo* root, Node* node, Plan* plan, List* subPlans); -#endif /* STREAM_UTIL_H */ \ No newline 
at end of file +#ifdef USE_SPQ +extern void SpqSerializePlan(Plan* node, PlannedStmt* planned_stmt, StringInfoData* str, + int num_stream, int num_gather, bool push_subplan, uint64 queryId); +#endif +#endif /* STREAM_UTIL_H */ diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h index e92cc7bc6..5db0e244b 100644 --- a/src/include/optimizer/tlist.h +++ b/src/include/optimizer/tlist.h @@ -66,4 +66,11 @@ extern bool split_pathtarget_at_srfs(PlannerInfo *root, PathTarget *target, Path /* Convenience macro to get a PathTarget with valid cost/width fields */ #define create_pathtarget(root, tlist) \ set_pathtarget_cost_width(root, make_pathtarget_from_tlist(tlist)) + +#ifdef USE_SPQ +extern List* tlist_members(Node* node, List* targetlist); +extern void get_sortgroupclauses_tles(List *clauses, List *targetList, List **tles, List **sortops, List **eqops); +extern Index maxSortGroupRef(List *targetlist, bool include_orderedagg); +#endif + #endif /* TLIST_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 2659e5b19..f68644d4e 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -70,4 +70,14 @@ typedef enum { #define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF #define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF +#ifdef USE_SPQ +#define IS_SPQ_RUNNING (t_thrd.spq_ctx.spq_role != ROLE_UTILITY) +#define IS_SPQ_COORDINATOR (t_thrd.spq_ctx.spq_role == ROLE_QUERY_COORDINTOR) +#define IS_SPQ_EXECUTOR (t_thrd.spq_ctx.spq_role == ROLE_QUERY_EXECUTOR) +#else +#define IS_SPQ_RUNNING (false) +#define IS_SPQ_COORDINATOR (false) +#define IS_SPQ_EXECUTOR (false) +#endif + #endif /* PGXC_H */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 32937ba97..051e2359c 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -315,7 +315,7 @@ extern Datum pgxc_execute_on_nodes(int numnodes, Oid* nodelist, char* query); extern bool pgxc_node_receive(const int conn_count, PGXCNodeHandle** connections, struct timeval* 
timeout, bool ignoreTimeoutWarning = false); extern bool datanode_receive_from_physic_conn( -const int conn_count, PGXCNodeHandle** connections, struct timeval* timeout, bool ignoreTimeoutWarning = false); +const int conn_count, PGXCNodeHandle** connections, struct timeval* timeout); extern bool datanode_receive_from_logic_conn( const int conn_count, PGXCNodeHandle** connections, StreamNetCtl* ctl, int time_out); extern bool pgxc_node_validate(PGXCNodeHandle *conn); diff --git a/src/include/pgxc/remoteCombiner.h b/src/include/pgxc/remoteCombiner.h index 436d7ba4f..c240ba32c 100644 --- a/src/include/pgxc/remoteCombiner.h +++ b/src/include/pgxc/remoteCombiner.h @@ -170,6 +170,11 @@ typedef struct RemoteQueryState { char* serializedPlan; /* the serialized plan tree */ ParallelFunctionState* parallel_function_state; bool has_stream_for_loop; /* has stream node in for loop sql which may cause hang. */ +#ifdef USE_SPQ + uint64 queryId; + PGXCNodeHandle** spq_connections_info; + pg_conn **nodeCons; +#endif } RemoteQueryState; extern RemoteQueryState* CreateResponseCombiner(int node_count, CombineType combine_type); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 0902eae8d..ada675275 100755 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -215,7 +215,12 @@ extern void set_disable_conn_mode(void); #define IsConnPortFromCoord(port) \ ((port)->cmdline_options != NULL && strstr((port)->cmdline_options, "remotetype=coordinator") != NULL) #else +#ifdef USE_SPQ +#define IsConnPortFromCoord(port) \ + ((port)->cmdline_options != NULL && strstr((port)->cmdline_options, "remotetype=coordinator") != NULL) +#else #define IsConnPortFromCoord(port) false +#endif extern bool get_addr_from_socket(int sock, struct sockaddr *saddr); extern int get_ip_port_from_addr(char* sock_ip, int* port, struct sockaddr saddr); #endif diff --git a/src/include/rewrite/rewriteManip.h b/src/include/rewrite/rewriteManip.h index 
68026101f..686138745 100644 --- a/src/include/rewrite/rewriteManip.h +++ b/src/include/rewrite/rewriteManip.h @@ -39,6 +39,10 @@ extern void ChangeVarNodes(Node* node, int old_varno, int new_varno, int subleve extern void IncrementVarSublevelsUp(Node* node, int delta_sublevels_up, int min_sublevels_up); extern void IncrementVarSublevelsUp_rtable(List* rtable, int delta_sublevels_up, int min_sublevels_up); +#ifdef USE_SPQ +extern void SpqIncrementVarSublevelsUpInTransformGroupedWindows(Node *node, int delta_sublevels_up, int min_sublevels_up); +#endif + extern bool rangeTableEntry_used(Node* node, int rt_index, int sublevels_up); extern bool attribute_used(Node* node, int rt_index, int attno, int sublevels_up); diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h index f389d001a..afc8440ce 100644 --- a/src/include/tcop/dest.h +++ b/src/include/tcop/dest.h @@ -102,6 +102,11 @@ typedef enum { DestTupleHybrid, +#ifdef USE_SPQ + DestTupleRoundRobin, + DestBatchRoundRobin, +#endif + DestBatchBroadCast, /* results send to consumer thread in a broadcast way */ DestBatchLocalBroadCast, /* results send to consumer thread in a local broadcast way */ DestBatchRedistribute, /* results send to consumer thread in a redistribute way */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 019d114c7..8015eab2f 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -19,6 +19,9 @@ #include "catalog/pg_workload_group.h" #include "catalog/pg_app_workloadgroup_mapping.h" #include "catalog/pgxc_node.h" +#ifdef USE_SPQ +#include "parser/parse_coerce.h" +#endif /* Result list element for get_op_btree_interpretation */ typedef struct OpBtreeInterpretation { @@ -220,6 +223,75 @@ extern bool get_func_iswindow(Oid funcid); extern char get_func_prokind(Oid funcid); extern char get_typecategory(Oid typid); +#ifdef USE_SPQ +/* comparison types */ +typedef enum CmpType { + CmptEq, // equality + CmptNEq, // inequality + CmptLT, // less 
than + CmptLEq, // less or equal to + CmptGT, // greater than + CmptGEq, // greater or equal to + CmptOther // other operator +} CmpType; + +#define ATTSTATSSLOT_VALUES 0x01 +#define ATTSTATSSLOT_NUMBERS 0x02 +/* Result struct for get_attstatsslot */ +typedef struct AttStatsSlot { + /* Always filled: */ + Oid staop; /* Actual staop for the found slot */ + Oid stacoll; /* Actual collation for the found slot */ + /* Filled if ATTSTATSSLOT_VALUES is specified: */ + Oid valuetype; /* Actual datatype of the values */ + Datum *values; /* slot's "values" array, or NULL if none */ + int nvalues; /* length of values[], or 0 */ + /* Filled if ATTSTATSSLOT_NUMBERS is specified: */ + float4 *numbers; /* slot's "numbers" array, or NULL if none */ + int nnumbers; /* length of numbers[], or 0 */ + + /* Remaining fields are private to get_attstatsslot/free_attstatsslot */ + void *values_arr; /* palloc'd values array, if any */ + void *numbers_arr; /* palloc'd numbers array, if any */ +} AttStatsSlot; +extern Oid get_compatible_hash_opfamily(Oid opno); +extern Oid get_compatible_legacy_hash_opfamily(Oid opno); +extern void MemoryContextDeclareAccountingRoot(MemoryContext context); +extern Oid get_agg_transtype(Oid aggid); +extern bool is_agg_ordered(Oid aggid); +extern bool is_agg_partial_capable(Oid aggid); +extern List *get_func_output_arg_types(Oid funcid); +extern List *get_func_arg_types(Oid funcid); +extern char func_data_access(Oid funcid); +extern char func_exec_location(Oid funcid); +extern bool index_exists(Oid oid); +extern bool aggregate_exists(Oid oid); +extern Oid get_aggregate(const char *aggname, Oid oidType); +extern bool function_exists(Oid oid); +extern bool get_cast_func(Oid oidSrc, Oid oidDest, bool *is_binary_coercible, Oid *oidCastFunc, CoercionPathType *pathtype); +extern bool check_constraint_exists(Oid oidCheckconstraint); +extern char *get_check_constraint_name(Oid oidCheckconstraint); +extern Oid get_check_constraint_relid(Oid oidCheckconstraint); 
+extern List *get_check_constraint_oids(Oid oidRel); +extern Node *get_check_constraint_expr_tree(Oid oidCheckconstraint); +extern bool operator_exists(Oid oid); +extern bool relation_exists(Oid oid); +extern bool type_exists(Oid oid); +extern CmpType get_comparison_type(Oid oidOp); +extern Oid get_comparison_operator(Oid oidLeft, Oid oidRight, CmpType cmpt); +extern bool has_subclass_slow(Oid relationId); +extern List *get_operator_opfamilies(Oid opno); +extern List *get_index_opfamilies(Oid oidIndex); +extern bool relation_is_partitioned(Oid oid); +extern bool index_is_partitioned(Oid oid); +extern bool has_update_triggers(Oid relid); +extern bool spq_get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags); +extern void spq_free_attstatsslot(AttStatsSlot *sslot); +extern char * get_type_name(Oid typid); +extern int32 get_trigger_type(Oid triggerid); +extern HeapTuple get_att_stats(Oid relid, AttrNumber attrnum); +#endif + #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ #define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid) diff --git a/src/include/utils/numeric.h b/src/include/utils/numeric.h index a1d3d2a1e..3d495bc04 100644 --- a/src/include/utils/numeric.h +++ b/src/include/utils/numeric.h @@ -352,6 +352,7 @@ extern bool numericvar_to_int64(const NumericVar* var, int64* result, bool can_i extern void int64_to_numericvar(int64 val, NumericVar *var); extern void add_var(NumericVar *var1, NumericVar *var2, NumericVar *result); extern char *numeric_normalize(Numeric num); +extern double numeric_to_double_no_overflow(Numeric num); bool numeric_agg_trans_initvalisnull(Oid transfn_oid, bool initvalisnull); void numeric_transfn_info_change(Oid aggfn_oid, Oid *transfn_oid, Oid *transtype); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a571c9698..6422d613c 100644 --- 
a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -411,6 +411,10 @@ typedef struct StdRdOptions { int check_option_offset; /* for views */ int view_security_option_offset; /* for views */ Oid collate; /* table's default collation in b format. */ +#ifdef USE_SPQ + /* SPQ OPTIONS */ + int spq_bt_build_offset; +#endif } StdRdOptions; #define HEAP_MIN_FILLFACTOR 10 diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index bad8ffc50..7c2d0cf87 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -288,4 +288,5 @@ extern void set_varratio_after_calc_selectivity( extern double get_windowagg_selectivity(PlannerInfo* root, WindowClause* wc, WindowFunc* wfunc, List* partitionExprs, int32 constval, double tuples, unsigned int num_datanodes); extern bool contain_single_col_stat(List* stat_list); +extern double convert_timevalue_to_scalar(Datum value, Oid typid); #endif /* SELFUNCS_H */ diff --git a/src/test/regress/pg_regress.cpp b/src/test/regress/pg_regress.cpp index 7ede9947b..3094ffb53 100644 --- a/src/test/regress/pg_regress.cpp +++ b/src/test/regress/pg_regress.cpp @@ -1445,12 +1445,14 @@ static void start_ss_node(int i) (void)snprintf(buf, sizeof(buf), - SYSTEMQUOTE "\"%s/gaussdb\" -p %d -D \"%s/%s\" -c log_statement=all -c logging_collector=true -c " + SYSTEMQUOTE "\"%s/gaussdb\" -p %d -D \"%s/%s\" -c comm_sctp_port=%d -c comm_control_port=%d -c log_statement=all -c logging_collector=true -c " "\"listen_addresses=%s\" & > \"%s/log/postmaster_%s.log\" 2>&1" SYSTEMQUOTE, bindir, port_number, temp_install, data_folder, + port_number + 10, + port_number + 20, hostname ? 
hostname : "*", outputdir, data_folder); @@ -5466,7 +5468,7 @@ static void CheckCleanCodeWarningInfo(const int baseNum, const int currentNum, return; } -#define BASE_GLOBAL_VARIABLE_NUM 224 +#define BASE_GLOBAL_VARIABLE_NUM 229 #define CMAKE_CMD_BUF_LEN 1000 @@ -5515,7 +5517,7 @@ static void check_global_variables() } } -#define BASE_PGXC_LIKE_MACRO_NUM 1393 +#define BASE_PGXC_LIKE_MACRO_NUM 1396 static void check_pgxc_like_macros() { #ifdef BUILD_BY_CMAKE diff --git a/src/test/ss/build_ss_database_common.sh b/src/test/ss/build_ss_database_common.sh index cdfa3a546..7f0ea52fd 100644 --- a/src/test/ss/build_ss_database_common.sh +++ b/src/test/ss/build_ss_database_common.sh @@ -108,6 +108,8 @@ set_gaussdb_port() pg_port=$2 echo "" >> ${data_node}/postgresql.conf echo "port = ${pg_port}" >> ${data_node}/postgresql.conf + echo "comm_sctp_port = $((pg_port + 10))" >> ${data_node}/postgresql.conf + echo "comm_control_port = $((pg_port + 20))" >> ${data_node}/postgresql.conf } start_gaussdb() @@ -122,4 +124,4 @@ stop_gaussdb() data_node=$1 echo "> stop ${data_node}" && ${GAUSSHOME}/bin/gs_ctl stop -D ${data_node} sleep 5 -} \ No newline at end of file +}