diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 502fbe6e0..db5f62a85 100644 --- a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -462,6 +462,7 @@ static BitmapOr* _copyBitmapOr(const BitmapOr* from) /* * copy remainder of node */ + COPY_SCALAR_FIELD(isshared); COPY_NODE_FIELD(bitmapplans); return newnode; @@ -703,6 +704,7 @@ static BitmapIndexScan* _copyBitmapIndexScan(const BitmapIndexScan* from) * copy remainder of node */ COPY_SCALAR_FIELD(indexid); + COPY_SCALAR_FIELD(isshared); COPY_NODE_FIELD(indexqual); COPY_NODE_FIELD(indexqualorig); diff --git a/src/common/backend/nodes/outfuncs.cpp b/src/common/backend/nodes/outfuncs.cpp index 4d354008e..33dabc1ae 100755 --- a/src/common/backend/nodes/outfuncs.cpp +++ b/src/common/backend/nodes/outfuncs.cpp @@ -873,7 +873,7 @@ static void _outBitmapOr(StringInfo str, BitmapOr* node) WRITE_NODE_TYPE("BITMAPOR"); _outPlanInfo(str, (Plan*)node); - + WRITE_BOOL_FIELD(isshared); WRITE_NODE_FIELD(bitmapplans); } static void _outCStoreIndexOr(StringInfo str, CStoreIndexOr* node) @@ -1176,6 +1176,7 @@ static void _outBitmapIndexScan(StringInfo str, BitmapIndexScan* node) _outScanInfo(str, (Scan*)node); WRITE_OID_FIELD(indexid); + WRITE_BOOL_FIELD(isshared); WRITE_NODE_FIELD(indexqual); WRITE_NODE_FIELD(indexqualorig); #ifdef STREAMPLAN diff --git a/src/common/backend/nodes/readfuncs.cpp b/src/common/backend/nodes/readfuncs.cpp index 54da11848..0e7df4ed1 100644 --- a/src/common/backend/nodes/readfuncs.cpp +++ b/src/common/backend/nodes/readfuncs.cpp @@ -2838,6 +2838,7 @@ static BitmapOr* _readBitmapOr(BitmapOr* local_node) READ_TEMP_LOCALS(); _readPlan(&local_node->plan); + READ_BOOL_FIELD(isshared); READ_NODE_FIELD(bitmapplans); READ_DONE(); @@ -3059,6 +3060,7 @@ static BitmapIndexScan* _readBitmapIndexScan(BitmapIndexScan* local_node) _readScan(&local_node->scan); READ_OID_FIELD(indexid); + READ_BOOL_FIELD(isshared); 
READ_NODE_FIELD(indexqual); READ_NODE_FIELD(indexqualorig); #ifdef STREAMPLAN diff --git a/src/common/backend/nodes/tidbitmap.cpp b/src/common/backend/nodes/tidbitmap.cpp index 811bd9644..7a8c453b2 100755 --- a/src/common/backend/nodes/tidbitmap.cpp +++ b/src/common/backend/nodes/tidbitmap.cpp @@ -45,6 +45,7 @@ #include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" #include "utils/hsearch.h" +#include "storage/lwlock.h" /* * The maximum number of tuples per page is not large (typically 256 with @@ -117,6 +118,7 @@ typedef struct PagetableEntry { bool recheck; /* should the tuples be rechecked? */ bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; } PagetableEntry; + /* * dynahash.c is optimized for relatively large, long-lived hash tables. * This is not ideal for TIDBitMap, particularly when we are using a bitmap @@ -134,6 +136,15 @@ typedef enum { TBM_HASH /* pagetable is valid, entry1 is not */ } TBMStatus; +/* + * Current iterating state of the TBM. + */ +typedef enum { + TBM_NOT_ITERATING, /* not yet converted to page and chunk array */ + TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */ + TBM_ITERATING_SHARED /* converted to shared page and chunk array */ +} TBMIteratingState; + /* * Here is the representation for a whole TIDBitMap: */ @@ -146,8 +157,9 @@ struct TIDBitmap { int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ int nchunks; /* number of lossy entries in pagetable */ - bool iterating; /* tbm_begin_iterate called? */ + TBMIteratingState iterating; /* tbm_begin_iterate called? */ bool isGlobalPart; /* represent global partition index tbm */ + bool isShared; /* is shared pagetable? 
*/ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* these are valid when iterating is true: */ PagetableEntry** spages; /* sorted exact-page list, or NULL */ @@ -168,8 +180,40 @@ struct TBMIterator { TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; +/* + * Holds the shared members of the iterator so that multiple processes + * can jointly iterate. + */ +struct TBMSharedIteratorState { + int nentries; /* number of entries in pagetable */ + int maxentries; /* limit on same to meet maxbytes */ + int npages; /* number of exact entries in pagetable */ + int nchunks; /* number of lossy entries in pagetable */ + TBMStatus status; /* see codes above */ + PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ + HTAB* pagetable; /* hash table of PagetableEntry's */ + PagetableEntry** spages; /* sorted exact-page list, or NULL */ + PagetableEntry** schunks; /* sorted lossy-chunk list, or NULL */ + pg_atomic_uint32 pagetableRefcount; /* ref count for pagetable */ + pg_atomic_uint32 pagesRefcount; /* ref count for spages */ + pg_atomic_uint32 chunksRefcount; /* ref count for schunks */ + LWLock lock; /* lock to protect below members */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ +}; + +/* + * same as TBMIterator, but it is used for joint iteration, therefore this + * also holds a reference to the shared state. 
+ */ +struct TBMSharedIterator { + TBMSharedIteratorState *state; /* shared state */ + TBMIterateResult output; /* MUST BE LAST (because variable-size) */ +}; + /* Local function prototypes */ -static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage); +static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); static bool tbm_intersect_page(TIDBitmap* a, PagetableEntry* apage, const TIDBitmap* b); static const PagetableEntry* tbm_find_pageentry(const TIDBitmap* tbm, PagetableEntryNode pageNode); static PagetableEntry* tbm_get_pageentry(TIDBitmap* tbm, PagetableEntryNode pageNode); @@ -177,15 +221,18 @@ static bool tbm_page_is_lossy(const TIDBitmap* tbm, PagetableEntryNode pageNode) static void tbm_mark_page_lossy(TIDBitmap* tbm, PagetableEntryNode pageNode); static void tbm_lossify(TIDBitmap* tbm); static int tbm_comparator(const void* left, const void* right); +static void tbm_sort_pages(TIDBitmap* tbm); /* * tbm_create - create an initially-empty bitmap * * The bitmap will live in the memory context that is CurrentMemoryContext * at the time of this call. It will be limited to (approximately) maxbytes - * total memory consumption. + * total memory consumption. If the DSA passed to this function is not NULL + * then the memory for storing elements of the underlying page table will + * be allocated from the DSA. 
*/ -TIDBitmap* tbm_create(long maxbytes) +TIDBitmap* tbm_create(long maxbytes, MemoryContext dsa) { TIDBitmap* tbm = NULL; long nbuckets; @@ -193,7 +240,13 @@ TIDBitmap* tbm_create(long maxbytes) /* Create the TIDBitmap struct and zero all its fields */ tbm = makeNode(TIDBitmap); - tbm->mcxt = CurrentMemoryContext; + if (dsa == NULL) { + tbm->mcxt = CurrentMemoryContext; + tbm->isShared = false; + } else { + tbm->mcxt = dsa; + tbm->isShared = true; + } tbm->status = TBM_EMPTY; tbm->isGlobalPart = false; /* @@ -255,18 +308,39 @@ static void tbm_create_pagetable(TIDBitmap* tbm) */ void tbm_free(TIDBitmap* tbm) { - if (tbm->pagetable != NULL) { + /* + * Don't call hash_destroy when it's shared, cause the memcxt for hash table is already deleted + * when calling dsm_detach. Same for spages and schunks, cause they are alloc by the same memcxt. + */ + if (!tbm->isShared) { hash_destroy(tbm->pagetable); - } - if (tbm->spages != NULL) { pfree_ext(tbm->spages); - } - if (tbm->schunks != NULL) { pfree_ext(tbm->schunks); } pfree_ext(tbm); } +/* + * tbm_free_shared_area - free shared state + * + * Free shared iterator state, Also free shared pagetable and iterator arrays + * memory if they are not referred by any of the shared iterator i.e recount + * is becomes 0. 
+ */ +void tbm_free_shared_area(TBMSharedIteratorState *istate) +{ + if (pg_atomic_sub_fetch_u32(&istate->pagetableRefcount, 1) == 0) { + hash_destroy(istate->pagetable); + } + if (pg_atomic_sub_fetch_u32(&istate->pagesRefcount, 1) == 0) { + pfree_ext(istate->spages); + } + if (pg_atomic_sub_fetch_u32(&istate->chunksRefcount, 1) == 0) { + pfree_ext(istate->schunks); + } + pfree_ext(istate); +} + /* * tbm_add_tuples - add some tuple IDs to a TIDBitmap * @@ -277,7 +351,7 @@ void tbm_add_tuples(TIDBitmap* tbm, const ItemPointer tids, int ntids, bool rech { int i; - Assert(!tbm->iterating); + Assert(tbm->iterating == TBM_NOT_ITERATING); for (i = 0; i < ntids; i++) { BlockNumber blk = ItemPointerGetBlockNumber(tids + i); OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); @@ -340,7 +414,7 @@ void tbm_add_page(TIDBitmap* tbm, BlockNumber pageno, Oid partitionOid) */ void tbm_union(TIDBitmap* a, const TIDBitmap* b) { - Assert(!a->iterating); + Assert(a->iterating == TBM_NOT_ITERATING); /* Nothing to do if b is empty */ if (b->nentries == 0) { return; @@ -414,7 +488,7 @@ static void tbm_union_page(TIDBitmap* a, const PagetableEntry* bpage) */ void tbm_intersect(TIDBitmap* a, const TIDBitmap* b) { - Assert(!a->iterating); + Assert(a->iterating == TBM_NOT_ITERATING); /* Nothing to do if a is empty */ if (a->nentries == 0) { return; @@ -533,6 +607,39 @@ bool tbm_is_empty(const TIDBitmap* tbm) return (tbm->nentries == 0); } +static void tbm_sort_pages(TIDBitmap* tbm) +{ + HASH_SEQ_STATUS status; + PagetableEntry* page = NULL; + int npages; + int nchunks; + + if (tbm->spages == NULL && tbm->npages > 0) { + tbm->spages = (PagetableEntry**)MemoryContextAlloc(tbm->mcxt, tbm->npages * sizeof(PagetableEntry*)); + } + if ((tbm->schunks == NULL) && tbm->nchunks > 0) { + tbm->schunks = (PagetableEntry**)MemoryContextAlloc(tbm->mcxt, tbm->nchunks * sizeof(PagetableEntry*)); + } + + hash_seq_init(&status, tbm->pagetable); + npages = nchunks = 0; + while ((page = 
(PagetableEntry*)hash_seq_search(&status)) != NULL) { + if (page->ischunk) { + tbm->schunks[nchunks++] = page; + } else { + tbm->spages[npages++] = page; + } + } + Assert(npages == tbm->npages); + Assert(nchunks == tbm->nchunks); + if (npages > 1) { + qsort(tbm->spages, npages, sizeof(PagetableEntry*), tbm_comparator); + } + if (nchunks > 1) { + qsort(tbm->schunks, nchunks, sizeof(PagetableEntry*), tbm_comparator); + } +} + /* * tbm_begin_iterate - prepare to iterate through a TIDBitmap * @@ -548,6 +655,7 @@ bool tbm_is_empty(const TIDBitmap* tbm) */ TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) { + Assert(tbm->iterating != TBM_ITERATING_SHARED); TBMIterator* iterator = NULL; /* @@ -570,43 +678,134 @@ TBMIterator* tbm_begin_iterate(TIDBitmap* tbm) * attached to the bitmap not the iterator, so they can be used by more * than one iterator. */ - if (tbm->status == TBM_HASH && !tbm->iterating) { - HASH_SEQ_STATUS status; - PagetableEntry* page = NULL; - int npages; - int nchunks; - - if (tbm->spages == NULL && tbm->npages > 0) { - tbm->spages = (PagetableEntry**)MemoryContextAlloc(tbm->mcxt, tbm->npages * sizeof(PagetableEntry*)); - } - if ((tbm->schunks == NULL) && tbm->nchunks > 0) { - tbm->schunks = (PagetableEntry**)MemoryContextAlloc(tbm->mcxt, tbm->nchunks * sizeof(PagetableEntry*)); - } - - hash_seq_init(&status, tbm->pagetable); - npages = nchunks = 0; - while ((page = (PagetableEntry*)hash_seq_search(&status)) != NULL) { - if (page->ischunk) { - tbm->schunks[nchunks++] = page; - } else { - tbm->spages[npages++] = page; - } - } - Assert(npages == tbm->npages); - Assert(nchunks == tbm->nchunks); - if (npages > 1) { - qsort(tbm->spages, npages, sizeof(PagetableEntry*), tbm_comparator); - } - if (nchunks > 1) { - qsort(tbm->schunks, nchunks, sizeof(PagetableEntry*), tbm_comparator); - } + if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING) { + tbm_sort_pages(tbm); } - tbm->iterating = true; + tbm->iterating = TBM_ITERATING_PRIVATE; return 
iterator; } +/* + * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap. + * + * The necessary shared state will be allocated from the DSA passed to + * tbm_create, so that multiple processes can attach to it and iterate jointly. + * + * This will convert the pagetable hash into page and chunk array of the index + * into pagetable array. + */ +TBMSharedIteratorState* tbm_prepare_shared_iterate(TIDBitmap *tbm) +{ + Assert(tbm->iterating != TBM_ITERATING_PRIVATE); + + /* + * Allocate TBMSharedIteratorState from DSA to hold the shared members and + * lock, this will also be used by multiple worker for shared iterate. + */ + TBMSharedIteratorState *istate = (TBMSharedIteratorState*)MemoryContextAllocZero(tbm->mcxt, + sizeof(TBMSharedIteratorState)); + + /* + * If we have a hashtable, create and fill the sorted page lists, unless + * we already did that for a previous iterator. Note that the lists are + * attached to the bitmap not the iterator, so they can be used by more + * than one iterator. + */ + if (tbm->iterating == TBM_NOT_ITERATING) { + if (tbm->status == TBM_HASH) { + tbm_sort_pages(tbm); + } + pg_atomic_init_u32(&istate->pagetableRefcount, 0); + pg_atomic_init_u32(&istate->pagesRefcount, 0); + pg_atomic_init_u32(&istate->chunksRefcount, 0); + } + + /* + * Store the TBM members in the shared state so that we can share them + * across multiple processes. 
+ */ + istate->nentries = tbm->nentries; + istate->maxentries = tbm->maxentries; + istate->npages = tbm->npages; + istate->nchunks = tbm->nchunks; + istate->pagetable = tbm->pagetable; + istate->spages = tbm->spages; + istate->schunks = tbm->schunks; + istate->status = tbm->status; + int rc = memcpy_s(&istate->entry1, sizeof(PagetableEntry), &tbm->entry1, sizeof(PagetableEntry)); + securec_check(rc, "", ""); + /* + * For every shared iterator, referring to pagetable and iterator array, + * increase the refcount by 1 so that while freeing the shared iterator we + * don't free pagetable and iterator array until its refcount becomes 0. + */ + (void)pg_atomic_add_fetch_u32(&istate->pagetableRefcount, 1); + (void)pg_atomic_add_fetch_u32(&istate->pagesRefcount, 1); + (void)pg_atomic_add_fetch_u32(&istate->chunksRefcount, 1); + /* Initialize the iterator lock */ + LWLockInitialize(&istate->lock, LWTRANCHE_TBM); + + /* Initialize the shared iterator state */ + istate->schunkbit = 0; + istate->schunkptr = 0; + istate->spageptr = 0; + + tbm->iterating = TBM_ITERATING_SHARED; + + return istate; +} + +/* + * tbm_extract_page_tuple - extract the tuple offsets from a page + * + * The extracted offsets are stored into TBMIterateResult. 
+ */ +static inline int tbm_extract_page_tuple(const PagetableEntry *page, TBMIterateResult *output) +{ + int ntuples = 0; + + for (int wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { + bitmapword w = page->words[wordnum]; + + if (w != 0) { + int off = wordnum * BITS_PER_BITMAPWORD + 1; + + while (w != 0) { + if (w & 1) { + output->offsets[ntuples++] = (OffsetNumber)off; + } + off++; + w >>= 1; + } + } + } + + return ntuples; +} + +/* + * tbm_advance_schunkbit - Advance the schunkbit + */ +static inline void tbm_advance_schunkbit(const PagetableEntry *chunk, int *schunkbitp) +{ + int schunkbit = *schunkbitp; + + while (schunkbit < PAGES_PER_CHUNK) { + int wordnum = WORDNUM(schunkbit); + int bitnum = BITNUM(schunkbit); + + if ((chunk->words[wordnum] & ((bitmapword)1 << (unsigned int)bitnum)) != 0) { + break; + } + schunkbit++; + } + + *schunkbitp = schunkbit; +} + + /* * tbm_iterate - scan through next page of a TIDBitmap * @@ -624,7 +823,7 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) TIDBitmap* tbm = iterator->tbm; TBMIterateResult* output = &(iterator->output); - Assert(tbm->iterating); + Assert(tbm->iterating == TBM_ITERATING_PRIVATE); /* * If lossy chunk pages remain, make sure we've advanced schunkptr/ @@ -634,15 +833,7 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) PagetableEntry* chunk = tbm->schunks[iterator->schunkptr]; int schunkbit = iterator->schunkbit; - while (schunkbit < PAGES_PER_CHUNK) { - int wordnum = WORDNUM(schunkbit); - int bitnum = BITNUM(schunkbit); - - if ((chunk->words[wordnum] & ((bitmapword)1 << (unsigned int)bitnum)) != 0) { - break; - } - schunkbit++; - } + tbm_advance_schunkbit(chunk, &schunkbit); if (schunkbit < PAGES_PER_CHUNK) { iterator->schunkbit = schunkbit; break; @@ -676,7 +867,6 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) if (iterator->spageptr < tbm->npages) { PagetableEntry* page = NULL; int ntuples; - int wordnum; /* In ONE_PAGE state, we don't allocate an spages[] array */ if 
(tbm->status == TBM_ONE_PAGE) { @@ -686,22 +876,7 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) } /* scan bitmap to extract individual offset numbers */ - ntuples = 0; - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { - bitmapword w = page->words[wordnum]; - - if (w != 0) { - int off = wordnum * BITS_PER_BITMAPWORD + 1; - - while (w != 0) { - if (w & 1) { - output->offsets[ntuples++] = (OffsetNumber)off; - } - off++; - w >>= 1; - } - } - } + ntuples = tbm_extract_page_tuple(page, output); output->blockno = page->entryNode.blockNo; output->partitionOid = page->entryNode.partitionOid; output->ntuples = ntuples; @@ -714,6 +889,91 @@ TBMIterateResult* tbm_iterate(TBMIterator* iterator) return NULL; } +/* + * tbm_shared_iterate - scan through next page of a TIDBitmap + * + * As above, but this will iterate using an iterator which is shared + * across multiple processes. We need to acquire the iterator LWLock, + * before accessing the shared members. + */ +TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator) +{ + TBMIterateResult *output = &iterator->output; + TBMSharedIteratorState *istate = iterator->state; + + /* Acquire the LWLock before accessing the shared members */ + (void)LWLockAcquire(&istate->lock, LW_EXCLUSIVE); + + /* + * If lossy chunk pages remain, make sure we've advanced schunkptr/ + * schunkbit to the next set bit. + */ + while (istate->schunkptr < istate->nchunks) { + PagetableEntry *chunk = istate->schunks[istate->schunkptr]; + int schunkbit = istate->schunkbit; + + tbm_advance_schunkbit(chunk, &schunkbit); + if (schunkbit < PAGES_PER_CHUNK) { + istate->schunkbit = schunkbit; + break; + } + /* advance to next chunk */ + istate->schunkptr++; + istate->schunkbit = 0; + } + + /* + * If both chunk and per-page data remain, must output the numerically + * earlier page. 
+ */ + if (istate->schunkptr < istate->nchunks) { + PagetableEntry *chunk = istate->schunks[istate->schunkptr]; + PagetableEntryNode pnode; + pnode.blockNo = chunk->entryNode.blockNo + istate->schunkbit; + pnode.partitionOid = chunk->entryNode.partitionOid; + + if (istate->spageptr >= istate->npages || + IS_CHUNK_BEFORE_PAGE(pnode, istate->spages[istate->spageptr]->entryNode)) { + /* Return a lossy page indicator from the chunk */ + output->blockno = pnode.blockNo; + output->partitionOid = pnode.partitionOid; + output->ntuples = -1; + output->recheck = true; + istate->schunkbit++; + + LWLockRelease(&istate->lock); + return output; + } + } + + if (istate->spageptr < istate->npages) { + PagetableEntry *page = NULL; + + /* In ONE_PAGE state, we don't allocate an spages[] array */ + if (istate->status == TBM_ONE_PAGE) { + page = &istate->entry1; + } else { + page = istate->spages[istate->spageptr]; + } + + /* scan bitmap to extract individual offset numbers */ + int ntuples = tbm_extract_page_tuple(page, output); + output->blockno = page->entryNode.blockNo; + output->partitionOid = page->entryNode.partitionOid; + output->ntuples = ntuples; + output->recheck = page->recheck; + istate->spageptr++; + + LWLockRelease(&istate->lock); + return output; + } + + LWLockRelease(&istate->lock); + + /* Nothing more in the bitmap */ + return NULL; +} + /* * tbm_end_iterate - finish an iteration over a TIDBitmap * @@ -726,6 +986,17 @@ void tbm_end_iterate(TBMIterator* iterator) pfree_ext(iterator); } +/* + * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap + * + * This doesn't free any of the shared state associated with the iterator, + * just our backend-private state. 
+ */ +void tbm_end_shared_iterate(TBMSharedIterator *iterator) +{ + pfree_ext(iterator); +} + /* * tbm_find_pageentry - find a PagetableEntry for the pageno * @@ -919,7 +1190,7 @@ static void tbm_lossify(TIDBitmap* tbm) * push nentries down to significantly less than maxentries, or else we'll * just end up doing this again very soon. We shoot for maxentries/2. */ - Assert(!tbm->iterating); + Assert(tbm->iterating == TBM_NOT_ITERATING); Assert(tbm->status == TBM_HASH); hash_seq_init(&status, tbm->pagetable); @@ -986,7 +1257,29 @@ static int tbm_comparator(const void* left, const void* right) return 0; } -bool tbm_is_global(const TIDBitmap* tbm) +/* + * tbm_attach_shared_iterate + * + * Allocate a backend-private iterator and attach the shared iterator state + * to it so that multiple processed can iterate jointly. + * + * We also converts the DSA pointers to local pointers and store them into + * our private iterator. + */ +TBMSharedIterator *tbm_attach_shared_iterate(TBMSharedIteratorState *istate) +{ + /* + * Create the TBMSharedIterator struct, with enough trailing space to + * serve the needs of the TBMIterateResult sub-struct. 
+ */ + TBMSharedIterator *iterator = (TBMSharedIterator *)palloc0(sizeof(TBMSharedIterator) + + MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + + iterator->state = istate; + return iterator; +} + +bool tbm_is_global(const TIDBitmap *tbm) { return tbm->isGlobalPart; } diff --git a/src/gausskernel/optimizer/path/allpaths.cpp b/src/gausskernel/optimizer/path/allpaths.cpp index 25694afce..120943197 100755 --- a/src/gausskernel/optimizer/path/allpaths.cpp +++ b/src/gausskernel/optimizer/path/allpaths.cpp @@ -2918,6 +2918,26 @@ static void recurse_push_qual(Node* setOp, Query* topquery, RangeTblEntry* rte, (int)nodeTag(setOp)))); } } + +/* + * create_partial_bitmap_paths + * Build partial bitmap heap path for the relation + */ +void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual) +{ + /* Compute heap pages for bitmap heap scan */ + double pages_fetched = compute_bitmap_pages(root, rel, bitmapqual, 1.0, NULL, NULL, rel->isPartitionedTable); + int parallel_workers = compute_parallel_worker(rel, pages_fetched, -1, + u_sess->attr.attr_sql.max_parallel_workers_per_gather); + + if (parallel_workers <= 0) { + return; + } + + add_partial_path(rel, + (Path *)create_bitmap_heap_path(root, rel, bitmapqual, NULL, 1.0, parallel_workers)); +} + /* * partIterator tries to inherit pathkeys from scan path * diff --git a/src/gausskernel/optimizer/path/costsize.cpp b/src/gausskernel/optimizer/path/costsize.cpp index d4b4cd119..ba3b12033 100644 --- a/src/gausskernel/optimizer/path/costsize.cpp +++ b/src/gausskernel/optimizer/path/costsize.cpp @@ -1444,10 +1444,10 @@ void cost_bitmap_heap_scan( Cost startup_cost = 0; Cost run_cost = 0; Cost indexTotalCost; - Selectivity indexSelectivity; QualCost qpqual_cost; Cost cpu_per_tuple = 0.0; Cost cost_per_page; + Cost cpu_run_cost; double tuples_fetched; double pages_fetched; double spc_seq_page_cost, spc_random_page_cost; @@ -1493,52 +1493,15 @@ void cost_bitmap_heap_scan( startup_cost += 
g_instance.cost_cxt.disable_cost; } - /* - * Fetch total cost of obtaining the bitmap, as well as its total - * selectivity. - */ - cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); + pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual, loop_count, + &indexTotalCost, &tuples_fetched, ispartitionedindex); startup_cost += indexTotalCost; /* Fetch estimated page costs for tablespace containing table. */ get_tablespace_page_costs(baserel->reltablespace, &spc_random_page_cost, &spc_seq_page_cost); - - /* - * Estimate number of main-table pages fetched. - */ - tuples_fetched = clamp_row_est(indexSelectivity * RELOPTINFO_LOCAL_FIELD(root, baserel, tuples)); - T = (baserel->pages > 1) ? (double)baserel->pages : 1.0; - if (loop_count > 1) { - /* - * For repeated bitmap scans, scale up the number of tuples fetched in - * the Mackert and Lohman formula by the number of scans, so that we - * estimate the number of pages fetched by all the scans. Then - * pro-rate for one scan. - */ - pages_fetched = index_pages_fetched(tuples_fetched * loop_count, - (BlockNumber)baserel->pages, - get_indexpath_pages(bitmapqual), - root, - ispartitionedindex); - - pages_fetched /= loop_count; - } else { - /* - * For a single scan, the number of heap pages that need to be fetched - * is the same as the Mackert and Lohman formula for the case T <= b - * (ie, no re-reads needed). 
- */ - pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } - if (pages_fetched >= T) { - pages_fetched = T; - } else { - pages_fetched = ceil(pages_fetched); - } - /* * For small numbers of pages we should charge spc_random_page_cost * apiece, while if nearly all the table's pages are being read, it's more @@ -1566,8 +1529,19 @@ void cost_bitmap_heap_scan( startup_cost += qpqual_cost.startup; cpu_per_tuple = u_sess->attr.attr_sql.cpu_tuple_cost + qpqual_cost.per_tuple; + cpu_run_cost = cpu_per_tuple * tuples_fetched; - run_cost += cpu_per_tuple * tuples_fetched; + /* Adjust costing for parallelism, if used. */ + if (path->parallel_workers > 0) { + double parallel_divisor = get_parallel_divisor(path); + + /* The CPU cost is divided among all the workers. */ + cpu_run_cost /= parallel_divisor; + + path->rows = clamp_row_est(path->rows / parallel_divisor); + } + + run_cost += cpu_run_cost; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; @@ -5932,6 +5906,66 @@ static double get_parallel_divisor(Path* path) return parallel_divisor; } +/* + * compute_bitmap_pages + * + * compute number of pages fetched from heap in bitmap heap scan. + */ +double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, + double loop_count, Cost *cost, double *tuple, bool ispartitionedindex) +{ + Cost indexTotalCost; + Selectivity indexSelectivity; + double pages_fetched; + double T = (baserel->pages > 1) ? (double)baserel->pages : 1.0; + + /* + * Fetch total cost of obtaining the bitmap, as well as its total + * selectivity. + */ + cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); + /* + * Estimate number of main-table pages fetched. 
+ */ + double tuples_fetched = clamp_row_est(indexSelectivity * RELOPTINFO_LOCAL_FIELD(root, baserel, tuples)); + + if (loop_count > 1) { + /* + * For repeated bitmap scans, scale up the number of tuples fetched in + * the Mackert and Lohman formula by the number of scans, so that we + * estimate the number of pages fetched by all the scans. Then + * pro-rate for one scan. + */ + pages_fetched = index_pages_fetched(tuples_fetched * loop_count, + (BlockNumber)baserel->pages, + get_indexpath_pages(bitmapqual), + root, + ispartitionedindex); + + pages_fetched /= loop_count; + } else { + /* + * For a single scan, the number of heap pages that need to be fetched + * is the same as the Mackert and Lohman formula for the case T <= b + * (ie, no re-reads needed). + */ + pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + } + if (pages_fetched >= T) { + pages_fetched = T; + } else { + pages_fetched = ceil(pages_fetched); + } + + if (cost != NULL) { + *cost = indexTotalCost; + } + if (tuple != NULL) { + *tuple = tuples_fetched; + } + + return pages_fetched; +} /* it used to compute page_size in createplan.cpp */ double cost_page_size(double tuples, int width) { diff --git a/src/gausskernel/optimizer/path/indxpath.cpp b/src/gausskernel/optimizer/path/indxpath.cpp index cf8ebd3d0..cfa2bfdc1 100755 --- a/src/gausskernel/optimizer/path/indxpath.cpp +++ b/src/gausskernel/optimizer/path/indxpath.cpp @@ -279,8 +279,13 @@ void create_index_paths(PlannerInfo* root, RelOptInfo* rel) BitmapHeapPath* bpath = NULL; bitmapqual = choose_bitmap_and(root, rel, bitindexpaths); - bpath = create_bitmap_heap_path(root, rel, bitmapqual, NULL, 1.0); + bpath = create_bitmap_heap_path(root, rel, bitmapqual, NULL, 1.0, 0); add_path(root, rel, (Path*)bpath); + + /* create a partial bitmap heap path */ + if (rel->consider_parallel) { + create_partial_bitmap_paths(root, rel, bitmapqual); + } } /* @@ -348,7 +353,7 @@ void create_index_paths(PlannerInfo* root, RelOptInfo* rel) /* 
And push that path into the mix */ required_outer = get_bitmap_tree_required_outer(bitmapqual); loop_count = get_loop_count(root, required_outer); - bpath = create_bitmap_heap_path(root, rel, bitmapqual, required_outer, loop_count); + bpath = create_bitmap_heap_path(root, rel, bitmapqual, required_outer, loop_count, 0); add_path(root, rel, (Path*)bpath); } } @@ -1650,6 +1655,11 @@ static Cost bitmap_scan_cost_est(PlannerInfo* root, RelOptInfo* rel, Path* ipath bpath.path.pathkeys = NIL; bpath.bitmapqual = ipath; + /* + * Check the cost of temporary path without considering parallelism. + * Parallel bitmap heap path will be considered at later stage. + */ + bpath.path.parallel_workers = 0; cost_bitmap_heap_scan(&bpath.path, root, rel, bpath.path.param_info, ipath, get_loop_count(root, required_outer)); return bpath.path.total_cost; @@ -1685,6 +1695,11 @@ static Cost bitmap_and_cost_est(PlannerInfo* root, RelOptInfo* rel, List* paths) bpath.path.pathkeys = NIL; bpath.bitmapqual = (Path*)&apath; + /* + * Check the cost of temporary path without considering parallelism. + * Parallel bitmap heap path will be considered at later stage. 
+ */ + bpath.path.parallel_workers = 0; /* Now we can do cost_bitmap_heap_scan */ cost_bitmap_heap_scan( &bpath.path, root, rel, bpath.path.param_info, (Path*)&apath, get_loop_count(root, required_outer)); diff --git a/src/gausskernel/optimizer/plan/createplan.cpp b/src/gausskernel/optimizer/plan/createplan.cpp index fb462c148..8ebac568a 100644 --- a/src/gausskernel/optimizer/plan/createplan.cpp +++ b/src/gausskernel/optimizer/plan/createplan.cpp @@ -100,6 +100,7 @@ static Scan* create_indexscan_plan( static BitmapHeapScan* create_bitmap_scan_plan( PlannerInfo* root, BitmapHeapPath* best_path, List* tlist, List* scan_clauses); static Plan* create_bitmap_subplan(PlannerInfo* root, Path* bitmapqual, List** qual, List** indexqual, List** indexECs); +static void bitmap_subplan_mark_shared(Plan *plan); static TidScan* create_tidscan_plan(PlannerInfo* root, TidPath* best_path, List* tlist, List* scan_clauses); static SubqueryScan* create_subqueryscan_plan(PlannerInfo* root, Path* best_path, List* tlist, List* scan_clauses); static FunctionScan* create_functionscan_plan(PlannerInfo* root, Path* best_path, List* tlist, List* scan_clauses); @@ -1854,6 +1855,7 @@ static Gather* create_gather_plan(PlannerInfo* root, GatherPath* best_path) case T_HashJoin: case T_MergeJoin: case T_NestLoop: + case T_BitmapHeapScan: inherit_plan_locator_info(&gather_plan->plan, subplan); break; default: @@ -2621,20 +2623,27 @@ static BitmapHeapScan* create_bitmap_scan_plan( PlannerInfo* root, BitmapHeapPath* best_path, List* tlist, List* scan_clauses) { Index baserelid = best_path->path.parent->relid; - Plan* bitmapqualplan = NULL; List* bitmapqualorig = NIL; List* indexquals = NIL; List* indexECs = NIL; - List* qpqual = NIL; ListCell* l = NULL; BitmapHeapScan* scan_plan = NULL; + bool isGlobal = false; /* it should be a base rel... 
*/ Assert(baserelid > 0); Assert(best_path->path.parent->rtekind == RTE_RELATION); /* Process the bitmapqual tree into a Plan tree and qual lists */ - bitmapqualplan = create_bitmap_subplan(root, best_path->bitmapqual, &bitmapqualorig, &indexquals, &indexECs); + Plan *bitmapqualplan = create_bitmap_subplan(root, best_path->bitmapqual, &bitmapqualorig, &indexquals, &indexECs); + + if (IsA(best_path->bitmapqual, IndexPath)) { + isGlobal = CheckIndexPathUseGPI((IndexPath*)best_path->bitmapqual); + } + /* Don't support parallel bitmap scan for global partition index or adio is enabled */ + if (!g_instance.attr.attr_storage.enable_adio_function && !isGlobal && best_path->path.parallel_aware) { + bitmap_subplan_mark_shared(bitmapqualplan); + } /* * The qpqual list must contain all restrictions not automatically handled @@ -2661,7 +2670,7 @@ static BitmapHeapScan* create_bitmap_scan_plan( * to do it that way because predicate conditions need to be rechecked if * the scan becomes lossy, so they have to be included in bitmapqualorig. */ - qpqual = NIL; + List *qpqual = NIL; foreach (l, scan_clauses) { RestrictInfo* rinfo = (RestrictInfo*)lfirst(l); Node* clause = (Node*)rinfo->clause; @@ -5293,13 +5302,32 @@ void copy_plan_costsize(Plan* dest, Plan* src) } } -/***************************************************************************** +/* + * bitmap_subplan_mark_shared + * Set isshared flag in bitmap subplan so that it will be created in + * shared memory. 
+ */ +static void bitmap_subplan_mark_shared(Plan *plan) +{ + if (IsA(plan, BitmapAnd)) { + bitmap_subplan_mark_shared((Plan*)linitial(((BitmapAnd *)plan)->bitmapplans)); + } else if (IsA(plan, BitmapOr)) { + ((BitmapOr *)plan)->isshared = true; + bitmap_subplan_mark_shared((Plan*)linitial(((BitmapOr *)plan)->bitmapplans)); + } else if (IsA(plan, BitmapIndexScan)) { + ((BitmapIndexScan *)plan)->isshared = true; + } else { + /* Maybe CStore index scan(T_CStoreIndexCtidScan), don't support parallel */ + } +} + +/* **************************************************************************** * - * PLAN NODE BUILDING ROUTINES + * PLAN NODE BUILDING ROUTINES * * Some of these are exported because they are called to build plan nodes * in contexts where we're not deriving the plan node from a path node. - *****************************************************************************/ + * *************************************************************************** */ static SeqScan* make_seqscan(List* qptlist, List* qpqual, Index scanrelid) { SeqScan* node = makeNode(SeqScan); diff --git a/src/gausskernel/optimizer/util/pathnode.cpp b/src/gausskernel/optimizer/util/pathnode.cpp index 4984bc369..004bee1ca 100755 --- a/src/gausskernel/optimizer/util/pathnode.cpp +++ b/src/gausskernel/optimizer/util/pathnode.cpp @@ -2058,8 +2058,8 @@ IndexPath* create_index_path(PlannerInfo* root, IndexOptInfo* index, List* index * loop_count should match the value used when creating the component * IndexPaths. 
 */ -BitmapHeapPath* create_bitmap_heap_path( - PlannerInfo* root, RelOptInfo* rel, Path* bitmapqual, Relids required_outer, double loop_count) +BitmapHeapPath* create_bitmap_heap_path(PlannerInfo* root, RelOptInfo* rel, Path* bitmapqual, + Relids required_outer, double loop_count, int parallel_degree) { BitmapHeapPath* pathnode = makeNode(BitmapHeapPath); @@ -2067,15 +2067,23 @@ BitmapHeapPath* create_bitmap_heap_path( pathnode->path.parent = rel; pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->path.pathkeys = NIL; /* always unordered */ + pathnode->path.parallel_aware = parallel_degree > 0 ? true : false; + pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_workers = parallel_degree; pathnode->bitmapqual = bitmapqual; cost_bitmap_heap_scan(&pathnode->path, root, rel, pathnode->path.param_info, bitmapqual, loop_count); #ifdef STREAMPLAN + /* + * We need to set locator_type for parallel query, because we may send + * this value to bg worker. If not, locator_type is the initial value '\0', + * which makes the later serialized plan truncated. 
+ */ + pathnode->path.locator_type = rel->locator_type; if (IS_STREAM_PLAN) { pathnode->path.distribute_keys = rel->distribute_keys; - pathnode->path.locator_type = rel->locator_type; /* add location information for bitmap heap path */ RangeTblEntry* rte = root->simple_rte_array[rel->relid]; @@ -3971,7 +3979,7 @@ Path* reparameterize_path(PlannerInfo* root, Path* path, Relids required_outer, case T_BitmapHeapScan: { BitmapHeapPath* bpath = (BitmapHeapPath*)path; - return (Path*)create_bitmap_heap_path(root, rel, bpath->bitmapqual, required_outer, loop_count); + return (Path*)create_bitmap_heap_path(root, rel, bpath->bitmapqual, required_outer, loop_count, 0); } case T_SubqueryScan: return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp b/src/gausskernel/process/threadpool/knl_thread.cpp index b9f92c310..401e84a32 100755 --- a/src/gausskernel/process/threadpool/knl_thread.cpp +++ b/src/gausskernel/process/threadpool/knl_thread.cpp @@ -1432,6 +1432,7 @@ void knl_t_bgworker_init(knl_t_bgworker_context* bgworker_cxt) bgworker_cxt->pcxt_list = DLIST_STATIC_INIT(bgworker_cxt->pcxt_list); bgworker_cxt->save_pgBufferUsage = NULL; bgworker_cxt->hpm_context = NULL; + bgworker_cxt->memCxt = NULL; } void knl_t_msqueue_init(knl_t_msqueue_context* msqueue_cxt) diff --git a/src/gausskernel/runtime/executor/execMain.cpp b/src/gausskernel/runtime/executor/execMain.cpp index fe1465344..b96fb1b75 100644 --- a/src/gausskernel/runtime/executor/execMain.cpp +++ b/src/gausskernel/runtime/executor/execMain.cpp @@ -2066,7 +2066,6 @@ static void ExecutePlan(EState *estate, PlanState *planstate, bool use_parallel_ * process so we just end the loop... 
*/ if (TupIsNull(slot)) { - (void)ExecShutdownNode(planstate); ExecEarlyFree(planstate); break; } @@ -2136,6 +2135,14 @@ static void ExecutePlan(EState *estate, PlanState *planstate, bool use_parallel_ u_sess->instr_cxt.global_instr->SetPeakNodeMemory(planstate->plan->plan_node_id, peak_memory); } + /* + * If we know we won't need to back up, we can release resources at this + * point. + */ + if (!(estate->es_top_eflags & EXEC_FLAG_BACKWARD)) { + (void)ExecShutdownNode(planstate); + } + if (use_parallel_mode) { ExitParallelMode(); } diff --git a/src/gausskernel/runtime/executor/execParallel.cpp b/src/gausskernel/runtime/executor/execParallel.cpp index 0c539ad13..cda0677a9 100644 --- a/src/gausskernel/runtime/executor/execParallel.cpp +++ b/src/gausskernel/runtime/executor/execParallel.cpp @@ -26,6 +26,7 @@ #include "executor/execParallel.h" #include "executor/executor.h" #include "executor/hashjoin.h" +#include "executor/nodeBitmapHeapscan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeAppend.h" #include "executor/nodeIndexscan.h" @@ -183,6 +184,9 @@ static bool ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateConte case T_AppendState: ExecAppendEstimate((AppendState*)planstate, e->pcxt); break; + case T_BitmapHeapScanState: + ExecBitmapHeapEstimate((BitmapHeapScanState*)planstate, e->pcxt); + break; default: break; } @@ -244,6 +248,13 @@ static bool ExecParallelInitializeDSM(PlanState *planstate, ExecParallelInitiali cxt->pwCtx->queryInfo.pappend_num++; } break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) { + ExecBitmapHeapInitializeDSM((BitmapHeapScanState*)planstate, + d->pcxt, cxt->pwCtx->queryInfo.bmscan_num); + cxt->pwCtx->queryInfo.bmscan_num++; + } + break; case T_HashJoinState: if (planstate->plan->parallel_aware) { ExecHashJoinInitializeDSM((HashJoinState*)planstate, d->pcxt, cxt->pwCtx->queryInfo.jstate_num); @@ -425,6 +436,7 @@ ParallelExecutorInfo *ExecInitParallelPlan(PlanState *planstate, EState 
*estate, queryInfo.pappend = (ParallelAppendState**)palloc0(sizeof(ParallelAppendState*) * e.nnodes); queryInfo.jstate = (ParallelHashJoinState**)palloc0(sizeof(ParallelHashJoinState*) * e.nnodes); queryInfo.shared_info = (SharedHashInfo**)palloc0(sizeof(SharedHashInfo*) * e.nnodes); + queryInfo.bmscan = (ParallelBitmapHeapState **)palloc0(sizeof(ParallelBitmapHeapState *) * e.nnodes); /* * Give parallel-aware nodes a chance to initialize their shared data. @@ -488,6 +500,11 @@ static bool ExecParallelReInitializeDSM(PlanState* planstate, ParallelContext* p ExecAppendReInitializeDSM((AppendState*)planstate, pcxt); } break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) { + ExecBitmapHeapReInitializeDSM((BitmapHeapScanState*)planstate, pcxt); + } + break; case T_HashJoinState: if (planstate->plan->parallel_aware) { ExecHashJoinReInitializeDSM((HashJoinState*)planstate, pcxt); @@ -735,6 +752,11 @@ static bool ExecParallelInitializeWorker(PlanState *planstate, void *context) ExecAppendInitializeWorker((AppendState*)planstate, context); } break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) { + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *)planstate, context); + } + break; case T_HashState: /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecHashInitializeWorker((HashState*)planstate, context); @@ -778,6 +800,8 @@ void ParallelQueryMain(void *seg) /* Start up the executor, have it run the plan, and then shut it down. 
*/ (void)ExecutorStart(queryDesc, cxt->pwCtx->queryInfo.eflags); + /* Special executor initialization steps for parallel workers */ + Assert(t_thrd.bgworker_cxt.memCxt != NULL); ExecParallelInitializeWorker(queryDesc->planstate, seg); /* Pass down any tuple bound */ diff --git a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp index 3dd4173fd..944d75e1c 100755 --- a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp @@ -52,13 +52,22 @@ #include "gstrace/gstrace_infra.h" #include "gstrace/access_gstrace.h" +#define WAIT_BITMAP_INIT_TIMEOUT 1 // 1s timeout for pthread_cond_timedwait + static TupleTableSlot* BitmapHbucketTblNext(BitmapHeapScanState* node); static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node); +static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); +#ifdef USE_PREFETCH +static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, const TBMIterateResult *tbmres, + TBMIterator *prefetch_iterator, TBMSharedIterator *shared_prefetch_it); +static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); +#endif static void bitgetpage(HeapScanDesc scan, TBMIterateResult* tbmres); static void ExecInitPartitionForBitmapHeapScan(BitmapHeapScanState* scanstate, EState* estate); static void ExecInitNextPartitionForBitmapHeapScan(BitmapHeapScanState* node); -static void BitmapHeapPrefetchNext( - BitmapHeapScanState* node, HeapScanDesc scan, const TIDBitmap* tbm, TBMIterator** prefetch_iterator); +static void BitmapHeapPrefetchNext(BitmapHeapScanState* node, HeapScanDesc scan, const TIDBitmap* tbm, + TBMIterator** prefetch_iterator, TBMSharedIterator** shared_prefetch_it); +static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); /* This struct is used for partition switch while prefetch pages */ typedef struct PrefetchNode { @@ -76,11 +85,20 
@@ void BitmapHeapFree(BitmapHeapScanState* node) tbm_end_iterate(node->prefetch_iterator); node->prefetch_iterator = NULL; } + if (node->shared_tbmiterator != NULL) { + tbm_end_shared_iterate(node->shared_tbmiterator); + node->shared_tbmiterator = NULL; + } + if (node->shared_prefetch_iterator != NULL) { + tbm_end_shared_iterate(node->shared_prefetch_iterator); + node->shared_prefetch_iterator = NULL; + } if (node->tbm != NULL) { tbm_free(node->tbm); node->tbm = NULL; } node->tbmres = NULL; + node->initialized = false; } static TupleTableSlot* BitmapHbucketTblNext(BitmapHeapScanState* node) { @@ -101,10 +119,104 @@ static TupleTableSlot* BitmapHbucketTblNext(BitmapHeapScanState* node) BitmapHeapFree(node); } } -/* ---------------------------------------------------------------- - * BitmapHeapNext + +#ifdef USE_PREFETCH +/* + * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator + */ +static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, const TBMIterateResult *tbmres, + TBMIterator *prefetch_iterator, TBMSharedIterator *shared_prefetch_it) +{ + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) { + if (node->prefetch_pages > 0) { + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } else if (prefetch_iterator != NULL) { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmodule(MOD_EXECUTOR), + errmsg("prefetch and main iterators are out of sync for BitmapHeapScan."))); + } + } + return; + } + + if (u_sess->storage_cxt.target_prefetch_pages > 0) { + (void)pthread_mutex_lock(&pstate->cv_mtx); + if (pstate->prefetch_pages > 0) { + pstate->prefetch_pages--; + (void)pthread_mutex_unlock(&pstate->cv_mtx); + } else { + /* Release the mutex before iterating */ + 
(void)pthread_mutex_unlock(&pstate->cv_mtx); + + /* + * In case of shared mode, we can not ensure that the current + * blockno of the main iterator and that of the prefetch iterator + * are same. It's possible that whatever blockno we are + * prefetching will be processed by another process. Therefore, + * we don't validate the blockno here as we do in non-parallel + * case. + */ + if (shared_prefetch_it != NULL) { + (void)tbm_shared_iterate(shared_prefetch_it); + } + } + } +} + +/* + * BitmapAdjustPrefetchTarget - Adjust the prefetch target * - * Retrieve next tuple from the BitmapHeapScan node's currentRelation + * Increase prefetch target if it's not yet at the max. Note that + * we will increase it to zero after fetching the very first + * page/tuple, then to one after the second tuple is fetched, then + * it doubles as later pages are fetched. + */ +static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) +{ + ParallelBitmapHeapState *pstate = node->pstate; + + if (pstate == NULL) { + if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages) { + /* don't increase any further */ + } else if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages / 2) { + node->prefetch_target = u_sess->storage_cxt.target_prefetch_pages; + } else if (node->prefetch_target > 0) { + node->prefetch_target *= 2; + } else { + node->prefetch_target++; + } + return; + } + + /* Do an unlocked check first to save spinlock acquisitions. 
*/ + if (pstate->prefetch_target < u_sess->storage_cxt.target_prefetch_pages) { + (void)pthread_mutex_lock(&pstate->cv_mtx); + if (pstate->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages) { + /* don't increase any further */ + } else if (pstate->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages / 2) { + pstate->prefetch_target = u_sess->storage_cxt.target_prefetch_pages; + } else if (pstate->prefetch_target > 0) { + pstate->prefetch_target *= 2; + } else { + pstate->prefetch_target++; + } + (void)pthread_mutex_unlock(&pstate->cv_mtx); + } +} +#endif /* USE_PREFETCH */ + +/* ---------------------------------------------------------------- + * BitmapHeapNext + * + * Retrieve next tuple from the BitmapHeapScan node's currentRelation * ---------------------------------------------------------------- */ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) @@ -113,10 +225,13 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) HeapScanDesc scan = NULL; TIDBitmap* tbm = NULL; TBMIterator* tbmiterator = NULL; + TBMSharedIterator *shared_tbmiterator = NULL; TBMIterateResult* tbmres = NULL; + ParallelBitmapHeapState *pstate = node->pstate; #ifdef USE_PREFETCH TBMIterator* prefetch_iterator = NULL; + TBMSharedIterator* shared_prefetch_it = NULL; #endif OffsetNumber targoffset; TupleTableSlot* slot = NULL; @@ -128,10 +243,15 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) slot = node->ss.ss_ScanTupleSlot; scan = GetHeapScanDesc(node->ss.ss_currentScanDesc); tbm = node->tbm; - tbmiterator = node->tbmiterator; + if (pstate == NULL) { + tbmiterator = node->tbmiterator; + } else { + shared_tbmiterator = node->shared_tbmiterator; + } tbmres = node->tbmres; #ifdef USE_PREFETCH prefetch_iterator = node->prefetch_iterator; + shared_prefetch_it = node->shared_prefetch_iterator; #endif /* @@ -147,27 +267,79 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) * a lot of prefetching in a 
scan that stops after a few tuples because of * a LIMIT. */ - if (tbm == NULL) { - tbm = (TIDBitmap*)MultiExecProcNode(outerPlanState(node)); + if (!node->initialized) { + if (pstate == NULL) { + tbm = (TIDBitmap*)MultiExecProcNode(outerPlanState(node)); - if (tbm == NULL || !IsA(tbm, TIDBitmap)) { - ereport(ERROR, - (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), - errmodule(MOD_EXECUTOR), - errmsg("unrecognized result from subplan for BitmapHeapScan."))); - } + if (tbm == NULL || !IsA(tbm, TIDBitmap)) { + ereport(ERROR, + (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), + errmodule(MOD_EXECUTOR), + errmsg("unrecognized result from subplan for BitmapHeapScan."))); + } - node->tbm = tbm; - node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); - node->tbmres = tbmres = NULL; + node->tbm = tbm; + node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); + node->tbmres = tbmres = NULL; #ifdef USE_PREFETCH - if (u_sess->storage_cxt.target_prefetch_pages > 0) { - node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); - node->prefetch_pages = 0; - node->prefetch_target = -1; - } + if (u_sess->storage_cxt.target_prefetch_pages > 0) { + node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); + node->prefetch_pages = 0; + node->prefetch_target = -1; + } #endif + } else { + /* + * The leader will immediately come out of the function, but + * others will be blocked until leader populates the TBM and wakes + * them up. + */ + if (BitmapShouldInitializeSharedState(pstate)) { + tbm = (TIDBitmap *)MultiExecProcNode(outerPlanState(node)); + if (tbm == NULL || !IsA(tbm, TIDBitmap)) { + ereport(ERROR, + (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), + errmodule(MOD_EXECUTOR), + errmsg("unrecognized result from subplan for BitmapHeapScan."))); + } + + node->tbm = tbm; + + /* + * Prepare to iterate over the TBM. This will return the + * dsa_pointer of the iterator state which will be used by + * multiple processes to iterate jointly. 
+ */ + pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); +#ifdef USE_PREFETCH + if (u_sess->storage_cxt.target_prefetch_pages > 0) { + pstate->prefetch_iterator = tbm_prepare_shared_iterate(tbm); + + /* + * We don't need the mutex here as we haven't yet woken up + * others. + */ + pstate->prefetch_pages = 0; + pstate->prefetch_target = -1; + } +#endif + /* We have initialized the shared state so wake up others. */ + BitmapDoneInitializingSharedState(pstate); + } + + /* Allocate a private iterator and attach the shared state to it */ + node->shared_tbmiterator = shared_tbmiterator = tbm_attach_shared_iterate(pstate->tbmiterator); + node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (u_sess->storage_cxt.target_prefetch_pages > 0) { + node->shared_prefetch_iterator = tbm_attach_shared_iterate(pstate->prefetch_iterator); + shared_prefetch_it = node->shared_prefetch_iterator; + } +#endif + } + node->initialized = true; } for (;;) { @@ -178,37 +350,28 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) * Get next page of results if needed */ if (tbmres == NULL) { - node->tbmres = tbmres = tbm_iterate(tbmiterator); + if (pstate == NULL) { + node->tbmres = tbmres = tbm_iterate(tbmiterator); + } else { + node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + } if (tbmres == NULL) { /* no more entries in the bitmap */ break; } #ifdef USE_PREFETCH - if (node->prefetch_pages > 0) { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } else if (prefetch_iterator != NULL) { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult* tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmodule(MOD_EXECUTOR), - errmsg("prefetch and main iterators are out of sync for BitmapHeapScan."))); - } - } + BitmapAdjustPrefetchIterator(node, tbmres, prefetch_iterator, 
shared_prefetch_it); #endif /* USE_PREFETCH */ /* Check whether switch partition-fake-rel, use rd_rel save */ - if (BitmapNodeNeedSwitchPartRel(node)) { + if (pstate == NULL && BitmapNodeNeedSwitchPartRel(node)) { GPISetCurrPartOid(node->gpi_scan, node->tbmres->partitionOid); if (!GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock)) { /* If the current partition is invalid, the next page is directly processed */ tbmres = NULL; #ifdef USE_PREFETCH - BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator); + BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator, &shared_prefetch_it); #endif /* USE_PREFETCH */ continue; } @@ -256,14 +419,7 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) * page/tuple, then to one after the second tuple is fetched, then * it doubles as later pages are fetched. */ - if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages) - /* don't increase any further */; - else if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages / 2) - node->prefetch_target = u_sess->storage_cxt.target_prefetch_pages; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; + BitmapAdjustPrefetchTarget(node); #endif /* USE_PREFETCH */ } else { /* @@ -277,8 +433,18 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. 
*/ - if (node->prefetch_target < u_sess->storage_cxt.target_prefetch_pages) - node->prefetch_target++; + if (pstate == NULL) { + if (node->prefetch_target < u_sess->storage_cxt.target_prefetch_pages) { + node->prefetch_target++; + } + } else if (pstate->prefetch_target < u_sess->storage_cxt.target_prefetch_pages) { + /* take spinlock while updating shared state */ + (void)pthread_mutex_lock(&pstate->cv_mtx); + if (pstate->prefetch_target < u_sess->storage_cxt.target_prefetch_pages) { + pstate->prefetch_target++; + } + (void)pthread_mutex_unlock(&pstate->cv_mtx); + } #endif /* USE_PREFETCH */ } @@ -291,7 +457,7 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) } #ifdef USE_PREFETCH - BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator); + BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator, &shared_prefetch_it); #endif /* USE_PREFETCH */ /* @@ -343,6 +509,20 @@ static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node) return ExecClearTuple(slot); } +/* + * BitmapDoneInitializingSharedState - Shared state is initialized + * + * By this time the leader has already populated the TBM and initialized the + * shared state so wake up other processes. 
+ */ +static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) +{ + (void)pthread_mutex_lock(&pstate->cv_mtx); + pstate->state = BM_FINISHED; + (void)pthread_cond_broadcast(&pstate->cv); + (void)pthread_mutex_unlock(&pstate->cv_mtx); +} + /* * bitgetpage - subroutine for BitmapHeapNext() * @@ -624,6 +804,11 @@ BitmapHeapScanState* ExecInitBitmapHeapScan(BitmapHeapScan* node, EState* estate scanstate->ss.isPartTbl = node->scan.isPartTbl; scanstate->ss.currentSlot = 0; scanstate->ss.partScanDirection = node->scan.partScanDirection; + scanstate->pscan_len = 0; + scanstate->initialized = false; + scanstate->shared_tbmiterator = NULL; + scanstate->shared_prefetch_iterator = NULL; + scanstate->pstate = NULL; /* initilize Global partition index scan information */ GPIScanInit(&scanstate->gpi_scan); @@ -802,14 +987,18 @@ static void ExecInitPartitionForBitmapHeapScan(BitmapHeapScanState* scanstate, E * to do on the current page, else we may uselessly prefetch the same * page we are just about to request for real. 
*/ -void BitmapHeapPrefetchNext( - BitmapHeapScanState* node, HeapScanDesc scan, const TIDBitmap* tbm, TBMIterator** prefetch_iterator) +void BitmapHeapPrefetchNext(BitmapHeapScanState* node, HeapScanDesc scan, const TIDBitmap* tbm, + TBMIterator** prefetch_iterator, TBMSharedIterator** shared_prefetch_it) { - if (*prefetch_iterator == NULL) { + ParallelBitmapHeapState *pstate = node->pstate; + if ((pstate == NULL && *prefetch_iterator == NULL) || + (pstate != NULL && *shared_prefetch_it == NULL)) { return; } + ADIO_RUN() { + Assert(shared_prefetch_it == NULL); BlockNumber* blockList = NULL; BlockNumber* blockListPtr = NULL; PrefetchNode* prefetchNode = NULL; @@ -889,32 +1078,204 @@ void BitmapHeapPrefetchNext( } ADIO_ELSE() { - Oid oldOid = GPIGetCurrPartOid(node->gpi_scan); - while (node->prefetch_pages < node->prefetch_target) { - TBMIterateResult* tbmpre = tbm_iterate(*prefetch_iterator); - Relation prefetchRel = scan->rs_rd; - if (tbmpre == NULL) { - /* No more pages to prefetch */ - tbm_end_iterate(*prefetch_iterator); - node->prefetch_iterator = *prefetch_iterator = NULL; - break; + if (pstate == NULL) { + Oid oldOid = GPIGetCurrPartOid(node->gpi_scan); + while (node->prefetch_pages < node->prefetch_target) { + TBMIterateResult* tbmpre = tbm_iterate(*prefetch_iterator); + Relation prefetchRel = scan->rs_rd; + if (tbmpre == NULL) { + /* No more pages to prefetch */ + tbm_end_iterate(*prefetch_iterator); + node->prefetch_iterator = *prefetch_iterator = NULL; + break; + } + node->prefetch_pages++; + if (tbm_is_global(node->tbm) && GPIScanCheckPartOid(node->gpi_scan, tbmpre->partitionOid)) { + GPISetCurrPartOid(node->gpi_scan, tbmpre->partitionOid); + if (!GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock)) { + /* If the current partition is invalid, the next page is directly processed */ + tbmpre = NULL; + continue; + } else { + prefetchRel = node->gpi_scan->fakePartRelation; + } + } + /* For posix_fadvise() we just send the one 
request */ + PrefetchBuffer(prefetchRel, MAIN_FORKNUM, tbmpre->blockno); } - node->prefetch_pages++; - if (tbm_is_global(node->tbm) && GPIScanCheckPartOid(node->gpi_scan, tbmpre->partitionOid)) { - GPISetCurrPartOid(node->gpi_scan, tbmpre->partitionOid); - if (!GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock)) { - /* If the current partition is invalid, the next page is directly processed */ - tbmpre = NULL; - continue; - } else { - prefetchRel = node->gpi_scan->fakePartRelation; + /* recover old oid after prefetch switch */ + GPISetCurrPartOid(node->gpi_scan, oldOid); + } else if (pstate->prefetch_pages < pstate->prefetch_target) { + if (*shared_prefetch_it != NULL) { + while (1) { + bool do_prefetch = false; + + /* + * Recheck under the mutex. If some other process has already + * done enough prefetching then we need not to do anything. + */ + (void)pthread_mutex_lock(&pstate->cv_mtx); + if (pstate->prefetch_pages < pstate->prefetch_target) { + pstate->prefetch_pages++; + do_prefetch = true; + } + (void)pthread_mutex_unlock(&pstate->cv_mtx); + + if (!do_prefetch) { + return; + } + TBMIterateResult *tbmpre = tbm_shared_iterate(*shared_prefetch_it); + if (tbmpre == NULL) { + /* No more pages to prefetch */ + tbm_end_shared_iterate(*shared_prefetch_it); + node->shared_prefetch_iterator = *shared_prefetch_it = NULL; + break; + } + + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - /* For posix_fadvise() we just send the one request */ - PrefetchBuffer(prefetchRel, MAIN_FORKNUM, tbmpre->blockno); } - /* recover old oid after prefetch switch */ - GPISetCurrPartOid(node->gpi_scan, oldOid); } ADIO_END(); } + +/* ---------------- + * BitmapShouldInitializeSharedState + * + * The first process to come here and see the state to the BM_INITIAL + * will become the leader for the parallel bitmap scan and will be + * responsible for populating the TIDBitmap. 
The other processes will + * be blocked by the condition variable until the leader wakes them up. + * --------------- + */ +static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate) +{ + SharedBitmapState state; + + (void)pthread_mutex_lock(&pstate->cv_mtx); + while (1) { + CHECK_FOR_INTERRUPTS(); + state = pstate->state; + if (pstate->state == BM_INITIAL) { + pstate->state = BM_INPROGRESS; + } + + /* Exit if bitmap is done, or if we're the leader. */ + if (state != BM_INPROGRESS) { + break; + } + + /* + * Use pthread_cond_timedwait here in case of worker exit in error cases, and call + * CHECK_FOR_INTERRUPTS to handle the error msg from worker. + */ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += WAIT_BITMAP_INIT_TIMEOUT; + (void)pthread_cond_timedwait(&pstate->cv, &pstate->cv_mtx, &ts); + } + (void)pthread_mutex_unlock(&pstate->cv_mtx); + + return (state == BM_INITIAL); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapEstimate + * + * Compute the amount of space we'll need in the parallel + * query DSM, and inform pcxt->estimator about our needs. + * ---------------------------------------------------------------- + */ +void ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + + node->pscan_len = + add_size(offsetof(ParallelBitmapHeapState, phs_snapshot_data), EstimateSnapshotSpace(estate->es_snapshot)); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeDSM + * + * Set up a parallel bitmap heap scan descriptor. 
+ * ---------------------------------------------------------------- + */ +void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt, int nodeid) +{ + knl_u_parallel_context *cxt = (knl_u_parallel_context *)pcxt->seg; + EState *estate = node->ss.ps.state; + ParallelBitmapHeapState *pstate = (ParallelBitmapHeapState*)MemoryContextAllocZero(cxt->memCtx, + node->pscan_len); + + pstate->tbmiterator = NULL; + pstate->prefetch_iterator = NULL; + pstate->prefetch_pages = 0; + pstate->prefetch_target = 0; + pstate->state = BM_INITIAL; + pstate->plan_node_id = node->ss.ps.plan->plan_node_id; + pstate->pscan_len = node->pscan_len; + + (void)pthread_cond_init(&pstate->cv, NULL); + (void)pthread_mutex_init(&pstate->cv_mtx, NULL); + SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data, + node->pscan_len - offsetof(ParallelBitmapHeapState, phs_snapshot_data)); + + cxt->pwCtx->queryInfo.bmscan[nodeid] = pstate; + node->pstate = pstate; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate = node->pstate; + + /* If there's no DSA, there are no workers; do nothing. */ + if (t_thrd.bgworker_cxt.memCxt == NULL) { + return; + } + pstate->state = BM_INITIAL; + + if (pstate->tbmiterator != NULL) { + tbm_free_shared_area(pstate->tbmiterator); + } + if (pstate->prefetch_iterator != NULL) { + tbm_free_shared_area(pstate->prefetch_iterator); + } + pstate->tbmiterator = NULL; + pstate->prefetch_iterator = NULL; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeWorker + * + * Copy relevant information from TOC into planstate. 
+ * ---------------------------------------------------------------- + */ +void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, void *context) +{ + Assert(t_thrd.bgworker_cxt.memCxt != NULL); + ParallelBitmapHeapState *pstate = NULL; + knl_u_parallel_context *cxt = (knl_u_parallel_context *)context; + + for (int i = 0; i < cxt->pwCtx->queryInfo.bmscan_num; i++) { + if (node->ss.ps.plan->plan_node_id == cxt->pwCtx->queryInfo.bmscan[i]->plan_node_id) { + pstate = cxt->pwCtx->queryInfo.bmscan[i]; + break; + } + } + if (pstate == NULL) { + ereport(ERROR, (errmsg("could not find plan info, plan node id:%d", node->ss.ps.plan->plan_node_id))); + } + node->pstate = pstate; + + Snapshot snapshot = RestoreSnapshot(pstate->phs_snapshot_data, + pstate->pscan_len - offsetof(ParallelBitmapHeapState, phs_snapshot_data)); + heap_scan_update_snapshot((HeapScanDesc)node->ss.ss_currentScanDesc, snapshot); +} + diff --git a/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp b/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp index 0fb3a3407..bf437bc36 100755 --- a/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapIndexscan.cpp @@ -91,7 +91,8 @@ Node* MultiExecBitmapIndexScan(BitmapIndexScanState* node) node->biss_result = NULL; /* reset for next time */ } else { /* XXX should we use less than u_sess->attr.attr_memory.work_mem for this? */ - tbm = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L); + tbm = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L, + ((BitmapIndexScan *) node->ss.ps.plan)->isshared ? 
t_thrd.bgworker_cxt.memCxt : NULL); /* If bitmapscan uses global partition index, set tbm to global */ if (RelationIsGlobalIndex(node->biss_RelationDesc)) { diff --git a/src/gausskernel/runtime/executor/nodeBitmapOr.cpp b/src/gausskernel/runtime/executor/nodeBitmapOr.cpp index 0353c8ccc..5364ae07d 100755 --- a/src/gausskernel/runtime/executor/nodeBitmapOr.cpp +++ b/src/gausskernel/runtime/executor/nodeBitmapOr.cpp @@ -125,7 +125,8 @@ Node* MultiExecBitmapOr(BitmapOrState* node) /* first subplan */ if (result == NULL) { /* XXX should we use less than u_sess->attr.attr_memory.work_mem for this? */ - result = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L); + result = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L, + ((BitmapOr *) node->ps.plan)->isshared ? t_thrd.bgworker_cxt.memCxt : NULL); /* If bitmapscan uses global partition index, set tbm to global */ if (RelationIsGlobalIndex(((BitmapIndexScanState*)subnode)->biss_RelationDesc)) { tbm_set_global(result, true); diff --git a/src/gausskernel/runtime/executor/nodeHashjoin.cpp b/src/gausskernel/runtime/executor/nodeHashjoin.cpp index 572c0ee67..6a098a7d1 100755 --- a/src/gausskernel/runtime/executor/nodeHashjoin.cpp +++ b/src/gausskernel/runtime/executor/nodeHashjoin.cpp @@ -1597,7 +1597,6 @@ void ExecHashJoinInitializeDSM(HashJoinState* state, ParallelContext* pcxt, int /* Initialize the shared state in the hash node. 
*/ hashNode = (HashState*)innerPlanState(state); hashNode->parallel_state = pstate; - t_thrd.bgworker_cxt.memCxt = cxt->memCtx; } /* ---------------------------------------------------------------- @@ -1669,5 +1668,4 @@ void ExecHashJoinInitializeWorker(HashJoinState* state, void* pwcxt) hashNode = (HashState*)innerPlanState(state); hashNode->parallel_state = pstate; state->isParallel = true; - t_thrd.bgworker_cxt.memCxt = cxt->memCtx; } diff --git a/src/gausskernel/storage/access/gin/ginget.cpp b/src/gausskernel/storage/access/gin/ginget.cpp index 8e2c006d9..19e3721df 100755 --- a/src/gausskernel/storage/access/gin/ginget.cpp +++ b/src/gausskernel/storage/access/gin/ginget.cpp @@ -135,7 +135,7 @@ static bool collectMatchBitmap(GinBtreeData* btree, GinBtreeStack* stack, GinSca /* Initialize empty bitmap result */ if (!isColStore) { - scanEntry->matchBitmap = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L); + scanEntry->matchBitmap = tbm_create(u_sess->attr.attr_memory.work_mem * 1024L, NULL); } /* Null query cannot partial-match anything */ diff --git a/src/gausskernel/storage/access/heap/heapam.cpp b/src/gausskernel/storage/access/heap/heapam.cpp index 8e7e48ac7..9166b695e 100644 --- a/src/gausskernel/storage/access/heap/heapam.cpp +++ b/src/gausskernel/storage/access/heap/heapam.cpp @@ -1857,6 +1857,15 @@ HeapTuple heapGetNextForVerify(HeapScanDesc scan, ScanDirection direction, bool& return &(scan->rs_ctup); } +void heap_scan_update_snapshot(HeapScanDesc scan, Snapshot snapshot) +{ + Assert(IsMVCCSnapshot(snapshot)); + + RegisterSnapshot(snapshot); + scan->rs_snapshot = snapshot; + scan->rs_flags |= SO_TEMP_SNAPSHOT; +} + /* ---------------- * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc * diff --git a/src/gausskernel/storage/access/transam/parallel.cpp b/src/gausskernel/storage/access/transam/parallel.cpp index 74911d8bc..15c1a866f 100644 --- a/src/gausskernel/storage/access/transam/parallel.cpp +++ 
b/src/gausskernel/storage/access/transam/parallel.cpp @@ -918,6 +918,7 @@ void ParallelWorkerMain(Datum main_arg) /* Set flag to indicate that we're initializing a parallel worker. */ t_thrd.bgworker_cxt.InitializingParallelWorker = true; + t_thrd.bgworker_cxt.memCxt = ctx->memCtx; /* Establish signal handlers. */ gspqsignal(SIGTERM, die); diff --git a/src/gausskernel/storage/ipc/dsm.cpp b/src/gausskernel/storage/ipc/dsm.cpp index 5a905d60c..23cf12ad4 100644 --- a/src/gausskernel/storage/ipc/dsm.cpp +++ b/src/gausskernel/storage/ipc/dsm.cpp @@ -104,6 +104,8 @@ void *dsm_create(void) u_sess->parallel_ctx[i].used = true; slist_init(&u_sess->parallel_ctx[i].on_detach); + + t_thrd.bgworker_cxt.memCxt = u_sess->parallel_ctx[i].memCtx; return &(u_sess->parallel_ctx[i]); } } diff --git a/src/gausskernel/storage/lmgr/lwlock.cpp b/src/gausskernel/storage/lmgr/lwlock.cpp index be12c0d82..5c4a30514 100755 --- a/src/gausskernel/storage/lmgr/lwlock.cpp +++ b/src/gausskernel/storage/lmgr/lwlock.cpp @@ -158,7 +158,8 @@ static const char *BuiltinTrancheNames[] = { "PLdebugger", "SharedTupleStore", "parallel_append", - "ParallelHashJoinLock" + "ParallelHashJoinLock", + "TidBitMapLock" }; static void RegisterLWLockTranches(void); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 413d2ad12..0ef2dfbba 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -101,6 +101,10 @@ extern void heapgetpage(HeapScanDesc scan, BlockNumber page); extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); +/* + * Update snapshot used by the scan. 
+ */ +extern void heap_scan_update_snapshot(HeapScanDesc scan, Snapshot snapshot); extern Size heap_parallelscan_estimate(Snapshot snapshot); extern void heap_parallelscan_initialize(ParallelHeapScanDesc target, Size pscan_len, Relation relation, diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 63c37574f..16c01272d 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -15,10 +15,15 @@ #define NODEBITMAPHEAPSCAN_H #include "nodes/execnodes.h" +#include "access/parallel.h" extern BitmapHeapScanState* ExecInitBitmapHeapScan(BitmapHeapScan* node, EState* estate, int eflags); extern TupleTableSlot* ExecBitmapHeapScan(BitmapHeapScanState* node); extern void ExecEndBitmapHeapScan(BitmapHeapScanState* node); extern void ExecReScanBitmapHeapScan(BitmapHeapScanState* node); +extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt); +extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt, int nodeid); +extern void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt); +extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, void *context); #endif /* NODEBITMAPHEAPSCAN_H */ diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h index 04fbd236d..6f1df3e3d 100644 --- a/src/include/knl/knl_session.h +++ b/src/include/knl/knl_session.h @@ -2059,6 +2059,7 @@ struct ParallelHeapScanDescData; struct ParallelIndexScanDescData; struct ParallelHashJoinState; struct SharedHashInfo; +struct ParallelBitmapHeapState; typedef uint64 XLogRecPtr; typedef struct ParallelQueryInfo { struct SharedExecutorInstrumentation* instrumentation; @@ -2079,6 +2080,8 @@ typedef struct ParallelQueryInfo { ParallelHashJoinState** jstate; int hash_num; SharedHashInfo** shared_info; + int bmscan_num; + ParallelBitmapHeapState **bmscan; } ParallelQueryInfo; struct BTShared; diff --git 
a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 112d7ff58..379f68fed 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -21,6 +21,7 @@ #include "executor/instrument.h" #include "nodes/params.h" #include "nodes/plannodes.h" +#include "nodes/tidbitmap.h" #include "storage/pagecompress.h" #include "utils/bloom_filter.h" #include "utils/reltrigger.h" @@ -1714,15 +1715,65 @@ typedef struct BitmapIndexScanState { } BitmapIndexScanState; /* ---------------- - * BitmapHeapScanState information + * SharedBitmapState information * - * bitmapqualorig execution state for bitmapqualorig expressions - * tbm bitmap obtained from child index scan(s) - * tbmiterator iterator for scanning current pages - * tbmres current-page data - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target target prefetch distance + * BM_INITIAL TIDBitmap creation is not yet started, so first worker + * to see this state will set the state to BM_INPROGRESS + * and that process will be responsible for creating + * TIDBitmap. + * BM_INPROGRESS TIDBitmap creation is in progress; workers need to + * sleep until it's finished. + * BM_FINISHED TIDBitmap creation is done, so now all workers can + * proceed to iterate over TIDBitmap. 
+ * ---------------- + */ +typedef enum { + BM_INITIAL, + BM_INPROGRESS, + BM_FINISHED +} SharedBitmapState; + +/* ---------------- + * ParallelBitmapHeapState information + * tbmiterator iterator for scanning current pages + * prefetch_iterator iterator for prefetching ahead of current page + * mutex mutual exclusion for the prefetching variable + * and state + * prefetch_pages # pages prefetch iterator is ahead of current + * prefetch_target current target prefetch distance + * state current state of the TIDBitmap + * cv conditional wait variable + * phs_snapshot_data snapshot data shared to workers + * ---------------- + */ +typedef struct ParallelBitmapHeapState { + TBMSharedIteratorState *tbmiterator; + TBMSharedIteratorState *prefetch_iterator; + Size pscan_len; + int prefetch_pages; + int prefetch_target; + int plan_node_id; + SharedBitmapState state; + pthread_mutex_t cv_mtx; + pthread_cond_t cv; + char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; +} ParallelBitmapHeapState; + +/* ---------------- + * BitmapHeapScanState information + * + * bitmapqualorig execution state for bitmapqualorig expressions + * tbm bitmap obtained from child index scan(s) + * tbmiterator iterator for scanning current pages + * tbmres current-page data + * prefetch_iterator iterator for prefetching ahead of current page + * prefetch_pages # pages prefetch iterator is ahead of current + * prefetch_target target prefetch distance + * pscan_len size of the shared memory for parallel bitmap + * initialized is node is ready to iterate + * shared_tbmiterator shared iterator + * shared_prefetch_iterator shared iterator for prefetching + * pstate shared state for parallel bitmap scan * ---------------- */ typedef struct BitmapHeapScanState { @@ -1735,6 +1786,11 @@ typedef struct BitmapHeapScanState { int prefetch_pages; int prefetch_target; GPIScanDesc gpi_scan; /* global partition index scan use information */ + Size pscan_len; + bool initialized; + TBMSharedIterator *shared_tbmiterator; + 
TBMSharedIterator *shared_prefetch_iterator; + ParallelBitmapHeapState *pstate; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index fc5cd570c..868af7282 100755 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -507,6 +507,7 @@ typedef struct BitmapAnd { */ typedef struct BitmapOr { Plan plan; + bool isshared; List* bitmapplans; } BitmapOr; @@ -683,6 +684,7 @@ typedef struct IndexOnlyScan { typedef struct BitmapIndexScan { Scan scan; Oid indexid; /* OID of index to scan */ + bool isshared; char* indexname; /* name of index to scan */ List* indexqual; /* list of index quals (OpExprs) */ List* indexqualorig; /* the same in original form */ diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 1440eb66b..81df2e12b 100755 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -32,6 +32,8 @@ typedef struct TIDBitmap TIDBitmap; /* Likewise, TBMIterator is private */ typedef struct TBMIterator TBMIterator; +typedef struct TBMSharedIterator TBMSharedIterator; +typedef struct TBMSharedIteratorState TBMSharedIteratorState; /* Result structure for tbm_iterate */ typedef struct { @@ -44,8 +46,9 @@ typedef struct { } TBMIterateResult; /* function prototypes in nodes/tidbitmap.c */ -extern TIDBitmap* tbm_create(long maxbytes); +extern TIDBitmap* tbm_create(long maxbytes, MemoryContext dsa); extern void tbm_free(TIDBitmap* tbm); +extern void tbm_free_shared_area(TBMSharedIteratorState *istate); extern void tbm_add_tuples( TIDBitmap* tbm, const ItemPointer tids, int ntids, bool recheck, Oid partitionOid = InvalidOid); @@ -57,8 +60,12 @@ extern void tbm_intersect(TIDBitmap* a, const TIDBitmap* b); extern bool tbm_is_empty(const TIDBitmap* tbm); extern TBMIterator* tbm_begin_iterate(TIDBitmap* tbm); +extern TBMSharedIteratorState* tbm_prepare_shared_iterate(TIDBitmap *tbm); extern TBMIterateResult* tbm_iterate(TBMIterator* iterator); +extern 
TBMIterateResult* tbm_shared_iterate(TBMSharedIterator *iterator); extern void tbm_end_iterate(TBMIterator* iterator); +extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); +extern TBMSharedIterator *tbm_attach_shared_iterate(TBMSharedIteratorState* istate); extern bool tbm_is_global(const TIDBitmap* tbm); extern void tbm_set_global(TIDBitmap* tbm, bool isGlobal); #endif /* TIDBITMAP_H */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 250c1217e..78ba3291b 100755 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -179,6 +179,8 @@ extern double estimate_hash_num_distinct(PlannerInfo* root, List* hashkey, Path* double local_ndistinct, double global_ndistinct, bool* usesinglestats); extern RelOptInfo* find_join_input_rel(PlannerInfo* root, Relids relids); extern double compute_sort_disk_cost(double input_bytes, double sort_mem_bytes); +extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, + Path *bitmapqual, double loop_count, Cost *cost, double *tuple, bool ispartitionedindex); extern double approx_tuple_count(PlannerInfo* root, JoinPath* path, List* quals); extern void set_rel_path_rows(Path* path, RelOptInfo* rel, ParamPathInfo* param_info); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 97bc0d754..128176593 100755 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -66,8 +66,8 @@ extern bool CheckBitmapHeapPathContainGlobalOrLocal(Path* bitmapqual); extern bool check_bitmap_heap_path_index_unusable(Path* bitmapqual, RelOptInfo* baserel); extern bool is_partitionIndex_Subpath(Path* subpath); extern bool is_pwj_path(Path* pwjpath); -extern BitmapHeapPath* create_bitmap_heap_path( - PlannerInfo* root, RelOptInfo* rel, Path* bitmapqual, Relids required_outer, double loop_count); +extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual, + Relids required_outer, double 
loop_count, int parallel_degree); extern BitmapAndPath* create_bitmap_and_path(PlannerInfo* root, RelOptInfo* rel, List* bitmapquals); extern BitmapOrPath* create_bitmap_or_path(PlannerInfo* root, RelOptInfo* rel, List* bitmapquals); extern TidPath* create_tidscan_path(PlannerInfo* root, RelOptInfo* rel, List* tidquals); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 503f93684..3a97187fb 100755 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -21,6 +21,7 @@ extern RelOptInfo* standard_join_search(PlannerInfo* root, int levels_needed, Li extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); extern int compute_parallel_worker(const RelOptInfo *rel, double heap_pages, double index_pages, int max_workers); +extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual); extern void set_rel_size(PlannerInfo* root, RelOptInfo* rel, Index rti, RangeTblEntry* rte); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 47854170a..eff3fc75b 100755 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -146,6 +146,7 @@ enum BuiltinTrancheIds { LWTRANCHE_SHARED_TUPLESTORE, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_PARALLEL_HASH_JOIN, + LWTRANCHE_TBM, /* * Each trancheId above should have a corresponding item in BuiltinTrancheNames; */ diff --git a/src/test/regress/expected/parallel_query.out b/src/test/regress/expected/parallel_query.out index 9a0aff42c..0689d8008 100644 --- a/src/test/regress/expected/parallel_query.out +++ b/src/test/regress/expected/parallel_query.out @@ -72,14 +72,539 @@ select count(*) from parallel_t1 where a <> 5000; 99999 (1 row) +--normal plan for bitmapscan +create index idx_parallel_t1 on parallel_t1(a); +analyze parallel_t1; +set enable_seqscan to off; +set enable_indexscan to off; +explain (costs off) select count(*) from parallel_t1 where a > 5000; + QUERY PLAN 
+-------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on parallel_t1 + Recheck Cond: (a > 5000) + -> Bitmap Index Scan on idx_parallel_t1 + Index Cond: (a > 5000) +(5 rows) + +explain (costs off) select count(*) from parallel_t1 where a < 5000; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on parallel_t1 + Recheck Cond: (a < 5000) + -> Bitmap Index Scan on idx_parallel_t1 + Index Cond: (a < 5000) +(5 rows) + +explain (costs off) select count(*) from parallel_t1 where a <> 5000; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on parallel_t1 + Filter: (a <> 5000) + -> Bitmap Index Scan on idx_parallel_t1 +(4 rows) + +select count(*) from parallel_t1 where a > 5000; + count +------- + 95000 +(1 row) + +select count(*) from parallel_t1 where a < 5000; + count +------- + 4999 +(1 row) + +select count(*) from parallel_t1 where a <> 5000; + count +------- + 99999 +(1 row) + +reset enable_seqscan; +reset enable_indexscan; --set parallel parameter set force_parallel_mode=on; set parallel_setup_cost=0; set parallel_tuple_cost=0.000005; set max_parallel_workers_per_gather=2; set min_parallel_table_scan_size=0; +set min_parallel_index_scan_size=0; set parallel_leader_participation=on; +--rescan case +create table test_with_rescan(dm int, sj_dm int, name text); +insert into test_with_rescan values(1,0,'universe'); +insert into test_with_rescan values(2,1,'galaxy'); +insert into test_with_rescan values(3,2,'sun'); +insert into test_with_rescan values(4,3,'earth'); +insert into test_with_rescan values(5,4,'asia'); +insert into test_with_rescan values(6,5,'China'); +insert into test_with_rescan values(7,6,'shaanxi'); +insert into test_with_rescan values(8,7,'xian'); +insert into test_with_rescan values(9,8,'huawei'); +insert into test_with_rescan values(10,9,'v10'); +insert into test_with_rescan values(11,10,'v10-3L'); +insert into 
test_with_rescan values(12,11,'gauss'); +insert into test_with_rescan values(13,12,'test'); +insert into test_with_rescan values(14,13,'test'); +insert into test_with_rescan values(15,14,'test'); +insert into test_with_rescan values(16,15,'test'); +insert into test_with_rescan values(17,16,'test'); +insert into test_with_rescan values(18,17,'test'); +insert into test_with_rescan values(19,18,'test'); +insert into test_with_rescan values(20,19,'test'); +create index on test_with_rescan(dm); +create index on test_with_rescan(sj_dm); +create index on test_with_rescan(name); +analyze test_with_rescan; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on t_result t + CTE t_result + -> Recursive Union + -> Limit + -> Sort + Sort Key: test_with_rescan.dm + -> Gather + Number of Workers: 1 + -> Parallel Seq Scan on test_with_rescan + Filter: (sj_dm < 10) + -> Hash Join + Hash Cond: (t1.dm = t2.sj_dm) + -> WorkTable Scan on t_result t1 + -> Hash + -> Gather + Number of Workers: 1 + -> Parallel Seq Scan on test_with_rescan t2 +(17 rows) + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + dm | sj_dm | name | level +----+-------+-------------------------------------------------------------------------------------------------------------------------------------+------- + 3 | 2 | sun | 1 + 4 | 3 | earth | 1 + 4 | 3 | earth 
> sun | 2 + 5 | 4 | asia | 1 + 5 | 4 | asia > earth | 2 + 5 | 4 | asia > earth > sun | 3 + 6 | 5 | China | 1 + 6 | 5 | China > asia | 2 + 6 | 5 | China > asia > earth | 3 + 6 | 5 | China > asia > earth > sun | 4 + 7 | 6 | shaanxi | 1 + 7 | 6 | shaanxi > China | 2 + 7 | 6 | shaanxi > China > asia | 3 + 7 | 6 | shaanxi > China > asia > earth | 4 + 7 | 6 | shaanxi > China > asia > earth > sun | 5 + 8 | 7 | xian | 1 + 8 | 7 | xian > shaanxi | 2 + 8 | 7 | xian > shaanxi > China | 3 + 8 | 7 | xian > shaanxi > China > asia | 4 + 8 | 7 | xian > shaanxi > China > asia > earth | 5 + 8 | 7 | xian > shaanxi > China > asia > earth > sun | 6 + 9 | 8 | huawei > xian | 2 + 9 | 8 | huawei > xian > shaanxi | 3 + 9 | 8 | huawei > xian > shaanxi > China | 4 + 9 | 8 | huawei > xian > shaanxi > China > asia | 5 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth | 6 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth > sun | 7 + 10 | 9 | v10 > huawei > xian | 3 + 10 | 9 | v10 > huawei > xian > shaanxi | 4 + 10 | 9 | v10 > huawei > xian > shaanxi > China | 5 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia | 6 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth | 7 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth > sun | 8 + 11 | 10 | v10-3L > v10 > huawei > xian | 4 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi | 5 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China | 6 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia | 7 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 8 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian | 5 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi | 6 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 7 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 8 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > 
earth | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian | 6 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 7 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 8 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 9 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian | 7 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 8 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 9 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 10 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian | 8 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 9 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 10 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 11 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 9 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 10 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 11 + 16 | 15 | test > test > test > 
test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 12 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 10 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 11 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 12 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 13 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 11 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 12 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 13 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 14 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 12 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 13 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > 
China | 14 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 15 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 17 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 13 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 14 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 15 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 16 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 17 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 18 +(93 rows) + +--increate cpu_tuple_cost, disable seqscan, test parallel index scan +set enable_seqscan=off; +set cpu_tuple_cost=1000; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + CTE Scan on t_result t + CTE t_result + -> Recursive Union + -> Limit + -> Sort + Sort Key: test_with_rescan.dm + -> Gather + Number of Workers: 1 + -> Parallel Index Scan using 
test_with_rescan_sj_dm_idx on test_with_rescan + Index Cond: (sj_dm < 10) + -> Merge Join + Merge Cond: (t1.dm = t2.sj_dm) + -> Sort + Sort Key: t1.dm + -> WorkTable Scan on t_result t1 + -> Sort + Sort Key: t2.sj_dm + -> Gather + Number of Workers: 1 + -> Parallel Index Scan using test_with_rescan_sj_dm_idx on test_with_rescan t2 +(20 rows) + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + dm | sj_dm | name | level +----+-------+-------------------------------------------------------------------------------------------------------------------------------------+------- + 3 | 2 | sun | 1 + 4 | 3 | earth | 1 + 4 | 3 | earth > sun | 2 + 5 | 4 | asia | 1 + 5 | 4 | asia > earth | 2 + 5 | 4 | asia > earth > sun | 3 + 6 | 5 | China | 1 + 6 | 5 | China > asia | 2 + 6 | 5 | China > asia > earth | 3 + 6 | 5 | China > asia > earth > sun | 4 + 7 | 6 | shaanxi | 1 + 7 | 6 | shaanxi > China | 2 + 7 | 6 | shaanxi > China > asia | 3 + 7 | 6 | shaanxi > China > asia > earth | 4 + 7 | 6 | shaanxi > China > asia > earth > sun | 5 + 8 | 7 | xian | 1 + 8 | 7 | xian > shaanxi | 2 + 8 | 7 | xian > shaanxi > China | 3 + 8 | 7 | xian > shaanxi > China > asia | 4 + 8 | 7 | xian > shaanxi > China > asia > earth | 5 + 8 | 7 | xian > shaanxi > China > asia > earth > sun | 6 + 9 | 8 | huawei > xian | 2 + 9 | 8 | huawei > xian > shaanxi | 3 + 9 | 8 | huawei > xian > shaanxi > China | 4 + 9 | 8 | huawei > xian > shaanxi > China > asia | 5 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth | 6 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth > sun | 7 + 10 | 9 | v10 > huawei > xian | 3 + 10 | 9 | v10 > huawei > xian > shaanxi | 4 + 10 | 9 | v10 > huawei > xian > shaanxi > China | 5 + 10 | 9 | v10 > 
huawei > xian > shaanxi > China > asia | 6 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth | 7 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth > sun | 8 + 11 | 10 | v10-3L > v10 > huawei > xian | 4 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi | 5 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China | 6 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia | 7 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 8 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian | 5 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi | 6 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 7 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 8 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian | 6 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 7 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 8 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 9 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian | 7 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 8 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 9 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 10 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > 
shaanxi > China > asia > earth > sun | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian | 8 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 9 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 10 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 11 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 9 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 10 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 11 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 12 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 10 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 11 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 12 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 13 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 11 + 18 | 17 | 
test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 12 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 13 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 14 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 12 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 13 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 14 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 15 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 17 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 13 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 14 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 15 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 16 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 17 + 20 | 19 | test > test > 
test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 18 +(93 rows) + +--disable indexscan, test parallel bitmap scan +set enable_indexscan to off; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + QUERY PLAN +--------------------------------------------------------------------------------------- + CTE Scan on t_result t + CTE t_result + -> Recursive Union + -> Limit + -> Sort + Sort Key: test_with_rescan.dm + -> Gather + Number of Workers: 1 + -> Parallel Bitmap Heap Scan on test_with_rescan + Recheck Cond: (sj_dm < 10) + -> Bitmap Index Scan on test_with_rescan_sj_dm_idx + Index Cond: (sj_dm < 10) + -> Nested Loop + -> WorkTable Scan on t_result t1 + -> Bitmap Heap Scan on test_with_rescan t2 + Recheck Cond: (sj_dm = t1.dm) + -> Bitmap Index Scan on test_with_rescan_sj_dm_idx + Index Cond: (sj_dm = t1.dm) +(18 rows) + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + dm | sj_dm | name | level +----+-------+-------------------------------------------------------------------------------------------------------------------------------------+------- + 3 | 2 | sun | 1 + 4 | 3 | earth | 1 + 4 | 3 | earth > sun | 2 + 5 | 4 | asia | 1 + 5 | 4 | asia > earth | 2 + 5 | 4 | asia > earth > sun | 3 + 6 | 5 | China | 1 + 6 | 5 | China > asia | 2 + 6 | 5 | China > asia > earth | 3 + 6 | 5 | China > asia > earth > sun | 4 + 7 | 6 | shaanxi | 1 + 7 | 6 | 
shaanxi > China | 2 + 7 | 6 | shaanxi > China > asia | 3 + 7 | 6 | shaanxi > China > asia > earth | 4 + 7 | 6 | shaanxi > China > asia > earth > sun | 5 + 8 | 7 | xian | 1 + 8 | 7 | xian > shaanxi | 2 + 8 | 7 | xian > shaanxi > China | 3 + 8 | 7 | xian > shaanxi > China > asia | 4 + 8 | 7 | xian > shaanxi > China > asia > earth | 5 + 8 | 7 | xian > shaanxi > China > asia > earth > sun | 6 + 9 | 8 | huawei > xian | 2 + 9 | 8 | huawei > xian > shaanxi | 3 + 9 | 8 | huawei > xian > shaanxi > China | 4 + 9 | 8 | huawei > xian > shaanxi > China > asia | 5 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth | 6 + 9 | 8 | huawei > xian > shaanxi > China > asia > earth > sun | 7 + 10 | 9 | v10 > huawei > xian | 3 + 10 | 9 | v10 > huawei > xian > shaanxi | 4 + 10 | 9 | v10 > huawei > xian > shaanxi > China | 5 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia | 6 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth | 7 + 10 | 9 | v10 > huawei > xian > shaanxi > China > asia > earth > sun | 8 + 11 | 10 | v10-3L > v10 > huawei > xian | 4 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi | 5 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China | 6 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia | 7 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 8 + 11 | 10 | v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian | 5 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi | 6 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 7 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 8 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 9 + 12 | 11 | gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian | 6 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 7 + 13 | 12 
| test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 8 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 9 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 10 + 13 | 12 | test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian | 7 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 8 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 9 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 10 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 11 + 14 | 13 | test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian | 8 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 9 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 10 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 11 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 12 + 15 | 14 | test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 9 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 10 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 11 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 12 + 16 | 15 | test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 13 + 16 | 15 | test > test > test > test > gauss > v10-3L 
> v10 > huawei > xian > shaanxi > China > asia > earth > sun | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 10 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 11 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 12 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 13 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 14 + 17 | 16 | test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 11 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 12 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 13 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 14 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 15 + 18 | 17 | test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 12 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 13 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 14 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 15 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi 
> China > asia > earth | 16 + 19 | 18 | test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 17 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian | 13 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi | 14 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China | 15 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia | 16 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth | 17 + 20 | 19 | test > test > test > test > test > test > test > test > gauss > v10-3L > v10 > huawei > xian > shaanxi > China > asia > earth > sun | 18 +(93 rows) + +drop table test_with_rescan; +reset enable_seqscan; +reset enable_indexscan; +reset cpu_tuple_cost; --parallel plan for seq scan +set enable_bitmapscan to off; +set enable_indexscan to off; explain (costs off) select count(*) from parallel_t1; QUERY PLAN ---------------------------------------------- @@ -174,6 +699,100 @@ explain (costs off,analyse on,verbose on) select count(*) from parallel_t1; --? 
Total runtime: [0-9]*\.[0-9]*\.\.[0-9]*\.[0-9]* ms (10 rows) +reset enable_indexscan; +reset enable_bitmapscan; +--parallel plan for bitmap scan +--onepage case +CREATE TABLE onepage1 (val int, val2 int); +ALTER TABLE onepage1 ADD PRIMARY KEY(val, val2); +NOTICE: ALTER TABLE / ADD PRIMARY KEY will create implicit index "onepage1_pkey" for table "onepage1" +insert into onepage1 (select * from generate_series(1, 5) a, generate_series(1, 5) b); +CREATE TABLE onepage2 as select * from onepage1; +explain select * from onepage2 natural join onepage1 where onepage2.val > 2 and onepage1.val < 4; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Gather (cost=15.04..32.69 rows=1 width=8) + Number of Workers: 2 + -> Parallel Hash Join (cost=15.04..32.69 rows=1 width=8) + Hash Cond: ((onepage2.val = onepage1.val) AND (onepage2.val2 = onepage1.val2)) + -> Parallel Seq Scan on onepage2 (cost=0.00..17.60 rows=5 width=8) + Filter: ((val > 2) AND (val < 4)) + -> Parallel Hash (cost=14.97..14.97 rows=5 width=8) + -> Parallel Bitmap Heap Scan on onepage1 (cost=4.36..14.97 rows=5 width=8) + Recheck Cond: ((val < 4) AND (val > 2)) + -> Bitmap Index Scan on onepage1_pkey (cost=0.00..4.36 rows=11 width=0) + Index Cond: ((val < 4) AND (val > 2)) +(11 rows) + +select * from onepage2 natural join onepage1 where onepage2.val > 2 and onepage1.val < 4 order by 1,2; + val | val2 +-----+------ + 3 | 1 + 3 | 2 + 3 | 3 + 3 | 4 + 3 | 5 +(5 rows) + +drop table onepage1; +drop table onepage2; +set enable_seqscan to off; +set enable_indexscan to off; +explain (costs off) select count(*) from parallel_t1 where a > 5000; + QUERY PLAN +-------------------------------------------------------- + Aggregate + -> Gather + Number of Workers: 2 + -> Parallel Bitmap Heap Scan on parallel_t1 + Recheck Cond: (a > 5000) + -> Bitmap Index Scan on idx_parallel_t1 + Index Cond: (a > 5000) +(7 rows) + +explain (costs off) select count(*) from parallel_t1 
where a < 5000; + QUERY PLAN +-------------------------------------------------------- + Aggregate + -> Gather + Number of Workers: 2 + -> Parallel Bitmap Heap Scan on parallel_t1 + Recheck Cond: (a < 5000) + -> Bitmap Index Scan on idx_parallel_t1 + Index Cond: (a < 5000) +(7 rows) + +explain (costs off) select count(*) from parallel_t1 where a <> 5000; + QUERY PLAN +-------------------------------------------------------- + Aggregate + -> Gather + Number of Workers: 2 + -> Parallel Bitmap Heap Scan on parallel_t1 + Filter: (a <> 5000) + -> Bitmap Index Scan on idx_parallel_t1 +(6 rows) + +select count(*) from parallel_t1 where a > 5000; + count +------- + 95000 +(1 row) + +select count(*) from parallel_t1 where a < 5000; + count +------- + 4999 +(1 row) + +select count(*) from parallel_t1 where a <> 5000; + count +------- + 99999 +(1 row) + +reset enable_seqscan; +reset enable_indexscan; --clean up reset force_parallel_mode; reset parallel_setup_cost; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1eeca6d1a..9f2a16cbe 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -29,8 +29,10 @@ test: misc test: stats test: alter_system_set -# parallel query -test: parallel_query parallel_nested_loop parallel_hashjoin parallel_append parallel_create_index +# parallel query, don't put more than 2 parallel query testcases into one test group +test: parallel_query parallel_nested_loop +test: parallel_hashjoin parallel_append +test: parallel_create_index #dispatch from 13 test: function diff --git a/src/test/regress/parallel_schedule3 b/src/test/regress/parallel_schedule3 index ae1e81124..bc689a319 100644 --- a/src/test/regress/parallel_schedule3 +++ b/src/test/regress/parallel_schedule3 @@ -26,5 +26,7 @@ test: upsert_grammer_test_02 upsert_restriction upsert_composite test: upsert_trigger_test upsert_explain test: upsert_clean -# test parallel query -test: parallel_query parallel_nested_loop 
parallel_hashjoin parallel_append parallel_create_index +# test parallel query, don't put more than 2 parallel query testcases into one test group +test: parallel_query parallel_nested_loop +test: parallel_hashjoin parallel_append +test: parallel_create_index diff --git a/src/test/regress/sql/parallel_query.sql b/src/test/regress/sql/parallel_query.sql index 405291d2d..e7d2bbc14 100644 --- a/src/test/regress/sql/parallel_query.sql +++ b/src/test/regress/sql/parallel_query.sql @@ -14,15 +14,149 @@ select count(*) from parallel_t1 where a > 5000; select count(*) from parallel_t1 where a < 5000; select count(*) from parallel_t1 where a <> 5000; +--normal plan for bitmapscan +create index idx_parallel_t1 on parallel_t1(a); +analyze parallel_t1; +set enable_seqscan to off; +set enable_indexscan to off; +explain (costs off) select count(*) from parallel_t1 where a > 5000; +explain (costs off) select count(*) from parallel_t1 where a < 5000; +explain (costs off) select count(*) from parallel_t1 where a <> 5000; +select count(*) from parallel_t1 where a > 5000; +select count(*) from parallel_t1 where a < 5000; +select count(*) from parallel_t1 where a <> 5000; +reset enable_seqscan; +reset enable_indexscan; + --set parallel parameter set force_parallel_mode=on; set parallel_setup_cost=0; set parallel_tuple_cost=0.000005; set max_parallel_workers_per_gather=2; set min_parallel_table_scan_size=0; +set min_parallel_index_scan_size=0; set parallel_leader_participation=on; +--rescan case +create table test_with_rescan(dm int, sj_dm int, name text); +insert into test_with_rescan values(1,0,'universe'); +insert into test_with_rescan values(2,1,'galaxy'); +insert into test_with_rescan values(3,2,'sun'); +insert into test_with_rescan values(4,3,'earth'); +insert into test_with_rescan values(5,4,'asia'); +insert into test_with_rescan values(6,5,'China'); +insert into test_with_rescan values(7,6,'shaanxi'); +insert into test_with_rescan values(8,7,'xian'); +insert into 
test_with_rescan values(9,8,'huawei'); +insert into test_with_rescan values(10,9,'v10'); +insert into test_with_rescan values(11,10,'v10-3L'); +insert into test_with_rescan values(12,11,'gauss'); +insert into test_with_rescan values(13,12,'test'); +insert into test_with_rescan values(14,13,'test'); +insert into test_with_rescan values(15,14,'test'); +insert into test_with_rescan values(16,15,'test'); +insert into test_with_rescan values(17,16,'test'); +insert into test_with_rescan values(18,17,'test'); +insert into test_with_rescan values(19,18,'test'); +insert into test_with_rescan values(20,19,'test'); +create index on test_with_rescan(dm); +create index on test_with_rescan(sj_dm); +create index on test_with_rescan(name); +analyze test_with_rescan; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + +--increate cpu_tuple_cost, disable seqscan, test parallel index scan +set enable_seqscan=off; +set cpu_tuple_cost=1000; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE 
sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + +--disable indexscan, test parallel bitmap scan +set enable_indexscan to off; +explain (costs off) +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t; + +WITH recursive t_result AS ( +select * from( +SELECT dm,sj_dm,name,1 as level +FROM test_with_rescan +WHERE sj_dm < 10 order by dm limit 6 offset 2) +UNION all +SELECT t2.dm,t2.sj_dm,t2.name||' > '||t1.name,t1.level+1 +FROM t_result t1 +JOIN test_with_rescan t2 ON t2.sj_dm = t1.dm +) +SELECT * +FROM t_result t order by 1,2,3,4; + +drop table test_with_rescan; +reset enable_seqscan; +reset enable_indexscan; +reset cpu_tuple_cost; + --parallel plan for seq scan +set enable_bitmapscan to off; +set enable_indexscan to off; explain (costs off) select count(*) from parallel_t1; explain (costs off) select count(*) from parallel_t1 where a = 5000; explain (costs off) select count(*) from parallel_t1 where a > 5000; @@ -34,6 +168,29 @@ select count(*) from parallel_t1 where a > 5000; select count(*) from parallel_t1 where a < 5000; select count(*) from parallel_t1 where a <> 5000; explain (costs off,analyse on,verbose on) select count(*) from parallel_t1; +reset enable_indexscan; +reset enable_bitmapscan; + +--parallel plan for bitmap scan +--onepage case +CREATE TABLE onepage1 (val int, val2 int); +ALTER TABLE onepage1 ADD PRIMARY KEY(val, val2); +insert into onepage1 (select * from generate_series(1, 5) a, generate_series(1, 5) b); +CREATE TABLE onepage2 as select * from onepage1; +explain select * from onepage2 natural join onepage1 where 
onepage2.val > 2 and onepage1.val < 4; +select * from onepage2 natural join onepage1 where onepage2.val > 2 and onepage1.val < 4 order by 1,2; +drop table onepage1; +drop table onepage2; +set enable_seqscan to off; +set enable_indexscan to off; +explain (costs off) select count(*) from parallel_t1 where a > 5000; +explain (costs off) select count(*) from parallel_t1 where a < 5000; +explain (costs off) select count(*) from parallel_t1 where a <> 5000; +select count(*) from parallel_t1 where a > 5000; +select count(*) from parallel_t1 where a < 5000; +select count(*) from parallel_t1 where a <> 5000; +reset enable_seqscan; +reset enable_indexscan; --clean up reset force_parallel_mode;