1371 lines
49 KiB
C++
1371 lines
49 KiB
C++
/* -------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.cpp
 *	  Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotNow or one of the other
 * special snapshots).	The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned.
 * With SnapshotNow we might return a tuple that doesn't meet the required
 * index qual conditions.
 *
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 * Portions Copyright (c) 2021, openGauss Contributors
 *
 *
 * IDENTIFICATION
 *	  src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp
 *
 * -------------------------------------------------------------------------
 *
 * INTERFACE ROUTINES
 *		ExecBitmapHeapScan			scans a relation using bitmap info
 *		ExecBitmapHeapNext			workhorse for above
 *		ExecInitBitmapHeapScan		creates and initializes state info.
 *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
 *		ExecEndBitmapHeapScan		releases all storage.
 */
|
|
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#include "access/relscan.h"
|
|
#include "access/tableam.h"
|
|
#include "access/transam.h"
|
|
#include "catalog/pg_partition_fn.h"
|
|
#include "commands/cluster.h"
|
|
#include "executor/exec/execdebug.h"
|
|
#include "executor/node/nodeBitmapHeapscan.h"
|
|
#include "pgstat.h"
|
|
#include "storage/buf/bufmgr.h"
|
|
#include "storage/predicate.h"
|
|
#include "storage/tcap.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/rel_gs.h"
|
|
#include "utils/snapmgr.h"
|
|
#include "gstrace/gstrace_infra.h"
|
|
#include "gstrace/access_gstrace.h"
|
|
#include "nodes/execnodes.h"
|
|
#include "access/ustore/knl_uscan.h"
|
|
|
|
#include "nodes/makefuncs.h"
|
|
#include "optimizer/pruning.h"
|
|
|
|
#include "nodes/makefuncs.h"
|
|
#include "optimizer/pruning.h"
|
|
|
|
static TupleTableSlot* ExecBitmapHeapScan(PlanState* state);
|
|
static TupleTableSlot* BitmapHbucketTblNext(BitmapHeapScanState* node);
|
|
static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node);
|
|
bool heapam_scan_bitmap_next_block(TableScanDesc scan, TBMIterateResult* tbmres,
|
|
bool* has_cur_xact_write = NULL);
|
|
static void ExecInitPartitionForBitmapHeapScan(BitmapHeapScanState* scanstate, EState* estate);
|
|
static void ExecInitNextPartitionForBitmapHeapScan(BitmapHeapScanState* node);
|
|
void BitmapHeapPrefetchNext(
|
|
BitmapHeapScanState* node, TableScanDesc scan, const TIDBitmap* tbm, TBMIterator** prefetch_iterator);
|
|
|
|
/* This struct is used for partition switch while prefetch pages */
|
|
typedef struct PrefetchNode {
|
|
BlockNumber blockNum;
|
|
Oid partOid;
|
|
int2 bktId;
|
|
} PrefetchNode;
|
|
|
|
/*
 * BitmapHeapFree
 *
 * Drop the bitmap state cached in the node: end the main and the prefetch
 * iterator (if present), free the bitmap itself (if present), and forget the
 * current iterate result.  Every released pointer is reset to NULL, so the
 * routine can be called again on an already-cleaned node.
 */
void BitmapHeapFree(BitmapHeapScanState* node)
{
    /* Main iterator first, prefetch iterator second -- same order as before. */
    TBMIterator** iterSlots[] = {&node->tbmiterator, &node->prefetch_iterator};
    const unsigned int slotCount = sizeof(iterSlots) / sizeof(iterSlots[0]);

    for (unsigned int i = 0; i < slotCount; i++) {
        TBMIterator** slot = iterSlots[i];
        if (*slot == NULL) {
            continue;
        }
        tbm_end_iterate(*slot);
        *slot = NULL;
    }

    if (node->tbm != NULL) {
        tbm_free(node->tbm);
        node->tbm = NULL;
    }

    node->tbmres = NULL;
}
|
|
/*
 * BitmapHbucketTblNext
 *
 * Tuple-fetch routine installed for relations that own hash buckets.
 * It publishes the current bucket slot to this node and to its left subtree,
 * then delegates to BitmapHeapTblNext.
 *
 * - When the bitmap comes from a cross-bucket index, whatever BitmapHeapTblNext
 *   produced (tuple or empty slot) is returned as-is.
 * - Otherwise an empty result means the current bucket is exhausted: advance to
 *   the next bucket, discard the bitmap state and try again.  NULL is returned
 *   once no bucket is left.
 */
static TupleTableSlot* BitmapHbucketTblNext(BitmapHeapScanState* node)
{
    Assert(node->ss.ss_currentScanDesc != NULL);
    HBktTblScanDesc bucketScan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;

    for (;;) {
        node->ss.ps.hbktScanSlot.currSlot = bucketScan->curr_slot;
        node->ss.ps.lefttree->hbktScanSlot.currSlot = bucketScan->curr_slot;

        TupleTableSlot* result = BitmapHeapTblNext(node);

        /* cross-bucket index: always hand back the result; otherwise only a real tuple */
        if (tbm_is_crossbucket(node->tbm) || !TupIsNull(result)) {
            return result;
        }

        /* current bucket is drained; stop when there is no further bucket */
        if (!hbkt_bitmapheap_scan_nextbucket(bucketScan)) {
            return NULL;
        }

        /* the bitmap belongs to the previous bucket, so rebuild it on the next pass */
        BitmapHeapFree(node);
    }
}
|
|
|
|
/*
 * HeapamScanBitmapNextTuple
 *
 * Store the next visible tuple of the current page into 'slot'.
 *
 * scan   - heap scan descriptor whose rs_vistuples/rs_ntuples/rs_cindex describe
 *          the visible tuples of the page held in rs_cbuf
 * tbmres - bitmap iterate result; only its block number is used, to build the TID
 * slot   - destination slot; it is handed the scan's current buffer
 *
 * Returns false when rs_cindex is outside [0, rs_ntuples), i.e. the page has no
 * further tuple; returns true after storing a tuple and advancing rs_cindex.
 */
bool HeapamScanBitmapNextTuple(TableScanDesc scan, TBMIterateResult *tbmres, TupleTableSlot *slot)
{
    HeapScanDesc heapScan = (HeapScanDesc)scan;

    bool inRange = (heapScan->rs_base.rs_cindex >= 0) &&
                   (heapScan->rs_base.rs_cindex < heapScan->rs_base.rs_ntuples);
    if (!inRange) {
        /* nothing more to look at on this page */
        return false;
    }

    /* Locate the line pointer of the tuple we are about to return. */
    OffsetNumber curOffset = heapScan->rs_base.rs_vistuples[heapScan->rs_base.rs_cindex];
    Page pageData = (Page)BufferGetPage(heapScan->rs_base.rs_cbuf);
    ItemId itemId = PageGetItemId(pageData, curOffset);
    Assert(ItemIdIsNormal(itemId));

    /* Fill in the scan's current-tuple header so that it points into the page. */
    heapScan->rs_ctup.t_data = (HeapTupleHeader)PageGetItem((Page)pageData, itemId);
    heapScan->rs_ctup.t_len = ItemIdGetLength(itemId);
    heapScan->rs_ctup.t_tableOid = RelationGetRelid(heapScan->rs_base.rs_rd);
    heapScan->rs_ctup.t_bucketId = RelationGetBktid(heapScan->rs_base.rs_rd);
    HeapTupleCopyBaseFromPage(&heapScan->rs_ctup, pageData);
    ItemPointerSet(&heapScan->rs_ctup.t_self, tbmres->blockno, curOffset);

    pgstat_count_heap_fetch(heapScan->rs_base.rs_rd);

    /* Point the result slot at the tuple; the slot takes its own pin on the buffer. */
    (void)ExecStoreTuple(&heapScan->rs_ctup, slot, heapScan->rs_base.rs_cbuf, false);

    heapScan->rs_base.rs_cindex++;
    return true;
}
|
|
|
|
/*
 * TableScanBitmapNextTuple
 *
 * Dispatch the "next tuple on the current page" step to the ustore or the heap
 * implementation, depending on the storage format of the scanned relation.
 * Returns whatever the selected implementation returns.
 */
static bool TableScanBitmapNextTuple(TableScanDesc scan, TBMIterateResult *tbmres, TupleTableSlot *slot)
{
    if (RelationIsUstoreFormat(scan->rs_rd)) {
        return UHeapScanBitmapNextTuple(scan, tbmres, slot);
    }
    return HeapamScanBitmapNextTuple(scan, tbmres, slot);
}
|
|
|
|
/*
 * TableScanBitmapNextBlock
 *
 * Dispatch the "fetch the page named by tbmres" step to the ustore or the heap
 * implementation, depending on the storage format of the scanned relation.
 * has_cur_xact_write is forwarded untouched.  Returns whatever the selected
 * implementation returns.
 */
static bool TableScanBitmapNextBlock(TableScanDesc scan, TBMIterateResult *tbmres, bool* has_cur_xact_write)
{
    if (RelationIsUstoreFormat(scan->rs_rd)) {
        return UHeapScanBitmapNextBlock(scan, tbmres, has_cur_xact_write);
    }
    return heapam_scan_bitmap_next_block(scan, tbmres, has_cur_xact_write);
}
|
|
|
|
/*
|
|
* This is intended to locate the target child relation (partition or bucket).
|
|
* It is only applied to the underlying scan is a global index scan (GPI or GPI+CBI).
|
|
*
|
|
* Return values: 0: success; -1: fail; 1: need to prefetch.
|
|
*/
|
|
static int TableScanBitmapNextTargetRel(TableScanDesc scan, BitmapHeapScanState *node)
|
|
{
|
|
Assert(scan != NULL);
|
|
Assert(node != NULL);
|
|
Assert(node->tbm != NULL);
|
|
Assert(node->tbmres != NULL);
|
|
|
|
bool result = true;
|
|
TIDBitmap *tbm = node->tbm;
|
|
TBMIterateResult *tbmres = node->tbmres;
|
|
int2 bucketid = InvalidBktId;
|
|
bool need_reset_bucketid = false;
|
|
|
|
/* Check whether switch partition-fake-rel, use rd_rel save. */
|
|
if (BitmapNodeNeedSwitchPartRel(node)) { /* for global partitioned index */
|
|
GPISetCurrPartOid(node->gpi_scan, tbmres->partitionOid);
|
|
if (!GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock)) {
|
|
/* return 1 to indicate caller may need to call prefetch */
|
|
return 1;
|
|
}
|
|
scan->rs_rd = node->gpi_scan->fakePartRelation;
|
|
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
|
|
|
|
/*
|
|
* Reset the scanning bucketid to force reloading
|
|
* the target bucket relation in this new partition.
|
|
*/
|
|
need_reset_bucketid = true;
|
|
}
|
|
|
|
/*
|
|
* Check whether need to switch bucket, if the underlying
|
|
* indexscan is a cross-bucket indexscan.
|
|
*/
|
|
if (tbm_is_crossbucket(tbm)) { /* for crossbucket index */
|
|
bucketid = tbmres->bucketid; /* set to the current iterating bucketid */
|
|
Assert(BUCKET_NODE_IS_VALID(bucketid));
|
|
|
|
need_reset_bucketid = (need_reset_bucketid || (scan->rs_rd->rd_node.bucketNode != bucketid));
|
|
|
|
if (need_reset_bucketid ) {
|
|
cbi_set_bucketid(node->cbi_scan, InvalidBktId);
|
|
}
|
|
|
|
if (cbi_scan_need_change_bucket(node->cbi_scan, bucketid)) {
|
|
cbi_set_bucketid(node->cbi_scan, bucketid);
|
|
result = cbi_bitmapheap_scan_nextbucket((HBktTblScanDesc)node->ss.ss_currentScanDesc, node->gpi_scan,
|
|
node->cbi_scan);
|
|
}
|
|
}
|
|
|
|
return (result ? 0 : -1);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* BitmapHeapNext
|
|
*
|
|
* Retrieve next tuple from the BitmapHeapScan node's currentRelation
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot* BitmapHeapTblNext(BitmapHeapScanState* node)
|
|
{
|
|
ExprContext* econtext = NULL;
|
|
TableScanDesc scan = NULL;
|
|
TIDBitmap* tbm = NULL;
|
|
TBMHandler tbm_handler;
|
|
TBMIterator* tbmiterator = NULL;
|
|
TBMIterateResult* tbmres = NULL;
|
|
HBktTblScanDesc hpscan = NULL;
|
|
|
|
#ifdef USE_PREFETCH
|
|
TBMIterator* prefetch_iterator = NULL;
|
|
#endif
|
|
TupleTableSlot* slot = NULL;
|
|
|
|
/*
|
|
* extract necessary information from index scan node
|
|
*/
|
|
econtext = node->ss.ps.ps_ExprContext;
|
|
slot = node->ss.ss_ScanTupleSlot;
|
|
if (node->ss.ss_currentRelation != NULL && RelationIsPartitionedHashBucketTable(node->ss.ss_currentRelation)) {
|
|
Assert(node->ss.ss_currentScanDesc != NULL);
|
|
hpscan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;
|
|
scan = (TableScanDesc)hpscan->currBktScan;
|
|
} else {
|
|
scan = GetTableScanDesc(node->ss.ss_currentScanDesc, node->ss.ss_currentRelation);
|
|
}
|
|
tbm = node->tbm;
|
|
tbmiterator = node->tbmiterator;
|
|
tbmres = node->tbmres;
|
|
#ifdef USE_PREFETCH
|
|
prefetch_iterator = node->prefetch_iterator;
|
|
#endif
|
|
|
|
/*
|
|
* If we haven't yet performed the underlying index scan, do it, and begin
|
|
* the iteration over the bitmap.
|
|
*
|
|
* For prefetching, we use *two* iterators, one for the pages we are
|
|
* actually scanning and another that runs ahead of the first for
|
|
* prefetching. node->prefetch_pages tracks exactly how many pages ahead
|
|
* the prefetch iterator is. Also, node->prefetch_target tracks the
|
|
* desired prefetch distance, which starts small and increases up to the
|
|
* GUC-controlled maximum, target_prefetch_pages. This is to avoid doing
|
|
* a lot of prefetching in a scan that stops after a few tuples because of
|
|
* a LIMIT.
|
|
*/
|
|
if (tbm == NULL) {
|
|
tbm = (TIDBitmap*)MultiExecProcNode(outerPlanState(node));
|
|
tbm_handler = tbm_get_handler(tbm);
|
|
|
|
if (tbm == NULL || !IsA(tbm, TIDBitmap)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE),
|
|
errmodule(MOD_EXECUTOR),
|
|
errmsg("unrecognized result from subplan for BitmapHeapScan.")));
|
|
}
|
|
|
|
node->tbm = tbm;
|
|
node->tbmiterator = tbmiterator = tbm_handler._begin_iterate(tbm);
|
|
node->tbmres = tbmres = NULL;
|
|
|
|
#ifdef USE_PREFETCH
|
|
if (u_sess->storage_cxt.target_prefetch_pages > 0) {
|
|
node->prefetch_iterator = prefetch_iterator = tbm_handler._begin_iterate(tbm);
|
|
node->prefetch_pages = 0;
|
|
node->prefetch_target = -1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Now tbm is not NULL, we have enough information to
|
|
* determine whether need to assign hpscan. Also need
|
|
* to make sure we are not scanning a virtual hashbucket
|
|
* table.
|
|
*/
|
|
if (hpscan == NULL && tbm_is_crossbucket(tbm) && RELATION_OWN_BUCKET(node->ss.ss_currentScanDesc->rs_rd)) {
|
|
hpscan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;
|
|
}
|
|
|
|
for (;;) {
|
|
/*
|
|
* Get next page of results if needed
|
|
*/
|
|
if (tbmres == NULL) {
|
|
node->tbmres = tbmres = tbm_iterate(tbmiterator);
|
|
if (tbmres == NULL) {
|
|
/* no more entries in the bitmap */
|
|
break;
|
|
}
|
|
|
|
#ifdef USE_PREFETCH
|
|
if (node->prefetch_pages > 0) {
|
|
/* The main iterator has closed the distance by one page */
|
|
node->prefetch_pages--;
|
|
} else if (prefetch_iterator != NULL) {
|
|
/* Do not let the prefetch iterator get behind the main one */
|
|
TBMIterateResult* tbmpre = tbm_iterate(prefetch_iterator);
|
|
|
|
if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_EXCEPTION),
|
|
errmodule(MOD_EXECUTOR),
|
|
errmsg("prefetch and main iterators are out of sync for BitmapHeapScan.")));
|
|
}
|
|
}
|
|
#endif /* USE_PREFETCH */
|
|
|
|
int rc = TableScanBitmapNextTargetRel(scan, node);
|
|
if (rc != 0) {
|
|
/*
|
|
* If the current partition is invalid,
|
|
* the next page is directly processed.
|
|
*/
|
|
tbmres = NULL;
|
|
#ifdef USE_PREFETCH
|
|
if (rc == 1) {
|
|
BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator);
|
|
}
|
|
#endif /* USE_PREFETCH */
|
|
continue;
|
|
}
|
|
|
|
/* update bucket scan */
|
|
if (hpscan != NULL && scan != hpscan->currBktScan) {
|
|
scan = hpscan->currBktScan;
|
|
}
|
|
|
|
/*
|
|
* Fetch the current table page and identify candidate tuples.
|
|
*/
|
|
if (!TableScanBitmapNextBlock(scan, tbmres, &node->ss.ps.state->have_current_xact_date)) {
|
|
node->tbmres = tbmres = NULL;
|
|
continue;
|
|
}
|
|
|
|
if (tbmres->ntuples >= 0) {
|
|
node->exact_pages++;
|
|
} else {
|
|
node->lossy_pages++;
|
|
}
|
|
|
|
#ifdef USE_PREFETCH
|
|
|
|
/*
|
|
* Increase prefetch target if it's not yet at the max. Note that
|
|
* we will increase it to zero after fetching the very first
|
|
* page/tuple, then to one after the second tuple is fetched, then
|
|
* it doubles as later pages are fetched.
|
|
*/
|
|
if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages)
|
|
/* don't increase any further */;
|
|
else if (node->prefetch_target >= u_sess->storage_cxt.target_prefetch_pages / 2)
|
|
node->prefetch_target = u_sess->storage_cxt.target_prefetch_pages;
|
|
else if (node->prefetch_target > 0)
|
|
node->prefetch_target *= 2;
|
|
else
|
|
node->prefetch_target++;
|
|
#endif /* USE_PREFETCH */
|
|
} else {
|
|
/*
|
|
* Continuing in previously obtained page.
|
|
*/
|
|
|
|
#ifdef USE_PREFETCH
|
|
|
|
/*
|
|
* Try to prefetch at least a few pages even before we get to the
|
|
* second page if we don't stop reading after the first tuple.
|
|
*/
|
|
if (node->prefetch_target < u_sess->storage_cxt.target_prefetch_pages)
|
|
node->prefetch_target++;
|
|
#endif /* USE_PREFETCH */
|
|
}
|
|
|
|
#ifdef USE_PREFETCH
|
|
BitmapHeapPrefetchNext(node, scan, tbm, &prefetch_iterator);
|
|
#endif /* USE_PREFETCH */
|
|
|
|
/*
|
|
* Attempt to fetch tuple from AM.
|
|
*/
|
|
if (!TableScanBitmapNextTuple(scan, tbmres, slot)) {
|
|
/* nothing more to look at on this page */
|
|
node->tbmres = tbmres = NULL;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* If we are using lossy info, we have to recheck the qual conditions
|
|
* at every tuple.
|
|
*/
|
|
if (tbmres->recheck) {
|
|
econtext->ecxt_scantuple = slot;
|
|
ResetExprContext(econtext);
|
|
|
|
if (!ExecQual(node->bitmapqualorig, econtext)) {
|
|
/* Fails recheck, so drop it and loop back for another */
|
|
InstrCountFiltered2(node, 1);
|
|
(void)ExecClearTuple(slot);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* OK to return this tuple */
|
|
return slot;
|
|
}
|
|
|
|
/*
|
|
* if we get here it means we are at the end of the scan..
|
|
*/
|
|
return ExecClearTuple(slot);
|
|
}
|
|
|
|
/*
|
|
* bitgetpage - subroutine for BitmapHeapNext()
|
|
*
|
|
* This routine reads and pins the specified page of the relation, then
|
|
* builds an array indicating which tuples on the page are both potentially
|
|
* interesting according to the bitmap, and visible according to the snapshot.
|
|
*/
|
|
bool heapam_scan_bitmap_next_block(TableScanDesc scan, TBMIterateResult* tbmres, bool* has_cur_xact_write)
|
|
{
|
|
HeapScanDesc hscan = (HeapScanDesc) scan;
|
|
BlockNumber page = tbmres->blockno;
|
|
Buffer buffer;
|
|
Snapshot snapshot;
|
|
int ntup;
|
|
|
|
hscan->rs_base.rs_cindex = 0;
|
|
hscan->rs_base.rs_ntuples = 0;
|
|
|
|
/*
|
|
* Ignore any claimed entries past what we think is the end of the
|
|
* relation. It may have been extended after the start of our scan (we
|
|
* only hold an AccessShareLock, and it could be inserts from this
|
|
* backend).
|
|
*/
|
|
if (page >= hscan->rs_base.rs_nblocks)
|
|
return false;
|
|
|
|
/*
|
|
* Acquire pin on the target heap page, trading in any pin we held before.
|
|
*/
|
|
|
|
hscan->rs_base.rs_cbuf = ReleaseAndReadBuffer(hscan->rs_base.rs_cbuf, hscan->rs_base.rs_rd, page);
|
|
|
|
/* In hot standby, we may get a null buffer if index
|
|
* replayed before the tid replayed. This is acceptable, so we return
|
|
* directly without reporting error.
|
|
*/
|
|
if (!BufferIsValid(hscan->rs_base.rs_cbuf)) {
|
|
return false;
|
|
}
|
|
|
|
hscan->rs_base.rs_cblock = page;
|
|
buffer = hscan->rs_base.rs_cbuf;
|
|
snapshot = hscan->rs_base.rs_snapshot;
|
|
|
|
ntup = 0;
|
|
|
|
/*
|
|
* Prune and repair fragmentation for the whole page, if possible.
|
|
*/
|
|
heap_page_prune_opt(hscan->rs_base.rs_rd, buffer);
|
|
|
|
/*
|
|
* We must hold share lock on the buffer content while examining tuple
|
|
* visibility. Afterwards, however, the tuples we have found to be
|
|
* visible are guaranteed good as long as we hold the buffer pin.
|
|
*/
|
|
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
|
|
|
/*
|
|
* We need two separate strategies for lossy and non-lossy cases.
|
|
*/
|
|
if (tbmres->ntuples >= 0) {
|
|
/*
|
|
* Bitmap is non-lossy, so we just look through the offsets listed in
|
|
* tbmres; but we have to follow any HOT chain starting at each such
|
|
* offset.
|
|
*/
|
|
int curslot;
|
|
|
|
for (curslot = 0; curslot < tbmres->ntuples; curslot++) {
|
|
OffsetNumber offnum = tbmres->offsets[curslot];
|
|
ItemPointerData tid;
|
|
HeapTupleData heapTuple;
|
|
|
|
ItemPointerSet(&tid, page, offnum);
|
|
if (heap_hot_search_buffer(&tid, hscan->rs_base.rs_rd, buffer, snapshot, &heapTuple, NULL, NULL, true,
|
|
has_cur_xact_write))
|
|
hscan->rs_base.rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
|
|
}
|
|
} else {
|
|
/*
|
|
* Bitmap is lossy, so we must examine each item pointer on the page.
|
|
* But we can ignore HOT chains, since we'll check each tuple anyway.
|
|
*/
|
|
Page dp = (Page)BufferGetPage(buffer);
|
|
OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
|
|
OffsetNumber offnum;
|
|
|
|
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) {
|
|
ItemId lp;
|
|
HeapTupleData loctup;
|
|
bool valid = false;
|
|
|
|
lp = PageGetItemId(dp, offnum);
|
|
if (!ItemIdIsNormal(lp))
|
|
continue;
|
|
loctup.t_data = (HeapTupleHeader)PageGetItem((Page)dp, lp);
|
|
loctup.t_len = ItemIdGetLength(lp);
|
|
loctup.t_tableOid = RelationGetRelid(hscan->rs_base.rs_rd);
|
|
loctup.t_bucketId = RelationGetBktid(hscan->rs_base.rs_rd);
|
|
HeapTupleCopyBaseFromPage(&hscan->rs_ctup, dp);
|
|
HeapTupleCopyBaseFromPage(&loctup, dp);
|
|
ItemPointerSet(&loctup.t_self, page, offnum);
|
|
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
|
|
if (valid) {
|
|
hscan->rs_base.rs_vistuples[ntup++] = offnum;
|
|
PredicateLockTuple(hscan->rs_base.rs_rd, &loctup, snapshot);
|
|
}
|
|
CheckForSerializableConflictOut(valid, hscan->rs_base.rs_rd, (void *) &loctup, buffer, snapshot);
|
|
}
|
|
}
|
|
|
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
|
|
|
Assert(ntup <= MaxHeapTuplesPerPage);
|
|
hscan->rs_base.rs_ntuples = ntup;
|
|
|
|
return ntup > 0;
|
|
}
|
|
|
|
/*
|
|
* BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
|
|
*/
|
|
static bool BitmapHeapRecheck(BitmapHeapScanState* node, TupleTableSlot* slot)
|
|
{
|
|
ExprContext* econtext = NULL;
|
|
|
|
/*
|
|
* extract necessary information from index scan node
|
|
*/
|
|
econtext = node->ss.ps.ps_ExprContext;
|
|
|
|
/* Does the tuple meet the original qual conditions? */
|
|
econtext->ecxt_scantuple = slot;
|
|
|
|
ResetExprContext(econtext);
|
|
|
|
return ExecQual(node->bitmapqualorig, econtext);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecBitmapHeapScan(node)
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot* ExecBitmapHeapScan(PlanState* state)
|
|
{
|
|
BitmapHeapScanState* node = castNode(BitmapHeapScanState, state);
|
|
return ExecScan(&node->ss, node->ss.ScanNextMtd, (ExecScanRecheckMtd)BitmapHeapRecheck);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecReScanBitmapHeapScan(node)
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void ExecReScanBitmapHeapScan(BitmapHeapScanState* node)
|
|
{
|
|
/*
|
|
* deal with partitioned table
|
|
*/
|
|
if (node->ss.isPartTbl && !(((Scan *)node->ss.ps.plan)->partition_iterator_elimination)) {
|
|
if (!PointerIsValid(node->ss.partitions)) {
|
|
return;
|
|
}
|
|
/*
|
|
* if there are partitions for scaning, switch to the next partition;
|
|
* else return with doing nothing
|
|
*/
|
|
scan_handler_tbl_endscan(node->ss.ss_currentScanDesc);
|
|
|
|
/* switch to next partition for scan */
|
|
ExecInitNextPartitionForBitmapHeapScan(node);
|
|
} else {
|
|
/* rescan to release any page pin */
|
|
scan_handler_tbl_rescan(node->ss.ss_currentScanDesc, NULL, node->ss.ss_currentRelation, true);
|
|
}
|
|
|
|
/* rescan to release any page pin */
|
|
|
|
BitmapHeapFree(node);
|
|
|
|
ExecScanReScan(&node->ss);
|
|
|
|
/*
|
|
* if chgParam of subnode is not null or the relation is a partitioned table
|
|
* then plan will be re-scanned by first ExecProcNode.
|
|
*/
|
|
if (node->ss.isPartTbl || !PointerIsValid(node->ss.ps.lefttree->chgParam))
|
|
ExecReScan(node->ss.ps.lefttree);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecEndBitmapHeapScan
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void ExecEndBitmapHeapScan(BitmapHeapScanState* node)
|
|
{
|
|
|
|
/*
|
|
* extract information from the node
|
|
*/
|
|
Relation relation = node->ss.ss_currentRelation;
|
|
|
|
/*
|
|
* Free the exprcontext
|
|
*/
|
|
ExecFreeExprContext(&node->ss.ps);
|
|
|
|
/*
|
|
* clear out tuple table slots
|
|
*/
|
|
(void)ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
|
|
(void)ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
|
|
|
/*
|
|
* close down subplans
|
|
*/
|
|
ExecEndNode(outerPlanState(node));
|
|
|
|
/*
|
|
* release bitmap if any
|
|
*/
|
|
BitmapHeapFree(node);
|
|
if (node->ss.ss_currentScanDesc != NULL) {
|
|
scan_handler_tbl_endscan(node->ss.ss_currentScanDesc);
|
|
}
|
|
|
|
if (node->gpi_scan != NULL) {
|
|
GPIScanEnd(node->gpi_scan);
|
|
}
|
|
|
|
if (node->cbi_scan != NULL) {
|
|
cbi_scan_end(node->cbi_scan);
|
|
}
|
|
|
|
/* close heap scan */
|
|
if (node->ss.isPartTbl && PointerIsValid(node->ss.partitions)) {
|
|
/* close table partition */
|
|
|
|
Assert(node->ss.ss_currentPartition);
|
|
releaseDummyRelation(&(node->ss.ss_currentPartition));
|
|
|
|
releaseSubPartitionList(node->ss.ss_currentRelation, &(node->ss.subpartitions), NoLock);
|
|
releasePartitionList(node->ss.ss_currentRelation, &(node->ss.partitions), NoLock);
|
|
}
|
|
|
|
/*
|
|
* close the heap relation.
|
|
*/
|
|
ExecCloseScanRelation(relation);
|
|
|
|
}
|
|
/*
 * InitBitmapHeapScanNextMtd
 *
 * Pick the tuple-fetch routine for this node: the bucket-aware variant when the
 * scanned relation owns hash buckets, the plain variant otherwise.
 */
static inline void InitBitmapHeapScanNextMtd(BitmapHeapScanState* bmstate)
{
    if (RELATION_OWN_BUCKET(bmstate->ss.ss_currentRelation)) {
        bmstate->ss.ScanNextMtd = (ExecScanAccessMtd)BitmapHbucketTblNext;
    } else {
        bmstate->ss.ScanNextMtd = (ExecScanAccessMtd)BitmapHeapTblNext;
    }
}
|
|
|
|
/* Defined outside this file; used by ExecInitBitmapHeapScan for non-partitioned ustore relations. */
TableScanDesc UHeapBeginScan(Relation relation, Snapshot snapshot, int nkeys, ParallelHeapScanDesc parallel_scan);
|
|
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecInitBitmapHeapScan
|
|
*
|
|
* Initializes the scan's state information.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
BitmapHeapScanState* ExecInitBitmapHeapScan(BitmapHeapScan* node, EState* estate, int eflags)
|
|
{
|
|
BitmapHeapScanState* scanstate = NULL;
|
|
Relation currentRelation;
|
|
bool isUstoreRel = false;
|
|
Snapshot scanSnap;
|
|
|
|
/* check for unsupported flags */
|
|
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
|
|
|
|
/*
|
|
* Assert caller didn't ask for an unsafe snapshot --- see comments at
|
|
* head of file.
|
|
*/
|
|
Assert(IsMVCCSnapshot(estate->es_snapshot));
|
|
|
|
/*
|
|
* create state structure
|
|
*/
|
|
scanstate = makeNode(BitmapHeapScanState);
|
|
scanstate->ss.ps.plan = (Plan*)node;
|
|
scanstate->ss.ps.state = estate;
|
|
|
|
scanstate->tbm = NULL;
|
|
scanstate->tbmiterator = NULL;
|
|
scanstate->tbmres = NULL;
|
|
scanstate->exact_pages = 0.0;
|
|
scanstate->lossy_pages = 0.0;
|
|
scanstate->prefetch_iterator = NULL;
|
|
scanstate->prefetch_pages = 0;
|
|
scanstate->prefetch_target = 0;
|
|
scanstate->ss.isPartTbl = node->scan.isPartTbl;
|
|
scanstate->ss.currentSlot = 0;
|
|
scanstate->ss.partScanDirection = node->scan.partScanDirection;
|
|
scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
|
|
|
|
/* initialize Global partition index scan information */
|
|
GPIScanInit(&scanstate->gpi_scan);
|
|
|
|
/* initialize cross-bucket index scan information */
|
|
cbi_scan_init(&scanstate->cbi_scan);
|
|
|
|
/*
|
|
* Miscellaneous initialization
|
|
*
|
|
* create expression context for node
|
|
*/
|
|
ExecAssignExprContext(estate, &scanstate->ss.ps);
|
|
|
|
/*
|
|
* initialize child expressions
|
|
*/
|
|
if (estate->es_is_flt_frame) {
|
|
scanstate->ss.ps.qual = (List*)ExecInitQualByFlatten(node->scan.plan.qual, (PlanState*)scanstate);
|
|
scanstate->bitmapqualorig = (List*)ExecInitQualByFlatten(node->bitmapqualorig, (PlanState*)scanstate);
|
|
} else {
|
|
scanstate->ss.ps.targetlist = (List*)ExecInitExprByRecursion((Expr*)node->scan.plan.targetlist, (PlanState*)scanstate);
|
|
scanstate->ss.ps.qual = (List*)ExecInitExprByRecursion((Expr*)node->scan.plan.qual, (PlanState*)scanstate);
|
|
scanstate->bitmapqualorig = (List*)ExecInitExprByRecursion((Expr*)node->bitmapqualorig, (PlanState*)scanstate);
|
|
}
|
|
|
|
/*
|
|
* open the base relation and acquire appropriate lock on it.
|
|
*/
|
|
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);
|
|
|
|
scanstate->ss.ss_currentRelation = currentRelation;
|
|
scanstate->gpi_scan->parentRelation = currentRelation;
|
|
|
|
isUstoreRel = RelationIsUstoreFormat(currentRelation);
|
|
|
|
/*
|
|
* tuple table initialization
|
|
*/
|
|
ExecInitResultTupleSlot(estate, &scanstate->ss.ps, currentRelation->rd_tam_ops);
|
|
ExecInitScanTupleSlot(estate, &scanstate->ss, currentRelation->rd_tam_ops);
|
|
|
|
InitBitmapHeapScanNextMtd(scanstate);
|
|
|
|
/*
|
|
* Choose user-specified snapshot if TimeCapsule clause exists, otherwise
|
|
* estate->es_snapshot instead.
|
|
*/
|
|
scanSnap = TvChooseScanSnap(currentRelation, &node->scan, &scanstate->ss);
|
|
|
|
/*
|
|
* Even though we aren't going to do a conventional seqscan, it is useful
|
|
* to create a HeapScanDesc --- most of the fields in it are usable.
|
|
*/
|
|
if (scanstate->ss.isPartTbl) {
|
|
scanstate->ss.ss_currentScanDesc = NULL;
|
|
ExecInitPartitionForBitmapHeapScan(scanstate, estate);
|
|
|
|
if (node->scan.itrs > 0) {
|
|
Partition partition = NULL;
|
|
Relation partitiontrel = NULL;
|
|
|
|
if (scanstate->ss.partitions != NIL) {
|
|
/* construct a dummy table relation with the next table partition for scan */
|
|
partition = (Partition)list_nth(scanstate->ss.partitions, 0);
|
|
partitiontrel = partitionGetRelation(currentRelation, partition);
|
|
scanstate->ss.ss_currentPartition = partitiontrel;
|
|
|
|
/*
|
|
* Verify if a DDL operation that froze all tuples in the relation
|
|
* occured after taking the snapshot. Skip for explain only commands.
|
|
*/
|
|
if (isUstoreRel && !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
|
|
TransactionId relfrozenxid64 = InvalidTransactionId;
|
|
getPartitionRelxids(partitiontrel, &relfrozenxid64);
|
|
if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
|
|
!TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
|
|
TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
|
|
ereport(ERROR, (errcode(ERRCODE_SNAPSHOT_INVALID),
|
|
(errmsg("Snapshot too old, BitmapHeapScan is PartTbl, the info: snapxmax is %lu, "
|
|
"snapxmin is %lu, csn is %lu, relfrozenxid64 is %lu, globalRecycleXid is %lu.",
|
|
scanSnap->xmax, scanSnap->xmin, scanSnap->snapshotcsn, relfrozenxid64,
|
|
g_instance.undo_cxt.globalRecycleXid))));
|
|
}
|
|
}
|
|
|
|
scanstate->ss.ss_currentScanDesc =
|
|
scan_handler_tbl_beginscan_bm(partitiontrel, scanSnap, 0, NULL, &scanstate->ss);
|
|
}
|
|
}
|
|
} else {
|
|
if (!isUstoreRel) {
|
|
scanstate->ss.ss_currentScanDesc =
|
|
scan_handler_tbl_beginscan_bm(currentRelation, scanSnap, 0, NULL, &scanstate->ss);
|
|
} else {
|
|
/*
|
|
* Verify if a DDL operation that froze all tuples in the relation
|
|
* occured after taking the snapshot. Skip for explain only commands.
|
|
*/
|
|
if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
|
|
TransactionId relfrozenxid64 = InvalidTransactionId;
|
|
getRelationRelxids(currentRelation, &relfrozenxid64);
|
|
if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
|
|
!TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
|
|
TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
|
|
ereport(ERROR, (errcode(ERRCODE_SNAPSHOT_INVALID),
|
|
(errmsg("Snapshot too old, BitmapHeapScan is not PartTbl, the info: snapxmax is %lu, "
|
|
"snapxmin is %lu, csn is %lu, relfrozenxid64 is %lu, globalRecycleXid is %lu.",
|
|
scanSnap->xmax, scanSnap->xmin, scanSnap->snapshotcsn, relfrozenxid64,
|
|
g_instance.undo_cxt.globalRecycleXid))));
|
|
}
|
|
}
|
|
|
|
scanstate->ss.ss_currentScanDesc = UHeapBeginScan(currentRelation, scanSnap, 0, NULL);
|
|
}
|
|
}
|
|
if (scanstate->ss.ss_currentScanDesc == NULL) {
|
|
scanstate->ss.ps.stubType = PST_Scan;
|
|
}
|
|
|
|
/*
|
|
* get the scan type from the relation descriptor.
|
|
*/
|
|
ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
|
|
|
|
/*
|
|
* Initialize result tuple type and projection info.
|
|
*/
|
|
ExecAssignResultTypeFromTL(
|
|
&scanstate->ss.ps,
|
|
scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor->td_tam_ops);
|
|
|
|
ExecAssignScanProjectionInfo(&scanstate->ss);
|
|
|
|
Assert(scanstate->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor->td_tam_ops);
|
|
|
|
/*
|
|
* initialize child nodes
|
|
*
|
|
* We do this last because the child nodes will open indexscans on our
|
|
* relation's indexes, and we want to be sure we have acquired a lock on
|
|
* the relation first.
|
|
*/
|
|
outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
|
|
|
|
/*
|
|
* all done.
|
|
*/
|
|
return scanstate;
|
|
}
|
|
|
|
/*
|
|
* @@GaussDB@@
|
|
* Target : data partition
|
|
* Brief : Initialize the table partition and the index partition for
|
|
* : index sacn
|
|
* Description :
|
|
* Input :
|
|
* Output :
|
|
* Notes :
|
|
*/
|
|
static void ExecInitNextPartitionForBitmapHeapScan(BitmapHeapScanState* node)
|
|
{
|
|
Partition currentpartition = NULL;
|
|
Relation currentpartitionrel = NULL;
|
|
BitmapHeapScan* plan = NULL;
|
|
int paramno = -1;
|
|
ParamExecData* param = NULL;
|
|
int subPartParamno = -1;
|
|
ParamExecData* SubPrtParam = NULL;
|
|
|
|
plan = (BitmapHeapScan*)(node->ss.ps.plan);
|
|
|
|
/* get partition sequnce */
|
|
paramno = plan->scan.plan.paramno;
|
|
param = &(node->ss.ps.state->es_param_exec_vals[paramno]);
|
|
node->ss.currentSlot = (int)param->value;
|
|
|
|
subPartParamno = plan->scan.plan.subparamno;
|
|
SubPrtParam = &(node->ss.ps.state->es_param_exec_vals[subPartParamno]);
|
|
|
|
/* construct a dummy relation with the nextl table partition */
|
|
currentpartition = (Partition)list_nth(node->ss.partitions, node->ss.currentSlot);
|
|
currentpartitionrel = partitionGetRelation(node->ss.ss_currentRelation, currentpartition);
|
|
|
|
/* switch the partition that needs to be scanned */
|
|
Assert(PointerIsValid(node->ss.ss_currentPartition));
|
|
releaseDummyRelation(&(node->ss.ss_currentPartition));
|
|
|
|
BitmapHeapScanState* bitmapState = node;
|
|
BitmapHeapScan* bitmpHeapScan = (BitmapHeapScan*)node->ss.ps.plan;
|
|
Snapshot scanSnap;
|
|
scanSnap = TvChooseScanSnap(currentpartitionrel, &bitmpHeapScan->scan, &bitmapState->ss);
|
|
|
|
if (currentpartitionrel->partMap != NULL) {
|
|
Partition currentSubPartition = NULL;
|
|
List* currentSubPartitionList = NULL;
|
|
Relation currentSubPartitionRel = NULL;
|
|
Assert(SubPrtParam != NULL);
|
|
currentSubPartitionList = (List *)list_nth(node->ss.subpartitions, node->ss.currentSlot);
|
|
currentSubPartition = (Partition)list_nth(currentSubPartitionList, (int)SubPrtParam->value);
|
|
currentSubPartitionRel = partitionGetRelation(currentpartitionrel, currentSubPartition);
|
|
releaseDummyRelation(&(currentpartitionrel));
|
|
|
|
node->ss.ss_currentPartition = currentSubPartitionRel;
|
|
node->gpi_scan->parentRelation = currentpartitionrel;
|
|
|
|
/* Initialize scan descriptor. */
|
|
node->ss.ss_currentScanDesc =
|
|
scan_handler_tbl_beginscan_bm(currentSubPartitionRel, scanSnap, 0, NULL, &node->ss);
|
|
} else {
|
|
node->ss.ss_currentPartition = currentpartitionrel;
|
|
|
|
/* Initialize scan descriptor. */
|
|
node->ss.ss_currentScanDesc =
|
|
scan_handler_tbl_beginscan_bm(currentpartitionrel, scanSnap, 0, NULL, &node->ss);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* @@GaussDB@@
|
|
* Target : data partition
|
|
* Brief : get the table partition that need to be scanned, and add it
|
|
* : the list for the following scanning
|
|
* Description :
|
|
* Input :
|
|
* Output :
|
|
* Notes :
|
|
*/
|
|
static void ExecInitPartitionForBitmapHeapScan(BitmapHeapScanState* scanstate, EState* estate)
|
|
{
|
|
BitmapHeapScan* plan = NULL;
|
|
Relation currentRelation = NULL;
|
|
|
|
plan = (BitmapHeapScan*)scanstate->ss.ps.plan;
|
|
|
|
currentRelation = scanstate->ss.ss_currentRelation;
|
|
scanstate->ss.partitions = NIL;
|
|
scanstate->ss.ss_currentPartition = NULL;
|
|
|
|
if (plan->scan.itrs > 0) {
|
|
LOCKMODE lock = NoLock;
|
|
Partition tablepartition = NULL;
|
|
bool relistarget = false;
|
|
PruningResult* resultPlan = NULL;
|
|
if (plan->scan.pruningInfo->expr) {
|
|
resultPlan = GetPartitionInfo(plan->scan.pruningInfo, estate, currentRelation);
|
|
if (ENABLE_SQL_BETA_FEATURE(PARTITION_OPFUSION)) {
|
|
if (estate->pruningResult) {
|
|
destroyPruningResult(estate->pruningResult);
|
|
}
|
|
estate->pruningResult = resultPlan;
|
|
}
|
|
} else {
|
|
resultPlan = plan->scan.pruningInfo;
|
|
}
|
|
|
|
if (resultPlan->ls_rangeSelectedPartitions != NULL) {
|
|
scanstate->ss.part_id = resultPlan->ls_rangeSelectedPartitions->length;
|
|
} else {
|
|
scanstate->ss.part_id = 0;
|
|
}
|
|
|
|
ListCell* cell1 = NULL;
|
|
ListCell* cell2 = NULL;
|
|
List* part_seqs = resultPlan->ls_rangeSelectedPartitions;
|
|
List* partitionnos = resultPlan->ls_selectedPartitionnos;
|
|
Assert(list_length(part_seqs) == list_length(partitionnos));
|
|
relistarget = ExecRelationIsTargetRelation(estate, plan->scan.scanrelid);
|
|
lock = (relistarget ? RowExclusiveLock : AccessShareLock);
|
|
scanstate->ss.lockMode = lock;
|
|
|
|
forboth (cell1, part_seqs, cell2, partitionnos) {
|
|
Oid tablepartitionid = InvalidOid;
|
|
int partSeq = lfirst_int(cell1);
|
|
int partitionno = lfirst_int(cell2);
|
|
/* add table partition to list */
|
|
tablepartitionid = getPartitionOidFromSequence(currentRelation, partSeq, partitionno);
|
|
tablepartition = PartitionOpenWithPartitionno(currentRelation, tablepartitionid, partitionno, lock);
|
|
scanstate->ss.partitions = lappend(scanstate->ss.partitions, tablepartition);
|
|
|
|
if (resultPlan->ls_selectedSubPartitions != NIL) {
|
|
Relation partRelation = partitionGetRelation(currentRelation, tablepartition);
|
|
SubPartitionPruningResult* subPartPruningResult =
|
|
GetSubPartitionPruningResult(resultPlan->ls_selectedSubPartitions, partSeq, partitionno);
|
|
if (subPartPruningResult == NULL) {
|
|
continue;
|
|
}
|
|
List *subpart_seqs = subPartPruningResult->ls_selectedSubPartitions;
|
|
List *subpartitionnos = subPartPruningResult->ls_selectedSubPartitionnos;
|
|
Assert(list_length(subpart_seqs) == list_length(subpartitionnos));
|
|
List *subpartition = NULL;
|
|
ListCell *lc1 = NULL;
|
|
ListCell *lc2 = NULL;
|
|
forboth (lc1, subpart_seqs, lc2, subpartitionnos) {
|
|
Oid subpartitionid = InvalidOid;
|
|
int subpartSeq = lfirst_int(lc1);
|
|
int subpartitionno = lfirst_int(lc2);
|
|
|
|
subpartitionid = getPartitionOidFromSequence(partRelation, subpartSeq, subpartitionno);
|
|
Partition subpart =
|
|
PartitionOpenWithPartitionno(partRelation, subpartitionid, subpartitionno, lock);
|
|
subpartition = lappend(subpartition, subpart);
|
|
}
|
|
releaseDummyRelation(&(partRelation));
|
|
scanstate->ss.subPartLengthList =
|
|
lappend_int(scanstate->ss.subPartLengthList, list_length(subpartition));
|
|
scanstate->ss.subpartitions = lappend(scanstate->ss.subpartitions, subpartition);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * BitmapHeapPrefetchTargetRel
 *
 * Work out which heap relation a batch of prefetch blocks belongs to.
 *
 * node        - bitmap heap scan state (supplies gpi_scan, cbi_scan and the
 *               current scan descriptor)
 * partoid     - partition oid of the batch; an invalid oid means the
 *               partition step is skipped
 * partmatched - true when the caller found the partition unchanged
 * bucketid    - bucket id of the batch; an invalid id means the bucket step
 *               is skipped
 * bktmatched  - true when the caller found the bucket unchanged
 *
 * Returns the relation to prefetch from when the partition switch succeeded
 * or the bucket lookup yielded something other than the scan's current
 * bucket relation; returns NULL otherwise. The GPI partition oid and the CBI
 * bucket id stored in node may be modified along the way.
 */
static Relation BitmapHeapPrefetchTargetRel(BitmapHeapScanState* node, Oid partoid, bool partmatched,
    int2 bucketid, bool bktmatched)
{
    HBktTblScanDesc bktScan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;
    Relation result = NULL;
    bool switchedPart = false;
    bool switchedBkt = false;

    /* global partition index: move to the requested partition if it differs */
    if (OidIsValid(partoid) && !partmatched) {
        GPISetCurrPartOid(node->gpi_scan, partoid);
        switchedPart = GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock);
        result = node->gpi_scan->fakePartRelation;
    }

    /* cross-bucket index: resolve the bucket relation */
    if (BUCKET_NODE_IS_VALID(bucketid)) {
        if (switchedPart) {
            /* a new partition invalidates the remembered bucket */
            cbi_set_bucketid(node->cbi_scan, InvalidBktId);
        } else if (!bktmatched) {
            cbi_set_bucketid(node->cbi_scan, bucketid);
        }

        result = cbi_bitmapheap_scan_getbucket(bktScan, node->gpi_scan, node->cbi_scan, bucketid);
        switchedBkt = (result != bktScan->currBktRel);
    }

    /* nothing changed: the caller has no batch to flush */
    return (switchedPart || switchedBkt) ? result : NULL;
}
|
|
|
|
/* First sort by partition oid, then call PageListPrefetch to get the pages under each partition */
|
|
void BitmapHeapPrefetchWithCrossLevelIndex(BitmapHeapScanState* node, int prefetchNow, PrefetchNode* prefetchNode,
|
|
BlockNumber* blockList)
|
|
{
|
|
/*
|
|
* we must save part Oid before switch relation, and recover it after prefetch.
|
|
* The reason for this is to assure correctness while getting a new tbmres.
|
|
*/
|
|
HBktTblScanDesc hpscan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;
|
|
Oid originOid = GPIGetCurrPartOid(node->gpi_scan);
|
|
int2 originBktId = cbi_get_current_bucketid(node->cbi_scan);
|
|
int blkCount = 0;
|
|
Oid prevOid = prefetchNode[0].partOid;
|
|
int2 prevBktId = prefetchNode[0].bktId;
|
|
bool isgpi = OidIsValid(prevOid);
|
|
bool iscbi = BUCKET_NODE_IS_VALID(prevBktId);
|
|
bool partmatched, bktmatched;
|
|
Relation targetheap = NULL;
|
|
|
|
for (int i = 0; i < prefetchNow; i++) {
|
|
partmatched = bktmatched = false;
|
|
|
|
if (isgpi && prefetchNode[i].partOid == prevOid) {
|
|
partmatched = true;
|
|
}
|
|
if (iscbi && prefetchNode[i].bktId == prevBktId) {
|
|
bktmatched = true;
|
|
}
|
|
|
|
targetheap = BitmapHeapPrefetchTargetRel(node, prevOid, partmatched, prevBktId, bktmatched);
|
|
if (RelationIsValid(targetheap)) {
|
|
PageListPrefetch(targetheap, MAIN_FORKNUM, blockList, blkCount, 0, 0);
|
|
}
|
|
|
|
if (isgpi && !partmatched) {
|
|
blkCount = 0;
|
|
prevOid = prefetchNode[i].partOid;
|
|
}
|
|
if (iscbi && !bktmatched) {
|
|
blkCount = 0;
|
|
prevBktId = prefetchNode[i].bktId;
|
|
|
|
if (RelationIsValid(targetheap) && targetheap != hpscan->currBktRel) {
|
|
bucketCloseRelation(targetheap);
|
|
}
|
|
}
|
|
|
|
blockList[blkCount++] = prefetchNode[i].blockNum;
|
|
}
|
|
|
|
targetheap = BitmapHeapPrefetchTargetRel(node, prevOid, false, prevBktId, false);
|
|
if (RelationIsValid(targetheap)) {
|
|
PageListPrefetch(targetheap, MAIN_FORKNUM, blockList, blkCount, 0, 0);
|
|
if (targetheap != hpscan->currBktRel) {
|
|
bucketCloseRelation(targetheap);
|
|
}
|
|
}
|
|
|
|
/* recover old oid after prefetch switch */
|
|
GPISetCurrPartOid(node->gpi_scan, originOid);
|
|
cbi_set_bucketid(node->cbi_scan, originBktId);
|
|
}
|
|
|
|
/*
 * BitmapHeapPrefetchNextTargetHeap
 *
 * Determine the heap relation that holds the page described by tbmpre.
 *
 * node            - bitmap heap scan state
 * tbmpre          - bitmap iterate result for the page about to be prefetched
 * curr_targetheap - relation used for the previous page; returned unchanged
 *                   when neither the partition nor the bucket step replaces it
 *
 * Returns the relation to prefetch from, or NULL when the partition cannot
 * be switched to or the bucket lookup yields no relation. The GPI partition
 * oid and the CBI bucket id stored in node may be modified.
 */
Relation BitmapHeapPrefetchNextTargetHeap(BitmapHeapScanState* node, TBMIterateResult* tbmpre, Relation curr_targetheap)
{
    Relation result = curr_targetheap;
    bool partSwitched = false;

    /* global partition index: follow the page into its own partition */
    if (tbm_is_global(node->tbm) && GPIScanCheckPartOid(node->gpi_scan, tbmpre->partitionOid)) {
        GPISetCurrPartOid(node->gpi_scan, tbmpre->partitionOid);
        if (!GPIGetNextPartRelation(node->gpi_scan, CurrentMemoryContext, AccessShareLock)) {
            /* If the current partition is invalid, the next page is directly processed */
            return NULL;
        }

        result = node->gpi_scan->fakePartRelation;
        partSwitched = true;
    }

    if (!tbm_is_crossbucket(node->tbm)) {
        return result;
    }

    /* cross-bucket index: the page lives in the bucket recorded in tbmpre */
    HBktTblScanDesc bktScan = (HBktTblScanDesc)node->ss.ss_currentScanDesc;
    int2 pageBktId = tbmpre->bucketid;
    Assert(BUCKET_NODE_IS_VALID(pageBktId));

    if (partSwitched) {
        /* the remembered bucket id belongs to the previous partition */
        cbi_set_bucketid(node->cbi_scan, InvalidBktId);
    }

    /* the bucket relation (possibly NULL) replaces whatever was chosen above */
    return cbi_bitmapheap_scan_getbucket(bktScan, node->gpi_scan, node->cbi_scan, pageBktId);
}
|
|
|
|
/*
 * BitmapHeapPrefetchNextAsync
 *
 * Asynchronous flavour of the bitmap prefetch: collect the block numbers of
 * up to (prefetch_target - prefetch_pages) upcoming pages from the prefetch
 * iterator and submit them as one list.
 *
 * node              - bitmap heap scan state; prefetch_pages is advanced for
 *                     every page taken from the iterator
 * scan              - current table scan; its relation is the prefetch target
 *                     for ordinary (non-global, non-cross-bucket) bitmaps
 * tbm               - bitmap being scanned
 * prefetch_iterator - in/out; set to NULL (together with
 *                     node->prefetch_iterator) once the iterator is exhausted
 *
 * For global-partition or cross-bucket bitmaps each page is recorded together
 * with its partition oid and bucket id and the list is handed to
 * BitmapHeapPrefetchWithCrossLevelIndex().
 *
 * Both scratch arrays come from palloc and are released with pfree_ext, so
 * the allocator and the deallocator always match.
 */
void BitmapHeapPrefetchNextAsync(BitmapHeapScanState* node, TableScanDesc scan, const TIDBitmap* tbm,
    TBMIterator** prefetch_iterator)
{
    BlockNumber* blockList = NULL;
    BlockNumber* blockListPtr = NULL;
    PrefetchNode* prefetchNode = NULL;
    PrefetchNode* prefetchNodePtr = NULL;
    int prefetchNow = 0;
    int prefetchWindow = node->prefetch_target - node->prefetch_pages;
    /* pages of such bitmaps may live in different partitions / buckets */
    bool crossLevel = tbm_is_global(tbm) || tbm_is_crossbucket(tbm);

    /* We expect to prefetch at most prefetchWindow pages */
    if (prefetchWindow > 0) {
        if (crossLevel) {
            prefetchNode = (PrefetchNode*)palloc(sizeof(PrefetchNode) * prefetchWindow);
            prefetchNodePtr = prefetchNode;
        }
        blockList = (BlockNumber*)palloc(sizeof(BlockNumber) * prefetchWindow);
        blockListPtr = blockList;
    }

    while (node->prefetch_pages < node->prefetch_target) {
        TBMIterateResult* tbmpre = tbm_iterate(*prefetch_iterator);

        if (tbmpre == NULL) {
            /* No more pages to prefetch */
            tbm_end_iterate(*prefetch_iterator);
            node->prefetch_iterator = *prefetch_iterator = NULL;
            break;
        }
        node->prefetch_pages++;

        /* we use PrefetchNode here to store relations between blockno and partition Oid */
        if (crossLevel && prefetchNodePtr != NULL) {
            prefetchNodePtr->blockNum = tbmpre->blockno;
            prefetchNodePtr->partOid = tbmpre->partitionOid;
            prefetchNodePtr->bktId = tbmpre->bucketid;
            prefetchNodePtr++;
        }

        /* For Async Direct I/O we accumulate a list and send it */
        if (blockListPtr != NULL) {
            *blockListPtr++ = tbmpre->blockno;
        }
        prefetchNow++;
    }

    /* Send the list we generated and free it */
    if (prefetchNow && blockList != NULL) {
        if (crossLevel) {
            BitmapHeapPrefetchWithCrossLevelIndex(node, prefetchNow, prefetchNode, blockList);
        } else {
            PageListPrefetch(scan->rs_rd, MAIN_FORKNUM, blockList, prefetchNow, 0, 0);
        }
    }

    if (prefetchWindow > 0) {
        pfree_ext(blockList);
        if (crossLevel) {
            pfree_ext(prefetchNode);
        }
    }
}
|
|
|
|
/*
|
|
* We issue prefetch requests *after* fetching the current page to try
|
|
* to avoid having prefetching interfere with the main I/O. Also, this
|
|
* should happen only when we have determined there is still something
|
|
* to do on the current page, else we may uselessly prefetch the same
|
|
* page we are just about to request for real.
|
|
*/
|
|
void BitmapHeapPrefetchNext(
|
|
BitmapHeapScanState* node, TableScanDesc scan, const TIDBitmap* tbm, TBMIterator** prefetch_iterator)
|
|
{
|
|
if (*prefetch_iterator == NULL) {
|
|
return;
|
|
}
|
|
|
|
Assert(node->tbm == tbm);
|
|
|
|
ADIO_RUN()
|
|
{
|
|
/* prefetch next asynchronously */
|
|
BitmapHeapPrefetchNextAsync(node, scan, tbm, prefetch_iterator);
|
|
}
|
|
ADIO_ELSE()
|
|
{
|
|
/* prefetch next synchronously */
|
|
if (unlikely(tbm_is_crossbucket(tbm) || tbm_is_global(tbm))) {
|
|
HBktTblScanDesc hpscan = NULL;
|
|
Oid oldOid = GPIGetCurrPartOid(node->gpi_scan);
|
|
int2 oldBktId = cbi_get_current_bucketid(node->cbi_scan);
|
|
Relation oldheap = NULL;
|
|
Relation prefetchRel = scan->rs_rd;
|
|
|
|
while (node->prefetch_pages < node->prefetch_target) {
|
|
TBMIterateResult* tbmpre = tbm_iterate(*prefetch_iterator);
|
|
hpscan = (tbm_is_crossbucket(node->tbm) ? (HBktTblScanDesc)node->ss.ss_currentScanDesc : NULL);
|
|
|
|
if (tbmpre == NULL) {
|
|
/* No more pages to prefetch */
|
|
tbm_end_iterate(*prefetch_iterator);
|
|
node->prefetch_iterator = *prefetch_iterator = NULL;
|
|
break;
|
|
}
|
|
node->prefetch_pages++;
|
|
|
|
prefetchRel = BitmapHeapPrefetchNextTargetHeap(node, tbmpre, prefetchRel);
|
|
if (prefetchRel == NULL) {
|
|
tbmpre = NULL;
|
|
continue;
|
|
}
|
|
|
|
/* For posix_fadvise() we just send the one request */
|
|
PrefetchBuffer(prefetchRel, MAIN_FORKNUM, tbmpre->blockno);
|
|
if (RelationIsValid(oldheap) && oldheap != prefetchRel && PointerIsValid(hpscan) &&
|
|
oldheap != hpscan->currBktRel) {
|
|
/* release previous bucket fake relation except the current scanning one */
|
|
bucketCloseRelation(oldheap);
|
|
/* now oldheap is NULL */
|
|
}
|
|
oldheap = prefetchRel;
|
|
}
|
|
|
|
if (RelationIsValid(oldheap) && PointerIsValid(hpscan) && oldheap != hpscan->currBktRel) {
|
|
/* release previous bucket fake relation except the current scanning one */
|
|
bucketCloseRelation(oldheap);
|
|
}
|
|
|
|
/* recover old oid after prefetch switch */
|
|
GPISetCurrPartOid(node->gpi_scan, oldOid);
|
|
cbi_set_bucketid(node->cbi_scan, oldBktId);
|
|
} else {
|
|
while (node->prefetch_pages < node->prefetch_target) {
|
|
TBMIterateResult* tbmpre = tbm_iterate(*prefetch_iterator);
|
|
Relation prefetchRel = scan->rs_rd;
|
|
|
|
if (tbmpre == NULL) {
|
|
/* No more pages to prefetch */
|
|
tbm_end_iterate(*prefetch_iterator);
|
|
node->prefetch_iterator = *prefetch_iterator = NULL;
|
|
break;
|
|
}
|
|
node->prefetch_pages++;
|
|
|
|
/* For posix_fadvise() we just send the one request */
|
|
PrefetchBuffer(prefetchRel, MAIN_FORKNUM, tbmpre->blockno);
|
|
}
|
|
}
|
|
}
|
|
ADIO_END();
|
|
}
|