openGauss-server/src/gausskernel/runtime/executor/nodeSamplescan.cpp

/*
 * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 *
 * openGauss is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 * -------------------------------------------------------------------------
 * nodeSamplescan.cpp
 *
 *      Support routines for sample scans of relations (table sampling).
 *
 * IDENTIFICATION
 *      src/gausskernel/runtime/executor/nodeSamplescan.cpp
 *
 * -------------------------------------------------------------------------
 */
#include "postgres.h"
#include "knl/knl_variable.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "executor/nodeSamplescan.h"
#include "executor/nodeSeqscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
#endif
#include "storage/predicate.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "utils/snapmgr.h"
#include "vecexecutor/vecnodecstorescan.h"
#include "nodes/execnodes.h"

static double sample_random_fract(void);

/*
 * Description: Open the table scan descriptor used by a sample scan.
 *
 * Parameters:
 *	@in scanstate: scan state; its sampleScanInfo selects the sampling method
 *	@in currentRelation: relation being scanned
 *
 * Returns: TableScanDesc returned by scan_handler_tbl_beginscan_sampling()
 */
TableScanDesc InitSampleScanDesc(ScanState* scanstate, Relation currentRelation)
{
    SampleScanParams* sampleInfo = &scanstate->sampleScanInfo;
    const bool isBernoulli = (sampleInfo->sampleType == BERNOULLI_SAMPLE);

    /* BERNOULLI walks every block, so it may join a synchronized scan; the other methods may not. */
    const bool allowSync = isBernoulli;

    /*
     * BERNOULLI always uses the bulkread strategy because all pages are read.
     * For the other methods bulkread is used only when percent[0] >= 1.
     *
     * NOTE(review): BaseTableSample's constructor divides percent[SYSTEM_SAMPLE]
     * by MAX_PERCENT_ARG. If that slot is index 0 and tsm_state is built before
     * this function runs, the comparison below sees a fraction rather than a
     * percentage -- confirm the intended cutoff.
     */
    const bool useBulkread = isBernoulli ? true : (((BaseTableSample*)sampleInfo->tsm_state)->percent[0] >= 1);

    return scan_handler_tbl_beginscan_sampling(
        currentRelation, scanstate->ps.state->es_snapshot, 0, NULL, useBulkread, allowSync, scanstate);
}
/*
 * Publish the node's scan accessor on the underlying table scan descriptor,
 * then ask the row sampler stored in sampleScanInfo.tsm_state for the next
 * sampled tuple. Returns whatever RowTableSample::scanSample() returns.
 */
static inline HeapTuple SampleFetchNextTuple(SeqScanState* node)
{
    TableScanDesc desc = GetTableScanDesc(node->ss_currentScanDesc, node->ss_currentRelation);
    desc->rs_ss_accessor = node->ss_scanaccessor;

    RowTableSample* sampler = (RowTableSample*)node->sampleScanInfo.tsm_state;
    return sampler->scanSample();
}

/*
 * Description: Fetch the next sampled tuple and store it in the scan slot.
 *
 * Parameters:
 *	@in node: sequential scan state carrying the sample scan information
 *
 * Returns: TupleTableSlot produced by ExecMakeTupleSlot()
 */
TupleTableSlot* SeqSampleNext(SeqScanState* node)
{
    TupleTableSlot* outSlot = node->ss_ScanTupleSlot;
    HeapTuple sampled = SampleFetchNextTuple(node);

    /* Stamp the slot's descriptor with the relation's table access method type. */
    node->ss_ScanTupleSlot->tts_tupleDescriptor->tdTableAmType = node->ss_currentRelation->rd_tam_type;

    return ExecMakeTupleSlot(sampled,
        GetTableScanDesc(node->ss_currentScanDesc, node->ss_currentRelation),
        outSlot,
        node->ss_currentRelation->rd_tam_type);
}

/*
 * Hash-bucket variant of SeqSampleNext(): keep sampling the current bucket
 * and, when it yields no tuple, advance to the next bucket and restart the
 * sampler. Stops when a tuple is found or no further bucket is available.
 */
TupleTableSlot* HbktSeqSampleNext(SeqScanState* node)
{
    TupleTableSlot* outSlot = node->ss_ScanTupleSlot;
    TableScanDesc bucketScan = node->ss_currentScanDesc;
    HeapTuple sampled = NULL;

    if (bucketScan != NULL) {
        for (;;) {
            sampled = SampleFetchNextTuple(node);

            /* Done when we have a tuple, or when there is no next bucket to switch to. */
            if (sampled != NULL || !hbkt_sampling_scan_nextbucket(bucketScan)) {
                break;
            }

            /* New bucket: restart the sampler's block/offset state. */
            ((RowTableSample*)node->sampleScanInfo.tsm_state)->resetSampleScan();
        }
    }

    node->ss_ScanTupleSlot->tts_tupleDescriptor->tdTableAmType = node->ss_currentRelation->rd_tam_type;
    return ExecMakeTupleSlot(sampled,
        GetTableScanDesc(node->ss_currentScanDesc, node->ss_currentRelation),
        outSlot,
        node->ss_currentRelation->rd_tam_type);
}

/*
 * Description: Get seed value.
 *
 * Parameters: null
 *
 * Returns: void
 */
void BaseTableSample::getSeed()
{
    Datum datum;
    bool isnull = false;
    ExprContext* econtext = sampleScanState->ps.ps_ExprContext;
    ExprState* repeatable = sampleScanState->sampleScanInfo.repeatable;

    if (NULL != repeatable) {
        datum = ExecEvalExprSwitchContext(repeatable, econtext, &isnull, NULL);
        if (isnull) {
            ereport(ERROR,
                (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
                    errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
        }

        /*
         * The REPEATABLE parameter has been coerced to float8 by the parser.
         * The reason for using float8 at the SQL level is that it will
         * produce unsurprising results both for users used to databases that
         * accept only integers in the REPEATABLE clause and for those who
         * might expect that REPEATABLE works like setseed() (a float in the
         * range from -1 to 1).
         *
         * We use hashfloat8() to convert the supplied value into a suitable
         * seed. For regression-testing purposes, that has the convenient
         * property that REPEATABLE(0) gives a machine-independent result.
         */
        seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
    } else {
        seed = random();
    }

    if (seed > 0) {
        gs_srandom(seed);
    }
}

/*
 * Description: Evaluate the TABLESAMPLE arguments and store them in percent[].
 *              percent[] is allocated with SAMPLEARGSNUM entries; each argument
 *              is read as a float4 and must be a non-NaN value inside
 *              [MIN_PERCENT_ARG, MAX_PERCENT_ARG].
 *
 * Parameters: null
 *
 * Returns: void (raises ERROR on a null, out-of-range, NaN or surplus argument)
 */
void BaseTableSample::getPercent()
{
    int i = 0;
    ListCell* arg = NULL;

    ExprContext* econtext = sampleScanState->ps.ps_ExprContext;
    List* args = sampleScanState->sampleScanInfo.args;

    Assert(list_length(args));
    percent = (double*)palloc0(SAMPLEARGSNUM * sizeof(double));

    foreach (arg, args) {
        ExprState* argstate = (ExprState*)lfirst(arg);
        bool isnull = false;
        Datum value;

        /* percent[] only has SAMPLEARGSNUM slots; never write past its end. */
        if (i >= (int)SAMPLEARGSNUM) {
            ereport(ERROR,
                (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
                    errmsg("too many TABLESAMPLE parameters, at most %d are allowed", (int)SAMPLEARGSNUM)));
        }

        value = ExecEvalExprSwitchContext(argstate, econtext, &isnull, NULL);
        if (isnull) {
            ereport(
                ERROR, (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), errmsg("TABLESAMPLE parameter cannot be null")));
        }

        percent[i] = DatumGetFloat4(value);

        /* NaN fails both range comparisons, hence the explicit isnan() test. */
        if (percent[i] < MIN_PERCENT_ARG || percent[i] > MAX_PERCENT_ARG || isnan(percent[i])) {
            ereport(ERROR,
                (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), errmsg("sample percentage must be between 0 and 100")));
        }

        i++;
    }
}

/*
 * Description: Get next random block number.
 * Parameters: null
 * Returns: void
 */
void BaseTableSample::system_nextsampleblock()
{
    BlockNumber blockindex = 0;

    /* We should start from currentBlock + 1. */
    for (blockindex = currentBlock + 1; blockindex < totalBlockNum; blockindex++) {
        if (sample_random_fract() < percent[SYSTEM_SAMPLE]) {
            break;
        }
    }

    if (blockindex < totalBlockNum) {
        currentBlock = blockindex;
    } else {
        currentBlock = InvalidBlockNumber;
    }
}

/*
 * Description: Get sequential next offset.
 * Parameters: null
 * Returns: void
 */
void BaseTableSample::system_nextsampletuple()
{
    OffsetNumber tupoffset = currentOffset;

    /* Advance to next possible offset on page */
    if (tupoffset == InvalidOffsetNumber) {
        tupoffset = FirstOffsetNumber;
    } else {
        tupoffset++;
    }

    if (tupoffset > curBlockMaxoffset) {
        tupoffset = InvalidOffsetNumber;
    }

    currentOffset = tupoffset;
}

/*
 * Description: Move currentBlock to the following block in sequence, or to
 *              InvalidBlockNumber when that block would not be below
 *              totalBlockNum.
 *
 * Parameters: null
 *
 * Returns: void
 */
void BaseTableSample::bernoulli_nextsampleblock()
{
    if (currentBlock + 1 >= totalBlockNum) {
        /* Ran off the end of the relation. */
        currentBlock = InvalidBlockNumber;
        return;
    }

    currentBlock++;
}

/*
 * Description: Get random next block.
 * Parameters: null
 * Returns: void
 */
void BaseTableSample::bernoulli_nextsampletuple()
{
    OffsetNumber tupoffset = currentOffset;

    /* Advance to first/next tuple in block */
    if (tupoffset == InvalidOffsetNumber) {
        tupoffset = FirstOffsetNumber;
    } else {
        tupoffset++;
    }

    /*
     * Loop over tuple offsets until finding suitable TID or reaching end of
     * block.
     */
    for (; tupoffset <= curBlockMaxoffset; tupoffset++) {
        if (sample_random_fract() < percent[BERNOULLI_SAMPLE]) {
            break;
        }
    }

    if (tupoffset > curBlockMaxoffset) {
        tupoffset = InvalidOffsetNumber;
    }

    currentOffset = tupoffset;
}

/*
 * Description: Initialize the common table sample state: evaluate the
 *              arguments and the seed, then choose the block-level and
 *              tuple-level selection functions for the sampling method.
 *
 * Parameters:
 *	@in scanstate: ScanState information (a CStoreScanState when ps.vectorized is set)
 *
 * Returns: void
 */
BaseTableSample::BaseTableSample(void* scanstate)
    : runState(GETMAXBLOCK),
      totalBlockNum(0),
      currentBlock(InvalidBlockNumber),
      currentOffset(InvalidOffsetNumber),
      curBlockMaxoffset(InvalidOffsetNumber),
      finished(false)
{
    TableSampleType sampleType;
    bool vectorized = ((ScanState*)scanstate)->ps.vectorized;

    sampleScanState = (ScanState*)scanstate;
    sampleType = ((ScanState*)scanstate)->sampleScanInfo.sampleType;
    getPercent();
    getSeed();

    /*
     * Keep the vectorized view of the scan state only for vectorized plans.
     * Set it to NULL otherwise so the member never holds an indeterminate value.
     */
    if (vectorized) {
        vecsampleScanState = (CStoreScanState*)scanstate;
    } else {
        vecsampleScanState = NULL;
    }

    /*
     * A HYBRID sample whose block-level or tuple-level percentage equals
     * MAX_PERCENT_ARG degenerates into a pure BERNOULLI or pure SYSTEM sample.
     * The percentages that are used are divided by MAX_PERCENT_ARG here.
     */
    if ((sampleType == BERNOULLI_SAMPLE) ||
        (sampleType == HYBRID_SAMPLE && percent[SYSTEM_SAMPLE] == MAX_PERCENT_ARG)) {
        /* A plain BERNOULLI sample carries its single argument in percent[0]. */
        percent[BERNOULLI_SAMPLE] = (sampleType == BERNOULLI_SAMPLE) ? (percent[0] / MAX_PERCENT_ARG)
                                                                     : (percent[BERNOULLI_SAMPLE] / MAX_PERCENT_ARG);
        nextSampleBlock_function = &BaseTableSample::bernoulli_nextsampleblock;
        nextSampleTuple_function = &BaseTableSample::bernoulli_nextsampletuple;
    } else if ((sampleType == SYSTEM_SAMPLE) ||
               (sampleType == HYBRID_SAMPLE && percent[BERNOULLI_SAMPLE] == MAX_PERCENT_ARG)) {
        percent[SYSTEM_SAMPLE] =
            (sampleType == SYSTEM_SAMPLE) ? (percent[0] / MAX_PERCENT_ARG) : (percent[SYSTEM_SAMPLE] / MAX_PERCENT_ARG);
        nextSampleBlock_function = &BaseTableSample::system_nextsampleblock;
        nextSampleTuple_function = &BaseTableSample::system_nextsampletuple;
    } else {
        /* True hybrid: random block selection combined with random tuple selection. */
        Assert(sampleType == HYBRID_SAMPLE);
        percent[SYSTEM_SAMPLE] = percent[SYSTEM_SAMPLE] / MAX_PERCENT_ARG;
        percent[BERNOULLI_SAMPLE] = percent[BERNOULLI_SAMPLE] / MAX_PERCENT_ARG;
        nextSampleBlock_function = &BaseTableSample::system_nextsampleblock;
        nextSampleTuple_function = &BaseTableSample::bernoulli_nextsampletuple;
    }
    scanTupState = 0;
}

/* Drop the references held by the sampler; nothing is freed here. */
BaseTableSample::~BaseTableSample()
{
    percent = NULL;
    vecsampleScanState = NULL;
    sampleScanState = NULL;
}

/*
 * Description: Put the sampler back into its initial state so that the next
 *              scan call starts again from GETMAXBLOCK.
 *
 * Parameters: null
 *
 * Returns: void
 */
void BaseTableSample::resetSampleScan()
{
    /* Position within the relation. */
    currentBlock = InvalidBlockNumber;
    currentOffset = InvalidOffsetNumber;
    curBlockMaxoffset = InvalidOffsetNumber;
    totalBlockNum = 0;

    /* State machine control. */
    finished = false;
    runState = GETMAXBLOCK;
}

/*
 * Description: Build the sampler for a row-store relation.
 *
 * Parameters:
 *	@in scanstate: ScanState information, forwarded to BaseTableSample
 *
 * Returns: void
 */
RowTableSample::RowTableSample(ScanState* scanstate)
    : BaseTableSample(scanstate)
{
    /* All state lives in BaseTableSample. */
}

RowTableSample::~RowTableSample()
{
    /* Nothing to release beyond what ~BaseTableSample() handles. */
}

/*
 * Description: Get max offset for current block.
 *
 * Parameters: null
 *
 * Returns: void
 */
void RowTableSample::getMaxOffset()
{
    TableScanDesc tablescan = NULL;
    TableScanDesc scan = sampleScanState->ss_currentScanDesc;
    bool pagemode = GetTableScanDesc(scan, sampleScanState->ss_currentRelation)->rs_pageatatime;
    Page page;

    Assert(BlockNumberIsValid(currentBlock));

    if (scanTupState == NEWBLOCK) {
        tableam_scan_getpage(GetTableScanDesc(scan, sampleScanState->ss_currentRelation), currentBlock);
    }

    /*
     * When not using pagemode, we must lock the buffer during tuple
     * visibility checks.
     */
    tablescan = GetTableScanDesc(scan, sampleScanState->ss_currentRelation);
    if (!pagemode) {
        LockBuffer(tablescan->rs_cbuf, BUFFER_LOCK_SHARE);
    }

    page = (Page)BufferGetPage(tablescan->rs_cbuf);
    curBlockMaxoffset = PageGetMaxOffsetNumber(page);

    /* Found visible tuple, return it. */
    if (!pagemode) {
        LockBuffer(tablescan->rs_cbuf, BUFFER_LOCK_UNLOCK);
    }
}

/*
 * Description: Examine the tuple at (currentBlock, currentOffset) and report
 *              whether it can be returned.
 *
 *              Outside pagemode the buffer content lock is taken and released
 *              inside this function on every path, so no lock is held on
 *              return whatever the result is.
 *
 * Parameters: null
 *
 * Returns: ScanValid
 *	VALIDDATA      - rs_ctup holds a visible tuple
 *	NEXTDATA       - item is not a normal item or not visible; try the next offset
 *	INVALIDOFFSET  - current block is exhausted; pick the next block
 *	INVALIDBLOCKNO - no block left; the scan position has been cleared
 */
ScanValid RowTableSample::scanTup()
{
    TableScanDesc scan = GetTableScanDesc(sampleScanState->ss_currentScanDesc, sampleScanState->ss_currentRelation);
    bool pagemode = scan->rs_pageatatime;
    HeapTuple tuple = &(((HeapScanDesc)scan)->rs_ctup);
    Snapshot snapshot = scan->rs_snapshot;
    ItemId itemid;
    Page page;
    bool all_visible = false;
    bool visible = false;
    ScanValid result = NEXTDATA;

    if (scanTupState == NEWBLOCK) {
        if (BlockNumberIsValid(currentBlock)) {
            /*
             * Report our new scan position for synchronization purposes.
             *
             * Note: we do this before checking for end of scan so that the
             * final state of the position hint is back at the start of the
             * rel.  That's not strictly necessary, but otherwise when you run
             * the same query multiple times the starting position would shift
             * a little bit backwards on every invocation, which is confusing.
             * We don't guarantee any specific ordering in general, though.
             */
            if (scan->rs_syncscan) {
                ss_report_location(scan->rs_rd, currentBlock);
            }
        } else {
            /* No more blocks: drop the pinned buffer and clear the scan position. */
            if (scan->rs_inited) {
                if (BufferIsValid(scan->rs_cbuf)) {
                    ReleaseBuffer(scan->rs_cbuf);
                }
                scan->rs_cbuf = InvalidBuffer;
                scan->rs_cblock = InvalidBlockNumber;
                scan->rs_inited = false;
            }

            tuple->t_data = NULL;

            return INVALIDBLOCKNO;
        }

        if (!scan->rs_inited) {
            scan->rs_inited = true;
        }

        scanTupState = NONEWBLOCK;
    }

    Assert(currentBlock < scan->rs_nblocks);

    /*
     * The current block has been read completely; it is time to move to the
     * next one. No buffer lock is held at this point, so there is nothing to
     * release.
     */
    if (currentOffset == InvalidOffsetNumber) {
        return INVALIDOFFSET;
    }

    /*
     * When not using pagemode, we must lock the buffer during tuple
     * visibility checks.
     */
    if (!pagemode) {
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
    }

    page = (Page)BufferGetPage(scan->rs_cbuf);
    all_visible = PageIsAllVisible(page) && !(snapshot->takenDuringRecovery);

    /* Only normal item pointers are examined; anything else yields NEXTDATA. */
    itemid = PageGetItemId(page, currentOffset);
    if (ItemIdIsNormal(itemid)) {
        tuple->t_data = (HeapTupleHeader)PageGetItem(page, itemid);
        tuple->t_len = ItemIdGetLength(itemid);
        HeapTupleCopyBaseFromPage(tuple, page);
        ItemPointerSet(&(tuple->t_self), currentBlock, currentOffset);

        if (all_visible) {
            visible = true;
        } else {
            BufferDesc* bufHdr = GetBufferDescriptor(scan->rs_cbuf - 1);
            bool isTmpLock = false;

            /* In pagemode the content lock is not held yet; take it just for the check. */
            if (!LWLockHeldByMe(bufHdr->content_lock)) {
                LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
                isTmpLock = true;
            }

            visible = HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, scan->rs_cbuf);

            if (isTmpLock) {
                LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
            }
        }

        /* in pagemode, heapgetpage did this for us */
        if (!pagemode) {
            CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, scan->rs_cbuf, snapshot);
        }

        if (visible) {
            result = VALIDDATA;
        }
    }

    /* Release the lock taken above on every outcome, not only for visible tuples. */
    if (!pagemode) {
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
    }

    if (result == VALIDDATA) {
        /* Count successfully-fetched tuples as heap fetches */
        pgstat_count_heap_getnext(scan->rs_rd);

        elog(DEBUG2,
            "Get one tuple [currentBlock: %u, currentOffset: %u] for relation: %s on %s.",
            currentBlock,
            currentOffset,
            NameStr(scan->rs_rd->rd_rel->relname),
            g_instance.attr.attr_common.PGXCNodeName);
    }

    return result;
}

/*
 * Description: Return the next sampled tuple of a row table. The function is
 *              a resumable state machine driven by runState:
 *              GETMAXBLOCK -> GETBLOCKNO -> GETMAXOFFSET -> GETOFFSET -> GETDATA,
 *              looping back to GETOFFSET or GETBLOCKNO as scanTup() dictates.
 *
 * Parameters: null
 *
 * Returns: HeapTuple pointing at the scan's rs_ctup, or NULL when the scan is finished
 */
HeapTuple RowTableSample::scanSample()
{
    TableScanDesc scan = GetTableScanDesc(sampleScanState->ss_currentScanDesc, sampleScanState->ss_currentRelation);
    HeapTuple tuple = &(((HeapScanDesc)scan)->rs_ctup);

    if (finished == true) {
        return NULL;
    }

    /* Nothing can be sampled from an empty relation or with all-zero percentages. */
    bool nothingToSample = (scan->rs_nblocks == 0);
    if (!nothingToSample) {
        const TableSampleType kind = sampleScanState->sampleScanInfo.sampleType;

        if (kind == BERNOULLI_SAMPLE || kind == SYSTEM_SAMPLE) {
            nothingToSample = (percent[0] == 0);
        } else if (kind == HYBRID_SAMPLE) {
            nothingToSample = (percent[BERNOULLI_SAMPLE] == 0 && percent[SYSTEM_SAMPLE] == 0);
        }
    }
    if (nothingToSample) {
        Assert(!BufferIsValid(scan->rs_cbuf));
        tuple->t_data = NULL;
        return NULL;
    }

    for (;;) {
        CHECK_FOR_INTERRUPTS();

        switch (runState) {
            case GETMAXBLOCK: {
                /* Remember how many blocks the relation has. */
                totalBlockNum = scan->rs_nblocks;
                runState = GETBLOCKNO;
                elog(DEBUG2,
                    "Get %u blocks for relation: %s on %s.",
                    totalBlockNum,
                    NameStr(scan->rs_rd->rd_rel->relname),
                    g_instance.attr.attr_common.PGXCNodeName);
                break;
            }
            case GETBLOCKNO: {
                /* Let the sampling method choose the next block. */
                (this->*nextSampleBlock_function)();

                /* With no block left, go straight to GETDATA so scanTup() can end the scan. */
                if (!BlockNumberIsValid(currentBlock)) {
                    runState = GETDATA;
                } else {
                    runState = GETMAXOFFSET;
                }

                scanTupState = NEWBLOCK;
                break;
            }
            case GETMAXOFFSET: {
                getMaxOffset();
                runState = GETOFFSET;
                elog(DEBUG2,
                    "Get %d tuples in blockno: %u for relation: %s on %s.",
                    curBlockMaxoffset,
                    currentBlock,
                    NameStr(scan->rs_rd->rd_rel->relname),
                    g_instance.attr.attr_common.PGXCNodeName);
                break;
            }
            case GETOFFSET: {
                /* Let the sampling method choose the next offset in the block. */
                (this->*nextSampleTuple_function)();

                runState = GETDATA;
                break;
            }
            case GETDATA: {
                ScanValid outcome = scanTup();

                if (outcome == INVALIDBLOCKNO) {
                    /* Every block has been scanned. */
                    finished = true;
                    return NULL;
                }

                if (outcome == VALIDDATA) {
                    /* Resume with the next offset on the following call. */
                    runState = GETOFFSET;
                    return tuple;
                }

                if (outcome == NEXTDATA) {
                    runState = GETOFFSET;
                } else if (outcome == INVALIDOFFSET) {
                    runState = GETBLOCKNO;
                }
                break;
            }
            default: {
                break;
            }
        }
    }
}

/*
 * Description: Build the sampler for a column-store relation: allocate the
 *              per-batch offset buffer and the one-column "tids" VectorBatch
 *              used to fetch sampled rows.
 *
 * Parameters:
 *	@in scanstate: CStoreScanState information
 *
 * Returns: void
 */
ColumnTableSample::ColumnTableSample(CStoreScanState* scanstate)
    : BaseTableSample(scanstate), currentCuId(0), batchRowCount(0)
{
    /* palloc0 already returns zero-filled memory, so no separate clearing is needed. */
    offsetIds = (uint16*)palloc0(sizeof(uint16) * BatchMaxSize);

    /* Create new VectorBatch for construct tids to get sample VectorBatch. */
    TupleDesc tupdesc = CreateTemplateTupleDesc(1, false);
    TupleDescInitEntry(tupdesc, (AttrNumber)1, "tids", INT8OID, -1, 0);
    tids = New(CurrentMemoryContext) VectorBatch(CurrentMemoryContext, tupdesc);
}

/* Release the tids batch and the offset buffer owned by this sampler. */
ColumnTableSample::~ColumnTableSample()
{
    if (tids != NULL) {
        delete tids;
        tids = NULL;
    }

    if (offsetIds != NULL) {
        pfree_ext(offsetIds);
        offsetIds = NULL;
    }
}

/*
 * Description: Reset Vec Sample Scan parameter.
 *
 * Parameters: null
 *
 * Returns: void
 */
void ColumnTableSample::resetVecSampleScan()
{
    currentCuId = 0;
    batchRowCount = 0;

    /* Reset common parameters for table sample. */
    (((ColumnTableSample*)vecsampleScanState->sampleScanInfo.tsm_state)->resetSampleScan)();

    if (tids) {
        tids->Reset();
    }

    if (offsetIds) {
        errno_t rc = memset_s(offsetIds, sizeof(uint16) * BatchMaxSize, 0, sizeof(uint16) * BatchMaxSize);
        securec_check(rc, "", "");
    }
}

/*
 * Description: Get current block max offset.
 *
 * Parameters: null
 *
 * Returns: void
 */
void ColumnTableSample::getMaxOffset()
{
    CUDesc cu_desc;
    int fstColIdx = 0;
    Assert(BlockNumberIsValid(currentBlock));
    curBlockMaxoffset = InvalidOffsetNumber;

    /* If the first column has dropped, we should change the index of first column. */
    if (vecsampleScanState->ss_currentRelation->rd_att->attrs[0]->attisdropped) {
        fstColIdx = CStoreGetfstColIdx(vecsampleScanState->ss_currentRelation);
    }

    /*
     * Get CUDesc of column according to currentCuId.
     */
    if (vecsampleScanState->m_CStore->GetCUDesc(fstColIdx, currentCuId, &cu_desc, GetActiveSnapshot()) != true) {
        return;
    }

    /*
     * We try our best to keep the rules of acquiring tuples about row relations:
     * 1). ignore to sample tuples dead
     * 2). ignore to sample tuples recently dead
     * 3). ignore to sample tuples being inserted in progress by other transactions
     * 4). ignore to sample tuples being deleted in progress by our transactions
     * 5). ignore to sample tuples being deleted in progress by other transactions
     *  SnapshotNow can satisfy the rule 1) 2) 3) 4), so it's used here.
     */
    vecsampleScanState->m_CStore->GetCUDeleteMaskIfNeed(currentCuId, GetActiveSnapshot());

    /* Quit this loop quickly if all the tuples are dead in this CU unit. */
    if (vecsampleScanState->m_CStore->IsTheWholeCuDeleted(cu_desc.row_count)) {
        return;
    }

    curBlockMaxoffset = cu_desc.row_count;
}

/*
 * Description: Build one tid per collected offset (currentCuId + offsetIds[i],
 *              for the first batchRowCount entries) in the tids batch and
 *              fetch the corresponding rows with ScanByTids().
 *
 * Parameters:
 *	@in vbout: VectorBatch handed to ScanByTids() as its output batch
 *
 * Returns: void
 */
void ColumnTableSample::getBatchBySamples(VectorBatch* vbout)
{
    ScalarVector* tidColumn = tids->m_arr;
    tids->Reset();

    /* Encode (CuId, offset) into each cell of the tid column. */
    int row = 0;
    while (row < batchRowCount) {
        ItemPointer tid = (ItemPointer)&tidColumn->m_vals[row];

        /* Clear the whole cell first, then fill in the item pointer. */
        tidColumn->m_vals[row] = 0;

        /* Note that the item pointer offset starts from 1. */
        ItemPointerSet(tid, currentCuId, offsetIds[row]);
        row++;
    }
    tidColumn->m_rows = batchRowCount;
    tids->m_rows = tidColumn->m_rows;

    if (BatchIsNull(tids)) {
        return;
    }

    /* Fetch the rows addressed by the tids. */
    CStoreIndexScanState* tidScanState = makeNode(CStoreIndexScanState);
    tidScanState->m_indexOutAttrNo = 0;

    vecsampleScanState->m_CStore->ScanByTids(tidScanState, tids, vbout);
    vecsampleScanState->m_CStore->ResetLateRead();
}

/*
 * Description: Process the current offset of the current CU: collect it if the
 *              row is alive, and emit a batch when BatchMaxSize offsets have
 *              been collected or the CU is exhausted.
 *
 * Parameters:
 *	@in pOutBatch: VectorBatch that receives the sampled rows
 *
 * Returns: ScanValid
 *	VALIDDATA     - a full batch was written to pOutBatch
 *	NEXTDATA      - continue with the next offset
 *	INVALIDOFFSET - CU exhausted; pending offsets (if any) were flushed to
 *	                pOutBatch and batchRowCount is left for the caller to clear
 */
ScanValid ColumnTableSample::scanBatch(VectorBatch* pOutBatch)
{
    const size_t offsetBytes = sizeof(uint16) * BatchMaxSize;

    Assert(BlockNumberIsValid(currentBlock));

    /* The current CU has been read completely. */
    if (currentOffset == InvalidOffsetNumber) {
        if (batchRowCount > 0) {
            /* Flush what was collected from this CU before moving to the next one. */
            getBatchBySamples(pOutBatch);

            errno_t rc = memset_s(offsetIds, offsetBytes, 0, offsetBytes);
            securec_check(rc, "", "");
        }

        return INVALIDOFFSET;
    }

    /*
     * NOTE(review): currentOffset starts at FirstOffsetNumber; confirm that
     * IsDeadRow() expects the same offset base.
     */
    if (vecsampleScanState->m_CStore->IsDeadRow(currentCuId, (uint32)currentOffset)) {
        return NEXTDATA;
    }

    elog(DEBUG2,
        "Get one tuple [currentCuId: %u, currentOffset: %u] for relation: %s on %s.",
        currentCuId,
        currentOffset,
        NameStr(vecsampleScanState->ss_currentRelation->rd_rel->relname),
        g_instance.attr.attr_common.PGXCNodeName);

    /* Remember the offset; rows are fetched once a whole batch has been collected. */
    offsetIds[batchRowCount] = currentOffset;
    batchRowCount++;
    if (batchRowCount < BatchMaxSize) {
        return NEXTDATA;
    }

    getBatchBySamples(pOutBatch);

    batchRowCount = 0;
    errno_t rc = memset_s(offsetIds, offsetBytes, 0, offsetBytes);
    securec_check(rc, "", "");

    return VALIDDATA;
}

/*
 * Description: Get the next sample VectorBatch for a column table. The
 *              function is a resumable state machine driven by runState; it
 *              returns after scanBatch() has written a batch into pOutBatch,
 *              or when every CU has been visited.
 *
 * Parameters:
 *	@in pOutBatch: VectorBatch that receives the sampled rows
 *
 * Returns: void
 */
void ColumnTableSample::scanVecSample(VectorBatch* pOutBatch)
{
    /* Return without touching pOutBatch if the scan is finished or the percent value is 0. */
    if ((finished == true) || (vecsampleScanState->sampleScanInfo.sampleType == BERNOULLI_SAMPLE && percent[0] == 0) ||
        (vecsampleScanState->sampleScanInfo.sampleType == SYSTEM_SAMPLE && percent[0] == 0) ||
        (vecsampleScanState->sampleScanInfo.sampleType == HYBRID_SAMPLE && percent[BERNOULLI_SAMPLE] == 0 &&
            percent[SYSTEM_SAMPLE] == 0)) {
        return;
    }

    for (;;) {
        CHECK_FOR_INTERRUPTS();

        switch (runState) {
            case GETMAXBLOCK: {
                /* Get the number of CUs; it plays the role of the block count. */
                totalBlockNum = CStoreRelGetCUNumByNow((CStoreScanDesc)vecsampleScanState);
                runState = GETBLOCKNO;
                elog(DEBUG2,
                    "Get %u CUs for relation: %s on %s.",
                    totalBlockNum,
                    NameStr(vecsampleScanState->ss_currentRelation->rd_rel->relname),
                    g_instance.attr.attr_common.PGXCNodeName);
                break;
            }
            case GETBLOCKNO: {
                /* Get random or sequence CUId as current block. */
                (this->*nextSampleBlock_function)();

                if (!BlockNumberIsValid(currentBlock)) {
                    /* All blocks have been scanned. */
                    finished = true;
                    return;
                }

                /* Translate the zero-based block index into a CU id. */
                currentCuId = currentBlock + FirstCUID + 1;
                runState = GETMAXOFFSET;
                break;
            }
            case GETMAXOFFSET: {
                getMaxOffset();

                /* getMaxOffset() leaves InvalidOffsetNumber when this CU has nothing to sample. */
                if (InvalidOffsetNumber == curBlockMaxoffset) {
                    runState = GETBLOCKNO;
                } else {
                    runState = GETOFFSET;
                }

                elog(DEBUG2,
                    "Get %d tuples in CUNo: %u for relation: %s on %s.",
                    curBlockMaxoffset,
                    currentBlock,
                    NameStr(vecsampleScanState->ss_currentRelation->rd_rel->relname),
                    g_instance.attr.attr_common.PGXCNodeName);
                break;
            }
            case GETOFFSET: {
                /* Let the sampling method choose the next offset inside the CU. */
                (this->*nextSampleTuple_function)();

                runState = GETDATA;
                break;
            }
            case GETDATA: {
                ScanValid scanState = scanBatch(pOutBatch);

                switch (scanState) {
                    case VALIDDATA: {
                        /* A full batch was produced; resume with the next offset on the next call. */
                        runState = GETOFFSET;
                        return;
                    }
                    case NEXTDATA: {
                        runState = GETOFFSET;
                        break;
                    }
                    case INVALIDOFFSET: {
                        runState = GETBLOCKNO;

                        /*
                         * scanBatch() flushed any pending rows into pOutBatch when it
                         * returned INVALIDOFFSET; hand that partial batch back and
                         * start a fresh batch with the next CU.
                         */
                        if (batchRowCount > 0) {
                            batchRowCount = 0;
                            return;
                        }
                        break;
                    }
                    default: {
                        break;
                    }
                }
                break;
            }
            default: {
                break;
            }
        }
    }
}

/*
 * Return (gs_random() + 1) / (MAX_RANDOM_VALUE + 2) as a double; the +1 / +2
 * offsets keep the result away from both 0 and 1 for draws in
 * [0, MAX_RANDOM_VALUE].
 */
static double sample_random_fract(void)
{
    const double numerator = (double)gs_random() + 1;
    const double denominator = (double)MAX_RANDOM_VALUE + 2;

    return numerator / denominator;
}