doris/be/src/exec/partitioned_hash_table.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/exec/partitioned-hash-table.h
// and modified by Doris

#pragma once

#include <memory>
#include <vector>

#include "common/compiler_util.h"
#include "runtime/buffered_tuple_stream3.h"
#include "runtime/bufferpool/suballocator.h"
#include "runtime/tuple_row.h"
#include "util/bitmap.h"

namespace doris {

class Expr;
class ExprContext;
class PartitionedHashTable;
class RowDescriptor;
class RuntimeState;
class Tuple;
class TupleRow;

/// Linear or quadratic probing hash table implementation tailored to the usage pattern
/// for partitioned hash aggregation and hash joins. The hash table stores TupleRows and
/// allows for different exprs for insertions and finds. This is the pattern we use for
/// joins and aggregation where the input/build tuple row descriptor is different from the
/// find/probe descriptor. The implementation is designed to allow codegen for some paths.
//
/// In addition to the hash table there is also an accompanying hash table context that is
/// used for insertions and probes. For example, the hash table context stores evaluated
/// expr results for the current row being processed when possible into a contiguous
/// memory buffer. This allows for efficient hash computation.
//
/// The hash table does not support removes. The hash table is not thread safe.
/// The table is optimized for the partition hash aggregation and hash joins and is not
/// intended to be a generic hash table implementation. The API loosely mimics the
/// std::hashset API.
//
/// The data (rows) are stored in a BufferedTupleStream3. The basic data structure of this
/// hash table is a vector of buckets. The buckets (indexed by the mod of the hash)
/// contain a pointer to either the slot in the tuple-stream or in case of duplicate
/// values, to the head of a linked list of nodes that in turn contain a pointer to
/// tuple-stream slots. When inserting an entry we start at the bucket at position
/// (hash % size) and search for either a bucket with the same hash or for an empty
/// bucket. If a bucket with the same hash is found, we then compare for row equality and
/// either insert a duplicate node if the equality is true, or continue the search if the
/// row equality is false. Similarly, when probing we start from the bucket at position
/// (hash % size) and search for an entry with the same hash or for an empty bucket.
/// In the former case, we then check for row equality and continue the search if the row
/// equality is false. In the latter case, the probe is not successful. When growing the
/// hash table, the number of buckets is doubled. We trigger a resize when the fill
/// factor is approx 75%. Due to the doubling nature of the buckets, we require that the
/// number of buckets is a power of 2. This allows us to perform a modulo of the hash
/// using a bitmask.
///
/// We choose to use linear or quadratic probing because they exhibit good (predictable)
/// cache behavior.
///
/// The first NUM_SMALL_BLOCKS of nodes_ are made of blocks less than the IO size (of 8MB)
/// to reduce the memory footprint of small queries.
///
/// TODO: Compare linear and quadratic probing and remove the loser.
/// TODO: We currently use 32-bit hashes. There is room in the bucket structure for at
/// least 48-bits. We should exploit this space.
/// TODO: Consider capping the probes with a threshold value. If an insert reaches
/// that threshold it is inserted to another linked list of overflow entries.
/// TODO: Smarter resizes, and perhaps avoid using powers of 2 as the hash table size.
/// TODO: this is not a fancy hash table in terms of memory access patterns
/// (cuckoo-hashing or something that spills to disk). We will likely want to invest
/// more time into this.
/// TODO: hash-join and aggregation have very different access patterns.  Joins insert all
/// the rows and then calls scan to find them.  Aggregation interleaves FindProbeRow() and
/// Inserts().  We may want to optimize joins more heavily for Inserts() (in particular
/// growing).
/// TODO: Batched interface for inserts and finds.
/// TODO: Do we need to check mem limit exceeded so often. Check once per batch?
/// TODO: as an optimization, compute variable-length data size for the agg node.

/// Control block for a hash table. This class contains the logic as well as the variables
/// needed by a thread to operate on a hash table.
class PartitionedHashTableCtx {
public:
    /// Create a hash table context with the specified parameters, invoke Init() to
    /// initialize the new hash table context and return it in 'ht_ctx'. Expression
    /// evaluators for the build and probe expressions will also be allocated.
    /// Please see the comments on the PartitionedHashTableCtx constructor and Init() for
    /// details of the other parameters.
    static Status Create(ObjectPool* pool, RuntimeState* state,
                         const std::vector<Expr*>& build_exprs,
                         const std::vector<Expr*>& probe_exprs, bool stores_nulls,
                         const std::vector<bool>& finds_nulls, int32_t initial_seed, int max_levels,
                         int num_build_tuples, MemPool* mem_pool, MemPool* expr_results_pool,
                         const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe,
                         std::unique_ptr<PartitionedHashTableCtx>* ht_ctx);

    /// Initialize the build and probe expression evaluators.
    Status Open(RuntimeState* state);

    /// Call to cleanup any resources allocated by the expression evaluators.
    void Close(RuntimeState* state);

    /// Free local allocations made by build and probe expression evaluators respectively.
    void FreeBuildLocalAllocations();
    void FreeProbeLocalAllocations();

    /// Free local allocations of both build and probe expression evaluators.
    void FreeLocalAllocations();

    void set_level(int level);

    /// Returns the level this context is currently working on (see set_level()).
    int ALWAYS_INLINE level() const { return level_; }

    /// Returns the hash seed to use for 'level'. 'seeds_' is indexed by level and is
    /// accessed with the bounds-checked std::vector::at(), which throws
    /// std::out_of_range if 'level' is not a valid index. Const-qualified because it
    /// only reads 'seeds_', so it can also be called on a const context.
    uint32_t ALWAYS_INLINE seed(int level) const { return seeds_.at(level); }

    /// Returns the scratch buffer used to generate rows on the fly.
    TupleRow* ALWAYS_INLINE scratch_row() const { return scratch_row_; }

    /// Returns a pointer to the cached result of the expression at 'expr_idx' for the
    /// current row of the ExprValuesCache. The pointed-to value is invalid if the
    /// expression evaluated to nullptr (see ExprValueNull()).
    /// TODO: this is an awkward abstraction but aggregation node can take advantage of
    /// it and save some expr evaluation calls.
    void* ALWAYS_INLINE ExprValue(int expr_idx) const {
        uint8_t* const row_values = expr_values_cache_.cur_expr_values();
        return expr_values_cache_.ExprValuePtr(row_values, expr_idx);
    }

    /// Returns true if the expression at 'expr_idx' evaluated to nullptr for the current
    /// row, i.e. if its null indicator byte in the ExprValuesCache is non-zero.
    bool ALWAYS_INLINE ExprValueNull(int expr_idx) const {
        const uint8_t* null_bytes = expr_values_cache_.cur_expr_values_null();
        return static_cast<bool>(null_bytes[expr_idx]);
    }

    /// Evaluate and hash the build/probe row, saving the evaluation to the current row of
    /// the ExprValuesCache in this hash table context: the results are saved in
    /// 'cur_expr_values_', the nullness of expressions values in 'cur_expr_values_null_',
    /// and the hashed expression values in 'cur_expr_values_hash_'. Returns false if this
    /// row should be rejected  (doesn't need to be processed further) because it contains
    /// nullptr. These need to be inlined in the IR module so we can find and replace the
    /// calls to EvalBuildRow()/EvalProbeRow().
    bool EvalAndHashBuild(TupleRow* row);
    bool EvalAndHashProbe(TupleRow* row);

    /// Struct that returns the number of constants replaced by ReplaceConstants().
    /// Presumably each field holds the replacement count for the hash table constant it
    /// is named after.
    /// NOTE(review): ReplaceConstants() is not declared in the visible part of this
    /// header; confirm it (and this struct) are still used in the Doris port.
    struct HashTableReplacedConstants {
        int stores_nulls;
        int finds_some_nulls;
        int stores_tuples;
        int stores_duplicates;
        int quadratic_probing;
    };

    /// To enable prefetching, the hash table building and probing are pipelined by the
    /// exec nodes. A set of rows in a row batch will be evaluated and hashed first and
    /// the corresponding hash table buckets are prefetched before they are probed against
    /// the hash table. ExprValuesCache is a container for caching the results of
    /// expressions evaluations for the rows in a prefetch set to avoid re-evaluating the
    /// rows again during probing. Expressions evaluation can be very expensive.
    ///
    /// The expression evaluation results are cached in the following data structures:
    ///
    /// - 'expr_values_array_' is an array caching the results of the rows
    /// evaluated against either the build or probe expressions. 'cur_expr_values_'
    /// is a pointer into this array.
    /// - 'expr_values_null_array_' is an array caching the nullness of each evaluated
    /// expression in each row. 'cur_expr_values_null_' is a pointer into this array.
    /// - 'expr_values_hash_array_' is an array of cached hash values of the rows.
    /// 'cur_expr_values_hash_' is a pointer into this array.
    /// - 'null_bitmap_' is a bitmap which indicates rows evaluated to nullptr.
    ///
    /// ExprValuesCache provides an iterator like interface for performing a write pass
    /// followed by a read pass. We refrain from providing an interface for random accesses
    /// as there isn't a use case for it now and we want to avoid expensive multiplication
    /// as the buffer size of each row is not necessarily power of two:
    /// - Reset(), ResetForRead(): reset the iterators before writing / reading cached values.
    /// - NextRow(): moves the iterators to point to the next row of cached values.
    /// - AtEnd(): returns true if all cached rows have been read. Valid in read mode only.
    ///
    /// Various metadata information such as layout of results buffer is also stored in
    /// this class. Note that the result buffer doesn't store variable length data. It only
    /// contains pointers to the variable length data (e.g. if an expression value is a
    /// StringValue).
    ///
    class ExprValuesCache {
    public:
        ExprValuesCache();

        /// Allocates memory and initializes various data structures. Return error status
        /// if memory allocation leads to the memory limits of the exec node to be exceeded.
        Status Init(RuntimeState* state, const std::vector<Expr*>& build_exprs);

        /// Frees up various resources.
        void Close();

        /// Resets the cache states (iterators, end pointers etc) before writing.
        void Reset() noexcept;

        /// Resets the iterators to the start before reading. Will record the current position
        /// of the iterators in end pointer before resetting so AtEnd() can determine if all
        /// cached values have been read.
        void ResetForRead();

        /// Advances the iterators to the next row by moving to the next entries in the
        /// arrays of cached values.
        void ALWAYS_INLINE NextRow();

        /// Compute the total memory usage of this ExprValuesCache.
        static int MemUsage(int capacity, int results_buffer_size, int num_build_exprs);

        /// Returns the maximum number rows of expression values states which can be cached.
        int ALWAYS_INLINE capacity() const { return capacity_; }

        /// Returns the total size in bytes of a row of evaluated expressions' values.
        int ALWAYS_INLINE expr_values_bytes_per_row() const { return expr_values_bytes_per_row_; }

        /// Returns the offset into the result buffer of the first variable length
        /// data results.
        int ALWAYS_INLINE var_result_offset() const { return var_result_offset_; }

        /// Returns true if the current read pass is complete, meaning all cached values
        /// have been read.
        bool ALWAYS_INLINE AtEnd() const {
            return cur_expr_values_hash_ == cur_expr_values_hash_end_;
        }

        /// Returns true if the current row is null but nulls are not considered in the current
        /// phase (build or probe).
        bool ALWAYS_INLINE IsRowNull() const { return null_bitmap_.Get(CurIdx()); }

        /// Record in a bitmap that the current row is null but nulls are not considered in
        /// the current phase (build or probe).
        void ALWAYS_INLINE SetRowNull() { null_bitmap_.Set(CurIdx(), true); }

        /// Returns the hash values of the current row.
        uint32_t ALWAYS_INLINE CurExprValuesHash() const { return *cur_expr_values_hash_; }

        /// Sets the hash values for the current row.
        void ALWAYS_INLINE SetCurExprValuesHash(uint32_t hash) { *cur_expr_values_hash_ = hash; }

        /// Returns a pointer to the expression value at 'expr_idx' in 'expr_values', i.e.
        /// 'expr_values' advanced by the offset recorded for that expression in
        /// 'expr_values_offsets_'. The offsets are byte offsets, so T is expected to be a
        /// byte pointer type (e.g. uint8_t* or const uint8_t*). 'expr_idx' is not
        /// bounds-checked.
        template <typename T>
        T ExprValuePtr(T expr_values, int expr_idx) const {
            return expr_values + expr_values_offsets_[expr_idx];
        }

        /// Returns the current row's expression buffer. The expression values in the buffer
        /// are accessed using ExprValuePtr().
        uint8_t* ALWAYS_INLINE cur_expr_values() const { return cur_expr_values_; }

        /// Returns null indicator bytes for the current row, one per expression. Non-zero
        /// bytes mean nullptr, zero bytes mean non-nullptr. Indexed by the expression index.
        /// These are uint8_t instead of bool to simplify codegen with IRBuilder.
        /// TODO: is there actually a valid reason why this is necessary for codegen?
        uint8_t* ALWAYS_INLINE cur_expr_values_null() const { return cur_expr_values_null_; }

        /// Returns the offset into the results buffer of the expression value at 'expr_idx'.
        int ALWAYS_INLINE expr_values_offsets(int expr_idx) const {
            return expr_values_offsets_[expr_idx];
        }

    private:
        friend class PartitionedHashTableCtx;

        /// Resets the iterators to the beginning of the cache values' arrays.
        void ResetIterators();

        /// Returns the index, in rows, of the current row within the cached values'
        /// buffers. Computed as the distance of the hash iterator from the start of
        /// 'expr_values_hash_array_'.
        int ALWAYS_INLINE CurIdx() const {
            const uint32_t* const first_hash = expr_values_hash_array_.get();
            return static_cast<int>(cur_expr_values_hash_ - first_hash);
        }

        /// Max amount of memory in bytes for caching evaluated expression values.
        static const int MAX_EXPR_VALUES_ARRAY_SIZE = 256 << 10;

        /// Maximum number of rows of expressions evaluation states which this
        /// ExprValuesCache can cache.
        int capacity_;

        /// Byte size of a row of evaluated expression values. Never changes once set,
        /// can be used for constant substitution during codegen.
        int expr_values_bytes_per_row_;

        /// Number of build/probe expressions.
        int num_exprs_;

        /// Pointer into 'expr_values_array_' for the current row's expression values.
        uint8_t* cur_expr_values_;

        /// Pointer into 'expr_values_null_array_' for the current row's nullness of each
        /// expression value.
        uint8_t* cur_expr_values_null_;

        /// Pointer into 'expr_values_hash_array_' for the hash value of the current row's
        /// expression values.
        uint32_t* cur_expr_values_hash_;

        /// Pointer to the buffer one beyond the end of the last entry of cached expressions'
        /// hash values.
        uint32_t* cur_expr_values_hash_end_;

        /// Array for caching up to 'capacity_' number of rows worth of evaluated expression
        /// values. Each row consumes 'expr_values_bytes_per_row_' number of bytes.
        std::unique_ptr<uint8_t[]> expr_values_array_;

        /// Array for caching up to 'capacity_' number of rows worth of null booleans.
        /// Each row contains 'num_exprs_' booleans to indicate nullness of expression values.
        /// Used when the hash table supports nullptr. Use 'uint8_t' to guarantee each entry is 1
        /// byte as sizeof(bool) is implementation dependent. The IR depends on this
        /// assumption.
        std::unique_ptr<uint8_t[]> expr_values_null_array_;

        /// Array for caching up to 'capacity_' number of rows worth of hashed values.
        std::unique_ptr<uint32_t[]> expr_values_hash_array_;

        /// One bit for each row. A bit is set if that row is not hashed as it's evaluated
        /// to nullptr but the hash table doesn't support nullptr. Such rows may still be included
        /// in outputs for certain join types (e.g. left anti joins).
        Bitmap null_bitmap_;

        /// Maps from expression index to the byte offset into a row of expression values.
        /// One entry per build/probe expression.
        std::vector<int> expr_values_offsets_;

        /// Byte offset into 'cur_expr_values_' that begins the variable length results for
        /// a row. If -1, there are no variable length slots. Never changes once set, can be
        /// constant substituted with codegen.
        int var_result_offset_;
    };

    /// Returns a pointer to the ExprValuesCache embedded in this context.
    ExprValuesCache* ALWAYS_INLINE expr_values_cache() { return &expr_values_cache_; }

private:
    friend class PartitionedAggregationNode;
    friend class PartitionedHashTable;
    friend class HashTableTest_HashEmpty_Test;

    /// Construct a hash table context.
    ///  - build_exprs are the exprs that should be used to evaluate rows during Insert().
    ///  - probe_exprs are used during FindProbeRow()
    ///  - stores_nulls: if false, TupleRows with nulls are ignored during Insert
    ///  - finds_nulls: if finds_nulls[i] is false, FindProbeRow() returns End() for
    ///        TupleRows with nulls in position i even if stores_nulls is true.
    ///  - initial_seed: initial seed value to use when computing hashes for rows with
    ///        level 0. Other levels have their seeds derived from this seed.
    ///  - max_levels: the max levels we will hash with.
    ///  - mem_pool: the MemPool which the expression evaluators allocate from. Owned by the
    ///        exec node which owns this hash table context. Memory usage of the expression
    ///        value cache is charged against its MemTracker.
    ///
    /// TODO: stores_nulls is too coarse: for a hash table in which some columns are joined
    ///       with '<=>' and others with '=', stores_nulls could distinguish between columns
    ///       in which nulls are stored and columns in which they are not, which could save
    ///       space by not storing some rows we know will never match.
    PartitionedHashTableCtx(const std::vector<Expr*>& build_exprs,
                            const std::vector<Expr*>& probe_exprs, bool stores_nulls,
                            const std::vector<bool>& finds_nulls, int32_t initial_seed,
                            int max_levels, MemPool* mem_pool, MemPool* expr_results_pool);

    /// Allocate various buffers for storing expression evaluation results, hash values,
    /// null bits etc. Also allocate evaluators for the build and probe expressions and
    /// store them in 'pool'. Returns error if allocation causes query memory limit to
    /// be exceeded or the evaluators fail to initialize. 'num_build_tuples' is the number
    /// of tuples of a row in the build side, used for computing the size of a scratch row.
    Status Init(ObjectPool* pool, RuntimeState* state, int num_build_tuples,
                const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe);

    /// Compute the hash of the values in 'expr_values' with nullness 'expr_values_null'.
    /// This will be replaced by codegen.  We don't want this inlined for replacing
    /// with codegen'd functions so the function name does not change.
    uint32_t HashRow(const uint8_t* expr_values, const uint8_t* expr_values_null) const noexcept;

    /// Wrapper function for calling correct HashUtil function in non-codegen'd case.
    uint32_t Hash(const void* input, int len, uint32_t hash) const;

    /// Evaluates the build exprs over 'row', writing the values into 'expr_values' and
    /// the nullness into 'expr_values_null'. Returns EvalRow()'s result unchanged
    /// (documented there as whether any expr evaluated to nullptr). This will be replaced
    /// by codegen. We do not want this function inlined when cross compiled because we
    /// need to be able to differentiate between EvalBuildRow and EvalProbeRow by name and
    /// the build/probe exprs are baked into the codegen'd function.
    bool EvalBuildRow(TupleRow* row, uint8_t* expr_values, uint8_t* expr_values_null) noexcept {
        const bool eval_result = EvalRow(row, build_expr_evals_, expr_values, expr_values_null);
        return eval_result;
    }

    /// Evaluates the probe exprs over 'row', writing the values into 'expr_values' and
    /// the nullness into 'expr_values_null'. Returns EvalRow()'s result unchanged
    /// (documented there as whether any expr evaluated to nullptr). This will be replaced
    /// by codegen.
    bool EvalProbeRow(TupleRow* row, uint8_t* expr_values, uint8_t* expr_values_null) noexcept {
        const bool eval_result = EvalRow(row, probe_expr_evals_, expr_values, expr_values_null);
        return eval_result;
    }

    /// Compute the hash of the values in 'expr_values' with nullness 'expr_values_null'
    /// for a row with variable length fields (e.g. strings).
    uint32_t HashVariableLenRow(const uint8_t* expr_values, const uint8_t* expr_values_null) const;

    /// Evaluate the exprs over row, storing the values into 'expr_values' and nullness into
    /// 'expr_values_null'. Returns whether any expr evaluated to nullptr. This will be
    /// replaced by codegen.
    bool EvalRow(TupleRow* row, const std::vector<ExprContext*>& ctxs, uint8_t* expr_values,
                 uint8_t* expr_values_null) noexcept;

    /// Returns true if the values of build_exprs evaluated over 'build_row' equal the
    /// values in 'expr_values' with nullness 'expr_values_null'. FORCE_NULL_EQUALITY is
    /// true if all nulls should be treated as equal, regardless of the values of
    /// 'finds_nulls_'. This will be replaced by codegen.
    template <bool FORCE_NULL_EQUALITY>
    bool Equals(TupleRow* build_row, const uint8_t* expr_values,
                const uint8_t* expr_values_null) const noexcept;

    /// Convenience overload that calls the three-argument Equals() with the expression
    /// values and null indicators cached for the current row of the ExprValuesCache.
    /// Always inlined so that it does not appear in cross-compiled IR.
    template <bool FORCE_NULL_EQUALITY>
    bool ALWAYS_INLINE Equals(TupleRow* build_row) const {
        const uint8_t* cur_values = expr_values_cache_.cur_expr_values();
        const uint8_t* cur_nulls = expr_values_cache_.cur_expr_values_null();
        return Equals<FORCE_NULL_EQUALITY>(build_row, cur_values, cur_nulls);
    }

    /// Cross-compiled function to access member variables used in CodegenHashRow().
    uint32_t GetHashSeed() const;

    /// Functions to be replaced by codegen to specialize the hash table.
    bool stores_nulls() const { return stores_nulls_; }
    bool finds_some_nulls() const { return finds_some_nulls_; }

    const std::vector<Expr*>& build_exprs_;
    std::vector<ExprContext*> build_expr_evals_;

    const std::vector<Expr*>& probe_exprs_;
    std::vector<ExprContext*> probe_expr_evals_;

    /// Constants on how the hash table should behave. Joins and aggs have slightly
    /// different behavior.
    const bool stores_nulls_;
    const std::vector<bool> finds_nulls_;

    /// finds_some_nulls_ is just the logical OR of finds_nulls_.
    const bool finds_some_nulls_;

    /// The current level this context is working on. Each level needs to use a
    /// different seed.
    int level_;

    /// The seeds to use for hashing. Indexed by the level.
    std::vector<uint32_t> seeds_;

    /// The ExprValuesCache for caching expression evaluation results, null bytes and hash
    /// values for rows. Used to store results of batch evaluations of rows.
    ExprValuesCache expr_values_cache_;

    /// Scratch buffer to generate rows on the fly.
    TupleRow* scratch_row_;

    /// MemPool for 'build_expr_evals_' and 'probe_expr_evals_' to allocate expr-managed
    /// memory from. Not owned.
    MemPool* mem_pool_;

    /// MemPool for allocations made by EvalRow() to copy an expr's StringVal result. Not owned.
    MemPool* expr_results_pool_;
};

/// The hash table consists of a contiguous array of buckets that contain a pointer to the
/// data, the hash value and three flags: whether this bucket is filled, whether this
/// entry has been matched (used in right and full joins) and whether this entry has
/// duplicates. If there are duplicates, then the data is pointing to the head of a
/// linked list of duplicate nodes that point to the actual data. Note that the duplicate
/// nodes do not contain the hash value, because all the linked nodes have the same hash
/// value, the one in the bucket. The data is either a tuple stream index or a Tuple*.
/// This array of buckets is sparse, we are shooting for up to 3/4 fill factor (75%). The
/// data allocated by the hash table comes from the BufferPool.
class PartitionedHashTable {
private:
    /// Rows are represented as pointers into the BufferedTupleStream data with one
    /// of two formats, depending on the number of tuples in the row. The two members
    /// share storage; which one is valid depends on that tuple count.
    union HtData {
        // For rows with multiple tuples per row, a pointer to the flattened TupleRow.
        BufferedTupleStream3::FlatRowPtr flat_row;
        // For rows with a single tuple, a pointer to that tuple (see Insert()).
        Tuple* tuple;
    };

    /// Linked list of entries used for duplicates. A node does not store the hash value:
    /// all nodes in a list share the hash cached in the owning bucket.
    struct DuplicateNode {
        /// Used for full outer and right {outer, anti, semi} joins. Indicates whether the
        /// row in the DuplicateNode has been matched.
        /// From an abstraction point of view, this is an awkward place to store this
        /// information.
        /// TODO: Fold this flag in the next pointer below.
        bool matched;

        /// Chain to next duplicate node, nullptr when end of list.
        DuplicateNode* next;
        /// The row data this node refers to.
        HtData htdata;
    };

    /// One entry of the bucket array: the 'filled', 'matched' and 'hasDuplicates' flags,
    /// the cached hash value, and either the row data itself or the head of a linked
    /// list of DuplicateNodes.
    struct Bucket {
        /// Whether this bucket contains a valid entry, or it is empty.
        bool filled;

        /// Used for full outer and right {outer, anti, semi} joins. Indicates whether the
        /// row in the bucket has been matched.
        /// From an abstraction point of view, this is an awkward place to store this
        /// information but it is efficient. This space is otherwise unused.
        bool matched;

        /// Used in case of duplicates. If true, then the bucketData union should be used as
        /// 'duplicates'.
        bool hasDuplicates;

        /// Cache of the hash for data.
        /// TODO: Do we even have to cache the hash value?
        uint32_t hash;

        /// Either the data for this bucket or the linked list of duplicates.
        union {
            HtData htdata;
            DuplicateNode* duplicates;
        } bucketData;
    };

public:
    class Iterator;

    /// Returns a newly allocated HashTable. The probing algorithm is set by the
    /// FLAG_enable_quadratic_probing.
    ///  - allocator: allocator to allocate bucket directory and data pages from.
    ///  - stores_duplicates: true if rows with duplicate keys may be inserted into the
    ///    hash table.
    ///  - num_build_tuples: number of Tuples in the build tuple row.
    ///  - tuple_stream: the tuple stream which contains the tuple rows indexed by the
    ///    hash table. Can be nullptr if the rows contain only a single tuple, in which
    ///    case the 'tuple_stream' is unused.
    ///  - max_num_buckets: the maximum number of buckets that can be stored. If we
    ///    try to grow the number of buckets to a larger number, the inserts will fail.
    ///    -1 if it is unlimited.
    ///  - initial_num_buckets: number of buckets that the hash table should be initialized
    ///    with.
    static PartitionedHashTable* Create(Suballocator* allocator, bool stores_duplicates,
                                        int num_build_tuples, BufferedTupleStream3* tuple_stream,
                                        int64_t max_num_buckets, int64_t initial_num_buckets);

    /// Allocates the initial bucket structure. Returns a non-OK status if an error is
    /// encountered. If an OK status is returned, 'got_memory' is set to indicate whether
    /// enough memory for the initial buckets was allocated from the Suballocator.
    Status Init(bool* got_memory);

    /// Call to cleanup any resources. Must be called once.
    void Close();

    /// Inserts the row to the hash table. The caller is responsible for ensuring that the
    /// table has free buckets. Returns true if the insertion was successful. Always
    /// returns true if the table has free buckets and the key is not a duplicate. If the
    /// key was a duplicate and memory could not be allocated for the new duplicate node,
    /// returns false. If an error is encountered while creating a duplicate node, returns
    /// false and sets 'status' to the error.
    ///
    /// 'flat_row' is a pointer to the flattened row in 'tuple_stream_'. If the row contains
    /// only one tuple, a pointer to that tuple is stored. Otherwise the 'flat_row' pointer
    /// is stored. The 'row' is not copied by the hash table and the caller must guarantee
    /// it stays in memory. This will not grow the hash table.
    bool Insert(PartitionedHashTableCtx* ht_ctx, BufferedTupleStream3::FlatRowPtr flat_row,
                TupleRow* row, Status* status);

    /// Prefetch the hash table bucket which the given hash value 'hash' maps to.
    template <const bool READ>
    void PrefetchBucket(uint32_t hash);

    /// Returns an iterator to the bucket that matches the probe expression results that
    /// are cached at the current position of the ExprValuesCache in 'ht_ctx'. Assumes that
    /// the ExprValuesCache was filled using EvalAndHashProbe(). Returns HashTable::End()
    /// if no match is found. The iterator can be iterated until HashTable::End() to find
    /// all the matching rows. Advancing the returned iterator will go to the next matching
    /// row. The matching rows do not need to be evaluated since all the nodes of a bucket
    /// are duplicates. One scan can be in progress for each 'ht_ctx'. Used in the probe
    /// phase of hash joins.
    Iterator FindProbeRow(PartitionedHashTableCtx* ht_ctx);

    /// If a match is found in the table, return an iterator as in FindProbeRow(). If a
    /// match was not present, return an iterator pointing to the empty bucket where the key
    /// should be inserted. Returns End() if the table is full. The caller can set the data
    /// in the bucket using a Set*() method on the iterator.
    Iterator FindBuildRowBucket(PartitionedHashTableCtx* ht_ctx, bool* found);

    /// Returns number of elements inserted in the hash table
    int64_t size() const {
        return num_filled_buckets_ - num_buckets_with_duplicates_ + num_duplicate_nodes_;
    }

    /// Returns the number of empty buckets.
    int64_t EmptyBuckets() const { return num_buckets_ - num_filled_buckets_; }

    /// Returns the number of buckets
    int64_t num_buckets() const { return num_buckets_; }

    /// Returns the number of filled buckets
    int64_t num_filled_buckets() const { return num_filled_buckets_; }

    /// Returns the time of hash table resize
    int64_t num_resize() const { return num_resizes_; }

    /// Returns the number of bucket_with_duplicates
    int64_t num_buckets_with_duplicates() const { return num_buckets_with_duplicates_; }

    /// Returns the number of bucket_with_duplicates
    int64_t num_duplicates_nodes() const { return num_duplicate_nodes_; }

    /// Returns the number of probe operations
    int64_t num_probe() const {
        /// Debugging statistic: number of calls that probed the hash table.
        return num_probes_;
    }

    /// Returns the number of failed probe operations
    int64_t num_failed_probe() const {
        /// Debugging statistic: number of probes that failed.
        return num_failed_probes_;
    }

    /// Returns the total distance traveled by all probe operations
    int64_t travel_length() const {
        /// Debugging statistic: distance between each probe's start and end position,
        /// summed over all probes.
        return travel_length_;
    }

    /// Returns the load factor (the number of non-empty buckets / the total number of buckets)
    double load_factor() const { return static_cast<double>(num_filled_buckets_) / num_buckets_; }

    /// Return an estimate of the number of buckets (EstimateNumBuckets()) and of the
    /// number of bytes (EstimateSize()) needed to build the hash table structure for
    /// 'num_rows'. The number of buckets is rounded up to a power of two, and the
    /// estimate assumes that there are no duplicates.
    static int64_t EstimateNumBuckets(int64_t num_rows) {
        /// Budget 3/2 buckets per row, i.e. assume a max 66% fill factor and no
        /// duplicates, then round up to a power of two.
        const int64_t min_buckets = 3 * num_rows / 2;
        return BitUtil::next_power_of_two(min_buckets);
    }
    static int64_t EstimateSize(int64_t num_rows) {
        /// Bytes taken by the estimated number of buckets for 'num_rows'; only the bucket
        /// array is counted here.
        return EstimateNumBuckets(num_rows) * sizeof(Bucket);
    }

    /// Return the size of a hash table bucket in bytes.
    static int64_t BucketSize() {
        /// Per-bucket footprint of the bucket array, in bytes.
        return sizeof(Bucket);
    }

    /// Returns the memory occupied by the hash table, takes into account the number of
    /// duplicates.
    int64_t CurrentMemSize() const;

    /// Returns the number of inserts that can be performed before resizing the table.
    int64_t NumInsertsBeforeResize() const;

    /// Calculates the fill factor if 'buckets_to_fill' additional buckets were to be
    /// filled and resizes the hash table so that the projected fill factor is below the
    /// max fill factor.
    /// If 'got_memory' is true, then it is guaranteed that at least 'buckets_to_fill'
    /// additional buckets can be filled without need to resize. If there is not enough
    /// memory available to resize the hash table, Status::OK() is returned and
    /// 'got_memory' is false. If another error occurs, an error status may be returned.
    Status CheckAndResize(uint64_t buckets_to_fill, const PartitionedHashTableCtx* ht_ctx,
                          bool* got_memory);

    /// Returns the number of bytes allocated to the hash table from the block manager.
    int64_t ByteSize() const {
        /// Bucket array bytes plus the bytes of all data pages.
        const auto bucket_array_bytes = num_buckets_ * sizeof(Bucket);
        return bucket_array_bytes + total_data_page_size_;
    }

    /// Returns an iterator at the beginning of the hash table.  Advancing this iterator
    /// will traverse all elements.
    Iterator Begin(const PartitionedHashTableCtx* ht_ctx);

    /// Return an iterator pointing to the first element (Bucket or DuplicateNode, if the
    /// bucket has duplicates) in the hash table that does not have its matched flag set.
    /// Used in right joins and full-outer joins.
    Iterator FirstUnmatched(PartitionedHashTableCtx* ctx);

    /// Return true if there was at least one match.
    bool HasMatches() const {
        /// Simply reports the stored 'has_matches_' flag.
        return has_matches_;
    }

    /// Return end marker.
    Iterator End() {
        /// A default-constructed Iterator has its bucket index set to BUCKET_NOT_FOUND,
        /// which is what AtEnd() checks for.
        return Iterator {};
    }

    /// Dump out the entire hash table to string.  If 'skip_empty', empty buckets are
    /// skipped.  If 'show_match', it also prints the matched flag of each node. If
    /// 'build_desc' is non-null, the build rows will be printed. Otherwise, only the
    /// addresses of the build rows will be printed.
    std::string DebugString(bool skip_empty, bool show_match, const RowDescriptor* build_desc);

    /// Print the content of a bucket or node.
    void DebugStringTuple(std::stringstream& ss, HtData& htdata, const RowDescriptor* desc);

    /// Update and print some statistics that can be used for performance debugging.
    std::string PrintStats() const;

    /// Number of hash collisions so far in the lifetime of this object
    int64_t NumHashCollisions() const {
        /// Counts the cases where buckets with the same hash value were compared but the
        /// row equality failed.
        return num_hash_collisions_;
    }

    /// stl-like iterator interface.
    class Iterator {
    private:
        /// Bucket index value when probe is not successful.
        static const int64_t BUCKET_NOT_FOUND = -1;

    public:
        /// Constructs an end iterator: no table, no scratch row, no duplicate node and the
        /// bucket index set to BUCKET_NOT_FOUND, so that AtEnd() returns true.
        Iterator()
                : table_(nullptr),
                  scratch_row_(nullptr),
                  bucket_idx_(BUCKET_NOT_FOUND),
                  node_(nullptr) {}

        /// Iterates to the next element. It should be called only if !AtEnd().
        void Next();

        /// Iterates to the next duplicate node. If the bucket does not have duplicates or
        /// when it reaches the last duplicate node, then it moves the Iterator to AtEnd().
        /// Used when we want to iterate over all the duplicate nodes bypassing the Next()
        /// interface (e.g. in semi/outer joins without other_join_conjuncts, in order to
        /// iterate over all nodes of an unmatched bucket).
        void NextDuplicate();

        /// Iterates to the next element that does not have its matched flag set. Used in
        /// right-outer and full-outer joins.
        void NextUnmatched();

        /// Return the current row or tuple. Callers must check the iterator is not AtEnd()
        /// before calling them.  The returned row is owned by the iterator and valid until
        /// the next call to GetRow(). It is safe to advance the iterator.
        TupleRow* GetRow() const;
        Tuple* GetTuple() const;

        /// Set the current tuple for an empty bucket. Designed to be used with the iterator
        /// returned from FindBuildRowBucket() in the case when the value is not found.  It is
        /// not valid to call this function if the bucket already has an entry.
        void SetTuple(Tuple* tuple, uint32_t hash);

        /// Sets as matched the Bucket or DuplicateNode currently pointed by the iterator,
        /// depending on whether the bucket has duplicates or not. The iterator cannot be
        /// AtEnd().
        void SetMatched();

        /// Returns the 'matched' flag of the current Bucket or DuplicateNode, depending on
        /// whether the bucket has duplicates or not. It should be called only if !AtEnd().
        bool IsMatched() const;

        /// Resets everything but the pointer to the hash table.
        void SetAtEnd();

        /// Returns true if this iterator is at the end, i.e. GetRow() cannot be called.
        bool ALWAYS_INLINE AtEnd() const { return bucket_idx_ == BUCKET_NOT_FOUND; }

        /// Prefetch the hash table bucket which the iterator is pointing to now.
        template <const bool READ>
        void PrefetchBucket();

    private:
        friend class PartitionedHashTable;

        /// Constructs an iterator over 'table' positioned at bucket 'bucket_idx' and
        /// duplicate node 'node', using 'row' as the scratch row (not owned).
        /// 'bucket_idx' is taken as int64_t to match 'bucket_idx_' and the 64-bit bucket
        /// indices used by the hash table (e.g. the value returned by Probe()); a plain
        /// int parameter would silently truncate indices that do not fit in 32 bits.
        ALWAYS_INLINE
        Iterator(PartitionedHashTable* table, TupleRow* row, int64_t bucket_idx,
                 DuplicateNode* node)
                : table_(table), scratch_row_(row), bucket_idx_(bucket_idx), node_(node) {}

        /// Hash table this iterator walks over. Null for a default-constructed iterator.
        PartitionedHashTable* table_;

        /// Scratch buffer to hold generated rows. Not owned.
        TupleRow* scratch_row_;

        /// Current bucket idx. BUCKET_NOT_FOUND when the iterator is at the end.
        int64_t bucket_idx_;

        /// Pointer to the current duplicate node.
        DuplicateNode* node_;
    };

private:
    friend class Iterator;
    friend class HashTableTest;

    /// Hash table constructor. Private because Create() should be used, instead
    /// of calling this constructor directly.
    ///  - quadratic_probing: set to true when the probing algorithm is quadratic, as
    ///    opposed to linear.
    PartitionedHashTable(bool quadratic_probing, Suballocator* allocator, bool stores_duplicates,
                         int num_build_tuples, BufferedTupleStream3* tuple_stream,
                         int64_t max_num_buckets, int64_t initial_num_buckets);

    /// Performs the probing operation according to the probing algorithm (linear or
    /// quadratic). Returns one of the following:
    /// (a) the index of the bucket that contains the entry that matches with the last row
    ///     evaluated in 'ht_ctx'. If 'ht_ctx' is nullptr then it does not check for row
    ///     equality and returns the index of the first empty bucket.
    /// (b) the index of the first empty bucket according to the probing algorithm (linear
    ///     or quadratic), if the entry is not in the hash table or 'ht_ctx' is nullptr.
    /// (c) Iterator::BUCKET_NOT_FOUND if the probe was not successful, i.e. the maximum
    ///     distance was traveled without finding either an empty or a matching bucket.
    /// Using the returned index value, the caller can create an iterator that can be
    /// iterated until End() to find all the matching rows.
    ///
    /// EvalAndHashBuild() or EvalAndHashProbe() must have been called before calling
    /// this function. The values of the expression values cache in 'ht_ctx' will be
    /// used to probe the hash table.
    ///
    /// 'FORCE_NULL_EQUALITY' is true if NULLs should always be considered equal when
    /// comparing two rows.
    ///
    /// 'hash' is the hash computed by EvalAndHashBuild() or EvalAndHashProbe().
    /// 'found' indicates that a bucket that contains an equal row is found.
    ///
    /// There are wrappers of this function that perform the Find and Insert logic.
    template <bool FORCE_NULL_EQUALITY>
    int64_t Probe(Bucket* buckets, int64_t num_buckets, PartitionedHashTableCtx* ht_ctx,
                  uint32_t hash, bool* found);

    /// Performs the insert logic. Returns the HtData* of the bucket or duplicate node
    /// where the data should be inserted. Returns nullptr if the insert was not successful
    /// and either sets 'status' to OK if it failed because not enough reservation was
    /// available or the error if an error was encountered.
    HtData* InsertInternal(PartitionedHashTableCtx* ht_ctx, Status* status);

    /// Updates 'bucket_idx' to the index of the next non-empty bucket. If the bucket has
    /// duplicates, 'node' will be pointing to the head of the linked list of duplicates.
    /// Otherwise, 'node' should not be used. If there are no more buckets, sets
    /// 'bucket_idx' to BUCKET_NOT_FOUND.
    void NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node);

    /// Resize the hash table to 'num_buckets'. 'got_memory' is false on OOM.
    Status ResizeBuckets(int64_t num_buckets, const PartitionedHashTableCtx* ht_ctx,
                         bool* got_memory);

    /// Appends the DuplicateNode pointed by next_node_ to 'bucket' and moves the next_node_
    /// pointer to the next DuplicateNode in the page, updating the remaining node counter.
    DuplicateNode* AppendNextNode(Bucket* bucket);

    /// Creates a new DuplicateNode for an entry and chains it to the bucket with index
    /// 'bucket_idx'. The duplicate nodes of a bucket are chained as a linked list.
    /// This places the new duplicate node at the beginning of the list. If this is the
    /// first duplicate entry inserted in this bucket, then the entry already contained by
    /// the bucket is converted to a DuplicateNode. That is, the contents of 'data' of the
    /// bucket are copied to a DuplicateNode and 'data' is updated to point to a
    /// DuplicateNode.
    /// Returns nullptr and sets 'status' to OK if the node array could not grow, i.e. there
    /// was not enough memory to allocate a new DuplicateNode. Returns nullptr and sets
    /// 'status' to an error if another error was encountered.
    DuplicateNode* InsertDuplicateNode(int64_t bucket_idx, Status* status);

    /// Resets the contents of the empty bucket with index 'bucket_idx', in preparation for
    /// an insert. Sets all the fields of the bucket other than 'data'.
    void PrepareBucketForInsert(int64_t bucket_idx, uint32_t hash);

    /// Return the TupleRow pointed by 'htdata'.
    TupleRow* GetRow(HtData& htdata, TupleRow* row) const;

    /// Returns the TupleRow of the pointed 'bucket'. In case of duplicates, it
    /// returns the content of the first chained duplicate node of the bucket.
    TupleRow* GetRow(Bucket* bucket, TupleRow* row) const;

    /// Grow the node array. Returns true and sets 'status' to OK on success. Returns false
    /// and sets 'status' to OK if we can't get sufficient reservation to allocate the next
    /// data page. Returns false and sets 'status' to an error if another error is
    /// encountered.
    bool GrowNodeArray(Status* status);

    /// Functions to be replaced by codegen to specialize the hash table.
    bool stores_tuples() const {
        /// True if the HtData uses the Tuple* representation, false if it uses FlatRowPtr.
        return stores_tuples_;
    }
    bool stores_duplicates() const {
        /// True if duplicates may be inserted into the hash table.
        return stores_duplicates_;
    }
    bool quadratic_probing() const {
        /// True when quadratic probing is enabled (as opposed to linear).
        return quadratic_probing_;
    }

    /// Load factor that will trigger growing the hash table on insert.  This is
    /// defined as the number of non-empty buckets / total_buckets
    static constexpr double MAX_FILL_FACTOR = 0.75;

    /// The size in bytes of each page of duplicate nodes. Should be large enough to fit
    /// enough DuplicateNodes to amortise the overhead of allocating each page and low
    /// enough to not waste excessive memory to internal fragmentation.
    static constexpr int64_t DATA_PAGE_SIZE = 64L * 1024;

    RuntimeState* state_;

    /// Suballocator to allocate data pages and hash table buckets with.
    Suballocator* allocator_;

    /// Stream contains the rows referenced by the hash table. Can be nullptr if the
    /// row only contains a single tuple, in which case the TupleRow indirection
    /// is removed by the hash table.
    BufferedTupleStream3* tuple_stream_;

    /// Constants on how the hash table should behave.

    /// True if the HtData uses the Tuple* representation, or false if it uses FlatRowPtr.
    const bool stores_tuples_;

    /// True if duplicates may be inserted into hash table.
    const bool stores_duplicates_;

    /// Quadratic probing enabled (as opposed to linear).
    const bool quadratic_probing_;

    /// Data pages for all nodes. Allocated from suballocator to reduce memory
    /// consumption of small tables.
    std::vector<std::unique_ptr<Suballocation>> data_pages_;

    /// Byte size of all buffers in data_pages_.
    int64_t total_data_page_size_;

    /// Next duplicate node to insert. Valid when node_remaining_current_page_ > 0.
    DuplicateNode* next_node_;

    /// Number of nodes left in the current page.
    int node_remaining_current_page_;

    /// Number of duplicate nodes.
    int64_t num_duplicate_nodes_;

    const int64_t max_num_buckets_;

    /// Allocation containing all buckets.
    std::unique_ptr<Suballocation> bucket_allocation_;

    /// Pointer to the 'buckets_' array from 'bucket_allocation_'.
    Bucket* buckets_;

    /// Total number of buckets (filled and empty).
    int64_t num_buckets_;

    /// Number of non-empty buckets.  Used to determine when to resize.
    int64_t num_filled_buckets_;

    /// Number of (non-empty) buckets with duplicates. These buckets do not point to slots
    /// in the tuple stream, but rather to a linked list of DuplicateNodes.
    int64_t num_buckets_with_duplicates_;

    /// Number of build tuples, used for constructing temp row* for probes.
    const int num_build_tuples_;

    /// Flag used to check that we don't lose stored matches when spilling hash tables
    /// (IMPALA-1488).
    bool has_matches_;

    /// The stats below can be used for debugging perf.
    /// TODO: Should we make these statistics atomic?
    /// Number of FindProbeRow(), Insert(), or FindBuildRowBucket() calls that probe the
    /// hash table.
    int64_t num_probes_;

    /// Number of probes that failed and had to fall back to linear probing without cap.
    int64_t num_failed_probes_;

    /// Total distance traveled for each probe. That is the sum of the diff between the end
    /// position of a probe (find/insert) and its start position
    /// (hash & (num_buckets_ - 1)).
    int64_t travel_length_;

    /// The number of cases where we had to compare buckets with the same hash value, but
    /// the row equality failed.
    int64_t num_hash_collisions_;

    /// How many times this table has resized so far.
    int64_t num_resizes_;
};

} // namespace doris