// File: doris/be/src/exec/partitioned_hash_table.h
// Modifications copyright (C) 2017, Baidu.com, Inc.
// Copyright 2017 The Apache Software Foundation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef BDG_PALO_BE_SRC_EXEC_PARTITIONED_HASH_TABLE_H
#define BDG_PALO_BE_SRC_EXEC_PARTITIONED_HASH_TABLE_H
#include <vector>
#include <boost/cstdint.hpp>
#include <boost/scoped_ptr.hpp>
#include "codegen/palo_ir.h"
#include "util/logging.h"
#include "runtime/buffered_block_mgr2.h"
#include "runtime/buffered_tuple_stream2.h"
#include "runtime/buffered_tuple_stream2.inline.h"
#include "runtime/mem_tracker.h"
#include "runtime/mem_tracker.h"
#include "runtime/tuple_row.h"
#include "util/hash_util.hpp"
#include "util/bit_util.h"
namespace llvm {
class Function;
}
namespace palo {
class Expr;
class ExprContext;
class LlvmCodeGen;
class MemTracker;
class MemTracker;
class RowDescriptor;
class RuntimeState;
class Tuple;
class TupleRow;
class PartitionedHashTable;
// Linear or quadratic probing hash table implementation tailored to the usage pattern
// for partitioned hash aggregation and hash joins. The hash table stores TupleRows and
// allows for different exprs for insertions and finds. This is the pattern we use for
// joins and aggregation where the input/build tuple row descriptor is different from the
// find/probe descriptor. The implementation is designed to allow codegen for some paths.
//
// In addition to the hash table there is also an accompanying hash table context that
// is used for insertions and probes. For example, the hash table context stores
// evaluated expr results for the current row being processed when possible into a
// contiguous memory buffer. This allows for efficient hash computation.
//
// The hash table does not support removes. The hash table is not thread safe.
// The table is optimized for the partition hash aggregation and hash joins and is not
// intended to be a generic hash table implementation. The API loosely mimics the
// std::hashset API.
//
// The data (rows) are stored in a BufferedTupleStream2. The basic data structure of this
// hash table is a vector of buckets. The buckets (indexed by the mod of the hash)
// contain a pointer to either the slot in the tuple-stream or in case of duplicate
// values, to the head of a linked list of nodes that in turn contain a pointer to
// tuple-stream slots. When inserting an entry we start at the bucket at position
// (hash % size) and search for either a bucket with the same hash or for an empty
// bucket. If a bucket with the same hash is found, we then compare for row equality and
// either insert a duplicate node if the equality is true, or continue the search if the
// row equality is false. Similarly, when probing we start from the bucket at position
// (hash % size) and search for an entry with the same hash or for an empty bucket.
// In the former case, we then check for row equality and continue the search if the row
// equality is false. In the latter case, the probe is not successful. When growing the
// hash table, the number of buckets is doubled. We trigger a resize when the fill
// factor is approx 75%. Due to the doubling nature of the buckets, we require that the
// number of buckets is a power of 2. This allows us to perform a modulo of the hash
// using a bitmask.
//
// We choose to use linear or quadratic probing because they exhibit good (predictable)
// cache behavior.
//
// The first NUM_SMALL_BLOCKS of _nodes are made of blocks less than the IO size (of 8MB)
// to reduce the memory footprint of small queries.
//
// TODO: Compare linear and quadratic probing and remove the loser.
// TODO: We currently use 32-bit hashes. There is room in the bucket structure for at
// least 48-bits. We should exploit this space.
// TODO: Consider capping the probes with a threshold value. If an insert reaches
// that threshold it is inserted to another linked list of overflow entries.
// TODO: Smarter resizes, and perhaps avoid using powers of 2 as the hash table size.
// TODO: this is not a fancy hash table in terms of memory access patterns
// (cuckoo-hashing or something that spills to disk). We will likely want to invest
// more time into this.
// TODO: hash-join and aggregation have very different access patterns. Joins insert
// all the rows and then calls scan to find them. Aggregation interleaves find() and
// Inserts(). We may want to optimize joins more heavily for Inserts() (in particular
// growing).
// TODO: Batched interface for inserts and finds.
// TODO: Do we need to check mem limit exceeded so often. Check once per batch?
// Control block for a hash table. This class contains the logic as well as the variables
// needed by a thread to operate on a hash table.
class PartitionedHashTableCtx {
public:
    // Create a hash table context.
    // - build_expr_ctxs: the exprs that should be used to evaluate rows during insert().
    // - probe_expr_ctxs: the exprs used during find().
    // - stores_nulls: if false, TupleRows with nulls are ignored during insert.
    // - finds_nulls: if false, find() returns End() for TupleRows with nulls,
    //   even if stores_nulls is true.
    // - initial_seed: initial seed value to use when computing hashes for rows at
    //   level 0. Other levels have their seeds derived from this seed.
    // - max_levels: the max number of levels we will hash with (one seed per level).
    // - num_build_tuples: number of tuples in a build row; presumably used to size
    //   the scratch row '_row' — confirm against the .cpp.
    PartitionedHashTableCtx(const std::vector<ExprContext*>& build_expr_ctxs,
            const std::vector<ExprContext*>& probe_expr_ctxs, bool stores_nulls,
            bool finds_nulls, int32_t initial_seed, int max_levels,
            int num_build_tuples);

    // Call to cleanup any resources.
    void close();

    // Sets the current hashing level. Each level hashes with a different seed.
    void set_level(int level);

    // Returns the current hashing level.
    int level() const { return _level; }

    // Returns the seed used at hash level 'level'. Bounds-checked (throws via at()
    // if 'level' is out of range).
    uint32_t seed(int level) { return _seeds.at(level); }

    // Returns the scratch row buffer used to materialize rows on the fly.
    TupleRow* row() const { return _row; }

    // Returns the result of the expr at 'expr_idx' evaluated over the last row
    // processed (a pointer into '_expr_values_buffer').
    // This value is invalid if the expr evaluated to NULL.
    // TODO: this is an awkward abstraction but aggregation node can take advantage of
    // it and save some expr evaluation calls.
    void* last_expr_value(int expr_idx) const {
        return _expr_values_buffer + _expr_values_buffer_offsets[expr_idx];
    }

    // Returns whether the expr at 'expr_idx' evaluated to NULL for the last row.
    bool last_expr_value_null(int expr_idx) const {
        return _expr_value_null_bits[expr_idx];
    }

    // Evaluate and hash the build/probe row, returning the hash in *hash. Returns
    // false if this row should be rejected (doesn't need to be processed further)
    // because it contains NULL.
    // These need to be inlined in the IR module so we can find and replace the calls
    // to EvalBuildRow()/EvalProbeRow().
    bool IR_ALWAYS_INLINE eval_and_hash_build(TupleRow* row, uint32_t* hash);
    bool IR_ALWAYS_INLINE eval_and_hash_probe(TupleRow* row, uint32_t* hash);

    // Returns the byte size of '_expr_values_buffer'.
    int results_buffer_size() const { return _results_buffer_size; }

    // Codegen for evaluating a tuple row. Codegen'd function matches the signature
    // for EvalBuildRow and EvalTupleRow.
    // If build_row is true, the codegen uses the build_exprs, otherwise the probe_exprs.
    llvm::Function* codegen_eval_row(RuntimeState* state, bool build_row);

    // Codegen for evaluating a TupleRow and comparing equality against
    // '_expr_values_buffer'. Function signature matches PartitionedHashTable::equals().
    llvm::Function* codegen_equals(RuntimeState* state);

    // Codegen for hashing the expr values in '_expr_values_buffer'. Function prototype
    // matches HashCurrentRow identically. Unlike HashCurrentRow(), the returned function
    // only uses a single hash function, rather than switching based on _level.
    // If 'use_murmur' is true, murmur hash is used, otherwise CRC is used if the
    // hardware supports it (see hash-util.h).
    llvm::Function* codegen_hash_current_row(RuntimeState* state, bool use_murmur);

    // Class name used to look this type up in the cross-compiled IR module.
    static const char* _s_llvm_class_name;

private:
    friend class PartitionedHashTable;
    friend class PartitionedHashTableTest_HashEmpty_Test;

    // Compute the hash of the values in _expr_values_buffer.
    // This will be replaced by codegen. We don't want this inlined for replacing
    // with codegen'd functions so the function name does not change.
    uint32_t IR_NO_INLINE HashCurrentRow() {
        DCHECK_LT(_level, _seeds.size());
        if (_var_result_begin == -1) {
            // Fixed-length-only fast path: hash the whole buffer in one shot.
            // This handles NULLs implicitly since a constant seed value was put
            // into results buffer for nulls.
            // TODO: figure out which hash function to use. We need to generate
            // uncorrelated hashes by changing just the seed. CRC does not have this
            // property and FNV is okay. We should switch to something else.
            return hash_help(_expr_values_buffer, _results_buffer_size, _seeds[_level]);
        } else {
            // Buffer contains variable-length values (e.g. strings); hash piecewise.
            return PartitionedHashTableCtx::hash_variable_len_row();
        }
    }

    // Wrapper function for calling correct HashUtil function in non-codegen'd case.
    uint32_t inline hash_help(const void* input, int len, int32_t hash) {
        // Use CRC hash at first level for better performance. Switch to murmur hash at
        // subsequent levels since CRC doesn't randomize well with different seed inputs.
        if (_level == 0) {
            return HashUtil::hash(input, len, hash);
        }
        return HashUtil::murmur_hash2_64(input, len, hash);
    }

    // Evaluate 'row' over build exprs, caching the results in '_expr_values_buffer'.
    // This will be replaced by codegen. We do not want this function inlined when
    // cross compiled because we need to be able to differentiate between EvalBuildRow
    // and EvalProbeRow by name and the build/probe exprs are baked into the codegen'd
    // function.
    bool IR_NO_INLINE EvalBuildRow(TupleRow* row) {
        return eval_row(row, _build_expr_ctxs);
    }

    // Evaluate 'row' over probe exprs, caching the results in '_expr_values_buffer'.
    // This will be replaced by codegen.
    bool IR_NO_INLINE EvalProbeRow(TupleRow* row) {
        return eval_row(row, _probe_expr_ctxs);
    }

    // Compute the hash of the values in _expr_values_buffer for rows with variable
    // length fields (e.g. strings).
    uint32_t hash_variable_len_row();

    // Evaluate the exprs over 'row' and cache the results in '_expr_values_buffer'.
    // Returns whether any expr evaluated to NULL.
    // This will be replaced by codegen.
    bool eval_row(TupleRow* row, const std::vector<ExprContext*>& ctxs);

    // Returns true if the values of build_exprs evaluated over 'build_row' equal
    // the values cached in _expr_values_buffer.
    // This will be replaced by codegen.
    bool IR_NO_INLINE equals(TupleRow* build_row);

    // Exprs evaluated over inserted rows / probed rows, respectively.
    const std::vector<ExprContext*>& _build_expr_ctxs;
    const std::vector<ExprContext*>& _probe_expr_ctxs;

    // Constants on how the hash table should behave. Joins and aggs have slightly
    // different behavior.
    // TODO: these constants are an ideal candidate to be removed with codegen.
    // TODO: ..or with template-ization
    const bool _stores_nulls;
    const bool _finds_nulls;

    // The current level this context is working on. Each level needs to use a
    // different seed.
    int _level;

    // The seeds to use for hashing. Indexed by the level.
    std::vector<uint32_t> _seeds;

    // Byte offset into '_expr_values_buffer' of each expr's cached result,
    // indexed by expr position.
    std::vector<int> _expr_values_buffer_offsets;

    // Byte offset into '_expr_values_buffer' that begins the variable length results.
    // If -1, there are no variable length slots. Never changes once set, can be
    // removed with codegen.
    int _var_result_begin;

    // Byte size of '_expr_values_buffer'. Never changes once set, can be removed with
    // codegen.
    int _results_buffer_size;

    // Cache of expr values for the current row being evaluated. This can either
    // be a build row (during insert()) or probe row (during find()). Buffer to store
    // evaluated expr results. This address must not change once allocated since the
    // address is baked into the codegen.
    uint8_t* _expr_values_buffer;

    // Per-expr NULL flags for the current row. Use bytes instead of bools to be
    // compatible with llvm. This address must not change once allocated.
    uint8_t* _expr_value_null_bits;

    // Scratch buffer to generate rows on the fly.
    TupleRow* _row;

    // Cross-compiled function to access member variables used in
    // codegen_hash_current_row().
    uint32_t get_hash_seed() const;
};
// The hash table consists of a contiguous array of buckets that contain a pointer to the
// data, the hash value and three flags: whether this bucket is filled, whether this
// entry has been matched (used in right and full joins) and whether this entry has
// duplicates. If there are duplicates, then the data is pointing to the head of a
// linked list of duplicate nodes that point to the actual data. Note that the duplicate
// nodes do not contain the hash value, because all the linked nodes have the same hash
// value, the one in the bucket. The data is either a tuple stream index or a Tuple*.
// This array of buckets is sparse, we are shooting for up to 3/4 fill factor (75%). The
// data allocated by the hash table comes from the BufferedBlockMgr2.
class PartitionedHashTable {
private:
    // Either the row in the tuple stream or a pointer to the single tuple of this row.
    union HtData {
        BufferedTupleStream2::RowIdx idx;
        Tuple* tuple;
    };

    // Linked list of entries used for duplicates.
    struct DuplicateNode {
        // Used for full outer and right {outer, anti, semi} joins. Indicates whether
        // the row in the DuplicateNode has been matched.
        // From an abstraction point of view, this is an awkward place to store this
        // information.
        // TODO: Fold this flag in the next pointer below.
        bool matched;
        // Chain to next duplicate node, NULL when end of list.
        DuplicateNode* next;
        // The actual data (stream index or tuple pointer).
        HtData htdata;
    };

    // One slot of the open-addressed bucket array.
    struct Bucket {
        // Whether this bucket contains a valid entry, or it is empty.
        bool filled;
        // Used for full outer and right {outer, anti, semi} joins. Indicates whether
        // the row in the bucket has been matched.
        // From an abstraction point of view, this is an awkward place to store this
        // information but it is efficient. This space is otherwise unused.
        bool matched;
        // Used in case of duplicates. If true, then the bucketData union should be
        // used as 'duplicates'.
        bool hasDuplicates;
        // Cache of the hash for data.
        // TODO: Do we even have to cache the hash value?
        uint32_t hash;
        // Either the data for this bucket or the linked list of duplicates.
        union {
            HtData htdata;
            DuplicateNode* duplicates;
        } bucketData;
    };

public:
    class Iterator;

    // Returns a newly allocated PartitionedHashTable. The probing algorithm
    // (linear vs. quadratic) is set by the enable_quadratic_probing flag.
    // - client: block mgr client to allocate data pages from.
    // - num_build_tuples: number of Tuples in the build tuple row.
    // - tuple_stream: the tuple stream which contains the tuple rows indexed by the
    //   hash table. Can be NULL if the rows contain only a single tuple, in which
    //   case the 'tuple_stream' is unused.
    // - max_num_buckets: the maximum number of buckets that can be stored. If we
    //   try to grow the number of buckets to a larger number, the inserts will fail.
    //   -1, if it unlimited.
    // - initial_num_buckets: number of buckets that the hash table should be
    //   initialized with.
    static PartitionedHashTable* create(RuntimeState* state, BufferedBlockMgr2::Client* client,
            int num_build_tuples, BufferedTupleStream2* tuple_stream, int64_t max_num_buckets,
            int64_t initial_num_buckets);

    // Allocates the initial bucket structure. Returns false if OOM.
    bool init();

    // Call to cleanup any resources. Must be called once.
    void close();

    // Inserts the row to the hash table. Returns true if the insertion was successful.
    // Always returns true if the table has free buckets and the key is not a duplicate.
    // The caller is responsible for ensuring that the table has free buckets.
    // 'idx' is the index into _tuple_stream for this row. If the row contains more than
    // one tuple, the 'idx' is stored instead of the 'row'. The 'row' is not copied by
    // the hash table and the caller must guarantee it stays in memory. This will not
    // grow the hash table. In the case that there is a need to insert a duplicate node,
    // instead of filling a new bucket, and there is not enough memory to insert a
    // duplicate node, the insert fails and this function returns false.
    // Used during the build phase of hash joins.
    bool IR_ALWAYS_INLINE insert(PartitionedHashTableCtx* ht_ctx,
            const BufferedTupleStream2::RowIdx& idx, TupleRow* row, uint32_t hash);

    // Same as insert() but for inserting a single Tuple. The 'tuple' is not copied by
    // the hash table and the caller must guarantee it stays in memory.
    bool IR_ALWAYS_INLINE insert(PartitionedHashTableCtx* ht_ctx, Tuple* tuple, uint32_t hash);

    // Returns an iterator to the bucket matching the last row evaluated in 'ht_ctx'.
    // Returns PartitionedHashTable::End() if no match is found. The iterator can be
    // iterated until PartitionedHashTable::End() to find all the matching rows.
    // Advancing the returned iterator will go to the next matching row. The matching
    // rows do not need to be evaluated since all the nodes of a bucket are duplicates.
    // One scan can be in progress for each 'ht_ctx'.
    // Used during the probe phase of hash joins.
    Iterator IR_ALWAYS_INLINE find(PartitionedHashTableCtx* ht_ctx, uint32_t hash);

    // If a match is found in the table, return an iterator as in find(). If a match
    // was not present, return an iterator pointing to the empty bucket where the key
    // should be inserted. Returns End() if the table is full. The caller can set the
    // data in the bucket using a Set*() method on the iterator.
    Iterator IR_ALWAYS_INLINE find_bucket(PartitionedHashTableCtx* ht_ctx, uint32_t hash,
            bool* found);

    // Returns number of elements inserted in the hash table.
    // (Buckets with duplicates count once per duplicate node, not once per bucket.)
    int64_t size() const {
        return _num_filled_buckets - _num_buckets_with_duplicates + _num_duplicate_nodes;
    }

    // Returns the number of empty buckets.
    int64_t empty_buckets() const { return _num_buckets - _num_filled_buckets; }

    // Returns the number of buckets.
    int64_t num_buckets() const { return _num_buckets; }

    // Returns the load factor (the fraction of non-empty buckets).
    double load_factor() const {
        return static_cast<double>(_num_filled_buckets) / _num_buckets;
    }

    // Returns an estimate of the number of buckets needed for 'num_rows', rounded up
    // to a power of two, assuming that there are no duplicates.
    static int64_t EstimateNumBuckets(int64_t num_rows) {
        // Assume max 66% fill factor and no duplicates: 3/2 * num_rows buckets.
        return BitUtil::next_power_of_two(3 * num_rows / 2);
    }

    // Returns an estimate of the number of bytes needed to build the hash table
    // structure (bucket array only) for 'num_rows'.
    static int64_t EstimateSize(int64_t num_rows) {
        int64_t num_buckets = EstimateNumBuckets(num_rows);
        return num_buckets * sizeof(Bucket);
    }

    // Returns the memory occupied by the hash table, takes into account the number of
    // duplicates.
    int64_t current_mem_size() const;

    // Calculates the fill factor if 'buckets_to_fill' additional buckets were to be
    // filled and resizes the hash table so that the projected fill factor is below
    // the max fill factor.
    // If it returns true, then it is guaranteed at least 'buckets_to_fill' more
    // buckets can be filled without the need to resize.
    bool check_and_resize(uint64_t buckets_to_fill, PartitionedHashTableCtx* ht_ctx);

    // Returns the number of bytes allocated to the hash table data pages.
    int64_t byte_size() const { return _total_data_page_size; }

    // Returns an iterator at the beginning of the hash table. Advancing this iterator
    // will traverse all elements.
    Iterator begin(PartitionedHashTableCtx* ht_ctx);

    // Return an iterator pointing to the first element (Bucket or DuplicateNode, if
    // the bucket has duplicates) in the hash table that does not have its matched
    // flag set. Used in right joins and full-outer joins.
    Iterator first_unmatched(PartitionedHashTableCtx* ctx);

    // Return true if there was at least one match.
    bool HasMatches() const { return _has_matches; }

    // Return end marker.
    Iterator End() { return Iterator(); }

    // Dump out the entire hash table to string. If 'skip_empty', empty buckets are
    // skipped. If 'show_match', it also prints the matched flag of each node. If
    // 'build_desc' is non-null, the build rows will be printed. Otherwise, only
    // the addresses of the build rows will be printed.
    std::string debug_string(bool skip_empty, bool show_match,
            const RowDescriptor* build_desc);

    // Print the content of a bucket or node.
    void debug_string_tuple(std::stringstream& ss, HtData& htdata, const RowDescriptor* desc);

    // Update and print some statistics that can be used for performance debugging.
    std::string print_stats() const;

    // stl-like iterator interface.
    class Iterator {
    private:
        // Bucket index value when probe is not successful.
        static const int64_t BUCKET_NOT_FOUND = -1;

    public:
        // Default-constructed iterator is the end marker (at_end() is true).
        Iterator() : _table(NULL), _row(NULL), _bucket_idx(BUCKET_NOT_FOUND), _node(NULL) { }

        // Iterates to the next element. It should be called only if !at_end().
        void IR_ALWAYS_INLINE next();

        // Iterates to the next duplicate node. If the bucket does not have duplicates
        // or when it reaches the last duplicate node, then it moves the Iterator to
        // at_end(). Used when we want to iterate over all the duplicate nodes
        // bypassing the next() interface (e.g. in semi/outer joins without
        // other_join_conjuncts, in order to iterate over all nodes of an unmatched
        // bucket).
        void IR_ALWAYS_INLINE next_duplicate();

        // Iterates to the next element that does not have its matched flag set. Used
        // in right-outer and full-outer joins.
        void next_unmatched();

        // Return the current row or tuple. Callers must check the iterator is not
        // at_end() before calling them. The returned row is owned by the iterator and
        // valid until the next call to get_row(). It is safe to advance the iterator.
        TupleRow* get_row() const;
        Tuple* get_tuple() const;

        // Set the current tuple for an empty bucket. Designed to be used with the
        // iterator returned from find_bucket() in the case when the value is not
        // found. It is not valid to call this function if the bucket already has an
        // entry.
        void set_tuple(Tuple* tuple, uint32_t hash);

        // Sets as matched the Bucket or DuplicateNode currently pointed by the
        // iterator, depending on whether the bucket has duplicates or not. The
        // iterator cannot be at_end().
        void set_matched();

        // Returns the 'matched' flag of the current Bucket or DuplicateNode,
        // depending on whether the bucket has duplicates or not. It should be called
        // only if !at_end().
        bool is_matched() const;

        // Resets everything but the pointer to the hash table.
        void set_at_end();

        // Returns true if this iterator is at the end, i.e. get_row() cannot be
        // called.
        bool at_end() const { return _bucket_idx == BUCKET_NOT_FOUND; }

    private:
        friend class PartitionedHashTable;

        Iterator(PartitionedHashTable* table, TupleRow* row, int bucket_idx, DuplicateNode* node)
                : _table(table),
                  _row(row),
                  _bucket_idx(bucket_idx),
                  _node(node) {
        }

        // The hash table this iterator traverses. Not owned.
        PartitionedHashTable* _table;
        // Scratch row used by get_row(). Not owned.
        TupleRow* _row;
        // Current bucket idx.
        // TODO: Use uint32_t?
        int64_t _bucket_idx;
        // Pointer to the current duplicate node.
        DuplicateNode* _node;
    };

private:
    friend class Iterator;
    friend class PartitionedHashTableTest;

    // Hash table constructor. Private because Create() should be used, instead
    // of calling this constructor directly.
    // - quadratic_probing: set to true when the probing algorithm is quadratic, as
    //   opposed to linear.
    PartitionedHashTable(bool quadratic_probing, RuntimeState* state, BufferedBlockMgr2::Client* client,
            int num_build_tuples, BufferedTupleStream2* tuple_stream,
            int64_t max_num_buckets, int64_t initial_num_buckets);

    // Performs the probing operation according to the probing algorithm (linear or
    // quadratic). Returns one of the following:
    // (a) the index of the bucket that contains the entry that matches with the last
    //     row evaluated in 'ht_ctx'. If 'ht_ctx' is NULL then it does not check for
    //     row equality and returns the index of the first empty bucket.
    // (b) the index of the first empty bucket according to the probing algorithm
    //     (linear or quadratic), if the entry is not in the hash table or 'ht_ctx' is
    //     NULL.
    // (c) Iterator::BUCKET_NOT_FOUND if the probe was not successful, i.e. the
    //     maximum distance was traveled without finding either an empty or a matching
    //     bucket.
    // Using the returned index value, the caller can create an iterator that can be
    // iterated until End() to find all the matching rows.
    // EvalAndHashBuild() or EvalAndHashProbe() must have been called before calling
    // this. 'hash' must be the hash returned by these functions.
    // 'found' indicates that a bucket that contains an equal row is found.
    //
    // There are wrappers of this function that perform the find and insert logic.
    int64_t IR_ALWAYS_INLINE probe(Bucket* buckets, int64_t num_buckets,
            PartitionedHashTableCtx* ht_ctx, uint32_t hash, bool* found);

    // Performs the insert logic. Returns the HtData* of the bucket or duplicate node
    // where the data should be inserted. Returns NULL if the insert was not
    // successful.
    HtData* IR_ALWAYS_INLINE insert_internal(PartitionedHashTableCtx* ht_ctx, uint32_t hash);

    // Updates 'bucket_idx' to the index of the next non-empty bucket. If the bucket
    // has duplicates, 'node' will be pointing to the head of the linked list of
    // duplicates. Otherwise, 'node' should not be used. If there are no more buckets,
    // sets 'bucket_idx' to BUCKET_NOT_FOUND.
    void next_filled_bucket(int64_t* bucket_idx, DuplicateNode** node);

    // Resize the hash table to 'num_buckets'. Returns false on OOM.
    bool resize_buckets(int64_t num_buckets, PartitionedHashTableCtx* ht_ctx);

    // Appends the DuplicateNode pointed by _next_node to 'bucket' and moves the
    // _next_node pointer to the next DuplicateNode in the page, updating the
    // remaining node counter.
    DuplicateNode* IR_ALWAYS_INLINE append_next_node(Bucket* bucket);

    // Creates a new DuplicateNode for an entry and chains it to the bucket with index
    // 'bucket_idx'. The duplicate nodes of a bucket are chained as a linked list.
    // This places the new duplicate node at the beginning of the list. If this is the
    // first duplicate entry inserted in this bucket, then the entry already contained
    // by the bucket is converted to a DuplicateNode. That is, the contents of 'data'
    // of the bucket are copied to a DuplicateNode and 'data' is updated to pointing
    // to a DuplicateNode.
    // Returns NULL if the node array could not grow, i.e. there was not enough memory
    // to allocate a new DuplicateNode.
    DuplicateNode* IR_ALWAYS_INLINE insert_duplicate_node(int64_t bucket_idx);

    // Resets the contents of the empty bucket with index 'bucket_idx', in preparation
    // for an insert. Sets all the fields of the bucket other than 'data'.
    void IR_ALWAYS_INLINE prepare_bucket_for_insert(int64_t bucket_idx, uint32_t hash);

    // Return the TupleRow pointed by 'htdata'.
    TupleRow* get_row(HtData& htdata, TupleRow* row) const;

    // Returns the TupleRow of the pointed 'bucket'. In case of duplicates, it
    // returns the content of the first chained duplicate node of the bucket.
    TupleRow* get_row(Bucket* bucket, TupleRow* row) const;

    // Grow the node array. Returns false on OOM.
    bool grow_node_array();

    // Load factor that will trigger growing the hash table on insert. This is
    // defined as the number of non-empty buckets / total_buckets.
    static const double MAX_FILL_FACTOR;

    RuntimeState* _state;

    // Client to allocate data pages with.
    BufferedBlockMgr2::Client* _block_mgr_client;

    // Stream contains the rows referenced by the hash table. Can be NULL if the
    // row only contains a single tuple, in which case the TupleRow indirection
    // is removed by the hash table.
    BufferedTupleStream2* _tuple_stream;

    // Constants on how the hash table should behave. Joins and aggs have slightly
    // different behavior.
    // TODO: these constants are an ideal candidate to be removed with codegen.
    // TODO: ..or with template-ization
    const bool _stores_tuples;

    // Quadratic probing enabled (as opposed to linear).
    const bool _quadratic_probing;

    // Data pages for all nodes. These are always pinned.
    std::vector<BufferedBlockMgr2::Block*> _data_pages;

    // Byte size of all buffers in _data_pages.
    int64_t _total_data_page_size;

    // Next duplicate node to insert. Valid when _node_remaining_current_page > 0.
    DuplicateNode* _next_node;

    // Number of nodes left in the current page.
    int _node_remaining_current_page;

    // Number of duplicate nodes.
    int64_t _num_duplicate_nodes;

    // Maximum number of buckets (-1 means unlimited); inserts fail rather than grow
    // past this.
    const int64_t _max_num_buckets;

    // Array of all buckets. Owned by this node. Using c-style array to control
    // memory footprint.
    Bucket* _buckets;

    // Total number of buckets (filled and empty).
    int64_t _num_buckets;

    // Number of non-empty buckets. Used to determine when to resize.
    int64_t _num_filled_buckets;

    // Number of (non-empty) buckets with duplicates. These buckets do not point to
    // slots in the tuple stream, but rather to a linked list of duplicate nodes.
    int64_t _num_buckets_with_duplicates;

    // Number of build tuples, used for constructing temp row* for probes.
    // TODO: We should remove it.
    const int _num_build_tuples;

    // Flag used to disable spilling hash tables that already had matches in case of
    // right joins (IMPALA-1488).
    // TODO: Not fail when spilling hash tables with matches in right joins
    bool _has_matches;

    // The stats below can be used for debugging perf.
    // TODO: Should we make these statistics atomic?
    // Number of find(), insert(), or find_bucket() calls that probe the hash table.
    int64_t _num_probes;

    // Number of probes that failed and had to fall back to linear probing without
    // cap.
    int64_t _num_failed_probes;

    // Total distance traveled for each probe. That is the sum of the diff between
    // the end position of a probe (find/insert) and its start position
    // (hash & (_num_buckets - 1)).
    int64_t _travel_length;

    // The number of cases where we had to compare buckets with the same hash value,
    // but the row equality failed.
    int64_t _num_hash_collisions;

    // How many times this table has resized so far.
    int64_t _num_resizes;
};
} // end namespace palo
#endif // BDG_PALO_BE_SRC_EXEC_PARTITIONED_HASH_TABLE_H