[minor](hash table) parameterize hash table (#19653)

This commit is contained in:
Gabriel
2023-05-17 09:58:26 +08:00
committed by GitHub
parent 0cae9bb3a1
commit 8fd1eb0d1e
3 changed files with 15 additions and 3 deletions

View File

@ -103,6 +103,9 @@ DEFINE_mInt64(mmap_threshold, "134217728"); // bytes
// Increase can reduce the number of hash table resize, but may waste more memory.
DEFINE_mInt32(hash_table_double_grow_degree, "31");
// Shift used by HashTableGrower::max_fill() once size_degree reaches
// hash_table_double_grow_degree: max fill = 2^size_degree - 2^(size_degree - max_fill_rate),
// so the default of 2 allows 3/4 of the buckets to be filled.
DEFINE_mInt32(max_fill_rate, "2");
// Size degree at or above which the hash table growers' increase_size() raises the
// size degree by 1 per resize instead of by 2.
DEFINE_mInt32(double_resize_threshold, "20");
// The maximum size to which the hash table is expanded before data is inserted.
// When there are few duplicate keys, pre-expansion reduces the number of hash table resizes.
// When there are many duplicate keys, the number of filled buckets is far smaller than the number of buckets built.

View File

@ -141,6 +141,11 @@ DECLARE_mInt64(mmap_threshold); // bytes
// Increase can reduce the number of hash table resize, but may waste more memory.
DECLARE_mInt32(hash_table_double_grow_degree);
// The max fill rate for hash table. Despite the name this is a shift, not a ratio:
// once size_degree reaches hash_table_double_grow_degree, HashTableGrower::max_fill()
// returns 2^size_degree - 2^(size_degree - max_fill_rate).
DECLARE_mInt32(max_fill_rate);
// Size degree at or above which the hash table growers' increase_size() raises the
// size degree by 1 per resize instead of by 2.
DECLARE_mInt32(double_resize_threshold);
// The maximum size to which the hash table is expanded before data is inserted.
// When there are few duplicate keys, pre-expansion reduces the number of hash table resizes.
// When there are many duplicate keys, the number of filled buckets is far smaller than the number of buckets built.

View File

@ -238,6 +238,8 @@ void insert_set_mapped(MappedType* dest, const ValueType& src) {
*dest = src.second;
}
/// Size degree at or above which the growers below enlarge the table by one degree
/// per resize instead of two.
///
/// Bound as a reference to the config variable rather than copied from it. A copy
/// would be taken once, during this translation unit's static initialization, which
/// may run before the configuration value has been loaded and would never observe a
/// later change to this mutable setting. Binding a reference to an object with static
/// storage duration needs no runtime initialization, and every read sees the current
/// config value.
/// NOTE(review): assumes doris::config::double_resize_threshold has the same
/// underlying type as doris::vectorized::Int32 so the reference binds directly; confirm.
static const doris::vectorized::Int32& double_resize_threshold =
        doris::config::double_resize_threshold;
/** Determines the size of the hash table, and when and how much it should be resized.
*/
template <size_t initial_size_degree = 10>
@ -246,6 +248,8 @@ struct HashTableGrower {
doris::vectorized::UInt8 size_degree = initial_size_degree;
doris::vectorized::Int64 double_grow_degree = doris::config::hash_table_double_grow_degree;
doris::vectorized::Int32 max_fill_rate = doris::config::max_fill_rate;
/// The size of the hash table in the cells.
size_t buf_size() const { return 1ULL << size_degree; }
@ -253,7 +257,7 @@ struct HashTableGrower {
size_t max_fill() const {
return size_degree < double_grow_degree
? 1ULL << (size_degree - 1)
: (1ULL << size_degree) - (1ULL << (size_degree - 2));
: (1ULL << size_degree) - (1ULL << (size_degree - max_fill_rate));
}
size_t mask() const { return buf_size() - 1; }
@ -271,7 +275,7 @@ struct HashTableGrower {
bool overflow(size_t elems) const { return elems > max_fill(); }
/// Increase the size of the hash table.
void increase_size() { size_degree += size_degree >= 23 ? 1 : 2; }
void increase_size() { size_degree += size_degree >= double_resize_threshold ? 1 : 2; }
/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
void set(size_t num_elems) {
@ -336,7 +340,7 @@ public:
bool overflow(size_t elems) const { return elems > precalculated_max_fill; }
/// Increase the size of the hash table.
void increase_size() { increase_size_degree(size_degree_ >= 23 ? 1 : 2); }
void increase_size() { increase_size_degree(size_degree_ >= double_resize_threshold ? 1 : 2); }
/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
void set(size_t num_elems) {