doris/be/src/exec/hash_table.cpp

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/exec/hash-table.cc
// and modified by Doris
#include "exec/hash_table.h"
#include "exprs/expr.h"
#include "exprs/expr_context.h"
#include "runtime/memory/mem_tracker.h"
#include "runtime/raw_value.h"
namespace doris {
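
// A chaining hash table: each bucket holds a singly linked list of Nodes, and nodes are
// carved out of large contiguous blocks (see grow_node_array()) instead of being
// allocated one by one. All allocations are reported to a local MemTracker.
// 'num_buckets' must be a power of two so that bucket selection can use hash & (n - 1).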
HashTable::HashTable(const std::vector<ExprContext*>& build_expr_ctxs,
                     const std::vector<ExprContext*>& probe_expr_ctxs, int num_build_tuples,
                     bool stores_nulls, const std::vector<bool>& finds_nulls, int32_t initial_seed,
                     int64_t num_buckets)
        : _build_expr_ctxs(build_expr_ctxs),
          _probe_expr_ctxs(probe_expr_ctxs),
          _num_build_tuples(num_build_tuples),
          _stores_nulls(stores_nulls),
          _finds_nulls(finds_nulls),
          _initial_seed(initial_seed),
          _node_byte_size(sizeof(Node) + sizeof(Tuple*) * _num_build_tuples),
          _num_filled_buckets(0),
          _current_nodes(nullptr),
          _num_nodes(0),
          _current_capacity(num_buckets),
          _current_used(0),
          _total_capacity(num_buckets) {
    DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size());
    DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2";

    _mem_tracker = std::make_unique<MemTracker>("HashTable");
    _buckets.resize(num_buckets);
    _num_buckets = num_buckets;
    _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets;
    _mem_tracker->consume(_buckets.capacity() * sizeof(Bucket));

    // Compute the layout and buffer size to store the evaluated expr results
    _results_buffer_size = Expr::compute_results_layout(_build_expr_ctxs,
                                                        &_expr_values_buffer_offsets,
                                                        &_var_result_begin);
    _expr_values_buffer = new uint8_t[_results_buffer_size];
    memset(_expr_values_buffer, 0, sizeof(uint8_t) * _results_buffer_size);
    _expr_value_null_bits = new uint8_t[_build_expr_ctxs.size()];

    _alloc_list.reserve(10);
    _end_list.reserve(10);
    _current_nodes = reinterpret_cast<uint8_t*>(malloc(_current_capacity * _node_byte_size));
    // TODO: remove memset later
    memset(_current_nodes, 0, _current_capacity * _node_byte_size);
    _alloc_list.push_back(_current_nodes);
    _end_list.push_back(_current_nodes + _current_capacity * _node_byte_size);
    _mem_tracker->consume(_current_capacity * _node_byte_size);
}
HashTable::~HashTable() {}
void HashTable::close() {
    // TODO: use tr1::array?
    delete[] _expr_values_buffer;
    delete[] _expr_value_null_bits;
    for (auto ptr : _alloc_list) {
        free(ptr);
    }
    _mem_tracker->release(_total_capacity * _node_byte_size);
    _mem_tracker->release(_buckets.size() * sizeof(Bucket));
}
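
// Evaluates 'ctxs' over 'row' and materializes the results into _expr_values_buffer,
// recording a per-slot null flag in _expr_value_null_bits. Returns true if any evaluated
// value is null; when the table does not store nulls, it returns as soon as one is seen.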
bool HashTable::eval_row(TupleRow* row, const std::vector<ExprContext*>& ctxs) {
    // Put a non-zero constant in the result location for nullptr.
    // We don't want (nullptr, 1) to hash to the same value as (0, 1).
    // The sentinel needs to be as big as the biggest primitive type, since the bytes
    // are copied directly. The array size of 10 int64s is an empirical value that must be
    // larger than sizeof(Decimal) / sizeof(int64), so that when a slot is null the
    // sentinel can be copied for any slot type.
    static int64_t null_value[10] = {HashUtil::FNV_SEED, HashUtil::FNV_SEED, 0};
    bool has_null = false;

    for (int i = 0; i < ctxs.size(); ++i) {
        void* loc = _expr_values_buffer + _expr_values_buffer_offsets[i];
        void* val = ctxs[i]->get_value(row);

        if (val == nullptr) {
            // If the table doesn't store nulls, no reason to keep evaluating
            if (!_stores_nulls) {
                return true;
            }

            _expr_value_null_bits[i] = true;
            val = &null_value;
            has_null = true;
        } else {
            _expr_value_null_bits[i] = false;
        }

        RawValue::write(val, loc, _build_expr_ctxs[i]->root()->type(), nullptr);
    }

    return has_null;
}
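
// Slow-path hash for rows whose layout contains variable-length (string) slots: the
// fixed-length prefix of _expr_values_buffer is hashed as raw bytes, then each string
// slot is hashed through its StringValue pointer and length, or through the null
// sentinel bytes when the slot is null.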
uint32_t HashTable::hash_variable_len_row() {
    uint32_t hash = _initial_seed;

    // Hash the non-var length portions (if there are any)
    if (_var_result_begin != 0) {
        hash = HashUtil::hash(_expr_values_buffer, _var_result_begin, hash);
    }

    for (int i = 0; i < _build_expr_ctxs.size(); ++i) {
        // non-string and null slots are already part of expr_values_buffer
        if (_build_expr_ctxs[i]->root()->type().is_string_type()) {
            void* loc = _expr_values_buffer + _expr_values_buffer_offsets[i];

            if (_expr_value_null_bits[i]) {
                // Hash the null random seed values at 'loc'
                hash = HashUtil::hash(loc, sizeof(StringValue), hash);
            } else {
                // Hash the string
                StringValue* str = reinterpret_cast<StringValue*>(loc);
                hash = HashUtil::hash(str->ptr, str->len, hash);
            }
        }
    }

    return hash;
}
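
// Compares 'build_row' against the row currently cached in _expr_values_buffer (the last
// row passed to eval_build_row()/eval_probe_row()). A null slot matches only when the
// table both stores and finds nulls for that slot and the cached slot is also null.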
bool HashTable::equals(TupleRow* build_row) {
    for (int i = 0; i < _build_expr_ctxs.size(); ++i) {
        void* val = _build_expr_ctxs[i]->get_value(build_row);

        if (val == nullptr) {
            if (!(_stores_nulls && _finds_nulls[i])) {
                return false;
            }

            if (!_expr_value_null_bits[i]) {
                return false;
            }

            continue;
        }

        void* loc = _expr_values_buffer + _expr_values_buffer_offsets[i];
        if (!RawValue::eq(loc, val, _build_expr_ctxs[i]->root()->type())) {
            return false;
        }
    }

    return true;
}
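
// Rehashes the table into 'num_buckets' buckets (must be a power of two). Growth of the
// bucket vector is first checked against the thread's memory limit. When the bucket count
// exactly doubles, each node either stays put or moves to the "sister" bucket at
// (index + old_num_buckets), decided by a single bit of its cached hash; otherwise the
// destination bucket is recomputed from hash & (num_buckets - 1).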
Status HashTable::resize_buckets(int64_t num_buckets) {
    DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2";

    int64_t old_num_buckets = _num_buckets;
    int64_t delta_bytes = (num_buckets - old_num_buckets) * sizeof(Bucket);
    Status st = thread_context()->_thread_mem_tracker_mgr->limiter_mem_tracker_raw()->check_limit(
            delta_bytes);
    if (!st) {
        LOG_EVERY_N(WARNING, 100) << "resize bucket failed: " << st.to_string();
        return st;
    }
    _mem_tracker->consume(delta_bytes);
    _buckets.resize(num_buckets);

    // If we're doubling the number of buckets, all nodes in a particular bucket
    // either remain there, or move down to an analogous bucket in the other half.
    // In order to efficiently check which of the two buckets a node belongs in, the number
    // of buckets must be a power of 2.
    bool doubled_buckets = (num_buckets == old_num_buckets * 2);

    for (int i = 0; i < _num_buckets; ++i) {
        Bucket* bucket = &_buckets[i];
        Bucket* sister_bucket = &_buckets[i + old_num_buckets];
        Node* last_node = nullptr;
        Node* node = bucket->_node;

        while (node != nullptr) {
            Node* next_node = node->_next;
            uint32_t hash = node->_hash;

            bool node_must_move = true;
            Bucket* move_to = nullptr;

            if (doubled_buckets) {
                node_must_move = ((hash & old_num_buckets) != 0);
                move_to = sister_bucket;
            } else {
                int64_t bucket_idx = hash & (num_buckets - 1);
                node_must_move = (bucket_idx != i);
                move_to = &_buckets[bucket_idx];
            }

            if (node_must_move) {
                move_node(bucket, move_to, node, last_node);
            } else {
                last_node = node;
            }

            node = next_node;
        }
    }

    _num_buckets = num_buckets;
    _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets;
    return Status::OK();
}
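
// Allocates a fresh node block holding half of the current total capacity, so the node
// storage grows by roughly 1.5x each time the active block fills up. Older blocks are
// kept alive in _alloc_list so existing Node pointers stay valid; they are freed in close().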
void HashTable::grow_node_array() {
    _current_capacity = _total_capacity / 2;
    _total_capacity += _current_capacity;

    int64_t alloc_size = _current_capacity * _node_byte_size;
    _current_nodes = reinterpret_cast<uint8_t*>(malloc(alloc_size));
    _current_used = 0;
    // TODO: remove memset later
    memset(_current_nodes, 0, alloc_size);

    // add _current_nodes to alloc pool
    _alloc_list.push_back(_current_nodes);
    _end_list.push_back(_current_nodes + alloc_size);
    _mem_tracker->consume(alloc_size);
}
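
// Dumps one line per bucket (optionally skipping empty ones). Without a RowDescriptor
// only each node's cached hash and data address are printed; with one, the full row is
// stringified.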
std::string HashTable::debug_string(bool skip_empty, const RowDescriptor* desc) {
    std::stringstream ss;
    ss << std::endl;

    for (int i = 0; i < _buckets.size(); ++i) {
        Node* node = _buckets[i]._node;
        bool first = true;

        if (skip_empty && node == nullptr) {
            continue;
        }

        ss << i << ": ";

        while (node != nullptr) {
            if (!first) {
                ss << ",";
            }

            if (desc == nullptr) {
                ss << node->_hash << "(" << (void*)node->data() << ")";
            } else {
                ss << (void*)node->data() << " " << node->data()->to_string(*desc);
            }

            node = node->_next;
            first = false;
        }

        ss << std::endl;
    }

    return ss.str();
}
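
// Inserts 'row' only if an equal key is not already present. On success, returns true and
// points '*dest_addr' at the newly allocated TupleRow slot for the caller to fill in.
// Returns false if the key already exists, if the row has nulls the table cannot store,
// or if a required bucket resize fails.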
bool HashTable::emplace_key(TupleRow* row, TupleRow** dest_addr) {
    if (_num_filled_buckets > _num_buckets_till_resize) {
        if (!resize_buckets(_num_buckets * 2).ok()) {
            return false;
        }
    }
    if (_current_used == _current_capacity) {
        grow_node_array();
    }

    bool has_nulls = eval_build_row(row);
    if (!_stores_nulls && has_nulls) {
        return false;
    }

    uint32_t hash = hash_current_row();
    int64_t bucket_idx = hash & (_num_buckets - 1);
    Bucket* bucket = &_buckets[bucket_idx];
    Node* node = bucket->_node;

    bool will_insert = true;
    if (node == nullptr) {
        will_insert = true;
    } else {
        Node* last_node = node;
        while (node != nullptr) {
            if (node->_hash == hash && equals(node->data())) {
                will_insert = false;
                break;
            }

            last_node = node;
            node = node->_next;
        }
        node = last_node;
    }

    if (will_insert) {
        Node* alloc_node =
                reinterpret_cast<Node*>(_current_nodes + _node_byte_size * _current_used++);
        ++_num_nodes;
        TupleRow* data = alloc_node->data();
        *dest_addr = data;
        alloc_node->_hash = hash;
        if (node == nullptr) {
            add_to_bucket(&_buckets[bucket_idx], alloc_node);
        } else {
            node->_next = alloc_node;
        }
    }

    return will_insert;
}
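
// Looks up 'probe_row' using the probe exprs (probe == true) or the build exprs
// (probe == false). Returns an iterator at the first matching node in the bucket chain,
// or end() if no match exists or the row contains nulls the table does not store.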
HashTable::Iterator HashTable::find(TupleRow* probe_row, bool probe) {
    bool has_nulls = probe ? eval_probe_row(probe_row) : eval_build_row(probe_row);
    if (!_stores_nulls && has_nulls) {
        return end();
    }

    uint32_t hash = hash_current_row();
    int64_t bucket_idx = hash & (_num_buckets - 1);
    Bucket* bucket = &_buckets[bucket_idx];
    Node* node = bucket->_node;

    while (node != nullptr) {
        if (node->_hash == hash && equals(node->data())) {
            return Iterator(this, bucket_idx, node, hash);
        }

        node = node->_next;
    }

    return end();
}
HashTable::Iterator HashTable::begin() {
    int64_t bucket_idx = -1;
    Bucket* bucket = next_bucket(&bucket_idx);

    if (bucket != nullptr) {
        return Iterator(this, bucket_idx, bucket->_node, 0);
    }

    return end();
}
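
// Advances '*bucket_idx' to the next non-empty bucket and returns it, or returns nullptr
// (resetting '*bucket_idx' to -1) once the end of the bucket array is reached.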
HashTable::Bucket* HashTable::next_bucket(int64_t* bucket_idx) {
    ++*bucket_idx;

    for (; *bucket_idx < _num_buckets; ++*bucket_idx) {
        if (_buckets[*bucket_idx]._node != nullptr) {
            return &_buckets[*bucket_idx];
        }
    }

    *bucket_idx = -1;
    return nullptr;
}
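
// Unconditionally inserts 'row' (duplicates allowed): the row's tuple pointers are copied
// into a node taken from the current node block and the node is prepended to its bucket's
// chain. Rows with nulls are silently dropped when the table cannot store them.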
void HashTable::insert_impl(TupleRow* row) {
    bool has_null = eval_build_row(row);
    if (!_stores_nulls && has_null) {
        return;
    }

    uint32_t hash = hash_current_row();
    int64_t bucket_idx = hash & (_num_buckets - 1);
    if (_current_used == _current_capacity) {
        grow_node_array();
    }

    // get a node from memory pool
    Node* node = reinterpret_cast<Node*>(_current_nodes + _node_byte_size * _current_used++);
    TupleRow* data = node->data();
    node->_hash = hash;
    memcpy(data, row, sizeof(Tuple*) * _num_build_tuples);
    add_to_bucket(&_buckets[bucket_idx], node);
    ++_num_nodes;
}
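
// Pushes 'node' onto the front of 'bucket''s singly linked chain, updating the
// filled-bucket count and the bucket's size.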
void HashTable::add_to_bucket(Bucket* bucket, Node* node) {
    if (bucket->_node == nullptr) {
        ++_num_filled_buckets;
    }

    node->_next = bucket->_node;
    bucket->_node = node;
    bucket->_size++;
}
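
// Unlinks 'node' from 'from_bucket' ('previous_node' is its predecessor, or nullptr if it
// is the chain head) and re-links it at the front of 'to_bucket'. Used while rehashing in
// resize_buckets().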
void HashTable::move_node(Bucket* from_bucket, Bucket* to_bucket, Node* node, Node* previous_node) {
    Node* next_node = node->_next;
    from_bucket->_size--;

    if (previous_node != nullptr) {
        previous_node->_next = next_node;
    } else {
        // Update bucket directly
        from_bucket->_node = next_node;

        if (next_node == nullptr) {
            --_num_filled_buckets;
        }
    }

    add_to_bucket(to_bucket, node);
}
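
// Scans all buckets and returns the (minimum, maximum) chain length among non-empty
// buckets, or (0, 0) if the table is empty. Useful for gauging hash skew.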
std::pair<int64_t, int64_t> HashTable::minmax_node() {
    bool has_value = false;
    int64_t min_size = std::numeric_limits<int64_t>::max();
    int64_t max_size = std::numeric_limits<int64_t>::min();

    for (const auto bucket : _buckets) {
        int64_t counter = bucket._size;
        if (counter > 0) {
            has_value = true;
            min_size = std::min(counter, min_size);
            max_size = std::max(counter, max_size);
        }
    }

    if (!has_value) {
        return std::make_pair(0, 0);
    }
    return std::make_pair(min_size, max_size);
}
} // namespace doris