[optimization] avoid extra memory copy while build hash table (#5301)

avoid extra memory copy while build hash table
This commit is contained in:
stdpain
2021-01-30 04:32:12 -08:00
committed by GitHub
parent 8fe372f82b
commit bf0cb78b67
6 changed files with 183 additions and 205 deletions

View File

@ -40,8 +40,11 @@ HashTable::HashTable(const std::vector<ExprContext*>& build_expr_ctxs,
_initial_seed(initial_seed),
_node_byte_size(sizeof(Node) + sizeof(Tuple*) * _num_build_tuples),
_num_filled_buckets(0),
_nodes(NULL),
_current_nodes(nullptr),
_num_nodes(0),
_current_capacity(num_buckets),
_current_used(0),
_total_capacity(num_buckets),
_exceeded_limit(false),
_mem_tracker(mem_tracker),
_mem_limit_exceeded(false) {
@ -61,13 +64,15 @@ HashTable::HashTable(const std::vector<ExprContext*>& build_expr_ctxs,
memset(_expr_values_buffer, 0, sizeof(uint8_t) * _results_buffer_size);
_expr_value_null_bits = new uint8_t[_build_expr_ctxs.size()];
_nodes_capacity = 1024;
_nodes = reinterpret_cast<uint8_t*>(malloc(_nodes_capacity * _node_byte_size));
memset(_nodes, 0, _nodes_capacity * _node_byte_size);
_alloc_list.reserve(10);
_current_nodes = reinterpret_cast<uint8_t*>(malloc(_current_capacity * _node_byte_size));
// TODO: remove memset later
memset(_current_nodes, 0, _current_capacity * _node_byte_size);
_alloc_list.push_back(_current_nodes);
_mem_tracker->Consume(_nodes_capacity * _node_byte_size);
_mem_tracker->Consume(_current_capacity * _node_byte_size);
if (_mem_tracker->limit_exceeded()) {
mem_limit_exceeded(_nodes_capacity * _node_byte_size);
mem_limit_exceeded(_current_capacity * _node_byte_size);
}
}
@ -77,8 +82,10 @@ void HashTable::close() {
// TODO: use tr1::array?
delete[] _expr_values_buffer;
delete[] _expr_value_null_bits;
free(_nodes);
_mem_tracker->Release(_nodes_capacity * _node_byte_size);
for (auto ptr : _alloc_list) {
free(ptr);
}
_mem_tracker->Release(_total_capacity * _node_byte_size);
_mem_tracker->Release(_buckets.size() * sizeof(Bucket));
}
@ -199,11 +206,10 @@ void HashTable::resize_buckets(int64_t num_buckets) {
Bucket* bucket = &_buckets[i];
Bucket* sister_bucket = &_buckets[i + old_num_buckets];
Node* last_node = NULL;
int node_idx = bucket->_node_idx;
Node* node = bucket->_node;
while (node_idx != -1) {
Node* node = get_node(node_idx);
int64_t next_idx = node->_next_idx;
while (node != nullptr) {
Node* next_node = node->_next;
uint32_t hash = node->_hash;
bool node_must_move = true;
@ -219,12 +225,12 @@ void HashTable::resize_buckets(int64_t num_buckets) {
}
if (node_must_move) {
move_node(bucket, move_to, node_idx, node, last_node);
move_node(bucket, move_to, node, last_node);
} else {
last_node = node;
}
node_idx = next_idx;
node = next_node;
}
}
@ -233,19 +239,19 @@ void HashTable::resize_buckets(int64_t num_buckets) {
}
void HashTable::grow_node_array() {
int64_t old_size = _nodes_capacity * _node_byte_size;
_nodes_capacity = _nodes_capacity + _nodes_capacity / 2;
int64_t new_size = _nodes_capacity * _node_byte_size;
_current_capacity = _total_capacity / 2;
_total_capacity += _current_capacity;
int64_t alloc_size = _current_capacity * _node_byte_size;
_current_nodes = reinterpret_cast<uint8_t*>(malloc(alloc_size));
_current_used = 0;
// TODO: remove memset later
memset(_current_nodes, 0, alloc_size);
// add _current_nodes to alloc pool
_alloc_list.push_back(_current_nodes);
uint8_t* new_nodes = reinterpret_cast<uint8_t*>(malloc(new_size));
memset(new_nodes, 0, new_size);
memcpy(new_nodes, _nodes, old_size);
free(_nodes);
_nodes = new_nodes;
_mem_tracker->Consume(new_size - old_size);
_mem_tracker->Consume(alloc_size);
if (_mem_tracker->limit_exceeded()) {
mem_limit_exceeded(new_size - old_size);
mem_limit_exceeded(alloc_size);
}
}
@ -262,29 +268,27 @@ std::string HashTable::debug_string(bool skip_empty, const RowDescriptor* desc)
ss << std::endl;
for (int i = 0; i < _buckets.size(); ++i) {
int64_t node_idx = _buckets[i]._node_idx;
Node* node = _buckets[i]._node;
bool first = true;
if (skip_empty && node_idx == -1) {
if (skip_empty && node == nullptr) {
continue;
}
ss << i << ": ";
while (node_idx != -1) {
Node* node = get_node(node_idx);
while (node != nullptr) {
if (!first) {
ss << ",";
}
if (desc == NULL) {
ss << node_idx << "(" << (void*)node->data() << ")";
ss << node->_hash << "(" << (void*)node->data() << ")";
} else {
ss << (void*)node->data() << " " << node->data()->to_string(*desc);
}
node_idx = node->_next_idx;
node = node->_next;
first = false;
}