[improvement](stream-load) use vector instead of skiplist when insert dup keys (#18686)

This commit is contained in:
huanghaibin
2023-04-23 09:40:09 +08:00
committed by GitHub
parent e7ad536a71
commit 9756be6bf0

View File

@ -214,8 +214,7 @@ void MemTable::_insert_one_row_from_block(RowInBlock* row_in_block) {
_rows++;
bool overwritten = false;
if (_keys_type == KeysType::DUP_KEYS) {
// TODO: dup keys only need a sort operation. Rethink whether a skiplist is the best way to sort columns.
_vec_skip_list->Insert(row_in_block, &overwritten);
// For dup keys, row_in_block is already stored in a vector and will be sorted at the flush stage.
DCHECK(!overwritten) << "Duplicate key model meet overwrite in SkipList";
return;
}
@ -266,11 +265,23 @@ void MemTable::_collect_vskiplist_results() {
VecTable::Iterator it(_vec_skip_list.get());
vectorized::Block in_block = _input_mutable_block.to_block();
if (_keys_type == KeysType::DUP_KEYS) {
vectorized::MutableBlock mutable_block =
vectorized::MutableBlock::build_mutable_block(&in_block);
_vec_row_comparator->set_block(&mutable_block);
std::sort(_row_in_blocks.begin(), _row_in_blocks.end(),
[this](const RowInBlock* l, const RowInBlock* r) -> bool {
auto value = (*(this->_vec_row_comparator))(l, r);
if (value == 0) {
return l->_row_pos > r->_row_pos;
} else {
return value < 0;
}
});
std::vector<int> row_pos_vec;
DCHECK(in_block.rows() <= std::numeric_limits<int>::max());
row_pos_vec.reserve(in_block.rows());
for (it.SeekToFirst(); it.Valid(); it.Next()) {
row_pos_vec.emplace_back(it.key()->_row_pos);
for (int i = 0; i < _row_in_blocks.size(); i++) {
row_pos_vec.emplace_back(_row_in_blocks[i]->_row_pos);
}
_output_mutable_block.add_rows(&in_block, row_pos_vec.data(),
row_pos_vec.data() + in_block.rows());