[Improvement](string) Optimize scanning for string #12911

Roughly a 0.2x (about 20%) performance improvement for queries containing string predicates.
This commit is contained in:
Gabriel
2022-09-29 15:11:16 +08:00
committed by GitHub
parent fef1062835
commit 34b14a71c8
4 changed files with 81 additions and 10 deletions

View File

@ -248,12 +248,14 @@ public:
return Status::OK();
}
const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - _cur_idx));
uint32_t len_array[max_fetch];
uint32_t start_offset_array[max_fetch];
for (int i = 0; i < max_fetch; i++, _cur_idx++) {
const uint32_t start_offset = offset(_cur_idx);
uint32_t len = offset(_cur_idx + 1) - start_offset;
uint32_t last_offset = guarded_offset(_cur_idx);
for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
const uint32_t start_offset = last_offset;
last_offset = guarded_offset(_cur_idx + 1);
uint32_t len = last_offset - start_offset;
len_array[i] = len;
start_offset_array[i] = start_offset;
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
@ -262,6 +264,14 @@ public:
}
}
}
_cur_idx++;
len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
start_offset_array[max_fetch - 1] = last_offset;
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
if (_options.need_check_bitmap) {
RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + last_offset)));
}
}
dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array,
max_fetch);
@ -340,13 +350,20 @@ public:
}
private:
static constexpr size_t SIZE_OF_INT32 = sizeof(uint32_t);
// Return the offset within '_data' where the string value with index 'idx' can be found.
// An 'idx' at or beyond '_num_elems' yields '_offsets_pos' rather than reading an offset
// entry; callers in this file subtract a value's start offset from offset(idx + 1) to get
// its length, so this presumably marks the end of the last value (verify against the page
// layout). For any other 'idx' the 32-bit entry stored at
// '_offsets_pos + idx * sizeof(uint32_t)' is decoded via decode_fixed32_le.
uint32_t offset(size_t idx) const {
if (idx >= _num_elems) {
return _offsets_pos;
}
const uint8_t* p =
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * sizeof(uint32_t)]);
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]);
return decode_fixed32_le(p);
}
// Decode the 32-bit offset entry number 'idx' from the offset table that starts at
// '_offsets_pos' inside '_data'.
// Unlike offset(), no 'idx >= _num_elems' check is performed here: the caller is
// responsible for passing an index whose entry actually exists.
uint32_t guarded_offset(size_t idx) const {
    const size_t entry_pos = _offsets_pos + idx * SIZE_OF_INT32;
    const auto* entry = reinterpret_cast<const uint8_t*>(&_data[entry_pos]);
    return decode_fixed32_le(entry);
}

View File

@ -1150,8 +1150,11 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
}
if (!_lazy_materialization_read) {
Status ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
selected_size);
Status ret = Status::OK();
if (selected_size > 0) {
ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
selected_size);
}
if (!ret.ok()) {
return ret;
}
@ -1176,8 +1179,10 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
// when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
// see _vec_init_lazy_materialization
// todo(wb) need to tell input columnids from output columnids
RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
selected_size));
if (selected_size > 0) {
RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
selected_size));
}
}
// shrink char_type suffix zero data

View File

@ -207,6 +207,55 @@ public:
}
}
void insert_many_continuous_strings(const StringRef* strings, size_t num) {
DCHECK_NE(num, 0);
offsets.reserve(offsets.size() + num);
std::vector<const char*> start_points(1);
auto& head = strings[0];
start_points[0] = head.data;
size_t new_size = head.size;
const char* cursor = head.data + new_size;
std::vector<const char*> end_points;
const size_t old_size = chars.size();
size_t offset = old_size;
offset += new_size;
offsets.push_back(offset);
if (num == 1) {
end_points.push_back(cursor);
} else {
for (size_t i = 1; i < num; i++) {
auto& str = strings[i];
if (cursor != str.data) {
end_points.push_back(cursor);
start_points.push_back(str.data);
cursor = str.data;
}
size_t sz = str.size;
offset += sz;
new_size += sz;
cursor += sz;
offsets.push_back_without_reserve(offset);
}
end_points.push_back(cursor);
}
DCHECK_EQ(end_points.size(), start_points.size());
chars.resize(old_size + new_size);
size_t num_range = start_points.size();
Char* data = chars.data();
offset = old_size;
for (size_t i = 0; i < num_range; i++) {
uint32_t len = end_points[i] - start_points[i];
if (len) {
memcpy(data + offset, start_points[i], len);
offset += len;
}
}
}
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
size_t offset_size = offsets.size();

View File

@ -97,7 +97,7 @@ private:
refs[i].data = sv.ptr;
refs[i].size = sv.len;
}
res_ptr->insert_many_strings(refs, sel_size);
res_ptr->insert_many_continuous_strings(refs, sel_size);
}
void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,