[Improvement](string) Optimize scanning for string #12911
Gives roughly a 0.2x performance boost to queries that contain string predicates.
This commit is contained in:
@ -248,12 +248,14 @@ public:
|
||||
return Status::OK();
|
||||
}
|
||||
const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems - _cur_idx));
|
||||
|
||||
uint32_t len_array[max_fetch];
|
||||
uint32_t start_offset_array[max_fetch];
|
||||
for (int i = 0; i < max_fetch; i++, _cur_idx++) {
|
||||
const uint32_t start_offset = offset(_cur_idx);
|
||||
uint32_t len = offset(_cur_idx + 1) - start_offset;
|
||||
|
||||
uint32_t last_offset = guarded_offset(_cur_idx);
|
||||
for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
|
||||
const uint32_t start_offset = last_offset;
|
||||
last_offset = guarded_offset(_cur_idx + 1);
|
||||
uint32_t len = last_offset - start_offset;
|
||||
len_array[i] = len;
|
||||
start_offset_array[i] = start_offset;
|
||||
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
|
||||
@ -262,6 +264,14 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
_cur_idx++;
|
||||
len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
|
||||
start_offset_array[max_fetch - 1] = last_offset;
|
||||
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
|
||||
if (_options.need_check_bitmap) {
|
||||
RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + last_offset)));
|
||||
}
|
||||
}
|
||||
dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array,
|
||||
max_fetch);
|
||||
|
||||
@ -340,13 +350,20 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr size_t SIZE_OF_INT32 = sizeof(uint32_t);
|
||||
// Return the offset within '_data' where the string value with index 'idx' can be found.
|
||||
uint32_t offset(size_t idx) const {
|
||||
if (idx >= _num_elems) {
|
||||
return _offsets_pos;
|
||||
}
|
||||
const uint8_t* p =
|
||||
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * sizeof(uint32_t)]);
|
||||
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]);
|
||||
return decode_fixed32_le(p);
|
||||
}
|
||||
|
||||
uint32_t guarded_offset(size_t idx) const {
|
||||
const uint8_t* p =
|
||||
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx * SIZE_OF_INT32]);
|
||||
return decode_fixed32_le(p);
|
||||
}
|
||||
|
||||
|
||||
@ -1150,8 +1150,11 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
|
||||
}
|
||||
|
||||
if (!_lazy_materialization_read) {
|
||||
Status ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
|
||||
selected_size);
|
||||
Status ret = Status::OK();
|
||||
if (selected_size > 0) {
|
||||
ret = _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
|
||||
selected_size);
|
||||
}
|
||||
if (!ret.ok()) {
|
||||
return ret;
|
||||
}
|
||||
@ -1176,8 +1179,10 @@ Status SegmentIterator::next_batch(vectorized::Block* block) {
|
||||
// when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
|
||||
// see _vec_init_lazy_materialization
|
||||
// todo(wb) need to tell input columnids from output columnids
|
||||
RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
|
||||
selected_size));
|
||||
if (selected_size > 0) {
|
||||
RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
|
||||
selected_size));
|
||||
}
|
||||
}
|
||||
|
||||
// shrink char_type suffix zero data
|
||||
|
||||
@ -207,6 +207,55 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void insert_many_continuous_strings(const StringRef* strings, size_t num) {
|
||||
DCHECK_NE(num, 0);
|
||||
offsets.reserve(offsets.size() + num);
|
||||
std::vector<const char*> start_points(1);
|
||||
auto& head = strings[0];
|
||||
start_points[0] = head.data;
|
||||
size_t new_size = head.size;
|
||||
const char* cursor = head.data + new_size;
|
||||
std::vector<const char*> end_points;
|
||||
|
||||
const size_t old_size = chars.size();
|
||||
size_t offset = old_size;
|
||||
offset += new_size;
|
||||
offsets.push_back(offset);
|
||||
if (num == 1) {
|
||||
end_points.push_back(cursor);
|
||||
} else {
|
||||
for (size_t i = 1; i < num; i++) {
|
||||
auto& str = strings[i];
|
||||
if (cursor != str.data) {
|
||||
end_points.push_back(cursor);
|
||||
start_points.push_back(str.data);
|
||||
cursor = str.data;
|
||||
}
|
||||
size_t sz = str.size;
|
||||
offset += sz;
|
||||
new_size += sz;
|
||||
cursor += sz;
|
||||
offsets.push_back_without_reserve(offset);
|
||||
}
|
||||
end_points.push_back(cursor);
|
||||
}
|
||||
DCHECK_EQ(end_points.size(), start_points.size());
|
||||
|
||||
chars.resize(old_size + new_size);
|
||||
|
||||
size_t num_range = start_points.size();
|
||||
Char* data = chars.data();
|
||||
|
||||
offset = old_size;
|
||||
for (size_t i = 0; i < num_range; i++) {
|
||||
uint32_t len = end_points[i] - start_points[i];
|
||||
if (len) {
|
||||
memcpy(data + offset, start_points[i], len);
|
||||
offset += len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict,
|
||||
size_t num, uint32_t /*dict_num*/) override {
|
||||
size_t offset_size = offsets.size();
|
||||
|
||||
@ -97,7 +97,7 @@ private:
|
||||
refs[i].data = sv.ptr;
|
||||
refs[i].size = sv.len;
|
||||
}
|
||||
res_ptr->insert_many_strings(refs, sel_size);
|
||||
res_ptr->insert_many_continuous_strings(refs, sel_size);
|
||||
}
|
||||
|
||||
void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,
|
||||
|
||||
Reference in New Issue
Block a user