From 56809230d1dd8b6d7cb58eb38980db13a87f74ba Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 17 May 2023 14:09:52 +0800 Subject: [PATCH] [Improvement](string function) optimize substring and in string set (#19257) * [Improvement](string function) optimize substring and in string set * update --- be/src/exprs/hybrid_set.h | 15 ++- be/src/vec/functions/function_string.h | 153 +++++++++++++++++-------- 2 files changed, 114 insertions(+), 54 deletions(-) diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index 54a30cd343..73508d56d2 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -60,7 +60,7 @@ public: } // Use '|' instead of '||' has better performance by test. - bool find(const T& value) const { + ALWAYS_INLINE bool find(const T& value) const { if constexpr (N == 1) { return (value == _data[0]); } @@ -585,21 +585,26 @@ public: const doris::vectorized::NullMap* null_map, doris::vectorized::ColumnUInt8::Container& results) { auto& col = assert_cast(column); + const uint32_t* __restrict offset = col.get_offsets().data(); + const uint8_t* __restrict data = col.get_chars().data(); + uint8_t* __restrict cursor = const_cast(data); const uint8_t* __restrict null_map_data; if constexpr (is_nullable) { null_map_data = null_map->data(); } auto* __restrict result_data = results.data(); for (size_t i = 0; i < rows; ++i) { + uint32_t len = offset[i] - offset[i - 1]; if constexpr (!is_nullable && !is_negative) { - result_data[i] = _set.find(col.get_data_at(i)); + result_data[i] = _set.find(StringRef(cursor, len)); } else if constexpr (!is_nullable && is_negative) { - result_data[i] = !_set.find(col.get_data_at(i)); + result_data[i] = !_set.find(StringRef(cursor, len)); } else if constexpr (is_nullable && !is_negative) { - result_data[i] = _set.find(col.get_data_at(i)) & (!null_map_data[i]); + result_data[i] = (!null_map_data[i]) & _set.find(StringRef(cursor, len)); } else { // (is_nullable && is_negative) - result_data[i] = !(_set.find(col.get_data_at(i)) & (!null_map_data[i])); + result_data[i] = !((!null_map_data[i]) & _set.find(StringRef(cursor, len))); } + cursor += len; } } diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index b62778540f..cdcbd2515a 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -170,7 +170,7 @@ private: const PaddedPODArray& start, const PaddedPODArray& len, NullMap& null_map, ColumnString::Chars& res_chars, ColumnString::Offsets& res_offsets) { - int size = offsets.size(); + size_t size = offsets.size(); res_offsets.resize(size); res_chars.reserve(chars.size()); @@ -178,63 +178,118 @@ private: PMR::monotonic_buffer_resource pool {buf.data(), buf.size()}; PMR::vector index {&pool}; - PMR::vector> strs(&pool); - strs.resize(size); auto* __restrict data_ptr = chars.data(); auto* __restrict offset_ptr = offsets.data(); - for (int i = 0; i < size; ++i) { - strs[i].first = data_ptr + offset_ptr[i - 1]; - strs[i].second = offset_ptr[i] - offset_ptr[i - 1]; - } - for (int i = 0; i < size; ++i) { - auto [raw_str, str_size] = strs[i]; - const auto& start_value = start[index_check_const(i, Const)]; - const auto& len_value = len[index_check_const(i, Const)]; + if constexpr (Const) { + const auto start_value = start[0]; + const auto len_value = len[0]; + if (start_value == 0 || len_value <= 0) { + for (size_t i = 0; i < size; ++i) { + StringOP::push_empty_string(i, res_chars, res_offsets); + } + } else { + for (size_t i = 0; i < size; ++i) { + const int str_size = offset_ptr[i] - offset_ptr[i - 1]; + const uint8_t* raw_str = data_ptr + offset_ptr[i - 1]; + // return empty string if start > src.length + if (start_value > str_size || start_value < -str_size || str_size == 0) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } + // reference to string_function.cpp: substring + size_t byte_pos = 0; + index.clear(); + for (size_t j = 0, char_size = 0; + j < str_size && + (start_value <= 0 || index.size() <= start_value + len_value); + j += char_size) { + char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; + index.push_back(j); + } - // return empty string if start > src.length - if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) { - StringOP::push_empty_string(i, res_chars, res_offsets); - continue; - } - // reference to string_function.cpp: substring - size_t byte_pos = 0; - index.clear(); - for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; - index.push_back(j); - if (start_value > 0 && index.size() > start_value + len_value) { - break; + int fixed_pos = start_value; + if (fixed_pos < 0) { + fixed_pos = str_size + fixed_pos + 1; + } else if (fixed_pos > index.size()) { + StringOP::push_null_string(i, res_chars, res_offsets, null_map); + continue; + } + + byte_pos = index[fixed_pos - 1]; + int fixed_len = str_size - byte_pos; + if (fixed_pos + len_value <= index.size()) { + fixed_len = index[fixed_pos + len_value - 1] - byte_pos; + } + + if (byte_pos <= str_size && fixed_len > 0) { + // return StringRef(str.data + byte_pos, fixed_len); + StringOP::push_value_string( + std::string_view {reinterpret_cast(raw_str + byte_pos), + (size_t)fixed_len}, + i, res_chars, res_offsets); + } else { + StringOP::push_empty_string(i, res_chars, res_offsets); + } } } - - int fixed_pos = start_value; - if (fixed_pos < -(int)index.size()) { - StringOP::push_empty_string(i, res_chars, res_offsets); - continue; - } - if (fixed_pos < 0) { - fixed_pos = index.size() + fixed_pos + 1; - } - if (fixed_pos > index.size()) { - StringOP::push_null_string(i, res_chars, res_offsets, null_map); - continue; + } else { + PMR::vector> strs(&pool); + strs.resize(size); + for (int i = 0; i < size; ++i) { + strs[i].first = data_ptr + offset_ptr[i - 1]; + strs[i].second = offset_ptr[i] - offset_ptr[i - 1]; } - byte_pos = index[fixed_pos - 1]; - int fixed_len = str_size - byte_pos; - if (fixed_pos + len_value <= index.size()) { - fixed_len = index[fixed_pos + len_value - 1] - byte_pos; - } + for (size_t i = 0; i < size; ++i) { + auto [raw_str, str_size] = strs[i]; + const auto& start_value = start[i]; + const auto& len_value = len[i]; - if (byte_pos <= str_size && fixed_len > 0) { - // return StringRef(str.data + byte_pos, fixed_len); - StringOP::push_value_string( - std::string_view {reinterpret_cast(raw_str + byte_pos), - (size_t)fixed_len}, - i, res_chars, res_offsets); - } else { - StringOP::push_empty_string(i, res_chars, res_offsets); + // return empty string if start > src.length + if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } + // reference to string_function.cpp: substring + size_t byte_pos = 0; + index.clear(); + for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { + char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; + index.push_back(j); + if (start_value > 0 && index.size() > start_value + len_value) { + break; + } + } + + int fixed_pos = start_value; + if (fixed_pos < -(int)index.size()) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } + if (fixed_pos < 0) { + fixed_pos = index.size() + fixed_pos + 1; + } + if (fixed_pos > index.size()) { + StringOP::push_null_string(i, res_chars, res_offsets, null_map); + continue; + } + + byte_pos = index[fixed_pos - 1]; + int fixed_len = str_size - byte_pos; + if (fixed_pos + len_value <= index.size()) { + fixed_len = index[fixed_pos + len_value - 1] - byte_pos; + } + + if (byte_pos <= str_size && fixed_len > 0) { + // return StringRef(str.data + byte_pos, fixed_len); + StringOP::push_value_string( + std::string_view {reinterpret_cast(raw_str + byte_pos), + (size_t)fixed_len}, + i, res_chars, res_offsets); + } else { + StringOP::push_empty_string(i, res_chars, res_offsets); + } } } }