optimize substr performance and fix ASAN global buffer overflow (#10442)

* add volnitsky substr algorithm

* replace std::search with volnitsky search algorithm in StringSearch

* optimize substring for constant_substring_fn case
use long run length search for performance
This commit is contained in:
Kang
2022-07-12 08:36:21 +08:00
committed by GitHub
parent f5036fea63
commit 4e9d5a7f7a
6 changed files with 894 additions and 10 deletions

View File

@ -101,7 +101,8 @@ Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
// result column
auto res = ColumnUInt8::create();
ColumnUInt8::Container& vec_res = res->get_data();
vec_res.resize(values->size());
// set default value to 0, and match functions only need to set 1/true
vec_res.resize_fill(values->size());
auto* state = reinterpret_cast<LikeState*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
@ -129,6 +130,42 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values,
const ColumnString::Offsets& pattern_offsets,
ColumnUInt8::Container& result, const LikeFn& function,
LikeSearchState* search_state) {
// for constant_substring_fn, use long run length search for performance
if (constant_substring_fn ==
*(function.target<doris::Status (*)(LikeSearchState * state, const StringValue&,
const StringValue&, unsigned char*)>())) {
// treat the contiguous data of multiple strings as one long string
const UInt8* begin = values.data();
const UInt8* end = begin + values.size();
const UInt8* pos = begin;
/// Current index in the array of strings.
size_t i = 0;
size_t needle_size = search_state->substring_pattern.get_pattern_length();
/// We will search for the next occurrence in all strings at once.
while (pos < end) {
// search returns the start offset of the matched substring
pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
if (pos >= end) break;
/// Determine which index it refers to.
/// begin + value_offsets[i] is the start offset of string at i+1
while (begin + value_offsets[i] <= pos) ++i;
/// We check that the entry does not pass through the boundaries of strings.
if (pos + needle_size < begin + value_offsets[i]) {
result[i] = 1;
}
// move to next string offset
pos = begin + value_offsets[i];
++i;
}
return Status::OK();
}
const auto size = value_offsets.size();
for (int i = 0; i < size; ++i) {