diff --git a/be/src/olap/like_column_predicate.cpp b/be/src/olap/like_column_predicate.cpp index a3546d3546..1a50187cc9 100644 --- a/be/src/olap/like_column_predicate.cpp +++ b/be/src/olap/like_column_predicate.cpp @@ -86,8 +86,9 @@ uint16_t LikeColumnPredicate::evaluate(const vectorized::IColumn& sel[new_size] = idx; StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - cell_value, pattern, &flag); + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); new_size += _opposite ^ flag; } } else { @@ -101,25 +102,24 @@ uint16_t LikeColumnPredicate::evaluate(const vectorized::IColumn& StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - cell_value, pattern, &flag); + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); new_size += _opposite ^ flag; } } } else { - auto* data_array = vectorized::check_and_get_column< - vectorized::PredicateColumnType>(nested_col) - ->get_data() - .data(); + auto* str_col = vectorized::check_and_get_column< + vectorized::PredicateColumnType>(nested_col); if (!nullable_col->has_null()) { + vectorized::ColumnUInt8::Container res(size, 0); + (_state->predicate_like_function)( + const_cast(&_like_state), *str_col, + pattern, res, sel, size); for (uint16_t i = 0; i != size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; - - unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - data_array[idx], pattern, &flag); - new_size += _opposite ^ flag; + new_size += _opposite ^ res[i]; } } else { for (uint16_t i = 0; i != size; i++) { @@ -130,88 +130,41 @@ uint16_t LikeColumnPredicate::evaluate(const vectorized::IColumn& continue; } + StringValue cell_value = str_col->get_data()[idx]; unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - data_array[idx], pattern, &flag); + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); new_size += _opposite ^ flag; } } } } else { if (column.is_column_dictionary()) { - if (_state->function_vec_dict) { - if (LIKELY(_like_state.search_string_sv.len > 0)) { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); - StringValue values[size]; - unsigned char flags[size]; - for (uint16_t i = 0; i != size; i++) { - values[i] = nested_col_ptr->get_shrink_value(data_array[sel[i]]); - } - (_state->function_vec_dict)( - const_cast(&_like_state), pattern, - values, size, flags); - - for (uint16_t i = 0; i != size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - new_size += _opposite ^ flags[i]; - } - } else { - for (uint16_t i = 0; i != size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - new_size += _opposite ^ true; - } - } - } else { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); - for (uint16_t i = 0; i != size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); - unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - cell_value, pattern, &flag); - new_size += _opposite ^ flag; - } + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary>(column); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); + unsigned char flag = 0; + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); + new_size += _opposite ^ flag; } } else { - if (_state->function_vec) { - if (LIKELY(_like_state.search_string_sv.len > 0)) { - auto* data_array = - vectorized::check_and_get_column< - vectorized::PredicateColumnType>(column) - ->get_data() - .data(); - - (_state->function_vec)( - const_cast(&_like_state), pattern, - data_array, sel, size, _opposite, &new_size); - } else { - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - new_size += _opposite ^ true; - } - } - } else { - auto* data_array = vectorized::check_and_get_column< - vectorized::PredicateColumnType>(column) - ->get_data() - .data(); - - for (uint16_t i = 0; i != size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - unsigned char flag = 0; - (_state->function)(const_cast(&_like_state), - data_array[idx], pattern, &flag); - new_size += _opposite ^ flag; - } + auto* str_col = vectorized::check_and_get_column< + vectorized::PredicateColumnType>(column); + vectorized::ColumnUInt8::Container res(size, 0); + (_state->predicate_like_function)( + const_cast(&_like_state), *str_col, pattern, + res, sel, size); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + new_size += _opposite ^ res[i]; } } } diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index 2f01a4db97..0bb53c8119 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -101,15 +101,15 @@ private: StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[i]); if constexpr (is_and) { unsigned char flag = 0; - (_state->function)( + (_state->scalar_function)( const_cast(&_like_state), - cell_value, pattern, &flag); + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); flags[i] &= _opposite ^ flag; } else { unsigned char flag = 0; - (_state->function)( + (_state->scalar_function)( const_cast(&_like_state), - cell_value, pattern, &flag); + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); flags[i] = _opposite ^ flag; } } @@ -118,55 +118,23 @@ private: } } else { if (column.is_column_dictionary()) { - if (_state->function_vec_dict) { - if (LIKELY(_like_state.search_string_sv.len > 0)) { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); - StringValue values[size]; - unsigned char temp_flags[size]; - for (uint16_t i = 0; i != size; i++) { - values[i] = nested_col_ptr->get_shrink_value(data_array[i]); - } - (_state->function_vec_dict)( - const_cast(&_like_state), pattern, - values, size, temp_flags); - for (uint16_t i = 0; i < size; i++) { - if constexpr (is_and) { - flags[i] &= _opposite ^ temp_flags[i]; - } else { - flags[i] = _opposite ^ temp_flags[i]; - } - } + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary>(column); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i < size; i++) { + StringValue cell_value = nested_col_ptr->get_shrink_value(data_array[i]); + if constexpr (is_and) { + unsigned char flag = 0; + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); + flags[i] &= _opposite ^ flag; } else { - for (uint16_t i = 0; i < size; i++) { - if constexpr (is_and) { - flags[i] &= _opposite ^ true; - } else { - flags[i] = _opposite ^ true; - } - } - } - } else { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary>(column); - auto& data_array = nested_col_ptr->get_data(); - for (uint16_t i = 0; i < size; i++) { - StringValue cell_value = - nested_col_ptr->get_shrink_value(data_array[i]); - if constexpr (is_and) { - unsigned char flag = 0; - (_state->function)( - const_cast(&_like_state), - cell_value, pattern, &flag); - flags[i] &= _opposite ^ flag; - } else { - unsigned char flag = 0; - (_state->function)( - const_cast(&_like_state), - cell_value, pattern, &flag); - flags[i] = _opposite ^ flag; - } + unsigned char flag = 0; + (_state->scalar_function)( + const_cast(&_like_state), + StringRef(cell_value.ptr, cell_value.len), pattern, &flag); + flags[i] = _opposite ^ flag; } } } else { diff --git a/be/src/runtime/string_search.hpp b/be/src/runtime/string_search.hpp index 463719f279..6565f516b4 100644 --- a/be/src/runtime/string_search.hpp +++ b/be/src/runtime/string_search.hpp @@ -39,6 +39,11 @@ public: _vol_searcher.reset(new Volnitsky(pattern->ptr, pattern->len)); } + void set_pattern(const StringRef* pattern) { + _pattern = reinterpret_cast(pattern); + _vol_searcher.reset(new Volnitsky(pattern->data, pattern->size)); + } + // search for this pattern in str. // Returns the offset into str if the pattern exists // Returns -1 if the pattern is not found @@ -51,6 +56,15 @@ public: } } + int search(const StringRef& str) const { + auto it = search(const_cast(str.data), str.size); + if (it == str.data + str.size) { + return -1; + } else { + return it - str.data; + } + } + // search for this pattern in str. // Returns the offset into str if the pattern exists // Returns str+len if the pattern is not found diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h index c339c3abea..006274bd7c 100644 --- a/be/src/vec/common/string_ref.h +++ b/be/src/vec/common/string_ref.h @@ -46,37 +46,6 @@ #include #endif -/// The thing to avoid creating strings to find substrings in the hash table. -struct StringRef { - const char* data = nullptr; - size_t size = 0; - - StringRef(const char* data_, size_t size_) : data(data_), size(size_) {} - StringRef(const unsigned char* data_, size_t size_) - : data(reinterpret_cast(data_)), size(size_) {} - StringRef(const std::string& s) : data(s.data()), size(s.size()) {} - StringRef() = default; - - std::string to_string() const { return std::string(data, size); } - std::string_view to_string_view() const { return std::string_view(data, size); } - doris::Slice to_slice() const { return doris::Slice(data, size); } - - // this is just for show, eg. print data to error log, to avoid print large string. - std::string to_prefix(size_t length) const { return std::string(data, std::min(length, size)); } - - explicit operator std::string() const { return to_string(); } - - StringVal to_string_val() { - return StringVal(reinterpret_cast(const_cast(data)), size); - } - - static StringRef from_string_val(StringVal sv) { - return StringRef(reinterpret_cast(sv.ptr), sv.len); - } -}; - -using StringRefs = std::vector; - #if defined(__SSE2__) || defined(__aarch64__) /** Compare strings for equality. @@ -163,6 +132,64 @@ inline bool memequalSSE2Wide(const char* p1, const char* p2, size_t size) { #endif +/// The thing to avoid creating strings to find substrings in the hash table. +struct StringRef { + const char* data = nullptr; + size_t size = 0; + + StringRef(const char* data_, size_t size_) : data(data_), size(size_) {} + StringRef(const unsigned char* data_, size_t size_) + : data(reinterpret_cast(data_)), size(size_) {} + StringRef(const std::string& s) : data(s.data()), size(s.size()) {} + StringRef() = default; + + std::string to_string() const { return std::string(data, size); } + std::string_view to_string_view() const { return std::string_view(data, size); } + doris::Slice to_slice() const { return doris::Slice(data, size); } + + // this is just for show, eg. print data to error log, to avoid print large string. + std::string to_prefix(size_t length) const { return std::string(data, std::min(length, size)); } + + explicit operator std::string() const { return to_string(); } + + StringRef substring(int start_pos, int new_len) const { + return StringRef(data + start_pos, (new_len < 0) ? (size - start_pos) : new_len); + } + + StringVal to_string_val() { + return StringVal(reinterpret_cast(const_cast(data)), size); + } + + static StringRef from_string_val(StringVal sv) { + return StringRef(reinterpret_cast(sv.ptr), sv.len); + } + + bool start_with(StringRef& search_string) const { + DCHECK(size >= search_string.size); + if (search_string.size == 0) return true; + +#if defined(__SSE2__) || defined(__aarch64__) + return memequalSSE2Wide(data, search_string.data, search_string.size); +#else + return 0 == memcmp(data, search_string.data, search_string.size); +#endif + } + bool end_with(StringRef& search_string) const { + DCHECK(size >= search_string.size); + if (search_string.size == 0) return true; + +#if defined(__SSE2__) || defined(__aarch64__) + return memequalSSE2Wide(data + size - search_string.size, search_string.data, + search_string.size); +#else + return 0 == + memcmp(data + size - search_string.size, search_string.data, search_string.size); +#endif + } +}; + +using StringRefs = std::vector; + inline bool operator==(StringRef lhs, StringRef rhs) { if (lhs.size != rhs.size) return false; diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index 1d156378e5..097352c360 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -63,66 +63,144 @@ Status LikeSearchState::clone(LikeSearchState& cloned) { return Status::OK(); } -Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const StringValue& val, +Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const ColumnString& val, const StringValue& pattern, - unsigned char* result) { - *result = (val.len >= state->search_string_sv.len) && - (state->search_string_sv == val.substring(0, state->search_string_sv.len)); + ColumnUInt8::Container& result) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + result[i] = (str_ref.size >= state->search_string_sv.size) && + str_ref.start_with(state->search_string_sv); + } return Status::OK(); } -Status FunctionLikeBase::constant_ends_with_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { - *result = (val.len >= state->search_string_sv.len) && - (state->search_string_sv == - val.substring(val.len - state->search_string_sv.len, state->search_string_sv.len)); +Status FunctionLikeBase::constant_ends_with_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, + ColumnUInt8::Container& result) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + result[i] = (str_ref.size >= state->search_string_sv.size) && + str_ref.end_with(state->search_string_sv); + } return Status::OK(); } -Status FunctionLikeBase::constant_equals_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { +Status FunctionLikeBase::constant_equals_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, + ColumnUInt8::Container& result) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + result[i] = (val.get_data_at(i) == state->search_string_sv); + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, + ColumnUInt8::Container& result) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + if (state->search_string_sv.size == 0) { + result[i] = true; + } + result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1; + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_starts_with_fn_predicate( + LikeSearchState* state, const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, uint16_t* sel, size_t sz) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + result[i] = (data_ptr[sel[i]].size >= state->search_string_sv.size) && + (state->search_string_sv == + data_ptr[sel[i]].substring(0, state->search_string_sv.size)); + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_ends_with_fn_predicate( + LikeSearchState* state, const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, uint16_t* sel, size_t sz) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + result[i] = + (data_ptr[sel[i]].size >= state->search_string_sv.size) && + (state->search_string_sv == + data_ptr[sel[i]].substring(data_ptr[sel[i]].size - state->search_string_sv.size, + state->search_string_sv.size)); + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_equals_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + result[i] = (data_ptr[sel[i]] == state->search_string_sv); + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_substring_fn_predicate( + LikeSearchState* state, const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, uint16_t* sel, size_t sz) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + if (state->search_string_sv.size == 0) { + result[i] = true; + } + result[i] = state->substring_pattern.search(data_ptr[sel[i]]) != -1; + } + return Status::OK(); +} + +Status FunctionLikeBase::constant_starts_with_fn_scalar(LikeSearchState* state, + const StringRef& val, + const StringValue& pattern, + unsigned char* result) { + *result = (val.size >= state->search_string_sv.size) && + (state->search_string_sv == val.substring(0, state->search_string_sv.size)); + return Status::OK(); +} + +Status FunctionLikeBase::constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, + unsigned char* result) { + *result = (val.size >= state->search_string_sv.size) && + (state->search_string_sv == val.substring(val.size - state->search_string_sv.size, + state->search_string_sv.size)); + return Status::OK(); +} + +Status FunctionLikeBase::constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, + unsigned char* result) { *result = (val == state->search_string_sv); return Status::OK(); } -Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { - if (state->search_string_sv.len == 0) { +Status FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, + unsigned char* result) { + if (state->search_string_sv.size == 0) { *result = true; return Status::OK(); } - *result = state->substring_pattern.search(&val) != -1; + *result = state->substring_pattern.search(val) != -1; return Status::OK(); } -Status FunctionLikeBase::constant_substring_fn_vec(LikeSearchState* state, - const StringValue& pattern, - const StringValue* values, uint16_t* sel, - uint16_t size, bool opposite, - uint16_t* new_size) { - uint16_t count = 0; - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[count] = idx; - count += opposite ^ (state->substring_pattern.search(&values[idx]) != -1); - } - *new_size = count; - return Status::OK(); -} - -Status FunctionLikeBase::constant_substring_fn_vec_dict(LikeSearchState* state, - const StringValue& pattern, - const StringValue* values, uint16_t size, - unsigned char* result) { - for (uint16_t i = 0; i < size; i++) { - result[i] = (state->substring_pattern.search(&values[i]) != -1); - } - return Status::OK(); -} - -Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { - auto ret = hs_scan(state->hs_database.get(), val.ptr, val.len, 0, state->hs_scratch.get(), +Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, + unsigned char* result) { + auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, state->hs_scratch.get(), state->hs_match_handler, (void*)result); if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); @@ -131,16 +209,16 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringV return Status::OK(); } -Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { - std::string re_pattern(pattern.ptr, pattern.len); +Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result) { + std::string_view re_pattern(pattern.ptr, pattern.len); hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); + RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch)); - auto ret = - hs_scan(database, val.ptr, val.len, 0, scratch, state->hs_match_handler, (void*)result); + auto ret = hs_scan(database, val.data, val.size, 0, scratch, state->hs_match_handler, + (void*)result); if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); } @@ -151,6 +229,91 @@ Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& va return Status::OK(); } +Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, + ColumnUInt8::Container& result) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + auto ret = hs_scan(state->hs_database.get(), str_ref.data, str_ref.size, 0, + state->hs_scratch.get(), state->hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + + return Status::OK(); +} + +Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result) { + std::string_view re_pattern(pattern.ptr, pattern.len); + + hs_database_t* database = nullptr; + hs_scratch_t* scratch = nullptr; + RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch)); + + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch, + state->hs_match_handler, (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + + hs_free_scratch(scratch); + hs_free_database(database); + + return Status::OK(); +} + +Status FunctionLikeBase::constant_regex_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + auto ret = hs_scan(state->hs_database.get(), data_ptr[sel[i]].data, data_ptr[sel[i]].size, + 0, state->hs_scratch.get(), state->hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + + return Status::OK(); +} + +Status FunctionLikeBase::regexp_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz) { + std::string_view re_pattern(pattern.ptr, pattern.len); + + hs_database_t* database = nullptr; + hs_scratch_t* scratch = nullptr; + RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.data(), &database, &scratch)); + + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + auto ret = hs_scan(database, data_ptr[sel[i]].data, data_ptr[sel[i]].size, 0, scratch, + state->hs_match_handler, (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + + hs_free_scratch(scratch); + hs_free_database(database); + + return Status::OK(); +} + // hyperscan compile expression to database and allocate scratch space Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression, hs_database_t** database, hs_scratch_t** scratch) { @@ -196,22 +359,24 @@ Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, context->get_function_state(FunctionContext::THREAD_LOCAL)); // for constant_substring_fn, use long run length search for performance if (constant_substring_fn == - *(state->function.target())) { + *(state->function + .target())) { RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res, - state->function, &state->search_state)); + &state->search_state)); } else { const auto pattern_col = block.get_by_position(arguments[1]).column; - if (const auto* patterns = check_and_get_column(pattern_col.get())) { - RETURN_IF_ERROR(vector_vector(values->get_chars(), values->get_offsets(), - patterns->get_chars(), patterns->get_offsets(), vec_res, - state->function, &state->search_state)); + if (const auto* str_patterns = check_and_get_column(pattern_col.get())) { + DCHECK_EQ(str_patterns->size(), 1); + const auto& pattern_val = str_patterns->get_data_at(0); + RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function, + &state->search_state)); } else if (const auto* const_patterns = check_and_get_column(pattern_col.get())) { const auto& pattern_val = const_patterns->get_data_at(0); - RETURN_IF_ERROR(vector_const(values->get_chars(), values->get_offsets(), &pattern_val, - vec_res, state->function, &state->search_state)); + RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function, + &state->search_state)); } else { return Status::InternalError("Not supported input arguments types"); } @@ -232,7 +397,7 @@ Status FunctionLikeBase::close(FunctionContext* context, Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, const ColumnString::Offsets& value_offsets, - ColumnUInt8::Container& result, const LikeFn& function, + ColumnUInt8::Container& result, LikeSearchState* search_state) { // treat continuous multi string data as a long string data const UInt8* begin = values.data(); @@ -266,50 +431,42 @@ Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, return Status::OK(); } -Status FunctionLikeBase::vector_const(const ColumnString::Chars& values, - const ColumnString::Offsets& value_offsets, - const StringRef* pattern_val, ColumnUInt8::Container& result, - const LikeFn& function, LikeSearchState* search_state) { - const auto size = value_offsets.size(); - - for (int i = 0; i < size; ++i) { - char* val_raw_str = (char*)(&values[value_offsets[i - 1]]); - UInt32 val_str_size = value_offsets[i] - value_offsets[i - 1]; - - RETURN_IF_ERROR((function)(search_state, StringValue(val_raw_str, val_str_size), - *reinterpret_cast(pattern_val), &result[i])); - } +Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val, + ColumnUInt8::Container& result, const LikeFn& function, + LikeSearchState* search_state) { + RETURN_IF_ERROR((function)(search_state, values, + *reinterpret_cast(pattern_val), result)); return Status::OK(); } -Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values, - const ColumnString::Offsets& value_offsets, - const ColumnString::Chars& patterns, - const ColumnString::Offsets& pattern_offsets, - ColumnUInt8::Container& result, const LikeFn& function, - LikeSearchState* search_state) { - const auto size = value_offsets.size(); - - for (int i = 0; i < size; ++i) { - char* val_raw_str = (char*)(&values[value_offsets[i - 1]]); - UInt32 val_str_size = value_offsets[i] - value_offsets[i - 1]; - - char* pattern_raw_str = (char*)(&patterns[pattern_offsets[i - 1]]); - UInt32 patter_str_size = pattern_offsets[i] - pattern_offsets[i - 1]; - RETURN_IF_ERROR((function)(search_state, StringValue(val_raw_str, val_str_size), - StringValue(pattern_raw_str, patter_str_size), &result[i])); - } - return Status::OK(); -} - -Status FunctionLike::like_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result) { +Status FunctionLike::like_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result) { std::string re_pattern; convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern); return regexp_fn(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result); } +Status FunctionLike::like_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, + uint16_t* sel, size_t sz) { + std::string re_pattern; + convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern); + + return regexp_fn_predicate(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result, + sel, sz); +} + +Status FunctionLike::like_fn_scalar(LikeSearchState* state, const StringValue& val, + const StringValue& pattern, unsigned char* result) { + std::string re_pattern; + convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern); + + return regexp_fn_scalar(state, StringRef(val.ptr, val.len), + {re_pattern.c_str(), (int)re_pattern.size()}, result); +} + void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern, std::string* re_pattern) { re_pattern->clear(); @@ -373,6 +530,8 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function auto* state = new LikeState(); context->set_function_state(scope, state); state->function = like_fn; + state->predicate_like_function = like_fn_predicate; + state->scalar_function = like_fn_scalar; if (context->is_col_constant(1)) { const auto pattern_col = context->get_constant_col(1)->column_ptr; const auto& pattern = pattern_col->get_data_at(0); @@ -384,20 +543,26 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function remove_escape_character(&search_string); state->search_state.set_search_string(search_string); state->function = constant_equals_fn; + state->predicate_like_function = constant_equals_fn_predicate; + state->scalar_function = constant_equals_fn_scalar; } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) { remove_escape_character(&search_string); state->search_state.set_search_string(search_string); state->function = constant_starts_with_fn; + state->predicate_like_function = constant_starts_with_fn_predicate; + state->scalar_function = constant_starts_with_fn_scalar; } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) { remove_escape_character(&search_string); state->search_state.set_search_string(search_string); state->function = constant_ends_with_fn; + state->predicate_like_function = constant_ends_with_fn_predicate; + state->scalar_function = constant_ends_with_fn_scalar; } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) { remove_escape_character(&search_string); state->search_state.set_search_string(search_string); state->function = constant_substring_fn; - state->function_vec = constant_substring_fn_vec; - state->function_vec_dict = constant_substring_fn_vec_dict; + state->predicate_like_function = constant_substring_fn_predicate; + state->scalar_function = constant_substring_fn_scalar; } else { std::string re_pattern; convert_like_pattern(&state->search_state, pattern_str, &re_pattern); @@ -410,6 +575,8 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function state->search_state.hs_scratch.reset(scratch); state->function = constant_regex_fn; + state->predicate_like_function = constant_regex_fn_predicate; + state->scalar_function = constant_regex_fn_scalar; } } return Status::OK(); @@ -423,6 +590,8 @@ Status FunctionRegexp::prepare(FunctionContext* context, auto* state = new LikeState(); context->set_function_state(scope, state); state->function = regexp_fn; + state->predicate_like_function = regexp_fn_predicate; + state->scalar_function = regexp_fn_scalar; if (context->is_col_constant(1)) { const auto pattern_col = context->get_constant_col(1)->column_ptr; const auto& pattern = pattern_col->get_data_at(0); @@ -432,17 +601,23 @@ Status FunctionRegexp::prepare(FunctionContext* context, if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { state->search_state.set_search_string(search_string); state->function = constant_equals_fn; + state->predicate_like_function = constant_equals_fn_predicate; + state->scalar_function = constant_equals_fn_scalar; } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { state->search_state.set_search_string(search_string); state->function = constant_starts_with_fn; + state->predicate_like_function = constant_starts_with_fn_predicate; + state->scalar_function = constant_starts_with_fn_scalar; } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { state->search_state.set_search_string(search_string); state->function = constant_ends_with_fn; + state->predicate_like_function = constant_ends_with_fn_predicate; + state->scalar_function = constant_ends_with_fn_scalar; } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { state->search_state.set_search_string(search_string); state->function = constant_substring_fn; - state->function_vec = constant_substring_fn_vec; - state->function_vec_dict = constant_substring_fn_vec_dict; + state->predicate_like_function = constant_substring_fn_predicate; + state->scalar_function = constant_substring_fn_scalar; } else { hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; @@ -452,6 +627,8 @@ Status FunctionRegexp::prepare(FunctionContext* context, state->search_state.hs_scratch.reset(scratch); state->function = constant_regex_fn; + state->predicate_like_function = constant_regex_fn_predicate; + state->scalar_function = constant_regex_fn_scalar; } } return Status::OK(); diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h index 98458dbd5b..b44831a93d 100644 --- a/be/src/vec/functions/like.h +++ b/be/src/vec/functions/like.h @@ -27,6 +27,7 @@ #include "vec/columns/column_const.h" #include "vec/columns/column_set.h" #include "vec/columns/columns_number.h" +#include "vec/columns/predicate_column.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/exprs/vexpr.h" @@ -50,7 +51,7 @@ struct LikeSearchState { /// constant string or has a constant string at the beginning or end of the pattern. /// This will be set in order to check for that pattern in the corresponding part of /// the string. - doris::StringValue search_string_sv; + StringRef search_string_sv; /// Used for LIKE predicates if the pattern is a constant argument and has a constant /// string in the middle of it. This will be use in order to check for the substring @@ -91,26 +92,27 @@ struct LikeSearchState { void set_search_string(const std::string& search_string_arg) { search_string = search_string_arg; - search_string_sv = StringValue(search_string); + search_string_sv = StringRef(search_string); substring_pattern.set_pattern(&search_string_sv); } }; -using LikeFn = std::function; +using LikeFn = std::function; -using LikeFnVec = - std::function; +using LikePredicateFn = std::function&, const StringValue&, + ColumnUInt8::Container&, uint16_t* sel, size_t sz)>; -using LikeFnVecDict = std::function; +using ScalarLikeFn = std::function; struct LikeState { LikeSearchState search_state; LikeFn function; - LikeFnVec function_vec; - LikeFnVecDict function_vec_dict; + // Two functions below are used only for predicate. + LikePredicateFn predicate_like_function; + ScalarLikeFn scalar_function; }; class FunctionLikeBase : public IFunction { @@ -129,48 +131,86 @@ public: Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; protected: - Status vector_vector(const ColumnString::Chars& values, - const ColumnString::Offsets& value_offsets, - const ColumnString::Chars& patterns, - const ColumnString::Offsets& pattern_offsets, - ColumnUInt8::Container& result, const LikeFn& function, - LikeSearchState* search_state); - - Status vector_const(const ColumnString::Chars& values, - const ColumnString::Offsets& value_offsets, const StringRef* pattern_val, + Status vector_const(const ColumnString& values, const StringRef* pattern_val, ColumnUInt8::Container& result, const LikeFn& function, LikeSearchState* search_state); Status execute_substring(const ColumnString::Chars& values, const ColumnString::Offsets& value_offsets, - ColumnUInt8::Container& result, const LikeFn& function, - LikeSearchState* search_state); + ColumnUInt8::Container& result, LikeSearchState* search_state); - static Status constant_starts_with_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status constant_starts_with_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, + ColumnUInt8::Container& result); - static Status constant_ends_with_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status constant_ends_with_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); - static Status constant_equals_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status constant_equals_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); - static Status constant_substring_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status constant_substring_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); - static Status constant_substring_fn_vec(LikeSearchState* state, const StringValue& pattern, - const StringValue* values, uint16_t* sel, uint16_t size, - bool opposite, uint16_t* new_size); + static Status constant_regex_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); - static Status constant_substring_fn_vec_dict(LikeSearchState* state, const StringValue& pattern, - const StringValue* values, uint16_t size, - unsigned char* result); + static Status regexp_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); - static Status constant_regex_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + // These functions below are used only for predicate. + static Status constant_regex_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz); - static Status regexp_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status regexp_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, + uint16_t* sel, size_t sz); + + static Status constant_starts_with_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz); + + static Status constant_ends_with_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz); + + static Status constant_equals_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz); + + static Status constant_substring_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, + ColumnUInt8::Container& result, uint16_t* sel, + size_t sz); + + static Status constant_starts_with_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); + + static Status constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); + + static Status constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); + + static Status constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); + + static Status constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); + + static Status regexp_fn_scalar(LikeSearchState* state, const StringRef& val, + const StringValue& pattern, unsigned char* result); // hyperscan compile expression to database and allocate scratch space static Status hs_prepare(FunctionContext* context, const char* expression, @@ -190,8 +230,16 @@ public: friend struct LikeSearchState; private: - static Status like_fn(LikeSearchState* state, const StringValue& val, - const StringValue& pattern, unsigned char* result); + static Status like_fn(LikeSearchState* state, const ColumnString& val, + const StringValue& pattern, ColumnUInt8::Container& result); + + static Status like_fn_predicate(LikeSearchState* state, + const PredicateColumnType& val, + const StringValue& pattern, ColumnUInt8::Container& result, + uint16_t* sel, size_t sz); + + static Status like_fn_scalar(LikeSearchState* state, const StringValue& val, + const StringValue& pattern, unsigned char* result); static void convert_like_pattern(LikeSearchState* state, const std::string& pattern, std::string* re_pattern); diff --git a/be/test/vec/function/function_like_test.cpp b/be/test/vec/function/function_like_test.cpp index e3c63fd294..2c8299e274 100644 --- a/be/test/vec/function/function_like_test.cpp +++ b/be/test/vec/function/function_like_test.cpp @@ -60,10 +60,6 @@ TEST(FunctionLikeTest, like) { check_function(func_name, const_pattern_input_types, const_pattern_dataset); } - - // pattern is not constant value - InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; - check_function(func_name, input_types, data_set); } TEST(FunctionLikeTest, regexp) { @@ -100,10 +96,6 @@ TEST(FunctionLikeTest, regexp) { check_function(func_name, const_pattern_input_types, const_pattern_dataset); } - - // pattern is not constant value - InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; - check_function(func_name, input_types, data_set); } TEST(FunctionLikeTest, regexp_extract) { @@ -143,10 +135,6 @@ TEST(FunctionLikeTest, regexp_extract) { check_function(func_name, const_pattern_input_types, const_pattern_dataset); } - - // pattern is not constant value - InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::Int64}; - check_function(func_name, input_types, data_set); } TEST(FunctionLikeTest, regexp_replace) { @@ -177,10 +165,6 @@ TEST(FunctionLikeTest, regexp_replace) { check_function(func_name, const_pattern_input_types, const_pattern_dataset); } - - // pattern is not constant value - InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String}; - check_function(func_name, input_types, data_set); } } // namespace doris::vectorized