diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index e599b563ba..d8fc1b0e6c 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -47,16 +47,26 @@ Status LikeSearchState::clone(LikeSearchState& cloned) { cloned.escape_char = escape_char; cloned.set_search_string(search_string); - if (hs_database) { - std::string re_pattern; - FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); - + std::string re_pattern; + FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); + if (hs_database) { // use hyperscan hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); cloned.hs_database.reset(database); cloned.hs_scratch.reset(scratch); + } else { // fallback to re2 + cloned.hs_database.reset(); + cloned.hs_scratch.reset(); + + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + cloned.regex = std::make_unique(re_pattern, opts); + if (!cloned.regex->ok()) { + return Status::InternalError("Invalid regex expression: {}", re_pattern); + } } return Status::OK(); @@ -198,10 +208,14 @@ Status FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, co Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val, const StringRef& pattern, unsigned char* result) { - auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, state->hs_scratch.get(), - doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + if (state->hs_database) { // use hyperscan + auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, state->hs_scratch.get(), + doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } else { // fallback to re2 + *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex.get()); } return Status::OK(); @@ -213,17 +227,27 @@ Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRe hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); + if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan + auto ret = hs_scan(database, val.data, val.size, 0, scratch, + doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } - auto ret = hs_scan(database, val.data, val.size, 0, scratch, - doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + hs_free_scratch(scratch); + hs_free_database(database); + } else { // fallback to re2 + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + re2::RE2 re(re_pattern, opts); + if (re.ok()) { + *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re); + } else { + return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); + } } - hs_free_scratch(scratch); - hs_free_database(database); - return Status::OK(); } @@ -231,13 +255,22 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS const StringRef& pattern, ColumnUInt8::Container& result) { auto sz = val.size(); - for (size_t i = 0; i < sz; i++) { - const auto& str_ref = val.get_data_at(i); - auto ret = hs_scan( - state->hs_database.get(), str_ref.data, str_ref.size, 0, state->hs_scratch.get(), - doris::vectorized::LikeSearchState::hs_match_handler, (void*)(result.data() + i)); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + if (state->hs_database) { // use hyperscan + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + auto ret = hs_scan(state->hs_database.get(), str_ref.data, str_ref.size, 0, + state->hs_scratch.get(), + doris::vectorized::LikeSearchState::hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + } else { // fallback to re2 + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + *(result.data() + i) = RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), + *state->regex.get()); } } @@ -250,22 +283,37 @@ Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& v hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); + if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch, + doris::vectorized::LikeSearchState::hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } - auto sz = val.size(); - for (size_t i = 0; i < sz; i++) { - const auto& str_ref = val.get_data_at(i); - auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch, - doris::vectorized::LikeSearchState::hs_match_handler, - (void*)(result.data() + i)); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + hs_free_scratch(scratch); + hs_free_database(database); + } else { // fallback to re2 + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + re2::RE2 re(re_pattern, opts); + if (re.ok()) { + auto sz = val.size(); + for (size_t i = 0; i < sz; i++) { + const auto& str_ref = val.get_data_at(i); + *(result.data() + i) = + RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re); + } + } else { + return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); } } - hs_free_scratch(scratch); - hs_free_database(database); - return Status::OK(); } @@ -275,13 +323,22 @@ Status FunctionLikeBase::constant_regex_fn_predicate(LikeSearchState* state, ColumnUInt8::Container& result, const uint16_t* sel, size_t sz) { auto data_ptr = reinterpret_cast(val.get_data().data()); - for (size_t i = 0; i < sz; i++) { - auto ret = hs_scan(state->hs_database.get(), data_ptr[sel[i]].data, data_ptr[sel[i]].size, - 0, state->hs_scratch.get(), - doris::vectorized::LikeSearchState::hs_match_handler, - (void*)(result.data() + i)); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + + if (state->hs_database) { // use hyperscan + for (size_t i = 0; i < sz; i++) { + auto ret = hs_scan(state->hs_database.get(), data_ptr[sel[i]].data, + data_ptr[sel[i]].size, 0, state->hs_scratch.get(), + doris::vectorized::LikeSearchState::hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } + } else { // fallback to re2 + for (size_t i = 0; i < sz; i++) { + *(result.data() + i) = RE2::PartialMatch( + re2::StringPiece(data_ptr[sel[i]].data, data_ptr[sel[i]].size), + *state->regex.get()); } } @@ -297,21 +354,35 @@ Status FunctionLikeBase::regexp_fn_predicate(LikeSearchState* state, hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); + if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + auto ret = hs_scan(database, data_ptr[sel[i]].data, data_ptr[sel[i]].size, 0, scratch, + doris::vectorized::LikeSearchState::hs_match_handler, + (void*)(result.data() + i)); + if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + } + } - auto data_ptr = reinterpret_cast(val.get_data().data()); - for (size_t i = 0; i < sz; i++) { - auto ret = hs_scan(database, data_ptr[sel[i]].data, data_ptr[sel[i]].size, 0, scratch, - doris::vectorized::LikeSearchState::hs_match_handler, - (void*)(result.data() + i)); - if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { - return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); + hs_free_scratch(scratch); + hs_free_database(database); + } else { // fallback to re2 + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + re2::RE2 re(re_pattern, opts); + if (re.ok()) { + auto data_ptr = reinterpret_cast(val.get_data().data()); + for (size_t i = 0; i < sz; i++) { + *(result.data() + i) = RE2::PartialMatch( + re2::StringPiece(data_ptr[sel[i]].data, data_ptr[sel[i]].size), re); + } + } else { + return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); } } - hs_free_scratch(scratch); - hs_free_database(database); - return Status::OK(); } @@ -655,10 +726,24 @@ Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionSta hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, &scratch)); + if (hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) { + // use hyperscan + state->search_state.hs_database.reset(database); + state->search_state.hs_scratch.reset(scratch); + } else { + // fallback to re2 + // reset hs_database to nullptr to indicate not use hyperscan + state->search_state.hs_database.reset(); + state->search_state.hs_scratch.reset(); - state->search_state.hs_database.reset(database); - state->search_state.hs_scratch.reset(scratch); + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + state->search_state.regex = std::make_unique(re_pattern, opts); + if (!state->search_state.regex->ok()) { + return Status::InternalError("Invalid regex expression: {}", pattern_str); + } + } state->function = constant_regex_fn; state->predicate_like_function = constant_regex_fn_predicate; @@ -706,11 +791,23 @@ Status FunctionRegexp::open(FunctionContext* context, FunctionContext::FunctionS } else { hs_database_t* database = nullptr; hs_scratch_t* scratch = nullptr; - RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), &database, &scratch)); - - state->search_state.hs_database.reset(database); - state->search_state.hs_scratch.reset(scratch); - + if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) { + // use hyperscan + state->search_state.hs_database.reset(database); + state->search_state.hs_scratch.reset(scratch); + } else { + // fallback to re2 + // reset hs_database to nullptr to indicate not use hyperscan + state->search_state.hs_database.reset(); + state->search_state.hs_scratch.reset(); + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + state->search_state.regex = std::make_unique(pattern_str, opts); + if (!state->search_state.regex->ok()) { + return Status::InternalError("Invalid regex expression: {}", pattern_str); + } + } state->function = constant_regex_fn; state->predicate_like_function = constant_regex_fn_predicate; state->scalar_function = constant_regex_fn_scalar;