// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This file is copied from // https://github.com/apache/impala/blob/branch-2.9.0/be/src/exprs/like-predicate.cc // and modified by Doris #include "exprs/like_predicate.h" #include #include #include "exprs/string_functions.h" namespace doris { // A regex to match any regex pattern is equivalent to a substring search. static const RE2 SUBSTRING_RE( "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); // A regex to match any regex pattern which is equivalent to matching a constant string // at the end of the string values. static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); // A regex to match any regex pattern which is equivalent to matching a constant string // at the end of the string values. static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); // A regex to match any regex pattern which is equivalent to a constant string match. static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)"); static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)"); static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)"); static const re2::RE2 LIKE_EQUALS_RE("(((\\\\%)|(\\\\_)|([^%_]))+)"); void LikePredicate::init() {} void LikePredicate::like_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) { if (scope != FunctionContext::THREAD_LOCAL) { return; } LikePredicateState* state = new LikePredicateState(); state->function = like_fn; context->set_function_state(scope, state); if (context->is_arg_constant(1)) { StringVal pattern_val = *reinterpret_cast(context->get_constant_arg(1)); if (pattern_val.is_null) { return; } StringValue pattern = StringValue::from_string_val(pattern_val); std::string pattern_str(pattern.ptr, pattern.len); std::string search_string; if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) { remove_escape_character(&search_string); state->set_search_string(search_string); state->function = constant_ends_with_fn; } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) { remove_escape_character(&search_string); state->set_search_string(search_string); state->function = constant_substring_fn; } else if (RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) { remove_escape_character(&search_string); state->set_search_string(search_string); state->function = constant_equals_fn; } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) { remove_escape_character(&search_string); state->set_search_string(search_string); state->function = constant_starts_with_fn; } else { std::string re_pattern; convert_like_pattern(context, *reinterpret_cast(context->get_constant_arg(1)), &re_pattern); RE2::Options opts; opts.set_never_nl(false); opts.set_dot_nl(true); state->regex.reset(new RE2(re_pattern, opts)); if (!state->regex->ok()) { context->set_error("Invalid regex: $0"); } } } } BooleanVal LikePredicate::like(FunctionContext* context, const StringVal& val, const StringVal& pattern) { LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); return (state->function)(context, val, pattern); } void LikePredicate::like_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) { if (scope == FunctionContext::THREAD_LOCAL) { LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); delete state; } } void LikePredicate::regex_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) { if (scope != FunctionContext::THREAD_LOCAL) { return; } LikePredicateState* state = new LikePredicateState(); context->set_function_state(scope, state); state->function = regex_fn; if (context->is_arg_constant(1)) { StringVal* pattern = reinterpret_cast(context->get_constant_arg(1)); if (pattern->is_null) { return; } std::string pattern_str(reinterpret_cast(pattern->ptr), pattern->len); std::string search_string; // The following four conditionals check if the pattern is a constant string, // starts with a constant string and is followed by any number of wildcard characters, // ends with a constant string and is preceded by any number of wildcard characters or // has a constant substring surrounded on both sides by any number of wildcard // characters. In any of these conditions, we can search for the pattern more // efficiently by using our own string match functions rather than regex matching. if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { state->set_search_string(search_string); state->function = constant_equals_fn; } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { state->set_search_string(search_string); state->function = constant_starts_with_fn; } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { state->set_search_string(search_string); state->function = constant_ends_with_fn; } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { state->set_search_string(search_string); state->function = constant_substring_fn; } else { RE2::Options opts; opts.set_never_nl(false); opts.set_dot_nl(true); state->regex.reset(new RE2(pattern_str, opts)); if (!state->regex->ok()) { std::stringstream error; error << "Invalid regex expression" << pattern->ptr; context->set_error(error.str().c_str()); } state->function = constant_regex_fn_partial; } } } BooleanVal LikePredicate::regex(FunctionContext* context, const StringVal& val, const StringVal& pattern) { LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); return (state->function)(context, val, pattern); } // This prepare function is used only when 3 parameters are passed to the regexp_like() // function. For the 2 parameter version, the RegexPrepare() function is used to prepare. void LikePredicate::regexp_like_prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) { if (scope != FunctionContext::THREAD_LOCAL) { return; } LikePredicateState* state = new LikePredicateState(); context->set_function_state(scope, state); // If both the pattern and the match parameter are constant, we pre-compile the // regular expression once here. Otherwise, the RE is compiled per row in RegexpLike() if (context->is_arg_constant(1) && context->is_arg_constant(2)) { StringVal* pattern = nullptr; pattern = reinterpret_cast(context->get_constant_arg(1)); if (pattern->is_null) { return; } StringVal* match_parameter = reinterpret_cast(context->get_constant_arg(2)); std::stringstream error; if (match_parameter->is_null) { error << "match parameter is null"; context->set_error(error.str().c_str()); return; } RE2::Options opts; opts.set_never_nl(false); opts.set_dot_nl(true); std::string error_str; if (!StringFunctions::set_re2_options(*match_parameter, &error_str, &opts)) { context->set_error(error_str.c_str()); return; } std::string pattern_str(reinterpret_cast(pattern->ptr), pattern->len); state->regex.reset(new RE2(pattern_str, opts)); if (!state->regex->ok()) { error << "Invalid regex expression" << pattern->ptr; context->set_error(error.str().c_str()); } } } // This is used only for the 3 parameter version of regexp_like(). The 2 parameter // version calls Regex() directly. BooleanVal LikePredicate::regexp_like(FunctionContext* context, const StringVal& val, const StringVal& pattern, const StringVal& match_parameter) { if (val.is_null || pattern.is_null) { return BooleanVal::null(); } // If either the pattern or the third optional match parameter are not constant, we // have to recompile the RE for every row. if (!context->is_arg_constant(2) || !context->is_arg_constant(1)) { if (match_parameter.is_null) { return BooleanVal::null(); } RE2::Options opts; std::string error_str; if (!StringFunctions::set_re2_options(match_parameter, &error_str, &opts)) { context->set_error(error_str.c_str()); return BooleanVal(false); } std::string re_pattern(reinterpret_cast(pattern.ptr), pattern.len); re2::RE2 re(re_pattern, opts); if (re.ok()) { return RE2::PartialMatch( re2::StringPiece(reinterpret_cast(val.ptr), val.len), re); } else { context->set_error("Invalid regex: $0"); return BooleanVal(false); } } return constant_regex_fn_partial(context, val, pattern); } void LikePredicate::regex_close(FunctionContext* context, FunctionContext::FunctionStateScope scope) { if (scope == FunctionContext::THREAD_LOCAL) { LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); delete state; } } BooleanVal LikePredicate::regex_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { return regex_match(context, val, pattern, false); } BooleanVal LikePredicate::like_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { return regex_match(context, val, pattern, true); } BooleanVal LikePredicate::constant_substring_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); if (state->search_string_sv.len == 0) { return BooleanVal(true); } StringValue pattern_value = StringValue::from_string_val(val); return BooleanVal(state->substring_pattern.search(&pattern_value) != -1); } BooleanVal LikePredicate::constant_starts_with_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); if (val.len < state->search_string_sv.len) { return BooleanVal(false); } else { StringValue v = StringValue(reinterpret_cast(val.ptr), state->search_string_sv.len); return BooleanVal(state->search_string_sv.eq((v))); } } BooleanVal LikePredicate::constant_ends_with_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); if (val.len < state->search_string_sv.len) { return BooleanVal(false); } else { char* ptr = reinterpret_cast(val.ptr) + val.len - state->search_string_sv.len; int len = state->search_string_sv.len; StringValue v = StringValue(ptr, len); return BooleanVal(state->search_string_sv.eq(v)); } } BooleanVal LikePredicate::constant_equals_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); return BooleanVal(state->search_string_sv.eq(StringValue::from_string_val(val))); } BooleanVal LikePredicate::constant_regex_fn_partial(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); re2::StringPiece operand_sp(reinterpret_cast(val.ptr), val.len); return RE2::PartialMatch(operand_sp, *state->regex); } BooleanVal LikePredicate::constant_regex_fn(FunctionContext* context, const StringVal& val, const StringVal& pattern) { if (val.is_null) { return BooleanVal::null(); } LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); re2::StringPiece operand_sp(reinterpret_cast(val.ptr), val.len); return RE2::FullMatch(operand_sp, *state->regex); } BooleanVal LikePredicate::regex_match(FunctionContext* context, const StringVal& operand_value, const StringVal& pattern_value, bool is_like_pattern) { if (operand_value.is_null || pattern_value.is_null) { return BooleanVal::null(); } if (context->is_arg_constant(1)) { LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); if (is_like_pattern) { return RE2::FullMatch(re2::StringPiece(reinterpret_cast(operand_value.ptr), operand_value.len), *state->regex.get()); } else { return RE2::PartialMatch( re2::StringPiece(reinterpret_cast(operand_value.ptr), operand_value.len), *state->regex.get()); } } else { std::string re_pattern; RE2::Options opts; opts.set_never_nl(false); opts.set_dot_nl(true); if (is_like_pattern) { convert_like_pattern(context, pattern_value, &re_pattern); } else { re_pattern = std::string(reinterpret_cast(pattern_value.ptr), pattern_value.len); } re2::RE2 re(re_pattern, opts); if (re.ok()) { if (is_like_pattern) { return RE2::FullMatch( re2::StringPiece(reinterpret_cast(operand_value.ptr), operand_value.len), re); } else { return RE2::PartialMatch( re2::StringPiece(reinterpret_cast(operand_value.ptr), operand_value.len), re); } } else { context->set_error("Invalid regex: $0"); return BooleanVal(false); } } } void LikePredicate::convert_like_pattern(FunctionContext* context, const StringVal& pattern, std::string* re_pattern) { re_pattern->clear(); LikePredicateState* state = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); bool is_escaped = false; for (int i = 0; i < pattern.len; ++i) { if (!is_escaped && pattern.ptr[i] == '%') { re_pattern->append(".*"); } else if (!is_escaped && pattern.ptr[i] == '_') { re_pattern->append("."); // check for escape char before checking for regex special chars, they might overlap } else if (!is_escaped && pattern.ptr[i] == state->escape_char) { is_escaped = true; } else if (pattern.ptr[i] == '.' || pattern.ptr[i] == '[' || pattern.ptr[i] == ']' || pattern.ptr[i] == '{' || pattern.ptr[i] == '}' || pattern.ptr[i] == '(' || pattern.ptr[i] == ')' || pattern.ptr[i] == '\\' || pattern.ptr[i] == '*' || pattern.ptr[i] == '+' || pattern.ptr[i] == '?' || pattern.ptr[i] == '|' || pattern.ptr[i] == '^' || pattern.ptr[i] == '$') { // escape all regex special characters; see list at re_pattern->append("\\"); re_pattern->append(1, pattern.ptr[i]); is_escaped = false; } else { // regular character or escaped special character re_pattern->append(1, pattern.ptr[i]); is_escaped = false; } } } void LikePredicate::remove_escape_character(std::string* search_string) { std::string tmp_search_string; tmp_search_string.swap(*search_string); int len = tmp_search_string.length(); for (int i = 0; i < len;) { if (tmp_search_string[i] == '\\' && i + 1 < len && (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_')) { search_string->append(1, tmp_search_string[i + 1]); i += 2; } else { search_string->append(1, tmp_search_string[i]); i++; } } } } // namespace doris