Files
doris/be/src/exprs/like_predicate.cpp

439 lines
19 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/exprs/like-predicate.cc
// and modified by Doris
#include "exprs/like_predicate.h"
#include <string.h>
#include <sstream>
#include "exprs/string_functions.h"
namespace doris {
// Classifier regexes used by regex_prepare(): each one recognizes a user-supplied
// regex that is simple enough to be evaluated with plain string comparison. The
// single capture group is the literal text, which must not contain any regex
// metacharacter.
//
// A regex to match any regex pattern which is equivalent to a substring search
// (literal optionally surrounded by ".*").
static const RE2 SUBSTRING_RE(
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the end of the string values (literal followed by "$").
static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the start of the string values (literal preceded by "^").
static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
// A regex to match any regex pattern which is equivalent to a constant string match
// (literal between "^" and "$").
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
// Classifier regexes used by like_prepare() for SQL LIKE patterns. The first capture
// group is the literal part: one or more of an escaped "\%", an escaped "\_", or any
// character other than '%' and '_'. Runs of '%' mark where the wildcard sits.
// "%literal%": substring search.
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)");
// "%literal": ends-with comparison.
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)");
// "literal%": starts-with comparison.
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)");
// "literal": equality comparison.
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\%)|(\\\\_)|([^%_]))+)");
// Intentionally empty: this file performs no work in init().
void LikePredicate::init() {}
// Builds the thread-local state used to evaluate LIKE.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise a LikePredicateState
// is allocated, registered with `context`, and initialised to the generic like_fn
// matcher. The state is deleted by like_close() in this file.
//
// When argument 1 (the pattern) is a non-null constant, the pattern is classified
// once here:
//   * "%literal", "%literal%", "literal" and "literal%" are served by the plain
//     string matchers (ends-with, substring, equals, starts-with) after the escape
//     backslashes in front of '%' and '_' have been stripped from the literal;
//   * any other pattern is translated to a regular expression and compiled once.
// If that compilation fails, an error naming the offending regex is set on `context`.
void LikePredicate::like_prepare(FunctionContext* context,
                                 FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return;
    }
    LikePredicateState* state = new LikePredicateState();
    state->function = like_fn;
    context->set_function_state(scope, state);
    if (!context->is_arg_constant(1)) {
        // Non-constant pattern: like_fn handles the pattern row by row.
        return;
    }

    StringVal pattern_val = *reinterpret_cast<StringVal*>(context->get_constant_arg(1));
    if (pattern_val.is_null) {
        return;
    }
    StringValue pattern = StringValue::from_string_val(pattern_val);
    std::string pattern_str(pattern.ptr, pattern.len);
    std::string search_string;
    if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
        remove_escape_character(&search_string);
        state->set_search_string(search_string);
        state->function = constant_ends_with_fn;
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
        remove_escape_character(&search_string);
        state->set_search_string(search_string);
        state->function = constant_substring_fn;
    } else if (RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
        remove_escape_character(&search_string);
        state->set_search_string(search_string);
        state->function = constant_equals_fn;
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
        remove_escape_character(&search_string);
        state->set_search_string(search_string);
        state->function = constant_starts_with_fn;
    } else {
        // General case: fall back to a compiled regular expression.
        std::string re_pattern;
        convert_like_pattern(context, pattern_val, &re_pattern);
        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        state->regex.reset(new RE2(re_pattern, opts));
        if (!state->regex->ok()) {
            // Report the regex that failed to compile instead of a raw "$0" placeholder.
            const std::string error = "Invalid regex: " + re_pattern;
            context->set_error(error.c_str());
        }
    }
}
// LIKE entry point: looks up the thread-local LikePredicateState and forwards the
// call to the matcher stored in its `function` member.
BooleanVal LikePredicate::like(FunctionContext* context, const StringVal& val,
                               const StringVal& pattern) {
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    return like_state->function(context, val, pattern);
}
// Deletes the thread-local LikePredicateState. Other scopes are ignored.
void LikePredicate::like_close(FunctionContext* context,
                               FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return;
    }
    delete reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
}
// Builds the thread-local state used to evaluate a two-argument regex match.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise a LikePredicateState
// is allocated, registered with `context`, and initialised to the generic regex_fn
// matcher. The state is deleted by regex_close() in this file.
//
// When argument 1 (the pattern) is a non-null constant, the pattern is either mapped
// to one of the plain string matchers or compiled once into state->regex. If that
// compilation fails, an error naming the pattern is set on `context`.
void LikePredicate::regex_prepare(FunctionContext* context,
                                  FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return;
    }
    LikePredicateState* state = new LikePredicateState();
    context->set_function_state(scope, state);
    state->function = regex_fn;
    if (!context->is_arg_constant(1)) {
        // Non-constant pattern: regex_fn compiles the pattern row by row.
        return;
    }

    StringVal* pattern = reinterpret_cast<StringVal*>(context->get_constant_arg(1));
    if (pattern->is_null) {
        return;
    }
    std::string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len);
    std::string search_string;
    // The following four conditionals check if the pattern is a constant string,
    // starts with a constant string and is followed by any number of wildcard characters,
    // ends with a constant string and is preceded by any number of wildcard characters or
    // has a constant substring surrounded on both sides by any number of wildcard
    // characters. In any of these conditions, we can search for the pattern more
    // efficiently by using our own string match functions rather than regex matching.
    if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
        state->set_search_string(search_string);
        state->function = constant_equals_fn;
    } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
        state->set_search_string(search_string);
        state->function = constant_starts_with_fn;
    } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
        state->set_search_string(search_string);
        state->function = constant_ends_with_fn;
    } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
        state->set_search_string(search_string);
        state->function = constant_substring_fn;
    } else {
        RE2::Options opts;
        opts.set_never_nl(false);
        opts.set_dot_nl(true);
        state->regex.reset(new RE2(pattern_str, opts));
        if (!state->regex->ok()) {
            // Use the length-bounded copy: pattern->ptr carries an explicit length and
            // must not be streamed as a NUL-terminated C string.
            const std::string error = "Invalid regex expression: " + pattern_str;
            context->set_error(error.c_str());
        }
        state->function = constant_regex_fn_partial;
    }
}
// Two-argument regex entry point: looks up the thread-local LikePredicateState and
// forwards the call to the matcher stored in its `function` member.
BooleanVal LikePredicate::regex(FunctionContext* context, const StringVal& val,
                                const StringVal& pattern) {
    auto* regex_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    return regex_state->function(context, val, pattern);
}
// This prepare function is used only when 3 parameters are passed to the regexp_like()
// function. For the 2 parameter version, regex_prepare() is used to prepare.
//
// Nothing is done for scopes other than THREAD_LOCAL. Otherwise a LikePredicateState
// is allocated and registered with `context`. If both the pattern (argument 1) and the
// match parameter (argument 2) are constant, the regular expression is compiled once
// here; otherwise it is compiled per row in regexp_like().
//
// Errors set on `context`: a null constant match parameter, a match parameter rejected
// by StringFunctions::set_re2_options(), or a pattern that fails to compile. In the
// first two cases state->regex is left unset.
void LikePredicate::regexp_like_prepare(FunctionContext* context,
                                        FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return;
    }
    LikePredicateState* state = new LikePredicateState();
    context->set_function_state(scope, state);
    if (!context->is_arg_constant(1) || !context->is_arg_constant(2)) {
        return;
    }

    StringVal* pattern = reinterpret_cast<StringVal*>(context->get_constant_arg(1));
    if (pattern->is_null) {
        return;
    }
    StringVal* match_parameter = reinterpret_cast<StringVal*>(context->get_constant_arg(2));
    if (match_parameter->is_null) {
        context->set_error("match parameter is null");
        return;
    }
    RE2::Options opts;
    opts.set_never_nl(false);
    opts.set_dot_nl(true);
    std::string error_str;
    if (!StringFunctions::set_re2_options(*match_parameter, &error_str, &opts)) {
        context->set_error(error_str.c_str());
        return;
    }
    std::string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len);
    state->regex.reset(new RE2(pattern_str, opts));
    if (!state->regex->ok()) {
        // Use the length-bounded copy: pattern->ptr carries an explicit length and
        // must not be streamed as a NUL-terminated C string.
        const std::string error = "Invalid regex expression: " + pattern_str;
        context->set_error(error.c_str());
    }
}
// This is used only for the 3 parameter version of regexp_like(). The 2 parameter
// version calls regex() directly.
//
// Returns NULL when `val` or `pattern` is NULL. If either the pattern or the match
// parameter is not constant, the regular expression is compiled for this row and
// `val` is tested with a partial (unanchored) match; a NULL match parameter yields
// NULL, and an invalid match parameter or pattern sets an error on `context` and
// yields false. If both are constant, the regex compiled by regexp_like_prepare()
// is used.
BooleanVal LikePredicate::regexp_like(FunctionContext* context, const StringVal& val,
                                      const StringVal& pattern, const StringVal& match_parameter) {
    if (val.is_null || pattern.is_null) {
        return BooleanVal::null();
    }
    // If either the pattern or the third optional match parameter are not constant, we
    // have to recompile the RE for every row.
    if (!context->is_arg_constant(2) || !context->is_arg_constant(1)) {
        if (match_parameter.is_null) {
            return BooleanVal::null();
        }
        // NOTE(review): unlike regexp_like_prepare(), this path does not call
        // set_never_nl(false)/set_dot_nl(true) before applying the match parameter,
        // so the default '.'-vs-newline behavior may differ between the constant and
        // per-row paths -- confirm which one is intended.
        RE2::Options opts;
        std::string error_str;
        if (!StringFunctions::set_re2_options(match_parameter, &error_str, &opts)) {
            context->set_error(error_str.c_str());
            return BooleanVal(false);
        }
        std::string re_pattern(reinterpret_cast<const char*>(pattern.ptr), pattern.len);
        re2::RE2 re(re_pattern, opts);
        if (!re.ok()) {
            // Report the regex that failed to compile instead of a raw "$0" placeholder.
            const std::string error = "Invalid regex: " + re_pattern;
            context->set_error(error.c_str());
            return BooleanVal(false);
        }
        return RE2::PartialMatch(
                re2::StringPiece(reinterpret_cast<const char*>(val.ptr), val.len), re);
    }

    LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (state->regex.get() == nullptr) {
        // regexp_like_prepare() returned before compiling the regex (null or invalid
        // constant match parameter) and has already reported the error; mirror the
        // per-row results instead of dereferencing a missing regex.
        return match_parameter.is_null ? BooleanVal::null() : BooleanVal(false);
    }
    return constant_regex_fn_partial(context, val, pattern);
}
// Deletes the thread-local LikePredicateState. Other scopes are ignored.
void LikePredicate::regex_close(FunctionContext* context,
                                FunctionContext::FunctionStateScope scope) {
    if (scope != FunctionContext::THREAD_LOCAL) {
        return;
    }
    delete reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
}
// Generic regex matcher: delegates to regex_match() with the pattern treated as a
// regular expression rather than a LIKE pattern.
BooleanVal LikePredicate::regex_fn(FunctionContext* context, const StringVal& val,
                                   const StringVal& pattern) {
    const bool is_like_pattern = false;
    return regex_match(context, val, pattern, is_like_pattern);
}
// Generic LIKE matcher: delegates to regex_match() with the pattern treated as a
// LIKE pattern.
BooleanVal LikePredicate::like_fn(FunctionContext* context, const StringVal& val,
                                  const StringVal& pattern) {
    const bool is_like_pattern = true;
    return regex_match(context, val, pattern, is_like_pattern);
}
// Fast path: true when `val` contains the search string stored in the thread-local
// state. Returns NULL for a NULL `val`; an empty search string matches every value.
// The `pattern` argument is not read.
BooleanVal LikePredicate::constant_substring_fn(FunctionContext* context, const StringVal& val,
                                                const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (like_state->search_string_sv.len == 0) {
        return BooleanVal(true);
    }
    StringValue haystack = StringValue::from_string_val(val);
    // search() reports -1 when the stored search string is not found.
    const bool found = like_state->substring_pattern.search(&haystack) != -1;
    return BooleanVal(found);
}
// Fast path: true when `val` begins with the search string stored in the thread-local
// state. Returns NULL for a NULL `val`. The `pattern` argument is not read.
BooleanVal LikePredicate::constant_starts_with_fn(FunctionContext* context, const StringVal& val,
                                                  const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    // A value shorter than the search string cannot start with it.
    if (val.len < like_state->search_string_sv.len) {
        return BooleanVal(false);
    }
    // Compare the search string against the leading bytes of the value.
    StringValue prefix(reinterpret_cast<char*>(val.ptr), like_state->search_string_sv.len);
    return BooleanVal(like_state->search_string_sv.eq(prefix));
}
// Fast path: true when `val` ends with the search string stored in the thread-local
// state. Returns NULL for a NULL `val`. The `pattern` argument is not read.
BooleanVal LikePredicate::constant_ends_with_fn(FunctionContext* context, const StringVal& val,
                                                const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    // A value shorter than the search string cannot end with it.
    if (val.len < like_state->search_string_sv.len) {
        return BooleanVal(false);
    }
    // Compare the search string against the trailing bytes of the value.
    char* suffix_begin =
            reinterpret_cast<char*>(val.ptr) + val.len - like_state->search_string_sv.len;
    int suffix_len = like_state->search_string_sv.len;
    StringValue suffix(suffix_begin, suffix_len);
    return BooleanVal(like_state->search_string_sv.eq(suffix));
}
// Fast path: true when `val` equals the search string stored in the thread-local
// state. Returns NULL for a NULL `val`. The `pattern` argument is not read.
BooleanVal LikePredicate::constant_equals_fn(FunctionContext* context, const StringVal& val,
                                             const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    const bool same = like_state->search_string_sv.eq(StringValue::from_string_val(val));
    return BooleanVal(same);
}
// Matches `val` against the precompiled regex in the thread-local state using
// RE2::PartialMatch. Returns NULL for a NULL `val`. The `pattern` argument is not read.
BooleanVal LikePredicate::constant_regex_fn_partial(FunctionContext* context, const StringVal& val,
                                                    const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    const char* text = reinterpret_cast<const char*>(val.ptr);
    re2::StringPiece subject(text, val.len);
    return RE2::PartialMatch(subject, *like_state->regex);
}
// Matches `val` against the precompiled regex in the thread-local state using
// RE2::FullMatch. Returns NULL for a NULL `val`. The `pattern` argument is not read.
BooleanVal LikePredicate::constant_regex_fn(FunctionContext* context, const StringVal& val,
                                            const StringVal& pattern) {
    if (val.is_null) {
        return BooleanVal::null();
    }
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    const char* text = reinterpret_cast<const char*>(val.ptr);
    re2::StringPiece subject(text, val.len);
    return RE2::FullMatch(subject, *like_state->regex);
}
// Generic matcher shared by like_fn() and regex_fn().
//
// Returns NULL when either argument is NULL. LIKE patterns must match the whole
// operand (RE2::FullMatch); regex patterns may match anywhere in it
// (RE2::PartialMatch).
//
// When argument 1 is constant, the regex already stored in the thread-local state is
// used. Otherwise the pattern is compiled for this row (after LIKE-to-regex
// conversion when `is_like_pattern` is set); if it does not compile, an error naming
// the regex is set on `context` and false is returned.
BooleanVal LikePredicate::regex_match(FunctionContext* context, const StringVal& operand_value,
                                      const StringVal& pattern_value, bool is_like_pattern) {
    if (operand_value.is_null || pattern_value.is_null) {
        return BooleanVal::null();
    }
    const re2::StringPiece operand(reinterpret_cast<const char*>(operand_value.ptr),
                                   operand_value.len);

    if (context->is_arg_constant(1)) {
        LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
                context->get_function_state(FunctionContext::THREAD_LOCAL));
        const bool matched = is_like_pattern ? RE2::FullMatch(operand, *state->regex)
                                             : RE2::PartialMatch(operand, *state->regex);
        return BooleanVal(matched);
    }

    // Non-constant pattern: build and compile the regex for this row.
    std::string re_pattern;
    if (is_like_pattern) {
        convert_like_pattern(context, pattern_value, &re_pattern);
    } else {
        re_pattern = std::string(reinterpret_cast<const char*>(pattern_value.ptr),
                                 pattern_value.len);
    }
    RE2::Options opts;
    opts.set_never_nl(false);
    opts.set_dot_nl(true);
    re2::RE2 re(re_pattern, opts);
    if (!re.ok()) {
        // Report the regex that failed to compile instead of a raw "$0" placeholder.
        const std::string error = "Invalid regex: " + re_pattern;
        context->set_error(error.c_str());
        return BooleanVal(false);
    }
    const bool matched =
            is_like_pattern ? RE2::FullMatch(operand, re) : RE2::PartialMatch(operand, re);
    return BooleanVal(matched);
}
// Translates the LIKE pattern in `pattern` into a regular expression written to
// `*re_pattern` (which is cleared first).
//
// Unescaped '%' becomes ".*" and unescaped '_' becomes ".". The escape character is
// read from the thread-local LikePredicateState; an unescaped occurrence is dropped
// and makes the next character literal. Every other character is copied through,
// prefixed with a backslash when it is a regex metacharacter.
void LikePredicate::convert_like_pattern(FunctionContext* context, const StringVal& pattern,
                                         std::string* re_pattern) {
    re_pattern->clear();
    auto* like_state = reinterpret_cast<LikePredicateState*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    bool escaped = false;
    for (int pos = 0; pos < pattern.len; ++pos) {
        const auto ch = pattern.ptr[pos];
        if (!escaped) {
            if (ch == '%') {
                re_pattern->append(".*");
                continue;
            }
            if (ch == '_') {
                re_pattern->append(".");
                continue;
            }
            // The escape character is tested before the regex metacharacters because
            // the two sets may overlap.
            if (ch == like_state->escape_char) {
                escaped = true;
                continue;
            }
        }
        // Literal character (possibly an escaped wildcard): regex metacharacters need a
        // leading backslash so that they match themselves.
        switch (ch) {
        case '.':
        case '[':
        case ']':
        case '{':
        case '}':
        case '(':
        case ')':
        case '\\':
        case '*':
        case '+':
        case '?':
        case '|':
        case '^':
        case '$':
            re_pattern->append("\\");
            break;
        default:
            break;
        }
        re_pattern->append(1, ch);
        escaped = false;
    }
}
// Rewrites `*search_string` in place, replacing every "\%" with "%" and every "\_"
// with "_". A backslash followed by any other character, or one at the very end of
// the string, is kept unchanged.
void LikePredicate::remove_escape_character(std::string* search_string) {
    // Move the input aside so that the output can be rebuilt in *search_string.
    std::string escaped;
    escaped.swap(*search_string);
    int total = escaped.length();
    int pos = 0;
    while (pos < total) {
        const bool is_escaped_wildcard = escaped[pos] == '\\' && pos + 1 < total &&
                                         (escaped[pos + 1] == '%' || escaped[pos + 1] == '_');
        if (is_escaped_wildcard) {
            // Skip the backslash; the wildcard character after it is copied below.
            ++pos;
        }
        search_string->append(1, escaped[pos]);
        ++pos;
    }
}
} // namespace doris