[fix](regex) fix wrong escape of function LIKE (#30557)
fix wrong escape of function LIKE
This commit is contained in:
@ -21,7 +21,6 @@
|
||||
#include <hs/hs_compile.h>
|
||||
#include <re2/stringpiece.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <ostream>
|
||||
#include <utility>
|
||||
@ -39,26 +38,25 @@
|
||||
|
||||
namespace doris::vectorized {
|
||||
// A regex to match any regex pattern is equivalent to a substring search.
|
||||
static const RE2 SUBSTRING_RE(
|
||||
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
|
||||
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
|
||||
|
||||
// A regex to match any regex pattern which is equivalent to matching a constant string
|
||||
// at the end of the string values.
|
||||
static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
|
||||
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
|
||||
|
||||
// A regex to match any regex pattern which is equivalent to matching a constant string
|
||||
// at the start of the string values.
|
||||
static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
|
||||
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
|
||||
|
||||
// A regex to match any regex pattern which is equivalent to a constant string match.
|
||||
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
|
||||
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
|
||||
// A regex to match .*
|
||||
static const RE2 ALLPASS_RE("(\\\\.\\*)+");
|
||||
static const RE2 ALLPASS_RE(R"((\\.\*)+)");
|
||||
|
||||
// Like patterns
|
||||
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
|
||||
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
|
||||
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
|
||||
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
|
||||
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
|
||||
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
|
||||
static const re2::RE2 LIKE_ALLPASS_RE("%+");
|
||||
|
||||
@ -200,7 +198,7 @@ Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const
|
||||
return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
|
||||
}
|
||||
} else { // fallback to re2
|
||||
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex.get());
|
||||
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS
|
||||
} else { // fallback to re2
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
const auto& str_ref = val.get_data_at(i);
|
||||
*(result.data() + i) = RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
|
||||
*state->regex.get());
|
||||
*(result.data() + i) =
|
||||
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -447,14 +445,25 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
|
||||
}
|
||||
|
||||
// add ^ to pattern head to match line head
|
||||
if (pattern.size() > 0 && pattern[0] != '%') {
|
||||
if (!pattern.empty() && pattern[0] != '%') {
|
||||
re_pattern->append("^");
|
||||
}
|
||||
|
||||
bool is_escaped = false;
|
||||
for (size_t i = 0; i < pattern.size(); ++i) {
|
||||
if (!is_escaped) {
|
||||
switch (pattern[i]) {
|
||||
// except for % and _, all chars should keep their literal meaning.
|
||||
for (char i : pattern) {
|
||||
if (is_escaped) { // last is \, this should be escape
|
||||
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' ||
|
||||
i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' ||
|
||||
i == '.' || i == '$' || i == '?') {
|
||||
re_pattern->append(1, '\\');
|
||||
} else if (i != '%' && i != '_') {
|
||||
re_pattern->append(2, '\\');
|
||||
}
|
||||
re_pattern->append(1, i);
|
||||
is_escaped = false;
|
||||
} else {
|
||||
switch (i) {
|
||||
case '%':
|
||||
re_pattern->append(".*");
|
||||
break;
|
||||
@ -462,28 +471,23 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
|
||||
re_pattern->append(".");
|
||||
break;
|
||||
default:
|
||||
is_escaped = pattern[i] == state->escape_char;
|
||||
is_escaped = i == state->escape_char;
|
||||
if (!is_escaped) {
|
||||
re_pattern->append(1, pattern[i]);
|
||||
// special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
|
||||
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' ||
|
||||
i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' ||
|
||||
i == ':' || i == '^' || i == '.' || i == '$' || i == '?') {
|
||||
re_pattern->append(1, '\\');
|
||||
}
|
||||
re_pattern->append(1, i);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' || pattern[i] == '{' ||
|
||||
pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' || pattern[i] == '\\' ||
|
||||
pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' || pattern[i] == '|' ||
|
||||
pattern[i] == '^' || pattern[i] == '$') {
|
||||
re_pattern->append("\\");
|
||||
} else if (pattern[i] != '%' && pattern[i] != '_') {
|
||||
re_pattern->append("\\\\");
|
||||
}
|
||||
re_pattern->append(1, pattern[i]);
|
||||
is_escaped = false;
|
||||
}
|
||||
}
|
||||
|
||||
// add $ to pattern tail to match line tail
|
||||
if (pattern.size() > 0 && re_pattern->back() != '*') {
|
||||
if (!pattern.empty() && re_pattern->back() != '*') {
|
||||
re_pattern->append("$");
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,6 +182,7 @@
|
||||
"advanced/using-hll",
|
||||
"advanced/variables",
|
||||
"advanced/time-zone",
|
||||
"advanced/sql-mode",
|
||||
"advanced/small-file-mgr",
|
||||
"advanced/cold-hot-separation",
|
||||
"advanced/compute-node",
|
||||
|
||||
@ -77,3 +77,21 @@ true false
|
||||
-- !like24 --
|
||||
false true
|
||||
|
||||
-- !escape1 --
|
||||
true
|
||||
|
||||
-- !escape2 --
|
||||
false
|
||||
|
||||
-- !escape3 --
|
||||
false
|
||||
|
||||
-- !escape4 --
|
||||
true
|
||||
|
||||
-- !escape5 --
|
||||
true
|
||||
|
||||
-- !escape6 --
|
||||
true
|
||||
|
||||
|
||||
@ -77,3 +77,21 @@ true false
|
||||
-- !like24 --
|
||||
false true
|
||||
|
||||
-- !escape1 --
|
||||
true
|
||||
|
||||
-- !escape2 --
|
||||
false
|
||||
|
||||
-- !escape3 --
|
||||
false
|
||||
|
||||
-- !escape4 --
|
||||
true
|
||||
|
||||
-- !escape5 --
|
||||
true
|
||||
|
||||
-- !escape6 --
|
||||
true
|
||||
|
||||
|
||||
@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") {
|
||||
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
|
||||
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
|
||||
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """
|
||||
|
||||
qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
|
||||
qt_escape2 """select '!z23]' like '_[z]%' """
|
||||
qt_escape3 """select '[123]' like '%[1.*]%' """
|
||||
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
|
||||
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
|
||||
qt_escape6 """select '1dd' like '%_\\d\\d%' """
|
||||
}
|
||||
|
||||
@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") {
|
||||
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
|
||||
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
|
||||
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """
|
||||
|
||||
qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
|
||||
qt_escape2 """select '!z23]' like '_[z]%' """
|
||||
qt_escape3 """select '[123]' like '%[1.*]%' """
|
||||
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
|
||||
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
|
||||
qt_escape6 """select '1dd' like '%_\\d\\d%' """
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user