[fix](regex) fix wrong escape of function LIKE (#30557)

fix wrong escape of function LIKE
This commit is contained in:
zclllyybb
2024-02-08 13:08:26 +08:00
committed by yiguolei
parent 927dd8a246
commit d60ecdba6f
6 changed files with 85 additions and 30 deletions

View File

@ -21,7 +21,6 @@
#include <hs/hs_compile.h>
#include <re2/stringpiece.h>
#include <algorithm>
#include <cstddef>
#include <ostream>
#include <utility>
@ -39,26 +38,25 @@
namespace doris::vectorized {
// A regex to match any regex pattern which is equivalent to a substring search.
static const RE2 SUBSTRING_RE(
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the end of the string values.
static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the start of the string values.
static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
// A regex to match any regex pattern which is equivalent to a constant string match.
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
// A regex to match .*
static const RE2 ALLPASS_RE("(\\\\.\\*)+");
static const RE2 ALLPASS_RE(R"((\\.\*)+)");
// Like patterns
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_ALLPASS_RE("%+");
@ -200,7 +198,7 @@ Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const
return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
}
} else { // fallback to re2
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex.get());
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
}
return Status::OK();
@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS
} else { // fallback to re2
for (size_t i = 0; i < sz; i++) {
const auto& str_ref = val.get_data_at(i);
*(result.data() + i) = RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
*state->regex.get());
*(result.data() + i) =
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
}
}
@ -447,14 +445,25 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
}
// add ^ to pattern head to match line head
if (pattern.size() > 0 && pattern[0] != '%') {
if (!pattern.empty() && pattern[0] != '%') {
re_pattern->append("^");
}
bool is_escaped = false;
for (size_t i = 0; i < pattern.size(); ++i) {
if (!is_escaped) {
switch (pattern[i]) {
// except for % and _, every char should keep its literal meaning.
for (char i : pattern) {
if (is_escaped) { // last is \, this should be escape
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' ||
i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' ||
i == '.' || i == '$' || i == '?') {
re_pattern->append(1, '\\');
} else if (i != '%' && i != '_') {
re_pattern->append(2, '\\');
}
re_pattern->append(1, i);
is_escaped = false;
} else {
switch (i) {
case '%':
re_pattern->append(".*");
break;
@ -462,28 +471,23 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
re_pattern->append(".");
break;
default:
is_escaped = pattern[i] == state->escape_char;
is_escaped = i == state->escape_char;
if (!is_escaped) {
re_pattern->append(1, pattern[i]);
// special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' ||
i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' ||
i == ':' || i == '^' || i == '.' || i == '$' || i == '?') {
re_pattern->append(1, '\\');
}
re_pattern->append(1, i);
}
break;
}
} else {
if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' || pattern[i] == '{' ||
pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' || pattern[i] == '\\' ||
pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' || pattern[i] == '|' ||
pattern[i] == '^' || pattern[i] == '$') {
re_pattern->append("\\");
} else if (pattern[i] != '%' && pattern[i] != '_') {
re_pattern->append("\\\\");
}
re_pattern->append(1, pattern[i]);
is_escaped = false;
}
}
// add $ to pattern tail to match line tail
if (pattern.size() > 0 && re_pattern->back() != '*') {
if (!pattern.empty() && re_pattern->back() != '*') {
re_pattern->append("$");
}
}

View File

@ -182,6 +182,7 @@
"advanced/using-hll",
"advanced/variables",
"advanced/time-zone",
"advanced/sql-mode",
"advanced/small-file-mgr",
"advanced/cold-hot-separation",
"advanced/compute-node",

View File

@ -77,3 +77,21 @@ true false
-- !like24 --
false true
-- !escape1 --
true
-- !escape2 --
false
-- !escape3 --
false
-- !escape4 --
true
-- !escape5 --
true
-- !escape6 --
true

View File

@ -77,3 +77,21 @@ true false
-- !like24 --
false true
-- !escape1 --
true
-- !escape2 --
false
-- !escape3 --
false
-- !escape4 --
true
-- !escape5 --
true
-- !escape6 --
true

View File

@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """
qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
qt_escape2 """select '!z23]' like '_[z]%' """
qt_escape3 """select '[123]' like '%[1.*]%' """
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
qt_escape6 """select '1dd' like '%_\\d\\d%' """
}

View File

@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """
qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
qt_escape2 """select '!z23]' like '_[z]%' """
qt_escape3 """select '[123]' like '%[1.*]%' """
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
qt_escape6 """select '1dd' like '%_\\d\\d%' """
}