Optimize regexp and like using hyperscan (#11116)
* Use Hyperscan instead of RE2 for the regexp and like functions
This commit is contained in:
@ -144,6 +144,9 @@ set_target_properties(backtrace PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/l
|
||||
add_library(re2 STATIC IMPORTED)
|
||||
set_target_properties(re2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libre2.a)
|
||||
|
||||
add_library(hyperscan STATIC IMPORTED)
|
||||
set_target_properties(hyperscan PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libhs.a)
|
||||
|
||||
add_library(odbc STATIC IMPORTED)
|
||||
set_target_properties(odbc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libodbc.a)
|
||||
|
||||
@ -638,6 +641,7 @@ set(COMMON_THIRDPARTY
|
||||
thriftnb
|
||||
glog
|
||||
re2
|
||||
hyperscan
|
||||
pprof
|
||||
lz4
|
||||
libevent
|
||||
|
||||
@ -82,6 +82,61 @@ Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const Str
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Scans `val` with the hyperscan database and scratch space stored in `state`.
// `pattern` is not read here; the scan uses only the objects held by `state`.
// The match callback receives `result` as its context pointer. Scan codes other than
// HS_SUCCESS and HS_SCAN_TERMINATED are reported as a RuntimeError.
Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringValue& val,
                                           const StringValue& pattern, unsigned char* result) {
    const auto scan_rc = hs_scan(state->hs_database.get(), val.ptr, val.len, 0,
                                 state->hs_scratch.get(), state->hs_match_handler,
                                 static_cast<void*>(result));

    const bool scan_ok = (scan_rc == HS_SUCCESS) || (scan_rc == HS_SCAN_TERMINATED);
    if (!scan_ok) {
        return Status::RuntimeError(fmt::format("hyperscan error: {}", scan_rc));
    }

    return Status::OK();
}
|
||||
|
||||
// Matches `val` against `pattern` when no pre-compiled database is used: the pattern is
// compiled, `val` is scanned once, and the hyperscan database and scratch space are released
// again before returning.
// This function does not clear `*result`; it is only handed to the match callback as its
// context pointer.
// Returns the error from hs_prepare() if compilation or scratch allocation fails, and a
// RuntimeError if the scan reports a code other than HS_SUCCESS / HS_SCAN_TERMINATED.
Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& val,
                                   const StringValue& pattern, unsigned char* result) {
    // hs_prepare() takes a NUL-terminated expression, so copy the (ptr, len) pair first.
    std::string re_pattern(pattern.ptr, pattern.len);

    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));

    const auto ret = hs_scan(database, val.ptr, val.len, 0, scratch, state->hs_match_handler,
                             static_cast<void*>(result));

    // Release the per-call resources before looking at `ret`, so the error return below
    // does not leak them.
    hs_free_scratch(scratch);
    hs_free_database(database);

    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
    }

    return Status::OK();
}
|
||||
|
||||
// hyperscan compile expression to database and allocate scratch space
|
||||
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
|
||||
hs_database_t** database, hs_scratch_t** scratch) {
|
||||
hs_compile_error_t* compile_err;
|
||||
if (hs_compile(expression, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, database, &compile_err) !=
|
||||
HS_SUCCESS) {
|
||||
hs_free_compile_error(compile_err);
|
||||
*database = nullptr;
|
||||
if (context) context->set_error("hs_compile regex pattern error");
|
||||
return Status::RuntimeError("hs_compile regex pattern error");
|
||||
}
|
||||
hs_free_compile_error(compile_err);
|
||||
|
||||
if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
|
||||
hs_free_database(*database);
|
||||
*database = nullptr;
|
||||
*scratch = nullptr;
|
||||
if (context) context->set_error("hs_alloc_scratch allocate scratch space error");
|
||||
return Status::RuntimeError("hs_alloc_scratch allocate scratch space error");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
|
||||
const ColumnNumbers& arguments, size_t result,
|
||||
size_t /*input_rows_count*/) {
|
||||
@ -183,28 +238,20 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values,
|
||||
Status FunctionLike::like_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result) {
|
||||
std::string re_pattern;
|
||||
RE2::Options opts;
|
||||
opts.set_never_nl(false);
|
||||
opts.set_dot_nl(true);
|
||||
convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern);
|
||||
re2::RE2 re(re_pattern, opts);
|
||||
if (re.ok()) {
|
||||
*result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), re);
|
||||
return Status::OK();
|
||||
} else {
|
||||
return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
|
||||
}
|
||||
}
|
||||
|
||||
Status FunctionLike::constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result) {
|
||||
*result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), *state->regex.get());
|
||||
return Status::OK();
|
||||
return regexp_fn(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result);
|
||||
}
|
||||
|
||||
void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
|
||||
std::string* re_pattern) {
|
||||
re_pattern->clear();
|
||||
|
||||
// add ^ to pattern head to match line head
|
||||
if (pattern.size() > 0 && pattern[0] != '%') {
|
||||
re_pattern->append("^");
|
||||
}
|
||||
|
||||
bool is_escaped = false;
|
||||
for (size_t i = 0; i < pattern.size(); ++i) {
|
||||
if (!is_escaped && pattern[i] == '%') {
|
||||
@ -229,6 +276,11 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
|
||||
is_escaped = false;
|
||||
}
|
||||
}
|
||||
|
||||
// add $ to pattern tail to match line tail
|
||||
if (pattern.size() > 0 && pattern[pattern.size() - 1] != '%') {
|
||||
re_pattern->append("$");
|
||||
}
|
||||
}
|
||||
|
||||
void FunctionLike::remove_escape_character(std::string* search_string) {
|
||||
@ -279,14 +331,15 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function
|
||||
} else {
|
||||
std::string re_pattern;
|
||||
convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
|
||||
RE2::Options opts;
|
||||
opts.set_never_nl(false);
|
||||
opts.set_dot_nl(true);
|
||||
state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
|
||||
if (!state->search_state.regex->ok()) {
|
||||
return Status::InternalError("Invalid regex expression: {}", pattern_str);
|
||||
}
|
||||
state->function = constant_regex_full_fn;
|
||||
|
||||
hs_database_t* database = nullptr;
|
||||
hs_scratch_t* scratch = nullptr;
|
||||
RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, &scratch));
|
||||
|
||||
state->search_state.hs_database.reset(database);
|
||||
state->search_state.hs_scratch.reset(scratch);
|
||||
|
||||
state->function = constant_regex_fn;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
@ -319,39 +372,17 @@ Status FunctionRegexp::prepare(FunctionContext* context,
|
||||
state->search_state.set_search_string(search_string);
|
||||
state->function = constant_substring_fn;
|
||||
} else {
|
||||
RE2::Options opts;
|
||||
opts.set_never_nl(false);
|
||||
opts.set_dot_nl(true);
|
||||
state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
|
||||
if (!state->search_state.regex->ok()) {
|
||||
return Status::InternalError("Invalid regex expression: {}", pattern_str);
|
||||
}
|
||||
state->function = constant_regex_partial_fn;
|
||||
hs_database_t* database = nullptr;
|
||||
hs_scratch_t* scratch = nullptr;
|
||||
RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), &database, &scratch));
|
||||
|
||||
state->search_state.hs_database.reset(database);
|
||||
state->search_state.hs_scratch.reset(scratch);
|
||||
|
||||
state->function = constant_regex_fn;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Stores in `*result` whether the RE2 regex held in `state->regex` partially matches `val`.
// `pattern` is not read here.
Status FunctionRegexp::constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
                                                 const StringValue& pattern,
                                                 unsigned char* result) {
    const re2::StringPiece haystack(val.ptr, val.len);
    *result = RE2::PartialMatch(haystack, *state->regex);
    return Status::OK();
}
|
||||
|
||||
// Compiles `pattern` with RE2 on every call (never_nl off, dot_nl on) and stores in `*result`
// whether it partially matches `val`. Returns a RuntimeError if the pattern does not compile.
Status FunctionRegexp::regexp_fn(LikeSearchState* state, const StringValue& val,
                                 const StringValue& pattern, unsigned char* result) {
    RE2::Options re_options;
    re_options.set_never_nl(false);
    re_options.set_dot_nl(true);

    const std::string pattern_text(pattern.ptr, pattern.len);
    re2::RE2 compiled(pattern_text, re_options);

    // Reject invalid patterns up front; the match below needs a usable regex.
    if (!compiled.ok()) {
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
    }

    *result = RE2::PartialMatch(re2::StringPiece(val.ptr, val.len), compiled);
    return Status::OK();
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -17,6 +17,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hs/hs.h>
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
@ -56,6 +58,31 @@ struct LikeSearchState {
|
||||
/// Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
|
||||
std::unique_ptr<re2::RE2> regex;
|
||||
|
||||
// Deleter functor for std::unique_ptr: forwards the owned pointer to the function supplied
// as the `deleter` non-type template argument.
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter {
    template <typename Handle>
    void operator()(Handle* handle) const {
        deleter(handle);
    }
};
|
||||
|
||||
// hyperscan compiled pattern database and scratch space, reused for performance
|
||||
std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>
|
||||
hs_database;
|
||||
std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>
|
||||
hs_scratch;
|
||||
|
||||
// hyperscan match callback, passed to hs_scan() as its match event handler.
// Parameters follow hyperscan's handler signature (expression id, match start offset,
// match end offset, flags, user context); only `ctx` is used.
// `ctx` must point to a writable unsigned char; the hs_scan() call sites pass the
// per-row result byte.
static int hs_match_handler(unsigned int /* id */,          // NOLINT
                            unsigned long long /* from */,  // NOLINT
                            unsigned long long /* to */,    // NOLINT
                            unsigned int /* flags */, void* ctx) {
    // set result to 1 for matched row
    *static_cast<unsigned char*>(ctx) = 1;
    // return non-zero so that hyperscan stops scanning after the first match
    return 1;
}
|
||||
|
||||
LikeSearchState() : escape_char('\\') {}
|
||||
|
||||
void set_search_string(const std::string& search_string_arg) {
|
||||
@ -105,6 +132,16 @@ protected:
|
||||
|
||||
static Status constant_substring_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
static Status constant_regex_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
static Status regexp_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
// hyperscan compile expression to database and allocate scratch space
|
||||
static Status hs_prepare(FunctionContext* context, const char* expression,
|
||||
hs_database_t** database, hs_scratch_t** scratch);
|
||||
};
|
||||
|
||||
class FunctionLike : public FunctionLikeBase {
|
||||
@ -121,9 +158,6 @@ private:
|
||||
static Status like_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
static Status constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
static void convert_like_pattern(LikeSearchState* state, const std::string& pattern,
|
||||
std::string* re_pattern);
|
||||
|
||||
@ -139,13 +173,6 @@ public:
|
||||
String get_name() const override { return name; }
|
||||
|
||||
Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
|
||||
|
||||
private:
|
||||
static Status regexp_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
|
||||
static Status constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
|
||||
const StringValue& pattern, unsigned char* result);
|
||||
};
|
||||
|
||||
void register_function_like(SimpleFunctionFactory& factory) {
|
||||
|
||||
Reference in New Issue
Block a user