Optimize regexp and like using hyperscan (#11116)

* Use Hyperscan instead of RE2 for the regexp and like functions
This commit is contained in:
Kang
2022-07-27 16:43:58 +08:00
committed by GitHub
parent d4fb27125a
commit 4ea2c04676
3 changed files with 125 additions and 63 deletions

View File

@ -144,6 +144,9 @@ set_target_properties(backtrace PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/l
add_library(re2 STATIC IMPORTED)
set_target_properties(re2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libre2.a)
add_library(hyperscan STATIC IMPORTED)
set_target_properties(hyperscan PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libhs.a)
add_library(odbc STATIC IMPORTED)
set_target_properties(odbc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libodbc.a)
@ -638,6 +641,7 @@ set(COMMON_THIRDPARTY
thriftnb
glog
re2
hyperscan
pprof
lz4
libevent

View File

@ -82,6 +82,61 @@ Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const Str
return Status::OK();
}
/// Scans `val` with the hyperscan database already stored in `state`.
///
/// @param state   search state holding the compiled `hs_database` and its `hs_scratch`
/// @param val     input string to scan
/// @param pattern unused here; the expression is the one already compiled into `state`
/// @param result  set to 1 when hyperscan reports a match, 0 otherwise
/// @return Status::OK() on success, or a RuntimeError carrying the hyperscan error code
Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringValue& val,
                                           const StringValue& pattern, unsigned char* result) {
    // The match callback only ever writes 1, so clear the slot first; otherwise a
    // non-matching row would keep whatever value the caller's buffer already held.
    *result = 0;
    const auto ret = hs_scan(state->hs_database.get(), val.ptr, val.len, 0,
                             state->hs_scratch.get(), LikeSearchState::hs_match_handler,
                             static_cast<void*>(result));
    // HS_SCAN_TERMINATED is expected: the callback returns non-zero to stop at the first match.
    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
    }
    return Status::OK();
}
/// Compiles `pattern` on every call and scans `val` with it.
///
/// @param state   search state; only its static match callback is used here
/// @param val     input string to scan
/// @param pattern regular expression text (not required to be NUL-terminated)
/// @param result  set to 1 when hyperscan reports a match, 0 otherwise; left untouched
///                when the pattern fails to compile
/// @return Status::OK() on success, the hs_prepare() error if compilation or scratch
///         allocation fails, or a RuntimeError carrying the hyperscan scan error code
Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& val,
                                   const StringValue& pattern, unsigned char* result) {
    // Copy into a std::string so hs_prepare receives a NUL-terminated expression.
    std::string re_pattern(pattern.ptr, pattern.len);

    hs_database_t* database = nullptr;
    hs_scratch_t* scratch = nullptr;
    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));

    // The match callback only ever writes 1, so clear the slot before scanning.
    *result = 0;
    const auto ret = hs_scan(database, val.ptr, val.len, 0, scratch,
                             LikeSearchState::hs_match_handler, static_cast<void*>(result));

    // Release the per-call resources before looking at the scan status so that the
    // error path below does not leak them.
    hs_free_scratch(scratch);
    hs_free_database(database);

    // HS_SCAN_TERMINATED is expected: the callback returns non-zero to stop at the first match.
    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
    }
    return Status::OK();
}
// hyperscan compile expression to database and allocate scratch space
//
// @param context    optional function context; when non-null the error text is also
//                   recorded on it via set_error()
// @param expression NUL-terminated regular expression, compiled with HS_FLAG_DOTALL
//                   in HS_MODE_BLOCK
// @param database   out: compiled database on success, nullptr on failure
// @param scratch    out: scratch space allocated for `*database`; set to nullptr when
//                   scratch allocation fails
// @return Status::OK() on success, otherwise a RuntimeError describing the failed step
//
// On success the caller owns both outputs and releases them with hs_free_database()
// and hs_free_scratch().
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
                                    hs_database_t** database, hs_scratch_t** scratch) {
    // Start from nullptr so the pointer is well-defined on every path.
    hs_compile_error_t* compile_err = nullptr;
    if (hs_compile(expression, HS_FLAG_DOTALL, HS_MODE_BLOCK, nullptr, database, &compile_err) !=
        HS_SUCCESS) {
        // Keep hyperscan's own diagnostic so the user can see why the pattern was rejected.
        const char* detail = (compile_err != nullptr && compile_err->message != nullptr)
                                     ? compile_err->message
                                     : "unknown";
        const std::string error_msg =
                fmt::format("hs_compile regex pattern error: {}", detail);
        hs_free_compile_error(compile_err);
        *database = nullptr;
        if (context) context->set_error(error_msg.c_str());
        return Status::RuntimeError(error_msg);
    }

    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
        // Do not hand back a database that has no usable scratch space.
        hs_free_database(*database);
        *database = nullptr;
        *scratch = nullptr;
        if (context) context->set_error("hs_alloc_scratch allocate scratch space error");
        return Status::RuntimeError("hs_alloc_scratch allocate scratch space error");
    }
    return Status::OK();
}
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
const ColumnNumbers& arguments, size_t result,
size_t /*input_rows_count*/) {
@ -183,28 +238,20 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values,
/// Evaluates LIKE by converting `pattern` to a regular expression with
/// convert_like_pattern(), compiling it with RE2 and requiring a full match of `val`.
///
/// @param result receives the full-match outcome; not written when the regex is invalid
/// @return Status::OK(), or a RuntimeError naming the pattern when RE2 rejects it
Status FunctionLike::like_fn(LikeSearchState* state, const StringValue& val,
                             const StringValue& pattern, unsigned char* result) {
    std::string regex_text;
    convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &regex_text);

    RE2::Options re_options;
    re_options.set_never_nl(false);
    re_options.set_dot_nl(true);

    re2::RE2 compiled(regex_text, re_options);
    if (!compiled.ok()) {
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
    }

    const re2::StringPiece input(val.ptr, val.len);
    *result = RE2::FullMatch(input, compiled);
    return Status::OK();
}
// Full-match evaluation of `val` against the RE2 object stored in `state->regex`.
// `pattern` is not read before the first return.
Status FunctionLike::constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result) {
*result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), *state->regex.get());
return Status::OK();
// NOTE(review): the statement below follows an unconditional return, and `re_pattern`
// is not declared in this function as shown. It looks like residue of an interleaved
// diff (it reads like the tail of like_fn) -- confirm which version is intended.
return regexp_fn(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result);
}
void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
std::string* re_pattern) {
re_pattern->clear();
// add ^ to pattern head to match line head
if (pattern.size() > 0 && pattern[0] != '%') {
re_pattern->append("^");
}
bool is_escaped = false;
for (size_t i = 0; i < pattern.size(); ++i) {
if (!is_escaped && pattern[i] == '%') {
@ -229,6 +276,11 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
is_escaped = false;
}
}
// add $ to pattern tail to match line tail
if (pattern.size() > 0 && pattern[pattern.size() - 1] != '%') {
re_pattern->append("$");
}
}
void FunctionLike::remove_escape_character(std::string* search_string) {
@ -279,14 +331,15 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function
} else {
std::string re_pattern;
convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
RE2::Options opts;
opts.set_never_nl(false);
opts.set_dot_nl(true);
state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
if (!state->search_state.regex->ok()) {
return Status::InternalError("Invalid regex expression: {}", pattern_str);
}
state->function = constant_regex_full_fn;
hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, &scratch));
state->search_state.hs_database.reset(database);
state->search_state.hs_scratch.reset(scratch);
state->function = constant_regex_fn;
}
}
return Status::OK();
@ -319,39 +372,17 @@ Status FunctionRegexp::prepare(FunctionContext* context,
state->search_state.set_search_string(search_string);
state->function = constant_substring_fn;
} else {
RE2::Options opts;
opts.set_never_nl(false);
opts.set_dot_nl(true);
state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
if (!state->search_state.regex->ok()) {
return Status::InternalError("Invalid regex expression: {}", pattern_str);
}
state->function = constant_regex_partial_fn;
hs_database_t* database = nullptr;
hs_scratch_t* scratch = nullptr;
RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), &database, &scratch));
state->search_state.hs_database.reset(database);
state->search_state.hs_scratch.reset(scratch);
state->function = constant_regex_fn;
}
}
return Status::OK();
}
/// Partial-match evaluation of `val` against the RE2 object stored in `state->regex`.
/// `pattern` is not read; `*result` receives the match outcome.
Status FunctionRegexp::constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
                                                 const StringValue& pattern,
                                                 unsigned char* result) {
    const re2::StringPiece input(val.ptr, val.len);
    const bool matched = RE2::PartialMatch(input, *state->regex);
    *result = matched;
    return Status::OK();
}
/// Compiles `pattern` with RE2 on every call and tests `val` for a partial match.
///
/// @param result receives the partial-match outcome; not written when the regex is invalid
/// @return Status::OK(), or a RuntimeError naming the pattern when RE2 rejects it
Status FunctionRegexp::regexp_fn(LikeSearchState* state, const StringValue& val,
                                 const StringValue& pattern, unsigned char* result) {
    RE2::Options re_options;
    re_options.set_never_nl(false);
    re_options.set_dot_nl(true);

    const std::string regex_text(pattern.ptr, pattern.len);
    re2::RE2 compiled(regex_text, re_options);
    if (!compiled.ok()) {
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
    }

    const re2::StringPiece input(val.ptr, val.len);
    *result = RE2::PartialMatch(input, compiled);
    return Status::OK();
}
} // namespace doris::vectorized

View File

@ -17,6 +17,8 @@
#pragma once
#include <hs/hs.h>
#include <functional>
#include <memory>
@ -56,6 +58,31 @@ struct LikeSearchState {
/// Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
std::unique_ptr<re2::RE2> regex;
// Deleter functor for std::unique_ptr that forwards the owned pointer to a free
// function fixed at compile time (e.g. hs_free_database / hs_free_scratch).
// The function is a template argument, so the functor itself carries no data members.
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter {
    template <typename Handle>
    void operator()(Handle* handle) const {
        deleter(handle);
    }
};
// hyperscan compiled pattern database and scratch space, reused for performance
std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>
hs_database;
std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>
hs_scratch;
// hyperscan match callback
// `ctx` is the pointer passed as the last argument of hs_scan; here it is the
// unsigned char result slot of the row being evaluated.
static int hs_match_handler(unsigned int /* id */,          // NOLINT
                            unsigned long long /* from */,  // NOLINT
                            unsigned long long /* to */,    // NOLINT
                            unsigned int /* flags */, void* ctx) {
    // mark the row as matched
    auto* matched = static_cast<unsigned char*>(ctx);
    *matched = 1;
    /// a non-zero return tells hyperscan to stop scanning after the first match
    return 1;
}
LikeSearchState() : escape_char('\\') {}
void set_search_string(const std::string& search_string_arg) {
@ -105,6 +132,16 @@ protected:
static Status constant_substring_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
static Status constant_regex_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
static Status regexp_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
// hyperscan compile expression to database and allocate scratch space
static Status hs_prepare(FunctionContext* context, const char* expression,
hs_database_t** database, hs_scratch_t** scratch);
};
class FunctionLike : public FunctionLikeBase {
@ -121,9 +158,6 @@ private:
static Status like_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
static Status constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
static void convert_like_pattern(LikeSearchState* state, const std::string& pattern,
std::string* re_pattern);
@ -139,13 +173,6 @@ public:
String get_name() const override { return name; }
Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
private:
static Status regexp_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
static Status constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
};
void register_function_like(SimpleFunctionFactory& factory) {