From bcc32b5b265841caed525300fd72e9732ea1f3b9 Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Wed, 20 Dec 2023 14:30:35 +0800 Subject: [PATCH] [feature](invert index) match_regexp feature added (#28257) --- be/src/exec/olap_common.h | 2 + be/src/exec/olap_utils.h | 13 ++- be/src/olap/match_predicate.cpp | 3 + .../query/conjunction_query.cpp | 6 +- .../query/disjunction_query.cpp | 17 ++- .../inverted_index/query/disjunction_query.h | 1 - .../inverted_index/query/regexp_query.cpp | 98 ++++++++++++++++ .../inverted_index/query/regexp_query.h | 46 ++++++++ .../segment_v2/inverted_index_query_type.h | 4 + .../segment_v2/inverted_index_reader.cpp | 108 ++++++++++++++---- .../rowset/segment_v2/inverted_index_reader.h | 16 ++- be/src/vec/functions/function_tokenize.cpp | 8 +- be/src/vec/functions/match.cpp | 38 +++--- be/src/vec/functions/match.h | 17 +++ .../org/apache/doris/nereids/DorisLexer.g4 | 1 + .../org/apache/doris/nereids/DorisParser.g4 | 2 +- fe/fe-core/src/main/cup/sql_parser.cup | 5 +- .../apache/doris/analysis/MatchPredicate.java | 11 ++ .../nereids/parser/LogicalPlanBuilder.java | 7 ++ .../nereids/trees/expressions/Match.java | 2 + .../trees/expressions/MatchRegexp.java | 49 ++++++++ .../visitor/ExpressionVisitor.java | 5 + fe/fe-core/src/main/jflex/sql_scanner.flex | 1 + gensrc/thrift/Opcodes.thrift | 1 + .../test_index_match_regexp.out | 16 +++ .../test_index_match_regexp.groovy | 89 +++++++++++++++ 26 files changed, 499 insertions(+), 67 deletions(-) create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java create mode 100644 regression-test/data/inverted_index_p0/test_index_match_regexp.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy diff --git 
a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index acf81a48eb..cdca939c6e 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -306,6 +306,8 @@ public: condition.__set_condition_op("match_phrase"); } else if (value.first == MatchType::MATCH_PHRASE_PREFIX) { condition.__set_condition_op("match_phrase_prefix"); + } else if (value.first == MatchType::MATCH_REGEXP) { + condition.__set_condition_op("match_regexp"); } else if (value.first == MatchType::MATCH_ELEMENT_EQ) { condition.__set_condition_op("match_element_eq"); } else if (value.first == MatchType::MATCH_ELEMENT_LT) { diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index 5efcc01236..106ded98c7 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -170,6 +170,7 @@ enum class MatchType { MATCH_ELEMENT_LE = 6, MATCH_ELEMENT_GE = 7, MATCH_PHRASE_PREFIX = 8, + MATCH_REGEXP = 9, }; inline MatchType to_match_type(TExprOpcode::type type) { @@ -186,6 +187,9 @@ inline MatchType to_match_type(TExprOpcode::type type) { case TExprOpcode::type::MATCH_PHRASE_PREFIX: return MatchType::MATCH_PHRASE_PREFIX; break; + case TExprOpcode::type::MATCH_REGEXP: + return MatchType::MATCH_REGEXP; + break; case TExprOpcode::type::MATCH_ELEMENT_EQ: return MatchType::MATCH_ELEMENT_EQ; break; @@ -217,6 +221,8 @@ inline MatchType to_match_type(const std::string& condition_op) { return MatchType::MATCH_PHRASE; } else if (condition_op.compare("match_phrase_prefix") == 0) { return MatchType::MATCH_PHRASE_PREFIX; + } else if (condition_op.compare("match_regexp") == 0) { + return MatchType::MATCH_REGEXP; } else if (condition_op.compare("match_element_eq") == 0) { return MatchType::MATCH_ELEMENT_EQ; } else if (condition_op.compare("match_element_lt") == 0) { @@ -235,6 +241,7 @@ inline bool is_match_condition(const std::string& op) { if (0 == strcasecmp(op.c_str(), "match_any") || 0 == strcasecmp(op.c_str(), "match_all") || 0 == strcasecmp(op.c_str(), "match_phrase") || 0 == 
strcasecmp(op.c_str(), "match_phrase_prefix") || + 0 == strcasecmp(op.c_str(), "match_regexp") || 0 == strcasecmp(op.c_str(), "match_element_eq") || 0 == strcasecmp(op.c_str(), "match_element_lt") || 0 == strcasecmp(op.c_str(), "match_element_gt") || @@ -248,9 +255,9 @@ inline bool is_match_condition(const std::string& op) { inline bool is_match_operator(const TExprOpcode::type& op_type) { return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == op_type || TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_PHRASE_PREFIX == op_type || - TExprOpcode::MATCH_ELEMENT_EQ == op_type || TExprOpcode::MATCH_ELEMENT_LT == op_type || - TExprOpcode::MATCH_ELEMENT_GT == op_type || TExprOpcode::MATCH_ELEMENT_LE == op_type || - TExprOpcode::MATCH_ELEMENT_GE == op_type; + TExprOpcode::MATCH_REGEXP == op_type || TExprOpcode::MATCH_ELEMENT_EQ == op_type || + TExprOpcode::MATCH_ELEMENT_LT == op_type || TExprOpcode::MATCH_ELEMENT_GT == op_type || + TExprOpcode::MATCH_ELEMENT_LE == op_type || TExprOpcode::MATCH_ELEMENT_GE == op_type; } } // namespace doris diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index aa4d993a62..36f167d0d0 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -110,6 +110,9 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m case MatchType::MATCH_PHRASE_PREFIX: ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY; break; + case MatchType::MATCH_REGEXP: + ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY; + break; case MatchType::MATCH_ELEMENT_EQ: ret = InvertedIndexQueryType::EQUAL_QUERY; break; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index b77edc79ad..b2448a8fa8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ 
-38,12 +38,12 @@ ConjunctionQuery::~ConjunctionQuery() { } void ConjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { - if (terms.size() < 1) { - _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); + if (terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms empty"); } std::vector iterators; - for (auto& term : terms) { + for (const auto& term : terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp index 07a159b322..7b797d7b54 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp @@ -22,26 +22,25 @@ namespace doris { DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {} DisjunctionQuery::~DisjunctionQuery() { - for (auto& term : _terms) { - if (term) { - _CLDELETE(term); - } - } for (auto& term_doc : _term_docs) { if (term_doc) { _CLDELETE(term_doc); } } + for (auto& term : _terms) { + if (term) { + _CLDELETE(term); + } + } } void DisjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { - if (terms.size() < 1) { - _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); + if (terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "DisjunctionQuery::add: terms empty"); } - for (auto& term : terms) { + for (const auto& term : terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); - _wsterms.emplace_back(&ws_term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); TermDocs* term_doc = _reader->termDocs(t); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h index f42fd69dab..bb0a837f42 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h @@ -39,7 +39,6 @@ public: private: IndexReader* _reader = nullptr; - std::vector _wsterms; std::vector _terms; std::vector _term_docs; std::vector _term_iterators; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp new file mode 100644 index 0000000000..83c5401bac --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "regexp_query.h"
+
+#include <CLucene/config/repl_wchar.h>
+#include <hs/hs.h>
+
+#include "common/logging.h"
+
+namespace doris::segment_v2 {
+
+// Keeps the searcher alive and builds the inner disjunction query on the searcher's reader.
+RegexpQuery::RegexpQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
+        : _searcher(searcher), query(searcher->getReader()) {}
+
+// Expands `pattern` into the concrete index terms it matches and registers them on the
+// inner disjunction query.
+//
+// field_name: field the resulting terms are looked up in.
+// pattern:    regular expression, compiled with hyperscan in block mode
+//             (DOTALL | ALLOWEMPTY | UTF8).
+//
+// At most `_max_expansions` matching terms are collected. If the pattern cannot be compiled,
+// scratch space cannot be allocated, or no term matches, nothing is registered and a later
+// search() runs against a query without terms.
+// NOTE(review): a hyperscan failure is only logged, so the caller cannot tell it apart from
+// "no match" - confirm this best-effort behaviour is intended.
+void RegexpQuery::add(const std::wstring& field_name, const std::string& pattern) {
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message;
+        hs_free_compile_error(compile_err);
+        return;
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return;
+    }
+
+    // Match callback: only records that a match happened. Returning 0 lets the scan run to
+    // completion, so hs_scan() still reports HS_SUCCESS for a matching input.
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
+                       unsigned int flags, void* context) -> int {
+        *static_cast<bool*>(context) = true;
+        return 0;
+    };
+
+    Term* term = nullptr;
+    TermEnum* enumerator = nullptr;
+    std::vector<std::string> terms;
+    int32_t count = 0;
+
+    try {
+        enumerator = _searcher->getReader()->terms();
+        while (enumerator->next()) {
+            term = enumerator->term();
+            std::string input = lucene_wcstoutf8string(term->text(), term->textLength());
+
+            bool is_match = false;
+            if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
+                        static_cast<void*>(&is_match)) != HS_SUCCESS) {
+                LOG(ERROR) << "hyperscan match failed: " << input;
+                break;
+            }
+
+            if (is_match) {
+                terms.emplace_back(std::move(input));
+                if (++count >= _max_expansions) {
+                    break;
+                }
+            }
+
+            _CLDECDELETE(term);
+        }
+    }
+    _CLFINALLY({
+        _CLDECDELETE(term);
+        // terms() may have thrown before the enumerator was assigned.
+        if (enumerator != nullptr) {
+            enumerator->close();
+            _CLDELETE(enumerator);
+        }
+
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    // DisjunctionQuery::add rejects an empty term list with CL_ERR_IllegalArgument. A pattern
+    // that matches no term is a valid query with an empty result and must not become an error.
+    if (terms.empty()) {
+        return;
+    }
+
+    query.add(field_name, terms);
+}
+
+// Writes the documents matched by the terms registered through add() into `roaring`.
+void RegexpQuery::search(roaring::Roaring& roaring) {
+    query.search(roaring);
+}
+
+} // 
namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h new file mode 100644 index 0000000000..3791ad50d8 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h" + +CL_NS_USE(index) +CL_NS_USE(search) + +namespace doris::segment_v2 { + +class RegexpQuery { +public: + RegexpQuery(const std::shared_ptr& searcher); + ~RegexpQuery() = default; + + void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; } + + void add(const std::wstring& field_name, const std::string& pattern); + void search(roaring::Roaring& roaring); + +private: + std::shared_ptr _searcher; + + int32_t _max_expansions = 50; + DisjunctionQuery query; +}; + +} // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index 6d91c3e2ec..844cec27b4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -77,6 +77,7 @@ enum class InvertedIndexQueryType { MATCH_ALL_QUERY = 6, MATCH_PHRASE_QUERY = 7, MATCH_PHRASE_PREFIX_QUERY = 8, + MATCH_REGEXP_QUERY = 9, }; inline std::string query_type_to_string(InvertedIndexQueryType query_type) { @@ -111,6 +112,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) { case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: { return "MPHRASEPREFIX"; } + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + return "MREGEXP"; + } default: return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index a567859a3b..8a226ac123 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -60,6 +60,7 @@ #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" #include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h" +#include 
"olap/rowset/segment_v2/inverted_index/query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/types.h" @@ -83,7 +84,8 @@ bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) { return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY); + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY); } bool InvertedIndexReader::indexExists(io::Path& index_file_path) { @@ -134,10 +136,13 @@ std::unique_ptr InvertedIndexReader::create_reader( return reader; } -std::vector InvertedIndexReader::get_analyse_result( - lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer, - const std::string& field_name, InvertedIndexQueryType query_type, bool drop_duplicates) { - std::vector analyse_result; +void InvertedIndexReader::get_analyse_result(std::vector& analyse_result, + lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, + InvertedIndexQueryType query_type, + bool drop_duplicates) { + analyse_result.clear(); std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); std::unique_ptr token_stream( @@ -161,8 +166,6 @@ std::vector InvertedIndexReader::get_analyse_result( std::set unrepeated_result(analyse_result.begin(), analyse_result.end()); analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end()); } - - return analyse_result; } Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, @@ -239,19 +242,25 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run auto index_file_name = InvertedIndexDescriptor::get_index_file_name( 
path.filename(), _index_meta.index_id(), _index_meta.get_index_suffix()); auto index_file_path = index_dir / index_file_name; - InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared(); - inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string( - get_parser_string_from_properties(_index_meta.properties())); - inverted_index_ctx->parser_mode = - get_parser_mode_string_from_properties(_index_meta.properties()); - inverted_index_ctx->char_filter_map = - get_parser_char_filter_map_from_properties(_index_meta.properties()); + try { - auto analyzer = create_analyzer(inverted_index_ctx.get()); - auto reader = create_reader(inverted_index_ctx.get(), search_str); - inverted_index_ctx->analyzer = analyzer.get(); - std::vector analyse_result = - get_analyse_result(reader.get(), analyzer.get(), column_name, query_type); + std::vector analyse_result; + if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { + analyse_result.emplace_back(search_str); + } else { + InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared(); + inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(_index_meta.properties())); + inverted_index_ctx->parser_mode = + get_parser_mode_string_from_properties(_index_meta.properties()); + inverted_index_ctx->char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta.properties()); + auto analyzer = create_analyzer(inverted_index_ctx.get()); + auto reader = create_reader(inverted_index_ctx.get(), search_str); + inverted_index_ctx->analyzer = analyzer.get(); + get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name, + query_type); + } if (analyse_result.empty()) { auto msg = fmt::format( @@ -261,7 +270,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == 
InvertedIndexQueryType::MATCH_PHRASE_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { LOG(WARNING) << msg; return Status::OK(); } else { @@ -290,7 +300,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run str_tokens += " "; } - auto cache = InvertedIndexQueryCache::instance(); + auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCache::CacheKey cache_key; cache_key.index_path = index_file_path; cache_key.column_name = column_name; @@ -345,13 +355,49 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } } query_match_bitmap = *term_match_bitmap; + } else if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { + const std::string& pattern = analyse_result[0]; + + std::shared_ptr term_match_bitmap = nullptr; + auto* cache = InvertedIndexQueryCache::instance(); + + InvertedIndexQueryCache::CacheKey cache_key; + cache_key.index_path = index_file_path; + cache_key.column_name = column_name; + cache_key.query_type = query_type; + cache_key.value = pattern; + InvertedIndexQueryCacheHandle cache_handle; + if (cache->lookup(cache_key, &cache_handle)) { + stats->inverted_index_query_cache_hit++; + term_match_bitmap = cache_handle.get_bitmap(); + } else { + stats->inverted_index_query_cache_miss++; + InvertedIndexCacheHandle inverted_index_cache_handle; + RETURN_IF_ERROR(InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, _index_dir.c_str(), _index_file_name, &inverted_index_cache_handle, + stats, type(), _has_null)); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + if (FulltextIndexSearcherPtr* searcher_ptr = + std::get_if(&searcher_variant)) { + term_match_bitmap = std::make_shared(); + + Status res = match_regexp_index_search(stats, runtime_state, field_ws, pattern, + *searcher_ptr, 
term_match_bitmap); + if (!res.ok()) { + return res; + } + } + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); + } + query_match_bitmap = *term_match_bitmap; } else { bool first = true; for (auto token : analyse_result) { std::shared_ptr term_match_bitmap = nullptr; // try to get term bitmap match result from cache to avoid query index on cache hit - auto cache = InvertedIndexQueryCache::instance(); + auto* cache = InvertedIndexQueryCache::instance(); // use EQUAL_QUERY type here since cache is for each term/token //auto token = lucene_wcstoutf8string(token_ws.c_str(), token_ws.length()); std::wstring token_ws = StringUtil::string_to_wstring(token); @@ -495,6 +541,24 @@ Status FullTextIndexReader::match_phrase_prefix_index_search( return Status::OK(); } +Status FullTextIndexReader::match_regexp_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::string& pattern, const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + RegexpQuery query(index_searcher); + query.set_max_expansions(queryOptions.inverted_index_max_expansions); + query.add(field_ws, pattern); + query.search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + void FullTextIndexReader::check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read) { // try to reuse index_searcher's directory to read null_bitmap to cache diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index e14e4bcc47..67fe0e4ae6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -108,11 +108,12 @@ public: [[nodiscard]] bool has_null() const { return _has_null; } - static std::vector get_analyse_result(lucene::util::Reader* reader, - lucene::analysis::Analyzer* analyzer, - const std::string& field_name, - InvertedIndexQueryType query_type, - bool drop_duplicates = true); + static void get_analyse_result(std::vector& analyse_result, + lucene::util::Reader* reader, + lucene::analysis::Analyzer* analyzer, + const std::string& field_name, InvertedIndexQueryType query_type, + bool drop_duplicates = true); + static std::unique_ptr create_reader(InvertedIndexCtx* inverted_index_ctx, const std::string& value); static std::unique_ptr create_analyzer( @@ -172,6 +173,11 @@ private: const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap); + Status match_regexp_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::wstring& field_ws, const std::string& pattern, + const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap); + void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read); }; diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 648f79db30..42f27a116e 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -79,10 +79,10 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, auto reader = doris::segment_v2::InvertedIndexReader::create_reader( &inverted_index_ctx, tokenize_str.to_string()); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx.analyzer, "tokenize", - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), 
inverted_index_ctx.analyzer, "tokenize", + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); for (auto token : query_tokens) { const size_t old_size = column_string_chars.size(); const size_t split_part_size = token.length(); diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 6b8f6a4d8e..38145342a0 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -130,10 +130,10 @@ inline std::vector FunctionMatchBase::analyse_data_token( auto reader = doris::segment_v2::InvertedIndexReader::create_reader( inverted_index_ctx, str_ref.to_string()); - std::vector element_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, query_type, - false); + std::vector element_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + query_type, false); data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end()); } } else { @@ -141,8 +141,9 @@ inline std::vector FunctionMatchBase::analyse_data_token( auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, str_ref.to_string()); - data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, query_type, false); + doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, reader.get(), + inverted_index_ctx->analyzer, + column_name, query_type, false); } return data_tokens; } @@ -161,10 +162,10 @@ Status FunctionMatchAny::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, 
column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -206,10 +207,10 @@ Status FunctionMatchAll::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ -257,10 +258,10 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name, << inverted_index_parser_type_to_string(parser_type); auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx, match_query_str); - std::vector query_tokens = - doris::segment_v2::InvertedIndexReader::get_analyse_result( - reader.get(), inverted_index_ctx->analyzer, column_name, - doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); + std::vector query_tokens; + doris::segment_v2::InvertedIndexReader::get_analyse_result( + query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name, + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY); if (query_tokens.empty()) { LOG(WARNING) << fmt::format( "token parser result is empty for query, " @@ 
-315,6 +316,7 @@ void register_function_match(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index ee32ee0eaf..5ca981e702 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -145,6 +145,23 @@ public: } }; +class FunctionMatchRegexp : public FunctionMatchBase { +public: + static constexpr auto name = "match_regexp"; + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + Status execute_match(const std::string& column_name, const std::string& match_query_str, + size_t input_rows_count, const ColumnString* string_col, + InvertedIndexCtx* inverted_index_ctx, + const ColumnArray::Offsets64* array_offsets, + ColumnUInt8::Container& result) const override { + return Status::Error( + "FunctionMatchRegexp not support execute_match"); + } +}; + class FunctionMatchElementEQ : public FunctionMatchBase { public: static constexpr auto name = "match_element_eq"; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index eb9cbcc685..c7e823ac65 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -346,6 +346,7 @@ MATCH_ELEMENT_LE: 'ELEMENT_LE'; MATCH_ELEMENT_LT: 'ELEMENT_LT'; MATCH_PHRASE: 'MATCH_PHRASE'; MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; +MATCH_REGEXP: 'MATCH_REGEXP'; MATERIALIZED: 'MATERIALIZED'; MAX: 'MAX'; MAXVALUE: 'MAXVALUE'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 77e8188131..0bfc2d313c 100644 --- 
a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -594,7 +594,7 @@ rowConstructorItem predicate : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression - | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX) pattern=valueExpression + | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN | IS NOT? kind=NULL diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index db3f934b6d..838f6d7a75 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -482,6 +482,7 @@ terminal String KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, + KW_MATCH_REGEXP, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, @@ -987,7 +988,7 @@ precedence left KW_AND; precedence left KW_NOT, NOT; precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS; precedence left KW_LIKE, KW_REGEXP; -precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; +precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; precedence left EQUAL, LESSTHAN, GREATERTHAN; precedence left ADD, SUBTRACT; precedence left AT, STAR, DIVIDE, MOD, KW_DIV; @@ -7170,6 +7171,8 @@ match_predicate ::= {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, e2); :} | expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2 {: RESULT = new 
MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, e1, e2); :} + | expr:e1 KW_MATCH_REGEXP expr:e2 + {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_LT expr:e2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index 49a0796c19..f106aec956 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -51,6 +51,7 @@ public class MatchPredicate extends Predicate { MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL), MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE), MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", TExprOpcode.MATCH_PHRASE_PREFIX), + MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP), MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ), MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT), MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT), @@ -158,6 +159,16 @@ public class MatchPredicate extends Predicate { symbolNotUsed, Lists.newArrayList(new ArrayType(t), t), Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_REGEXP.getName(), + symbolNotUsed, + Lists.newArrayList(t, t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_REGEXP.getName(), + symbolNotUsed, + Lists.newArrayList(new ArrayType(t), t), + Type.BOOLEAN)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index d0e731b623..c2c6717448 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -231,6 +231,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; +import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -2937,6 +2938,12 @@ public class LogicalPlanBuilder extends DorisParserBaseVisitor { getExpression(ctx.pattern) ); break; + case DorisParser.MATCH_REGEXP: + outExpression = new MatchRegexp( + valueExpression, + getExpression(ctx.pattern) + ); + break; default: throw new ParseException("Unsupported predicate type: " + ctx.kind.getText(), ctx); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java index cafe2824fa..976e46830e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java @@ -51,6 +51,8 @@ public abstract class Match extends BinaryOperator implements PropagateNullable return Operator.MATCH_PHRASE; case "MATCH_PHRASE_PREFIX": return Operator.MATCH_PHRASE_PREFIX; + case "MATCH_REGEXP": + return Operator.MATCH_REGEXP; default: throw new AnalysisException("UnSupported type for match: " + symbol); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java new file mode 100644 index 0000000000..6bb55aeb89 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchRegexp.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * like expression: a MATCH_REGEXP '^h\\w*'. 
+ */ +public class MatchRegexp extends Match { + public MatchRegexp(Expression left, Expression right) { + super(ImmutableList.of(left, right), "MATCH_REGEXP"); + } + + private MatchRegexp(List children) { + super(children, "MATCH_REGEXP"); + } + + @Override + public MatchRegexp withChildren(List children) { + Preconditions.checkArgument(children.size() == 2); + return new MatchRegexp(children); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitMatchRegexp(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index b53d22916a..561648f900 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -61,6 +61,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; +import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -500,6 +501,10 @@ public abstract class ExpressionVisitor return visitMatch(matchPhrasePrefix, context); } + public R visitMatchRegexp(MatchRegexp matchRegexp, C context) { + return visitMatch(matchRegexp, context); + } + public R visitAny(Any any, C context) { return visit(any, context); } diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 955555c270..68db0e9bf4 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ 
b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -321,6 +321,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL)); keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE)); keywordMap.put("match_phrase_prefix", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX)); + keywordMap.put("match_regexp", new Integer(SqlParserSymbols.KW_MATCH_REGEXP)); keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ)); keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT)); keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT)); diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index 0afa53566d..72a1d80e03 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -94,4 +94,5 @@ enum TExprOpcode { MATCH_ELEMENT_LE, MATCH_ELEMENT_GE, MATCH_PHRASE_PREFIX, + MATCH_REGEXP, } diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out b/regression-test/data/inverted_index_p0/test_index_match_regexp.out new file mode 100644 index 0000000000..eab27de65e --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out @@ -0,0 +1,16 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1000 + +-- !sql -- +54 + +-- !sql -- +910 + +-- !sql -- +60 + +-- !sql -- +38 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy new file mode 100644 index 0000000000..4c1ee1a5b0 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_match_regexp", "p0"){ + def indexTbName1 = "test_index_match_regexp" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. 
+ // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, 'test_index_match_regexp', 'true', 'json', 'documents-1000.json') + + qt_sql """ select count() from test_index_match_regexp where request match_regexp '^h'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp '^team'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 's\$'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 'er\$'; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp '.*tickets.*'; """ + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file