[feature](inverted index) match_phrase_prefix feature added (#27404)
select count() from test_index_match_phrase_prefix where request match_phrase_prefix 'xxx';
This commit is contained in:
@ -304,6 +304,8 @@ public:
|
||||
condition.__set_condition_op("match_all");
|
||||
} else if (value.first == MatchType::MATCH_PHRASE) {
|
||||
condition.__set_condition_op("match_phrase");
|
||||
} else if (value.first == MatchType::MATCH_PHRASE_PREFIX) {
|
||||
condition.__set_condition_op("match_phrase_prefix");
|
||||
} else if (value.first == MatchType::MATCH_ELEMENT_EQ) {
|
||||
condition.__set_condition_op("match_element_eq");
|
||||
} else if (value.first == MatchType::MATCH_ELEMENT_LT) {
|
||||
|
||||
@ -169,6 +169,7 @@ enum class MatchType {
|
||||
MATCH_ELEMENT_GT = 5,
|
||||
MATCH_ELEMENT_LE = 6,
|
||||
MATCH_ELEMENT_GE = 7,
|
||||
MATCH_PHRASE_PREFIX = 8,
|
||||
};
|
||||
|
||||
inline MatchType to_match_type(TExprOpcode::type type) {
|
||||
@ -182,6 +183,9 @@ inline MatchType to_match_type(TExprOpcode::type type) {
|
||||
case TExprOpcode::type::MATCH_PHRASE:
|
||||
return MatchType::MATCH_PHRASE;
|
||||
break;
|
||||
case TExprOpcode::type::MATCH_PHRASE_PREFIX:
|
||||
return MatchType::MATCH_PHRASE_PREFIX;
|
||||
break;
|
||||
case TExprOpcode::type::MATCH_ELEMENT_EQ:
|
||||
return MatchType::MATCH_ELEMENT_EQ;
|
||||
break;
|
||||
@ -211,6 +215,8 @@ inline MatchType to_match_type(const std::string& condition_op) {
|
||||
return MatchType::MATCH_ALL;
|
||||
} else if (condition_op.compare("match_phrase") == 0) {
|
||||
return MatchType::MATCH_PHRASE;
|
||||
} else if (condition_op.compare("match_phrase_prefix") == 0) {
|
||||
return MatchType::MATCH_PHRASE_PREFIX;
|
||||
} else if (condition_op.compare("match_element_eq") == 0) {
|
||||
return MatchType::MATCH_ELEMENT_EQ;
|
||||
} else if (condition_op.compare("match_element_lt") == 0) {
|
||||
@ -228,6 +234,7 @@ inline MatchType to_match_type(const std::string& condition_op) {
|
||||
inline bool is_match_condition(const std::string& op) {
|
||||
if (0 == strcasecmp(op.c_str(), "match_any") || 0 == strcasecmp(op.c_str(), "match_all") ||
|
||||
0 == strcasecmp(op.c_str(), "match_phrase") ||
|
||||
0 == strcasecmp(op.c_str(), "match_phrase_prefix") ||
|
||||
0 == strcasecmp(op.c_str(), "match_element_eq") ||
|
||||
0 == strcasecmp(op.c_str(), "match_element_lt") ||
|
||||
0 == strcasecmp(op.c_str(), "match_element_gt") ||
|
||||
@ -240,9 +247,10 @@ inline bool is_match_condition(const std::string& op) {
|
||||
|
||||
// Returns true when op_type is one of the MATCH_* expression opcodes,
// including MATCH_PHRASE_PREFIX.
inline bool is_match_operator(const TExprOpcode::type& op_type) {
    return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == op_type ||
           TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_PHRASE_PREFIX == op_type ||
           TExprOpcode::MATCH_ELEMENT_EQ == op_type || TExprOpcode::MATCH_ELEMENT_LT == op_type ||
           TExprOpcode::MATCH_ELEMENT_GT == op_type || TExprOpcode::MATCH_ELEMENT_LE == op_type ||
           TExprOpcode::MATCH_ELEMENT_GE == op_type;
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
@ -107,6 +107,9 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m
|
||||
case MatchType::MATCH_PHRASE:
|
||||
ret = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
|
||||
break;
|
||||
case MatchType::MATCH_PHRASE_PREFIX:
|
||||
ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
|
||||
break;
|
||||
case MatchType::MATCH_ELEMENT_EQ:
|
||||
ret = InvertedIndexQueryType::EQUAL_QUERY;
|
||||
break;
|
||||
@ -129,7 +132,7 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m
|
||||
}
|
||||
|
||||
bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const {
|
||||
if (_match_type == MatchType::MATCH_PHRASE &&
|
||||
if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) &&
|
||||
iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT &&
|
||||
get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) ==
|
||||
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
|
||||
|
||||
@ -0,0 +1,63 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "phrase_prefix_query.h"
|
||||
|
||||
#include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
namespace segment_v2 {
|
||||
|
||||
PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
|
||||
: _searcher(searcher) {}
|
||||
|
||||
void PhrasePrefixQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
|
||||
if (terms.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < terms.size(); i++) {
|
||||
if (i < terms.size() - 1) {
|
||||
std::wstring ws = StringUtil::string_to_wstring(terms[i]);
|
||||
Term* t = _CLNEW Term(field_name.c_str(), ws.c_str());
|
||||
_query.add(t);
|
||||
_CLDECDELETE(t);
|
||||
} else {
|
||||
std::vector<CL_NS(index)::Term*> prefix_terms;
|
||||
PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, terms[i],
|
||||
prefix_terms, _max_expansions);
|
||||
if (prefix_terms.empty()) {
|
||||
continue;
|
||||
}
|
||||
_query.add(prefix_terms);
|
||||
for (auto& t : prefix_terms) {
|
||||
_CLDECDELETE(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Executes the accumulated query on _searcher and records every reported
// document id in `roaring`. The score passed to the callback is ignored.
void PhrasePrefixQuery::search(roaring::Roaring& roaring) {
    auto collect_doc = [&roaring](const int32_t docid, const float_t /*score*/) {
        roaring.add(docid);
    };
    _searcher->_search(&_query, collect_doc);
}
|
||||
|
||||
} // namespace segment_v2
|
||||
|
||||
} // namespace doris
|
||||
@ -0,0 +1,54 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <CLucene.h>
|
||||
#include <CLucene/index/IndexReader.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "CLucene/search/MultiPhraseQuery.h"
|
||||
#include "roaring/roaring.hh"
|
||||
|
||||
CL_NS_USE(index)
|
||||
CL_NS_USE(search)
|
||||
|
||||
namespace doris {
|
||||
|
||||
namespace segment_v2 {
|
||||
|
||||
class PhrasePrefixQuery {
|
||||
public:
|
||||
PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher);
|
||||
~PhrasePrefixQuery() = default;
|
||||
|
||||
void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; }
|
||||
|
||||
void add(const std::wstring& field_name, const std::vector<std::string>& terms);
|
||||
void search(roaring::Roaring& roaring);
|
||||
|
||||
private:
|
||||
std::shared_ptr<lucene::search::IndexSearcher> _searcher;
|
||||
MultiPhraseQuery _query;
|
||||
|
||||
int32_t _max_expansions = 50;
|
||||
};
|
||||
|
||||
} // namespace segment_v2
|
||||
|
||||
} // namespace doris
|
||||
@ -0,0 +1,80 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "prefix_query.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
// Collects the indexed terms of `field_name` whose text starts with `prefix`.
//
// reader:         index reader whose term dictionary is enumerated.
// field_name:     field to look in.
// prefix:         prefix text; converted to a wide string before lookup.
// prefix_terms:   output; one newly allocated Term per match is appended.
//                 The caller is responsible for releasing them
//                 (PhrasePrefixQuery::add does so with _CLDECDELETE).
// max_expansions: when > 0, at most this many terms are appended.
//
// Enumeration starts at the position returned by reader->terms(prefix_term)
// and stops at the first term that is in another field, is shorter than the
// prefix, or does not start with the prefix.
void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
                                   const std::string& prefix,
                                   std::vector<CL_NS(index)::Term*>& prefix_terms,
                                   int32_t max_expansions) {
    std::wstring ws_prefix = StringUtil::string_to_wstring(prefix);

    Term* prefix_term = _CLNEW Term(field_name.c_str(), ws_prefix.c_str());
    TermEnum* enumerator = nullptr;

    int32_t count = 0;
    Term* lastTerm = nullptr;
    try {
        // Opened inside the try block so that prefix_term is still released
        // by the cleanup below if terms() throws.
        enumerator = reader->terms(prefix_term);

        const TCHAR* prefixText = prefix_term->text();
        // NOTE(review): fields are compared by pointer below, which presumably
        // relies on CLucene interning field names -- confirm.
        const TCHAR* prefixField = prefix_term->field();
        const size_t prefixLen = prefix_term->textLength();
        do {
            lastTerm = enumerator->term();
            if (lastTerm == nullptr || lastTerm->field() != prefixField) {
                break;
            }
            if (prefixLen > lastTerm->textLength()) {
                break;
            }

            const TCHAR* termText = lastTerm->text();
            bool has_prefix = true;
            for (size_t i = 0; i < prefixLen; ++i) {
                if (termText[i] != prefixText[i]) {
                    has_prefix = false;
                    break;
                }
            }
            if (!has_prefix) {
                break;
            }

            if (max_expansions > 0 && count >= max_expansions) {
                break;
            }

            prefix_terms.push_back(_CLNEW Term(field_name.c_str(), termText));
            count++;

            // Release the reference obtained from enumerator->term() before advancing.
            _CLDECDELETE(lastTerm);
        } while (enumerator->next());
    }
    _CLFINALLY({
        if (enumerator != nullptr) {
            enumerator->close();
            _CLDELETE(enumerator);
        }
        _CLDECDELETE(lastTerm);
        _CLDECDELETE(prefix_term);
    });
}
|
||||
|
||||
} // namespace doris
|
||||
@ -0,0 +1,40 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <CLucene.h>
|
||||
#include <CLucene/index/IndexReader.h>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
CL_NS_USE(index)
|
||||
|
||||
namespace doris {
|
||||
|
||||
class PrefixQuery {
|
||||
public:
|
||||
PrefixQuery() = default;
|
||||
~PrefixQuery() = default;
|
||||
|
||||
static void get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
|
||||
const std::string& prefix,
|
||||
std::vector<CL_NS(index)::Term*>& prefix_terms,
|
||||
int32_t max_expansions = 50);
|
||||
};
|
||||
|
||||
} // namespace doris
|
||||
@ -76,6 +76,7 @@ enum class InvertedIndexQueryType {
|
||||
MATCH_ANY_QUERY = 5,
|
||||
MATCH_ALL_QUERY = 6,
|
||||
MATCH_PHRASE_QUERY = 7,
|
||||
MATCH_PHRASE_PREFIX_QUERY = 8,
|
||||
};
|
||||
|
||||
inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
|
||||
@ -107,6 +108,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
|
||||
case InvertedIndexQueryType::MATCH_PHRASE_QUERY: {
|
||||
return "MPHRASE";
|
||||
}
|
||||
case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: {
|
||||
return "MPHRASEPREFIX";
|
||||
}
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
|
||||
@ -40,6 +40,8 @@
|
||||
#include <roaring/roaring.hh>
|
||||
#include <set>
|
||||
|
||||
#include "inverted_index_query_type.h"
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wshadow-field"
|
||||
@ -57,6 +59,7 @@
|
||||
#include "olap/olap_common.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_cache.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
|
||||
#include "olap/types.h"
|
||||
@ -79,7 +82,8 @@ bool InvertedIndexReader::_is_range_query(InvertedIndexQueryType query_type) {
|
||||
// Returns true when query_type is one of the MATCH_* query kinds
// (MATCH_ANY, MATCH_ALL, MATCH_PHRASE or MATCH_PHRASE_PREFIX).
bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) {
    return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
            query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
            query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);
}
|
||||
|
||||
bool InvertedIndexReader::indexExists(io::Path& index_file_path) {
|
||||
@ -256,7 +260,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
|
||||
search_str, get_parser_string_from_properties(_index_meta.properties()));
|
||||
if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
|
||||
query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
|
||||
query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
|
||||
query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
|
||||
query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
|
||||
LOG(WARNING) << msg;
|
||||
return Status::OK();
|
||||
} else {
|
||||
@ -276,6 +281,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
|
||||
roaring::Roaring query_match_bitmap;
|
||||
bool null_bitmap_already_read = false;
|
||||
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
|
||||
query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
|
||||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
|
||||
query_type == InvertedIndexQueryType::EQUAL_QUERY) {
|
||||
std::string str_tokens;
|
||||
@ -321,6 +327,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
|
||||
res = normal_index_search(stats, query_type, *searcher_ptr,
|
||||
null_bitmap_already_read, query,
|
||||
term_match_bitmap);
|
||||
} else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) {
|
||||
res = match_phrase_prefix_index_search(stats, runtime_state, field_ws,
|
||||
analyse_result, *searcher_ptr,
|
||||
term_match_bitmap);
|
||||
} else {
|
||||
res = match_all_index_search(stats, runtime_state, field_ws, analyse_result,
|
||||
*searcher_ptr, term_match_bitmap);
|
||||
@ -466,6 +476,25 @@ Status FullTextIndexReader::match_all_index_search(
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Runs a MATCH_PHRASE_PREFIX search for the analysed tokens on `field_ws`.
//
// stats:             receives the elapsed search time
//                    (inverted_index_searcher_search_timer).
// runtime_state:     supplies the query options; inverted_index_max_expansions
//                    is passed to the query as its expansion limit.
// field_ws:          index field name.
// analyse_result:    analysed tokens of the search string.
// index_searcher:    searcher the query is executed against.
// term_match_bitmap: output; matching document ids are added to it.
//
// Returns Status::OK() on success, or INVERTED_INDEX_CLUCENE_ERROR when a
// CLuceneError is thrown while building or running the query.
Status FullTextIndexReader::match_phrase_prefix_index_search(
        OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws,
        const std::vector<std::string>& analyse_result,
        const FulltextIndexSearcherPtr& index_searcher,
        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
    // Bind by reference: only one option is read, so copying the whole
    // options struct would be wasted work.
    const auto& query_options = runtime_state->query_options();
    try {
        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
        PhrasePrefixQuery query(index_searcher);
        query.set_max_expansions(query_options.inverted_index_max_expansions);
        query.add(field_ws, analyse_result);
        query.search(*term_match_bitmap);
    } catch (const CLuceneError& e) {
        return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}",
                                                                      e.what());
    }
    return Status::OK();
}
|
||||
|
||||
void FullTextIndexReader::check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher,
|
||||
bool& null_bitmap_already_read) {
|
||||
// try to reuse index_searcher's directory to read null_bitmap to cache
|
||||
|
||||
@ -166,6 +166,12 @@ private:
|
||||
const FulltextIndexSearcherPtr& index_searcher,
|
||||
const std::shared_ptr<roaring::Roaring>& term_match_bitmap);
|
||||
|
||||
Status match_phrase_prefix_index_search(
|
||||
OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws,
|
||||
const std::vector<std::string>& analyse_result,
|
||||
const FulltextIndexSearcherPtr& index_searcher,
|
||||
const std::shared_ptr<roaring::Roaring>& term_match_bitmap);
|
||||
|
||||
void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher,
|
||||
bool& null_bitmap_already_read);
|
||||
};
|
||||
|
||||
@ -314,6 +314,7 @@ void register_function_match(SimpleFunctionFactory& factory) {
|
||||
factory.register_function<FunctionMatchAny>();
|
||||
factory.register_function<FunctionMatchAll>();
|
||||
factory.register_function<FunctionMatchPhrase>();
|
||||
factory.register_function<FunctionMatchPhrasePrefix>();
|
||||
factory.register_function<FunctionMatchElementEQ>();
|
||||
factory.register_function<FunctionMatchElementLT>();
|
||||
factory.register_function<FunctionMatchElementGT>();
|
||||
|
||||
@ -128,6 +128,23 @@ public:
|
||||
ColumnUInt8::Container& result) const override;
|
||||
};
|
||||
|
||||
class FunctionMatchPhrasePrefix : public FunctionMatchBase {
|
||||
public:
|
||||
static constexpr auto name = "match_phrase_prefix";
|
||||
static FunctionPtr create() { return std::make_shared<FunctionMatchPhrasePrefix>(); }
|
||||
|
||||
String get_name() const override { return name; }
|
||||
|
||||
Status execute_match(const std::string& column_name, const std::string& match_query_str,
|
||||
size_t input_rows_count, const ColumnString* string_col,
|
||||
InvertedIndexCtx* inverted_index_ctx,
|
||||
const ColumnArray::Offsets64* array_offsets,
|
||||
ColumnUInt8::Container& result) const override {
|
||||
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
|
||||
"FunctionMatchPhrasePrefix not support execute_match");
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionMatchElementEQ : public FunctionMatchBase {
|
||||
public:
|
||||
static constexpr auto name = "match_element_eq";
|
||||
|
||||
@ -344,6 +344,7 @@ MATCH_ELEMENT_GT: 'ELEMENT_GT';
|
||||
MATCH_ELEMENT_LE: 'ELEMENT_LE';
|
||||
MATCH_ELEMENT_LT: 'ELEMENT_LT';
|
||||
MATCH_PHRASE: 'MATCH_PHRASE';
|
||||
MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX';
|
||||
MATERIALIZED: 'MATERIALIZED';
|
||||
MAX: 'MAX';
|
||||
MAXVALUE: 'MAXVALUE';
|
||||
|
||||
@ -593,7 +593,7 @@ rowConstructorItem
|
||||
predicate
|
||||
: NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression
|
||||
| NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression
|
||||
| NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE) pattern=valueExpression
|
||||
| NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX) pattern=valueExpression
|
||||
| NOT? kind=IN LEFT_PAREN query RIGHT_PAREN
|
||||
| NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
|
||||
| IS NOT? kind=NULL
|
||||
|
||||
@ -478,6 +478,7 @@ terminal String
|
||||
KW_MATCH_ANY,
|
||||
KW_MATCH_ALL,
|
||||
KW_MATCH_PHRASE,
|
||||
KW_MATCH_PHRASE_PREFIX,
|
||||
KW_MATCH_ELEMENT_EQ,
|
||||
KW_MATCH_ELEMENT_LT,
|
||||
KW_MATCH_ELEMENT_GT,
|
||||
@ -975,7 +976,7 @@ precedence left KW_AND;
|
||||
precedence left KW_NOT, NOT;
|
||||
precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS;
|
||||
precedence left KW_LIKE, KW_REGEXP;
|
||||
precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
|
||||
precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
|
||||
precedence left EQUAL, LESSTHAN, GREATERTHAN;
|
||||
precedence left ADD, SUBTRACT;
|
||||
precedence left AT, STAR, DIVIDE, MOD, KW_DIV;
|
||||
@ -7022,6 +7023,8 @@ match_predicate ::=
|
||||
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ALL, e1, e2); :}
|
||||
| expr:e1 KW_MATCH_PHRASE expr:e2
|
||||
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, e2); :}
|
||||
| expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2
|
||||
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, e1, e2); :}
|
||||
| expr:e1 KW_MATCH_ELEMENT_EQ expr:e2
|
||||
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :}
|
||||
| expr:e1 KW_MATCH_ELEMENT_LT expr:e2
|
||||
|
||||
@ -50,6 +50,7 @@ public class MatchPredicate extends Predicate {
|
||||
MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY),
|
||||
MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
|
||||
MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
|
||||
MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", TExprOpcode.MATCH_PHRASE_PREFIX),
|
||||
MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ),
|
||||
MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT),
|
||||
MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT),
|
||||
@ -147,6 +148,16 @@ public class MatchPredicate extends Predicate {
|
||||
symbolNotUsed,
|
||||
Lists.<Type>newArrayList(new ArrayType(t), t),
|
||||
Type.BOOLEAN));
|
||||
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
|
||||
Operator.MATCH_PHRASE_PREFIX.getName(),
|
||||
symbolNotUsed,
|
||||
Lists.<Type>newArrayList(t, t),
|
||||
Type.BOOLEAN));
|
||||
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
|
||||
Operator.MATCH_PHRASE_PREFIX.getName(),
|
||||
symbolNotUsed,
|
||||
Lists.<Type>newArrayList(new ArrayType(t), t),
|
||||
Type.BOOLEAN));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -226,6 +226,7 @@ import org.apache.doris.nereids.trees.expressions.ListQuery;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchAll;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchAny;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchPhrase;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
|
||||
import org.apache.doris.nereids.trees.expressions.Mod;
|
||||
import org.apache.doris.nereids.trees.expressions.Multiply;
|
||||
import org.apache.doris.nereids.trees.expressions.NamedExpression;
|
||||
@ -2821,6 +2822,12 @@ public class LogicalPlanBuilder extends DorisParserBaseVisitor<Object> {
|
||||
getExpression(ctx.pattern)
|
||||
);
|
||||
break;
|
||||
case DorisParser.MATCH_PHRASE_PREFIX:
|
||||
outExpression = new MatchPhrasePrefix(
|
||||
valueExpression,
|
||||
getExpression(ctx.pattern)
|
||||
);
|
||||
break;
|
||||
default:
|
||||
throw new ParseException("Unsupported predicate type: " + ctx.kind.getText(), ctx);
|
||||
}
|
||||
|
||||
@ -49,6 +49,8 @@ public abstract class Match extends BinaryOperator implements PropagateNullable
|
||||
return Operator.MATCH_ALL;
|
||||
case "MATCH_PHRASE":
|
||||
return Operator.MATCH_PHRASE;
|
||||
case "MATCH_PHRASE_PREFIX":
|
||||
return Operator.MATCH_PHRASE_PREFIX;
|
||||
default:
|
||||
throw new AnalysisException("UnSupported type for match: " + symbol);
|
||||
}
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.nereids.trees.expressions;
|
||||
|
||||
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* like expression: a MATCH_PHRASE_PREFIX 'hello w'.
|
||||
*/
|
||||
public class MatchPhrasePrefix extends Match {
|
||||
public MatchPhrasePrefix(Expression left, Expression right) {
|
||||
super(ImmutableList.of(left, right), "MATCH_PHRASE_PREFIX");
|
||||
}
|
||||
|
||||
private MatchPhrasePrefix(List<Expression> children) {
|
||||
super(children, "MATCH_PHRASE_PREFIX");
|
||||
}
|
||||
|
||||
@Override
|
||||
public MatchPhrasePrefix withChildren(List<Expression> children) {
|
||||
Preconditions.checkArgument(children.size() == 2);
|
||||
return new MatchPhrasePrefix(children);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
|
||||
return visitor.visitMatchPhrasePrefix(this, context);
|
||||
}
|
||||
}
|
||||
@ -59,6 +59,7 @@ import org.apache.doris.nereids.trees.expressions.Match;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchAll;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchAny;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchPhrase;
|
||||
import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
|
||||
import org.apache.doris.nereids.trees.expressions.Mod;
|
||||
import org.apache.doris.nereids.trees.expressions.Multiply;
|
||||
import org.apache.doris.nereids.trees.expressions.NamedExpression;
|
||||
@ -494,6 +495,10 @@ public abstract class ExpressionVisitor<R, C>
|
||||
return visitMatch(matchPhrase, context);
|
||||
}
|
||||
|
||||
public R visitMatchPhrasePrefix(MatchPhrasePrefix matchPhrasePrefix, C context) {
|
||||
return visitMatch(matchPhrasePrefix, context);
|
||||
}
|
||||
|
||||
/* ********************************************************************************************
|
||||
* Unbound expressions
|
||||
* ********************************************************************************************/
|
||||
|
||||
@ -423,6 +423,7 @@ public class SessionVariable implements Serializable, Writable {
|
||||
public static final String ENABLE_UNIQUE_KEY_PARTIAL_UPDATE = "enable_unique_key_partial_update";
|
||||
|
||||
public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD = "inverted_index_conjunction_opt_threshold";
|
||||
public static final String INVERTED_INDEX_MAX_EXPANSIONS = "inverted_index_max_expansions";
|
||||
|
||||
public static final String AUTO_ANALYZE_START_TIME = "auto_analyze_start_time";
|
||||
|
||||
@ -1316,6 +1317,12 @@ public class SessionVariable implements Serializable, Writable {
|
||||
+ " use a skiplist to optimize the intersection."})
|
||||
public int invertedIndexConjunctionOptThreshold = 1000;
|
||||
|
||||
@VariableMgr.VarAttr(name = INVERTED_INDEX_MAX_EXPANSIONS,
|
||||
description = {"这个参数用来限制查询时扩展的词项(terms)的数量,以此来控制查询的性能",
|
||||
"This parameter is used to limit the number of term expansions during a query,"
|
||||
+ " thereby controlling query performance"})
|
||||
public int invertedIndexMaxExpansions = 50;
|
||||
|
||||
@VariableMgr.VarAttr(name = SQL_DIALECT, needForward = true, checker = "checkSqlDialect",
|
||||
description = {"解析sql使用的方言", "The dialect used to parse sql."})
|
||||
public String sqlDialect = "doris";
|
||||
@ -2635,6 +2642,7 @@ public class SessionVariable implements Serializable, Writable {
|
||||
tResult.setEnableMemtableOnSinkNode(enableMemtableOnSinkNode);
|
||||
|
||||
tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold);
|
||||
tResult.setInvertedIndexMaxExpansions(invertedIndexMaxExpansions);
|
||||
|
||||
tResult.setFasterFloatConvert(fasterFloatConvert);
|
||||
|
||||
|
||||
@ -319,6 +319,7 @@ import org.apache.doris.qe.SqlModeHelper;
|
||||
keywordMap.put("match_any", new Integer(SqlParserSymbols.KW_MATCH_ANY));
|
||||
keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL));
|
||||
keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE));
|
||||
keywordMap.put("match_phrase_prefix", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX));
|
||||
keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));
|
||||
keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT));
|
||||
keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT));
|
||||
|
||||
@ -93,4 +93,5 @@ enum TExprOpcode {
|
||||
MATCH_ELEMENT_GT,
|
||||
MATCH_ELEMENT_LE,
|
||||
MATCH_ELEMENT_GE,
|
||||
MATCH_PHRASE_PREFIX,
|
||||
}
|
||||
|
||||
@ -259,6 +259,8 @@ struct TQueryOptions {
|
||||
91: optional bool runtime_filter_wait_infinitely = false;
|
||||
|
||||
92: optional i32 wait_full_block_schedule_times = 1;
|
||||
|
||||
93: optional i32 inverted_index_max_expansions = 50;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,31 @@
|
||||
-- This file is automatically generated. You should know what you did if you want to edit this
|
||||
-- !sql --
|
||||
863
|
||||
|
||||
-- !sql --
|
||||
863
|
||||
|
||||
-- !sql --
|
||||
235
|
||||
|
||||
-- !sql --
|
||||
235
|
||||
|
||||
-- !sql --
|
||||
166
|
||||
|
||||
-- !sql --
|
||||
166
|
||||
|
||||
-- !sql --
|
||||
56
|
||||
|
||||
-- !sql --
|
||||
56
|
||||
|
||||
-- !sql --
|
||||
7
|
||||
|
||||
-- !sql --
|
||||
7
|
||||
|
||||
@ -0,0 +1,98 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
|
||||
// Regression suite for the inverted-index `match_phrase_prefix` predicate.
//
// Steps:
//   1. (Re)create a table whose `request` column carries an inverted index built with the
//      english parser and support_phrase = true.
//   2. Stream-load documents-1000.json into it.
//   3. Run each `match_phrase_prefix` query next to a LIKE '%...%' query over the same text,
//      so the recorded counts of every pair can be compared in the expected-output file.
suite("test_index_match_phrase_prefix", "p0"){
    // Single source of truth for the table name; every statement below interpolates it.
    def indexTbName1 = "test_index_match_phrase_prefix"

    sql "DROP TABLE IF EXISTS ${indexTbName1}"

    sql """
      CREATE TABLE ${indexTbName1} (
      `@timestamp` int(11) NULL COMMENT "",
      `clientip` varchar(20) NULL COMMENT "",
      `request` text NULL COMMENT "",
      `status` int(11) NULL COMMENT "",
      `size` int(11) NULL COMMENT "",
      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
      ) ENGINE=OLAP
      DUPLICATE KEY(`@timestamp`)
      COMMENT "OLAP"
      DISTRIBUTED BY RANDOM BUCKETS 1
      PROPERTIES (
      "replication_allocation" = "tag.location.default: 1"
      );
    """

    // Stream-loads a file into `table_name` and validates the load result.
    //
    //   table_name            target table of the stream load
    //   label                 label prefix; a random UUID is appended to it
    //   read_flag             value sent as the 'read_json_by_line' header
    //   format_flag           value sent as the 'format' header
    //   file_name             file to load
    //   ignore_failure        when true and expected_succ_rows < 0, the result is not checked at all
    //   expected_succ_rows    when >= 0, the exact number of rows that must be loaded
    //                         (max_filter_ratio is then set to '1'); when < 0, only sanity checks run
    //   load_to_single_tablet accepted for call compatibility; not used by this closure
    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
                        expected_succ_rows = -1, load_to_single_tablet = 'true' ->

        // load the json data
        streamLoad {
            table "${table_name}"

            // set http request header params
            set 'label', label + "_" + UUID.randomUUID().toString()
            set 'read_json_by_line', read_flag
            set 'format', format_flag
            file file_name // import json file
            time 10000 // limit inflight 10s
            if (expected_succ_rows >= 0) {
                set 'max_filter_ratio', '1'
            }

            // Declaring a check callback disables the default result check,
            // so every condition that matters has to be verified here.
            check { result, exception, startTime, endTime ->
                if (ignore_failure && expected_succ_rows < 0) { return }
                if (exception != null) {
                    throw exception
                }
                log.info("Stream load result: ${result}".toString())
                def json = parseJson(result)
                assertEquals("success", json.Status.toLowerCase())
                if (expected_succ_rows >= 0) {
                    // assertEquals takes (expected, actual); keep that order so a failure
                    // message reports the two values under the right labels.
                    assertEquals(expected_succ_rows, json.NumberLoadedRows)
                } else {
                    assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows)
                    assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
                }
            }
        }
    }

    try {
        load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_prefix', 'true', 'json', 'documents-1000.json')

        // Each pair: the phrase-prefix match first, then the LIKE query over the same text.
        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'ima'; """
        qt_sql """ select count() from ${indexTbName1} where request like '%ima%'; """

        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'images/h'; """
        qt_sql """ select count() from ${indexTbName1} where request like '%images/h%'; """

        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'images/hm'; """
        qt_sql """ select count() from ${indexTbName1} where request like '%images/hm%'; """

        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix '/french/images/n'; """
        qt_sql """ select count() from ${indexTbName1} where request like '%/french/images/n%'; """

        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix '/french/tickets/images/ti'; """
        qt_sql """ select count() from ${indexTbName1} where request like '%/french/tickets/images/ti%'; """
    } finally {
        // The table is left in place after the run; the DROP at the top of the suite
        // resets it on the next run. Uncomment to clean up eagerly:
        //try_sql("DROP TABLE IF EXISTS ${indexTbName1}")
    }
}
|
||||
Reference in New Issue
Block a user