[fix] (inverted index) Fix match function without inverted index (#38989) (#39220)

## Proposed changes

pick from #38989
This commit is contained in:
Sun Chenyang
2024-08-13 10:55:54 +08:00
committed by GitHub
parent a6155a517d
commit 60eeec3754
10 changed files with 203 additions and 83 deletions

View File

@ -1150,6 +1150,9 @@ lucene::util::bkd::relation InvertedIndexVisitor<QT>::compare(std::vector<uint8_
Status InvertedIndexIterator::read_from_inverted_index(
const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type,
uint32_t segment_num_rows, std::shared_ptr<roaring::Roaring>& bit_map, bool skip_try) {
DBUG_EXECUTE_IF("return_inverted_index_bypass", {
return Status::Error<ErrorCode::INVERTED_INDEX_BYPASS>("inverted index bypass");
});
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}

View File

@ -17,6 +17,12 @@
#include "vec/exprs/vmatch_predicate.h"
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif
#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <fmt/format.h>
#include <fmt/ranges.h> // IWYU pragma: keep
#include <gen_cpp/Exprs_types.h>
@ -29,6 +35,7 @@
#include <string_view>
#include <vector>
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/status.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/core/block.h"
@ -53,6 +60,12 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
_inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
_inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map;
_analyzer = InvertedIndexReader::create_analyzer(_inverted_index_ctx.get());
_analyzer->set_lowercase(node.match_predicate.parser_lowercase);
if (node.match_predicate.parser_stopwords == "none") {
_analyzer->set_stopwords(nullptr);
} else {
_analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
}
_inverted_index_ctx->analyzer = _analyzer.get();
}

View File

@ -120,10 +120,29 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
} else if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_REGEXP_QUERY;
} else if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY;
}
return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
}
/// Splits the query string of a match function into tokens and appends them to
/// `query_tokens`.
///
/// @param query_tokens        output vector; tokens are appended to it.
/// @param inverted_index_ctx  supplies the parser type and the analyzer; it is
///                            dereferenced unconditionally.
/// @param match_query_str     the raw query string.
/// @param column_name         forwarded to InvertedIndexReader::get_analyse_result.
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
                                                InvertedIndexCtx* inverted_index_ctx,
                                                const std::string& match_query_str,
                                                const std::string& column_name) const {
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

    const bool has_parser =
            inverted_index_ctx->parser_type != InvertedIndexParserType::PARSER_NONE;
    if (has_parser) {
        // Run the configured analyzer over the query string; the query type is
        // derived from this function's own name.
        auto token_source = doris::segment_v2::InvertedIndexReader::create_reader(
                inverted_index_ctx, match_query_str);
        doris::segment_v2::InvertedIndexReader::get_analyse_result(
                *query_tokens, token_source.get(), inverted_index_ctx->analyzer, column_name,
                get_query_type_from_fn_name());
    } else {
        // PARSER_NONE: the whole query string is used as the single token.
        query_tokens->emplace_back(match_query_str);
    }
}
inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col, int32_t current_block_row_idx,
@ -134,10 +153,15 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
const auto& str_ref = string_col->get_data_at(current_src_array_offset);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
continue;
}
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());
std::vector<std::string> element_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
@ -145,12 +169,15 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
}
} else {
const auto& str_ref = string_col->get_data_at(current_block_row_idx);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
str_ref.to_string());
doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, reader.get(),
inverted_index_ctx->analyzer,
column_name, query_type, false);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
} else {
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());
doris::segment_v2::InvertedIndexReader::get_analyse_result(
data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
}
}
return data_tokens;
}
@ -177,23 +204,14 @@ Status FunctionMatchAny::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}
@ -224,23 +242,14 @@ Status FunctionMatchAll::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}
@ -277,23 +286,14 @@ Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}
@ -345,25 +345,14 @@ Status FunctionMatchPhrasePrefix::execute_match(
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}
@ -415,18 +404,15 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));
doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
<< inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
if (match_query_str.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

View File

@ -55,6 +55,7 @@ const std::string MATCH_ALL_FUNCTION = "match_all";
const std::string MATCH_PHRASE_FUNCTION = "match_phrase";
const std::string MATCH_PHRASE_PREFIX_FUNCTION = "match_phrase_prefix";
// NOTE: the identifier says PHRASE_REGEXP, but the function name is plain "match_regexp";
// get_query_type_from_fn_name() maps it to MATCH_REGEXP_QUERY.
const std::string MATCH_PHRASE_REGEXP_FUNCTION = "match_regexp";
// Mapped to MATCH_PHRASE_EDGE_QUERY by get_query_type_from_fn_name().
const std::string MATCH_PHRASE_EDGE_FUNCTION = "match_phrase_edge";
class FunctionMatchBase : public IFunction {
public:
@ -81,6 +82,11 @@ public:
doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name() const;
void analyse_query_str_token(std::vector<std::string>* query_tokens,
InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& field_name) const;
std::vector<std::string> analyse_data_token(const std::string& column_name,
InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col,

View File

@ -101,6 +101,18 @@ public class InvertedIndexUtil {
return charFilterMap;
}
/**
 * Reads the parser lowercase setting from inverted index properties.
 *
 * @param properties index properties; may be null
 * @return the value of the lowercase property parsed with
 *         {@link Boolean#parseBoolean(String)}, or {@code true} when the map is
 *         null or the property is not set
 */
public static boolean getInvertedIndexParserLowercase(Map<String, String> properties) {
    if (properties == null) {
        return true;
    }
    String configured = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    // An unset property means the default, which is true.
    return configured == null || Boolean.parseBoolean(configured);
}
/**
 * Reads the parser stopwords setting from inverted index properties.
 *
 * @param properties index properties; may be null
 * @return the configured stopwords value, or the empty string when the map is
 *         null or the property is not set
 */
public static String getInvertedIndexParserStopwords(Map<String, String> properties) {
    if (properties == null) {
        return "";
    }
    String stopwords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
    // An unset property means the default, which is "".
    return stopwords == null ? "" : stopwords;
}
public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType,
Map<String, String> properties) throws AnalysisException {
String parser = null;

View File

@ -150,6 +150,8 @@ public class MatchPredicate extends Predicate {
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
private boolean invertedIndexParserLowercase = true;
private String invertedIndexParserStopwords = "";
public MatchPredicate(Operator op, Expr e1, Expr e2) {
super();
@ -170,23 +172,22 @@ public class MatchPredicate extends Predicate {
invertedIndexParser = other.invertedIndexParser;
invertedIndexParserMode = other.invertedIndexParserMode;
invertedIndexCharFilter = other.invertedIndexCharFilter;
invertedIndexParserLowercase = other.invertedIndexParserLowercase;
invertedIndexParserStopwords = other.invertedIndexParserStopwords;
}
/**
* use for Nereids ONLY
*/
public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode,
Map<String, String> invertedIndexCharFilter) {
NullableMode nullableMode, Index invertedIndex) {
this(op, e1, e2);
if (invertedIndexParser != null) {
this.invertedIndexParser = invertedIndexParser;
}
if (invertedIndexParserMode != null) {
this.invertedIndexParserMode = invertedIndexParserMode;
}
if (invertedIndexParserMode != null) {
this.invertedIndexCharFilter = invertedIndexCharFilter;
if (invertedIndex != null) {
this.invertedIndexParser = invertedIndex.getInvertedIndexParser();
this.invertedIndexParserMode = invertedIndex.getInvertedIndexParserMode();
this.invertedIndexCharFilter = invertedIndex.getInvertedIndexCharFilter();
this.invertedIndexParserLowercase = invertedIndex.getInvertedIndexParserLowercase();
this.invertedIndexParserStopwords = invertedIndex.getInvertedIndexParserStopwords();
}
fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType,
false, true, nullableMode);
@ -220,6 +221,8 @@ public class MatchPredicate extends Predicate {
msg.setOpcode(op.getOpcode());
msg.match_predicate = new TMatchPredicate(invertedIndexParser, invertedIndexParserMode);
msg.match_predicate.setCharFilterMap(invertedIndexCharFilter);
msg.match_predicate.setParserLowercase(invertedIndexParserLowercase);
msg.match_predicate.setParserStopwords(invertedIndexParserStopwords);
}
@Override
@ -264,6 +267,8 @@ public class MatchPredicate extends Predicate {
invertedIndexParser = index.getInvertedIndexParser();
invertedIndexParserMode = index.getInvertedIndexParserMode();
invertedIndexCharFilter = index.getInvertedIndexCharFilter();
invertedIndexParserLowercase = index.getInvertedIndexParserLowercase();
invertedIndexParserStopwords = index.getInvertedIndexParserStopwords();
break;
}
}

View File

@ -158,6 +158,18 @@ public class Index implements Writable {
return InvertedIndexUtil.getInvertedIndexCharFilter(properties);
}
/**
 * Returns the parser lowercase setting of this index, as resolved by
 * {@code InvertedIndexUtil.getInvertedIndexParserLowercase} from this index's properties.
 */
public boolean getInvertedIndexParserLowercase() {
    final boolean lowercase = InvertedIndexUtil.getInvertedIndexParserLowercase(properties);
    return lowercase;
}
/**
 * Returns the parser stopwords setting of this index, as resolved by
 * {@code InvertedIndexUtil.getInvertedIndexParserStopwords} from this index's properties.
 */
public String getInvertedIndexParserStopwords() {
    final String stopwords = InvertedIndexUtil.getInvertedIndexParserStopwords(properties);
    return stopwords;
}
/**
 * Tells whether this index supports light index change.
 *
 * @return {@code true} only when the index type is {@code INVERTED}
 */
public boolean isLightIndexChangeSupported() {
    return IndexDef.IndexType.INVERTED == indexType;
}
/**
 * Returns the comment of this index by delegating to {@code getComment(boolean)}
 * with {@code false}.
 */
public String getComment() {
    return this.getComment(false);
}

View File

@ -32,7 +32,6 @@ import org.apache.doris.analysis.FunctionCallExpr;
import org.apache.doris.analysis.FunctionName;
import org.apache.doris.analysis.FunctionParams;
import org.apache.doris.analysis.IndexDef;
import org.apache.doris.analysis.InvertedIndexUtil;
import org.apache.doris.analysis.IsNullPredicate;
import org.apache.doris.analysis.LambdaFunctionCallExpr;
import org.apache.doris.analysis.LambdaFunctionExpr;
@ -106,9 +105,7 @@ import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
@ -213,9 +210,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
@Override
public Expr visitMatch(Match match, PlanTranslatorContext context) {
String invertedIndexParser = InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN;
String invertedIndexParserMode = InvertedIndexUtil.INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
Map<String, String> invertedIndexCharFilter = new HashMap<>();
Index invertedIndex = null;
// Get the first slot from match's left expr
SlotRef left = (SlotRef) match.left().getInputSlots().stream().findFirst().get().accept(this, context);
OlapTable olapTbl = Optional.ofNullable(getOlapTableFromSlotDesc(left.getDesc()))
@ -231,9 +226,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
if (index.getIndexType() == IndexDef.IndexType.INVERTED) {
List<String> columns = index.getColumns();
if (columns != null && !columns.isEmpty() && left.getColumnName().equals(columns.get(0))) {
invertedIndexParser = index.getInvertedIndexParser();
invertedIndexParserMode = index.getInvertedIndexParserMode();
invertedIndexCharFilter = index.getInvertedIndexCharFilter();
invertedIndex = index;
break;
}
}
@ -243,8 +236,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
MatchPredicate.Operator op = match.op();
MatchPredicate matchPredicate = new MatchPredicate(op, match.left().accept(this, context),
match.right().accept(this, context), match.getDataType().toCatalogDataType(),
NullableMode.DEPEND_ON_ARGUMENT, invertedIndexParser, invertedIndexParserMode,
invertedIndexCharFilter);
NullableMode.DEPEND_ON_ARGUMENT, invertedIndex);
matchPredicate.setNullableFromNereids(match.nullable());
return matchPredicate;
}

View File

@ -159,6 +159,8 @@ struct TMatchPredicate {
1: required string parser_type;
2: required string parser_mode;
3: optional map<string, string> char_filter_map;
4: optional bool parser_lowercase = true;
5: optional string parser_stopwords = "";
}
struct TLiteralPredicate {

View File

@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression test: match predicates must return the same result whether they are
// answered by the inverted index or evaluated without it. The "without index" path is
// forced through the BE debug point "return_inverted_index_bypass".
suite("test_match_without_index", "p0") {
    def testTable = "test_match_without_index"
    sql "DROP TABLE IF EXISTS ${testTable}"
    sql """
CREATE TABLE ${testTable} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` string NULL COMMENT "",
`request` string NULL COMMENT "",
`status` string NULL COMMENT "",
`size` int NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser"="unicode", "lower_case" = "false") COMMENT '',
INDEX status_idx (`status`) USING INVERTED COMMENT '',
INDEX size_idx (`size`) USING INVERTED COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""
    sql """ INSERT INTO ${testTable} VALUES (123, '17.0.0.0', 'HTTP GET', '200', 20); """
    sql """ INSERT INTO ${testTable} VALUES (123, '17.0.0.0', 'Life is like a box of chocolates, you never know what you are going to get.', '200', 20); """

    List<Object> match_res_without_index = new ArrayList<>();
    List<Object> match_res_with_index = new ArrayList<>();

    // Queries executed once with the index bypassed and once with the index in use.
    def create_sql = {
        List<String> list = new ArrayList<>()
        list.add(" select count() from ${testTable} where clientip match_phrase '17' ");
        list.add(" select count() from ${testTable} where clientip match_all '17' ");
        list.add(" select count() from ${testTable} where clientip match_any '17' ");
        list.add(" select count() from ${testTable} where request match_any 'get' ");
        list.add(" select count() from ${testTable} where request match_phrase_prefix 'like box' ");
        return list;
    }

    // Runs every statement in sqlList and appends each result to resultList.
    def execute_sql = { resultList, sqlList ->
        for (sqlStr in sqlList) {
            def sqlResult = sql """ ${sqlStr} """
            resultList.add(sqlResult)
        }
    }

    // Compares the two result lists pairwise; logs the offending statement before failing.
    def compare_result = { executedSql ->
        assertEquals(match_res_without_index.size(), match_res_with_index.size())
        for (int i = 0; i < match_res_without_index.size(); i++) {
            if (match_res_without_index[i] != match_res_with_index[i]) {
                logger.info("sql is {}", executedSql[i])
                logger.info("match_res_without_index is {}", match_res_without_index[i])
                logger.info("match_res_with_index is {}", match_res_with_index[i])
                assertTrue(false)
            }
        }
    }

    def index_sql = create_sql.call()
    try {
        GetDebugPoint().enableDebugPointForAllBEs("return_inverted_index_bypass")
        execute_sql.call(match_res_without_index, index_sql)
    } finally {
        // Cleanup only: always switch the debug point off again. Queries and assertions
        // are kept out of this block so that a failure above is reported as-is instead
        // of being replaced by a secondary assertion failure.
        GetDebugPoint().disableDebugPointForAllBEs("return_inverted_index_bypass")
    }

    // Debug point is off here, so these runs use the inverted index.
    execute_sql.call(match_res_with_index, index_sql)
    compare_result.call(index_sql)
}