// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "vec/functions/match.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" #include "runtime/query_context.h" #include "runtime/runtime_state.h" namespace doris::vectorized { Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) { auto match_query_str = block.get_by_position(arguments[1]).to_string(0); std::string column_name = block.get_by_position(arguments[0]).name; auto match_pred_column_name = BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; if (!block.has(match_pred_column_name)) { VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name << ", match_query_str=" << match_query_str; InvertedIndexCtx* inverted_index_ctx = reinterpret_cast( context->get_function_state(FunctionContext::THREAD_LOCAL)); const auto values_col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); const auto* values = check_and_get_column(values_col.get()); if (!values) { return Status::InternalError("Not supported input arguments types"); } // result column auto res = ColumnUInt8::create(); ColumnUInt8::Container& vec_res = res->get_data(); // set default value to 0, and match functions only need to set 1/true vec_res.resize_fill(input_rows_count); RETURN_IF_ERROR(execute_match(column_name, match_query_str, input_rows_count, values, inverted_index_ctx, vec_res)); block.replace_by_position(result, std::move(res)); } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); block.replace_by_position(result, std::move(match_pred_column)); } return Status::OK(); } Status FunctionMatchAny::execute_match(const std::string& column_name, const std::string& match_query_str, size_t input_rows_count, const ColumnString* datas, InvertedIndexCtx* inverted_index_ctx, ColumnUInt8::Container& result) { doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; if (inverted_index_ctx) { parser_type = inverted_index_ctx->parser_type; } VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); std::vector query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, inverted_index_ctx); for (int i = 0; i < input_rows_count; i++) { const auto& str_ref = datas->get_data_at(i); std::vector data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, inverted_index_ctx); // TODO: more efficient impl for (auto& token : query_tokens) { auto it = std::find(data_tokens.begin(), data_tokens.end(), token); if (it != data_tokens.end()) { result[i] = true; break; } } } return Status::OK(); } Status FunctionMatchAll::execute_match(const std::string& column_name, const std::string& match_query_str, size_t input_rows_count, const ColumnString* datas, InvertedIndexCtx* inverted_index_ctx, ColumnUInt8::Container& result) { doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; if (inverted_index_ctx) { parser_type = inverted_index_ctx->parser_type; } VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); std::vector query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, inverted_index_ctx); for (int i = 0; i < input_rows_count; i++) { const auto& str_ref = datas->get_data_at(i); std::vector data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, inverted_index_ctx); // TODO: more efficient impl auto find_count = 0; for (auto& token : query_tokens) { auto it = std::find(data_tokens.begin(), data_tokens.end(), token); if (it != data_tokens.end()) { ++find_count; } else { break; } } if (find_count == query_tokens.size()) { result[i] = true; } } return Status::OK(); } Status FunctionMatchPhrase::execute_match(const std::string& column_name, const std::string& match_query_str, size_t input_rows_count, const ColumnString* datas, InvertedIndexCtx* inverted_index_ctx, ColumnUInt8::Container& result) { doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; if (inverted_index_ctx) { parser_type = inverted_index_ctx->parser_type; } VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, parser_type: " << inverted_index_parser_type_to_string(parser_type); std::vector query_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, inverted_index_ctx); for (int i = 0; i < input_rows_count; i++) { const auto& str_ref = datas->get_data_at(i); std::vector data_tokens = doris::segment_v2::InvertedIndexReader::get_analyse_result( column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, inverted_index_ctx); // TODO: more efficient impl bool matched = false; auto it = data_tokens.begin(); while (it != data_tokens.end()) { // find position of first token it = std::find(it, data_tokens.end(), query_tokens[0]); if (it != data_tokens.end()) { matched = true; it++; auto it_more = it; // compare query_tokens after the first to data_tokens one by one for (size_t idx = 1; idx < query_tokens.size(); idx++) { if (it_more == data_tokens.end() || *it_more != query_tokens[idx]) { matched = false; } it_more++; } if (matched) { break; } } } // check matched if (matched) { result[i] = true; } } return Status::OK(); } void register_function_match(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); } } // namespace doris::vectorized