[feature](inverted index) Add multi_match function #37722 #38931 #39149 (#38877)

This commit is contained in:
zzzxl
2024-08-10 15:20:08 +08:00
committed by GitHub
parent 80b6345270
commit 5e1e725cee
23 changed files with 864 additions and 65 deletions

View File

@ -72,7 +72,6 @@ class InvertedIndexIterator;
class InvertedIndexQueryCacheHandle;
class InvertedIndexFileReader;
struct InvertedIndexQueryInfo;
class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexReader> {
public:
explicit InvertedIndexReader(
@ -153,6 +152,7 @@ protected:
TabletIndex _index_meta;
bool _has_null = true;
};
using InvertedIndexReaderPtr = std::shared_ptr<InvertedIndexReader>;
class FullTextIndexReader : public InvertedIndexReader {
ENABLE_FACTORY_CREATOR(FullTextIndexReader);
@ -306,6 +306,8 @@ public:
[[nodiscard]] const std::map<string, string>& get_index_properties() const;
[[nodiscard]] bool has_null() { return _reader->has_null(); };
const InvertedIndexReaderPtr& reader() { return _reader; }
private:
OlapReaderStatistics* _stats = nullptr;
RuntimeState* _runtime_state = nullptr;

View File

@ -314,6 +314,7 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) {
for (auto& expr : _remaining_conjunct_roots) {
_calculate_pred_in_remaining_conjunct_root(expr);
}
_calculate_func_in_remaining_conjunct_root();
_column_predicate_info.reset(new ColumnPredicateInfo());
if (_schema->rowid_col_idx() > 0) {
@ -558,6 +559,8 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() {
++it;
}
}
_col_preds_except_leafnode_of_andnode.clear();
compound_func_exprs.clear();
// 1. if all conditions in the compound hit the inverted index and there are no other expr to handle.
// 2. then there is no need to generate index_result_column.
if (_enable_common_expr_pushdown && _remaining_conjunct_roots.empty()) {
@ -807,25 +810,32 @@ Status SegmentIterator::_execute_predicates_except_leafnode_of_andnode(
auto v_literal_expr = std::dynamic_pointer_cast<doris::vectorized::VLiteral>(expr);
_column_predicate_info->query_values.insert(v_literal_expr->value());
} else if (node_type == TExprNodeType::BINARY_PRED || node_type == TExprNodeType::MATCH_PRED ||
node_type == TExprNodeType::IN_PRED) {
if (node_type == TExprNodeType::MATCH_PRED) {
_column_predicate_info->query_op = "match";
} else if (node_type == TExprNodeType::IN_PRED) {
if (expr->op() == TExprOpcode::type::FILTER_IN) {
_column_predicate_info->query_op = "in";
} else {
_column_predicate_info->query_op = "not_in";
}
node_type == TExprNodeType::IN_PRED || node_type == TExprNodeType::FUNCTION_CALL) {
std::string result_sign;
if (node_type == TExprNodeType::FUNCTION_CALL) {
result_sign =
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id());
} else {
_column_predicate_info->query_op = expr->fn().name.function_name;
if (node_type == TExprNodeType::MATCH_PRED) {
_column_predicate_info->query_op = "match";
} else if (node_type == TExprNodeType::IN_PRED) {
if (expr->op() == TExprOpcode::type::FILTER_IN) {
_column_predicate_info->query_op = "in";
} else {
_column_predicate_info->query_op = "not_in";
}
} else {
_column_predicate_info->query_op = expr->fn().name.function_name;
}
result_sign = _gen_predicate_result_sign(_column_predicate_info.get());
}
// get child condition result in compound conditions
auto pred_result_sign = _gen_predicate_result_sign(_column_predicate_info.get());
_column_predicate_info.reset(new ColumnPredicateInfo());
VLOG_DEBUG << "_gen_predicate_result_sign " << pred_result_sign;
if (_rowid_result_for_index.count(pred_result_sign) > 0 &&
_rowid_result_for_index[pred_result_sign].first) {
auto apply_result = _rowid_result_for_index[pred_result_sign].second;
VLOG_DEBUG << "result_sign " << result_sign;
if (_rowid_result_for_index.count(result_sign) > 0 &&
_rowid_result_for_index[result_sign].first) {
auto apply_result = _rowid_result_for_index[result_sign].second;
_pred_except_leafnode_of_andnode_evaluate_result.push_back(apply_result);
}
} else if (node_type == TExprNodeType::COMPOUND_PRED) {
@ -869,7 +879,7 @@ Status SegmentIterator::_execute_compound_fn(const std::string& function_name) {
bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() {
// no compound predicates push down, so no need to filter
if (_col_preds_except_leafnode_of_andnode.size() == 0) {
if (_col_preds_except_leafnode_of_andnode.empty() && compound_func_exprs.empty()) {
return false;
}
for (auto pred : _col_preds_except_leafnode_of_andnode) {
@ -883,6 +893,14 @@ bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() {
return false;
}
}
for (const auto& func_expr_pair : compound_func_exprs) {
const auto& expr = func_expr_pair.first;
std::string pred_result_sign =
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id());
if (!_rowid_result_for_index.contains(pred_result_sign)) {
return false;
}
}
return true;
}
@ -994,6 +1012,16 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
}
}
for (const auto& func_expr_pair : compound_func_exprs) {
const auto& expr = func_expr_pair.first;
const auto& expr_ctx = func_expr_pair.second;
auto result = std::make_shared<roaring::Roaring>();
RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result));
std::string result_sign =
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id());
_rowid_result_for_index.emplace(result_sign, std::make_pair(true, std::move(*result)));
}
return Status::OK();
}
@ -1269,18 +1297,6 @@ Status SegmentIterator::_apply_inverted_index() {
std::vector<ColumnPredicate*> remaining_predicates;
std::set<const ColumnPredicate*> no_need_to_pass_column_predicate_set;
// TODO:Comment out this code before introducing range query functionality
/*for (const auto& entry : _opts.col_id_to_predicates) {
ColumnId column_id = entry.first;
auto pred = entry.second;
bool continue_apply = true;
RETURN_IF_ERROR(_apply_inverted_index_on_block_column_predicate(
column_id, pred.get(), no_need_to_pass_column_predicate_set, &continue_apply));
if (!continue_apply) {
break;
}
}*/
for (auto pred : _col_predicates) {
if (no_need_to_pass_column_predicate_set.count(pred) > 0) {
continue;
@ -1316,6 +1332,23 @@ Status SegmentIterator::_apply_inverted_index() {
}
}
for (const auto& func_expr_pair : no_compound_func_exprs) {
const auto& expr = func_expr_pair.first;
const auto& expr_ctx = func_expr_pair.second;
auto result = std::make_shared<roaring::Roaring>();
RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result));
_row_bitmap &= *result;
for (auto it = _remaining_conjunct_roots.begin(); it != _remaining_conjunct_roots.end();) {
if (*it == expr) {
std::erase_if(_common_expr_ctxs_push_down,
[&it](const auto& iter) { return iter->root() == *it; });
it = _remaining_conjunct_roots.erase(it);
} else {
++it;
}
}
}
_col_predicates = std::move(remaining_predicates);
_opts.stats->rows_inverted_index_filtered += (input_rows - _row_bitmap.cardinality());
return Status::OK();
@ -1392,6 +1425,17 @@ Status SegmentIterator::_init_inverted_index_iterators() {
return Status::OK();
}
Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) {
std::lock_guard lock(_idx_init_lock);
if (_inverted_index_iterators[cid] == nullptr) {
return _segment->new_inverted_index_iterator(
_opts.tablet_schema->column(cid),
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
_opts, &_inverted_index_iterators[cid]);
}
return Status::OK();
}
Status SegmentIterator::_lookup_ordinal(const RowCursor& key, bool is_include, rowid_t upper_bound,
rowid_t* rowid) {
if (_segment->_tablet_schema->keys_type() == UNIQUE_KEYS &&
@ -2822,6 +2866,64 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root(
}
}
void SegmentIterator::_calculate_func_in_remaining_conjunct_root() {
auto hash = [](const vectorized::VExprSPtr& expr) -> std::size_t {
return std::hash<std::string>()(expr->expr_name());
};
auto equal = [](const vectorized::VExprSPtr& lhs, const vectorized::VExprSPtr& rhs) -> bool {
return lhs->equals(*rhs);
};
uint32_t next_id = 0;
std::unordered_map<vectorized::VExprSPtr, uint32_t, decltype(hash), decltype(equal)> unique_map(
0, hash, equal);
auto gen_func_unique_id = [&unique_map, &next_id](const vectorized::VExprSPtr& expr) {
auto it = unique_map.find(expr);
if (it != unique_map.end()) {
return it->second;
} else {
unique_map[expr] = ++next_id;
return next_id;
}
};
for (const auto& root_expr_ctx : _common_expr_ctxs_push_down) {
const auto& root_expr = root_expr_ctx->root();
if (root_expr == nullptr) {
continue;
}
std::stack<std::pair<vectorized::VExprSPtr, bool>> stack;
stack.emplace(root_expr, false);
while (!stack.empty()) {
const auto& [expr, has_compound_pred] = stack.top();
stack.pop();
bool current_has_compound_pred =
has_compound_pred || (expr->node_type() == TExprNodeType::COMPOUND_PRED);
if (expr->node_type() == TExprNodeType::FUNCTION_CALL &&
expr->can_push_down_to_index()) {
expr->set_index_unique_id(gen_func_unique_id(expr));
if (current_has_compound_pred) {
compound_func_exprs.emplace_back(expr, root_expr_ctx);
} else {
no_compound_func_exprs.emplace_back(expr, root_expr_ctx);
}
}
const auto& children = expr->children();
for (int32_t i = children.size() - 1; i >= 0; --i) {
if (!children[i]->children().empty()) {
stack.emplace(children[i], current_has_compound_pred);
}
}
}
}
}
bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column,
size_t nrows_read) {
if (!((_opts.tablet_schema->keys_type() == KeysType::DUP_KEYS ||
@ -2900,5 +3002,23 @@ bool SegmentIterator::_can_opt_topn_reads() const {
return result;
}
Status SegmentIterator::execute_func_expr(const vectorized::VExprSPtr& expr,
const vectorized::VExprContextSPtr& expr_ctx,
std::shared_ptr<roaring::Roaring>& result) {
const auto& expr0 = expr->get_child(0);
if (!expr0 || expr0->node_type() != TExprNodeType::SLOT_REF) {
return Status::RuntimeError("cannot perform index filtering");
}
FuncExprParams params;
auto slot_expr = std::static_pointer_cast<vectorized::VSlotRef>(expr0);
params._column_id = _schema->column_id(slot_expr->column_id());
params._unique_id = _schema->unique_id(slot_expr->column_id());
params._column_name = _opts.tablet_schema->column(params._column_id).name();
params._segment_iterator = this;
return expr->eval_inverted_index(expr_ctx.get(), params, result);
}
} // namespace segment_v2
} // namespace doris

View File

@ -107,6 +107,15 @@ struct ColumnPredicateInfo {
int32_t column_id;
};
class SegmentIterator;
struct FuncExprParams {
ColumnId _column_id = 0;
uint32_t _unique_id = 0;
std::string _column_name;
SegmentIterator* _segment_iterator = nullptr;
std::shared_ptr<roaring::Roaring> result;
};
class SegmentIterator : public RowwiseIterator {
public:
SegmentIterator(std::shared_ptr<Segment> segment, SchemaSPtr schema);
@ -123,6 +132,8 @@ public:
std::vector<RowLocation>* block_row_locations) override;
const Schema& schema() const override { return *_schema; }
Segment& segment() { return *_segment; }
StorageReadOptions& storage_read_options() { return _opts; }
bool is_lazy_materialization_read() const override { return _lazy_materialization_read; }
uint64_t data_id() const override { return _segment->id(); }
RowsetId rowset_id() const { return _segment->rowset_id(); }
@ -142,6 +153,11 @@ public:
return updated;
}
std::vector<std::unique_ptr<InvertedIndexIterator>>& inverted_index_iterators() {
return _inverted_index_iterators;
}
[[nodiscard]] Status _init_inverted_index_iterators(ColumnId cid);
private:
Status _next_batch_internal(vectorized::Block* block);
@ -308,6 +324,7 @@ private:
bool _check_column_pred_all_push_down(const std::string& column_name, bool in_compound = false,
bool is_match = false);
void _calculate_pred_in_remaining_conjunct_root(const vectorized::VExprSPtr& expr);
void _calculate_func_in_remaining_conjunct_root();
// todo(wb) remove this method after RowCursor is removed
void _convert_rowcursor_to_short_key(const RowCursor& key, size_t num_keys) {
@ -387,6 +404,10 @@ private:
bool _can_opt_topn_reads() const;
Status execute_func_expr(const vectorized::VExprSPtr& expr,
const vectorized::VExprContextSPtr& expr_ctx,
std::shared_ptr<roaring::Roaring>& result);
class BitmapRangeIterator;
class BackwardBitmapRangeIterator;
@ -452,6 +473,11 @@ private:
// make a copy of `_opts.column_predicates` in order to make local changes
std::vector<ColumnPredicate*> _col_predicates;
std::vector<ColumnPredicate*> _col_preds_except_leafnode_of_andnode;
using FuncExprPair = std::pair<vectorized::VExprSPtr, vectorized::VExprContextSPtr>;
std::vector<FuncExprPair> no_compound_func_exprs;
std::vector<FuncExprPair> compound_func_exprs;
vectorized::VExprContextSPtrs _common_expr_ctxs_push_down;
bool _enable_common_expr_pushdown = false;
std::vector<vectorized::VExprSPtr> _remaining_conjunct_roots;
@ -493,6 +519,13 @@ private:
std::set<int32_t> _output_columns;
std::unique_ptr<HierarchicalDataReader> _path_reader;
std::vector<uint8_t> _ret_flags;
std::unordered_map<int, std::unordered_map<std::string, bool>>
_column_predicate_inverted_index_status;
std::mutex _idx_init_lock;
};
} // namespace segment_v2

View File

@ -112,8 +112,7 @@ Status VectorizedFnCall::prepare(RuntimeState* state, const RowDescriptor& desc,
}
VExpr::register_function_context(state, context);
_function_name = _fn.name.function_name;
_can_fast_execute = _function->can_fast_execute() && _children.size() == 2 &&
_children[0]->is_slot_ref() && _children[1]->is_literal();
_can_fast_execute = can_fast_execute();
_prepare_finished = true;
return Status::OK();
}
@ -215,4 +214,45 @@ std::string VectorizedFnCall::debug_string(const std::vector<VectorizedFnCall*>&
out << "]";
return out.str();
}
bool VectorizedFnCall::can_push_down_to_index() const {
return _function->can_push_down_to_index();
}
bool VectorizedFnCall::can_fast_execute() const {
auto function_name = _function->get_name();
if (function_name == "eq" || function_name == "ne" || function_name == "lt" ||
function_name == "gt" || function_name == "le" || function_name == "ge") {
if (_children.size() == 2 && _children[0]->is_slot_ref() && _children[1]->is_literal()) {
return true;
}
}
return _function->can_push_down_to_index();
}
Status VectorizedFnCall::eval_inverted_index(VExprContext* context,
segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) {
return _function->eval_inverted_index(context->fn_context(_fn_context_index), params, result);
}
bool VectorizedFnCall::equals(const VExpr& other) {
const auto* other_ptr = dynamic_cast<const VectorizedFnCall*>(&other);
if (!other_ptr) {
return false;
}
if (this->_function_name != other_ptr->_function_name) {
return false;
}
if (this->children().size() != other_ptr->children().size()) {
return false;
}
for (size_t i = 0; i < this->children().size(); i++) {
if (!this->get_child(i)->equals(*other_ptr->get_child(i))) {
return false;
}
}
return true;
}
} // namespace doris::vectorized

View File

@ -66,9 +66,14 @@ public:
}
static std::string debug_string(const std::vector<VectorizedFnCall*>& exprs);
bool can_push_down_to_index() const override;
bool can_fast_execute() const override;
Status eval_inverted_index(VExprContext* context, segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) override;
bool equals(const VExpr& other) override;
protected:
FunctionBasePtr _function;
bool _can_fast_execute = false;
std::string _expr_name;
std::string _function_name;
@ -76,4 +81,5 @@ private:
Status _do_execute(doris::vectorized::VExprContext* context, doris::vectorized::Block* block,
int* result_column_id, std::vector<size_t>& args);
};
} // namespace doris::vectorized

View File

@ -624,24 +624,33 @@ bool VExpr::fast_execute(Block& block, const ColumnNumbers& arguments, size_t re
}
std::string VExpr::gen_predicate_result_sign(Block& block, const ColumnNumbers& arguments,
const std::string& function_name) {
const std::string& function_name) const {
std::string pred_result_sign;
std::string column_name = block.get_by_position(arguments[0]).name;
pred_result_sign +=
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_" + function_name + "_";
if (function_name == "in" || function_name == "not_in") {
// Generating 'result_sign' from 'inlist' requires sorting the values.
std::set<std::string> values;
for (size_t i = 1; i < arguments.size(); i++) {
const auto& entry = block.get_by_position(arguments[i]);
values.insert(entry.type->to_string(*entry.column, 0));
}
pred_result_sign += boost::join(values, ",");
if (this->node_type() == TExprNodeType::FUNCTION_CALL) {
pred_result_sign =
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(this->index_unique_id());
} else {
const auto& entry = block.get_by_position(arguments[1]);
pred_result_sign += entry.type->to_string(*entry.column, 0);
std::string column_name = block.get_by_position(arguments[0]).name;
pred_result_sign +=
BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_" + function_name + "_";
if (function_name == "in" || function_name == "not_in") {
// Generating 'result_sign' from 'inlist' requires sorting the values.
std::set<std::string> values;
for (size_t i = 1; i < arguments.size(); i++) {
const auto& entry = block.get_by_position(arguments[i]);
values.insert(entry.type->to_string(*entry.column, 0));
}
pred_result_sign += boost::join(values, ",");
} else {
const auto& entry = block.get_by_position(arguments[1]);
pred_result_sign += entry.type->to_string(*entry.column, 0);
}
}
return pred_result_sign;
}
bool VExpr::equals(const VExpr& other) {
return false;
}
} // namespace doris::vectorized

View File

@ -226,7 +226,17 @@ public:
size_t input_rows_count, const std::string& function_name);
std::string gen_predicate_result_sign(Block& block, const ColumnNumbers& arguments,
const std::string& function_name);
const std::string& function_name) const;
virtual bool can_push_down_to_index() const { return false; }
virtual bool can_fast_execute() const { return false; }
virtual Status eval_inverted_index(VExprContext* context, segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) {
return Status::NotSupported("Not supported execute_with_inverted_index");
}
virtual bool equals(const VExpr& other);
void set_index_unique_id(uint32_t index_unique_id) { _index_unique_id = index_unique_id; }
uint32_t index_unique_id() const { return _index_unique_id; }
protected:
/// Simple debug string that provides no expr subclass-specific information
@ -292,6 +302,10 @@ protected:
// for concrete classes
bool _prepare_finished = false;
bool _open_finished = false;
// ensuring uniqueness during index traversal
uint32_t _index_unique_id = 0;
bool _can_fast_execute = false;
};
} // namespace vectorized

View File

@ -78,9 +78,7 @@ Status VInPredicate::prepare(RuntimeState* state, const RowDescriptor& desc,
VExpr::register_function_context(state, context);
_prepare_finished = true;
_can_fast_execute = _function->can_fast_execute();
_can_fast_execute = can_fast_execute();
return Status::OK();
}

View File

@ -54,6 +54,7 @@ public:
const FunctionBasePtr function() { return _function; }
bool is_not_in() const { return _is_not_in; };
bool can_fast_execute() const override { return true; }
private:
FunctionBasePtr _function;
@ -61,7 +62,5 @@ private:
const bool _is_not_in;
static const constexpr char* function_name = "in";
bool _can_fast_execute = false;
};
} // namespace doris::vectorized

View File

@ -96,4 +96,25 @@ std::string VLiteral::debug_string() const {
return out.str();
}
bool VLiteral::equals(const VExpr& other) {
const auto* other_ptr = dynamic_cast<const VLiteral*>(&other);
if (!other_ptr) {
return false;
}
if (this->_expr_name != other_ptr->_expr_name) {
return false;
}
if (this->_column_ptr->structure_equals(*other_ptr->_column_ptr)) {
if (this->_column_ptr->size() != other_ptr->_column_ptr->size()) {
return false;
}
for (size_t i = 0; i < this->_column_ptr->size(); i++) {
if (this->_column_ptr->compare_at(i, i, *other_ptr->_column_ptr, -1) != 0) {
return false;
}
}
}
return true;
}
} // namespace doris::vectorized

View File

@ -57,6 +57,8 @@ public:
bool is_literal() const override { return true; }
bool equals(const VExpr& other) override;
protected:
ColumnPtr _column_ptr;
std::string _expr_name;

View File

@ -112,4 +112,21 @@ std::string VSlotRef::debug_string() const {
out << "SlotRef(slot_id=" << _slot_id << VExpr::debug_string() << ")";
return out.str();
}
bool VSlotRef::equals(const VExpr& other) {
if (!VExpr::equals(other)) {
return false;
}
const auto* other_ptr = dynamic_cast<const VSlotRef*>(&other);
if (!other_ptr) {
return false;
}
if (this->_slot_id != other_ptr->_slot_id || this->_column_id != other_ptr->_column_id ||
this->_column_name != other_ptr->_column_name ||
this->_column_label != other_ptr->_column_label) {
return false;
}
return true;
}
} // namespace doris::vectorized

View File

@ -52,6 +52,8 @@ public:
int slot_id() const { return _slot_id; }
bool equals(const VExpr& other) override;
private:
int _slot_id;
int _column_id;

View File

@ -31,6 +31,7 @@
#include "common/exception.h"
#include "common/status.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "udf/udf.h"
#include "vec/core/block.h"
#include "vec/core/column_numbers.h"
@ -40,6 +41,10 @@
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_nullable.h"
namespace doris::segment_v2 {
struct FuncExprParams;
} // namespace doris::segment_v2
namespace doris::vectorized {
#define RETURN_REAL_TYPE_FOR_DATEV2_FUNCTION(TYPE) \
@ -54,6 +59,7 @@ namespace doris::vectorized {
: std::make_shared<TYPE>();
class Field;
class VExpr;
// Only use dispose the variadic argument
template <typename T>
@ -185,8 +191,6 @@ public:
return Status::OK();
}
virtual bool can_fast_execute() const { return false; }
virtual bool is_use_default_implementation_for_constants() const = 0;
/// The property of monotonicity for a certain range.
@ -214,6 +218,13 @@ public:
get_name());
return Monotonicity {};
}
virtual bool can_push_down_to_index() const { return false; }
virtual Status eval_inverted_index(FunctionContext* context, segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) {
return Status::NotSupported("eval_inverted_index is not supported in function: ",
get_name());
}
};
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
@ -485,13 +496,6 @@ public:
return function->close(context, scope);
}
bool can_fast_execute() const override {
auto function_name = function->get_name();
return function_name == "eq" || function_name == "ne" || function_name == "lt" ||
function_name == "gt" || function_name == "le" || function_name == "ge" ||
function_name == "in" || function_name == "not_in";
}
IFunctionBase::Monotonicity get_monotonicity_for_range(const IDataType& type, const Field& left,
const Field& right) const override {
return function->get_monotonicity_for_range(type, left, right);
@ -501,6 +505,12 @@ public:
return function->is_use_default_implementation_for_constants();
}
bool can_push_down_to_index() const override { return function->can_push_down_to_index(); }
Status eval_inverted_index(FunctionContext* context, segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) override {
return function->eval_inverted_index(context, params, result);
}
private:
std::shared_ptr<IFunction> function;
DataTypes arguments;

View File

@ -0,0 +1,194 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/functions/function_multi_match.h"
#include <gen_cpp/PaloBrokerService_types.h>
#include <glog/logging.h>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <roaring/roaring.hh>
#include <string>
#include <vector>
#include "io/fs/file_reader.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
#include "olap/rowset/segment_v2/segment_iterator.h"
#include "runtime/primitive_type.h"
#include "vec/columns/column.h"
#include "vec/data_types/data_type.h"
#include "vec/exprs/varray_literal.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vslot_ref.h"
#include "vec/functions/simple_function_factory.h"
namespace doris::vectorized {
Status FunctionMultiMatch::execute_impl(FunctionContext* /*context*/, Block& block,
const ColumnNumbers& arguments, size_t result,
size_t /*input_rows_count*/) const {
return Status::RuntimeError("only inverted index queries are supported");
}
Status FunctionMultiMatch::open(FunctionContext* context,
FunctionContext::FunctionStateScope scope) {
if (scope == FunctionContext::THREAD_LOCAL) {
return Status::OK();
}
DCHECK(context->get_num_args() == 4);
for (int i = 0; i < context->get_num_args(); ++i) {
DCHECK(is_string_type(context->get_arg_type(i)->type));
}
std::shared_ptr<MatchParam> state = std::make_shared<MatchParam>();
context->set_function_state(scope, state);
for (int i = 0; i < context->get_num_args(); ++i) {
const auto& const_column_ptr = context->get_constant_col(i);
if (const_column_ptr) {
auto const_data = const_column_ptr->column_ptr->get_data_at(0);
switch (i) {
case 1: {
std::string field_names_str = const_data.to_string();
field_names_str.erase(
std::remove_if(field_names_str.begin(), field_names_str.end(),
[](unsigned char c) { return std::isspace(c); }),
field_names_str.end());
std::vector<std::string> field_names;
boost::split(field_names, field_names_str, boost::algorithm::is_any_of(","));
for (const auto& field_name : field_names) {
if (!field_name.empty()) {
state->fields.insert(field_name);
}
}
} break;
case 2:
state->type = const_data.to_string();
break;
case 3:
state->query = const_data.to_string();
break;
default:
break;
}
}
}
return Status::OK();
}
Status FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) {
auto* match_param = reinterpret_cast<MatchParam*>(
context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
if (match_param == nullptr) {
return Status::RuntimeError("function parameter parsing failed");
}
match_param->fields.insert(params._column_name);
const auto& segment_iterator = params._segment_iterator;
const auto& opts = segment_iterator->storage_read_options();
const auto& tablet_schema = opts.tablet_schema;
std::vector<ColumnId> columns_ids;
for (const auto& column_name : match_param->fields) {
auto cid = tablet_schema->field_index(column_name);
if (cid < 0) {
return Status::RuntimeError("column name is incorrect");
}
const auto& column = tablet_schema->column(cid);
if (!is_string_type(column.type())) {
return Status::RuntimeError("column type is incorrect");
}
if (!tablet_schema->has_inverted_index(column)) {
return Status::RuntimeError("column index is incorrect");
}
columns_ids.emplace_back(cid);
}
// query type
InvertedIndexQueryType query_type;
if (match_param->type == "phrase_prefix") {
query_type = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
} else {
return Status::RuntimeError("query type is incorrect");
}
// cache key
roaring::Roaring cids_str;
cids_str.addMany(columns_ids.size(), columns_ids.data());
cids_str.runOptimize();
std::string column_name_binary(cids_str.getSizeInBytes(), 0);
cids_str.write(column_name_binary.data());
InvertedIndexQueryCache::CacheKey cache_key;
io::Path index_path = segment_iterator->segment().file_reader()->path();
cache_key.index_path = index_path.parent_path() / index_path.stem();
cache_key.column_name = column_name_binary;
cache_key.query_type = query_type;
cache_key.value = match_param->query;
// query cache
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
if (cache->lookup(cache_key, &cache_handler)) {
result = cache_handler.get_bitmap();
return Status::OK();
}
// search
for (const auto& column_name : match_param->fields) {
auto cid = tablet_schema->field_index(column_name);
const auto& column = tablet_schema->column(column_name);
auto& index_iterator = segment_iterator->inverted_index_iterators()[cid];
if (!index_iterator) {
RETURN_IF_ERROR(segment_iterator->_init_inverted_index_iterators(cid));
}
const auto& index_reader = index_iterator->reader();
auto single_result = std::make_shared<roaring::Roaring>();
StringRef query_value(match_param->query.data());
auto index_version = tablet_schema->get_inverted_index_storage_format();
if (index_version == InvertedIndexStorageFormatPB::V1) {
RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state, column_name,
&query_value, query_type, single_result));
} else if (index_version == InvertedIndexStorageFormatPB::V2) {
RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state,
std::to_string(column.unique_id()), &query_value,
query_type, single_result));
}
(*result) |= (*single_result);
}
result->runOptimize();
cache->insert(cache_key, result, &cache_handler);
return Status::OK();
}
void register_function_multi_match(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMultiMatch>();
}
} // namespace doris::vectorized

View File

@ -0,0 +1,70 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <boost/algorithm/string/split.hpp>
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_number.h"
#include "vec/functions/function.h"
namespace doris::vectorized {
class MatchParam {
public:
std::string query;
std::set<std::string> fields;
std::string type;
};
class FunctionMultiMatch : public IFunction {
public:
static constexpr auto name = "multi_match";
static FunctionPtr create() { return std::make_shared<FunctionMultiMatch>(); }
using NullMapType = PaddedPODArray<UInt8>;
String get_name() const override { return name; }
bool is_variadic() const override { return false; }
size_t get_number_of_arguments() const override { return 4; }
bool use_default_implementation_for_nulls() const override { return false; }
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
return std::make_shared<DataTypeUInt8>();
}
Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
return Status::OK();
}
Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
size_t result, size_t /*input_rows_count*/) const override;
bool can_push_down_to_index() const override { return true; }
Status eval_inverted_index(FunctionContext* context, segment_v2::FuncExprParams& params,
std::shared_ptr<roaring::Roaring>& result) override;
};
} // namespace doris::vectorized

View File

@ -95,15 +95,14 @@ void register_function_multi_string_position(SimpleFunctionFactory& factory);
void register_function_multi_string_search(SimpleFunctionFactory& factory);
void register_function_width_bucket(SimpleFunctionFactory& factory);
void register_function_ignore(SimpleFunctionFactory& factory);
void register_function_encryption(SimpleFunctionFactory& factory);
void register_function_regexp_extract(SimpleFunctionFactory& factory);
void register_function_hex_variadic(SimpleFunctionFactory& factory);
void register_function_match(SimpleFunctionFactory& factory);
void register_function_tokenize(SimpleFunctionFactory& factory);
void register_function_url(SimpleFunctionFactory& factory);
void register_function_ip(SimpleFunctionFactory& factory);
void register_function_multi_match(SimpleFunctionFactory& factory);
class SimpleFunctionFactory {
using Creator = std::function<FunctionBuilderPtr()>;
@ -286,6 +285,7 @@ public:
register_function_tokenize(instance);
register_function_ignore(instance);
register_function_variant_element(instance);
register_function_multi_match(instance);
});
return instance;
}

View File

@ -298,6 +298,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthName;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatch;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatchAny;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
@ -906,8 +907,8 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(YearWeek.class, "yearweek"),
scalar(YearsAdd.class, "years_add"),
scalar(YearsDiff.class, "years_diff"),
scalar(YearsSub.class, "years_sub")
);
scalar(YearsSub.class, "years_sub"),
scalar(MultiMatch.class, "multi_match"));
public static final BuiltinScalarFunctions INSTANCE = new BuiltinScalarFunctions();

View File

@ -0,0 +1,73 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.nereids.trees.expressions.functions.scalar;
import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNotNullable;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BooleanType;
import org.apache.doris.nereids.types.StringType;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.util.List;
/**
* ScalarFunction 'multi_match'. This class is generated by GenerateFunction.
*/
public class MultiMatch extends ScalarFunction
implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNotNullable {
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
FunctionSignature.ret(BooleanType.INSTANCE)
.args(StringType.INSTANCE,
StringType.INSTANCE,
StringType.INSTANCE,
StringType.INSTANCE)
);
/**
* constructor with 4 arguments.
*/
public MultiMatch(Expression arg0, Expression arg1, Expression arg2, Expression arg3) {
super("multi_match", arg0, arg1, arg2, arg3);
}
/**
* withChildren.
*/
@Override
public MultiMatch withChildren(List<Expression> children) {
Preconditions.checkArgument(children.size() == 4);
return new MultiMatch(children.get(0), children.get(1), children.get(2), children.get(3));
}
@Override
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
return visitor.visitMultiMatch(this, context);
}
@Override
public List<FunctionSignature> getSignatures() {
return SIGNATURES;
}
}

View File

@ -299,6 +299,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthName;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatch;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatchAny;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
@ -2157,4 +2158,8 @@ public interface ScalarFunctionVisitor<R, C> {
default R visitStructElement(StructElement structElement, C context) {
return visitScalarFunction(structElement, context);
}
default R visitMultiMatch(MultiMatch multiMatch, C context) {
return visitScalarFunction(multiMatch, context);
}
}

View File

@ -2206,6 +2206,11 @@ visible_functions = {
[['ignore'], 'BOOLEAN', ['ARRAY_DECIMAL128', '...'], 'ALWAYS_NOT_NULLABLE'],
[['ignore'], 'BOOLEAN', ['ARRAY_VARCHAR', '...'], 'ALWAYS_NOT_NULLABLE'],
[['ignore'], 'BOOLEAN', ['ARRAY_STRING', '...'], 'ALWAYS_NOT_NULLABLE']
],
# multi match functions
"MultiMatch": [
[['multi_match'], 'BOOLEAN', ['STRING', 'STRING', 'STRING', 'STRING'], 'ALWAYS_NOT_NULLABLE']
]
}

View File

@ -0,0 +1,49 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
178
-- !sql --
180
-- !sql --
859
-- !sql --
44
-- !sql --
178
-- !sql --
180
-- !sql --
859
-- !sql --
44
-- !sql --
178
-- !sql --
180
-- !sql --
859
-- !sql --
44
-- !sql --
178
-- !sql --
180
-- !sql --
859
-- !sql --
44

View File

@ -0,0 +1,129 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_index_multi_match", "p0"){
def indexTbName1 = "test_index_multi_match_1"
def indexTbName2 = "test_index_multi_match_2"
def indexTbName3 = "test_index_multi_match_3"
def indexTbName4 = "test_index_multi_match_4"
sql "DROP TABLE IF EXISTS ${indexTbName1}"
sql "DROP TABLE IF EXISTS ${indexTbName2}"
sql "DROP TABLE IF EXISTS ${indexTbName3}"
sql "DROP TABLE IF EXISTS ${indexTbName4}"
def create_table = {table_name, idx_version ->
sql """
CREATE TABLE ${table_name} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` text NULL COMMENT "",
`request` text NULL COMMENT "",
`status` text NULL COMMENT "",
`size` text NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX status_idx (`status`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX size_idx (`size`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"inverted_index_storage_format" = "${idx_version}",
"disable_auto_compaction" = "true"
);
"""
}
def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
expected_succ_rows = -1, load_to_single_tablet = 'true' ->
// load the json data
streamLoad {
table "${table_name}"
// set http request header params
set 'label', label + "_" + UUID.randomUUID().toString()
set 'read_json_by_line', read_flag
set 'format', format_flag
file file_name // import json file
time 10000 // limit inflight 10s
if (expected_succ_rows >= 0) {
set 'max_filter_ratio', '1'
}
// if declared a check callback, the default check condition will ignore.
// So you must check all condition
check { result, exception, startTime, endTime ->
if (ignore_failure && expected_succ_rows < 0) { return }
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
if (expected_succ_rows >= 0) {
assertEquals(json.NumberLoadedRows, expected_succ_rows)
} else {
assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows)
assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
}
}
}
}
try {
create_table(indexTbName1, 'V1')
create_table(indexTbName2, 'V2')
create_table(indexTbName3, 'V1')
create_table(indexTbName4, 'V2')
load_httplogs_data.call(indexTbName1, 'test_index_multi_match_1', 'true', 'json', 'documents-1000.json')
load_httplogs_data.call(indexTbName2, 'test_index_multi_match_2', 'true', 'json', 'documents-1000.json')
load_httplogs_data.call(indexTbName3, 'test_index_multi_match_3', 'true', 'json', 'documents-1000.json')
load_httplogs_data.call(indexTbName4, 'test_index_multi_match_4', 'true', 'json', 'documents-1000.json')
sql "sync"
sql """ set enable_common_expr_pushdown = true """
qt_sql """ select count() from ${indexTbName1} where (clientip match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName1} where (clientip match_phrase_prefix '2' or request match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName1} where (clientip match_phrase_prefix '2' or request match_phrase_prefix '2' or status match_phrase_prefix '2' or size match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName1} where (clientip match_phrase_prefix 'a' or request match_phrase_prefix 'a' or status match_phrase_prefix 'a' or size match_phrase_prefix 'a'); """
qt_sql """ select count() from ${indexTbName2} where (clientip match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName2} where (clientip match_phrase_prefix '2' or request match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName2} where (clientip match_phrase_prefix '2' or request match_phrase_prefix '2' or status match_phrase_prefix '2' or size match_phrase_prefix '2'); """
qt_sql """ select count() from ${indexTbName2} where (clientip match_phrase_prefix 'a' or request match_phrase_prefix 'a' or status match_phrase_prefix 'a' or size match_phrase_prefix 'a'); """
qt_sql """ select count() from ${indexTbName3} where multi_match(clientip, '', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName3} where multi_match(clientip, 'request', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName3} where multi_match(clientip, 'request, status, size', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName3} where multi_match(clientip, 'request, status, size', 'phrase_prefix', 'a'); """
qt_sql """ select count() from ${indexTbName4} where multi_match(clientip, '', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName4} where multi_match(clientip, 'request', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName4} where multi_match(clientip, 'request, status, size', 'phrase_prefix', '2'); """
qt_sql """ select count() from ${indexTbName4} where multi_match(clientip, 'request, status, size', 'phrase_prefix', 'a'); """
} finally {
//try_sql("DROP TABLE IF EXISTS ${testTable}")
}
}