diff --git a/be/src/exec/olap_common.cpp b/be/src/exec/olap_common.cpp index 6d51fb6071..54c584eef8 100644 --- a/be/src/exec/olap_common.cpp +++ b/be/src/exec/olap_common.cpp @@ -35,6 +35,11 @@ std::string cast_to_string(__int128 value) { return ss.str(); } +template <> +std::string cast_to_string(int8_t value) { + return std::to_string(static_cast(value)); +} + template <> void ColumnValueRange::convert_to_fixed_value() { return; diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index b4cf31ee0b..c952f7bfc0 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -31,7 +31,7 @@ #include "exec/scan_node.h" #include "gen_cpp/PlanNodes_types.h" #include "olap/tuple.h" -#include "runtime/datetime_value.h" +#include "runtime/type_limit.h" #include "runtime/descriptors.h" #include "runtime/string_value.hpp" @@ -42,6 +42,11 @@ std::string cast_to_string(T value) { return boost::lexical_cast(value); } +// TYPE_TINYINT should cast to int32_t to first +// because it need to convert to num not char for build Olap fetch Query +template <> +std::string cast_to_string(int8_t); + /** * @brief Column's value range **/ @@ -50,7 +55,8 @@ class ColumnValueRange { public: typedef typename std::set::iterator iterator_type; ColumnValueRange(); - ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max); + ColumnValueRange(std::string col_name, PrimitiveType type); + ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max); // should add fixed value before add range Status add_fixed_value(T value); @@ -73,10 +79,12 @@ public: bool has_intersection(ColumnValueRange& range); + void intersection(ColumnValueRange& range); + void set_empty_value_range() { _fixed_values.clear(); - _low_value = _type_max; - _high_value = _type_min; + _low_value = TYPE_MAX; + _high_value = TYPE_MIN; } const std::set& get_fixed_value_set() const { return _fixed_values; } @@ -85,9 +93,9 @@ public: T get_range_min_value() const { return _low_value; } - bool is_low_value_mininum() const { return _low_value == _type_min; } + bool is_low_value_mininum() const { return _low_value == TYPE_MIN; } - bool is_high_value_maximum() const { return _high_value == _type_max; } + bool is_high_value_maximum() const { return _high_value == TYPE_MAX; } bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; } @@ -112,7 +120,7 @@ public: } } else { TCondition low; - if (_type_min != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) { + if (TYPE_MIN != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) { low.__set_column_name(_column_name); low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>")); low.condition_values.push_back(cast_to_string(_low_value)); @@ -123,7 +131,7 @@ public: } TCondition high; - if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) { + if (TYPE_MAX != _high_value || FILTER_LESS_OR_EQUAL != _high_op) { high.__set_column_name(_column_name); high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<")); high.condition_values.push_back(cast_to_string(_high_value)); @@ -137,20 +145,30 @@ public: void clear() { _fixed_values.clear(); - _low_value = _type_min; - _high_value = _type_max; + _low_value = TYPE_MIN; + _high_value = TYPE_MAX; _low_op = FILTER_LARGER_OR_EQUAL; _high_op = FILTER_LESS_OR_EQUAL; } + bool is_whole_range() const { + return _fixed_values.empty() && _low_value == TYPE_MIN && _high_value == TYPE_MAX && + _low_op == FILTER_LARGER_OR_EQUAL && _high_op == FILTER_LESS_OR_EQUAL; + } + + static ColumnValueRange create_empty_column_value_range(PrimitiveType type) { + return ColumnValueRange("", type, type_limit::max(), type_limit::min()); + } + protected: bool is_in_range(const T& value); private: + const static T TYPE_MIN; // Column type's min value + const static T TYPE_MAX; // Column type's max value + std::string _column_name; PrimitiveType _column_type; // Column type (eg: TINYINT,SMALLINT,INT,BIGINT) - T _type_min; // Column type's min value - T _type_max; // Column type's max value T _low_value; // Column's low value, closed interval at left T _high_value; // Column's high value, open interval at right SQLFilterOp _low_op; @@ -222,15 +240,22 @@ typedef boost::variant, ColumnValueRange, ColumnValueRange, ColumnValueRange> ColumnValueRangeType; +template +const T ColumnValueRange::TYPE_MIN = type_limit::min(); +template +const T ColumnValueRange::TYPE_MAX = type_limit::max(); + template ColumnValueRange::ColumnValueRange() : _column_type(INVALID_TYPE) {} template -ColumnValueRange::ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max) - : _column_name(col_name), +ColumnValueRange::ColumnValueRange(std::string col_name, PrimitiveType type) + : ColumnValueRange(std::move(col_name), type, TYPE_MIN, TYPE_MAX){} + +template +ColumnValueRange::ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max) + : _column_name(std::move(col_name)), _column_type(type), - _type_min(min), - _type_max(max), _low_value(min), _high_value(max), _low_op(FILTER_LARGER_OR_EQUAL), @@ -325,14 +350,12 @@ void ColumnValueRange::convert_to_fixed_value() { ++_low_value; } - if (_high_op == FILTER_LESS) { - for (T v = _low_value; v < _high_value; ++v) { - _fixed_values.insert(v); - } - } else { - for (T v = _low_value; v <= _high_value; ++v) { - _fixed_values.insert(v); - } + for (T v = _low_value; v < _high_value; ++v) { + _fixed_values.insert(v); + } + + if (_high_op == FILTER_LESS_OR_EQUAL) { + _fixed_values.insert(_high_value); } } @@ -391,8 +414,8 @@ Status ColumnValueRange::add_range(SQLFilterOp op, T value) { } } - _high_value = _type_min; - _low_value = _type_max; + _high_value = TYPE_MIN; + _low_value = TYPE_MAX; } else { if (_high_value > _low_value) { switch (op) { @@ -430,7 +453,6 @@ Status ColumnValueRange::add_range(SQLFilterOp op, T value) { } break; - break; } default: { @@ -442,8 +464,8 @@ Status ColumnValueRange::add_range(SQLFilterOp op, T value) { if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == _high_op && _high_value == _low_value) { add_fixed_value(_high_value); - _high_value = _type_min; - _low_value = _type_max; + _high_value = TYPE_MIN; + _low_value = TYPE_MAX; } } @@ -495,6 +517,55 @@ bool ColumnValueRange::is_in_range(const T& value) { return false; } +template +void ColumnValueRange::intersection(ColumnValueRange& range) { + // 1. clear if column type not match + if (_column_type != range._column_type) { + set_empty_value_range(); + } + + // 2. clear if any range is empty + if (is_empty_value_range() || range.is_empty_value_range()) { + set_empty_value_range(); + } + + std::set result_values; + // 3. fixed_value intersection + if (is_fixed_value_range() || range.is_fixed_value_range()) { + if (is_fixed_value_range() && range.is_fixed_value_range()) { + set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(), + range._fixed_values.end(), + std::inserter(result_values, result_values.begin())); + } else if (is_fixed_value_range() && !range.is_fixed_value_range()) { + iterator_type iter = _fixed_values.begin(); + + while (iter != _fixed_values.end()) { + if (range.is_in_range(*iter)) { + result_values.insert(*iter); + } + ++iter; + } + } else if (!is_fixed_value_range() && range.is_fixed_value_range()) { + iterator_type iter = range._fixed_values.begin(); + while (iter != range._fixed_values.end()) { + if (this->is_in_range(*iter)) { + result_values.insert(*iter); + } + ++iter; + } + } + + if (!result_values.empty()) { + _fixed_values = std::move(result_values); + } else { + set_empty_value_range(); + } + } else { + add_range(range._high_op, range._high_value); + add_range(range._low_op, range._low_value); + } +} + template bool ColumnValueRange::has_intersection(ColumnValueRange& range) { // 1. return false if column type not match diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index ce79fb56fc..445f88c34e 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -30,7 +30,6 @@ #include "common/resource_tls.h" #include "exprs/binary_predicate.h" #include "exprs/expr.h" -#include "exprs/in_predicate.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/exec_env.h" #include "runtime/row_batch.h" @@ -455,45 +454,37 @@ Status OlapScanNode::normalize_conjuncts() { for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) { switch (slots[slot_idx]->type().type) { - // TYPE_TINYINT use int32_t to present - // because it's easy to convert to string for build Olap fetch Query case TYPE_TINYINT: { - ColumnValueRange range( - slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - std::numeric_limits::min(), std::numeric_limits::max()); + ColumnValueRange range( + slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_SMALLINT: { ColumnValueRange range( - slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - std::numeric_limits::min(), std::numeric_limits::max()); + slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_INT: { ColumnValueRange range( - slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - std::numeric_limits::min(), std::numeric_limits::max()); + slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_BIGINT: { ColumnValueRange range( - slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - std::numeric_limits::min(), std::numeric_limits::max()); + slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_LARGEINT: { - __int128 min = MIN_INT128; - __int128 max = MAX_INT128; ColumnValueRange<__int128> range(slots[slot_idx]->col_name(), - slots[slot_idx]->type().type, min, max); + slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } @@ -501,47 +492,36 @@ Status OlapScanNode::normalize_conjuncts() { case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_HLL: { - static char min_char = 0x00; - static char max_char = 0xff; ColumnValueRange range( - slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - StringValue(&min_char, 0), StringValue(&max_char, 1)); + slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_DATE: case TYPE_DATETIME: { - DateTimeValue max_value = DateTimeValue::datetime_max_value(); - DateTimeValue min_value = DateTimeValue::datetime_min_value(); ColumnValueRange range(slots[slot_idx]->col_name(), - slots[slot_idx]->type().type, min_value, - max_value); + slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_DECIMAL: { - DecimalValue min = DecimalValue::get_min_decimal(); - DecimalValue max = DecimalValue::get_max_decimal(); ColumnValueRange range(slots[slot_idx]->col_name(), - slots[slot_idx]->type().type, min, max); + slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_DECIMALV2: { - DecimalV2Value min = DecimalV2Value::get_min_decimal(); - DecimalV2Value max = DecimalV2Value::get_max_decimal(); ColumnValueRange range(slots[slot_idx]->col_name(), - slots[slot_idx]->type().type, min, max); + slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } case TYPE_BOOLEAN: { - ColumnValueRange range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type, - false, true); + ColumnValueRange range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type); normalize_predicate(range, slots[slot_idx]); break; } @@ -759,7 +739,10 @@ Status OlapScanNode::normalize_predicate(ColumnValueRange& range, SlotDescrip // 2. Normalize BinaryPredicate , add to ColumnValueRange RETURN_IF_ERROR(normalize_noneq_binary_predicate(slot, &range)); - // 3. Add range to Column->ColumnValueRange map + // 3. Check whether range is empty, set _eos + if (range.is_empty_value_range()) _eos = true; + + // 4. Add range to Column->ColumnValueRange map _column_value_ranges[slot->col_name()] = range; return Status::OK(); @@ -775,6 +758,142 @@ static bool ignore_cast(SlotDescriptor* slot, Expr* expr) { return false; } + +bool OlapScanNode::should_push_down_in_predicate(doris::SlotDescriptor *slot, doris::InPredicate* pred) { + if (pred->is_not_in()) { + // can not push down NOT IN predicate to storage engine + return false; + } + + if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { + // not a slot ref(column) + return false; + } + + std::vector slot_ids; + if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) { + // not a single column predicate + return false; + } + + if (slot_ids[0] != slot->id()) { + // predicate not related to current column + return false; + } + + if (pred->get_child(0)->type().type != slot->type().type) { + if (!ignore_cast(slot, pred->get_child(0))) { + // the type of predicate not match the slot's type + return false; + } + } + + VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size(); + + // if there are too many elements in InPredicate, exceed the limit, + // we will not push any condition of this column to storage engine. + // because too many conditions pushed down to storage engine may even + // slow down the query process. + // ATTN: This is just an experience value. You may need to try + // different thresholds to improve performance. + if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) { + VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit " + << _max_pushdown_conditions_per_column; + return false; + } + + return true; +} + +std::pair OlapScanNode::should_push_down_eq_predicate(doris::SlotDescriptor *slot, doris::Expr *pred, + int conj_idx, int child_idx) { + auto result_pair = std::make_pair(false, nullptr); + + // Do not get slot_ref of column, should not push_down to Storage Engine + if (Expr::type_without_cast(pred->get_child(child_idx)) != + TExprNodeType::SLOT_REF) { + return result_pair; + } + + std::vector slot_ids; + if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) { + // not a single column predicate + return result_pair; + } + + if (slot_ids[0] != slot->id()) { + // predicate not related to current column + return result_pair; + } + + if (pred->get_child(child_idx)->type().type != slot->type().type) { + if (!ignore_cast(slot, pred->get_child(child_idx))) { + // the type of predicate not match the slot's type + return result_pair; + } + } + + Expr* expr = pred->get_child(1 - child_idx); + if (!expr->is_constant()) { + // only handle constant value + return result_pair; + } + + // get value in result pair + result_pair.second = _conjunct_ctxs[conj_idx]->get_value(expr, NULL); + // TODO(lhp) push down is null predicate to storage engine + // for case: where col = null + if (result_pair.second != nullptr) { + result_pair.first = true; + } + return result_pair; +} + +template +Status OlapScanNode::insert_value_to_range(doris::ColumnValueRange& temp_range, doris::PrimitiveType type, void *value) { + switch (type) { + case TYPE_TINYINT: { + int32_t v = *reinterpret_cast(value); + temp_range.add_fixed_value(*reinterpret_cast(&v)); + break; + } + case TYPE_DATE: { + DateTimeValue date_value = + *reinterpret_cast(value); + // There is must return empty data in olap_scan_node, + // Because data value loss accuracy + if (!date_value.check_loss_accuracy_cast_to_date()) { + temp_range.add_fixed_value(*reinterpret_cast(&date_value)); + } + break; + } + case TYPE_DECIMAL: + case TYPE_DECIMALV2: + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_HLL: + case TYPE_DATETIME: + case TYPE_SMALLINT: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_LARGEINT: { + temp_range.add_fixed_value(*reinterpret_cast(value)); + break; + } + case TYPE_BOOLEAN: { + bool v = *reinterpret_cast(value); + temp_range.add_fixed_value(*reinterpret_cast(&v)); + break; + } + default: { + LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type=" + << type << "]"; + return Status::InternalError("Normalize filter fail, Unsupported Primitive type"); + } + } + return Status::OK(); +} + // Construct the ColumnValueRange for one specified column // It will only handle the InPredicate and eq BinaryPredicate in _conjunct_ctxs. // It will try to push down conditions of that column as much as possible, @@ -783,117 +902,38 @@ template Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot, ColumnValueRange* range) { std::vector filter_conjuncts_index; - bool meet_eq_binary = false; for (int conj_idx = 0; conj_idx < _conjunct_ctxs.size(); ++conj_idx) { + // create empty range as temp range, temp range should do intersection on range + auto temp_range = ColumnValueRange::create_empty_column_value_range(range->type()); + // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)' if (TExprOpcode::FILTER_IN == _conjunct_ctxs[conj_idx]->root()->op()) { InPredicate* pred = dynamic_cast(_conjunct_ctxs[conj_idx]->root()); - if (pred->is_not_in()) { - // can not push down NOT IN predicate to storage engine - continue; - } - - if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) { - // not a slot ref(column) - continue; - } - - std::vector slot_ids; - if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) { - // not a single column predicate - continue; - } - - if (slot_ids[0] != slot->id()) { - // predicate not related to current column - continue; - } - - if (pred->get_child(0)->type().type != slot->type().type) { - if (!ignore_cast(slot, pred->get_child(0))) { - // the type of predicate not match the slot's type - continue; - } - } - - VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size(); - - // if there are too many elements in InPredicate, exceed the limit, - // we will not push any condition of this column to storage engine. - // because too many conditions pushed down to storage engine may even - // slow down the query process. - // ATTN: This is just an experience value. You may need to try - // different thresholds to improve performance. - if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) { - VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit " - << _max_pushdown_conditions_per_column; + if (!should_push_down_in_predicate(slot, pred)) { continue; } // begin to push InPredicate value into ColumnValueRange HybridSetBase::IteratorBase* iter = pred->hybrid_set()->begin(); - auto skip_invalid_value_count = 0; - while (iter->has_next()) { // column in (NULL,...) couldn't push down to StorageEngine - // so that discard whole ColumnValueRange + // so set clear() temp_range to whole range if (NULL == iter->get_value()) { - range->clear(); + temp_range.clear(); break; } - - switch (slot->type().type) { - case TYPE_TINYINT: { - int32_t v = *reinterpret_cast(const_cast(iter->get_value())); - range->add_fixed_value(*reinterpret_cast(&v)); - break; - } - case TYPE_DATE: { - DateTimeValue date_value = - *reinterpret_cast(iter->get_value()); - if (date_value.check_loss_accuracy_cast_to_date()) { - // There is may return empty data in olap_scan_node, - // Because data value loss accuracy, skip this value - skip_invalid_value_count++; - } else { - range->add_fixed_value(*reinterpret_cast(&date_value)); - } - break; - } - case TYPE_DECIMAL: - case TYPE_DECIMALV2: - case TYPE_LARGEINT: - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_SMALLINT: - case TYPE_INT: - case TYPE_BIGINT: - case TYPE_DATETIME: { - range->add_fixed_value( - *reinterpret_cast(const_cast(iter->get_value()))); - break; - } - case TYPE_BOOLEAN: { - bool v = *reinterpret_cast(const_cast(iter->get_value())); - range->add_fixed_value(*reinterpret_cast(&v)); - break; - } - default: { - break; - } - } + auto value = const_cast(iter->get_value()); + RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value)); iter->next(); } - // all value in hybrid set in skip, means all in condition - // is invalid, so set eos = true - if (skip_invalid_value_count == pred->hybrid_set()->size()) { - _eos = true; - } - - if (is_key_column(slot->col_name())) { - filter_conjuncts_index.emplace_back(conj_idx); + // only where a in ('a', 'b', NULL) contain NULL will + // clear temp_range to whole range, no need do intersection + if (!temp_range.is_whole_range()) { + if (is_key_column(slot->col_name())) { + filter_conjuncts_index.emplace_back(conj_idx); + } + range->intersection(temp_range); } } // end of handle in predicate @@ -904,119 +944,27 @@ Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot, DCHECK(pred->get_num_children() == 2); for (int child_idx = 0; child_idx < 2; ++child_idx) { - if (Expr::type_without_cast(pred->get_child(child_idx)) != - TExprNodeType::SLOT_REF) { + // TODO: should use C++17 structured bindlings to refactor this code in the future: + // 'auto [should_push_down, value] = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);' + // make code tidier and readabler + auto result_pair = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx); + if (!result_pair.first) { continue; } + auto value = result_pair.second; - std::vector slot_ids; - if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) { - // not a single column predicate - continue; - } - - if (slot_ids[0] != slot->id()) { - // predicate not related to current column - continue; - } - - if (pred->get_child(child_idx)->type().type != slot->type().type) { - if (!ignore_cast(slot, pred->get_child(child_idx))) { - // the type of predicate not match the slot's type - continue; - } - } - - Expr* expr = pred->get_child(1 - child_idx); - if (!expr->is_constant()) { - // only handle constant value - continue; - } - - void* value = _conjunct_ctxs[conj_idx]->get_value(expr, NULL); - // for case: where col = null - if (value == NULL) { - continue; - } - - // begin to push condition value into ColumnValueRange - // clear the ColumnValueRange before adding new fixed values. - // because for AND compound predicates, it can overwrite previous conditions - range->clear(); - switch (slot->type().type) { - case TYPE_TINYINT: { - int32_t v = *reinterpret_cast(value); - range->add_fixed_value(*reinterpret_cast(&v)); - break; - } - case TYPE_DATE: { - DateTimeValue date_value = - *reinterpret_cast(value); - if (date_value.check_loss_accuracy_cast_to_date()) { - // There is must return empty data in olap_scan_node, - // Because data value loss accuracy - _eos = true; - } - range->add_fixed_value(*reinterpret_cast(&date_value)); - break; - } - case TYPE_DECIMAL: - case TYPE_DECIMALV2: - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_DATETIME: - case TYPE_SMALLINT: - case TYPE_INT: - case TYPE_BIGINT: - case TYPE_LARGEINT: { - range->add_fixed_value(*reinterpret_cast(value)); - break; - } - case TYPE_BOOLEAN: { - bool v = *reinterpret_cast(value); - range->add_fixed_value(*reinterpret_cast(&v)); - break; - } - default: { - LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type=" - << expr->type() << "]"; - return Status::InternalError("Normalize filter fail, Unsupported Primitive type"); - } - } + RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value)); if (is_key_column(slot->col_name())) { filter_conjuncts_index.emplace_back(conj_idx); } - meet_eq_binary = true; + range->intersection(temp_range); } // end for each binary predicate child - } // end of handling eq binary predicate - - - if (range->get_fixed_value_size() > 0) { - // this columns already meet some eq predicates(IN or Binary), - // There is no need to continue to iterate. - // TODO(cmy): In fact, this part of the judgment should be completed in - // the FE query planning stage. For the following predicate conditions, - // it should be possible to eliminate at the FE side. - // WHERE A = 1 and A in (2,3,4) - - if (meet_eq_binary) { - // meet_eq_binary is true, means we meet at least one eq binary predicate. - // this flag is to handle following case: - // There are 2 conjuncts, first in a InPredicate, and second is a BinaryPredicate. - // Firstly, we met a InPredicate, and add lots of values in ColumnValueRange, - // if breaks, doris will read many rows filtered by these values. - // But if continue to handle the BinaryPredicate, the value in ColumnValueRange - // may become only one, which can reduce the rows read from storage engine. - // So the strategy is to use the BinaryPredicate as much as possible. - break; - } - } + } // end of handling eq binary predicate } + // exceed limit, no conditions will be pushed down to storage engine. if (range->get_fixed_value_size() > _max_pushdown_conditions_per_column) { - // exceed limit, no conditions will be pushed down to storage engine. range->clear(); } else { std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(), @@ -1158,8 +1106,6 @@ Status OlapScanNode::normalize_noneq_binary_predicate(SlotDescriptor* slot, << " value: " << *reinterpret_cast(value); } } - - } std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(), diff --git a/be/src/exec/olap_scan_node.h b/be/src/exec/olap_scan_node.h index 0f5cbb6fe4..80e0f68f2d 100644 --- a/be/src/exec/olap_scan_node.h +++ b/be/src/exec/olap_scan_node.h @@ -26,6 +26,7 @@ #include "exec/olap_common.h" #include "exec/olap_scanner.h" #include "exec/scan_node.h" +#include "exprs/in_predicate.h" #include "runtime/descriptors.h" #include "runtime/row_batch_interface.hpp" #include "runtime/vectorized_row_batch.h" @@ -172,6 +173,13 @@ private: void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot, const std::string& is_null_str); + bool should_push_down_in_predicate(SlotDescriptor* slot, InPredicate* in_pred); + + std::pair should_push_down_eq_predicate(SlotDescriptor* slot, Expr* pred, int conj_idx, int child_idx); + + template + static Status insert_value_to_range(ColumnValueRange& range, PrimitiveType type, void* value); + friend class OlapScanner; std::vector _is_null_vector; diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index 7f5e432c6c..3dc3881a67 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -26,7 +26,7 @@ #include "common/status.h" #include "exec/exec_node.h" -#include "exec/olap_common.h" +#include "exec/olap_utils.h" #include "exprs/expr.h" #include "gen_cpp/PaloInternalService_types.h" #include "gen_cpp/PlanNodes_types.h" diff --git a/be/src/runtime/datetime_value.cpp b/be/src/runtime/datetime_value.cpp index 65e96f2887..9b58370836 100644 --- a/be/src/runtime/datetime_value.cpp +++ b/be/src/runtime/datetime_value.cpp @@ -59,8 +59,6 @@ static uint32_t calc_days_in_year(uint32_t year) { return is_leap(year) ? 366 : 365; } -DateTimeValue DateTimeValue::_s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1); -DateTimeValue DateTimeValue::_s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31); RE2 DateTimeValue::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$"); bool DateTimeValue::check_range() const { diff --git a/be/src/runtime/datetime_value.h b/be/src/runtime/datetime_value.h index d092cc9817..8751b93766 100644 --- a/be/src/runtime/datetime_value.h +++ b/be/src/runtime/datetime_value.h @@ -426,9 +426,15 @@ public: return std::string(buf, end - buf); } - static DateTimeValue datetime_min_value() { return _s_min_datetime_value; } - - static DateTimeValue datetime_max_value() { return _s_max_datetime_value; } + static DateTimeValue datetime_min_value() { + static DateTimeValue _s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1); + return _s_min_datetime_value; + } + + static DateTimeValue datetime_max_value() { + static DateTimeValue _s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31); + return _s_max_datetime_value; + } int64_t second_diff(const DateTimeValue& rhs) const { int day_diff = daynr() - rhs.daynr(); @@ -542,8 +548,6 @@ private: _day(day), _microsecond(microsecond) {} - static DateTimeValue _s_min_datetime_value; - static DateTimeValue _s_max_datetime_value; // RE2 obj is thread safe static RE2 time_zone_offset_format_reg; }; diff --git a/be/src/runtime/string_value.cpp b/be/src/runtime/string_value.cpp index f4c15d3fa6..6ae245b98c 100644 --- a/be/src/runtime/string_value.cpp +++ b/be/src/runtime/string_value.cpp @@ -39,4 +39,16 @@ std::size_t operator-(const StringValue& v1, const StringValue& v2) { return 0; } +constexpr char StringValue::MIN_CHAR = 0x00; + +constexpr char StringValue::MAX_CHAR = 0xff; + +StringValue StringValue::min_string_val() { + return StringValue((char*)(&StringValue::MIN_CHAR), 0); +} + +StringValue StringValue::max_string_val() { + return StringValue((char*)(&StringValue::MAX_CHAR), 1); +} + } // namespace doris diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h index 3f9e3a3516..160ff179f9 100644 --- a/be/src/runtime/string_value.h +++ b/be/src/runtime/string_value.h @@ -29,6 +29,9 @@ namespace doris { // The returned StringValue of all functions that return StringValue // shares its buffer the parent. struct StringValue { + const static char MIN_CHAR; + const static char MAX_CHAR; + static const int MAX_LENGTH = (1 << 30); // TODO: change ptr to an offset relative to a contiguous memory block, // so that we can send row batches between nodes without having to swizzle @@ -104,6 +107,10 @@ struct StringValue { static StringValue from_string_val(const doris_udf::StringVal& sv) { return StringValue(reinterpret_cast(sv.ptr), sv.len); } + + static StringValue min_string_val(); + + static StringValue max_string_val(); }; // This function must be called 'hash_value' to be picked up by boost. diff --git a/be/src/runtime/type_limit.h b/be/src/runtime/type_limit.h new file mode 100644 index 0000000000..62247298cf --- /dev/null +++ b/be/src/runtime/type_limit.h @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DORIS_BE_RUNTIME_TYPE_LIMIT_H +#define DORIS_BE_RUNTIME_TYPE_LIMIT_H + +#include "runtime/datetime_value.h" +#include "runtime/decimal_value.h" +#include "runtime/decimalv2_value.h" +#include "runtime/string_value.h" + +namespace doris { + +template +struct type_limit { + static T min() { + return std::numeric_limits::min(); + } + static T max() { + return std::numeric_limits::max(); + } +}; + +template <> +struct type_limit { + static StringValue min() { + return StringValue::min_string_val(); + } + static StringValue max() { + return StringValue::max_string_val(); + } +}; + +template <> +struct type_limit { + static DecimalValue min() { + return DecimalValue::get_min_decimal(); + } + static DecimalValue max() { + return DecimalValue::get_max_decimal(); + } +}; + +template <> +struct type_limit { + static DecimalV2Value min() { + return DecimalV2Value::get_min_decimal(); + } + static DecimalV2Value max() { + return DecimalV2Value::get_max_decimal(); + } +}; + +template <> +struct type_limit { + static DateTimeValue min() { + return DateTimeValue::datetime_min_value(); + } + static DateTimeValue max() { + return DateTimeValue::datetime_max_value(); + } +}; + +} // namespace doris + +#endif