[Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)
And Refactor ColumnRangeValue and OlapScanNode This patch mainly do the following: - Fix issue #5071 - Change type_min in ColumnRangeValue as static - Add Class of type_limit make code clear - Refactor the function of normalize_in_and_eq_predicate
This commit is contained in:
@ -35,6 +35,11 @@ std::string cast_to_string(__int128 value) {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
template <>
|
||||
std::string cast_to_string(int8_t value) {
|
||||
return std::to_string(static_cast<int>(value));
|
||||
}
|
||||
|
||||
template <>
|
||||
void ColumnValueRange<StringValue>::convert_to_fixed_value() {
|
||||
return;
|
||||
|
||||
@ -31,7 +31,7 @@
|
||||
#include "exec/scan_node.h"
|
||||
#include "gen_cpp/PlanNodes_types.h"
|
||||
#include "olap/tuple.h"
|
||||
#include "runtime/datetime_value.h"
|
||||
#include "runtime/type_limit.h"
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/string_value.hpp"
|
||||
|
||||
@ -42,6 +42,11 @@ std::string cast_to_string(T value) {
|
||||
return boost::lexical_cast<std::string>(value);
|
||||
}
|
||||
|
||||
// TYPE_TINYINT should cast to int32_t to first
|
||||
// because it need to convert to num not char for build Olap fetch Query
|
||||
template <>
|
||||
std::string cast_to_string(int8_t);
|
||||
|
||||
/**
|
||||
* @brief Column's value range
|
||||
**/
|
||||
@ -50,7 +55,8 @@ class ColumnValueRange {
|
||||
public:
|
||||
typedef typename std::set<T>::iterator iterator_type;
|
||||
ColumnValueRange();
|
||||
ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max);
|
||||
ColumnValueRange(std::string col_name, PrimitiveType type);
|
||||
ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max);
|
||||
|
||||
// should add fixed value before add range
|
||||
Status add_fixed_value(T value);
|
||||
@ -73,10 +79,12 @@ public:
|
||||
|
||||
bool has_intersection(ColumnValueRange<T>& range);
|
||||
|
||||
void intersection(ColumnValueRange<T>& range);
|
||||
|
||||
void set_empty_value_range() {
|
||||
_fixed_values.clear();
|
||||
_low_value = _type_max;
|
||||
_high_value = _type_min;
|
||||
_low_value = TYPE_MAX;
|
||||
_high_value = TYPE_MIN;
|
||||
}
|
||||
|
||||
const std::set<T>& get_fixed_value_set() const { return _fixed_values; }
|
||||
@ -85,9 +93,9 @@ public:
|
||||
|
||||
T get_range_min_value() const { return _low_value; }
|
||||
|
||||
bool is_low_value_mininum() const { return _low_value == _type_min; }
|
||||
bool is_low_value_mininum() const { return _low_value == TYPE_MIN; }
|
||||
|
||||
bool is_high_value_maximum() const { return _high_value == _type_max; }
|
||||
bool is_high_value_maximum() const { return _high_value == TYPE_MAX; }
|
||||
|
||||
bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; }
|
||||
|
||||
@ -112,7 +120,7 @@ public:
|
||||
}
|
||||
} else {
|
||||
TCondition low;
|
||||
if (_type_min != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
|
||||
if (TYPE_MIN != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
|
||||
low.__set_column_name(_column_name);
|
||||
low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>"));
|
||||
low.condition_values.push_back(cast_to_string(_low_value));
|
||||
@ -123,7 +131,7 @@ public:
|
||||
}
|
||||
|
||||
TCondition high;
|
||||
if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
|
||||
if (TYPE_MAX != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
|
||||
high.__set_column_name(_column_name);
|
||||
high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<"));
|
||||
high.condition_values.push_back(cast_to_string(_high_value));
|
||||
@ -137,20 +145,30 @@ public:
|
||||
|
||||
void clear() {
|
||||
_fixed_values.clear();
|
||||
_low_value = _type_min;
|
||||
_high_value = _type_max;
|
||||
_low_value = TYPE_MIN;
|
||||
_high_value = TYPE_MAX;
|
||||
_low_op = FILTER_LARGER_OR_EQUAL;
|
||||
_high_op = FILTER_LESS_OR_EQUAL;
|
||||
}
|
||||
|
||||
bool is_whole_range() const {
|
||||
return _fixed_values.empty() && _low_value == TYPE_MIN && _high_value == TYPE_MAX &&
|
||||
_low_op == FILTER_LARGER_OR_EQUAL && _high_op == FILTER_LESS_OR_EQUAL;
|
||||
}
|
||||
|
||||
static ColumnValueRange<T> create_empty_column_value_range(PrimitiveType type) {
|
||||
return ColumnValueRange<T>("", type, type_limit<T>::max(), type_limit<T>::min());
|
||||
}
|
||||
|
||||
protected:
|
||||
bool is_in_range(const T& value);
|
||||
|
||||
private:
|
||||
const static T TYPE_MIN; // Column type's min value
|
||||
const static T TYPE_MAX; // Column type's max value
|
||||
|
||||
std::string _column_name;
|
||||
PrimitiveType _column_type; // Column type (eg: TINYINT,SMALLINT,INT,BIGINT)
|
||||
T _type_min; // Column type's min value
|
||||
T _type_max; // Column type's max value
|
||||
T _low_value; // Column's low value, closed interval at left
|
||||
T _high_value; // Column's high value, open interval at right
|
||||
SQLFilterOp _low_op;
|
||||
@ -222,15 +240,22 @@ typedef boost::variant<ColumnValueRange<int8_t>, ColumnValueRange<int16_t>,
|
||||
ColumnValueRange<DecimalV2Value>, ColumnValueRange<bool>>
|
||||
ColumnValueRangeType;
|
||||
|
||||
template <class T>
|
||||
const T ColumnValueRange<T>::TYPE_MIN = type_limit<T>::min();
|
||||
template <class T>
|
||||
const T ColumnValueRange<T>::TYPE_MAX = type_limit<T>::max();
|
||||
|
||||
template <class T>
|
||||
ColumnValueRange<T>::ColumnValueRange() : _column_type(INVALID_TYPE) {}
|
||||
|
||||
template <class T>
|
||||
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max)
|
||||
: _column_name(col_name),
|
||||
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type)
|
||||
: ColumnValueRange(std::move(col_name), type, TYPE_MIN, TYPE_MAX){}
|
||||
|
||||
template <class T>
|
||||
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max)
|
||||
: _column_name(std::move(col_name)),
|
||||
_column_type(type),
|
||||
_type_min(min),
|
||||
_type_max(max),
|
||||
_low_value(min),
|
||||
_high_value(max),
|
||||
_low_op(FILTER_LARGER_OR_EQUAL),
|
||||
@ -325,14 +350,12 @@ void ColumnValueRange<T>::convert_to_fixed_value() {
|
||||
++_low_value;
|
||||
}
|
||||
|
||||
if (_high_op == FILTER_LESS) {
|
||||
for (T v = _low_value; v < _high_value; ++v) {
|
||||
_fixed_values.insert(v);
|
||||
}
|
||||
} else {
|
||||
for (T v = _low_value; v <= _high_value; ++v) {
|
||||
_fixed_values.insert(v);
|
||||
}
|
||||
for (T v = _low_value; v < _high_value; ++v) {
|
||||
_fixed_values.insert(v);
|
||||
}
|
||||
|
||||
if (_high_op == FILTER_LESS_OR_EQUAL) {
|
||||
_fixed_values.insert(_high_value);
|
||||
}
|
||||
}
|
||||
|
||||
@ -391,8 +414,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
|
||||
}
|
||||
}
|
||||
|
||||
_high_value = _type_min;
|
||||
_low_value = _type_max;
|
||||
_high_value = TYPE_MIN;
|
||||
_low_value = TYPE_MAX;
|
||||
} else {
|
||||
if (_high_value > _low_value) {
|
||||
switch (op) {
|
||||
@ -430,7 +453,6 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
|
||||
}
|
||||
|
||||
break;
|
||||
break;
|
||||
}
|
||||
|
||||
default: {
|
||||
@ -442,8 +464,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
|
||||
if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == _high_op &&
|
||||
_high_value == _low_value) {
|
||||
add_fixed_value(_high_value);
|
||||
_high_value = _type_min;
|
||||
_low_value = _type_max;
|
||||
_high_value = TYPE_MIN;
|
||||
_low_value = TYPE_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
@ -495,6 +517,55 @@ bool ColumnValueRange<T>::is_in_range(const T& value) {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ColumnValueRange<T>::intersection(ColumnValueRange<T>& range) {
|
||||
// 1. clear if column type not match
|
||||
if (_column_type != range._column_type) {
|
||||
set_empty_value_range();
|
||||
}
|
||||
|
||||
// 2. clear if any range is empty
|
||||
if (is_empty_value_range() || range.is_empty_value_range()) {
|
||||
set_empty_value_range();
|
||||
}
|
||||
|
||||
std::set<T> result_values;
|
||||
// 3. fixed_value intersection
|
||||
if (is_fixed_value_range() || range.is_fixed_value_range()) {
|
||||
if (is_fixed_value_range() && range.is_fixed_value_range()) {
|
||||
set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(),
|
||||
range._fixed_values.end(),
|
||||
std::inserter(result_values, result_values.begin()));
|
||||
} else if (is_fixed_value_range() && !range.is_fixed_value_range()) {
|
||||
iterator_type iter = _fixed_values.begin();
|
||||
|
||||
while (iter != _fixed_values.end()) {
|
||||
if (range.is_in_range(*iter)) {
|
||||
result_values.insert(*iter);
|
||||
}
|
||||
++iter;
|
||||
}
|
||||
} else if (!is_fixed_value_range() && range.is_fixed_value_range()) {
|
||||
iterator_type iter = range._fixed_values.begin();
|
||||
while (iter != range._fixed_values.end()) {
|
||||
if (this->is_in_range(*iter)) {
|
||||
result_values.insert(*iter);
|
||||
}
|
||||
++iter;
|
||||
}
|
||||
}
|
||||
|
||||
if (!result_values.empty()) {
|
||||
_fixed_values = std::move(result_values);
|
||||
} else {
|
||||
set_empty_value_range();
|
||||
}
|
||||
} else {
|
||||
add_range(range._high_op, range._high_value);
|
||||
add_range(range._low_op, range._low_value);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
bool ColumnValueRange<T>::has_intersection(ColumnValueRange<T>& range) {
|
||||
// 1. return false if column type not match
|
||||
|
||||
@ -30,7 +30,6 @@
|
||||
#include "common/resource_tls.h"
|
||||
#include "exprs/binary_predicate.h"
|
||||
#include "exprs/expr.h"
|
||||
#include "exprs/in_predicate.h"
|
||||
#include "gen_cpp/PlanNodes_types.h"
|
||||
#include "runtime/exec_env.h"
|
||||
#include "runtime/row_batch.h"
|
||||
@ -455,45 +454,37 @@ Status OlapScanNode::normalize_conjuncts() {
|
||||
|
||||
for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
|
||||
switch (slots[slot_idx]->type().type) {
|
||||
// TYPE_TINYINT use int32_t to present
|
||||
// because it's easy to convert to string for build Olap fetch Query
|
||||
case TYPE_TINYINT: {
|
||||
ColumnValueRange<int32_t> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
|
||||
ColumnValueRange<int8_t> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_SMALLINT: {
|
||||
ColumnValueRange<int16_t> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_INT: {
|
||||
ColumnValueRange<int32_t> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_BIGINT: {
|
||||
ColumnValueRange<int64_t> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_LARGEINT: {
|
||||
__int128 min = MIN_INT128;
|
||||
__int128 max = MAX_INT128;
|
||||
ColumnValueRange<__int128> range(slots[slot_idx]->col_name(),
|
||||
slots[slot_idx]->type().type, min, max);
|
||||
slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
@ -501,47 +492,36 @@ Status OlapScanNode::normalize_conjuncts() {
|
||||
case TYPE_CHAR:
|
||||
case TYPE_VARCHAR:
|
||||
case TYPE_HLL: {
|
||||
static char min_char = 0x00;
|
||||
static char max_char = 0xff;
|
||||
ColumnValueRange<StringValue> range(
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
StringValue(&min_char, 0), StringValue(&max_char, 1));
|
||||
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_DATE:
|
||||
case TYPE_DATETIME: {
|
||||
DateTimeValue max_value = DateTimeValue::datetime_max_value();
|
||||
DateTimeValue min_value = DateTimeValue::datetime_min_value();
|
||||
ColumnValueRange<DateTimeValue> range(slots[slot_idx]->col_name(),
|
||||
slots[slot_idx]->type().type, min_value,
|
||||
max_value);
|
||||
slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_DECIMAL: {
|
||||
DecimalValue min = DecimalValue::get_min_decimal();
|
||||
DecimalValue max = DecimalValue::get_max_decimal();
|
||||
ColumnValueRange<DecimalValue> range(slots[slot_idx]->col_name(),
|
||||
slots[slot_idx]->type().type, min, max);
|
||||
slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_DECIMALV2: {
|
||||
DecimalV2Value min = DecimalV2Value::get_min_decimal();
|
||||
DecimalV2Value max = DecimalV2Value::get_max_decimal();
|
||||
ColumnValueRange<DecimalV2Value> range(slots[slot_idx]->col_name(),
|
||||
slots[slot_idx]->type().type, min, max);
|
||||
slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
|
||||
case TYPE_BOOLEAN: {
|
||||
ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
|
||||
false, true);
|
||||
ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
|
||||
normalize_predicate(range, slots[slot_idx]);
|
||||
break;
|
||||
}
|
||||
@ -759,7 +739,10 @@ Status OlapScanNode::normalize_predicate(ColumnValueRange<T>& range, SlotDescrip
|
||||
// 2. Normalize BinaryPredicate , add to ColumnValueRange
|
||||
RETURN_IF_ERROR(normalize_noneq_binary_predicate(slot, &range));
|
||||
|
||||
// 3. Add range to Column->ColumnValueRange map
|
||||
// 3. Check whether range is empty, set _eos
|
||||
if (range.is_empty_value_range()) _eos = true;
|
||||
|
||||
// 4. Add range to Column->ColumnValueRange map
|
||||
_column_value_ranges[slot->col_name()] = range;
|
||||
|
||||
return Status::OK();
|
||||
@ -775,6 +758,142 @@ static bool ignore_cast(SlotDescriptor* slot, Expr* expr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool OlapScanNode::should_push_down_in_predicate(doris::SlotDescriptor *slot, doris::InPredicate* pred) {
|
||||
if (pred->is_not_in()) {
|
||||
// can not push down NOT IN predicate to storage engine
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
|
||||
// not a slot ref(column)
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<SlotId> slot_ids;
|
||||
if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
|
||||
// not a single column predicate
|
||||
return false;
|
||||
}
|
||||
|
||||
if (slot_ids[0] != slot->id()) {
|
||||
// predicate not related to current column
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pred->get_child(0)->type().type != slot->type().type) {
|
||||
if (!ignore_cast(slot, pred->get_child(0))) {
|
||||
// the type of predicate not match the slot's type
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
|
||||
|
||||
// if there are too many elements in InPredicate, exceed the limit,
|
||||
// we will not push any condition of this column to storage engine.
|
||||
// because too many conditions pushed down to storage engine may even
|
||||
// slow down the query process.
|
||||
// ATTN: This is just an experience value. You may need to try
|
||||
// different thresholds to improve performance.
|
||||
if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
|
||||
VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
|
||||
<< _max_pushdown_conditions_per_column;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::pair<bool, void*> OlapScanNode::should_push_down_eq_predicate(doris::SlotDescriptor *slot, doris::Expr *pred,
|
||||
int conj_idx, int child_idx) {
|
||||
auto result_pair = std::make_pair<bool, void*>(false, nullptr);
|
||||
|
||||
// Do not get slot_ref of column, should not push_down to Storage Engine
|
||||
if (Expr::type_without_cast(pred->get_child(child_idx)) !=
|
||||
TExprNodeType::SLOT_REF) {
|
||||
return result_pair;
|
||||
}
|
||||
|
||||
std::vector<SlotId> slot_ids;
|
||||
if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
|
||||
// not a single column predicate
|
||||
return result_pair;
|
||||
}
|
||||
|
||||
if (slot_ids[0] != slot->id()) {
|
||||
// predicate not related to current column
|
||||
return result_pair;
|
||||
}
|
||||
|
||||
if (pred->get_child(child_idx)->type().type != slot->type().type) {
|
||||
if (!ignore_cast(slot, pred->get_child(child_idx))) {
|
||||
// the type of predicate not match the slot's type
|
||||
return result_pair;
|
||||
}
|
||||
}
|
||||
|
||||
Expr* expr = pred->get_child(1 - child_idx);
|
||||
if (!expr->is_constant()) {
|
||||
// only handle constant value
|
||||
return result_pair;
|
||||
}
|
||||
|
||||
// get value in result pair
|
||||
result_pair.second = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
|
||||
// TODO(lhp) push down is null predicate to storage engine
|
||||
// for case: where col = null
|
||||
if (result_pair.second != nullptr) {
|
||||
result_pair.first = true;
|
||||
}
|
||||
return result_pair;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Status OlapScanNode::insert_value_to_range(doris::ColumnValueRange<T>& temp_range, doris::PrimitiveType type, void *value) {
|
||||
switch (type) {
|
||||
case TYPE_TINYINT: {
|
||||
int32_t v = *reinterpret_cast<int8_t*>(value);
|
||||
temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
case TYPE_DATE: {
|
||||
DateTimeValue date_value =
|
||||
*reinterpret_cast<DateTimeValue*>(value);
|
||||
// There is must return empty data in olap_scan_node,
|
||||
// Because data value loss accuracy
|
||||
if (!date_value.check_loss_accuracy_cast_to_date()) {
|
||||
temp_range.add_fixed_value(*reinterpret_cast<T*>(&date_value));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case TYPE_DECIMAL:
|
||||
case TYPE_DECIMALV2:
|
||||
case TYPE_CHAR:
|
||||
case TYPE_VARCHAR:
|
||||
case TYPE_HLL:
|
||||
case TYPE_DATETIME:
|
||||
case TYPE_SMALLINT:
|
||||
case TYPE_INT:
|
||||
case TYPE_BIGINT:
|
||||
case TYPE_LARGEINT: {
|
||||
temp_range.add_fixed_value(*reinterpret_cast<T*>(value));
|
||||
break;
|
||||
}
|
||||
case TYPE_BOOLEAN: {
|
||||
bool v = *reinterpret_cast<bool*>(value);
|
||||
temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
|
||||
<< type << "]";
|
||||
return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Construct the ColumnValueRange for one specified column
|
||||
// It will only handle the InPredicate and eq BinaryPredicate in _conjunct_ctxs.
|
||||
// It will try to push down conditions of that column as much as possible,
|
||||
@ -783,117 +902,38 @@ template <class T>
|
||||
Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
|
||||
ColumnValueRange<T>* range) {
|
||||
std::vector<uint32_t> filter_conjuncts_index;
|
||||
bool meet_eq_binary = false;
|
||||
for (int conj_idx = 0; conj_idx < _conjunct_ctxs.size(); ++conj_idx) {
|
||||
// create empty range as temp range, temp range should do intersection on range
|
||||
auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(range->type());
|
||||
|
||||
// 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
|
||||
if (TExprOpcode::FILTER_IN == _conjunct_ctxs[conj_idx]->root()->op()) {
|
||||
InPredicate* pred = dynamic_cast<InPredicate*>(_conjunct_ctxs[conj_idx]->root());
|
||||
if (pred->is_not_in()) {
|
||||
// can not push down NOT IN predicate to storage engine
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
|
||||
// not a slot ref(column)
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<SlotId> slot_ids;
|
||||
if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
|
||||
// not a single column predicate
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot_ids[0] != slot->id()) {
|
||||
// predicate not related to current column
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pred->get_child(0)->type().type != slot->type().type) {
|
||||
if (!ignore_cast(slot, pred->get_child(0))) {
|
||||
// the type of predicate not match the slot's type
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
|
||||
|
||||
// if there are too many elements in InPredicate, exceed the limit,
|
||||
// we will not push any condition of this column to storage engine.
|
||||
// because too many conditions pushed down to storage engine may even
|
||||
// slow down the query process.
|
||||
// ATTN: This is just an experience value. You may need to try
|
||||
// different thresholds to improve performance.
|
||||
if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
|
||||
VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
|
||||
<< _max_pushdown_conditions_per_column;
|
||||
if (!should_push_down_in_predicate(slot, pred)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// begin to push InPredicate value into ColumnValueRange
|
||||
HybridSetBase::IteratorBase* iter = pred->hybrid_set()->begin();
|
||||
auto skip_invalid_value_count = 0;
|
||||
|
||||
while (iter->has_next()) {
|
||||
// column in (NULL,...) couldn't push down to StorageEngine
|
||||
// so that discard whole ColumnValueRange
|
||||
// so set clear() temp_range to whole range
|
||||
if (NULL == iter->get_value()) {
|
||||
range->clear();
|
||||
temp_range.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
switch (slot->type().type) {
|
||||
case TYPE_TINYINT: {
|
||||
int32_t v = *reinterpret_cast<int8_t*>(const_cast<void*>(iter->get_value()));
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
case TYPE_DATE: {
|
||||
DateTimeValue date_value =
|
||||
*reinterpret_cast<const DateTimeValue*>(iter->get_value());
|
||||
if (date_value.check_loss_accuracy_cast_to_date()) {
|
||||
// There is may return empty data in olap_scan_node,
|
||||
// Because data value loss accuracy, skip this value
|
||||
skip_invalid_value_count++;
|
||||
} else {
|
||||
range->add_fixed_value(*reinterpret_cast<T *>(&date_value));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case TYPE_DECIMAL:
|
||||
case TYPE_DECIMALV2:
|
||||
case TYPE_LARGEINT:
|
||||
case TYPE_CHAR:
|
||||
case TYPE_VARCHAR:
|
||||
case TYPE_HLL:
|
||||
case TYPE_SMALLINT:
|
||||
case TYPE_INT:
|
||||
case TYPE_BIGINT:
|
||||
case TYPE_DATETIME: {
|
||||
range->add_fixed_value(
|
||||
*reinterpret_cast<T*>(const_cast<void*>(iter->get_value())));
|
||||
break;
|
||||
}
|
||||
case TYPE_BOOLEAN: {
|
||||
bool v = *reinterpret_cast<bool*>(const_cast<void*>(iter->get_value()));
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto value = const_cast<void*>(iter->get_value());
|
||||
RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
|
||||
iter->next();
|
||||
}
|
||||
|
||||
// all value in hybrid set in skip, means all in condition
|
||||
// is invalid, so set eos = true
|
||||
if (skip_invalid_value_count == pred->hybrid_set()->size()) {
|
||||
_eos = true;
|
||||
}
|
||||
|
||||
if (is_key_column(slot->col_name())) {
|
||||
filter_conjuncts_index.emplace_back(conj_idx);
|
||||
// only where a in ('a', 'b', NULL) contain NULL will
|
||||
// clear temp_range to whole range, no need do intersection
|
||||
if (!temp_range.is_whole_range()) {
|
||||
if (is_key_column(slot->col_name())) {
|
||||
filter_conjuncts_index.emplace_back(conj_idx);
|
||||
}
|
||||
range->intersection(temp_range);
|
||||
}
|
||||
} // end of handle in predicate
|
||||
|
||||
@ -904,119 +944,27 @@ Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
|
||||
DCHECK(pred->get_num_children() == 2);
|
||||
|
||||
for (int child_idx = 0; child_idx < 2; ++child_idx) {
|
||||
if (Expr::type_without_cast(pred->get_child(child_idx)) !=
|
||||
TExprNodeType::SLOT_REF) {
|
||||
// TODO: should use C++17 structured bindlings to refactor this code in the future:
|
||||
// 'auto [should_push_down, value] = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);'
|
||||
// make code tidier and readabler
|
||||
auto result_pair = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);
|
||||
if (!result_pair.first) {
|
||||
continue;
|
||||
}
|
||||
auto value = result_pair.second;
|
||||
|
||||
std::vector<SlotId> slot_ids;
|
||||
if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
|
||||
// not a single column predicate
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot_ids[0] != slot->id()) {
|
||||
// predicate not related to current column
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pred->get_child(child_idx)->type().type != slot->type().type) {
|
||||
if (!ignore_cast(slot, pred->get_child(child_idx))) {
|
||||
// the type of predicate not match the slot's type
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
Expr* expr = pred->get_child(1 - child_idx);
|
||||
if (!expr->is_constant()) {
|
||||
// only handle constant value
|
||||
continue;
|
||||
}
|
||||
|
||||
void* value = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
|
||||
// for case: where col = null
|
||||
if (value == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// begin to push condition value into ColumnValueRange
|
||||
// clear the ColumnValueRange before adding new fixed values.
|
||||
// because for AND compound predicates, it can overwrite previous conditions
|
||||
range->clear();
|
||||
switch (slot->type().type) {
|
||||
case TYPE_TINYINT: {
|
||||
int32_t v = *reinterpret_cast<int8_t*>(value);
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
case TYPE_DATE: {
|
||||
DateTimeValue date_value =
|
||||
*reinterpret_cast<DateTimeValue*>(value);
|
||||
if (date_value.check_loss_accuracy_cast_to_date()) {
|
||||
// There is must return empty data in olap_scan_node,
|
||||
// Because data value loss accuracy
|
||||
_eos = true;
|
||||
}
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(&date_value));
|
||||
break;
|
||||
}
|
||||
case TYPE_DECIMAL:
|
||||
case TYPE_DECIMALV2:
|
||||
case TYPE_CHAR:
|
||||
case TYPE_VARCHAR:
|
||||
case TYPE_HLL:
|
||||
case TYPE_DATETIME:
|
||||
case TYPE_SMALLINT:
|
||||
case TYPE_INT:
|
||||
case TYPE_BIGINT:
|
||||
case TYPE_LARGEINT: {
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(value));
|
||||
break;
|
||||
}
|
||||
case TYPE_BOOLEAN: {
|
||||
bool v = *reinterpret_cast<bool*>(value);
|
||||
range->add_fixed_value(*reinterpret_cast<T*>(&v));
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
|
||||
<< expr->type() << "]";
|
||||
return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
|
||||
}
|
||||
}
|
||||
RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
|
||||
|
||||
if (is_key_column(slot->col_name())) {
|
||||
filter_conjuncts_index.emplace_back(conj_idx);
|
||||
}
|
||||
meet_eq_binary = true;
|
||||
range->intersection(temp_range);
|
||||
} // end for each binary predicate child
|
||||
} // end of handling eq binary predicate
|
||||
|
||||
|
||||
if (range->get_fixed_value_size() > 0) {
|
||||
// this columns already meet some eq predicates(IN or Binary),
|
||||
// There is no need to continue to iterate.
|
||||
// TODO(cmy): In fact, this part of the judgment should be completed in
|
||||
// the FE query planning stage. For the following predicate conditions,
|
||||
// it should be possible to eliminate at the FE side.
|
||||
// WHERE A = 1 and A in (2,3,4)
|
||||
|
||||
if (meet_eq_binary) {
|
||||
// meet_eq_binary is true, means we meet at least one eq binary predicate.
|
||||
// this flag is to handle following case:
|
||||
// There are 2 conjuncts, first in a InPredicate, and second is a BinaryPredicate.
|
||||
// Firstly, we met a InPredicate, and add lots of values in ColumnValueRange,
|
||||
// if breaks, doris will read many rows filtered by these values.
|
||||
// But if continue to handle the BinaryPredicate, the value in ColumnValueRange
|
||||
// may become only one, which can reduce the rows read from storage engine.
|
||||
// So the strategy is to use the BinaryPredicate as much as possible.
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // end of handling eq binary predicate
|
||||
}
|
||||
|
||||
// exceed limit, no conditions will be pushed down to storage engine.
|
||||
if (range->get_fixed_value_size() > _max_pushdown_conditions_per_column) {
|
||||
// exceed limit, no conditions will be pushed down to storage engine.
|
||||
range->clear();
|
||||
} else {
|
||||
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
|
||||
@ -1158,8 +1106,6 @@ Status OlapScanNode::normalize_noneq_binary_predicate(SlotDescriptor* slot,
|
||||
<< " value: " << *reinterpret_cast<T*>(value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
|
||||
|
||||
@ -26,6 +26,7 @@
|
||||
#include "exec/olap_common.h"
|
||||
#include "exec/olap_scanner.h"
|
||||
#include "exec/scan_node.h"
|
||||
#include "exprs/in_predicate.h"
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/row_batch_interface.hpp"
|
||||
#include "runtime/vectorized_row_batch.h"
|
||||
@ -172,6 +173,13 @@ private:
|
||||
void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot,
|
||||
const std::string& is_null_str);
|
||||
|
||||
bool should_push_down_in_predicate(SlotDescriptor* slot, InPredicate* in_pred);
|
||||
|
||||
std::pair<bool, void*> should_push_down_eq_predicate(SlotDescriptor* slot, Expr* pred, int conj_idx, int child_idx);
|
||||
|
||||
template <typename T>
|
||||
static Status insert_value_to_range(ColumnValueRange<T>& range, PrimitiveType type, void* value);
|
||||
|
||||
friend class OlapScanner;
|
||||
|
||||
std::vector<TCondition> _is_null_vector;
|
||||
|
||||
@ -26,7 +26,7 @@
|
||||
|
||||
#include "common/status.h"
|
||||
#include "exec/exec_node.h"
|
||||
#include "exec/olap_common.h"
|
||||
#include "exec/olap_utils.h"
|
||||
#include "exprs/expr.h"
|
||||
#include "gen_cpp/PaloInternalService_types.h"
|
||||
#include "gen_cpp/PlanNodes_types.h"
|
||||
|
||||
@ -59,8 +59,6 @@ static uint32_t calc_days_in_year(uint32_t year) {
|
||||
return is_leap(year) ? 366 : 365;
|
||||
}
|
||||
|
||||
DateTimeValue DateTimeValue::_s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
|
||||
DateTimeValue DateTimeValue::_s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
|
||||
RE2 DateTimeValue::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$");
|
||||
|
||||
bool DateTimeValue::check_range() const {
|
||||
|
||||
@ -426,9 +426,15 @@ public:
|
||||
return std::string(buf, end - buf);
|
||||
}
|
||||
|
||||
static DateTimeValue datetime_min_value() { return _s_min_datetime_value; }
|
||||
|
||||
static DateTimeValue datetime_max_value() { return _s_max_datetime_value; }
|
||||
static DateTimeValue datetime_min_value() {
|
||||
static DateTimeValue _s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
|
||||
return _s_min_datetime_value;
|
||||
}
|
||||
|
||||
static DateTimeValue datetime_max_value() {
|
||||
static DateTimeValue _s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
|
||||
return _s_max_datetime_value;
|
||||
}
|
||||
|
||||
int64_t second_diff(const DateTimeValue& rhs) const {
|
||||
int day_diff = daynr() - rhs.daynr();
|
||||
@ -542,8 +548,6 @@ private:
|
||||
_day(day),
|
||||
_microsecond(microsecond) {}
|
||||
|
||||
static DateTimeValue _s_min_datetime_value;
|
||||
static DateTimeValue _s_max_datetime_value;
|
||||
// RE2 obj is thread safe
|
||||
static RE2 time_zone_offset_format_reg;
|
||||
};
|
||||
|
||||
@ -39,4 +39,16 @@ std::size_t operator-(const StringValue& v1, const StringValue& v2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
constexpr char StringValue::MIN_CHAR = 0x00;
|
||||
|
||||
constexpr char StringValue::MAX_CHAR = 0xff;
|
||||
|
||||
StringValue StringValue::min_string_val() {
|
||||
return StringValue((char*)(&StringValue::MIN_CHAR), 0);
|
||||
}
|
||||
|
||||
StringValue StringValue::max_string_val() {
|
||||
return StringValue((char*)(&StringValue::MAX_CHAR), 1);
|
||||
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
@ -29,6 +29,9 @@ namespace doris {
|
||||
// The returned StringValue of all functions that return StringValue
|
||||
// shares its buffer the parent.
|
||||
struct StringValue {
|
||||
const static char MIN_CHAR;
|
||||
const static char MAX_CHAR;
|
||||
|
||||
static const int MAX_LENGTH = (1 << 30);
|
||||
// TODO: change ptr to an offset relative to a contiguous memory block,
|
||||
// so that we can send row batches between nodes without having to swizzle
|
||||
@ -104,6 +107,10 @@ struct StringValue {
|
||||
static StringValue from_string_val(const doris_udf::StringVal& sv) {
|
||||
return StringValue(reinterpret_cast<char*>(sv.ptr), sv.len);
|
||||
}
|
||||
|
||||
static StringValue min_string_val();
|
||||
|
||||
static StringValue max_string_val();
|
||||
};
|
||||
|
||||
// This function must be called 'hash_value' to be picked up by boost.
|
||||
|
||||
80
be/src/runtime/type_limit.h
Normal file
80
be/src/runtime/type_limit.h
Normal file
@ -0,0 +1,80 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#ifndef DORIS_BE_RUNTIME_TYPE_LIMIT_H
|
||||
#define DORIS_BE_RUNTIME_TYPE_LIMIT_H
|
||||
|
||||
#include "runtime/datetime_value.h"
|
||||
#include "runtime/decimal_value.h"
|
||||
#include "runtime/decimalv2_value.h"
|
||||
#include "runtime/string_value.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
template <typename T>
|
||||
struct type_limit {
|
||||
static T min() {
|
||||
return std::numeric_limits<T>::min();
|
||||
}
|
||||
static T max() {
|
||||
return std::numeric_limits<T>::max();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_limit<StringValue> {
|
||||
static StringValue min() {
|
||||
return StringValue::min_string_val();
|
||||
}
|
||||
static StringValue max() {
|
||||
return StringValue::max_string_val();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_limit<DecimalValue> {
|
||||
static DecimalValue min() {
|
||||
return DecimalValue::get_min_decimal();
|
||||
}
|
||||
static DecimalValue max() {
|
||||
return DecimalValue::get_max_decimal();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_limit<DecimalV2Value> {
|
||||
static DecimalV2Value min() {
|
||||
return DecimalV2Value::get_min_decimal();
|
||||
}
|
||||
static DecimalV2Value max() {
|
||||
return DecimalV2Value::get_max_decimal();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_limit<DateTimeValue> {
|
||||
static DateTimeValue min() {
|
||||
return DateTimeValue::datetime_min_value();
|
||||
}
|
||||
static DateTimeValue max() {
|
||||
return DateTimeValue::datetime_max_value();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace doris
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user