[Bug] Fix the bug of where condition a in ('A', 'B', 'V') and a in ('A') return error result (#5072)

And Refactor ColumnRangeValue and OlapScanNode

This patch mainly do the following:
- Fix issue #5071
- Change type_min in ColumnRangeValue as static
- Add Class of type_limit make code clear
- Refactor the function of normalize_in_and_eq_predicate
This commit is contained in:
HappenLee
2020-12-15 09:29:10 +08:00
committed by GitHub
parent 90e7f7005e
commit 0a0e46fd53
10 changed files with 398 additions and 267 deletions

View File

@ -35,6 +35,11 @@ std::string cast_to_string(__int128 value) {
return ss.str();
}
template <>
std::string cast_to_string(int8_t value) {
return std::to_string(static_cast<int>(value));
}
template <>
void ColumnValueRange<StringValue>::convert_to_fixed_value() {
return;

View File

@ -31,7 +31,7 @@
#include "exec/scan_node.h"
#include "gen_cpp/PlanNodes_types.h"
#include "olap/tuple.h"
#include "runtime/datetime_value.h"
#include "runtime/type_limit.h"
#include "runtime/descriptors.h"
#include "runtime/string_value.hpp"
@ -42,6 +42,11 @@ std::string cast_to_string(T value) {
return boost::lexical_cast<std::string>(value);
}
// TYPE_TINYINT should cast to int32_t to first
// because it need to convert to num not char for build Olap fetch Query
template <>
std::string cast_to_string(int8_t);
/**
* @brief Column's value range
**/
@ -50,7 +55,8 @@ class ColumnValueRange {
public:
typedef typename std::set<T>::iterator iterator_type;
ColumnValueRange();
ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max);
ColumnValueRange(std::string col_name, PrimitiveType type);
ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max);
// should add fixed value before add range
Status add_fixed_value(T value);
@ -73,10 +79,12 @@ public:
bool has_intersection(ColumnValueRange<T>& range);
void intersection(ColumnValueRange<T>& range);
void set_empty_value_range() {
_fixed_values.clear();
_low_value = _type_max;
_high_value = _type_min;
_low_value = TYPE_MAX;
_high_value = TYPE_MIN;
}
const std::set<T>& get_fixed_value_set() const { return _fixed_values; }
@ -85,9 +93,9 @@ public:
T get_range_min_value() const { return _low_value; }
bool is_low_value_mininum() const { return _low_value == _type_min; }
bool is_low_value_mininum() const { return _low_value == TYPE_MIN; }
bool is_high_value_maximum() const { return _high_value == _type_max; }
bool is_high_value_maximum() const { return _high_value == TYPE_MAX; }
bool is_begin_include() const { return _low_op == FILTER_LARGER_OR_EQUAL; }
@ -112,7 +120,7 @@ public:
}
} else {
TCondition low;
if (_type_min != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
if (TYPE_MIN != _low_value || FILTER_LARGER_OR_EQUAL != _low_op) {
low.__set_column_name(_column_name);
low.__set_condition_op((_low_op == FILTER_LARGER_OR_EQUAL ? ">=" : ">>"));
low.condition_values.push_back(cast_to_string(_low_value));
@ -123,7 +131,7 @@ public:
}
TCondition high;
if (_type_max != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
if (TYPE_MAX != _high_value || FILTER_LESS_OR_EQUAL != _high_op) {
high.__set_column_name(_column_name);
high.__set_condition_op((_high_op == FILTER_LESS_OR_EQUAL ? "<=" : "<<"));
high.condition_values.push_back(cast_to_string(_high_value));
@ -137,20 +145,30 @@ public:
void clear() {
_fixed_values.clear();
_low_value = _type_min;
_high_value = _type_max;
_low_value = TYPE_MIN;
_high_value = TYPE_MAX;
_low_op = FILTER_LARGER_OR_EQUAL;
_high_op = FILTER_LESS_OR_EQUAL;
}
bool is_whole_range() const {
return _fixed_values.empty() && _low_value == TYPE_MIN && _high_value == TYPE_MAX &&
_low_op == FILTER_LARGER_OR_EQUAL && _high_op == FILTER_LESS_OR_EQUAL;
}
static ColumnValueRange<T> create_empty_column_value_range(PrimitiveType type) {
return ColumnValueRange<T>("", type, type_limit<T>::max(), type_limit<T>::min());
}
protected:
bool is_in_range(const T& value);
private:
const static T TYPE_MIN; // Column type's min value
const static T TYPE_MAX; // Column type's max value
std::string _column_name;
PrimitiveType _column_type; // Column type (eg: TINYINT,SMALLINT,INT,BIGINT)
T _type_min; // Column type's min value
T _type_max; // Column type's max value
T _low_value; // Column's low value, closed interval at left
T _high_value; // Column's high value, open interval at right
SQLFilterOp _low_op;
@ -222,15 +240,22 @@ typedef boost::variant<ColumnValueRange<int8_t>, ColumnValueRange<int16_t>,
ColumnValueRange<DecimalV2Value>, ColumnValueRange<bool>>
ColumnValueRangeType;
template <class T>
const T ColumnValueRange<T>::TYPE_MIN = type_limit<T>::min();
template <class T>
const T ColumnValueRange<T>::TYPE_MAX = type_limit<T>::max();
template <class T>
ColumnValueRange<T>::ColumnValueRange() : _column_type(INVALID_TYPE) {}
template <class T>
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, T min, T max)
: _column_name(col_name),
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type)
: ColumnValueRange(std::move(col_name), type, TYPE_MIN, TYPE_MAX){}
template <class T>
ColumnValueRange<T>::ColumnValueRange(std::string col_name, PrimitiveType type, const T& min, const T& max)
: _column_name(std::move(col_name)),
_column_type(type),
_type_min(min),
_type_max(max),
_low_value(min),
_high_value(max),
_low_op(FILTER_LARGER_OR_EQUAL),
@ -325,14 +350,12 @@ void ColumnValueRange<T>::convert_to_fixed_value() {
++_low_value;
}
if (_high_op == FILTER_LESS) {
for (T v = _low_value; v < _high_value; ++v) {
_fixed_values.insert(v);
}
} else {
for (T v = _low_value; v <= _high_value; ++v) {
_fixed_values.insert(v);
}
for (T v = _low_value; v < _high_value; ++v) {
_fixed_values.insert(v);
}
if (_high_op == FILTER_LESS_OR_EQUAL) {
_fixed_values.insert(_high_value);
}
}
@ -391,8 +414,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
}
}
_high_value = _type_min;
_low_value = _type_max;
_high_value = TYPE_MIN;
_low_value = TYPE_MAX;
} else {
if (_high_value > _low_value) {
switch (op) {
@ -430,7 +453,6 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
}
break;
break;
}
default: {
@ -442,8 +464,8 @@ Status ColumnValueRange<T>::add_range(SQLFilterOp op, T value) {
if (FILTER_LARGER_OR_EQUAL == _low_op && FILTER_LESS_OR_EQUAL == _high_op &&
_high_value == _low_value) {
add_fixed_value(_high_value);
_high_value = _type_min;
_low_value = _type_max;
_high_value = TYPE_MIN;
_low_value = TYPE_MAX;
}
}
@ -495,6 +517,55 @@ bool ColumnValueRange<T>::is_in_range(const T& value) {
return false;
}
template <class T>
void ColumnValueRange<T>::intersection(ColumnValueRange<T>& range) {
// 1. clear if column type not match
if (_column_type != range._column_type) {
set_empty_value_range();
}
// 2. clear if any range is empty
if (is_empty_value_range() || range.is_empty_value_range()) {
set_empty_value_range();
}
std::set<T> result_values;
// 3. fixed_value intersection
if (is_fixed_value_range() || range.is_fixed_value_range()) {
if (is_fixed_value_range() && range.is_fixed_value_range()) {
set_intersection(_fixed_values.begin(), _fixed_values.end(), range._fixed_values.begin(),
range._fixed_values.end(),
std::inserter(result_values, result_values.begin()));
} else if (is_fixed_value_range() && !range.is_fixed_value_range()) {
iterator_type iter = _fixed_values.begin();
while (iter != _fixed_values.end()) {
if (range.is_in_range(*iter)) {
result_values.insert(*iter);
}
++iter;
}
} else if (!is_fixed_value_range() && range.is_fixed_value_range()) {
iterator_type iter = range._fixed_values.begin();
while (iter != range._fixed_values.end()) {
if (this->is_in_range(*iter)) {
result_values.insert(*iter);
}
++iter;
}
}
if (!result_values.empty()) {
_fixed_values = std::move(result_values);
} else {
set_empty_value_range();
}
} else {
add_range(range._high_op, range._high_value);
add_range(range._low_op, range._low_value);
}
}
template <class T>
bool ColumnValueRange<T>::has_intersection(ColumnValueRange<T>& range) {
// 1. return false if column type not match

View File

@ -30,7 +30,6 @@
#include "common/resource_tls.h"
#include "exprs/binary_predicate.h"
#include "exprs/expr.h"
#include "exprs/in_predicate.h"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/exec_env.h"
#include "runtime/row_batch.h"
@ -455,45 +454,37 @@ Status OlapScanNode::normalize_conjuncts() {
for (int slot_idx = 0; slot_idx < slots.size(); ++slot_idx) {
switch (slots[slot_idx]->type().type) {
// TYPE_TINYINT use int32_t to present
// because it's easy to convert to string for build Olap fetch Query
case TYPE_TINYINT: {
ColumnValueRange<int32_t> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
ColumnValueRange<int8_t> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_SMALLINT: {
ColumnValueRange<int16_t> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
std::numeric_limits<int16_t>::min(), std::numeric_limits<int16_t>::max());
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_INT: {
ColumnValueRange<int32_t> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max());
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_BIGINT: {
ColumnValueRange<int64_t> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_LARGEINT: {
__int128 min = MIN_INT128;
__int128 max = MAX_INT128;
ColumnValueRange<__int128> range(slots[slot_idx]->col_name(),
slots[slot_idx]->type().type, min, max);
slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
@ -501,47 +492,36 @@ Status OlapScanNode::normalize_conjuncts() {
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL: {
static char min_char = 0x00;
static char max_char = 0xff;
ColumnValueRange<StringValue> range(
slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
StringValue(&min_char, 0), StringValue(&max_char, 1));
slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DATE:
case TYPE_DATETIME: {
DateTimeValue max_value = DateTimeValue::datetime_max_value();
DateTimeValue min_value = DateTimeValue::datetime_min_value();
ColumnValueRange<DateTimeValue> range(slots[slot_idx]->col_name(),
slots[slot_idx]->type().type, min_value,
max_value);
slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DECIMAL: {
DecimalValue min = DecimalValue::get_min_decimal();
DecimalValue max = DecimalValue::get_max_decimal();
ColumnValueRange<DecimalValue> range(slots[slot_idx]->col_name(),
slots[slot_idx]->type().type, min, max);
slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_DECIMALV2: {
DecimalV2Value min = DecimalV2Value::get_min_decimal();
DecimalV2Value max = DecimalV2Value::get_max_decimal();
ColumnValueRange<DecimalV2Value> range(slots[slot_idx]->col_name(),
slots[slot_idx]->type().type, min, max);
slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
case TYPE_BOOLEAN: {
ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type,
false, true);
ColumnValueRange<bool> range(slots[slot_idx]->col_name(), slots[slot_idx]->type().type);
normalize_predicate(range, slots[slot_idx]);
break;
}
@ -759,7 +739,10 @@ Status OlapScanNode::normalize_predicate(ColumnValueRange<T>& range, SlotDescrip
// 2. Normalize BinaryPredicate , add to ColumnValueRange
RETURN_IF_ERROR(normalize_noneq_binary_predicate(slot, &range));
// 3. Add range to Column->ColumnValueRange map
// 3. Check whether range is empty, set _eos
if (range.is_empty_value_range()) _eos = true;
// 4. Add range to Column->ColumnValueRange map
_column_value_ranges[slot->col_name()] = range;
return Status::OK();
@ -775,6 +758,142 @@ static bool ignore_cast(SlotDescriptor* slot, Expr* expr) {
return false;
}
bool OlapScanNode::should_push_down_in_predicate(doris::SlotDescriptor *slot, doris::InPredicate* pred) {
if (pred->is_not_in()) {
// can not push down NOT IN predicate to storage engine
return false;
}
if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
// not a slot ref(column)
return false;
}
std::vector<SlotId> slot_ids;
if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
// not a single column predicate
return false;
}
if (slot_ids[0] != slot->id()) {
// predicate not related to current column
return false;
}
if (pred->get_child(0)->type().type != slot->type().type) {
if (!ignore_cast(slot, pred->get_child(0))) {
// the type of predicate not match the slot's type
return false;
}
}
VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
// if there are too many elements in InPredicate, exceed the limit,
// we will not push any condition of this column to storage engine.
// because too many conditions pushed down to storage engine may even
// slow down the query process.
// ATTN: This is just an experience value. You may need to try
// different thresholds to improve performance.
if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
<< _max_pushdown_conditions_per_column;
return false;
}
return true;
}
std::pair<bool, void*> OlapScanNode::should_push_down_eq_predicate(doris::SlotDescriptor *slot, doris::Expr *pred,
int conj_idx, int child_idx) {
auto result_pair = std::make_pair<bool, void*>(false, nullptr);
// Do not get slot_ref of column, should not push_down to Storage Engine
if (Expr::type_without_cast(pred->get_child(child_idx)) !=
TExprNodeType::SLOT_REF) {
return result_pair;
}
std::vector<SlotId> slot_ids;
if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
// not a single column predicate
return result_pair;
}
if (slot_ids[0] != slot->id()) {
// predicate not related to current column
return result_pair;
}
if (pred->get_child(child_idx)->type().type != slot->type().type) {
if (!ignore_cast(slot, pred->get_child(child_idx))) {
// the type of predicate not match the slot's type
return result_pair;
}
}
Expr* expr = pred->get_child(1 - child_idx);
if (!expr->is_constant()) {
// only handle constant value
return result_pair;
}
// get value in result pair
result_pair.second = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
// TODO(lhp) push down is null predicate to storage engine
// for case: where col = null
if (result_pair.second != nullptr) {
result_pair.first = true;
}
return result_pair;
}
template <typename T>
Status OlapScanNode::insert_value_to_range(doris::ColumnValueRange<T>& temp_range, doris::PrimitiveType type, void *value) {
switch (type) {
case TYPE_TINYINT: {
int32_t v = *reinterpret_cast<int8_t*>(value);
temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
case TYPE_DATE: {
DateTimeValue date_value =
*reinterpret_cast<DateTimeValue*>(value);
// There is must return empty data in olap_scan_node,
// Because data value loss accuracy
if (!date_value.check_loss_accuracy_cast_to_date()) {
temp_range.add_fixed_value(*reinterpret_cast<T*>(&date_value));
}
break;
}
case TYPE_DECIMAL:
case TYPE_DECIMALV2:
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_DATETIME:
case TYPE_SMALLINT:
case TYPE_INT:
case TYPE_BIGINT:
case TYPE_LARGEINT: {
temp_range.add_fixed_value(*reinterpret_cast<T*>(value));
break;
}
case TYPE_BOOLEAN: {
bool v = *reinterpret_cast<bool*>(value);
temp_range.add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
default: {
LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
<< type << "]";
return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
}
}
return Status::OK();
}
// Construct the ColumnValueRange for one specified column
// It will only handle the InPredicate and eq BinaryPredicate in _conjunct_ctxs.
// It will try to push down conditions of that column as much as possible,
@ -783,117 +902,38 @@ template <class T>
Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
ColumnValueRange<T>* range) {
std::vector<uint32_t> filter_conjuncts_index;
bool meet_eq_binary = false;
for (int conj_idx = 0; conj_idx < _conjunct_ctxs.size(); ++conj_idx) {
// create empty range as temp range, temp range should do intersection on range
auto temp_range = ColumnValueRange<T>::create_empty_column_value_range(range->type());
// 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
if (TExprOpcode::FILTER_IN == _conjunct_ctxs[conj_idx]->root()->op()) {
InPredicate* pred = dynamic_cast<InPredicate*>(_conjunct_ctxs[conj_idx]->root());
if (pred->is_not_in()) {
// can not push down NOT IN predicate to storage engine
continue;
}
if (Expr::type_without_cast(pred->get_child(0)) != TExprNodeType::SLOT_REF) {
// not a slot ref(column)
continue;
}
std::vector<SlotId> slot_ids;
if (pred->get_child(0)->get_slot_ids(&slot_ids) != 1) {
// not a single column predicate
continue;
}
if (slot_ids[0] != slot->id()) {
// predicate not related to current column
continue;
}
if (pred->get_child(0)->type().type != slot->type().type) {
if (!ignore_cast(slot, pred->get_child(0))) {
// the type of predicate not match the slot's type
continue;
}
}
VLOG(1) << slot->col_name() << " fixed_values add num: " << pred->hybrid_set()->size();
// if there are too many elements in InPredicate, exceed the limit,
// we will not push any condition of this column to storage engine.
// because too many conditions pushed down to storage engine may even
// slow down the query process.
// ATTN: This is just an experience value. You may need to try
// different thresholds to improve performance.
if (pred->hybrid_set()->size() > _max_pushdown_conditions_per_column) {
VLOG(3) << "Predicate value num " << pred->hybrid_set()->size() << " exceed limit "
<< _max_pushdown_conditions_per_column;
if (!should_push_down_in_predicate(slot, pred)) {
continue;
}
// begin to push InPredicate value into ColumnValueRange
HybridSetBase::IteratorBase* iter = pred->hybrid_set()->begin();
auto skip_invalid_value_count = 0;
while (iter->has_next()) {
// column in (NULL,...) couldn't push down to StorageEngine
// so that discard whole ColumnValueRange
// so set clear() temp_range to whole range
if (NULL == iter->get_value()) {
range->clear();
temp_range.clear();
break;
}
switch (slot->type().type) {
case TYPE_TINYINT: {
int32_t v = *reinterpret_cast<int8_t*>(const_cast<void*>(iter->get_value()));
range->add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
case TYPE_DATE: {
DateTimeValue date_value =
*reinterpret_cast<const DateTimeValue*>(iter->get_value());
if (date_value.check_loss_accuracy_cast_to_date()) {
// There is may return empty data in olap_scan_node,
// Because data value loss accuracy, skip this value
skip_invalid_value_count++;
} else {
range->add_fixed_value(*reinterpret_cast<T *>(&date_value));
}
break;
}
case TYPE_DECIMAL:
case TYPE_DECIMALV2:
case TYPE_LARGEINT:
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_SMALLINT:
case TYPE_INT:
case TYPE_BIGINT:
case TYPE_DATETIME: {
range->add_fixed_value(
*reinterpret_cast<T*>(const_cast<void*>(iter->get_value())));
break;
}
case TYPE_BOOLEAN: {
bool v = *reinterpret_cast<bool*>(const_cast<void*>(iter->get_value()));
range->add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
default: {
break;
}
}
auto value = const_cast<void*>(iter->get_value());
RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
iter->next();
}
// all value in hybrid set in skip, means all in condition
// is invalid, so set eos = true
if (skip_invalid_value_count == pred->hybrid_set()->size()) {
_eos = true;
}
if (is_key_column(slot->col_name())) {
filter_conjuncts_index.emplace_back(conj_idx);
// only where a in ('a', 'b', NULL) contain NULL will
// clear temp_range to whole range, no need do intersection
if (!temp_range.is_whole_range()) {
if (is_key_column(slot->col_name())) {
filter_conjuncts_index.emplace_back(conj_idx);
}
range->intersection(temp_range);
}
} // end of handle in predicate
@ -904,119 +944,27 @@ Status OlapScanNode::normalize_in_and_eq_predicate(SlotDescriptor* slot,
DCHECK(pred->get_num_children() == 2);
for (int child_idx = 0; child_idx < 2; ++child_idx) {
if (Expr::type_without_cast(pred->get_child(child_idx)) !=
TExprNodeType::SLOT_REF) {
// TODO: should use C++17 structured bindlings to refactor this code in the future:
// 'auto [should_push_down, value] = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);'
// make code tidier and readabler
auto result_pair = should_push_down_eq_predicate(slot, pred, conj_idx, child_idx);
if (!result_pair.first) {
continue;
}
auto value = result_pair.second;
std::vector<SlotId> slot_ids;
if (pred->get_child(child_idx)->get_slot_ids(&slot_ids) != 1) {
// not a single column predicate
continue;
}
if (slot_ids[0] != slot->id()) {
// predicate not related to current column
continue;
}
if (pred->get_child(child_idx)->type().type != slot->type().type) {
if (!ignore_cast(slot, pred->get_child(child_idx))) {
// the type of predicate not match the slot's type
continue;
}
}
Expr* expr = pred->get_child(1 - child_idx);
if (!expr->is_constant()) {
// only handle constant value
continue;
}
void* value = _conjunct_ctxs[conj_idx]->get_value(expr, NULL);
// for case: where col = null
if (value == NULL) {
continue;
}
// begin to push condition value into ColumnValueRange
// clear the ColumnValueRange before adding new fixed values.
// because for AND compound predicates, it can overwrite previous conditions
range->clear();
switch (slot->type().type) {
case TYPE_TINYINT: {
int32_t v = *reinterpret_cast<int8_t*>(value);
range->add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
case TYPE_DATE: {
DateTimeValue date_value =
*reinterpret_cast<DateTimeValue*>(value);
if (date_value.check_loss_accuracy_cast_to_date()) {
// There is must return empty data in olap_scan_node,
// Because data value loss accuracy
_eos = true;
}
range->add_fixed_value(*reinterpret_cast<T*>(&date_value));
break;
}
case TYPE_DECIMAL:
case TYPE_DECIMALV2:
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_DATETIME:
case TYPE_SMALLINT:
case TYPE_INT:
case TYPE_BIGINT:
case TYPE_LARGEINT: {
range->add_fixed_value(*reinterpret_cast<T*>(value));
break;
}
case TYPE_BOOLEAN: {
bool v = *reinterpret_cast<bool*>(value);
range->add_fixed_value(*reinterpret_cast<T*>(&v));
break;
}
default: {
LOG(WARNING) << "Normalize filter fail, Unsupported Primitive type. [type="
<< expr->type() << "]";
return Status::InternalError("Normalize filter fail, Unsupported Primitive type");
}
}
RETURN_IF_ERROR(insert_value_to_range(temp_range, slot->type().type, value));
if (is_key_column(slot->col_name())) {
filter_conjuncts_index.emplace_back(conj_idx);
}
meet_eq_binary = true;
range->intersection(temp_range);
} // end for each binary predicate child
} // end of handling eq binary predicate
if (range->get_fixed_value_size() > 0) {
// this columns already meet some eq predicates(IN or Binary),
// There is no need to continue to iterate.
// TODO(cmy): In fact, this part of the judgment should be completed in
// the FE query planning stage. For the following predicate conditions,
// it should be possible to eliminate at the FE side.
// WHERE A = 1 and A in (2,3,4)
if (meet_eq_binary) {
// meet_eq_binary is true, means we meet at least one eq binary predicate.
// this flag is to handle following case:
// There are 2 conjuncts, first in a InPredicate, and second is a BinaryPredicate.
// Firstly, we met a InPredicate, and add lots of values in ColumnValueRange,
// if breaks, doris will read many rows filtered by these values.
// But if continue to handle the BinaryPredicate, the value in ColumnValueRange
// may become only one, which can reduce the rows read from storage engine.
// So the strategy is to use the BinaryPredicate as much as possible.
break;
}
}
} // end of handling eq binary predicate
}
// exceed limit, no conditions will be pushed down to storage engine.
if (range->get_fixed_value_size() > _max_pushdown_conditions_per_column) {
// exceed limit, no conditions will be pushed down to storage engine.
range->clear();
} else {
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),
@ -1158,8 +1106,6 @@ Status OlapScanNode::normalize_noneq_binary_predicate(SlotDescriptor* slot,
<< " value: " << *reinterpret_cast<T*>(value);
}
}
}
std::copy(filter_conjuncts_index.cbegin(), filter_conjuncts_index.cend(),

View File

@ -26,6 +26,7 @@
#include "exec/olap_common.h"
#include "exec/olap_scanner.h"
#include "exec/scan_node.h"
#include "exprs/in_predicate.h"
#include "runtime/descriptors.h"
#include "runtime/row_batch_interface.hpp"
#include "runtime/vectorized_row_batch.h"
@ -172,6 +173,13 @@ private:
void construct_is_null_pred_in_where_pred(Expr* expr, SlotDescriptor* slot,
const std::string& is_null_str);
bool should_push_down_in_predicate(SlotDescriptor* slot, InPredicate* in_pred);
std::pair<bool, void*> should_push_down_eq_predicate(SlotDescriptor* slot, Expr* pred, int conj_idx, int child_idx);
template <typename T>
static Status insert_value_to_range(ColumnValueRange<T>& range, PrimitiveType type, void* value);
friend class OlapScanner;
std::vector<TCondition> _is_null_vector;

View File

@ -26,7 +26,7 @@
#include "common/status.h"
#include "exec/exec_node.h"
#include "exec/olap_common.h"
#include "exec/olap_utils.h"
#include "exprs/expr.h"
#include "gen_cpp/PaloInternalService_types.h"
#include "gen_cpp/PlanNodes_types.h"

View File

@ -59,8 +59,6 @@ static uint32_t calc_days_in_year(uint32_t year) {
return is_leap(year) ? 366 : 365;
}
DateTimeValue DateTimeValue::_s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
DateTimeValue DateTimeValue::_s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
RE2 DateTimeValue::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$");
bool DateTimeValue::check_range() const {

View File

@ -426,9 +426,15 @@ public:
return std::string(buf, end - buf);
}
static DateTimeValue datetime_min_value() { return _s_min_datetime_value; }
static DateTimeValue datetime_max_value() { return _s_max_datetime_value; }
static DateTimeValue datetime_min_value() {
static DateTimeValue _s_min_datetime_value(0, TIME_DATETIME, 0, 0, 0, 0, 0, 1, 1);
return _s_min_datetime_value;
}
static DateTimeValue datetime_max_value() {
static DateTimeValue _s_max_datetime_value(0, TIME_DATETIME, 23, 59, 59, 0, 9999, 12, 31);
return _s_max_datetime_value;
}
int64_t second_diff(const DateTimeValue& rhs) const {
int day_diff = daynr() - rhs.daynr();
@ -542,8 +548,6 @@ private:
_day(day),
_microsecond(microsecond) {}
static DateTimeValue _s_min_datetime_value;
static DateTimeValue _s_max_datetime_value;
// RE2 obj is thread safe
static RE2 time_zone_offset_format_reg;
};

View File

@ -39,4 +39,16 @@ std::size_t operator-(const StringValue& v1, const StringValue& v2) {
return 0;
}
constexpr char StringValue::MIN_CHAR = 0x00;
constexpr char StringValue::MAX_CHAR = 0xff;
StringValue StringValue::min_string_val() {
return StringValue((char*)(&StringValue::MIN_CHAR), 0);
}
StringValue StringValue::max_string_val() {
return StringValue((char*)(&StringValue::MAX_CHAR), 1);
}
} // namespace doris

View File

@ -29,6 +29,9 @@ namespace doris {
// The returned StringValue of all functions that return StringValue
// shares its buffer the parent.
struct StringValue {
const static char MIN_CHAR;
const static char MAX_CHAR;
static const int MAX_LENGTH = (1 << 30);
// TODO: change ptr to an offset relative to a contiguous memory block,
// so that we can send row batches between nodes without having to swizzle
@ -104,6 +107,10 @@ struct StringValue {
static StringValue from_string_val(const doris_udf::StringVal& sv) {
return StringValue(reinterpret_cast<char*>(sv.ptr), sv.len);
}
static StringValue min_string_val();
static StringValue max_string_val();
};
// This function must be called 'hash_value' to be picked up by boost.

View File

@ -0,0 +1,80 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef DORIS_BE_RUNTIME_TYPE_LIMIT_H
#define DORIS_BE_RUNTIME_TYPE_LIMIT_H
#include "runtime/datetime_value.h"
#include "runtime/decimal_value.h"
#include "runtime/decimalv2_value.h"
#include "runtime/string_value.h"
namespace doris {
template <typename T>
struct type_limit {
static T min() {
return std::numeric_limits<T>::min();
}
static T max() {
return std::numeric_limits<T>::max();
}
};
template <>
struct type_limit<StringValue> {
static StringValue min() {
return StringValue::min_string_val();
}
static StringValue max() {
return StringValue::max_string_val();
}
};
template <>
struct type_limit<DecimalValue> {
static DecimalValue min() {
return DecimalValue::get_min_decimal();
}
static DecimalValue max() {
return DecimalValue::get_max_decimal();
}
};
template <>
struct type_limit<DecimalV2Value> {
static DecimalV2Value min() {
return DecimalV2Value::get_min_decimal();
}
static DecimalV2Value max() {
return DecimalV2Value::get_max_decimal();
}
};
template <>
struct type_limit<DateTimeValue> {
static DateTimeValue min() {
return DateTimeValue::datetime_min_value();
}
static DateTimeValue max() {
return DateTimeValue::datetime_max_value();
}
};
} // namespace doris
#endif