[Enhancement](scan) disable build key range and filters when push down agg work (#14248)
Disable building key ranges and filters when aggregate push-down is in effect.
This commit is contained in:
@ -18,6 +18,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@ -25,6 +26,7 @@
|
||||
#include <variant>
|
||||
|
||||
#include "exec/olap_utils.h"
|
||||
#include "olap/olap_common.h"
|
||||
#include "olap/tuple.h"
|
||||
#include "runtime/primitive_type.h"
|
||||
#include "runtime/type_limit.h"
|
||||
@ -112,7 +114,13 @@ public:
|
||||
|
||||
bool convert_to_avg_range_value(std::vector<OlapTuple>& begin_scan_keys,
|
||||
std::vector<OlapTuple>& end_scan_keys, bool& begin_include,
|
||||
bool& end_include, bool* eos, int32_t max_scan_key_num);
|
||||
bool& end_include, int32_t max_scan_key_num);
|
||||
|
||||
bool convert_to_close_range(std::vector<OlapTuple>& begin_scan_keys,
|
||||
std::vector<OlapTuple>& end_scan_keys, bool& begin_include,
|
||||
bool& end_include);
|
||||
|
||||
// Returns the compile-time _is_reject_split_type flag for this range's primitive_type.
constexpr bool is_reject_split_type() const { return _is_reject_split_type; }
|
||||
|
||||
bool has_intersection(ColumnValueRange<primitive_type>& range);
|
||||
|
||||
@ -317,6 +325,16 @@ private:
|
||||
bool _contain_null;
|
||||
int _precision;
|
||||
int _scale;
|
||||
|
||||
// Compile-time flag: true when primitive_type is one of the types enumerated below.
// convert_to_close_range() and convert_to_avg_range_value() guard their bound-adjusting /
// splitting logic with `if constexpr (!_is_reject_split_type)`, so for these types they
// simply return false.
static constexpr bool _is_reject_split_type = primitive_type == PrimitiveType::TYPE_LARGEINT ||
|
||||
primitive_type == PrimitiveType::TYPE_DECIMALV2 ||
|
||||
primitive_type == PrimitiveType::TYPE_HLL ||
|
||||
primitive_type == PrimitiveType::TYPE_VARCHAR ||
|
||||
primitive_type == PrimitiveType::TYPE_CHAR ||
|
||||
primitive_type == PrimitiveType::TYPE_STRING ||
|
||||
primitive_type == PrimitiveType::TYPE_BOOLEAN ||
|
||||
primitive_type == PrimitiveType::TYPE_DATETIME ||
|
||||
primitive_type == PrimitiveType::TYPE_DATETIMEV2;
|
||||
};
|
||||
|
||||
class OlapScanKeys {
|
||||
@ -517,45 +535,68 @@ size_t ColumnValueRange<primitive_type>::get_convertible_fixed_value_size() cons
|
||||
return _high_value - _low_value;
|
||||
}
|
||||
|
||||
// The return value indicates whether eos.
|
||||
template <PrimitiveType primitive_type>
|
||||
bool ColumnValueRange<primitive_type>::convert_to_close_range(
|
||||
std::vector<OlapTuple>& begin_scan_keys, std::vector<OlapTuple>& end_scan_keys,
|
||||
bool& begin_include, bool& end_include) {
|
||||
if constexpr (!_is_reject_split_type) {
|
||||
begin_include = true;
|
||||
end_include = true;
|
||||
|
||||
bool is_empty = false;
|
||||
|
||||
if (!is_begin_include()) {
|
||||
if (_low_value == TYPE_MIN) {
|
||||
is_empty = true;
|
||||
} else {
|
||||
++_low_value;
|
||||
}
|
||||
}
|
||||
if (!is_end_include()) {
|
||||
if (_high_value == TYPE_MAX) {
|
||||
is_empty = true;
|
||||
} else {
|
||||
--_high_value;
|
||||
}
|
||||
}
|
||||
|
||||
if (_high_value < _low_value) {
|
||||
is_empty = true;
|
||||
}
|
||||
|
||||
if (is_empty && !contain_null()) {
|
||||
begin_scan_keys.clear();
|
||||
end_scan_keys.clear();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// The return value indicates whether the split result is range or fixed value.
|
||||
template <PrimitiveType primitive_type>
|
||||
bool ColumnValueRange<primitive_type>::convert_to_avg_range_value(
|
||||
std::vector<OlapTuple>& begin_scan_keys, std::vector<OlapTuple>& end_scan_keys,
|
||||
bool& begin_include, bool& end_include, bool* eos, int32_t max_scan_key_num) {
|
||||
constexpr bool reject_type = primitive_type == PrimitiveType::TYPE_LARGEINT ||
|
||||
primitive_type == PrimitiveType::TYPE_DECIMALV2 ||
|
||||
primitive_type == PrimitiveType::TYPE_HLL ||
|
||||
primitive_type == PrimitiveType::TYPE_VARCHAR ||
|
||||
primitive_type == PrimitiveType::TYPE_CHAR ||
|
||||
primitive_type == PrimitiveType::TYPE_STRING ||
|
||||
primitive_type == PrimitiveType::TYPE_BOOLEAN ||
|
||||
primitive_type == PrimitiveType::TYPE_DATETIME ||
|
||||
primitive_type == PrimitiveType::TYPE_DATETIMEV2;
|
||||
begin_include = is_begin_include();
|
||||
end_include = is_end_include();
|
||||
bool is_empty_range = false;
|
||||
if constexpr (reject_type) {
|
||||
begin_scan_keys.emplace_back();
|
||||
begin_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_min_value(), scale()),
|
||||
contain_null());
|
||||
end_scan_keys.emplace_back();
|
||||
end_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_max_value(), scale()));
|
||||
return true;
|
||||
} else if (is_low_value_mininum() && is_high_value_maximum()) {
|
||||
// Do not split the range of whole range. TODO: figure out why the code
|
||||
// execute here
|
||||
begin_scan_keys.emplace_back();
|
||||
begin_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_min_value(), scale()),
|
||||
contain_null());
|
||||
end_scan_keys.emplace_back();
|
||||
end_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_max_value(), scale()));
|
||||
return true;
|
||||
} else {
|
||||
bool& begin_include, bool& end_include, int32_t max_scan_key_num) {
|
||||
if constexpr (!_is_reject_split_type) {
|
||||
auto no_split = [&]() -> bool {
|
||||
begin_scan_keys.emplace_back();
|
||||
begin_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_min_value(), scale()),
|
||||
contain_null());
|
||||
end_scan_keys.emplace_back();
|
||||
end_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(get_range_max_value(), scale()));
|
||||
return true;
|
||||
};
|
||||
|
||||
CppType min_value = get_range_min_value();
|
||||
CppType max_value = get_range_max_value();
|
||||
if constexpr (primitive_type == PrimitiveType::TYPE_DATE) {
|
||||
min_value.set_type(TimeType::TIME_DATE);
|
||||
max_value.set_type(TimeType::TIME_DATE);
|
||||
}
|
||||
|
||||
if (contain_null()) {
|
||||
begin_scan_keys.emplace_back();
|
||||
@ -564,65 +605,53 @@ bool ColumnValueRange<primitive_type>::convert_to_avg_range_value(
|
||||
end_scan_keys.back().add_null();
|
||||
}
|
||||
|
||||
if (!is_begin_include()) {
|
||||
if (min_value == TYPE_MAX) {
|
||||
is_empty_range = true;
|
||||
if (min_value > max_value || max_scan_key_num == 1) {
|
||||
return no_split();
|
||||
}
|
||||
|
||||
auto cast = [](const CppType& value) {
|
||||
if constexpr (primitive_type == PrimitiveType::TYPE_DATE ||
|
||||
primitive_type == PrimitiveType::TYPE_DATEV2) {
|
||||
return value;
|
||||
} else {
|
||||
begin_include = true;
|
||||
++min_value;
|
||||
}
|
||||
}
|
||||
if (!is_end_include()) {
|
||||
if (max_value == TYPE_MIN) {
|
||||
is_empty_range = true;
|
||||
} else {
|
||||
end_include = true;
|
||||
--max_value;
|
||||
return (int128_t)value;
|
||||
}
|
||||
};
|
||||
|
||||
// When CppType is date, we cannot convert it to an integer and calculate the distance.
|
||||
// In other cases, we convert the element to int128 to avoid overflow.
|
||||
size_t step_size = (cast(max_value) - min_value) / max_scan_key_num;
|
||||
|
||||
constexpr size_t MAX_STEP_SIZE = 1 << 20;
|
||||
// When the step size is too large, the range is easy to not really contain data.
|
||||
if (step_size > MAX_STEP_SIZE) {
|
||||
return no_split();
|
||||
}
|
||||
|
||||
if (begin_include && end_include && min_value > max_value) {
|
||||
is_empty_range = true;
|
||||
}
|
||||
if (is_empty_range) {
|
||||
if (!contain_null()) {
|
||||
begin_scan_keys.clear();
|
||||
end_scan_keys.clear();
|
||||
*eos = true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int128_t range_size = is_fixed_value_convertible() ? (int128_t)max_value - min_value : 0;
|
||||
size_t step_size = range_size / max_scan_key_num;
|
||||
|
||||
auto current = min_value;
|
||||
if constexpr (primitive_type == PrimitiveType::TYPE_DATE) {
|
||||
current.set_type(TimeType::TIME_DATE);
|
||||
}
|
||||
|
||||
while (current <= max_value) {
|
||||
while (true) {
|
||||
begin_scan_keys.emplace_back();
|
||||
begin_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(current, scale()));
|
||||
cast_to_string<primitive_type, CppType>(min_value, scale()));
|
||||
|
||||
if ((int128_t)max_value - current < step_size) {
|
||||
current = max_value;
|
||||
if (cast(max_value) - min_value < step_size) {
|
||||
min_value = max_value;
|
||||
} else {
|
||||
current += step_size;
|
||||
min_value += step_size;
|
||||
}
|
||||
|
||||
end_scan_keys.emplace_back();
|
||||
end_scan_keys.back().add_value(
|
||||
cast_to_string<primitive_type, CppType>(current, scale()));
|
||||
cast_to_string<primitive_type, CppType>(min_value, scale()));
|
||||
|
||||
if (current == max_value) {
|
||||
if (min_value == max_value) {
|
||||
break;
|
||||
}
|
||||
++current;
|
||||
++min_value;
|
||||
}
|
||||
|
||||
return step_size != 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <PrimitiveType primitive_type>
|
||||
@ -945,9 +974,13 @@ Status OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (_begin_scan_keys.empty() && range.is_fixed_value_convertible() && _is_convertible) {
|
||||
if (_begin_scan_keys.empty() && range.is_fixed_value_convertible() && _is_convertible &&
|
||||
!range.is_reject_split_type()) {
|
||||
*eos |= range.convert_to_close_range(_begin_scan_keys, _end_scan_keys, _begin_include,
|
||||
_end_include);
|
||||
|
||||
if (range.convert_to_avg_range_value(_begin_scan_keys, _end_scan_keys, _begin_include,
|
||||
_end_include, eos, max_scan_key_num)) {
|
||||
_end_include, max_scan_key_num)) {
|
||||
_has_range_value = true;
|
||||
}
|
||||
return Status::OK();
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
#include "vec/exec/scan/new_olap_scan_node.h"
|
||||
|
||||
#include "common/status.h"
|
||||
#include "olap/storage_engine.h"
|
||||
#include "olap/tablet.h"
|
||||
#include "util/to_string.h"
|
||||
@ -135,6 +136,18 @@ static std::string olap_filters_to_string(const std::vector<doris::TCondition>&
|
||||
return filters_string;
|
||||
}
|
||||
|
||||
// Maps a push-down aggregate operator to the text recorded in the runtime profile under
// "PushDownAggregate". Any operator other than MINMAX, COUNT or MIX is reported as "NONE".
inline std::string push_down_agg_to_string(const TPushAggOp::type& op) {
    switch (op) {
    case TPushAggOp::MINMAX:
        return "MINMAX";
    case TPushAggOp::COUNT:
        return "COUNT";
    case TPushAggOp::MIX:
        return "MIX";
    default:
        return "NONE";
    }
}
|
||||
|
||||
static std::string tablets_id_to_string(
|
||||
const std::vector<std::unique_ptr<TPaloScanRange>>& scan_ranges) {
|
||||
if (scan_ranges.empty()) {
|
||||
@ -160,57 +173,64 @@ Status NewOlapScanNode::_process_conjuncts() {
|
||||
}
|
||||
|
||||
Status NewOlapScanNode::_build_key_ranges_and_filters() {
|
||||
const std::vector<std::string>& column_names = _olap_scan_node.key_column_name;
|
||||
const std::vector<TPrimitiveType::type>& column_types = _olap_scan_node.key_column_type;
|
||||
DCHECK(column_types.size() == column_names.size());
|
||||
if (!_olap_scan_node.__isset.push_down_agg_type_opt ||
|
||||
_olap_scan_node.push_down_agg_type_opt == TPushAggOp::NONE) {
|
||||
const std::vector<std::string>& column_names = _olap_scan_node.key_column_name;
|
||||
const std::vector<TPrimitiveType::type>& column_types = _olap_scan_node.key_column_type;
|
||||
DCHECK(column_types.size() == column_names.size());
|
||||
|
||||
// 1. construct scan key except last olap engine short key
|
||||
_scan_keys.set_is_convertible(limit() == -1);
|
||||
// 1. construct scan key except last olap engine short key
|
||||
_scan_keys.set_is_convertible(limit() == -1);
|
||||
|
||||
// we use `exact_range` to identify a key range is an exact range or not when we convert
|
||||
// it to `_scan_keys`. If `exact_range` is true, we can just discard it from `_olap_filters`.
|
||||
bool exact_range = true;
|
||||
bool eos = false;
|
||||
for (int column_index = 0;
|
||||
column_index < column_names.size() && !_scan_keys.has_range_value() && !eos;
|
||||
++column_index) {
|
||||
auto iter = _colname_to_value_range.find(column_names[column_index]);
|
||||
if (_colname_to_value_range.end() == iter) {
|
||||
break;
|
||||
}
|
||||
// we use `exact_range` to identify a key range is an exact range or not when we convert
|
||||
// it to `_scan_keys`. If `exact_range` is true, we can just discard it from `_olap_filters`.
|
||||
bool exact_range = true;
|
||||
bool eos = false;
|
||||
for (int column_index = 0;
|
||||
column_index < column_names.size() && !_scan_keys.has_range_value() && !eos;
|
||||
++column_index) {
|
||||
auto iter = _colname_to_value_range.find(column_names[column_index]);
|
||||
if (_colname_to_value_range.end() == iter) {
|
||||
break;
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(std::visit(
|
||||
[&](auto&& range) {
|
||||
// make a copy of range and pass it to extend_scan_key, keeping the range unchanged
|
||||
// because the extend_scan_key method may change its first parameter.
|
||||
// But the original range may be converted to olap filters if it's not an exact_range.
|
||||
auto temp_range = range;
|
||||
if (range.get_fixed_value_size() <= _max_pushdown_conditions_per_column) {
|
||||
RETURN_IF_ERROR(_scan_keys.extend_scan_key(temp_range, _max_scan_key_num,
|
||||
&exact_range, &eos));
|
||||
if (exact_range) {
|
||||
_colname_to_value_range.erase(iter->first);
|
||||
RETURN_IF_ERROR(std::visit(
|
||||
[&](auto&& range) {
|
||||
// make a copy of range and pass it to extend_scan_key, keeping the range unchanged
|
||||
// because the extend_scan_key method may change its first parameter.
|
||||
// But the original range may be converted to olap filters if it's not an exact_range.
|
||||
auto temp_range = range;
|
||||
if (range.get_fixed_value_size() <= _max_pushdown_conditions_per_column) {
|
||||
RETURN_IF_ERROR(_scan_keys.extend_scan_key(
|
||||
temp_range, _max_scan_key_num, &exact_range, &eos));
|
||||
if (exact_range) {
|
||||
_colname_to_value_range.erase(iter->first);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
},
|
||||
iter->second));
|
||||
}
|
||||
_eos |= eos;
|
||||
|
||||
for (auto& iter : _colname_to_value_range) {
|
||||
std::vector<TCondition> filters;
|
||||
std::visit([&](auto&& range) { range.to_olap_filter(filters); }, iter.second);
|
||||
|
||||
for (const auto& filter : filters) {
|
||||
_olap_filters.push_back(filter);
|
||||
return Status::OK();
|
||||
},
|
||||
iter->second));
|
||||
}
|
||||
}
|
||||
_eos |= eos;
|
||||
|
||||
// Append value ranges in "_not_in_value_ranges"
|
||||
for (auto& range : _not_in_value_ranges) {
|
||||
std::visit([&](auto&& the_range) { the_range.to_in_condition(_olap_filters, false); },
|
||||
range);
|
||||
for (auto& iter : _colname_to_value_range) {
|
||||
std::vector<TCondition> filters;
|
||||
std::visit([&](auto&& range) { range.to_olap_filter(filters); }, iter.second);
|
||||
|
||||
for (const auto& filter : filters) {
|
||||
_olap_filters.push_back(filter);
|
||||
}
|
||||
}
|
||||
|
||||
// Append value ranges in "_not_in_value_ranges"
|
||||
for (auto& range : _not_in_value_ranges) {
|
||||
std::visit([&](auto&& the_range) { the_range.to_in_condition(_olap_filters, false); },
|
||||
range);
|
||||
}
|
||||
} else {
|
||||
_runtime_profile->add_info_string(
|
||||
"PushDownAggregate",
|
||||
push_down_agg_to_string(_olap_scan_node.push_down_agg_type_opt));
|
||||
}
|
||||
|
||||
if (_state->enable_profile()) {
|
||||
|
||||
@ -630,9 +630,9 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
|
||||
EXPECT_EQ(exact_range, true);
|
||||
scan_keys.get_key_range(&key_range);
|
||||
// contain null, [-128, 127]
|
||||
EXPECT_EQ(key_range.size(), 1);
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), "null(-128)");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->end_scan_range), "127");
|
||||
EXPECT_EQ(key_range.size(), 257);
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[1]->begin_scan_range), "-128");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[256]->end_scan_range), "127");
|
||||
|
||||
EXPECT_TRUE(range.add_range(FILTER_LESS, 50).ok());
|
||||
scan_keys.clear();
|
||||
@ -655,9 +655,9 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
|
||||
EXPECT_TRUE(scan_keys.extend_scan_key(range, max_scan_key, &exact_range, &eos).ok());
|
||||
EXPECT_EQ(exact_range, true);
|
||||
scan_keys.get_key_range(&key_range);
|
||||
EXPECT_EQ(key_range.size(), 1);
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), "null(-32768)");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->end_scan_range), "32767");
|
||||
EXPECT_EQ(key_range.size(), 49);
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[1]->begin_scan_range), "-32768");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[max_scan_key]->end_scan_range), "32767");
|
||||
|
||||
EXPECT_TRUE(range.add_range(FILTER_LARGER, 0).ok());
|
||||
scan_keys.clear();
|
||||
@ -678,7 +678,7 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
|
||||
scan_keys.get_key_range(&key_range);
|
||||
|
||||
EXPECT_EQ(key_range.size(), max_scan_key);
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), "1");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), "2");
|
||||
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[max_scan_key - 1]->end_scan_range), "32765");
|
||||
}
|
||||
}
|
||||
|
||||
@ -223,5 +223,19 @@ beijing chengdu shanghai
|
||||
2
|
||||
|
||||
-- !select44 --
|
||||
6 3975 2003 33035710 25819.948 78965.368 4449.5830001831055
|
||||
6 3975 2003 33035710 25819.948000000 78965.368 4449.5830001831055
|
||||
|
||||
-- !select45 --
|
||||
1 10
|
||||
2 8
|
||||
2 441
|
||||
3 10
|
||||
5 29
|
||||
6 101
|
||||
|
||||
-- !select46 --
|
||||
5 29
|
||||
|
||||
-- !select47 --
|
||||
6
|
||||
|
||||
|
||||
@ -469,5 +469,11 @@ suite("test_aggregate_all_functions") {
|
||||
|
||||
sql "DROP TABLE IF EXISTS ${tableName_10}"
|
||||
|
||||
qt_select44 """ select sum(distinct k1), sum(distinct k2), sum(distinct k3), sum(distinct cast(k4 as largeint)), sum(distinct k5), sum(distinct k8), sum(distinct k9) from test_query_db.test """
|
||||
qt_select44 """select sum(distinct k1), sum(distinct k2), sum(distinct k3), sum(distinct cast(k4 as largeint)), sum(distinct k5), sum(distinct k8), sum(distinct k9) from test_query_db.test """
|
||||
|
||||
qt_select45 """select * from ${tableName_12} order by id,level"""
|
||||
|
||||
qt_select46 """select * from ${tableName_12} where id>=5 and id <=5 and level >10 order by id,level;"""
|
||||
|
||||
qt_select47 """select count(*) from ${tableName_12}"""
|
||||
}
|
||||
|
||||
@ -72,7 +72,7 @@ fi
|
||||
|
||||
eval set -- "${OPTS}"
|
||||
|
||||
PARALLEL="$(($(nproc) / 2 + 1))"
|
||||
PARALLEL="$(($(nproc) / 5 + 1))"
|
||||
|
||||
CLEAN=0
|
||||
RUN=0
|
||||
|
||||
Reference in New Issue
Block a user