[improvement](scanner) Remove the predicate that is always true for the segment (#25366)

By utilizing the zonemap index of the segment, we can ascertain if a predicate is always true. For example, if the segment’s maximum value is 100 and the predicate is col < 101, then this predicate is always true for this segment.
This commit is contained in:
Jerry Hu
2023-10-13 02:25:38 -05:00
committed by GitHub
parent cee7a6889f
commit 283bd59eba
9 changed files with 172 additions and 1 deletions

View File

@ -1109,6 +1109,8 @@ DEFINE_Bool(exit_on_exception, "false");
DEFINE_String(doris_cgroup_cpu_path, "");
DEFINE_Bool(enable_cpu_hard_limit, "false");
// When true, a predicate that is proven always true for a segment is removed
// before that segment is read. Enabled by default.
DEFINE_Bool(ignore_always_true_predicate_for_segment, "true");
// clang-format off
#ifdef BE_TEST
// test s3

View File

@ -1179,6 +1179,9 @@ DECLARE_mBool(exit_on_exception);
DECLARE_String(doris_cgroup_cpu_path);
DECLARE_Bool(enable_cpu_hard_limit);
// Whether to remove a predicate that is always true for a segment, so the
// predicate is not evaluated while reading that segment.
DECLARE_Bool(ignore_always_true_predicate_for_segment);
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);

View File

@ -173,6 +173,10 @@ public:
return true;
}
// Reports whether the given statistic proves that this predicate holds for
// every value it covers. Subclasses that compare `statistic.first` /
// `statistic.second` against their own value may override this.
// The default implementation is conservative: it always returns false, i.e.
// "cannot prove the predicate is always true".
virtual bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const {
return false;
}
// Default implementation: always returns false regardless of `statistic`.
// NOTE(review): presumably subclasses override this to decide delete
// conditions from the statistic pair — confirm against the overrides.
virtual bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const {
return false;
}

View File

@ -174,6 +174,29 @@ public:
}
}
// Decides, from a (min, max) statistic pair, whether this comparison predicate
// is satisfied by every value inside that range.
//
// `statistic.first` is read as the minimum and `statistic.second` as the
// maximum. If either bound is null nothing can be proven and false is returned.
// Only the range predicates (LT, LE, GT, GE) can ever yield true; every other
// predicate type yields false.
bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
    WrapperField* const lower_bound = statistic.first;
    WrapperField* const upper_bound = statistic.second;

    // A null bound carries no usable value to compare against.
    if (lower_bound->is_null()) {
        return false;
    }
    if (upper_bound->is_null()) {
        return false;
    }

    // Zero-initialise first: only sizeof(WarpperFieldType) bytes are copied in,
    // which may be fewer than sizeof(T).
    T range_min {};
    T range_max {};
    memcpy(reinterpret_cast<char*>(&range_min), lower_bound->cell_ptr(),
           sizeof(WarpperFieldType));
    memcpy(reinterpret_cast<char*>(&range_max), upper_bound->cell_ptr(),
           sizeof(WarpperFieldType));

    if constexpr (PT == PredicateType::LT) {
        // col < v holds everywhere when v lies strictly above the maximum.
        return _value > range_max;
    } else if constexpr (PT == PredicateType::LE) {
        // col <= v holds everywhere when v is at least the maximum.
        return _value >= range_max;
    } else if constexpr (PT == PredicateType::GT) {
        // col > v holds everywhere when v lies strictly below the minimum.
        return _value < range_min;
    } else if constexpr (PT == PredicateType::GE) {
        // col >= v holds everywhere when v is at most the minimum.
        return _value <= range_min;
    } else {
        // Other predicate types cannot be proven from a range alone.
        return false;
    }
}
bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
if (statistic.first->is_null() || statistic.second->is_null()) {
return false;

View File

@ -31,6 +31,7 @@
#include "io/fs/file_reader.h"
#include "olap/block_column_predicate.h"
#include "olap/column_predicate.h"
#include "olap/comparison_predicate.h"
#include "olap/decimal12.h"
#include "olap/inverted_index_parser.h"
#include "olap/iterators.h"
@ -338,6 +339,31 @@ bool ColumnReader::match_condition(const AndBlockColumnPredicate* col_predicates
col_predicates);
}
// Removes from `predicates` every predicate bound to `column_id` whose
// is_always_true() check succeeds against the segment-level zone map's
// (min, max) values for this column.
//
// @param predicates  candidate predicates; proven-true entries are erased in
//                    place, the relative order of the remaining ones is kept.
// @param column_id   id of the column served by this reader.
// @return true if at least one predicate was erased; false otherwise,
//         including when this reader has no zone map index.
bool ColumnReader::prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates,
                                                const int column_id) const {
    if (_zone_map_index == nullptr) {
        return false;
    }

    // Callers may invoke this once per column reader, so avoid the two
    // allocations and the zone map parsing below when no predicate refers to
    // this column at all.
    bool has_candidate = false;
    for (auto* predicate : predicates) {
        if (predicate->column_id() == column_id) {
            has_candidate = true;
            break;
        }
    }
    if (!has_candidate) {
        return false;
    }

    FieldType type = _type_info->type();
    std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, _meta_length));
    std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, _meta_length));
    // NOTE(review): defensive guard — assumes create_by_type may hand back
    // nullptr for a type it cannot wrap; confirm against WrapperField.
    if (min_value == nullptr || max_value == nullptr) {
        return false;
    }
    _parse_zone_map(*_segment_zone_map, min_value.get(), max_value.get());

    // The statistic is the same for every predicate, so build it once.
    const std::pair<WrapperField*, WrapperField*> min_max {min_value.get(), max_value.get()};

    auto pruned = false;
    for (auto it = predicates.begin(); it != predicates.end();) {
        auto* predicate = *it;
        if (predicate->column_id() == column_id && predicate->is_always_true(min_max)) {
            pruned = true;
            // erase() returns the iterator following the removed element.
            it = predicates.erase(it);
        } else {
            ++it;
        }
    }
    return pruned;
}
void ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* min_value_container,
WrapperField* max_value_container) const {
// min value and max value are valid if has_not_null is true

View File

@ -162,6 +162,9 @@ public:
// True when this reader's row count (_num_rows) is zero.
bool is_empty() const { return _num_rows == 0; }
// Erases from `predicates` the entries bound to `column_id` that are always
// true according to this column's segment zone map.
// Returns true if at least one predicate was erased, false otherwise.
bool prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates,
const int column_id) const;
// Returns the stored compression type (_meta_compression).
CompressionTypePB get_compression() const { return _meta_compression; }
// Returns the stored row count (_num_rows).
uint64_t num_rows() const { return _num_rows; }

View File

@ -129,7 +129,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o
return Status::OK();
}
}
if (read_options.use_topn_opt) {
auto query_ctx = read_options.runtime_state->get_query_ctx();
auto runtime_predicate = query_ctx->get_runtime_predicate().get_predictate();
@ -157,6 +156,25 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o
iter->reset(new SegmentIterator(this->shared_from_this(), schema));
}
if (config::ignore_always_true_predicate_for_segment &&
read_options.io_ctx.reader_type == ReaderType::READER_QUERY &&
!read_options.column_predicates.empty()) {
auto pruned_predicates = read_options.column_predicates;
auto pruned = false;
for (auto& it : _column_readers) {
if (it.second->prune_predicates_by_zone_map(pruned_predicates, it.first)) {
pruned = true;
}
}
if (pruned) {
auto options_with_pruned_predicates = read_options;
options_with_pruned_predicates.column_predicates = pruned_predicates;
LOG(INFO) << "column_predicates pruned from " << read_options.column_predicates.size()
<< " to " << pruned_predicates.size();
return iter->get()->init(options_with_pruned_predicates);
}
}
return iter->get()->init(read_options);
}

View File

@ -0,0 +1,25 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select1 --
1 jerry 2020-10-01
2 tom 2020-10-02
3 jack 2020-10-01
4 tony 2020-10-02
-- !select2 --
1 jerry 2020-10-01
3 jack 2020-10-01
-- !select3 --
-- !select4 --
1 jerry 2020-10-01
2 tom 2020-10-02
3 jack 2020-10-01
4 tony 2020-10-02
-- !select5 --
2 tom 2020-10-02
4 tony 2020-10-02
-- !select6 --

View File

@ -0,0 +1,67 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression suite for range predicates on the `birthday` column.
// The inserted data only contains the birthdays 2020-10-01 and 2020-10-02, and
// the queries place the comparison literal above, inside, and below that range
// so that the predicate is true for all rows, for some rows, or for no rows.
suite("test_select_with_predicate_prune") {
// Start from a clean state.
sql """
drop table if exists `test_select_with_predicate_prune`;
"""
// Duplicate-key table, list-partitioned on `birthday`, with a single bucket.
sql """
CREATE TABLE IF NOT EXISTS `test_select_with_predicate_prune` (
id int,
name string,
birthday date not null
)
duplicate key(`id`)
AUTO PARTITION BY LIST (`birthday`)()
DISTRIBUTED BY HASH(`id`) buckets 1
PROPERTIES
(
"replication_allocation" = "tag.location.default: 1"
);
"""
// Two separate inserts, each carrying one row per birthday value.
// NOTE(review): presumably done so each partition receives data from more
// than one load — confirm the intended storage layout.
sql """
insert into test_select_with_predicate_prune values (1, 'jerry', '2020-10-01'), (2, 'tom', '2020-10-02');
"""
sql """
insert into test_select_with_predicate_prune values (3, 'jack', '2020-10-01'), (4, 'tony', '2020-10-02');
"""
// Literal is above every inserted birthday: all four rows match.
qt_select1 """
select * from test_select_with_predicate_prune where birthday < '2020-10-03' order by id;
"""
// Literal equals the largest birthday: only the 2020-10-01 rows (ids 1, 3) match.
qt_select2 """
select * from test_select_with_predicate_prune where birthday < '2020-10-02' order by id;
"""
// Literal equals the smallest birthday: no row matches.
qt_select3 """
select * from test_select_with_predicate_prune where birthday < '2020-10-01' order by id;
"""
// Literal is below every inserted birthday: all four rows match.
qt_select4 """
select * from test_select_with_predicate_prune where birthday > '2020-09-30' order by id;
"""
// Literal equals the smallest birthday: only the 2020-10-02 rows (ids 2, 4) match.
qt_select5 """
select * from test_select_with_predicate_prune where birthday > '2020-10-01' order by id;
"""
// Literal equals the largest birthday: no row matches.
qt_select6 """
select * from test_select_with_predicate_prune where birthday > '2020-10-02' order by id;
"""
}