[bugfix](ngram bf index) process differently for normal bloom filter index and ngram bf index (#21310)

* process differently for normal bloom filter index and ngram bf index

* fix review comments for readability

* add test case

* add testcase for delete condition
This commit is contained in:
Kang
2023-07-13 17:31:45 +08:00
committed by GitHub
parent d4bdd6768c
commit abc21f5d77
11 changed files with 132 additions and 12 deletions

View File

@ -148,7 +148,9 @@ public:
bool evaluate_and(const BloomFilter* bf) const override { return _nested->evaluate_and(bf); }
bool can_do_bloom_filter() const override { return _nested->can_do_bloom_filter(); }
// Forwards the bloom-filter capability query, including the `ngram` flag,
// to the wrapped `_nested` predicate and returns its answer unchanged.
bool can_do_bloom_filter(bool ngram) const override {
return _nested->can_do_bloom_filter(ngram);
}
void evaluate_vec(const vectorized::IColumn& column, uint16_t size,
bool* flags) const override {

View File

@ -87,7 +87,7 @@ public:
return true;
}
virtual bool can_do_bloom_filter() const { return false; }
// Reports whether this predicate can be checked against a bloom filter index.
// `ngram` is passed by callers to distinguish the ngram bloom filter index
// from the normal one. This base implementation always answers false.
virtual bool can_do_bloom_filter(bool ngram) const { return false; }
//evaluate predicate on inverted
virtual Status evaluate(const std::string& column_name, InvertedIndexIterator* iterator,
@ -121,7 +121,9 @@ public:
void evaluate_vec(vectorized::MutableColumns& block, uint16_t size, bool* flags) const override;
bool can_do_bloom_filter() const override { return _predicate->can_do_bloom_filter(); }
// Forwards the bloom-filter capability query, including the `ngram` flag,
// to the wrapped `_predicate` and returns its answer unchanged.
bool can_do_bloom_filter(bool ngram) const override {
return _predicate->can_do_bloom_filter(ngram);
}
private:
const ColumnPredicate* _predicate;
@ -188,9 +190,9 @@ public:
bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override;
bool can_do_bloom_filter() const override {
bool can_do_bloom_filter(bool ngram) const override {
for (auto& pred : _block_column_predicate_vec) {
if (!pred->can_do_bloom_filter()) {
if (!pred->can_do_bloom_filter(ngram)) {
return false;
}
}

View File

@ -183,7 +183,7 @@ public:
return true;
}
virtual bool can_do_bloom_filter() const { return false; }
// Reports whether this predicate can be checked against a bloom filter index.
// `ngram` is passed by callers to distinguish the ngram bloom filter index
// from the normal one. This base implementation always answers false.
virtual bool can_do_bloom_filter(bool ngram) const { return false; }
// used to evaluate pre read column in lazy materialization
// now only support integer/float

View File

@ -244,6 +244,8 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
if constexpr (PT == PredicateType::EQ) {
// EQ predicate can not use ngram bf, just return true to accept
if (bf->is_ngram_bf()) return true;
if constexpr (std::is_same_v<T, StringRef>) {
return bf->test_bytes(_value.data, _value.size);
} else if constexpr (Type == TYPE_DATE) {
@ -272,7 +274,9 @@ public:
return true;
}
bool can_do_bloom_filter() const override { return PT == PredicateType::EQ; }
// A comparison predicate can use a bloom filter only when it is an EQ
// comparison, and never when the index in question is an ngram bloom filter.
bool can_do_bloom_filter(bool ngram) const override {
    if (ngram) {
        return false;
    }
    return PT == PredicateType::EQ;
}
void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size,
bool* flags) const override {

View File

@ -381,6 +381,8 @@ public:
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
if constexpr (PT == PredicateType::IN_LIST) {
// IN predicate can not use ngram bf, just return true to accept
if (bf->is_ngram_bf()) return true;
HybridSetBase::IteratorBase* iter = _values->begin();
while (iter->has_next()) {
if constexpr (std::is_same_v<T, StringRef>) {
@ -408,7 +410,9 @@ public:
}
}
bool can_do_bloom_filter() const override { return PT == PredicateType::IN_LIST; }
// An in-list predicate can use a bloom filter only when it is an IN_LIST
// predicate, and never when the index in question is an ngram bloom filter.
bool can_do_bloom_filter(bool ngram) const override {
    if (ngram) {
        return false;
    }
    return PT == PredicateType::IN_LIST;
}
private:
template <typename LeftT, typename RightT>

View File

@ -76,12 +76,14 @@ public:
_page_ng_bf = std::move(src);
}
// Tests this LIKE predicate against a bloom filter.
// Returns true ("accept") whenever the filter cannot be used to decide:
// either `bf` is not an ngram bloom filter, or no `_page_ng_bf` has been set.
// Otherwise the result is whether `bf` contains `_page_ng_bf`.
bool evaluate_and(const BloomFilter* bf) const override {
    const bool undecidable = !bf->is_ngram_bf() || !_page_ng_bf;
    if (undecidable) {
        return true;
    }
    return bf->contains(*_page_ng_bf);
}
bool can_do_bloom_filter() const override { return true; }
// This predicate reports bloom-filter support only when the caller asks
// about the ngram bloom filter (`ngram == true`); otherwise it answers false.
bool can_do_bloom_filter(bool ngram) const override { return ngram; }
private:
template <bool is_and>

View File

@ -84,6 +84,8 @@ public:
}
bool evaluate_and(const segment_v2::BloomFilter* bf) const override {
// null predicate can not use ngram bf, just return true to accept
if (bf->is_ngram_bf()) return true;
if (_is_null) {
return bf->test_bytes(nullptr, 0);
} else {
@ -92,7 +94,7 @@ public:
}
}
bool can_do_bloom_filter() const override { return _is_null; }
// Bloom-filter support is reported only when `_is_null` is set and the
// caller is not asking about the ngram bloom filter.
bool can_do_bloom_filter(bool ngram) const override {
    if (ngram) {
        return false;
    }
    return _is_null;
}
void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override;

View File

@ -1213,7 +1213,8 @@ Status FileColumnIterator::get_row_ranges_by_zone_map(
Status FileColumnIterator::get_row_ranges_by_bloom_filter(
const AndBlockColumnPredicate* col_predicates, RowRanges* row_ranges) {
if (col_predicates->can_do_bloom_filter() && _reader->has_bloom_filter_index()) {
if ((col_predicates->can_do_bloom_filter(false) && _reader->has_bloom_filter_index(false)) ||
(col_predicates->can_do_bloom_filter(true) && _reader->has_bloom_filter_index(true))) {
RETURN_IF_ERROR(_reader->get_row_ranges_by_bloom_filter(col_predicates, row_ranges));
}
return Status::OK();

View File

@ -137,7 +137,15 @@ public:
bool has_zone_map() const { return _zone_map_index_meta != nullptr; }
bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; }
bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; }
// Tells whether this column has a bloom filter index of the requested kind.
//   ngram == true  : true only if the index algorithm is NGRAM_BLOOM_FILTER.
//   ngram == false : true only if the index algorithm is anything else.
// Always false when there is no bloom filter index meta at all.
bool has_bloom_filter_index(bool ngram) const {
    if (_bf_index_meta == nullptr) {
        return false;
    }
    const bool index_is_ngram =
            _bf_index_meta->algorithm() == BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER;
    return index_is_ngram == ngram;
}
// Check if this column could match `cond' using segment zone map.
// Since segment zone map is stored in metadata, this function is fast without I/O.

View File

@ -0,0 +1,33 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_all_1 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false
-- !select_eq_1 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false
-- !select_in_1 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false
-- !select_like_1 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699% /test 100 false
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false
-- !select_all_2 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false
-- !select_eq_2 --
-- !select_in_2 --
-- !select_like_2 --
1 dt_bjn001 p9-webcast-sign.douyinpic.com test /%/7212503657802320699%xxx /test 100 false
-- !select_all_3 --
-- !select_eq_3 --
-- !select_in_3 --
-- !select_like_3 --

View File

@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression suite for a table whose `http_url` column carries an NGRAM_BF index.
// It runs =, IN and LIKE filters on that column, before and after DELETE
// statements that use IN and = conditions on the same column.
suite("test_ngram_bloomfilter_index") {
// TODO: test index DDL, such as create, drop, alter table index.
// NOTE(review): the original TODO said "bitmap index", presumably copied from
// another suite; this suite only exercises the NGRAM_BF index -- confirm.
def tableName = 'test_ngram_bloomfilter_index'
sql "DROP TABLE IF EXISTS ${tableName}"
sql """
CREATE TABLE IF NOT EXISTS ${tableName} (
`key_id` bigint(20) NULL COMMENT '',
`category` varchar(200) NULL COMMENT '',
`https_url` varchar(300) NULL COMMENT '',
`hostname` varchar(300) NULL,
`http_url` text NULL COMMENT '',
`url_path` varchar(2000) NULL COMMENT '',
`cnt` bigint(20) NULL COMMENT '',
`host_flag` boolean NULL COMMENT '',
INDEX idx_ngrambf (`http_url`) USING NGRAM_BF PROPERTIES("gram_size" = "2", "bf_size" = "512")
) ENGINE=OLAP
DUPLICATE KEY(`key_id`, `category`)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(`key_id`) BUCKETS 3
PROPERTIES("replication_num" = "1");
"""
// Two rows that differ only in http_url: the second value is the first one
// with "xxx" appended. Both values contain literal '%' characters.
sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%', '/test', 100, false);"
sql "INSERT INTO ${tableName} values (1, 'dt_bjn001', 'p9-webcast-sign.douyinpic.com', 'test', '/%/7212503657802320699%xxx', '/test', 100, false);"
sql "SET enable_function_pushdown = true"
// Round 1: both rows present.
qt_select_all_1 "SELECT * FROM ${tableName}"
qt_select_eq_1 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'"
qt_select_in_1 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')"
qt_select_like_1 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'"
// delete and then select
// Round 2: delete with an IN condition on the indexed column, then repeat the queries.
sql "DELETE FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')"
qt_select_all_2 "SELECT * FROM ${tableName}"
qt_select_eq_2 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'"
qt_select_in_2 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')"
qt_select_like_2 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'"
// Round 3: delete with an = condition on the indexed column, then repeat the queries.
sql "DELETE FROM ${tableName} WHERE http_url = '/%/7212503657802320699%xxx'"
qt_select_all_3 "SELECT * FROM ${tableName}"
qt_select_eq_3 "SELECT * FROM ${tableName} WHERE http_url = '/%/7212503657802320699%'"
qt_select_in_3 "SELECT * FROM ${tableName} WHERE http_url IN ('/%/7212503657802320699%')"
qt_select_like_3 "SELECT * FROM ${tableName} WHERE http_url like '/%/7212503657802320699%'"
}