From 75aa00d3d02ee5e361f05365c7c430917be0aee8 Mon Sep 17 00:00:00 2001
From: Jet He
Date: Wed, 28 Dec 2022 18:01:50 +0800
Subject: [PATCH] [Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query (#11579)

This PR implements the new bloom filter index, the NGram bloom filter index, which was proposed in #10733. The new index can greatly improve LIKE query performance; in some of our test cases it gives an order-of-magnitude improvement. For how to use it, check the docs added in this PR. The index depends on ```enable_function_pushdown```: you need to set it to ```true``` to make the index work for LIKE queries.
---
 .clang-format-ignore | 4 +
 .licenserc.yaml | 1 +
 be/src/common/config.h | 1 +
 be/src/olap/CMakeLists.txt | 2 +
 be/src/olap/column_predicate.h | 9 +
 be/src/olap/itoken_extractor.cpp | 77 +++
 be/src/olap/itoken_extractor.h | 98 ++++
 be/src/olap/like_column_predicate.h | 24 +-
 be/src/olap/reader.cpp | 33 ++
 .../segment_v2/block_split_bloom_filter.h | 1 +
 .../olap/rowset/segment_v2/bloom_filter.cpp | 6 +-
 be/src/olap/rowset/segment_v2/bloom_filter.h | 27 +-
 .../segment_v2/bloom_filter_index_reader.cpp | 2 +-
 .../segment_v2/bloom_filter_index_writer.cpp | 75 +++
 .../segment_v2/bloom_filter_index_writer.h | 29 +-
 .../olap/rowset/segment_v2/column_writer.cpp | 11 +-
 be/src/olap/rowset/segment_v2/column_writer.h | 3 +
 .../rowset/segment_v2/ngram_bloom_filter.cpp | 74 +++
 .../rowset/segment_v2/ngram_bloom_filter.h | 50 ++
 .../rowset/segment_v2/segment_iterator.cpp | 1 +
 .../olap/rowset/segment_v2/segment_writer.cpp | 8 +
 be/src/olap/tablet_meta.cpp | 12 +
 be/src/olap/tablet_schema.cpp | 34 ++
 be/src/olap/tablet_schema.h | 25 +-
 be/src/util/CMakeLists.txt | 1 +
 be/src/util/cityhash102/city.cc | 481 ++++++++++++++++++
 be/src/util/cityhash102/city.h | 104 ++++
 be/src/util/cityhash102/citycrc.h | 48 ++
 be/src/util/cityhash102/config.h | 125 +++++
 be/src/util/simd/vstring_function.h | 4 +
 be/test/CMakeLists.txt | 1 +
 be/test/olap/itoken_extractor_test.cpp | 78 +++
 .../segment_v2/block_bloom_filter_test.cpp | 29 ++
 .../index/ngram-bloomfilter-index.md | 79 +++
 .../index/ngram-bloomfilter-index.md | 81 +++
 fe/fe-core/src/main/cup/sql_parser.cup | 15 +-
 .../doris/alter/SchemaChangeHandler.java | 12 +-
 .../org/apache/doris/analysis/IndexDef.java | 38 +-
 .../java/org/apache/doris/catalog/Index.java | 32 ++
 .../doris/datasource/InternalCatalog.java | 2 +
 fe/fe-core/src/main/jflex/sql_scanner.flex | 1 +
 gensrc/proto/olap_file.proto | 1 +
 gensrc/proto/segment_v2.proto | 2 +
 gensrc/thrift/Descriptors.thrift | 6 +-
 44 files changed, 1720 insertions(+), 27 deletions(-)
 create mode 100644 be/src/olap/itoken_extractor.cpp
 create mode 100644 be/src/olap/itoken_extractor.h
 create mode 100644 be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/ngram_bloom_filter.h
 create mode 100644 be/src/util/cityhash102/city.cc
 create mode 100644 be/src/util/cityhash102/city.h
 create mode 100644 be/src/util/cityhash102/citycrc.h
 create mode 100644 be/src/util/cityhash102/config.h
 create mode 100644 be/test/olap/itoken_extractor_test.cpp
 create mode 100644 docs/en/docs/data-table/index/ngram-bloomfilter-index.md
 create mode 100644 docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md

diff --git a/.clang-format-ignore b/.clang-format-ignore
index 9275d0e000..b92688b824 100644
--- a/.clang-format-ignore
+++ b/.clang-format-ignore
@@ -7,3 +7,7 @@ be/src/util/sse2neon.h
 be/src/util/mustache/mustache.h
 be/src/util/mustache/mustache.cc
be/src/util/utf8_check.cpp +be/src/util/cityhash102/city.h +be/src/util/cityhash102/city.cc +be/src/util/cityhash102/citycrc.h +be/src/util/cityhash102/config.h diff --git a/.licenserc.yaml b/.licenserc.yaml index d458e45269..54c9435c14 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -63,6 +63,7 @@ header: - "be/src/util/sse2neo.h" - "be/src/util/sse2neon.h" - "be/src/util/utf8_check.cpp" + - "be/src/util/cityhash102" - "build-support/run_clang_format.py" - "regression-test/data" - "docs/.vuepress/public/css/animate.min.css" diff --git a/be/src/common/config.h b/be/src/common/config.h index debcdb67d6..dee2fb51a2 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -826,6 +826,7 @@ CONF_Int32(s3_transfer_executor_pool_size, "2"); CONF_Bool(enable_time_lut, "true"); CONF_Bool(enable_simdjson_reader, "false"); +CONF_mBool(enable_query_like_bloom_filter, "true"); // number of s3 scanner thread pool size CONF_Int32(doris_remote_scanner_thread_pool_thread_num, "16"); // number of s3 scanner thread pool queue size diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index fd1387aca1..7f542e9718 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -40,6 +40,7 @@ add_library(Olap STATIC file_helper.cpp hll.cpp inverted_index_parser.cpp + itoken_extractor.cpp like_column_predicate.cpp key_coder.cpp lru_cache.cpp @@ -93,6 +94,7 @@ add_library(Olap STATIC rowset/segment_v2/empty_segment_iterator.cpp rowset/segment_v2/segment_writer.cpp rowset/segment_v2/block_split_bloom_filter.cpp + rowset/segment_v2/ngram_bloom_filter.cpp rowset/segment_v2/bloom_filter_index_reader.cpp rowset/segment_v2/bloom_filter_index_writer.cpp rowset/segment_v2/bloom_filter.cpp diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 1d3a561109..d2a6fcff0a 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -156,6 +156,15 @@ public: bool* flags) const { DCHECK(false) << "should not reach here"; } + + virtual std::string get_search_str() const { + DCHECK(false) << "should not reach here"; + return ""; + } + + virtual void set_page_ng_bf(std::unique_ptr) { + DCHECK(false) << "should not reach here"; + } uint32_t column_id() const { return _column_id; } virtual std::string debug_string() const { diff --git a/be/src/olap/itoken_extractor.cpp b/be/src/olap/itoken_extractor.cpp new file mode 100644 index 0000000000..90ad29d291 --- /dev/null +++ b/be/src/olap/itoken_extractor.cpp @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "itoken_extractor.h" + +#include "util/simd/vstring_function.h" + +namespace doris { + +bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos, + size_t* __restrict token_start, + size_t* __restrict token_length) const { + *token_start = *pos; + *token_length = 0; + size_t code_points = 0; + for (; code_points < n && *token_start + *token_length < length; ++code_points) { + size_t sz = get_utf8_byte_length(static_cast(data[*token_start + *token_length])); + *token_length += sz; + } + *pos += get_utf8_byte_length(static_cast(data[*pos])); + return code_points == n; +} + +bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos, + std::string& token) const { + token.clear(); + + size_t code_points = 0; + bool escaped = false; + for (size_t i = *pos; i < length;) { + if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) { + token += data[i]; + ++code_points; + escaped = false; + ++i; + } else if (!escaped && (data[i] == '%' || data[i] == '_')) { + /// This token is too small, go to the next. + token.clear(); + code_points = 0; + escaped = false; + *pos = ++i; + } else if (!escaped && data[i] == '\\') { + escaped = true; + ++i; + } else { + const size_t sz = get_utf8_byte_length(static_cast(data[i])); + for (size_t j = 0; j < sz; ++j) { + token += data[i + j]; + } + i += sz; + ++code_points; + escaped = false; + } + + if (code_points == n) { + *pos += get_utf8_byte_length(static_cast(data[*pos])); + return true; + } + } + + return false; +} +} // namespace doris diff --git a/be/src/olap/itoken_extractor.h b/be/src/olap/itoken_extractor.h new file mode 100644 index 0000000000..d7004d9c8d --- /dev/null +++ b/be/src/olap/itoken_extractor.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DORIS_ITOKEN_EXTRACTOR_H +#define DORIS_ITOKEN_EXTRACTOR_H + +#include + +#include + +#include "olap/rowset/segment_v2/bloom_filter.h" + +namespace doris { + +/// Interface for string parsers. +struct ITokenExtractor { + virtual ~ITokenExtractor() = default; + + /// Fast inplace implementation for regular use. + /// Gets string (data ptr and len) and start position for extracting next token (state of extractor). + /// Returns false if parsing is finished, otherwise returns true. + virtual bool next_in_string(const char* data, size_t length, size_t* __restrict pos, + size_t* __restrict token_start, + size_t* __restrict token_length) const = 0; + + /// Special implementation for creating bloom filter for LIKE function. + /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight. 
+    virtual bool next_in_string_like(const char* data, size_t length, size_t* pos,
+                                     std::string& out) const = 0;
+
+    virtual void string_to_bloom_filter(const char* data, size_t length,
+                                        segment_v2::BloomFilter& bloom_filter) const = 0;
+
+    virtual bool string_like_to_bloom_filter(const char* data, size_t length,
+                                             segment_v2::BloomFilter& bloom_filter) const = 0;
+};
+
+template <typename Derived>
+class ITokenExtractorHelper : public ITokenExtractor {
+public:
+    void string_to_bloom_filter(const char* data, size_t length,
+                                segment_v2::BloomFilter& bloom_filter) const override {
+        size_t cur = 0;
+        size_t token_start = 0;
+        size_t token_len = 0;
+
+        while (cur < length && static_cast<const Derived*>(this)->next_in_string(
+                                       data, length, &cur, &token_start, &token_len))
+            bloom_filter.add_bytes(data + token_start, token_len);
+    }
+
+    bool string_like_to_bloom_filter(const char* data, size_t length,
+                                     segment_v2::BloomFilter& bloom_filter) const override {
+        size_t cur = 0;
+        bool added = false;
+        std::string token;
+        while (cur < length &&
+               static_cast<const Derived*>(this)->next_in_string_like(data, length, &cur, token)) {
+            bloom_filter.add_bytes(token.data(), token.size());
+            added = true;
+        }
+
+        return added;
+    }
+};
+
+/// Parser extracting all ngrams from string.
+struct NgramTokenExtractor final : public ITokenExtractorHelper<NgramTokenExtractor> {
+public:
+    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
+
+    bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
+                        size_t* __restrict token_start,
+                        size_t* __restrict token_length) const override;
+
+    bool next_in_string_like(const char* data, size_t length, size_t* pos,
+                             std::string& token) const override;
+
+private:
+    size_t n;
+};
+} // namespace doris
+
+#endif //DORIS_ITOKEN_EXTRACTOR_H
diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h
index e376fe4dfd..2be7980103 100644
--- a/be/src/olap/like_column_predicate.h
+++ b/be/src/olap/like_column_predicate.h
@@ -47,6 +47,22 @@ public:
     void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size,
                           bool* flags) const override;
 
+    std::string get_search_str() const override {
+        return std::string(reinterpret_cast<const char*>(pattern.ptr), pattern.len);
+    }
+    bool is_opposite() const { return _opposite; }
+
+    void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> src) override {
+        _page_ng_bf = std::move(src);
+    }
+    bool evaluate_and(const BloomFilter* bf) const override {
+        if (_page_ng_bf) {
+            return bf->contains(*_page_ng_bf);
+        }
+        return true;
+    }
+    bool can_do_bloom_filter() const override { return true; }
+
 private:
     template <bool is_and>
     void _evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const {
@@ -130,9 +146,11 @@ private:
 
     StateType* _state;
 
-    // A separate scratch region is required for every concurrent caller of the Hyperscan API.
-    // So here _like_state is separate for each instance of LikeColumnPredicate.
+    // A separate scratch region is required for every concurrent caller of the
+    // Hyperscan API. So here _like_state is separate for each instance of
+    // LikeColumnPredicate.
     vectorized::LikeSearchState _like_state;
+    std::unique_ptr<segment_v2::BloomFilter> _page_ng_bf; // for ngram-bf index
 };
 
-} //namespace doris
+} // namespace doris
diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp
index 7a0df6c66f..fb3015a170 100644
--- a/be/src/olap/reader.cpp
+++ b/be/src/olap/reader.cpp
@@ -22,6 +22,11 @@
 #include "common/status.h"
 #include "exprs/create_predicate_function.h"
 #include "exprs/hybrid_set.h"
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/bloom_filter_predicate.h"
+#include "olap/comparison_predicate.h"
+#include "olap/in_list_predicate.h"
+#include "olap/itoken_extractor.h"
 #include "olap/like_column_predicate.h"
 #include "olap/olap_common.h"
 #include "olap/predicate_creator.h"
@@ -463,8 +468,36 @@ void TabletReader::_init_conditions_param(const ReaderParams& read_params) {
     }
 
     // Function filter push down to storage engine
+    auto is_like_predicate = [](ColumnPredicate* _pred) {
+        if (dynamic_cast<LikeColumnPredicate<true>*>(_pred) ||
+            dynamic_cast<LikeColumnPredicate<false>*>(_pred)) {
+            return true;
+        }
+
+        return false;
+    };
+
     for (const auto& filter : read_params.function_filters) {
         _col_predicates.emplace_back(_parse_to_predicate(filter));
+        auto* pred = _col_predicates.back();
+        const auto& col = _tablet->tablet_schema()->column(pred->column_id());
+        auto is_like = is_like_predicate(pred);
+        auto* tablet_index = _tablet->tablet_schema()->get_ngram_bf_index(col.unique_id());
+
+        if (is_like && tablet_index && config::enable_query_like_bloom_filter) {
+            std::unique_ptr<segment_v2::BloomFilter> ng_bf;
+            std::string pattern = pred->get_search_str();
+            auto gram_bf_size = tablet_index->get_gram_bf_size();
+            auto gram_size = tablet_index->get_gram_size();
+
+            segment_v2::BloomFilter::create(segment_v2::NGRAM_BLOOM_FILTER, &ng_bf, gram_bf_size);
+            NgramTokenExtractor _token_extractor(gram_size);
+
+            if (_token_extractor.string_like_to_bloom_filter(pattern.data(), pattern.length(),
+                                                             *ng_bf)) {
+                pred->set_page_ng_bf(std::move(ng_bf));
+            }
+        }
     }
 }
diff --git a/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h b/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h
index 9d082f7167..93591adc1d 100644
--- a/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h
+++ b/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h
@@ -32,6 +32,7 @@ public:
 
     void add_hash(uint64_t hash) override;
     bool test_hash(uint64_t hash) const override;
+    bool contains(const BloomFilter&) const override { return true; }
 
 private:
     // Bytes in a tiny Bloom filter block.
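For readers of this patch, here is a minimal sketch of how the pieces above cooperate at read time: the reader tokenizes the LIKE pattern into ngrams with NgramTokenExtractor, packs them into an NGRAM_BLOOM_FILTER, and a data page can be pruned only when the page-level filter does not contain every ngram of the pattern. The helper function below is hypothetical (it is not part of this patch); it only reuses the types and signatures introduced above.

```cpp
// Hedged sketch, not part of the patch: page pruning for a LIKE predicate
// using the classes introduced in this PR.
#include <memory>
#include <string>

#include "olap/itoken_extractor.h"
#include "olap/rowset/segment_v2/bloom_filter.h"

namespace doris {

bool page_may_match_like(const segment_v2::BloomFilter& page_bf, const std::string& pattern,
                         uint8_t gram_size, uint16_t gram_bf_size) {
    // Build a bloom filter over the ngrams of the LIKE pattern, skipping
    // unescaped '%' and '_' (see NgramTokenExtractor::next_in_string_like).
    std::unique_ptr<segment_v2::BloomFilter> pattern_bf;
    segment_v2::BloomFilter::create(segment_v2::NGRAM_BLOOM_FILTER, &pattern_bf, gram_bf_size);
    NgramTokenExtractor extractor(gram_size);
    if (!extractor.string_like_to_bloom_filter(pattern.data(), pattern.size(), *pattern_bf)) {
        return true; // pattern yields no full ngram, so the index cannot prune this page
    }
    // The page may contain matches only if its filter contains every ngram of the pattern.
    return page_bf.contains(*pattern_bf);
}

} // namespace doris
```

As noted in the commit message, the session variable ```enable_function_pushdown``` must be set to ```true``` so that the LIKE predicate is pushed down to the storage engine and this check is reached.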
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.cpp b/be/src/olap/rowset/segment_v2/bloom_filter.cpp index cbb5181ac5..667cc9b595 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter.cpp @@ -21,14 +21,18 @@ #include "gen_cpp/segment_v2.pb.h" #include "gutil/strings/substitute.h" #include "olap/rowset/segment_v2/block_split_bloom_filter.h" +#include "olap/rowset/segment_v2/ngram_bloom_filter.h" #include "olap/utils.h" namespace doris { namespace segment_v2 { -Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr* bf) { +Status BloomFilter::create(BloomFilterAlgorithmPB algorithm, std::unique_ptr* bf, + size_t bf_size) { if (algorithm == BLOCK_BLOOM_FILTER) { bf->reset(new BlockSplitBloomFilter()); + } else if (algorithm == NGRAM_BLOOM_FILTER) { + bf->reset(new NGramBloomFilter(bf_size)); } else { return Status::InternalError("invalid bloom filter algorithm:{}", algorithm); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 8796f4c6de..139ff3aea2 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -51,17 +51,26 @@ public: static const uint32_t MAXIMUM_BYTES = 128 * 1024 * 1024; // Factory function for BloomFilter - static Status create(BloomFilterAlgorithmPB algorithm, std::unique_ptr* bf); + static Status create(BloomFilterAlgorithmPB algorithm, std::unique_ptr* bf, + size_t bf_size = 0); BloomFilter() : _data(nullptr), _num_bytes(0), _size(0), _has_null(nullptr) {} - virtual ~BloomFilter() { delete[] _data; } + virtual ~BloomFilter() { + if (_data) { + delete[] _data; + } + } + + virtual bool is_ngram_bf() const { return false; } // for write Status init(uint64_t n, double fpp, HashStrategyPB strategy) { return this->init(optimal_bit_num(n, fpp) / 8, strategy); } + Status init(uint64_t filter_size) { return init(filter_size, HASH_MURMUR3_X64_64); } + Status init(uint64_t filter_size, HashStrategyPB strategy) { if (strategy == HASH_MURMUR3_X64_64) { _hash_func = murmur_hash3_x64_64; @@ -81,7 +90,7 @@ public: // for read // use deep copy to acquire the data - Status init(const char* buf, uint32_t size, HashStrategyPB strategy) { + virtual Status init(const char* buf, uint32_t size, HashStrategyPB strategy) { DCHECK(size > 1); if (strategy == HASH_MURMUR3_X64_64) { _hash_func = murmur_hash3_x64_64; @@ -108,7 +117,7 @@ public: return hash_code; } - void add_bytes(const char* buf, uint32_t size) { + virtual void add_bytes(const char* buf, uint32_t size) { if (buf == nullptr) { *_has_null = true; return; @@ -125,15 +134,19 @@ public: return test_hash(code); } - char* data() const { return _data; } + /// Checks if this contains everything from another bloom filter. + /// Bloom filters must have equal size and seed. 
+ virtual bool contains(const BloomFilter& bf_) const = 0; + + virtual char* data() const { return _data; } uint32_t num_bytes() const { return _num_bytes; } - uint32_t size() const { return _size; } + virtual uint32_t size() const { return _size; } void set_has_null(bool has_null) { *_has_null = has_null; } - bool has_null() const { return *_has_null; } + virtual bool has_null() const { return *_has_null; } virtual void add_hash(uint64_t hash) = 0; virtual bool test_hash(uint64_t hash) const = 0; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index f3779d032c..1f6a733776 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -50,8 +50,8 @@ Status BloomFilterIndexIterator::read_bloom_filter(rowid_t ordinal, RETURN_IF_ERROR(_bloom_filter_iter.next_batch(&num_read, &column_block_view)); DCHECK(num_to_read == num_read); // construct bloom filter - BloomFilter::create(_reader->_bloom_filter_index_meta->algorithm(), bf); const Slice* value_ptr = reinterpret_cast(block.data()); + BloomFilter::create(_reader->_bloom_filter_index_meta->algorithm(), bf, value_ptr->size); RETURN_IF_ERROR((*bf)->init(value_ptr->data, value_ptr->size, _reader->_bloom_filter_index_meta->hash_strategy())); _pool->clear(); diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 5542a6068d..567cf8d795 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -170,6 +170,63 @@ private: } // namespace +NGramBloomFilterIndexWriterImpl::NGramBloomFilterIndexWriterImpl( + const BloomFilterOptions& bf_options, uint8_t gram_size, uint16_t bf_size) + : _bf_options(bf_options), + _gram_size(gram_size), + _bf_size(bf_size), + _bf_buffer_size(0), + _token_extractor(gram_size) { + BloomFilter::create(NGRAM_BLOOM_FILTER, &_bf, bf_size); +} + +void NGramBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) { + const Slice* src = reinterpret_cast(values); + for (int i = 0; i < count; ++i, ++src) { + if (src->size < _gram_size) { + continue; + } + _token_extractor.string_to_bloom_filter(src->data, src->size, *_bf); + } +} + +Status NGramBloomFilterIndexWriterImpl::flush() { + _bf_buffer_size += _bf->size(); + _bfs.emplace_back(std::move(_bf)); + // init new one + RETURN_IF_ERROR(BloomFilter::create(NGRAM_BLOOM_FILTER, &_bf, _bf_size)); + return Status::OK(); +} + +Status NGramBloomFilterIndexWriterImpl::finish(io::FileWriter* file_writer, + ColumnIndexMetaPB* index_meta) { + index_meta->set_type(BLOOM_FILTER_INDEX); + BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index(); + meta->set_hash_strategy(CITY_HASH_64); + meta->set_algorithm(NGRAM_BLOOM_FILTER); + + // write bloom filters + const TypeInfo* bf_typeinfo = get_scalar_type_info(OLAP_FIELD_TYPE_VARCHAR); + IndexedColumnWriterOptions options; + options.write_ordinal_index = true; + options.write_value_index = false; + options.encoding = PLAIN_ENCODING; + IndexedColumnWriter bf_writer(options, bf_typeinfo, file_writer); + RETURN_IF_ERROR(bf_writer.init()); + for (auto& bf : _bfs) { + Slice data(bf->data(), bf->size()); + bf_writer.add(&data); + } + RETURN_IF_ERROR(bf_writer.finish(meta->mutable_bloom_filter())); + return Status::OK(); +} + +uint64_t NGramBloomFilterIndexWriterImpl::size() { + uint64_t total_size = 
_bf_buffer_size; + total_size += _pool.total_allocated_bytes(); + return total_size; +} + // TODO currently we don't support bloom filter index for tinyint/hll/float/double Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options, const TypeInfo* type_info, @@ -203,5 +260,23 @@ Status BloomFilterIndexWriter::create(const BloomFilterOptions& bf_options, return Status::OK(); } +Status NGramBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_options, + const TypeInfo* typeinfo, uint8_t gram_size, + uint16_t gram_bf_size, + std::unique_ptr* res) { + FieldType type = typeinfo->type(); + switch (type) { + case OLAP_FIELD_TYPE_CHAR: + case OLAP_FIELD_TYPE_VARCHAR: + case OLAP_FIELD_TYPE_STRING: + res->reset(new NGramBloomFilterIndexWriterImpl(bf_options, gram_size, gram_bf_size)); + break; + default: + return Status::NotSupported("unsupported type for ngram bloom filter index:{}", + std::to_string(type)); + } + return Status::OK(); +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index 8b9a945e1a..9cc02f4f67 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -23,6 +23,8 @@ #include "common/status.h" #include "gen_cpp/segment_v2.pb.h" #include "gutil/macros.h" +#include "olap/itoken_extractor.h" +#include "runtime/mem_pool.h" namespace doris { @@ -38,7 +40,7 @@ struct BloomFilterOptions; class BloomFilterIndexWriter { public: - static Status create(const BloomFilterOptions& bf_options, const TypeInfo* type_info, + static Status create(const BloomFilterOptions& bf_options, const TypeInfo* typeinfo, std::unique_ptr* res); BloomFilterIndexWriter() = default; @@ -58,5 +60,30 @@ private: DISALLOW_COPY_AND_ASSIGN(BloomFilterIndexWriter); }; +class NGramBloomFilterIndexWriterImpl : public BloomFilterIndexWriter { +public: + static Status create(const BloomFilterOptions& bf_options, const TypeInfo* typeinfo, + uint8_t gram_size, uint16_t gram_bf_size, + std::unique_ptr* res); + + NGramBloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options, uint8_t gram_size, + uint16_t bf_size); + void add_values(const void* values, size_t count) override; + void add_nulls(uint32_t) override {} + Status flush() override; + Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override; + uint64_t size() override; + +private: + BloomFilterOptions _bf_options; + uint8_t _gram_size; + uint16_t _bf_size; + MemPool _pool; + uint64_t _bf_buffer_size; + NgramTokenExtractor _token_extractor; + std::unique_ptr _bf; + std::vector> _bfs; +}; + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 146a29443c..df820b141e 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -299,6 +299,7 @@ Status ScalarColumnWriter::init() { RETURN_IF_ERROR( BitmapIndexWriter::create(get_field()->type_info(), &_bitmap_index_builder)); } + if (_opts.inverted_index) { RETURN_IF_ERROR(InvertedIndexColumnWriter::create( get_field(), &_inverted_index_builder, _opts.meta->unique_id(), @@ -307,8 +308,14 @@ Status ScalarColumnWriter::init() { _file_writer->fs())); } if (_opts.need_bloom_filter) { - RETURN_IF_ERROR(BloomFilterIndexWriter::create( - BloomFilterOptions(), get_field()->type_info(), 
&_bloom_filter_index_builder)); + if (_opts.is_ngram_bf_index) { + RETURN_IF_ERROR(NGramBloomFilterIndexWriterImpl::create( + BloomFilterOptions(), get_field()->type_info(), _opts.gram_size, + _opts.gram_bf_size, &_bloom_filter_index_builder)); + } else { + RETURN_IF_ERROR(BloomFilterIndexWriter::create( + BloomFilterOptions(), get_field()->type_info(), &_bloom_filter_index_builder)); + } } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 748b8d725e..5ea7ae654c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -51,6 +51,9 @@ struct ColumnWriterOptions { bool need_zone_map = false; bool need_bitmap_index = false; bool need_bloom_filter = false; + bool is_ngram_bf_index = false; + uint8_t gram_size; + uint16_t gram_bf_size; std::vector indexes; const TabletIndex* inverted_index = nullptr; std::string to_string() const { diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp new file mode 100644 index 0000000000..a761c5a6e4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/ngram_bloom_filter.h" + +#include "util/cityhash102/city.h" +#include "util/debug_util.h" + +namespace doris { +namespace segment_v2 { + +static constexpr uint64_t SEED_GEN = 217728422; + +NGramBloomFilter::NGramBloomFilter(size_t size) + : _size(size), + words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), + filter(words, 0) {} + +// for read +Status NGramBloomFilter::init(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (size == 0) { + return Status::InvalidArgument(strings::Substitute("invalid size:$0", size)); + } + DCHECK(_size == size); + + if (strategy != CITY_HASH_64) { + return Status::InvalidArgument(strings::Substitute("invalid strategy:$0", strategy)); + } + words = (_size + sizeof(UnderType) - 1) / sizeof(UnderType); + filter.reserve(words); + const UnderType* from = reinterpret_cast(buf); + for (size_t i = 0; i < words; ++i) { + filter[i] = from[i]; + } + + return Status::OK(); +} + +void NGramBloomFilter::add_bytes(const char* data, uint32_t len) { + size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, 0); + size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN); + + for (size_t i = 0; i < HASH_FUNCTIONS; ++i) { + size_t pos = (hash1 + i * hash2 + i * i) % (8 * _size); + filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); + } +} + +bool NGramBloomFilter::contains(const BloomFilter& bf_) const { + const NGramBloomFilter& bf = static_cast(bf_); + for (size_t i = 0; i < words; ++i) { + if ((filter[i] & bf.filter[i]) != bf.filter[i]) { + return false; + } + } + return true; +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h new file mode 100644 index 0000000000..c7684a796f --- /dev/null +++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+#include "olap/rowset/segment_v2/bloom_filter.h"
+
+namespace doris {
+namespace segment_v2 {
+
+class NGramBloomFilter : public BloomFilter {
+public:
+    // Fixed hash function number
+    static const size_t HASH_FUNCTIONS = 2;
+    using UnderType = uint64_t;
+    NGramBloomFilter(size_t size);
+    void add_bytes(const char* data, uint32_t len) override;
+    bool contains(const BloomFilter& bf_) const override;
+    Status init(const char* buf, uint32_t size, HashStrategyPB strategy) override;
+    char* data() const override {
+        return reinterpret_cast<char*>(const_cast<UnderType*>(filter.data()));
+    }
+    uint32_t size() const override { return _size; }
+    void add_hash(uint64_t) override {}
+    bool test_hash(uint64_t hash) const override { return true; }
+    bool has_null() const override { return true; }
+    bool is_ngram_bf() const override { return true; }
+
+private:
+    size_t _size;
+    size_t words;
+    std::vector<UnderType> filter;
+};
+
+} // namespace segment_v2
+} // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index b01ab46dde..02897fd1cd 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -317,6 +317,7 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
                 _opts.col_id_to_predicates[cid].get(), &column_bf_row_ranges));
         RowRanges::ranges_intersection(bf_row_ranges, column_bf_row_ranges, &bf_row_ranges);
     }
+
     size_t pre_size = condition_row_ranges->count();
     RowRanges::ranges_intersection(*condition_row_ranges, bf_row_ranges, condition_row_ranges);
     _opts.stats->rows_bf_filtered += (pre_size - condition_row_ranges->count());
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 9776761aa6..df7f26c34f 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -112,6 +112,14 @@ Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key) {
     // and not support zone map for array type and jsonb type.
opts.need_zone_map = column.is_key() || _tablet_schema->keys_type() != KeysType::AGG_KEYS; opts.need_bloom_filter = column.is_bf_column(); + auto* tablet_index = _tablet_schema->get_ngram_bf_index(column.unique_id()); + if (tablet_index) { + opts.need_bloom_filter = true; + opts.is_ngram_bf_index = true; + opts.gram_size = tablet_index->get_gram_size(); + opts.gram_bf_size = tablet_index->get_gram_bf_size(); + } + opts.need_bitmap_index = column.has_bitmap_index(); bool skip_inverted_index = false; if (_opts.rowset_ctx != nullptr) { diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 99c7523250..e3d4c6f3ff 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -165,6 +165,13 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id column->set_has_bitmap_index(true); break; } + } else if (index.index_type == TIndexType::type::BLOOMFILTER || + index.index_type == TIndexType::type::NGRAM_BF) { + DCHECK_EQ(index.columns.size(), 1); + if (iequal(tcolumn.column_name, index.columns[0])) { + column->set_is_bf_column(true); + break; + } } } } @@ -195,7 +202,11 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id case TIndexType::BLOOMFILTER: index_pb->set_index_type(IndexType::BLOOMFILTER); break; + case TIndexType::NGRAM_BF: + index_pb->set_index_type(IndexType::NGRAM_BF); + break; } + if (index.__isset.properties) { auto properties = index_pb->mutable_properties(); for (auto kv : index.properties) { @@ -284,6 +295,7 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco if (tcolumn.__isset.is_bloom_filter_column) { column->set_is_bf_column(tcolumn.is_bloom_filter_column); } + if (tcolumn.column_type.type == TPrimitiveType::ARRAY) { ColumnPB* children_column = column->add_children_columns(); init_column_from_tcolumn(0, tcolumn.children_column[0], children_column); diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index d684806454..55f030cbb9 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -402,6 +402,7 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { if (column.has_visible()) { _visible = column.visible(); } + if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) { DCHECK(column.children_columns_size() == 1) << "ARRAY type has more than 1 children types."; TabletColumn child_column; @@ -479,6 +480,9 @@ void TabletIndex::init_from_thrift(const TOlapTableIndex& index, case TIndexType::BLOOMFILTER: _index_type = IndexType::BLOOMFILTER; break; + case TIndexType::NGRAM_BF: + _index_type = IndexType::NGRAM_BF; + break; } if (index.__isset.properties) { for (auto kv : index.properties) { @@ -811,6 +815,36 @@ const TabletIndex* TabletSchema::get_inverted_index(int32_t col_unique_id) const return nullptr; } +bool TabletSchema::has_ngram_bf_index(int32_t col_unique_id) const { + // TODO use more efficient impl + for (size_t i = 0; i < _indexes.size(); i++) { + if (_indexes[i].index_type() == IndexType::NGRAM_BF) { + for (int32_t id : _indexes[i].col_unique_ids()) { + if (id == col_unique_id) { + return true; + } + } + } + } + + return false; +} + +const TabletIndex* TabletSchema::get_ngram_bf_index(int32_t col_unique_id) const { + // TODO use more efficient impl + for (size_t i = 0; i < _indexes.size(); i++) { + if (_indexes[i].index_type() == IndexType::NGRAM_BF) { + for (int32_t id : _indexes[i].col_unique_ids()) { + if (id == col_unique_id) { + return &(_indexes[i]); + } + } + } + } + + return nullptr; +} + 
vectorized::Block TabletSchema::create_block( const std::vector& return_columns, const std::unordered_set* tablet_columns_need_convert_null) const { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 20b05d1801..bbbbfc896f 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -69,8 +69,11 @@ public: std::string suffix) const; int precision() const { return _precision; } int frac() const { return _frac; } - bool visible() const { return _visible; } - // Add a sub column. + inline bool visible() const { return _visible; } + + /** + * Add a sub column. + */ void add_sub_column(TabletColumn& sub_column); uint32_t get_subtype_count() const { return _sub_column_count; } @@ -129,6 +132,20 @@ public: const IndexType index_type() const { return _index_type; } const vector& col_unique_ids() const { return _col_unique_ids; } const std::map& properties() const { return _properties; } + int32_t get_gram_size() const { + if (_properties.count("gram_size")) { + return std::stoi(_properties.at("gram_size")); + } + + return 0; + } + int32_t get_gram_bf_size() const { + if (_properties.count("bf_size")) { + return std::stoi(_properties.at("bf_size")); + } + + return 0; + } private: int64_t _index_id; @@ -186,6 +203,8 @@ public: std::vector get_indexes_for_column(int32_t col_unique_id) const; bool has_inverted_index(int32_t col_unique_id) const; const TabletIndex* get_inverted_index(int32_t col_unique_id) const; + bool has_ngram_bf_index(int32_t col_unique_id) const; + const TabletIndex* get_ngram_bf_index(int32_t col_unique_id) const; void update_indexes_from_thrift(const std::vector& indexes); int32_t schema_version() const { return _schema_version; } @@ -250,4 +269,4 @@ bool operator!=(const TabletSchema& a, const TabletSchema& b); using TabletSchemaSPtr = std::shared_ptr; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 067d7442fa..4b67e12ce8 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -106,6 +106,7 @@ set(UTIL_FILES hdfs_util.cpp time_lut.cpp topn_counter.cpp + cityhash102/city.cc tuple_row_zorder_compare.cpp telemetry/telemetry.cpp telemetry/brpc_carrier.cpp diff --git a/be/src/util/cityhash102/city.cc b/be/src/util/cityhash102/city.cc new file mode 100644 index 0000000000..6ad5c37c66 --- /dev/null +++ b/be/src/util/cityhash102/city.cc @@ -0,0 +1,481 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file provides CityHash64() and related functions. +// +// It's probably possible to create even faster hash functions by +// writing a program that systematically explores some of the space of +// possible hash functions, by using SIMD instructions, or by +// compromising on hash quality. + +#include "config.h" +#include "city.h" + +#include +#include // for memcpy and memset + +using namespace std; + + +#if !defined(WORDS_BIGENDIAN) + +#define uint32_in_expected_order(x) (x) +#define uint64_in_expected_order(x) (x) + +#else + +#ifdef _MSC_VER +#include +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#else +#include +#endif + +#define uint32_in_expected_order(x) (bswap_32(x)) +#define uint64_in_expected_order(x) (bswap_64(x)) + +#endif // WORDS_BIGENDIAN + +#if !defined(LIKELY) +#if HAVE_BUILTIN_EXPECT +#define LIKELY(x) (__builtin_expect(!!(x), 1)) +#else +#define LIKELY(x) (x) +#endif +#endif + +namespace CityHash_v1_0_2 +{ + +static uint64 UNALIGNED_LOAD64(const char *p) { + uint64 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +static uint32 UNALIGNED_LOAD32(const char *p) { + uint32 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +static uint64 Fetch64(const char *p) { + return uint64_in_expected_order(UNALIGNED_LOAD64(p)); +} + +static uint32 Fetch32(const char *p) { + return uint32_in_expected_order(UNALIGNED_LOAD32(p)); +} + +// Some primes between 2^63 and 2^64 for various uses. +static const uint64 k0 = 0xc3a5c85c97cb3127ULL; +static const uint64 k1 = 0xb492b66fbe98f273ULL; +static const uint64 k2 = 0x9ae16a3b2f90404fULL; +static const uint64 k3 = 0xc949d7c7509e6557ULL; + +// Bitwise right rotate. Normally this will compile to a single +// instruction, especially if the shift is a manifest constant. +static uint64 Rotate(uint64 val, int shift) { + // Avoid shifting by 64: doing so yields an undefined result. + return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); +} + +// Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. 
+static uint64 RotateByAtLeast1(uint64 val, int shift) { + return (val >> shift) | (val << (64 - shift)); +} + +static uint64 ShiftMix(uint64 val) { + return val ^ (val >> 47); +} + +static uint64 HashLen16(uint64 u, uint64 v) { + return Hash128to64(uint128(u, v)); +} + +static uint64 HashLen0to16(const char *s, size_t len) { + if (len > 8) { + uint64 a = Fetch64(s); + uint64 b = Fetch64(s + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + uint64 a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4)); + } + if (len > 0) { + uint8 a = s[0]; + uint8 b = s[len >> 1]; + uint8 c = s[len - 1]; + uint32 y = static_cast(a) + (static_cast(b) << 8); + uint32 z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; +} + +// This probably works well for 16-byte strings as well, but it may be overkill +// in that case. +static uint64 HashLen17to32(const char *s, size_t len) { + uint64 a = Fetch64(s) * k1; + uint64 b = Fetch64(s + 8); + uint64 c = Fetch64(s + len - 8) * k2; + uint64 d = Fetch64(s + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); +} + +// Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. +static pair WeakHashLen32WithSeeds( + uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) { + a += w; + b = Rotate(b + a + z, 21); + uint64 c = a; + a += x; + a += y; + b += Rotate(a, 44); + return make_pair(a + z, b + c); +} + +// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. +static pair WeakHashLen32WithSeeds( + const char* s, uint64 a, uint64 b) { + return WeakHashLen32WithSeeds(Fetch64(s), + Fetch64(s + 8), + Fetch64(s + 16), + Fetch64(s + 24), + a, + b); +} + +// Return an 8-byte hash for 33 to 64 bytes. +static uint64 HashLen33to64(const char *s, size_t len) { + uint64 z = Fetch64(s + 24); + uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; + uint64 b = Rotate(a + z, 52); + uint64 c = Rotate(a, 37); + a += Fetch64(s + 8); + c += Rotate(a, 7); + a += Fetch64(s + 16); + uint64 vf = a + z; + uint64 vs = b + Rotate(a, 31) + c; + a = Fetch64(s + 16) + Fetch64(s + len - 32); + z = Fetch64(s + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s + len - 24); + c += Rotate(a, 7); + a += Fetch64(s + len - 16); + uint64 wf = a + z; + uint64 ws = b + Rotate(a, 31) + c; + uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +uint64 CityHash64(const char *s, size_t len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + uint64 x = Fetch64(s); + uint64 y = Fetch64(s + len - 16) ^ k1; + uint64 z = Fetch64(s + len - 56) ^ k0; + pair v = WeakHashLen32WithSeeds(s + len - 64, len, y); + pair w = WeakHashLen32WithSeeds(s + len - 32, len * k1, k0); + z += ShiftMix(v.second) * k1; + x = Rotate(z + x, 39) * k1; + y = Rotate(y, 33) * k1; + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
+ len = (len - 1) & ~static_cast(63); + do { + x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); + std::swap(z, x); + s += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); +} + +uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) { + return CityHash64WithSeeds(s, len, k2, seed); +} + +uint64 CityHash64WithSeeds(const char *s, size_t len, + uint64 seed0, uint64 seed1) { + return HashLen16(CityHash64(s, len) - seed0, seed1); +} + +// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in ssize_t. Based on City and Murmur. +static uint128 CityMurmur(const char *s, size_t len, uint128 seed) { + uint64 a = Uint128Low64(seed); + uint64 b = Uint128High64(seed); + uint64 c = 0; + uint64 d = 0; + ssize_t l = len - 16; + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, len); + d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s + len - 8) + k1, a); + d = HashLen16(b + len, c + Fetch64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return uint128(a ^ b, HashLen16(b, a)); +} + +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) { + if (len < 128) { + return CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair v, w; + uint64 x = Uint128Low64(seed); + uint64 y = Uint128High64(seed); + uint64 z = len * k1; + v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); + v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); + w.first = Rotate(y + z, 35) * k1 + x; + w.second = Rotate(x + Fetch64(s + 88), 53) * k1; + + // This is the same inner loop as CityHash64(), manually unrolled. + do { + x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); + std::swap(z, x); + s += 64; + x = Rotate(x + y + v.first + Fetch64(s + 16), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); + std::swap(z, x); + s += 64; + len -= 128; + } while (LIKELY(len >= 128)); + y += Rotate(w.first, 37) * k0 + z; + x += Rotate(v.first + z, 49) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (size_t tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(y - x, 42) * k0 + v.second; + w.first += Fetch64(s + len - tail_done + 16); + x = Rotate(x, 49) * k0 + w.first; + w.first += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first, v.second); + } + // At this point our 48 bytes of state should contain more than + // enough information for a strong 128-bit hash. 
We use two + // different 48-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x, v.first); + y = HashLen16(y, w.first); + return uint128(HashLen16(x + v.second, w.second) + y, + HashLen16(x + w.second, y + v.second)); +} + +uint128 CityHash128(const char *s, size_t len) { + if (len >= 16) { + return CityHash128WithSeed(s + 16, + len - 16, + uint128(Fetch64(s) ^ k3, + Fetch64(s + 8))); + } else if (len >= 8) { + return CityHash128WithSeed(NULL, + 0, + uint128(Fetch64(s) ^ (len * k0), + Fetch64(s + len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, len, uint128(k0, k1)); + } +} + +} + +#ifdef __SSE4_2__ +#include "citycrc.h" +#include + +namespace CityHash_v1_0_2 +{ + +// Requires len >= 240. +static void CityHashCrc256Long(const char *s, size_t len, + uint32 seed, uint64 *result) { + uint64 a = Fetch64(s + 56) + k0; + uint64 b = Fetch64(s + 96) + k0; + uint64 c = result[1] = HashLen16(b, len); + uint64 d = result[2] = Fetch64(s + 120) * k0 + len; + uint64 e = Fetch64(s + 184) + seed; + uint64 f = seed; + uint64 g = 0; + uint64 h = 0; + uint64 i = 0; + uint64 j = 0; + uint64 t = c + d; + + // 240 bytes of input per iter. + size_t iters = len / 240; + len -= iters * 240; + do { +#define CHUNK(multiplier, z) \ + { \ + uint64 old_a = a; \ + a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ + b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ + c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ + d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ + e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ + t = old_a; \ + } \ + f = _mm_crc32_u64(f, a); \ + g = _mm_crc32_u64(g, b); \ + h = _mm_crc32_u64(h, c); \ + i = _mm_crc32_u64(i, d); \ + j = _mm_crc32_u64(j, e); \ + s += 40 + + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + } while (--iters > 0); + j += i << 32; + a = HashLen16(a, j); + h += g << 32; + b = b * k0 + h; + c = HashLen16(c, f) + i; + d = HashLen16(d, e); + pair v(j + e, HashLen16(h, t)); + h = v.second + f; + // If 0 < len < 240, hash chunks of 32 bytes each from the end of s. + for (size_t tail_done = 0; tail_done < len; ) { + tail_done += 32; + c = Rotate(c - a, 42) * k0 + v.second; + d += Fetch64(s + len - tail_done + 16); + a = Rotate(a, 49) * k0 + d; + d += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first, v.second); + } + + // Final mix. + e = HashLen16(a, d) + v.first; + f = HashLen16(b, c) + a; + g = HashLen16(v.first, v.second) + c; + result[0] = e + f + g + h; + a = ShiftMix((a + g) * k0) * k0 + b; + result[1] += a + result[0]; + a = ShiftMix(a * k0) * k0 + c; + result[2] += a + result[1]; + a = ShiftMix((a + e) * k0) * k0; + result[3] = a + result[2]; +} + +// Requires len < 240. 
+static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) { + char buf[240]; + memcpy(buf, s, len); + memset(buf + len, 0, 240 - len); + CityHashCrc256Long(buf, 240, ~static_cast(len), result); +} + +void CityHashCrc256(const char *s, size_t len, uint64 *result) { + if (LIKELY(len >= 240)) { + CityHashCrc256Long(s, len, 0, result); + } else { + CityHashCrc256Short(s, len, result); + } +} + +uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) { + if (len <= 900) { + return CityHash128WithSeed(s, len, seed); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + uint64 u = Uint128High64(seed) + result[0]; + uint64 v = Uint128Low64(seed) + result[1]; + return uint128(HashLen16(u, v + result[2]), + HashLen16(Rotate(v, 32), u * k0 + result[3])); + } +} + +uint128 CityHashCrc128(const char *s, size_t len) { + if (len <= 900) { + return CityHash128(s, len); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + return uint128(result[2], result[3]); + } +} + +} + +#endif diff --git a/be/src/util/cityhash102/city.h b/be/src/util/cityhash102/city.h new file mode 100644 index 0000000000..77d4c988cd --- /dev/null +++ b/be/src/util/cityhash102/city.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file provides a few functions for hashing strings. On x86-64 +// hardware in 2011, CityHash64() is faster than other high-quality +// hash functions, such as Murmur. This is largely due to higher +// instruction-level parallelism. CityHash64() and CityHash128() also perform +// well on hash-quality tests. +// +// CityHash128() is optimized for relatively long strings and returns +// a 128-bit hash. For strings more than about 2000 bytes it can be +// faster than CityHash64(). +// +// Functions in the CityHash family are not suitable for cryptography. +// +// WARNING: This code has not been tested on big-endian platforms! +// It is known to work well on little-endian platforms that have a small penalty +// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. +// +// By the way, for some hash functions, given strings a and b, the hash +// of a+b is easily derived from the hashes of a and b. This property +// doesn't hold for any hash functions in this file. + +#ifndef CITY_HASH_H_ +#define CITY_HASH_H_ + +#include // for size_t. 
+#include +#include + +/** This is a version of CityHash that predates v1.0.3 algorithm change. + * Why we need exactly this version? + * Although hash values of CityHash are not recommended for storing persistently anywhere, + * it has already been used this way in ClickHouse: + * - for calculation of checksums of compressed chunks and for data parts; + * - this version of CityHash is exposed in cityHash64 function in ClickHouse SQL language; + * - and already used by many users for data ordering, sampling and sharding. + */ +namespace CityHash_v1_0_2 +{ + +typedef uint8_t uint8; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef std::pair uint128; + + +inline uint64 Uint128Low64(const uint128& x) { return x.first; } +inline uint64 Uint128High64(const uint128& x) { return x.second; } + +// Hash function for a byte array. +uint64 CityHash64(const char *buf, size_t len); + +// Hash function for a byte array. For convenience, a 64-bit seed is also +// hashed into the result. +uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed); + +// Hash function for a byte array. For convenience, two seeds are also +// hashed into the result. +uint64 CityHash64WithSeeds(const char *buf, size_t len, + uint64 seed0, uint64 seed1); + +// Hash function for a byte array. +uint128 CityHash128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed); + +// Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. +inline uint64 Hash128to64(const uint128& x) { + // Murmur-inspired hashing. + const uint64 kMul = 0x9ddfea08eb382d69ULL; + uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >> 47); + uint64 b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + +} + +#endif // CITY_HASH_H_ diff --git a/be/src/util/cityhash102/citycrc.h b/be/src/util/cityhash102/citycrc.h new file mode 100644 index 0000000000..3ec72cc887 --- /dev/null +++ b/be/src/util/cityhash102/citycrc.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file declares the subset of the CityHash functions that require +// _mm_crc32_u64(). See the CityHash README for details. +// +// Functions in the CityHash family are not suitable for cryptography. 
+ +#ifndef CITY_HASH_CRC_H_ +#define CITY_HASH_CRC_H_ + +#include "city.h" + +namespace CityHash_v1_0_2 +{ + +// Hash function for a byte array. +uint128 CityHashCrc128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. +uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed); + +// Hash function for a byte array. Sets result[0] ... result[3]. +void CityHashCrc256(const char *s, size_t len, uint64 *result); + +} + +#endif // CITY_HASH_CRC_H_ diff --git a/be/src/util/cityhash102/config.h b/be/src/util/cityhash102/config.h new file mode 100644 index 0000000000..cca744a35c --- /dev/null +++ b/be/src/util/cityhash102/config.h @@ -0,0 +1,125 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* Define to 1 if the compiler supports __builtin_expect. */ +#if _MSC_VER +#define HAVE_BUILTIN_EXPECT 0 +#else +#define HAVE_BUILTIN_EXPECT 1 +#endif + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "cityhash-discuss@googlegroups.com" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "CityHash" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "CityHash 1.0.2" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "cityhash" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.0.2" + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define for Solaris 2.5.1 so the uint32_t typedef from , + , or is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +/* #undef _UINT32_T */ + +/* Define for Solaris 2.5.1 so the uint64_t typedef from , + , or is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +/* #undef _UINT64_T */ + +/* Define for Solaris 2.5.1 so the uint8_t typedef from , + , or is not used. If the typedef were allowed, the + #define below would cause a syntax error. 
*/ +/* #undef _UINT8_T */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to `unsigned int' if does not define. */ +/* #undef size_t */ + +/* Define to `int' if does not define. */ +/* #undef ssize_t */ + +/* Define to the type of an unsigned integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +/* #undef uint32_t */ + +/* Define to the type of an unsigned integer type of width exactly 64 bits if + such a type exists and the standard includes do not define it. */ +/* #undef uint64_t */ + +/* Define to the type of an unsigned integer type of width exactly 8 bits if + such a type exists and the standard includes do not define it. */ +/* #undef uint8_t */ + +#ifdef _MSC_VER + #include + typedef SSIZE_T ssize_t; +#else + #include +#endif diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index e48d677eb4..3c1a4e7f32 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -43,6 +43,10 @@ static constexpr std::array UTF8_BYTE_LENGTH = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6}; +inline uint8_t get_utf8_byte_length(uint8_t character) { + return UTF8_BYTE_LENGTH[character]; +} + namespace simd { class VStringFunctions { diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index ce33cd1fda..0d57b06bc1 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -96,6 +96,7 @@ set(OLAP_TEST_FILES olap/byte_buffer_test.cpp olap/lru_cache_test.cpp olap/bloom_filter_test.cpp + olap/itoken_extractor_test.cpp olap/file_helper_test.cpp olap/file_utils_test.cpp olap/cumulative_compaction_policy_test.cpp diff --git a/be/test/olap/itoken_extractor_test.cpp b/be/test/olap/itoken_extractor_test.cpp new file mode 100644 index 0000000000..d57682a1ee --- /dev/null +++ b/be/test/olap/itoken_extractor_test.cpp @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "olap/itoken_extractor.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "common/logging.h"
+#include "util/utf8_check.h"
+
+namespace doris {
+
+class TestITokenExtractor : public testing::Test {
+public:
+    void SetUp() {}
+    void TearDown() {}
+};
+
+void runNextInString(const ITokenExtractor& extractor, std::string statement,
+                     std::vector<std::string> expect) {
+    ASSERT_TRUE(validate_utf8(statement.c_str(), statement.length()));
+
+    std::vector<std::string> actual;
+    actual.reserve(expect.size());
+    size_t pos = 0;
+    size_t token_start = 0;
+    size_t token_length = 0;
+    while (extractor.next_in_string(statement.c_str(), statement.size(), &pos, &token_start,
+                                    &token_length)) {
+        actual.push_back(statement.substr(token_start, token_length));
+    }
+    ASSERT_EQ(expect, actual);
+}
+
+void runNextInStringLike(const ITokenExtractor& extractor, std::string statement,
+                         std::vector<std::string> expect) {
+    std::vector<std::string> actual;
+    actual.reserve(expect.size());
+    size_t pos = 0;
+    std::string str;
+    while (extractor.next_in_string_like(statement.c_str(), statement.length(), &pos, str)) {
+        actual.push_back(str);
+    }
+    ASSERT_EQ(expect, actual);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor) {
+    std::string statement = u8"预计09发布i13手机。";
+    std::vector<std::string> expect = {u8"预计", u8"计0", u8"09", u8"9发", u8"发布", u8"布i",
+                                       u8"i1", u8"13", u8"3手", u8"手机", u8"机。"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor) {
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, u8"%手机%", {u8"手机"});
+    runNextInStringLike(ngram_extractor, u8"%机%", {});
+    runNextInStringLike(ngram_extractor, {u8"i_%手机%"}, {u8"手机"});
+    runNextInStringLike(ngram_extractor, {u8"\\_手机%"}, {u8"_手", u8"手机"});
+}
+} // namespace doris
diff --git a/be/test/olap/rowset/segment_v2/block_bloom_filter_test.cpp b/be/test/olap/rowset/segment_v2/block_bloom_filter_test.cpp
index 9c3f15dcd6..e486122530 100644
--- a/be/test/olap/rowset/segment_v2/block_bloom_filter_test.cpp
+++ b/be/test/olap/rowset/segment_v2/block_bloom_filter_test.cpp
@@ -174,5 +174,34 @@ TEST_F(BlockBloomFilterTest, slice) {
     EXPECT_FALSE(bf->test_bytes(s.data, s.size));
 }
 
+// Test contains
+TEST_F(BlockBloomFilterTest, contains) {
+    std::unique_ptr<BloomFilter> bf1;
+    auto st1 = BloomFilter::create(NGRAM_BLOOM_FILTER, &bf1, 512);
+    ASSERT_TRUE(st1.ok());
+    ASSERT_NE(nullptr, bf1);
+    ASSERT_TRUE(st1.ok());
+    ASSERT_TRUE(bf1->size() > 0);
+
+    std::unique_ptr<BloomFilter> bf2;
+    auto st2 = BloomFilter::create(NGRAM_BLOOM_FILTER, &bf2, 512);
+    ASSERT_TRUE(st2.ok());
+    ASSERT_NE(nullptr, bf2);
+    ASSERT_TRUE(st2.ok());
+    ASSERT_TRUE(bf2->size() > 0);
+
+    std::vector<std::string> str_list = {"abc", "csx", "d2", "csxx", "vaa"};
+    for (int i = 0; i < str_list.size(); ++i) {
+        auto str = str_list[i];
+        bf1->add_bytes(str.data(), str.size());
+        if (1 == i % 2) {
+            bf2->add_bytes(str.data(), str.size());
+        }
+    }
+
+    ASSERT_TRUE(bf1->contains(*bf2));
+    ASSERT_FALSE(bf2->contains(*bf1));
+}
+
 } // namespace segment_v2
 } // namespace doris
diff --git a/docs/en/docs/data-table/index/ngram-bloomfilter-index.md b/docs/en/docs/data-table/index/ngram-bloomfilter-index.md
new file mode 100644
index 0000000000..331b78468d
--- /dev/null
+++ b/docs/en/docs/data-table/index/ngram-bloomfilter-index.md
@@ -0,0 +1,79 @@
+---
+{
+    "title": "NGram BloomFilter Index",
+    "language": "en"
+}
+---
+
+
+
+# Doris NGram BloomFilter Index
+
+To improve LIKE query performance, Doris provides the NGram BloomFilter index, whose implementation is modeled on ClickHouse's ngrambf skip index.
+
+## Create Column With NGram BloomFilter Index
+
+Specify the index when creating the table:
+
+```sql
+CREATE TABLE `table3` (
+  `siteid` int(11) NULL DEFAULT "10" COMMENT "",
+  `citycode` smallint(6) NULL COMMENT "",
+  `username` varchar(100) NULL DEFAULT "" COMMENT "",
+  INDEX idx_ngrambf (`username`) USING NGRAM_BF PROPERTIES("gram_size"="3", "bf_size"="256") COMMENT 'username ngram_bf index'
+) ENGINE=OLAP
+AGGREGATE KEY(`siteid`, `citycode`, `username`) COMMENT "OLAP"
+DISTRIBUTED BY HASH(`siteid`) BUCKETS 10
+PROPERTIES (
+"replication_num" = "1"
+);
+
+-- PROPERTIES("gram_size"="3", "bf_size"="256") specifies the gram length (characters per gram) and the bloom filter size in bytes, respectively.
+-- Set gram_size to the typical length of the LIKE pattern strings in your queries. A suitable bf_size can be found by testing: larger values generally filter better, and 256 is a reasonable starting point.
+-- If the column's cardinality is low, you can increase bf_size to improve filtering efficiency.
+```
+
+## Show NGram BloomFilter Index
+
+```sql
+show index from example_db.table3;
+```
+
+## Drop NGram BloomFilter Index
+
+
+```sql
+alter table example_db.table3 drop index idx_ngrambf;
+```
+
+## Add NGram BloomFilter Index
+
+Add an NGram BloomFilter index to an existing column:
+
+```sql
+alter table example_db.table3 add index idx_ngrambf(username) using NGRAM_BF PROPERTIES("gram_size"="3", "bf_size"="512") COMMENT 'username ngram_bf index';
+```
+
+## **Some notes about Doris NGram BloomFilter**
+
+1. The NGram BloomFilter index only supports CHAR/VARCHAR/STRING columns.
+2. The NGram BloomFilter index and the BloomFilter index are mutually exclusive: a column can have at most one of them.
+3. Both the gram size and the BloomFilter size can be tuned. For example, if the gram size is small, you can increase the BloomFilter size.
+4. To check whether a query uses the NGram BloomFilter index, inspect the query profile. An example query is shown below.
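+
+## Example LIKE Query
+
+The sketch below is illustrative only: the inserted rows are made-up sample data for the `table3` created above. It shows the kind of LIKE predicate the NGram BloomFilter index is designed to speed up; since the index above uses gram_size=3, the pattern should contain at least 3 consecutive literal characters.
+
+```sql
+-- Hypothetical sample data.
+INSERT INTO table3 VALUES (10, 1, 'doris_user_01'), (11, 1, 'flink_writer'), (12, 2, 'spark_reader');
+
+-- A LIKE query whose literal pattern is at least gram_size characters long can be pruned by the index.
+-- Whether the index was actually used can be confirmed in the query profile.
+SELECT * FROM table3 WHERE username LIKE '%doris%';
+```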
diff --git a/docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md b/docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md
new file mode 100644
index 0000000000..5c94803f53
--- /dev/null
+++ b/docs/zh-CN/docs/data-table/index/ngram-bloomfilter-index.md
@@ -0,0 +1,81 @@
+---
+{
+    "title": "NGram BloomFilter索引",
+    "language": "zh-CN"
+}
+---
+
+
+
+# Doris NGram BloomFilter索引及使用场景
+
+为了提升like查询的性能,Doris增加了NGram BloomFilter索引,其实现主要参照了ClickHouse的ngrambf跳数索引。
+
+## NGram BloomFilter创建
+
+表创建时指定:
+
+```sql
+CREATE TABLE `table3` (
+  `siteid` int(11) NULL DEFAULT "10" COMMENT "",
+  `citycode` smallint(6) NULL COMMENT "",
+  `username` varchar(32) NULL DEFAULT "" COMMENT "",
+  INDEX idx_ngrambf (`username`) USING NGRAM_BF PROPERTIES("gram_size"="3", "bf_size"="256") COMMENT 'username ngram_bf index'
+) ENGINE=OLAP
+AGGREGATE KEY(`siteid`, `citycode`, `username`) COMMENT "OLAP"
+DISTRIBUTED BY HASH(`siteid`) BUCKETS 10
+PROPERTIES (
+"replication_num" = "1"
+);
+
+-- PROPERTIES("gram_size"="3", "bf_size"="256"),分别表示gram的长度(即每个gram的字符数)和bloom filter的字节数。
+-- gram的长度跟实际查询场景相关,通常设置为大部分查询字符串的长度;bloom filter的字节数可以通过测试得出,通常越大过滤效果越好,可以从256开始进行验证测试,当然字节数越大也会带来索引存储、内存开销的上升。
+-- 如果数据基数比较高,字节数可以不用设置过大;如果基数不是很高,可以通过增加字节数来提升过滤效果。
+```
+
+## 查看NGram BloomFilter索引
+
+使用下面的语句可以查看表上已建立的NGram BloomFilter索引:
+
+```sql
+show index from example_db.table3;
+```
+
+## 删除NGram BloomFilter索引
+
+
+```sql
+alter table example_db.table3 drop index idx_ngrambf;
+```
+
+## 增加NGram BloomFilter索引
+
+为已有列新增NGram BloomFilter索引:
+
+```sql
+alter table example_db.table3 add index idx_ngrambf(username) using NGRAM_BF PROPERTIES("gram_size"="2", "bf_size"="512") COMMENT 'username ngram_bf index';
+```
+
+## **Doris NGram BloomFilter使用注意事项**
+
+1. NGram BloomFilter只支持字符串列
+2. NGram BloomFilter索引和BloomFilter索引为互斥关系,即同一个列只能设置两者中的一个
+3. NGram大小和BloomFilter的字节数,可以根据实际情况调优,如果NGram比较小,可以适当增加BloomFilter大小
+4. 
如果要查看某个查询是否命中了NGram Bloom Filter索引,可以通过查询的Profile信息查看 diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index ac678a2898..b6bba6a5d7 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -267,6 +267,7 @@ terminal String KW_BINLOG, KW_BITMAP, KW_BITMAP_UNION, + KW_NGRAM_BF, KW_BLOB, KW_BOOLEAN, KW_BROKER, @@ -2143,7 +2144,7 @@ opt_password_lock_time ::= | KW_PASSWORD_LOCK_TIME passwd_lock_time_opt:opt {: RESULT = opt; - :} + :} ; passwd_lock_time_opt ::= @@ -3270,6 +3271,10 @@ opt_index_type ::= {: RESULT = IndexDef.IndexType.BITMAP; :} + | KW_USING KW_NGRAM_BF + {: + RESULT = IndexDef.IndexType.NGRAM_BF; + :} | KW_USING KW_INVERTED {: RESULT = IndexDef.IndexType.INVERTED; @@ -5662,10 +5667,10 @@ func_args_def ::= cast_expr ::= KW_CAST LPAREN expr:e KW_AS type_def:targetType RPAREN - {: - CastExpr castExpr = new CastExpr(targetType, e); + {: + CastExpr castExpr = new CastExpr(targetType, e); if (targetType.getType().getLength() != -1 - && (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR + && (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR || targetType.getType().getPrimitiveType() == PrimitiveType.CHAR)) { // transfer cast(xx as char(N)/varchar(N)) to substr(cast(xx as char), 1, N) // this is just a workaround to make the result correct @@ -6504,6 +6509,8 @@ keyword ::= {: RESULT = id; :} | KW_BITMAP_UNION:id {: RESULT = id; :} + | KW_NGRAM_BF:id + {: RESULT = id; :} | KW_QUANTILE_UNION:id {: RESULT = id; :} | KW_BLOB:id diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index 1a0489ea58..de171e25c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -1247,6 +1247,8 @@ public class SchemaChangeHandler extends AlterHandler { bfFpp = 0; } + Index.checkConflict(newSet, bfColumns); + // property 3: timeout long timeoutSecond = PropertyAnalyzer.analyzeTimeout(propertyMap, Config.alter_table_timeout_second); @@ -2058,9 +2060,13 @@ public class SchemaChangeHandler extends AlterHandler { } Set existedIdxColSet = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER); existedIdxColSet.addAll(existedIdx.getColumns()); - if (newColset.equals(existedIdxColSet)) { + if (existedIdx.getIndexType() == indexDef.getIndexType() && newColset.equals(existedIdxColSet)) { throw new DdlException( - "index for columns (" + String.join(",", indexDef.getColumns()) + " ) already exist."); + indexDef.getIndexType() + + " index for columns (" + + String.join(",", indexDef.getColumns()) + + " ) already exist." + ); } } @@ -2069,7 +2075,7 @@ public class SchemaChangeHandler extends AlterHandler { if (column != null) { indexDef.checkColumn(column, olapTable.getKeysType()); } else { - throw new DdlException("BITMAP column does not exist in table. invalid column: " + col); + throw new DdlException("index column does not exist in table. 
invalid column: " + col); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java index 4df18a9879..ed03dbd84e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/IndexDef.java @@ -37,6 +37,11 @@ public class IndexDef { private String comment; private Map properties; + public static final String NGRAM_SIZE_KEY = "gram_size"; + public static final String NGRAM_BF_SIZE_KEY = "bf_size"; + public static final String DEFAULT_NGRAM_SIZE = "2"; + public static final String DEFAULT_NGRAM_BF_SIZE = "256"; + public IndexDef(String indexName, boolean ifNotExists, List columns, IndexType indexType, Map properties, String comment) { this.indexName = indexName; @@ -57,6 +62,10 @@ public class IndexDef { } else { this.properties = properties; } + if (indexType == IndexType.NGRAM_BF) { + properties.putIfAbsent(NGRAM_SIZE_KEY, DEFAULT_NGRAM_SIZE); + properties.putIfAbsent(NGRAM_BF_SIZE_KEY, DEFAULT_NGRAM_BF_SIZE); + } } public void analyze() throws AnalysisException { @@ -155,6 +164,7 @@ public class IndexDef { BITMAP, INVERTED, BLOOMFILTER, + NGRAM_BF } public boolean isInvertedIndex() { @@ -162,7 +172,8 @@ public class IndexDef { } public void checkColumn(Column column, KeysType keysType) throws AnalysisException { - if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER) { + if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER + || indexType == IndexType.NGRAM_BF) { String indexColName = column.getName(); PrimitiveType colType = column.getDataType(); if (!(colType.isDateType() || colType.isDecimalV2Type() || colType.isDecimalV3Type() @@ -177,6 +188,31 @@ public class IndexDef { if (indexType == IndexType.INVERTED) { InvertedIndexUtil.checkInvertedIndexParser(indexColName, colType, properties); + } else if (indexType == IndexType.NGRAM_BF) { + if (colType != PrimitiveType.CHAR && colType != PrimitiveType.VARCHAR + && colType != PrimitiveType.STRING) { + throw new AnalysisException(colType + " is not supported in ngram_bf index. " + + "invalid column: " + indexColName); + } else if ((keysType == KeysType.AGG_KEYS && !column.isKey())) { + throw new AnalysisException( + "ngram_bf index only used in columns of DUP_KEYS/UNIQUE_KEYS table or key columns of" + + " AGG_KEYS table. 
invalid column: " + indexColName); + } + if (properties.size() != 2) { + throw new AnalysisException("ngram_bf index should have gram_size and bf_size properties"); + } + try { + int ngramSize = Integer.parseInt(properties.get(NGRAM_SIZE_KEY)); + int bfSize = Integer.parseInt(properties.get(NGRAM_BF_SIZE_KEY)); + if (ngramSize > 256 || ngramSize < 1) { + throw new AnalysisException("gram_size should be integer and less than 256"); + } + if (bfSize > 65536 || bfSize < 64) { + throw new AnalysisException("bf_size should be integer and between 64 and 65536"); + } + } catch (NumberFormatException e) { + throw new AnalysisException("invalid ngram properties:" + e.getMessage(), e); + } } } else { throw new AnalysisException("Unsupported index type: " + indexType); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java index 2da8ce35ff..b7058d2d20 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java @@ -19,6 +19,7 @@ package org.apache.doris.catalog; import org.apache.doris.analysis.IndexDef; import org.apache.doris.analysis.InvertedIndexUtil; +import org.apache.doris.common.AnalysisException; import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.common.util.PrintableMap; @@ -32,9 +33,13 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; /** * Internal representation of index, including index type, name, columns and comments. @@ -197,4 +202,31 @@ public class Index implements Writable { } return tIndex; } + + public static void checkConflict(Collection indices, Set bloomFilters) throws AnalysisException { + indices = indices == null ? Collections.emptyList() : indices; + bloomFilters = bloomFilters == null ? 
Collections.emptySet() : bloomFilters; + Set bfColumns = new HashSet<>(); + for (Index index : indices) { + if (IndexDef.IndexType.NGRAM_BF == index.getIndexType() + || IndexDef.IndexType.BLOOMFILTER == index.getIndexType()) { + for (String column : index.getColumns()) { + column = column.toLowerCase(); + if (bfColumns.contains(column)) { + throw new AnalysisException(column + " should have only one ngram bloom filter index or bloom " + + "filter index"); + } + bfColumns.add(column); + } + } + } + for (String column : bloomFilters) { + column = column.toLowerCase(); + if (bfColumns.contains(column)) { + throw new AnalysisException(column + " should have only one ngram bloom filter index or bloom " + + "filter index"); + } + bfColumns.add(column); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index bed44c6997..e68b82ec5d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -1918,6 +1918,8 @@ public class InternalCatalog implements CatalogIf { throw new DdlException(e.getMessage()); } + Index.checkConflict(stmt.getIndexes(), bfColumns); + olapTable.setReplicationAllocation(replicaAlloc); // set in memory diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 2b0b02ecc6..730e6affb3 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -120,6 +120,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("bitmap", new Integer(SqlParserSymbols.KW_BITMAP)); keywordMap.put("inverted", new Integer(SqlParserSymbols.KW_INVERTED)); keywordMap.put("bitmap_union", new Integer(SqlParserSymbols.KW_BITMAP_UNION)); + keywordMap.put("ngram_bf", new Integer(SqlParserSymbols.KW_NGRAM_BF)); keywordMap.put("blob", new Integer(SqlParserSymbols.KW_BLOB)); keywordMap.put("boolean", new Integer(SqlParserSymbols.KW_BOOLEAN)); keywordMap.put("broker", new Integer(SqlParserSymbols.KW_BROKER)); diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 3e20ff8345..ab0791006f 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -190,6 +190,7 @@ enum IndexType { BITMAP = 0; INVERTED = 1; BLOOMFILTER = 2; + NGRAM_BF = 3; } message TabletIndexPB { diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 7a69286046..629f1a4fc8 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -265,11 +265,13 @@ message BitmapIndexPB { enum HashStrategyPB { HASH_MURMUR3_X64_64 = 0; + CITY_HASH_64 = 1; } enum BloomFilterAlgorithmPB { BLOCK_BLOOM_FILTER = 0; CLASSIC_BLOOM_FILTER = 1; + NGRAM_BLOOM_FILTER = 2; } message BloomFilterIndexPB { diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index d133b3a27d..1db9bd0537 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -34,6 +34,9 @@ struct TColumn { 10: optional list children_column 11: optional i32 col_unique_id = -1 12: optional bool has_bitmap_index = false + 13: optional bool has_ngram_bf_index = false + 14: optional i32 gram_size + 15: optional i32 gram_bf_size } struct TSlotDescriptor { @@ -120,7 +123,8 @@ enum THdfsCompression { enum TIndexType { BITMAP, INVERTED, - BLOOMFILTER + BLOOMFILTER, + NGRAM_BF } // Mapping from names defined by Avro to the enum.