From d72fbdf4254558140160a289d7b0ebed4f04aae9 Mon Sep 17 00:00:00 2001 From: Dayue Gao Date: Wed, 20 Nov 2019 13:51:21 +0800 Subject: [PATCH] Support bitmap index build (#2050) This PR implements the build part of bitmap index support. It follows most of the design described in #1684 , but with the following differences and enhancements 1. Bitmap indexes are now written in the segment file for simplicity. Separate index file would be helpful when we're going to support `alter table add bitmap index` in the future though. 2. We switch to a generalized index page format for all data types rather than specialize for each one. Code simplicity and reusability is preferred here than optimal compression rate. 3. We introduce a new abstraction called `IndexedColumn` to unify the processing of the dictionary section and bitmap section of bitmap index. IndexedColumn is a column with an optional ordinal index and an optional value index. Ordinal index enables us to seek to a particular rowid within the column. Value index requires IndexedColumn to store ordered values and enables us to seek to a particular value. Therefore, the dictionary section can be represented by an IndexedColumn with value index and the bitmap section can be represented by an IndexedColumn with ordinal index. --- be/src/olap/CMakeLists.txt | 4 + be/src/olap/key_coder.cpp | 7 +- be/src/olap/key_coder.h | 53 ++++- .../rowset/segment_v2/binary_dict_page.cpp | 4 +- .../rowset/segment_v2/binary_plain_page.h | 1 + .../rowset/segment_v2/bitmap_index_reader.cpp | 26 +++ .../rowset/segment_v2/bitmap_index_reader.h | 28 +++ .../rowset/segment_v2/bitmap_index_writer.cpp | 206 ++++++++++++++++++ .../rowset/segment_v2/bitmap_index_writer.h | 51 +++++ .../olap/rowset/segment_v2/bitshuffle_page.h | 2 +- .../olap/rowset/segment_v2/column_writer.cpp | 23 ++ be/src/olap/rowset/segment_v2/column_writer.h | 5 + .../olap/rowset/segment_v2/encoding_info.cpp | 85 +++++--- be/src/olap/rowset/segment_v2/encoding_info.h | 8 +- be/src/olap/rowset/segment_v2/index_page.cpp | 124 +++++++++++ be/src/olap/rowset/segment_v2/index_page.h | 142 ++++++++++++ .../segment_v2/indexed_column_reader.cpp | 72 ++++++ .../rowset/segment_v2/indexed_column_reader.h | 93 ++++++++ .../segment_v2/indexed_column_writer.cpp | 188 ++++++++++++++++ .../rowset/segment_v2/indexed_column_writer.h | 125 +++++++++++ be/src/olap/rowset/segment_v2/page_builder.h | 4 +- be/src/olap/rowset/segment_v2/page_decoder.h | 18 ++ be/src/olap/rowset/segment_v2/page_pointer.h | 7 + be/src/olap/rowset/segment_v2/rle_page.h | 11 +- .../olap/rowset/segment_v2/segment_writer.cpp | 11 +- .../olap/rowset/segment_v2/segment_writer.h | 1 + be/src/util/coding.h | 26 +++ gensrc/proto/segment_v2.proto | 62 +++++- 28 files changed, 1327 insertions(+), 60 deletions(-) create mode 100644 be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp create mode 100644 be/src/olap/rowset/segment_v2/bitmap_index_reader.h create mode 100644 be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp create mode 100644 be/src/olap/rowset/segment_v2/bitmap_index_writer.h create mode 100644 be/src/olap/rowset/segment_v2/index_page.cpp create mode 100644 be/src/olap/rowset/segment_v2/index_page.h create mode 100644 be/src/olap/rowset/segment_v2/indexed_column_reader.cpp create mode 100644 be/src/olap/rowset/segment_v2/indexed_column_reader.h create mode 100644 be/src/olap/rowset/segment_v2/indexed_column_writer.cpp create mode 100644 be/src/olap/rowset/segment_v2/indexed_column_writer.h diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index 17c7ef425e..903e85e21a 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -82,11 +82,15 @@ add_library(Olap STATIC types.cpp utils.cpp wrapper_field.cpp + rowset/segment_v2/bitmap_index_writer.cpp rowset/segment_v2/bitshuffle_page.cpp rowset/segment_v2/bitshuffle_wrapper.cpp rowset/segment_v2/column_reader.cpp rowset/segment_v2/column_writer.cpp rowset/segment_v2/encoding_info.cpp + rowset/segment_v2/index_page.cpp + rowset/segment_v2/indexed_column_reader.cpp + rowset/segment_v2/indexed_column_writer.cpp rowset/segment_v2/ordinal_page_index.cpp rowset/segment_v2/page_compression.cpp rowset/segment_v2/binary_dict_page.cpp diff --git a/be/src/olap/key_coder.cpp b/be/src/olap/key_coder.cpp index 8624e25f08..ed181a28f3 100644 --- a/be/src/olap/key_coder.cpp +++ b/be/src/olap/key_coder.cpp @@ -22,9 +22,10 @@ namespace doris { template -KeyCoder::KeyCoder(TraitsType traits) - : _encode_ascending(traits.encode_ascending), - _decode_ascending(traits.decode_ascending) { +KeyCoder::KeyCoder(TraitsType traits) + : _full_encode_ascending(traits.full_encode_ascending), + _encode_ascending(traits.encode_ascending), + _decode_ascending(traits.decode_ascending) { } struct EnumClassHash { diff --git a/be/src/olap/key_coder.h b/be/src/olap/key_coder.h index 5c79fec95e..12cfc5bdfb 100644 --- a/be/src/olap/key_coder.h +++ b/be/src/olap/key_coder.h @@ -30,25 +30,36 @@ namespace doris { using strings::Substitute; +using FullEncodeAscendingFunc = void (*) (const void* value, std::string* buf); using EncodeAscendingFunc = void (*)(const void* value, size_t index_size, std::string* buf); using DecodeAscendingFunc = Status (*)(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr, MemPool* pool); -// Helper class that is used to encode types of value in memory format -// into a sorted binary. For example, this class will encode unsigned -// integer to bit endian format which can compare with memcmp. +// Order-preserving binary encoding for values of a particular type so that +// those values can be compared by memcpy their encoded bytes. +// +// To obtain instance of this class, use the `get_key_coder(FieldType)` method. class KeyCoder { public: template KeyCoder(TraitsType traits); + // encode the provided `value` into `buf`. + void full_encode_ascending(const void* value, std::string* buf) const { + _full_encode_ascending(value, buf); + } + + // similar to `full_encode_ascending`, but only encode part (the first `index_size` bytes) of the value. + // only applicable to string type void encode_ascending(const void* value, size_t index_size, std::string* buf) const { _encode_ascending(value, index_size, buf); } + Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr, MemPool* pool) const { return _decode_ascending(encoded_key, index_size, cell_ptr, pool); } private: + FullEncodeAscendingFunc _full_encode_ascending; EncodeAscendingFunc _encode_ascending; DecodeAscendingFunc _decode_ascending; }; @@ -83,7 +94,7 @@ private: } public: - static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + static void full_encode_ascending(const void* value, std::string* buf) { UnsignedCppType unsigned_val; memcpy(&unsigned_val, value, sizeof(unsigned_val)); // swap MSB to encode integer @@ -96,6 +107,10 @@ public: buf->append((char*)&unsigned_val, sizeof(unsigned_val)); } + static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + full_encode_ascending(value, buf); + } + static Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr, MemPool* pool) { if (encoded_key->size < sizeof(UnsignedCppType)) { @@ -122,7 +137,7 @@ public: using UnsignedCppType = typename CppTypeTraits::UnsignedCppType; public: - static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + static void full_encode_ascending(const void* value, std::string* buf) { UnsignedCppType unsigned_val; memcpy(&unsigned_val, value, sizeof(unsigned_val)); // make it bigendian @@ -130,6 +145,10 @@ public: buf->append((char*)&unsigned_val, sizeof(unsigned_val)); } + static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + full_encode_ascending(value, buf); + } + static Status decode_ascending(Slice* encoded_key, size_t index_size, uint8_t* cell_ptr, MemPool* pool) { if (encoded_key->size < sizeof(UnsignedCppType)) { @@ -149,15 +168,15 @@ public: template<> class KeyCoderTraits { public: - static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + static void full_encode_ascending(const void* value, std::string* buf) { decimal12_t decimal_val; memcpy(&decimal_val, value, sizeof(decimal12_t)); - // encode integer - KeyCoderTraits::encode_ascending( - &decimal_val.integer, sizeof(decimal_val.integer), buf); - // encode integer - KeyCoderTraits::encode_ascending( - &decimal_val.fraction, sizeof(decimal_val.fraction), buf); + KeyCoderTraits::full_encode_ascending(&decimal_val.integer, buf); + KeyCoderTraits::full_encode_ascending(&decimal_val.fraction, buf); + } + + static void encode_ascending(const void* value, size_t index_size, std::string* buf) { + full_encode_ascending(value, buf); } static Status decode_ascending(Slice* encoded_key, size_t index_size, @@ -175,6 +194,11 @@ public: template<> class KeyCoderTraits { public: + static void full_encode_ascending(const void* value, std::string* buf) { + auto slice = reinterpret_cast(value); + buf->append(slice->get_data(), slice->get_size()); + } + static void encode_ascending(const void* value, size_t index_size, std::string* buf) { const Slice* slice = (const Slice*)value; CHECK(index_size <= slice->size) << "index size is larger than char size, index=" << index_size << ", char=" << slice->size; @@ -200,6 +224,11 @@ public: template<> class KeyCoderTraits { public: + static void full_encode_ascending(const void* value, std::string* buf) { + auto slice = reinterpret_cast(value); + buf->append(slice->get_data(), slice->get_size()); + } + static void encode_ascending(const void* value, size_t index_size, std::string* buf) { const Slice* slice = (const Slice*)value; size_t copy_size = std::min(index_size, slice->size); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 1653fd2c2d..fff5a38947 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -17,12 +17,10 @@ #include "olap/rowset/segment_v2/binary_dict_page.h" +#include "common/logging.h" #include "util/slice.h" // for Slice #include "gutil/strings/substitute.h" // for Substitute -#include "runtime/mem_pool.h" - #include "olap/rowset/segment_v2/bitshuffle_page.h" -#include "olap/rowset/segment_v2/rle_page.h" namespace doris { namespace segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index e52c927951..8402a7f548 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -28,6 +28,7 @@ #pragma once +#include "common/logging.h" #include "util/coding.h" #include "util/faststring.h" #include "olap/olap_common.h" diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp new file mode 100644 index 0000000000..5bd1523611 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/bitmap_index_reader.h" + +namespace doris { +namespace segment_v2 { + + + +} // namespace segment_v2 +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h new file mode 100644 index 0000000000..44834faebf --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace doris { +namespace segment_v2 { + +class BitmapIndexReader { + +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp new file mode 100644 index 0000000000..4ee3b12ac4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/bitmap_index_writer.h" + +#include +#include + +#include "env/env.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/indexed_column_writer.h" +#include "olap/types.h" +#include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" +#include "util/faststring.h" +#include "util/slice.h" + +namespace doris { +namespace segment_v2 { + +namespace { + +template +struct BitmapIndexTraits { + using MemoryIndexType = std::map; +}; + +template<> +struct BitmapIndexTraits { + using MemoryIndexType = std::map; +}; + +// Builder for bitmap index. Bitmap index is comprised of two parts +// - an "ordered dictionary" which contains all distinct values of a column and maps each value to an id. +// the smallest value mapped to 0, second value mapped to 1, .. +// - a posting list which stores one bitmap for each value in the dictionary. each bitmap is used to represent +// the list of rowid where a particular value exists. +// +// E.g, if the column contains 10 rows ['x', 'x', 'x', 'b', 'b', 'b', 'x', 'b', 'b', 'b'], +// then the ordered dictionary would be ['b', 'x'] which maps 'b' to 0 and 'x' to 1, +// and the posting list would contain two bitmaps +// bitmap for ID 0 : [0 0 0 1 1 1 0 1 1 1] +// bitmap for ID 1 : [1 1 1 0 0 0 1 0 0 0] +// the n-th bit is set to 1 if the n-th row equals to the corresponding value. +// +template +class BitmapIndexWriterImpl : public BitmapIndexWriter { +public: + using CppType = typename CppTypeTraits::CppType; + using MemoryIndexType = typename BitmapIndexTraits::MemoryIndexType; + + explicit BitmapIndexWriterImpl(const TypeInfo* typeinfo) + : _typeinfo(typeinfo), _tracker(), _pool(&_tracker) {} + + ~BitmapIndexWriterImpl() = default; + + void add_values(const void* values, size_t count) override { + auto p = reinterpret_cast(values); + for (size_t i = 0; i < count; ++i) { + add_value(*p); + p++; + } + } + + void add_value(const CppType& value) { + auto it = _mem_index.find(value); + if (it != _mem_index.end()) { + // exiting value, update bitmap + it->second.add(_rid); + } else { + // new value, copy value and insert new key->bitmap pair + CppType new_value; + _typeinfo->deep_copy(&new_value, &value, &_pool); + _mem_index.insert({new_value, Roaring::bitmapOf(1, _rid)}); + } + _rid++; + } + + void add_nulls(uint32_t count) override { + _null_bitmap.addRange(_rid, _rid + count); + _rid += count; + } + + Status finish(WritableFile* file, BitmapIndexColumnPB* meta) override { + meta->set_bitmap_type(BitmapIndexColumnPB::ROARING_BITMAP); + meta->set_has_null(!_null_bitmap.isEmpty()); + + { // write dictionary + IndexedColumnWriterOptions options; + options.write_ordinal_index = false; + options.write_value_index = true; + options.encoding = EncodingInfo::get_default_encoding(_typeinfo, true); + options.compression = LZ4F; + + IndexedColumnWriter dict_column_writer(options, _typeinfo, file); + RETURN_IF_ERROR(dict_column_writer.init()); + for (auto const& it : _mem_index) { + RETURN_IF_ERROR(dict_column_writer.add(&(it.first))); + } + RETURN_IF_ERROR(dict_column_writer.finish(meta->mutable_dict_column())); + } + { // write bitmaps + std::vector bitmaps; + for (auto& it : _mem_index) { + bitmaps.push_back(&(it.second)); + } + if (!_null_bitmap.isEmpty()) { + bitmaps.push_back(&_null_bitmap); + } + + uint32_t max_bitmap_size = 0; + std::vector bitmap_sizes; + for (auto& bitmap : bitmaps) { + bitmap->runOptimize(); + uint32_t bitmap_size = bitmap->getSizeInBytes(false); + if (max_bitmap_size < bitmap_size) { + max_bitmap_size = bitmap_size; + } + bitmap_sizes.push_back(bitmap_size); + } + + const TypeInfo* bitmap_typeinfo = get_type_info(OLAP_FIELD_TYPE_OBJECT); + + IndexedColumnWriterOptions options; + options.write_ordinal_index = true; + options.write_value_index = false; + options.encoding = EncodingInfo::get_default_encoding(bitmap_typeinfo, false); + // we already store compressed bitmap, use NO_COMPRESSION to save some cpu + options.compression = NO_COMPRESSION; + + IndexedColumnWriter bitmap_column_writer(options, bitmap_typeinfo, file); + RETURN_IF_ERROR(bitmap_column_writer.init()); + + faststring buf; + buf.reserve(max_bitmap_size); + for (size_t i = 0; i < bitmaps.size(); ++i) { + buf.resize(bitmap_sizes[i]); // so that buf[0..size) can be read and written + bitmaps[i]->write(reinterpret_cast(buf.data()), false); + Slice buf_slice(buf); + RETURN_IF_ERROR(bitmap_column_writer.add(&buf_slice)); + } + RETURN_IF_ERROR(bitmap_column_writer.finish(meta->mutable_bitmap_column())); + } + return Status::OK(); + } + +private: + const TypeInfo* _typeinfo; + rowid_t _rid = 0; + // row id list for null value + Roaring _null_bitmap; + // unique value to its row id list + MemoryIndexType _mem_index; + MemTracker _tracker; + MemPool _pool; +}; + +} // namespace + +Status BitmapIndexWriter::create(const TypeInfo* typeinfo, std::unique_ptr* res) { + FieldType type = typeinfo->type(); + switch (type) { + case OLAP_FIELD_TYPE_TINYINT: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_SMALLINT: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_INT: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_UNSIGNED_INT: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_BIGINT: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_CHAR: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + case OLAP_FIELD_TYPE_VARCHAR: + res->reset(new BitmapIndexWriterImpl(typeinfo)); + break; + default: + return Status::NotSupported("unsupported type for bitmap index: " + std::to_string(type)); + } + return Status::OK(); +} + +} // segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.h b/be/src/olap/rowset/segment_v2/bitmap_index_writer.h new file mode 100644 index 0000000000..fd5b537a24 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "common/status.h" +#include "gen_cpp/segment_v2.pb.h" +#include "gutil/macros.h" + +namespace doris { + +class TypeInfo; +class WritableFile; + +namespace segment_v2 { + +class BitmapIndexWriter { +public: + static Status create(const TypeInfo* typeinfo, std::unique_ptr* res); + + BitmapIndexWriter() = default; + virtual ~BitmapIndexWriter() = default; + + virtual void add_values(const void* values, size_t count) = 0; + + virtual void add_nulls(uint32_t count) = 0; + + virtual Status finish(WritableFile* file, BitmapIndexColumnPB* meta) = 0; +private: + DISALLOW_COPY_AND_ASSIGN(BitmapIndexWriter); +}; + +} // segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h index 070319caa7..430445ed5c 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -173,7 +173,7 @@ private: CppType cell(int idx) const { DCHECK_GE(idx, 0); CppType ret; - memcpy(&ret, &_data[idx * SIZE_OF_TYPE], sizeof(CppType)); + memcpy(&ret, &_data[idx * SIZE_OF_TYPE], SIZE_OF_TYPE); return ret; } diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index dda5d3cf9c..971077a1b4 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -22,6 +22,7 @@ #include "common/logging.h" // for LOG #include "env/env.h" // for LOG #include "gutil/strings/substitute.h" // for Substitute +#include "olap/rowset/segment_v2/bitmap_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo #include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexBuilder @@ -114,6 +115,9 @@ Status ColumnWriter::init() { if (_opts.need_zone_map) { _column_zone_map_builder.reset(new ColumnZoneMapBuilder(_field.get())); } + if (_opts.need_bitmap_index) { + RETURN_IF_ERROR(BitmapIndexWriter::create(_field->type_info(), &_bitmap_index_builder)); + } return Status::OK(); } @@ -123,6 +127,9 @@ Status ColumnWriter::append_nulls(size_t num_rows) { if (_opts.need_zone_map) { RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1)); } + if (_opts.need_bitmap_index) { + _bitmap_index_builder->add_nulls(num_rows); + } return Status::OK(); } @@ -141,6 +148,9 @@ Status ColumnWriter::_append_data(const uint8_t** ptr, size_t num_rows) { if (_opts.need_zone_map) { RETURN_IF_ERROR(_column_zone_map_builder->add(*ptr, num_written)); } + if (_opts.need_bitmap_index) { + _bitmap_index_builder->add_values(*ptr, num_written); + } bool is_page_full = (num_written < remaining); remaining -= num_written; @@ -172,6 +182,9 @@ Status ColumnWriter::append_nullable( if (_opts.need_zone_map) { RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1)); } + if (_opts.need_bitmap_index) { + _bitmap_index_builder->add_nulls(this_run); + } } else { RETURN_IF_ERROR(_append_data(&ptr, this_run)); } @@ -236,6 +249,13 @@ Status ColumnWriter::write_zone_map() { return Status::OK(); } +Status ColumnWriter::write_bitmap_index() { + if (!_opts.need_bitmap_index) { + return Status::OK(); + } + return _bitmap_index_builder->finish(_output_file, &_bitmap_index_meta); +} + void ColumnWriter::write_meta(ColumnMetaPB* meta) { meta->set_type(_field->type()); meta->set_encoding(_opts.encoding_type); @@ -249,6 +269,9 @@ void ColumnWriter::write_meta(ColumnMetaPB* meta) { if (_encoding_info->encoding() == DICT_ENCODING) { _dict_page_pp.to_proto(meta->mutable_dict_page()); } + if (_opts.need_bitmap_index) { + meta->mutable_bitmap_index()->CopyFrom(_bitmap_index_meta); + } } // write a page into file and update ordinal index diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 052f10bb08..50b662277e 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -43,8 +43,10 @@ struct ColumnWriterOptions { // space saving = 1 - compressed_size / uncompressed_size double compression_min_space_saving = 0.1; bool need_zone_map = false; + bool need_bitmap_index = false; }; +class BitmapIndexWriter; class EncodingInfo; class NullBitmapBuilder; class OrdinalPageIndexBuilder; @@ -96,6 +98,7 @@ public: Status write_data(); Status write_ordinal_index(); Status write_zone_map(); + Status write_bitmap_index(); void write_meta(ColumnMetaPB* meta); private: @@ -148,6 +151,8 @@ private: std::unique_ptr _ordinal_index_builder; std::unique_ptr _column_zone_map_builder; std::unique_ptr _field; + std::unique_ptr _bitmap_index_builder; + BitmapIndexColumnPB _bitmap_index_meta; PagePointer _ordinal_index_pp; PagePointer _zone_map_pp; diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp index b2ea47ac2c..4edd500550 100644 --- a/be/src/olap/rowset/segment_v2/encoding_info.cpp +++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp @@ -17,10 +17,12 @@ #include "olap/rowset/segment_v2/encoding_info.h" +#include #include "olap/olap_common.h" #include "olap/rowset/segment_v2/binary_dict_page.h" #include "olap/rowset/segment_v2/binary_plain_page.h" #include "olap/rowset/segment_v2/bitshuffle_page.h" +#include "olap/rowset/segment_v2/frame_of_reference_page.h" #include "olap/rowset/segment_v2/plain_page.h" #include "olap/rowset/segment_v2/rle_page.h" #include "gutil/strings/substitute.h" @@ -34,11 +36,14 @@ struct EncodingMapHash { } }; -template +template struct TypeEncodingTraits { }; -template -struct TypeEncodingTraits { +template +struct TypeEncodingTraits { static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { *builder = new PlainPageBuilder(opts); return Status::OK(); @@ -50,7 +55,7 @@ struct TypeEncodingTraits { }; template -struct TypeEncodingTraits { +struct TypeEncodingTraits { static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { *builder = new BinaryPlainPageBuilder(opts); return Status::OK(); @@ -61,8 +66,9 @@ struct TypeEncodingTraits { } }; -template -struct TypeEncodingTraits { +template +struct TypeEncodingTraits::value>::type> { static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { *builder = new BitshufflePageBuilder(opts); return Status::OK(); @@ -73,20 +79,20 @@ struct TypeEncodingTraits { } }; -template -struct TypeEncodingTraits { +template<> +struct TypeEncodingTraits { static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { - *builder = new RlePageBuilder(opts); + *builder = new RlePageBuilder(opts); return Status::OK(); } static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) { - *decoder = new RlePageDecoder(data, opts); + *decoder = new RlePageDecoder(data, opts); return Status::OK(); } }; template -struct TypeEncodingTraits { +struct TypeEncodingTraits { static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { *builder = new BinaryDictPageBuilder(opts); return Status::OK(); @@ -97,10 +103,23 @@ struct TypeEncodingTraits { } }; -template -struct EncodingTraits : TypeEncodingTraits { - static const FieldType type = Type; - static const EncodingTypePB encoding = Encoding; +template +struct TypeEncodingTraits::value>::type> { + static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) { + *builder = new FrameOfReferencePageBuilder(opts); + return Status::OK(); + } + static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) { + *decoder = new FrameOfReferencePageDecoder(data, opts); + return Status::OK(); + } +}; + +template +struct EncodingTraits : TypeEncodingTraits::CppType> { + static const FieldType type = field_type; + static const EncodingTypePB encoding = encoding_type; }; @@ -109,42 +128,53 @@ public: EncodingInfoResolver(); ~EncodingInfoResolver(); - EncodingTypePB get_default_encoding_type(FieldType type) const { - auto it = _default_encoding_type_map.find(type); - if (it != std::end(_default_encoding_type_map)) { + EncodingTypePB get_default_encoding(FieldType type, bool optimize_value_seek) const { + auto& encoding_map = optimize_value_seek ? _value_seek_encoding_map : _default_encoding_type_map; + auto it = encoding_map.find(type); + if (it != encoding_map.end()) { return it->second; } - return DEFAULT_ENCODING; + return UNKNOWN_ENCODING; } Status get(FieldType data_type, EncodingTypePB encoding_type, const EncodingInfo** out); private: - template + template void _add_map() { EncodingTraits traits; std::unique_ptr encoding(new EncodingInfo(traits)); if (_default_encoding_type_map.find(type) == std::end(_default_encoding_type_map)) { _default_encoding_type_map[type] = encoding_type; } + if (optimize_value_seek && _value_seek_encoding_map.find(type) == _value_seek_encoding_map.end()) { + _value_seek_encoding_map[type] = encoding_type; + } auto key = std::make_pair(type, encoding_type); _encoding_map.emplace(key, encoding.release()); } std::unordered_map> _default_encoding_type_map; + // default encoding for each type which optimizes value seek + std::unordered_map> _value_seek_encoding_map; + std::unordered_map, EncodingInfo*, EncodingMapHash> _encoding_map; }; EncodingInfoResolver::EncodingInfoResolver() { _add_map(); + _add_map(); _add_map(); _add_map(); + _add_map(); _add_map(); _add_map(); + _add_map(); _add_map(); _add_map(); + _add_map(); _add_map(); _add_map(); _add_map(); @@ -153,10 +183,10 @@ EncodingInfoResolver::EncodingInfoResolver() { _add_map(); _add_map(); _add_map(); - _add_map(); + _add_map(); _add_map(); - _add_map(); - _add_map(); + _add_map(); + _add_map(); _add_map(); _add_map(); _add_map(); @@ -166,7 +196,8 @@ EncodingInfoResolver::EncodingInfoResolver() { _add_map(); _add_map(); _add_map(); - _add_map(); + _add_map(); + _add_map(); } EncodingInfoResolver::~EncodingInfoResolver() { @@ -181,7 +212,7 @@ Status EncodingInfoResolver::get( EncodingTypePB encoding_type, const EncodingInfo** out) { if (encoding_type == DEFAULT_ENCODING) { - encoding_type = get_default_encoding_type(data_type); + encoding_type = get_default_encoding(data_type, false); } auto key = std::make_pair(data_type, encoding_type); auto it = _encoding_map.find(key); @@ -210,8 +241,8 @@ Status EncodingInfo::get(const TypeInfo* type_info, return s_encoding_info_resolver.get(type_info->type(), encoding_type, out); } -EncodingTypePB EncodingInfo::get_default_encoding_type(const TypeInfo* type_info) { - return s_encoding_info_resolver.get_default_encoding_type(type_info->type()); +EncodingTypePB EncodingInfo::get_default_encoding(const TypeInfo* type_info, bool optimize_value_seek) { + return s_encoding_info_resolver.get_default_encoding(type_info->type(), optimize_value_seek); } } diff --git a/be/src/olap/rowset/segment_v2/encoding_info.h b/be/src/olap/rowset/segment_v2/encoding_info.h index 00f3e23991..1647cb0ea7 100644 --- a/be/src/olap/rowset/segment_v2/encoding_info.h +++ b/be/src/olap/rowset/segment_v2/encoding_info.h @@ -40,8 +40,10 @@ public: static Status get(const TypeInfo* type_info, EncodingTypePB encoding_type, const EncodingInfo** encoding); - // Get default type info - static EncodingTypePB get_default_encoding_type(const TypeInfo* type_info); + + // optimize_value_search: whether the encoding scheme should optimize for ordered data + // and support fast value seek operation + static EncodingTypePB get_default_encoding(const TypeInfo* type_info, bool optimize_value_seek); Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) const { return _create_builder_func(opts, builder); @@ -55,7 +57,7 @@ private: friend class EncodingInfoResolver; template - EncodingInfo(TypeEncodingTraits traits); + explicit EncodingInfo(TypeEncodingTraits traits); using CreateBuilderFunc = std::function; CreateBuilderFunc _create_builder_func; diff --git a/be/src/olap/rowset/segment_v2/index_page.cpp b/be/src/olap/rowset/segment_v2/index_page.cpp new file mode 100644 index 0000000000..14a4732a3f --- /dev/null +++ b/be/src/olap/rowset/segment_v2/index_page.cpp @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/index_page.h" + +#include + +#include "common/logging.h" +#include "util/coding.h" + +namespace doris { +namespace segment_v2 { + +void IndexPageBuilder::add(const Slice& key, const PagePointer& ptr) { + DCHECK(!_finished) << "must reset() after finish() to add new entry"; + _entry_offsets.push_back(_buffer.size()); + put_length_prefixed_slice(&_buffer, key); + ptr.encode_to(&_buffer); +} + +bool IndexPageBuilder::is_full() const { + // estimate size of IndexPageFooterPB to be 16 + return _buffer.size() + _entry_offsets.size() * sizeof(uint32_t) + 16 > _index_page_size; +} + +Slice IndexPageBuilder::finish() { + DCHECK(!_finished) << "already called finish()"; + for (uint32_t offset : _entry_offsets) { + put_fixed32_le(&_buffer, offset); + } + IndexPageFooterPB footer; + footer.set_num_entries(_entry_offsets.size()); + footer.set_type(_is_leaf ? IndexPageFooterPB::LEAF : IndexPageFooterPB::INTERNAL); + + std::string footer_buf; + footer.SerializeToString(&footer_buf); + _buffer.append(footer_buf); + put_fixed32_le(&_buffer, footer_buf.size()); + return Slice(_buffer); +} + +Status IndexPageBuilder::get_first_key(Slice* key) const { + if (_entry_offsets.empty()) { + return Status::NotFound("index page is empty"); + } + Slice input(_buffer); + if (get_length_prefixed_slice(&input, key)) { + return Status::OK(); + } else { + return Status::Corruption("can't decode first key"); + } +} + +/////////////////////////////////////////////////////////////////////////////// + +IndexPageReader::IndexPageReader() : _parsed(false) { +} + +Status IndexPageReader::parse(const Slice& data) { + return Status(); // FIXME +} + +size_t IndexPageReader::count() const { + CHECK(_parsed) << "not parsed"; + return _footer.num_entries(); +} + +bool IndexPageReader::is_leaf() const { + CHECK(_parsed) << "not parsed"; + return _footer.type() == IndexPageFooterPB::LEAF; +} + +void IndexPageReader::reset() { + _parsed = false; +} + +/////////////////////////////////////////////////////////////////////////////// + + +IndexPageIterator::IndexPageIterator(const IndexPageReader* reader) + : _reader(reader), + _seeked(false), + _cur_idx(-1) { +} + +void IndexPageIterator::reset() { + _seeked = false; + _cur_idx = -1; +} + +Status IndexPageIterator::seek_at_or_before(const Slice& search_key) { + return Status(); // FIXME +} + +Status IndexPageIterator::seek_ordinal(size_t idx) { + return Status(); // FIXME +} + +const Slice& IndexPageIterator::current_key() const { + CHECK(_seeked) << "not seeked"; + return _cur_key; +} + +const PagePointer& IndexPageIterator::current_page_pointer() const { + CHECK(_seeked) << "not seeked"; + return _cur_ptr; +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/index_page.h b/be/src/olap/rowset/segment_v2/index_page.h new file mode 100644 index 0000000000..1d4bbba920 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/index_page.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "gen_cpp/segment_v2.pb.h" +#include "gutil/macros.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "util/faststring.h" +#include "util/slice.h" + +namespace doris { +namespace segment_v2 { + +class IndexPageIterator; // forward decl. + +// IndexPage is the building block for IndexedColumn's ordinal index and value index. +// It is used to guide searching for a particular key to the data page containing it. +// We use the same general format for all index pages, regardless of the data type and node type (leaf or internal) +// IndexPage := IndexEntry^NumEntry, StartOffset(4)^NumEntry, IndexPageFooterPB, IndexPageFooterPBSize(4) +// IndexEntry := IndexKey, PagePointer +// IndexKey := KeyLength(vint32), KeyData(KeyLength bytes) +// PagePointer := PageOffset(vint64), PageSize(vint32) +// +// IndexPageFooterPB records NumEntry and type (leaf/internal) of the index page. +// For leaf, IndexKey records the first/smallest key of the data page PagePointer points to. +// For internal, IndexKey records the first/smallest key of the next-level index page PagePointer points to. +// +// All keys are treated as binary string and compared with memcpy. Keys of other data type are encoded first by +// KeyCoder, e.g., ordinal index's original key type is uint32_t but is encoded to binary string. +class IndexPageBuilder { +public: + explicit IndexPageBuilder(size_t index_page_size, bool is_leaf) + : _index_page_size(index_page_size), _is_leaf(is_leaf) { + } + + void add(const Slice& key, const PagePointer& ptr); + + bool is_full() const; + + size_t count() const { return _entry_offsets.size(); } + + Slice finish(); + + // Return the key of the first entry in this index block. + // The pointed-to data is only valid until the next call to this builder. + Status get_first_key(Slice* key) const; + + void reset() { + _finished = false; + _buffer.clear(); + _entry_offsets.clear(); + } + +private: + DISALLOW_COPY_AND_ASSIGN(IndexPageBuilder); + const size_t _index_page_size; + const bool _is_leaf; + // is the builder currently between finish() and reset()? + bool _finished = false; + faststring _buffer; + std::vector _entry_offsets; +}; + +class IndexPageReader { +public: + IndexPageReader(); + + Status parse(const Slice& data); + + size_t count() const; + + bool is_leaf() const; + + void reset(); +private: + friend class IndexPageIterator; + bool _parsed; + + // valid only when `_parsed == true` + Slice _data; + IndexPageFooterPB _footer; + const uint8_t* _entry_offsets; +}; + +class IndexPageIterator { +public: + explicit IndexPageIterator(const IndexPageReader* reader); + + // Reset the state of this iterator. This should be used + // after the associated 'reader' parses a different page. + void reset(); + + // Find the highest index entry which has a key <= the given key. + // If such a entry is found, returns OK status. + // Otherwise Status::NotFound is returned. (i.e the smallest key in the + // index is still larger than the provided key) + // + // If this function returns an error, then the state of this + // iterator is undefined (i.e it may or may not have moved + // since the previous call) + Status seek_at_or_before(const Slice& search_key); + + Status seek_ordinal(size_t idx); + + bool has_next() const { return _cur_idx + 1 < _reader->count(); } + + Status next() { return seek_ordinal(_cur_idx + 1); } + + const Slice& current_key() const; + + const PagePointer& current_page_pointer() const; +private: + const IndexPageReader* _reader; + + bool _seeked; + size_t _cur_idx; + Slice _cur_key; + PagePointer _cur_ptr; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp new file mode 100644 index 0000000000..359492fffd --- /dev/null +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/indexed_column_reader.h" + +namespace doris { +namespace segment_v2 { + +IndexedColumnReader::IndexedColumnReader(RandomAccessFile* file) { + // TODO +} +Status IndexedColumnReader::init() { + return Status(); // TODO +} +Status IndexedColumnReader::new_iterator(std::unique_ptr* iter) { + return Status(); // TODO +} +Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* ret) { + return Status(); // TODO +} +const IndexedColumnMetaPB& IndexedColumnReader::meta() const { + static IndexedColumnMetaPB temp; + return temp; // TODO +} +bool IndexedColumnReader::has_ordinal_index() const { + return false; // TODO +} +bool IndexedColumnReader::has_value_index() const { + return false; // TODO +} + +/////////////////////////////////////////////////////////////////////////////// + + +IndexedColumnIterator::IndexedColumnIterator(IndexedColumnReader* reader) { + // TODO +} +IndexedColumnIterator::~IndexedColumnIterator() { + // TODO +} +Status IndexedColumnIterator::seek_to_first() { + return Status(); // TODO +} +Status IndexedColumnIterator::seek_to_ordinal(rowid_t idx) { + return Status(); // TODO +} +Status IndexedColumnIterator::seek_at_or_after(const Slice& key, bool* exact_match) { + return Status(); // TODO +} +bool IndexedColumnIterator::seeked() const { + return false; // TODO +} +rowid_t IndexedColumnIterator::get_current_ordinal() const { + return 0; // TODO +} + +} // namespace segment_v2 +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h new file mode 100644 index 0000000000..a5b8a8ba2d --- /dev/null +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/status.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/page_handle.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "util/slice.h" + +namespace doris { + +class RandomAccessFile; + +namespace segment_v2 { + +class IndexedColumnIterator; + +// thread-safe reader for IndexedColumn (see comments of `IndexedColumnWriter` to understand what IndexedColumn is) +class IndexedColumnReader { +public: + explicit IndexedColumnReader(RandomAccessFile* file); + + Status init(); + + Status new_iterator(std::unique_ptr* iter); + + Status read_page(const PagePointer& pp, PageHandle* ret); + + const IndexedColumnMetaPB& meta() const; + + bool has_ordinal_index() const; + + bool has_value_index() const; + +private: + +}; + +class IndexedColumnIterator { +public: + explicit IndexedColumnIterator(IndexedColumnReader* reader); + + ~IndexedColumnIterator(); + + // Seek to the first entry. This works for both ordinal-indexed and value-indexed column + Status seek_to_first(); + + // Seek to the given ordinal entry. Entry 0 is the first entry. + // Return NotFound if provided seek point is past the end. + // Return NotSupported for column without ordinal index. + Status seek_to_ordinal(rowid_t idx); + + // Seek the index to the given key, or to the index entry immediately + // before it. Then seek the data block to the value matching value or to + // the value immediately after it. + // + // Sets *exact_match to indicate whether the seek found the exact + // key requested. + // + // Return NotFound if the given key is greater than all keys in this column. + // Return NotSupported for column without value index. + Status seek_at_or_after(const Slice &key, + bool *exact_match); + + // Return true if this reader is currently seeked. + // If the iterator is not seeked, it is an error to call any functions except + // for seek (including get_current_ordinal). + bool seeked() const; + + // Get the ordinal index that the iterator is currently pointed to. + rowid_t get_current_ordinal() const; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp new file mode 100644 index 0000000000..f40e2eca76 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/indexed_column_writer.h" + +#include + +#include "env/env.h" +#include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/index_page.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_compression.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "olap/key_coder.h" +#include "olap/types.h" +#include "util/block_compression.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace doris { +namespace segment_v2 { + +IndexedColumnWriter::IndexedColumnWriter(const IndexedColumnWriterOptions& options, + const TypeInfo* typeinfo, + WritableFile* output_file) + : _options(options), + _typeinfo(typeinfo), + _file(output_file), + _mem_tracker(-1), + _mem_pool(&_mem_tracker), + _num_values(0), + _num_data_pages(0), + _validx_key_coder(nullptr), + _compress_codec(nullptr) { + _first_value.resize(_typeinfo->size()); +} + +IndexedColumnWriter::~IndexedColumnWriter() = default; + +Status IndexedColumnWriter::init() { + const EncodingInfo* encoding_info; + RETURN_IF_ERROR(EncodingInfo::get(_typeinfo, _options.encoding, &encoding_info)); + + PageBuilder* data_page_builder; + RETURN_IF_ERROR(encoding_info->create_page_builder(PageBuilderOptions(), &data_page_builder)); + _data_page_builder.reset(data_page_builder); + + if (_options.write_ordinal_index) { + _ordinal_index_builder.reset(new IndexPageBuilder(_options.index_page_size, true)); + } + if (_options.write_value_index) { + _value_index_builder.reset(new IndexPageBuilder(_options.index_page_size, true)); + _validx_key_coder = get_key_coder(_typeinfo->type()); + } + + if (_options.compression != NO_COMPRESSION) { + RETURN_IF_ERROR(get_block_compression_codec(_options.compression, &_compress_codec)); + } + return Status::OK(); +} + +Status IndexedColumnWriter::add(const void* value) { + if (_options.write_value_index && _data_page_builder->count() == 0) { + // remember page's first value because it's used to build value index + _typeinfo->deep_copy(_first_value.data(), value, &_mem_pool); + } + size_t num_to_write = 1; + RETURN_IF_ERROR(_data_page_builder->add(reinterpret_cast(value), &num_to_write)); + _num_values++; + if (_data_page_builder->is_page_full()) { + RETURN_IF_ERROR(_finish_current_data_page()); + } + return Status::OK(); +} + +Status IndexedColumnWriter::_finish_current_data_page() { + const uint32_t page_row_count = _data_page_builder->count(); + + if (page_row_count == 0) { + return Status::OK(); + } + + uint32_t first_rowid = _num_values - page_row_count; + faststring page_header; + put_varint32(&page_header, first_rowid); + put_varint32(&page_header, page_row_count); + + OwnedSlice page_data = _data_page_builder->finish(); + _data_page_builder->reset(); + + return _append_data_page({Slice(page_header), page_data.slice()}, first_rowid); +} + +Status IndexedColumnWriter::_append_data_page(const std::vector& data_page, rowid_t first_rowid) { + RETURN_IF_ERROR(_append_page(data_page, &_last_data_page)); + _num_data_pages++; + + if (_options.write_ordinal_index) { + std::string key; + KeyCoderTraits::full_encode_ascending( + &first_rowid, &key); + _ordinal_index_builder->add(key, _last_data_page); + } + + if (_options.write_value_index) { + std::string key; + _validx_key_coder->full_encode_ascending(_first_value.data(), &key); + // TODO short separate key optimize + _value_index_builder->add(key, _last_data_page); + // TODO record last key in short separate key optimize + } + return Status::OK(); +} + +Status IndexedColumnWriter::_append_page(const std::vector& page, PagePointer* pp) { + std::vector output_page; + + // Put compressor out of if block, because we will use compressor's + // content until this function finished. + PageCompressor compressor(_compress_codec); + if (_compress_codec != nullptr) { + RETURN_IF_ERROR(compressor.compress(page, &output_page)); + } else { + output_page = page; + } + + // checksum + uint8_t checksum_buf[sizeof(uint32_t)]; + uint32_t checksum = crc32c::Value(output_page); + encode_fixed32_le(checksum_buf, checksum); + output_page.emplace_back(checksum_buf, sizeof(uint32_t)); + + // append to file + pp->offset = _file->size(); + RETURN_IF_ERROR(_file->appendv(&output_page[0], output_page.size())); + pp->size = _file->size() - pp->offset; + return Status::OK(); +} + +Status IndexedColumnWriter::finish(IndexedColumnMetaPB* meta) { + RETURN_IF_ERROR(_finish_current_data_page()); + if (_options.write_ordinal_index) { + RETURN_IF_ERROR(_flush_index(_ordinal_index_builder.get(), + meta->mutable_ordinal_index_meta())); + } + if (_options.write_value_index) { + RETURN_IF_ERROR(_flush_index(_value_index_builder.get(), + meta->mutable_value_index_meta())); + } + meta->set_data_type(_typeinfo->type()); + meta->set_encoding(_options.encoding); + meta->set_num_values(_num_values); + meta->set_compression(_options.compression); + return Status::OK(); +} + +Status IndexedColumnWriter::_flush_index(IndexPageBuilder* index_builder, BTreeMetaPB* meta) { + if (_num_data_pages <= 1) { + meta->set_is_root_data_page(true); + _last_data_page.to_proto(meta->mutable_root_page()); + } else { + Slice root_page = index_builder->finish(); + PagePointer pp; + RETURN_IF_ERROR(_append_page({root_page}, &pp)); + + meta->set_is_root_data_page(false); + pp.to_proto(meta->mutable_root_page()); + } + return Status::OK(); +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h new file mode 100644 index 0000000000..c9f143354a --- /dev/null +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "gen_cpp/segment_v2.pb.h" +#include "gutil/macros.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "runtime/mem_tracker.h" +#include "runtime/mem_pool.h" +#include "util/slice.h" + +namespace doris { + +class BlockCompressionCodec; +class KeyCoder; +class TypeInfo; +class WritableFile; + +namespace segment_v2 { + +class IndexPageBuilder; +class PageBuilder; + +struct IndexedColumnWriterOptions { + size_t index_page_size = 64 * 1024; + bool write_ordinal_index = false; + bool write_value_index = false; + EncodingTypePB encoding = DEFAULT_ENCODING; + CompressionTypePB compression = NO_COMPRESSION; +}; + +// IndexedColumn is a column with an optional "ordinal index" and an optional "value index". +// - "ordinal index" enables us to seek to a particular rowid within the column +// - "value index" enables us to seek to a particular value but requires IndexedColumn to store ordered values +// +// IndexedColumn can be used as the building blocks for implementing other data structures. For example, +// - a bitmap index can be represented by two indexed columns, one for the term dictionary, one for the posting lists. +// the "dictionary" IndexedColumn contains ordered terms and a value index. +// the "posting" IndexedColumn contains bitmap for each term and an ordinal index. +// - a bloom filter index can be represented by one indexed column containing bloom filters with an ordinal index +// +// Currently IndexedColumn has the following restrictions but can be extended to solve in the future +// 1. value can't be null +// 2. duplicated values are not supported/tested when storing ordered values +// TODO test with empty input +class IndexedColumnWriter { +public: + explicit IndexedColumnWriter(const IndexedColumnWriterOptions& options, + const TypeInfo* typeinfo, + WritableFile* output_file); + + ~IndexedColumnWriter(); + + Status init(); + + // add a single not-null value + Status add(const void* value); + + Status finish(IndexedColumnMetaPB* meta); + +private: + Status _finish_current_data_page(); + + // Append the given data page, update ordinal index or value index if they're used. + Status _append_data_page(const std::vector& data_page, rowid_t first_rowid); + + // Append the given page into the file. After return, *pp points to the newly + // inserted page. + // Input data will be compressed when compression is enabled. + // We also compute and append checksum for the page. + Status _append_page(const std::vector& page, PagePointer* pp); + + Status _flush_index(IndexPageBuilder* index_builder, BTreeMetaPB* meta); + + IndexedColumnWriterOptions _options; + const TypeInfo* _typeinfo; + WritableFile* _file; + // only used for `_first_value` + MemTracker _mem_tracker; + MemPool _mem_pool; + + rowid_t _num_values; + uint32_t _num_data_pages; + // remember the first value in current page + faststring _first_value; + PagePointer _last_data_page; + + // the following members are initialized in init() + // ----- + // builder for data pages + std::unique_ptr _data_page_builder; + // builder for index pages of ordinal index, null if write_ordinal_index == false + std::unique_ptr _ordinal_index_builder; + // builder for index pages of value index, null if write_value_index == false + std::unique_ptr _value_index_builder; + // encoder for value index's key + const KeyCoder* _validx_key_coder; + const BlockCompressionCodec* _compress_codec; + + DISALLOW_COPY_AND_ASSIGN(IndexedColumnWriter); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/page_builder.h b/be/src/olap/rowset/segment_v2/page_builder.h index 8f3936f8a3..82f54ba6aa 100644 --- a/be/src/olap/rowset/segment_v2/page_builder.h +++ b/be/src/olap/rowset/segment_v2/page_builder.h @@ -50,7 +50,9 @@ public: // than requested if the page is full. // // vals size should be decided according to the page build type - virtual doris::Status add(const uint8_t* vals, size_t* count) = 0; + // TODO make sure vals is natually-aligned to its type so that impls can use aligned load + // instead of memcpy to copy values. + virtual Status add(const uint8_t* vals, size_t* count) = 0; // Finish building the current page, return the encoded data. // This api should be followed by reset() before reusing the builder diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index 490f45f498..a6aa79862f 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -43,6 +43,24 @@ public: // Doing so has undefined results. virtual Status seek_to_position_in_page(size_t pos) = 0; + // Seek the decoder to the given value in the page, or the + // lowest value which is greater than the given value. + // + // If the decoder was able to locate an exact match, then + // sets *exact_match to true. Otherwise sets *exact_match to + // false, to indicate that the seeked value is _after_ the + // requested value. + // + // If the given value is less than the lowest value in the page, + // seeks to the start of the page. If it is higher than the highest + // value in the page, then returns Status::NotFound + // + // This will only return valid results when the data page + // consists of values in sorted order. + virtual Status seek_at_or_after_value(const void* value, bool* exact_match) { + return Status::NotSupported("seek_at_or_after_value"); // FIXME + } + // Seek the decoder forward by a given number of rows, or to the end // of the page. This is primarily used to skip over data. // diff --git a/be/src/olap/rowset/segment_v2/page_pointer.h b/be/src/olap/rowset/segment_v2/page_pointer.h index b537fbb775..e0470ed5d2 100644 --- a/be/src/olap/rowset/segment_v2/page_pointer.h +++ b/be/src/olap/rowset/segment_v2/page_pointer.h @@ -18,9 +18,11 @@ #pragma once #include +#include #include "gen_cpp/segment_v2.pb.h" #include "util/coding.h" +#include "util/faststring.h" namespace doris { namespace segment_v2 { @@ -50,6 +52,11 @@ struct PagePointer { } return decode_varint32_ptr(data, limit, &size); } + + void encode_to(faststring* dst) const { + put_varint64_varint32(dst, offset, size); + } + void encode_to(std::string* dst) const { put_varint64_varint32(dst, offset, size); } diff --git a/be/src/olap/rowset/segment_v2/rle_page.h b/be/src/olap/rowset/segment_v2/rle_page.h index a8504a2390..390a309e80 100644 --- a/be/src/olap/rowset/segment_v2/rle_page.h +++ b/be/src/olap/rowset/segment_v2/rle_page.h @@ -86,14 +86,13 @@ public: Status add(const uint8_t* vals, size_t* count) override { DCHECK(!_finished); - DCHECK_EQ(reinterpret_cast(vals) & (alignof(CppType) - 1), 0) - << "Pointer passed to Add() must be naturally-aligned"; - - const CppType* new_vals = reinterpret_cast(vals); + auto new_vals = reinterpret_cast(vals); for (int i = 0; i < *count; ++i) { - _rle_encoder->Put(new_vals[i]); + // note: vals is not guaranteed to be aligned for now, thus memcpy here + CppType value; + memcpy(&value, &new_vals[i], SIZE_OF_TYPE); + _rle_encoder->Put(value); } - _count += *count; return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 53bbdd9713..7ce010c7a2 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -62,10 +62,10 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { if (column.is_key()) { opts.need_zone_map = true; } + // TODO set opts.need_bitmap_index based on table properties std::unique_ptr field(FieldFactory::create(column)); DCHECK(field.get() != nullptr); - std::unique_ptr writer(new ColumnWriter(opts, std::move(field), is_nullable, _output_file.get())); RETURN_IF_ERROR(writer->init()); _column_writers.push_back(std::move(writer)); @@ -109,7 +109,9 @@ Status SegmentWriter::finalize(uint64_t* segment_file_size) { RETURN_IF_ERROR(_write_data()); RETURN_IF_ERROR(_write_ordinal_index()); RETURN_IF_ERROR(_write_zone_map()); + RETURN_IF_ERROR(_write_bitmap_index()); RETURN_IF_ERROR(_write_short_key_index()); + RETURN_IF_ERROR(_write_footer()); *segment_file_size = _output_file->size(); return Status::OK(); @@ -138,6 +140,13 @@ Status SegmentWriter::_write_zone_map() { return Status::OK(); } +Status SegmentWriter::_write_bitmap_index() { + for (auto& column_writer : _column_writers) { + RETURN_IF_ERROR(column_writer->write_bitmap_index()); + } + return Status::OK(); +} + Status SegmentWriter::_write_short_key_index() { std::vector slices; // TODO(zc): we should get segment_size diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 88faa53a2b..8d6d3b8c42 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -70,6 +70,7 @@ private: Status _write_data(); Status _write_ordinal_index(); Status _write_zone_map(); + Status _write_bitmap_index(); Status _write_short_key_index(); Status _write_footer(); Status _write_raw_data(const std::vector& slices); diff --git a/be/src/util/coding.h b/be/src/util/coding.h index c733f15d88..aebea84510 100644 --- a/be/src/util/coding.h +++ b/be/src/util/coding.h @@ -143,6 +143,12 @@ inline void put_varint64(T* dst, uint64_t v) { dst->append((char*)buf, static_cast(ptr - buf)); } +template +inline void put_length_prefixed_slice(T* dst, const Slice& value) { + put_varint32(dst, value.get_size()); + dst->append(value.get_data(), value.get_size()); +} + template inline void put_varint64_varint32(T* dst, uint64_t v1, uint32_t v2) { uint8_t buf[15]; @@ -151,6 +157,9 @@ inline void put_varint64_varint32(T* dst, uint64_t v1, uint32_t v2) { dst->append((char*)buf, static_cast(ptr - buf)); } +// parse a varint32 from the start of `input` into `val`. +// on success, return true and advance `input` past the parsed value. +// on failure, return false and `input` is not modified. inline bool get_varint32(Slice* input, uint32_t* val) { const uint8_t* p = (const uint8_t*)input->data; const uint8_t* limit = p + input->size; @@ -163,6 +172,9 @@ inline bool get_varint32(Slice* input, uint32_t* val) { } } +// parse a varint64 from the start of `input` into `val`. +// on success, return true and advance `input` past the parsed value. +// on failure, return false and `input` is not modified. inline bool get_varint64(Slice* input, uint64_t* val) { const uint8_t* p = (const uint8_t*)input->data; const uint8_t* limit = p + input->size; @@ -175,4 +187,18 @@ inline bool get_varint64(Slice* input, uint64_t* val) { } } +// parse a length-prefixed-slice from the start of `input` into `val`. +// on success, return true and advance `input` past the parsed value. +// on failure, return false and `input` may or may not be modified. +inline bool get_length_prefixed_slice(Slice* input, Slice* val) { + uint32_t len; + if (get_varint32(input, &len) && input->get_size() >= len) { + *val = Slice(input->get_data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } } + +} // namespace doris diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index 8c13940a58..70a5ddf2d2 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -48,12 +48,12 @@ message MetadataPairPB { enum EncodingTypePB { UNKNOWN_ENCODING = 0; DEFAULT_ENCODING = 1; - PLAIN_ENCODING = 2; // for non-binary type + PLAIN_ENCODING = 2; PREFIX_ENCODING = 3; RLE = 4; DICT_ENCODING = 5; BIT_SHUFFLE = 6; - BINARY_PLAIN_ENCODING = 7; + FOR_ENCODING = 7; // Frame-Of-Reference } enum CompressionTypePB { @@ -98,8 +98,10 @@ message ColumnMetaPB { optional ZoneMapPB zone_map = 9; // // dictionary page for DICT_ENCODING optional PagePointerPB dict_page = 10; + // bitmap index + optional BitmapIndexColumnPB bitmap_index = 11; // var length for string type - optional int32 length = 11; + optional int32 length = 12; // // bloom filter pages for bloom filter column // repeated PagePointerPB bloom_filter_pages = 3; @@ -164,3 +166,57 @@ message SegmentFooterPB { optional PagePointerPB short_key_index_page = 9; } +message IndexPageFooterPB { + // required: number of entries in this page + optional int32 num_entries = 1; + + enum Type { + UNKNOWN_INDEX_PAGE_TYPE = 0; + LEAF = 1; + INTERNAL = 2; + }; + // required: type of the index page + optional Type type = 2; +} + +message BTreeMetaPB { + // required: pointer to either root index page or sole data page based on is_root_data_page + optional PagePointerPB root_page = 1; + // required: true if we only have one data page, in which case root points to that page directly + optional bool is_root_data_page = 2; +} + +message IndexedColumnMetaPB { + // required: FieldType value + optional int32 data_type = 1; + // required: encoding for this column + optional EncodingTypePB encoding = 2; + // required: total number of values in this column + optional int64 num_values = 3; + // present iff this column has ordinal index + optional BTreeMetaPB ordinal_index_meta = 4; + // present iff this column contains sorted values and has value index + optional BTreeMetaPB value_index_meta = 5; + // compression type for data and index page + optional CompressionTypePB compression = 6 [default=NO_COMPRESSION]; + // optional bool is_nullable = 7 [default=false]; +} + +message BitmapIndexColumnPB { + enum BitmapType { + UNKNOWN_BITMAP_TYPE = 0; + ROARING_BITMAP = 1; + } + optional uint32 column_id = 1; + optional uint32 unique_id = 2; + // required: whether the index contains null key. + // if true, the last bitmap (ordinal:dict_column.num_values) in bitmap_column is + // the bitmap for null key. we don't store null key in dict_column. + optional bool has_null = 3; + // required: meta for ordered dictionary part + optional IndexedColumnMetaPB dict_column = 4; + // required: meta for bitmaps part + optional IndexedColumnMetaPB bitmap_column = 5; + optional BitmapType bitmap_type = 6 [default=ROARING_BITMAP]; +} +