diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index 73ad6ccdba..7adfe8419f 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -93,18 +93,18 @@ add_library(Olap STATIC rowset/segment_v2/indexed_column_reader.cpp rowset/segment_v2/indexed_column_writer.cpp rowset/segment_v2/ordinal_page_index.cpp - rowset/segment_v2/page_compression.cpp + rowset/segment_v2/page_io.cpp rowset/segment_v2/binary_dict_page.cpp rowset/segment_v2/binary_prefix_page.cpp rowset/segment_v2/segment.cpp rowset/segment_v2/segment_iterator.cpp rowset/segment_v2/empty_segment_iterator.cpp rowset/segment_v2/segment_writer.cpp - rowset/segment_v2/column_zone_map.cpp rowset/segment_v2/block_split_bloom_filter.cpp rowset/segment_v2/bloom_filter_index_reader.cpp rowset/segment_v2/bloom_filter_index_writer.cpp rowset/segment_v2/bloom_filter.cpp + rowset/segment_v2/zone_map_index.cpp task/engine_batch_load_task.cpp task/engine_checksum_task.cpp task/engine_clone_task.cpp diff --git a/be/src/olap/key_coder.cpp b/be/src/olap/key_coder.cpp index ed181a28f3..4a35b5657a 100644 --- a/be/src/olap/key_coder.cpp +++ b/be/src/olap/key_coder.cpp @@ -64,6 +64,7 @@ private: add_mapping(); add_mapping(); add_mapping(); + add_mapping(); add_mapping(); add_mapping(); diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp index f282426849..0a66511d50 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.cpp @@ -22,15 +22,15 @@ namespace doris { namespace segment_v2 { -Status BitmapIndexReader::load(bool cache_in_memory) { - const IndexedColumnMetaPB& dict_meta = _bitmap_index_meta.dict_column(); - const IndexedColumnMetaPB& bitmap_meta = _bitmap_index_meta.bitmap_column(); - _has_null = _bitmap_index_meta.has_null(); +Status BitmapIndexReader::load(bool use_page_cache, bool kept_in_memory) { + const IndexedColumnMetaPB& dict_meta = 
_bitmap_index_meta->dict_column(); + const IndexedColumnMetaPB& bitmap_meta = _bitmap_index_meta->bitmap_column(); + _has_null = _bitmap_index_meta->has_null(); - _dict_column_reader.reset(new IndexedColumnReader(_file_name, dict_meta, cache_in_memory)); - _bitmap_column_reader.reset(new IndexedColumnReader(_file_name, bitmap_meta, cache_in_memory)); - RETURN_IF_ERROR(_dict_column_reader->load()); - RETURN_IF_ERROR(_bitmap_column_reader->load()); + _dict_column_reader.reset(new IndexedColumnReader(_file_name, dict_meta)); + _bitmap_column_reader.reset(new IndexedColumnReader(_file_name, bitmap_meta)); + RETURN_IF_ERROR(_dict_column_reader->load(use_page_cache, kept_in_memory)); + RETURN_IF_ERROR(_bitmap_column_reader->load(use_page_cache, kept_in_memory)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h index c0d6e6bdd7..c1dac83e20 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h @@ -41,13 +41,13 @@ class IndexedColumnIterator; class BitmapIndexReader { public: explicit BitmapIndexReader(const std::string& file_name, - const BitmapIndexColumnPB& bitmap_index_meta) + const BitmapIndexPB* bitmap_index_meta) : _file_name(file_name), _bitmap_index_meta(bitmap_index_meta){ _typeinfo = get_type_info(OLAP_FIELD_TYPE_VARCHAR); } - Status load(bool cache_in_memory); + Status load(bool use_page_cache, bool kept_in_memory); // create a new column iterator. 
Client should delete returned iterator Status new_iterator(BitmapIndexIterator** iterator); @@ -65,7 +65,7 @@ private: std::string _file_name; const TypeInfo* _typeinfo; - const BitmapIndexColumnPB& _bitmap_index_meta; + const BitmapIndexPB* _bitmap_index_meta; bool _has_null = false; std::unique_ptr _dict_column_reader; std::unique_ptr _bitmap_column_reader; diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp index 3074fcaf49..cf44c9928b 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp @@ -100,8 +100,11 @@ public: _rid += count; } - Status finish(WritableFile* file, BitmapIndexColumnPB* meta) override { - meta->set_bitmap_type(BitmapIndexColumnPB::ROARING_BITMAP); + Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) override { + index_meta->set_type(BITMAP_INDEX); + BitmapIndexPB* meta = index_meta->mutable_bitmap_index(); + + meta->set_bitmap_type(BitmapIndexPB::ROARING_BITMAP); meta->set_has_null(!_null_bitmap.isEmpty()); { // write dictionary diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.h b/be/src/olap/rowset/segment_v2/bitmap_index_writer.h index 78f42147cb..8b4dc44c36 100644 --- a/be/src/olap/rowset/segment_v2/bitmap_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.h @@ -42,7 +42,7 @@ public: virtual void add_nulls(uint32_t count) = 0; - virtual Status finish(WritableFile* file, BitmapIndexColumnPB* meta) = 0; + virtual Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) = 0; virtual uint64_t size() const = 0; private: diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 4aacab10d4..1cb5418558 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -23,11 +23,11 @@ namespace 
doris { namespace segment_v2 { -Status BloomFilterIndexReader::load(bool cache_in_memory) { - const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta.bloom_filter(); +Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory) { + const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); - _bloom_filter_reader.reset(new IndexedColumnReader(_file_name, bf_index_meta, cache_in_memory)); - RETURN_IF_ERROR(_bloom_filter_reader->load()); + _bloom_filter_reader.reset(new IndexedColumnReader(_file_name, bf_index_meta)); + RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory)); return Status::OK(); } @@ -48,8 +48,8 @@ Status BloomFilterIndexIterator::read_bloom_filter(rowid_t ordinal, std::unique_ RETURN_IF_ERROR(_bloom_filter_iter.next_batch(&num_read, &column_block_view)); DCHECK(num_to_read == num_read); // construct bloom filter - BloomFilter::create(_reader->_bloom_filter_index_meta.algorithm(), bf); - RETURN_IF_ERROR((*bf)->init(value.data, value.size, _reader->_bloom_filter_index_meta.hash_strategy())); + BloomFilter::create(_reader->_bloom_filter_index_meta->algorithm(), bf); + RETURN_IF_ERROR((*bf)->init(value.data, value.size, _reader->_bloom_filter_index_meta->hash_strategy())); _pool->clear(); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h index b2cf300b03..db3f80c85a 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h @@ -44,13 +44,13 @@ class BloomFilter; class BloomFilterIndexReader { public: explicit BloomFilterIndexReader(const std::string& file_name, - const BloomFilterIndexPB& bloom_filter_index_meta) + const BloomFilterIndexPB* bloom_filter_index_meta) : _file_name(file_name), _bloom_filter_index_meta(bloom_filter_index_meta) { _typeinfo = get_type_info(OLAP_FIELD_TYPE_VARCHAR); } - Status 
load(bool cache_in_memory); + Status load(bool use_page_cache, bool kept_in_memory); // create a new column iterator. Status new_iterator(std::unique_ptr* iterator); @@ -64,7 +64,7 @@ private: std::string _file_name; const TypeInfo* _typeinfo; - BloomFilterIndexPB _bloom_filter_index_meta; + const BloomFilterIndexPB* _bloom_filter_index_meta; std::unique_ptr _bloom_filter_reader; }; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 7cee754555..9c3f8f867d 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -104,10 +104,12 @@ public: return Status::OK(); } - Status finish(WritableFile* file, BloomFilterIndexPB* meta) override { + Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) override { if (_values.size() > 0) { RETURN_IF_ERROR(flush()); } + index_meta->set_type(BLOOM_FILTER_INDEX); + BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index(); meta->set_hash_strategy(_bf_options.strategy); meta->set_algorithm(BLOCK_BLOOM_FILTER); @@ -118,7 +120,7 @@ public: options.write_value_index = false; options.encoding = PLAIN_ENCODING; IndexedColumnWriter bf_writer(options, bf_typeinfo, file); - bf_writer.init(); + RETURN_IF_ERROR(bf_writer.init()); for (auto& bf : _bfs) { Slice data(bf->data(), bf->size()); bf_writer.add(&data); diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index f96aa09fe5..04a877af43 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -47,7 +47,7 @@ public: virtual Status flush() = 0; - virtual Status finish(WritableFile* file, BloomFilterIndexPB* meta) = 0; + virtual Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) = 0; virtual uint64_t size() = 0; private: diff --git 
a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 0c0660c508..74773ac236 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -21,19 +21,14 @@ #include "env/env.h" // for RandomAccessFile #include "gutil/strings/substitute.h" // for Substitute #include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo -#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer #include "olap/rowset/segment_v2/page_handle.h" // for PageHandle +#include "olap/rowset/segment_v2/page_io.h" #include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer -#include "olap/rowset/segment_v2/page_compression.h" -#include "olap/rowset/segment_v2/options.h" // for PageDecoderOptions #include "olap/types.h" // for TypeInfo #include "olap/column_block.h" // for ColumnBlockView -#include "olap/page_cache.h" #include "util/coding.h" // for get_varint32 -#include "util/crc32c.h" #include "util/rle_encoding.h" // for RleDecoder #include "util/block_compression.h" -#include "util/file_manager.h" #include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder #include "olap/rowset/segment_v2/bloom_filter_index_reader.h" @@ -70,6 +65,31 @@ Status ColumnReader::init() { } RETURN_IF_ERROR(EncodingInfo::get(_type_info, _meta.encoding(), &_encoding_info)); RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &_compress_codec)); + + for (int i = 0; i < _meta.indexes_size(); i++) { + auto& index_meta = _meta.indexes(i); + switch (index_meta.type()) { + case ORDINAL_INDEX: + _ordinal_index_meta = &index_meta.ordinal_index(); + break; + case ZONE_MAP_INDEX: + _zone_map_index_meta = &index_meta.zone_map_index(); + break; + case BITMAP_INDEX: + _bitmap_index_meta = &index_meta.bitmap_index(); + break; + case BLOOM_FILTER_INDEX: + _bf_index_meta = &index_meta.bloom_filter_index(); + break; + default: + return Status::Corruption(Substitute( + 
"Bad file $0: invalid column index type $1", _file_name, index_meta.type())); + } + } + if (_ordinal_index_meta == nullptr) { + return Status::Corruption(Substitute( + "Bad file $0: missing ordinal index for column $1", _file_name, _meta.column_id())); + } return Status::OK(); } @@ -80,85 +100,23 @@ Status ColumnReader::new_iterator(ColumnIterator** iterator) { Status ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) { RETURN_IF_ERROR(_ensure_index_loaded()); - RETURN_IF_ERROR(_bitmap_index_reader->new_iterator(iterator)); + RETURN_IF_ERROR(_bitmap_index->new_iterator(iterator)); return Status::OK(); } -Status ColumnReader::read_page(const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle) { - OpenedFileHandle file_handle; - RETURN_IF_ERROR(FileManager::instance()->open_file(_file_name, &file_handle)); - RandomAccessFile* input_file = file_handle.file(); - return read_page(input_file, pp, opts, handle); -} +Status ColumnReader::read_page(const ColumnIteratorOptions& iter_opts, const PagePointer& pp, + PageHandle* handle, Slice* page_body, PageFooterPB* footer) { + iter_opts.sanity_check(); + PageReadOptions opts; + opts.file = iter_opts.file; + opts.page_pointer = pp; + opts.codec = _compress_codec; + opts.stats = iter_opts.stats; + opts.verify_checksum = _opts.verify_checksum; + opts.use_page_cache = iter_opts.use_page_cache; + opts.kept_in_memory = _opts.kept_in_memory; -Status ColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp, - const ColumnIteratorOptions& iter_opts, PageHandle* handle) { - iter_opts.stats->total_pages_num++; - auto cache = StoragePageCache::instance(); - PageCacheHandle cache_handle; - StoragePageCache::CacheKey cache_key(file->file_name(), pp.offset); - if (iter_opts.use_page_cache && cache->lookup(cache_key, &cache_handle)) { - // we find page in cache, use it - *handle = PageHandle(std::move(cache_handle)); - iter_opts.stats->cached_pages_num++; - return Status::OK(); - } - // 
Now we read this from file. - size_t page_size = pp.size; - if (page_size < sizeof(uint32_t)) { - return Status::Corruption(Substitute("Bad page, page size is too small, size=$0", page_size)); - } - - // Now we use this buffer to store page from storage, if this page is compressed - // this buffer will assigned uncompressed page, and origin content will be freed. - std::unique_ptr page(new uint8_t[page_size]); - Slice page_slice(page.get(), page_size); - { - SCOPED_RAW_TIMER(&iter_opts.stats->io_ns); - RETURN_IF_ERROR(file->read_at(pp.offset, page_slice)); - iter_opts.stats->compressed_bytes_read += page_size; - } - - size_t data_size = page_size - 4; - if (_opts.verify_checksum) { - uint32_t expect = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4); - uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4); - if (expect != actual) { - return Status::Corruption( - Substitute("Page checksum mismatch, actual=$0 vs expect=$1", actual, expect)); - } - } - - // remove page's suffix - page_slice.size = data_size; - - if (_compress_codec != nullptr) { - PageDecompressor decompressor(page_slice, _compress_codec); - - Slice uncompressed_page; - { - SCOPED_RAW_TIMER(&iter_opts.stats->decompress_ns); - RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_page)); - } - - // If decompressor create new heap memory for uncompressed data, - // assign this uncompressed page to page and page slice - if (uncompressed_page.data != page_slice.data) { - page.reset((uint8_t*)uncompressed_page.data); - } - page_slice = uncompressed_page; - iter_opts.stats->uncompressed_bytes_read += page_slice.size; - } - if (iter_opts.use_page_cache) { - // insert this into cache and return the cache handle - cache->insert(cache_key, page_slice, &cache_handle, _opts.cache_in_memory); - *handle = PageHandle(std::move(cache_handle)); - } else { - *handle = PageHandle(page_slice); - } - - page.release(); - return Status::OK(); + return 
PageIO::read_and_decompress_page(opts, handle, page_body, footer); } Status ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, @@ -173,32 +131,57 @@ Status ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column, return Status::OK(); } +bool ColumnReader::match_condition(CondColumn* cond) const { + if (_zone_map_index_meta == nullptr || cond == nullptr) { + return true; + } + FieldType type = _type_info->type(); + std::unique_ptr min_value(WrapperField::create_by_type(type, _meta.length())); + std::unique_ptr max_value(WrapperField::create_by_type(type, _meta.length())); + return _zone_map_match_condition( + _zone_map_index_meta->segment_zone_map(), min_value.get(), max_value.get(), cond); +} + +bool ColumnReader::_zone_map_match_condition(const ZoneMapPB& zone_map, + WrapperField* min_value_container, + WrapperField* max_value_container, + CondColumn* cond) const { + if (cond == nullptr) { + return true; + } + if (!zone_map.has_not_null() && !zone_map.has_null()) { + return false; // no data in this zone + } + // min value and max value are valid if has_not_null is true + if (zone_map.has_not_null()) { + min_value_container->from_string(zone_map.min()); + max_value_container->from_string(zone_map.max()); + } + // for compatible original Cond eval logic + // TODO(hkp): optimize OlapCond + if (zone_map.has_null()) { + // for compatible, if exist null, original logic treat null as min + min_value_container->set_null(); + if (!zone_map.has_not_null()) { + // for compatible OlapCond's 'is not null' + max_value_container->set_null(); + } + } + + return cond->eval({min_value_container, max_value_container}); +} + Status ColumnReader::_get_filtered_pages(CondColumn* cond_column, const std::vector& delete_conditions, std::vector* delete_partial_filtered_pages, std::vector* page_indexes) { FieldType type = _type_info->type(); - const std::vector& zone_maps = _column_zone_map->get_column_zone_map(); - int32_t page_size = _column_zone_map->num_pages(); 
+ const std::vector& zone_maps = _zone_map_index->page_zone_maps(); + int32_t page_size = _zone_map_index->num_pages(); std::unique_ptr min_value(WrapperField::create_by_type(type, _meta.length())); std::unique_ptr max_value(WrapperField::create_by_type(type, _meta.length())); for (int32_t i = 0; i < page_size; ++i) { - // min value and max value are valid if has_not_null is true - if (zone_maps[i].has_not_null()) { - min_value->from_string(zone_maps[i].min()); - max_value->from_string(zone_maps[i].max()); - } - // for compatible original Cond eval logic - // TODO(hkp): optimize OlapCond - if (zone_maps[i].has_null()) { - // for compatible, if exist null, original logic treat null as min - min_value->set_null(); - if (!zone_maps[i].has_not_null()) { - // for compatible OlapCond's 'is not null' - max_value->set_null(); - } - } - if (cond_column == nullptr || cond_column->eval({min_value.get(), max_value.get()})) { + if (_zone_map_match_condition(zone_maps[i], min_value.get(), max_value.get(), cond_column)) { bool should_read = true; for (auto& col_cond : delete_conditions) { int state = col_cond->del_eval({min_value.get(), max_value.get()}); @@ -220,8 +203,8 @@ Status ColumnReader::_get_filtered_pages(CondColumn* cond_column, Status ColumnReader::_calculate_row_ranges(const std::vector& page_indexes, RowRanges* row_ranges) { row_ranges->clear(); for (auto i : page_indexes) { - rowid_t page_first_id = _ordinal_index->get_first_row_id(i); - rowid_t page_last_id = _ordinal_index->get_last_row_id(i); + ordinal_t page_first_id = _ordinal_index->get_first_ordinal(i); + ordinal_t page_last_id = _ordinal_index->get_last_ordinal(i); RowRanges page_row_ranges(RowRanges::create_single(page_first_id, page_last_id + 1)); RowRanges::ranges_union(*row_ranges, page_row_ranges, row_ranges); } @@ -232,7 +215,7 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row RETURN_IF_ERROR(_ensure_index_loaded()); RowRanges bf_row_ranges; std::unique_ptr bf_iter; - 
RETURN_IF_ERROR(_bloom_filter_index_reader->new_iterator(&bf_iter)); + RETURN_IF_ERROR(_bloom_filter_index->new_iterator(&bf_iter)); size_t range_size = row_ranges->range_size(); // get covered page ids std::set page_ids; @@ -242,8 +225,8 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row int64_t to = row_ranges->get_range_to(i); auto iter = _ordinal_index->seek_at_or_before(from); while (idx < to) { - page_ids.insert(iter.cur_idx()); - idx = iter.cur_page_last_row_id() + 1; + page_ids.insert(iter.page_index()); + idx = iter.last_ordinal() + 1; iter.next(); } } @@ -251,69 +234,40 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row std::unique_ptr bf; RETURN_IF_ERROR(bf_iter->read_bloom_filter(pid, &bf)); if (cond_column->eval(bf.get())) { - bf_row_ranges.add(RowRange(_ordinal_index->get_first_row_id(pid), - _ordinal_index->get_last_row_id(pid) + 1)); + bf_row_ranges.add(RowRange(_ordinal_index->get_first_ordinal(pid), + _ordinal_index->get_last_ordinal(pid) + 1)); } } RowRanges::ranges_intersection(*row_ranges, bf_row_ranges, row_ranges); return Status::OK(); } -Status ColumnReader::_load_ordinal_index() { - PagePointer pp = _meta.ordinal_index_page(); - PageHandle ph; - OlapReaderStatistics stats; - ColumnIteratorOptions opts; - // column index only load once, so we use global config to decide - if (!config::disable_storage_page_cache) { - opts.use_page_cache = true; - } - opts.stats = &stats; - RETURN_IF_ERROR(read_page(pp, opts, &ph)); - - _ordinal_index.reset(new OrdinalPageIndex(ph.data(), _num_rows)); - RETURN_IF_ERROR(_ordinal_index->load()); - return Status::OK(); +Status ColumnReader::_load_ordinal_index(bool use_page_cache, bool kept_in_memory) { + DCHECK(_ordinal_index_meta != nullptr); + _ordinal_index.reset(new OrdinalIndexReader(_file_name, _ordinal_index_meta, _num_rows)); + return _ordinal_index->load(use_page_cache, kept_in_memory); } -Status ColumnReader::_load_zone_map_index() { - 
if (_meta.has_zone_map_page()) { - PagePointer pp = _meta.zone_map_page(); - PageHandle ph; - OlapReaderStatistics stats; - ColumnIteratorOptions opts; - // column index only load once, so we use global config to decide - if (!config::disable_storage_page_cache) { - opts.use_page_cache = true; - } - opts.stats = &stats; - RETURN_IF_ERROR(read_page(pp, opts, &ph)); - _column_zone_map.reset(new ColumnZoneMap(ph.data())); - RETURN_IF_ERROR(_column_zone_map->load()); - } else { - _column_zone_map.reset(nullptr); +Status ColumnReader::_load_zone_map_index(bool use_page_cache, bool kept_in_memory) { + if (_zone_map_index_meta != nullptr) { + _zone_map_index.reset(new ZoneMapIndexReader(_file_name, _zone_map_index_meta)); + return _zone_map_index->load(use_page_cache, kept_in_memory); } return Status::OK(); } -Status ColumnReader::_load_bitmap_index() { - if (_meta.has_bitmap_index()) { - const BitmapIndexColumnPB& bitmap_index_meta = _meta.bitmap_index(); - _bitmap_index_reader.reset(new BitmapIndexReader(_file_name, bitmap_index_meta)); - RETURN_IF_ERROR(_bitmap_index_reader->load(_opts.cache_in_memory)); - } else { - _bitmap_index_reader.reset(nullptr); +Status ColumnReader::_load_bitmap_index(bool use_page_cache, bool kept_in_memory) { + if (_bitmap_index_meta != nullptr) { + _bitmap_index.reset(new BitmapIndexReader(_file_name, _bitmap_index_meta)); + return _bitmap_index->load(use_page_cache, kept_in_memory); } return Status::OK(); } -Status ColumnReader::_load_bloom_filter_index() { - if (_meta.has_bloom_filter_index()) { - const BloomFilterIndexPB& bloom_filter_index_meta = _meta.bloom_filter_index(); - _bloom_filter_index_reader.reset(new BloomFilterIndexReader(_file_name, bloom_filter_index_meta)); - RETURN_IF_ERROR(_bloom_filter_index_reader->load(_opts.cache_in_memory)); - } else { - _bloom_filter_index_reader.reset(nullptr); +Status ColumnReader::_load_bloom_filter_index(bool use_page_cache, bool kept_in_memory) { + if (_bf_index_meta != nullptr) { + 
_bloom_filter_index.reset(new BloomFilterIndexReader(_file_name, _bf_index_meta)); + return _bloom_filter_index->load(use_page_cache, kept_in_memory); } return Status::OK(); } @@ -327,11 +281,11 @@ Status ColumnReader::seek_to_first(OrdinalPageIndexIterator* iter) { return Status::OK(); } -Status ColumnReader::seek_at_or_before(rowid_t rowid, OrdinalPageIndexIterator* iter) { +Status ColumnReader::seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter) { RETURN_IF_ERROR(_ensure_index_loaded()); - *iter = _ordinal_index->seek_at_or_before(rowid); + *iter = _ordinal_index->seek_at_or_before(ordinal); if (!iter->valid()) { - return Status::NotFound(Substitute("Failed to seek to rowid $0, ", rowid)); + return Status::NotFound(Substitute("Failed to seek to ordinal $0, ", ordinal)); } return Status::OK(); } @@ -343,38 +297,34 @@ FileColumnIterator::~FileColumnIterator() = default; Status FileColumnIterator::seek_to_first() { RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter)); - - _page.reset(new ParsedPage()); - RETURN_IF_ERROR(_read_page(_page_iter, _page.get())); + RETURN_IF_ERROR(_read_data_page(_page_iter)); _seek_to_pos_in_page(_page.get(), 0); - _current_rowid = 0; - + _current_ordinal = 0; return Status::OK(); } -Status FileColumnIterator::seek_to_ordinal(rowid_t rid) { +Status FileColumnIterator::seek_to_ordinal(ordinal_t ord) { // if current page contains this row, we don't need to seek - if (_page == nullptr || !_page->contains(rid)) { - RETURN_IF_ERROR(_reader->seek_at_or_before(rid, &_page_iter)); - _page.reset(new ParsedPage()); - RETURN_IF_ERROR(_read_page(_page_iter, _page.get())); + if (_page == nullptr || !_page->contains(ord)) { + RETURN_IF_ERROR(_reader->seek_at_or_before(ord, &_page_iter)); + RETURN_IF_ERROR(_read_data_page(_page_iter)); } - _seek_to_pos_in_page(_page.get(), rid - _page->first_rowid); - _current_rowid = rid; + _seek_to_pos_in_page(_page.get(), ord - _page->first_ordinal); + _current_ordinal = ord; return Status::OK(); } 
-void FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, uint32_t offset_in_page) { +void FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) { if (page->offset_in_page == offset_in_page) { // fast path, do nothing return; } - uint32_t pos_in_data = offset_in_page; - if (_reader->is_nullable()) { - rowid_t offset_in_data = 0; - rowid_t skips = offset_in_page; + ordinal_t pos_in_data = offset_in_page; + if (_page->has_null) { + ordinal_t offset_in_data = 0; + ordinal_t skips = offset_in_page; if (offset_in_page > page->offset_in_page) { // forward, reuse null bitmap @@ -415,8 +365,8 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) { // number of rows to be read from this page size_t nrows_in_page = std::min(remaining, _page->remaining()); size_t nrows_to_read = nrows_in_page; - if (_reader->is_nullable()) { - // when this column is nullable we read data in some runs + if (_page->has_null) { + // when this page contains NULLs we read data in some runs // first we read null bits in the same value, if this is null, we // don't need to read value from page. // If this is not null, we read data from page in batch. 
@@ -438,7 +388,7 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) { nrows_to_read -= this_run; _page->offset_in_page += this_run; dst->advance(this_run); - _current_rowid += this_run; + _current_ordinal += this_run; } } else { RETURN_IF_ERROR(_page->data_decoder->next_batch(&nrows_to_read, dst)); @@ -450,7 +400,7 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) { _page->offset_in_page += nrows_to_read; dst->advance(nrows_to_read); - _current_rowid += nrows_to_read; + _current_ordinal += nrows_to_read; } remaining -= nrows_in_page; } @@ -467,70 +417,46 @@ Status FileColumnIterator::_load_next_page(bool* eos) { *eos = true; return Status::OK(); } - _page.reset(new ParsedPage()); - RETURN_IF_ERROR(_read_page(_page_iter, _page.get())); + + RETURN_IF_ERROR(_read_data_page(_page_iter)); _seek_to_pos_in_page(_page.get(), 0); *eos = false; return Status::OK(); } -// read one page from file and parse this page to make -// it ready to read -Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, ParsedPage* page) { - page->page_pointer = iter.page(); - RETURN_IF_ERROR(_reader->read_page(_file, page->page_pointer, _opts, &page->page_handle)); - // TODO(zc): read page from file - Slice data = page->page_handle.data(); +Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter) { + PageHandle handle; + Slice page_body; + PageFooterPB footer; + RETURN_IF_ERROR(_reader->read_page(_opts, iter.page(), &handle, &page_body, &footer)); + // parse data page + RETURN_IF_ERROR(ParsedPage::create( + std::move(handle), page_body, footer.data_page_footer(), _reader->encoding_info(), + iter.page(), iter.page_index(), &_page)); - // decode first rowid - if (!get_varint32(&data, &page->first_rowid)) { - return Status::Corruption("Bad page, failed to decode first rowid"); - } - // decode number rows - if (!get_varint32(&data, &page->num_rows)) { - return Status::Corruption("Bad page, failed to decode 
rows count"); - } - if (_reader->is_nullable()) { - uint32_t null_bitmap_size = 0; - if (!get_varint32(&data, &null_bitmap_size)) { - return Status::Corruption("Bad page, failed to decode null bitmap size"); - } - if (null_bitmap_size > data.size) { - return Status::Corruption( - Substitute("Bad page, null bitmap too large $0 vs $1", null_bitmap_size, data.size)); - } - page->null_decoder = RleDecoder((uint8_t*)data.data, null_bitmap_size, 1); - page->null_bitmap = Slice(data.data, null_bitmap_size); - - // remove null bitmap - data.remove_prefix(null_bitmap_size); - } - - // create page data decoder - PageDecoderOptions options; - RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder)); - RETURN_IF_ERROR(page->data_decoder->init()); - - // lazy init dict_encoding'dict for three reasons - // 1. a column use dictionary encoding still has non-dict-encoded data pages are seeked,load dict when necessary - // 2. ColumnReader which is owned by Segment and Rowset can being alive even when there is no query,it should retain memory as small as possible. - // 3. Iterators of the same column won't repeat load the dict page because of page cache. + // dictionary page is read when the first data page that uses it is read, + // this is to optimize the memory usage: when there is no query on one column, we could + // release the memory of dictionary page. + // note that concurrent iterators for the same column won't repeatedly read dictionary page + // because of page cache. 
if (_reader->encoding_info()->encoding() == DICT_ENCODING) { - BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder; - if (binary_dict_page_decoder->is_dict_encoding()) { + auto dict_page_decoder = reinterpret_cast(_page->data_decoder); + if (dict_page_decoder->is_dict_encoding()) { if (_dict_decoder == nullptr) { - PagePointer pp = _reader->get_dict_page_pointer(); - RETURN_IF_ERROR(_reader->read_page(_file, pp, _opts, &_dict_page_handle)); - - _dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data())); + // read dictionary page + Slice dict_data; + PageFooterPB dict_footer; + RETURN_IF_ERROR(_reader->read_page( + _opts, _reader->get_dict_page_pointer(), + &_dict_page_handle, &dict_data, &dict_footer)); + // ignore dict_footer.dict_page_footer().encoding() due to only + // PLAIN_ENCODING is supported for dict page right now + _dict_decoder.reset(new BinaryPlainPageDecoder(dict_data)); RETURN_IF_ERROR(_dict_decoder->init()); } - binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get()); + dict_page_decoder->set_dict_decoder(_dict_decoder.get()); } } - - page->offset_in_page = 0; - page->page_index = iter.cur_idx(); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index ed5b6a6d7d..50265b0c43 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -21,17 +21,18 @@ #include // for size_t #include // for unique_ptr +#include "common/logging.h" #include "common/status.h" // for Status #include "gen_cpp/segment_v2.pb.h" // for ColumnMetaPB #include "olap/olap_cond.h" // for CondColumn #include "olap/tablet_schema.h" #include "olap/rowset/segment_v2/bitmap_index_reader.h" // for BitmapIndexReader -#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/ordinal_page_index.h" // for 
OrdinalPageIndexIterator -#include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges #include "olap/rowset/segment_v2/page_handle.h" // for PageHandle #include "olap/rowset/segment_v2/parsed_page.h" // for ParsedPage +#include "olap/rowset/segment_v2/zone_map_index.h" #include "util/once.h" #include "util/file_cache.h" @@ -41,12 +42,13 @@ class ColumnBlock; class RandomAccessFile; class TypeInfo; class BlockCompressionCodec; +class WrapperField; namespace segment_v2 { class EncodingInfo; class PageHandle; -class PagePointer; +struct PagePointer; class ColumnIterator; class BloomFilterIndexReader; @@ -54,14 +56,19 @@ struct ColumnReaderOptions { // whether verify checksum when read page bool verify_checksum = true; // for in memory olap table, use DURABLE CachePriority in page cache - bool cache_in_memory = false; + bool kept_in_memory = false; }; struct ColumnIteratorOptions { + RandomAccessFile* file = nullptr; // reader statistics OlapReaderStatistics* stats = nullptr; - RandomAccessFile* file = nullptr; bool use_page_cache = false; + + void sanity_check() const { + CHECK_NOTNULL(file); + CHECK_NOTNULL(stats); + } }; // There will be concurrent users to read the same column. So @@ -87,31 +94,25 @@ public: // Seek to the first entry in the column. 
Status seek_to_first(OrdinalPageIndexIterator* iter); - Status seek_at_or_before(rowid_t rowid, OrdinalPageIndexIterator* iter); + Status seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter); // read a page from file into a page handle - // use reader owned _file(usually is Descriptor*) to read page - Status read_page(const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle); - - // read a page from file into a page handle - // use file(usually is RandomAccessFile*) to read page - Status read_page(RandomAccessFile* file, const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle); + Status read_page(const ColumnIteratorOptions& iter_opts, const PagePointer& pp, + PageHandle* handle, Slice* page_body, PageFooterPB* footer); bool is_nullable() const { return _meta.is_nullable(); } const EncodingInfo* encoding_info() const { return _encoding_info; } - const TypeInfo* type_info() const { return _type_info; } - bool has_zone_map() const { return _meta.has_zone_map_page(); } + bool has_zone_map() const { return _zone_map_index_meta != nullptr; } + bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; } + bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; } - bool has_bitmap_index() { - return _meta.has_bitmap_index(); - } - - bool has_bloom_filter_index() { - return _meta.has_bloom_filter_index(); - } + // Check if this column could match `cond' using segment zone map. + // Since segment zone map is stored in metadata, this function is fast without I/O. + // Return true if segment zone map is absent or `cond' could be satisfied, false otherwise. + bool match_condition(CondColumn* cond) const; // get row ranges with zone map // - cond_column is user's query predicate @@ -137,18 +138,24 @@ private: // May be called multiple times, subsequent calls will no op. 
Status _ensure_index_loaded() { return _load_index_once.call([this] { - RETURN_IF_ERROR(_load_zone_map_index()); - RETURN_IF_ERROR(_load_ordinal_index()); - RETURN_IF_ERROR(_load_bitmap_index()); - RETURN_IF_ERROR(_load_bloom_filter_index()); + bool use_page_cache = !config::disable_storage_page_cache; + RETURN_IF_ERROR(_load_zone_map_index(use_page_cache, _opts.kept_in_memory)); + RETURN_IF_ERROR(_load_ordinal_index(use_page_cache, _opts.kept_in_memory)); + RETURN_IF_ERROR(_load_bitmap_index(use_page_cache, _opts.kept_in_memory)); + RETURN_IF_ERROR(_load_bloom_filter_index(use_page_cache, _opts.kept_in_memory)); return Status::OK(); }); } - Status _load_zone_map_index(); - Status _load_ordinal_index(); - Status _load_bitmap_index(); - Status _load_bloom_filter_index(); + Status _load_zone_map_index(bool use_page_cache, bool kept_in_memory); + Status _load_ordinal_index(bool use_page_cache, bool kept_in_memory); + Status _load_bitmap_index(bool use_page_cache, bool kept_in_memory); + Status _load_bloom_filter_index(bool use_page_cache, bool kept_in_memory); + + bool _zone_map_match_condition(const ZoneMapPB& zone_map, + WrapperField* min_value_container, + WrapperField* max_value_container, + CondColumn* cond) const; Status _get_filtered_pages(CondColumn* cond_column, const std::vector& delete_conditions, @@ -167,12 +174,17 @@ private: const TypeInfo* _type_info = nullptr; const EncodingInfo* _encoding_info = nullptr; const BlockCompressionCodec* _compress_codec = nullptr; + // meta for various column indexes (null if the index is absent) + const ZoneMapIndexPB* _zone_map_index_meta = nullptr; + const OrdinalIndexPB* _ordinal_index_meta = nullptr; + const BitmapIndexPB* _bitmap_index_meta = nullptr; + const BloomFilterIndexPB* _bf_index_meta = nullptr; DorisCallOnce _load_index_once; - std::unique_ptr _column_zone_map; - std::unique_ptr _ordinal_index; - std::unique_ptr _bitmap_index_reader; - std::unique_ptr _bloom_filter_index_reader; + std::unique_ptr 
_zone_map_index; + std::unique_ptr _ordinal_index; + std::unique_ptr _bitmap_index; + std::unique_ptr _bloom_filter_index; }; // Base iterator to read one column data @@ -193,14 +205,14 @@ public: // Entry 0 is the first entry written to the column. // If provided seek point is past the end of the file, // then returns false. - virtual Status seek_to_ordinal(rowid_t ord_idx) = 0; + virtual Status seek_to_ordinal(ordinal_t ord) = 0; // After one seek, we can call this function many times to read data // into ColumnBlockView. when read string type data, memory will allocated // from MemPool virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0; - virtual rowid_t get_current_ordinal() const = 0; + virtual ordinal_t get_current_ordinal() const = 0; virtual Status get_row_ranges_by_zone_map(CondColumn* cond_column, const std::vector& delete_conditions, @@ -238,20 +250,13 @@ public: FileColumnIterator(ColumnReader* reader); ~FileColumnIterator() override; - Status init(const ColumnIteratorOptions& opts) override { - RETURN_IF_ERROR(ColumnIterator::init(opts)); - DCHECK(_opts.file != nullptr); - _file = _opts.file; - return Status::OK(); - } - Status seek_to_first() override; - Status seek_to_ordinal(rowid_t ord_idx) override; + Status seek_to_ordinal(ordinal_t ord) override; Status next_batch(size_t* n, ColumnBlockView* dst) override; - rowid_t get_current_ordinal() const override { return _current_rowid; } + ordinal_t get_current_ordinal() const override { return _current_ordinal; } // get row ranges by zone map // - cond_column is user's query predicate @@ -263,9 +268,9 @@ public: Status get_row_ranges_by_bloom_filter(CondColumn* cond_column, RowRanges* row_ranges) override; private: - void _seek_to_pos_in_page(ParsedPage* page, uint32_t offset_in_page); + void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page); Status _load_next_page(bool* eos); - Status _read_page(const OrdinalPageIndexIterator& iter, ParsedPage* page); + Status 
_read_data_page(const OrdinalPageIndexIterator& iter); private: ColumnReader* _reader; @@ -286,13 +291,11 @@ private: // This value will be reset when a new seek is issued OrdinalPageIndexIterator _page_iter; - // current rowid - rowid_t _current_rowid = 0; + // current value ordinal + ordinal_t _current_ordinal = 0; // page indexes those are DEL_PARTIAL_SATISFIED std::vector _delete_partial_statisfied_pages; - - RandomAccessFile* _file; }; // This iterator is used to read default value column @@ -315,14 +318,14 @@ public: return Status::OK(); } - Status seek_to_ordinal(rowid_t ord_idx) override { + Status seek_to_ordinal(ordinal_t ord_idx) override { _current_rowid = ord_idx; return Status::OK(); } Status next_batch(size_t* n, ColumnBlockView* dst) override; - rowid_t get_current_ordinal() const override { return _current_rowid; } + ordinal_t get_current_ordinal() const override { return _current_rowid; } private: bool _has_default_value; @@ -337,7 +340,7 @@ private: std::unique_ptr _pool; // current rowid - rowid_t _current_rowid = 0; + ordinal_t _current_rowid = 0; }; } diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 7839baf200..bf3f1b051f 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -19,22 +19,21 @@ #include -#include "common/logging.h" // for LOG -#include "env/env.h" // for LOG -#include "gutil/strings/substitute.h" // for Substitute +#include "common/logging.h" +#include "env/env.h" +#include "gutil/strings/substitute.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" -#include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo -#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions -#include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexBuilder -#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder -#include 
"olap/rowset/segment_v2/page_compression.h" -#include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/bloom_filter.h" -#include "olap/types.h" // for TypeInfo -#include "util/crc32c.h" -#include "util/faststring.h" // for fastring -#include "util/rle_encoding.h" // for RleEncoder +#include "olap/rowset/segment_v2/bloom_filter_index_writer.h" +#include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/ordinal_page_index.h" +#include "olap/rowset/segment_v2/page_builder.h" +#include "olap/rowset/segment_v2/page_io.h" +#include "olap/rowset/segment_v2/zone_map_index.h" #include "util/block_compression.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" namespace doris { namespace segment_v2 { @@ -43,23 +42,28 @@ using strings::Substitute; class NullBitmapBuilder { public: - NullBitmapBuilder() : _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) { + NullBitmapBuilder() : _has_null(false), _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) { } explicit NullBitmapBuilder(size_t reserve_bits) - : _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) { + : _has_null(false), _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) { } void add_run(bool value, size_t run) { + _has_null |= value; _rle_encoder.Put(value, run); } + // Returns whether the building nullmap contains NULL + bool has_null() const { return _has_null; } + OwnedSlice finish() { _rle_encoder.Flush(); return _bitmap_buf.build(); } void reset() { + _has_null = false; _rle_encoder.Clear(); } @@ -67,19 +71,27 @@ public: return _bitmap_buf.size(); } private: + bool _has_null; faststring _bitmap_buf; RleEncoder _rle_encoder; }; ColumnWriter::ColumnWriter(const ColumnWriterOptions& opts, std::unique_ptr field, - bool is_nullable, - WritableFile* output_file) - : _opts(opts), - _is_nullable(is_nullable), - _output_file(output_file), + WritableFile* output_file) : + 
_opts(opts), _field(std::move(field)), + _output_file(output_file), + _is_nullable(_opts.meta->is_nullable()), _data_size(0) { + // these opts.meta fields should be set by client + DCHECK(opts.meta->has_column_id()); + DCHECK(opts.meta->has_unique_id()); + DCHECK(opts.meta->has_type()); + DCHECK(opts.meta->has_length()); + DCHECK(opts.meta->has_encoding()); + DCHECK(opts.meta->has_compression()); + DCHECK(opts.meta->has_is_nullable()); } ColumnWriter::~ColumnWriter() { @@ -93,10 +105,13 @@ ColumnWriter::~ColumnWriter() { } Status ColumnWriter::init() { - RETURN_IF_ERROR(EncodingInfo::get(_field->type_info(), _opts.encoding_type, &_encoding_info)); - if (_opts.compression_type != NO_COMPRESSION) { - RETURN_IF_ERROR(get_block_compression_codec(_opts.compression_type, &_compress_codec)); - } + RETURN_IF_ERROR(EncodingInfo::get(_field->type_info(), _opts.meta->encoding(), &_encoding_info)); + _opts.meta->set_encoding(_encoding_info->encoding()); + // should store more concrete encoding type instead of DEFAULT_ENCODING + // because the default encoding of a data type can be changed in the future + DCHECK_NE(_opts.meta->encoding(), DEFAULT_ENCODING); + + RETURN_IF_ERROR(get_block_compression_codec(_opts.meta->compression(), &_compress_codec)); // create page builder PageBuilder* page_builder = nullptr; @@ -106,17 +121,17 @@ Status ColumnWriter::init() { if (page_builder == nullptr) { return Status::NotSupported( Substitute("Failed to create page builder for type $0 and encoding $1", - _field->type(), _opts.encoding_type)); + _field->type(), _opts.meta->encoding())); } _page_builder.reset(page_builder); // create ordinal builder - _ordinal_index_builder.reset(new OrdinalPageIndexBuilder()); + _ordinal_index_builder.reset(new OrdinalIndexWriter()); // create null bitmap builder if (_is_nullable) { _null_bitmap_builder.reset(new NullBitmapBuilder()); } if (_opts.need_zone_map) { - _column_zone_map_builder.reset(new ColumnZoneMapBuilder(_field.get())); + 
_zone_map_index_builder.reset(new ZoneMapIndexWriter(_field.get())); } if (_opts.need_bitmap_index) { RETURN_IF_ERROR(BitmapIndexWriter::create(_field->type_info(), &_bitmap_index_builder)); @@ -132,7 +147,7 @@ Status ColumnWriter::append_nulls(size_t num_rows) { _null_bitmap_builder->add_run(true, num_rows); _next_rowid += num_rows; if (_opts.need_zone_map) { - RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1)); + _zone_map_index_builder->add_nulls(num_rows); } if (_opts.need_bitmap_index) { _bitmap_index_builder->add_nulls(num_rows); @@ -156,7 +171,7 @@ Status ColumnWriter::_append_data(const uint8_t** ptr, size_t num_rows) { size_t num_written = remaining; RETURN_IF_ERROR(_page_builder->add(*ptr, &num_written)); if (_opts.need_zone_map) { - RETURN_IF_ERROR(_column_zone_map_builder->add(*ptr, num_written)); + _zone_map_index_builder->add_values(*ptr, num_written); } if (_opts.need_bitmap_index) { _bitmap_index_builder->add_values(*ptr, num_written); @@ -193,7 +208,7 @@ Status ColumnWriter::append_nullable( _null_bitmap_builder->add_run(true, this_run); _next_rowid += this_run; if (_opts.need_zone_map) { - RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1)); + _zone_map_index_builder->add_nulls(this_run); } if (_opts.need_bitmap_index) { _bitmap_index_builder->add_nulls(this_run); @@ -216,7 +231,7 @@ uint64_t ColumnWriter::estimate_buffer_size() { } size += _ordinal_index_builder->size(); if (_opts.need_zone_map) { - size += _column_zone_map_builder->size(); + size += _zone_map_index_builder->size(); } if (_opts.need_bitmap_index) { size += _bitmap_index_builder->size(); @@ -239,189 +254,110 @@ Status ColumnWriter::write_data() { } // write column dict if (_encoding_info->encoding() == DICT_ENCODING) { - OwnedSlice dict_page; - _page_builder->get_dictionary_page(&dict_page); - std::vector origin_data; - origin_data.push_back(dict_page.slice()); - RETURN_IF_ERROR(_compress_and_write_page(&origin_data, &_dict_page_pp)); + OwnedSlice dict_body; + 
RETURN_IF_ERROR(_page_builder->get_dictionary_page(&dict_body)); + + PageFooterPB footer; + footer.set_type(DICTIONARY_PAGE); + footer.set_uncompressed_size(dict_body.slice().get_size()); + footer.mutable_dict_page_footer()->set_encoding(PLAIN_ENCODING); + + PagePointer dict_pp; + RETURN_IF_ERROR(PageIO::compress_and_write_page( + _compress_codec, _opts.compression_min_space_saving, _output_file, + { dict_body.slice() }, footer, &dict_pp)); + dict_pp.to_proto(_opts.meta->mutable_dict_page()); } return Status::OK(); } Status ColumnWriter::write_ordinal_index() { - Slice data = _ordinal_index_builder->finish(); - std::vector slices{data}; - auto st = _compress_and_write_page(&slices, &_ordinal_index_pp); - return st; + return _ordinal_index_builder->finish(_output_file, _opts.meta->add_indexes()); } Status ColumnWriter::write_zone_map() { if (_opts.need_zone_map) { - OwnedSlice data = _column_zone_map_builder->finish(); - std::vector slices{data.slice()}; - RETURN_IF_ERROR(_compress_and_write_page(&slices, &_zone_map_pp)); + return _zone_map_index_builder->finish(_output_file, _opts.meta->add_indexes()); } return Status::OK(); } Status ColumnWriter::write_bitmap_index() { - if (!_opts.need_bitmap_index) { - return Status::OK(); + if (_opts.need_bitmap_index) { + return _bitmap_index_builder->finish(_output_file, _opts.meta->add_indexes()); } - return _bitmap_index_builder->finish(_output_file, &_bitmap_index_meta); + return Status::OK(); } Status ColumnWriter::write_bloom_filter_index() { - if (!_opts.need_bloom_filter) { - return Status::OK(); - } - return _bloom_filter_index_builder->finish(_output_file, &_bloom_filter_index_meta); -} - -void ColumnWriter::write_meta(ColumnMetaPB* meta) { - meta->set_type(_field->type()); - meta->set_encoding(_encoding_info->encoding()); - // should store more concrete encoding type instead of DEFAULT_ENCODING - // because the default encoding of a data type can be changed in the future - DCHECK_NE(meta->encoding(), 
DEFAULT_ENCODING); - meta->set_compression(_opts.compression_type); - meta->set_is_nullable(_is_nullable); - _ordinal_index_pp.to_proto(meta->mutable_ordinal_index_page()); - if (_opts.need_zone_map) { - _zone_map_pp.to_proto(meta->mutable_zone_map_page()); - _column_zone_map_builder->fill_segment_zone_map(meta->mutable_zone_map()); - } - if (_encoding_info->encoding() == DICT_ENCODING) { - _dict_page_pp.to_proto(meta->mutable_dict_page()); - } - if (_opts.need_bitmap_index) { - meta->mutable_bitmap_index()->CopyFrom(_bitmap_index_meta); - } if (_opts.need_bloom_filter) { - meta->mutable_bloom_filter_index()->CopyFrom(_bloom_filter_index_meta); + return _bloom_filter_index_builder->finish(_output_file, _opts.meta->add_indexes()); } + return Status::OK(); } -// write a page into file and update ordinal index -// this function will call _write_physical_page to write data +// write a data page into file and update ordinal index Status ColumnWriter::_write_data_page(Page* page) { PagePointer pp; - std::vector origin_data; + std::vector compressed_body; for (auto& data : page->data) { - origin_data.push_back(data.slice()); + compressed_body.push_back(data.slice()); } - RETURN_IF_ERROR(_write_physical_page(&origin_data, &pp)); - _ordinal_index_builder->append_entry(page->first_rowid, pp); - return Status::OK(); -} - -Status ColumnWriter::_compress_and_write_page(std::vector* origin_data, PagePointer* pp) { - std::vector* output_data = origin_data; - std::vector compressed_data; - - // Put compressor out of if block, because we will use compressor's - // content until this function finished. 
- PageCompressor compressor(_compress_codec); - if (_compress_codec != nullptr) { - RETURN_IF_ERROR(compressor.compress(*origin_data, &compressed_data)); - output_data = &compressed_data; - } - return _write_physical_page(output_data, pp); -} - -// write a physical page in to files -Status ColumnWriter::_write_physical_page(std::vector* origin_data, PagePointer* pp) { - // checksum - uint8_t checksum_buf[sizeof(uint32_t)]; - uint32_t checksum = crc32c::Value(*origin_data); - encode_fixed32_le(checksum_buf, checksum); - origin_data->emplace_back(checksum_buf, sizeof(uint32_t)); - - // remember the offset - pp->offset = _output_file->size(); - // write content to file - size_t bytes_written = 0; - RETURN_IF_ERROR(_write_raw_data(*origin_data, &bytes_written)); - pp->size = bytes_written; - - return Status::OK(); -} - -// write raw data into file, this is the only place to write data -Status ColumnWriter::_write_raw_data(const std::vector& data, size_t* bytes_written) { - auto file_size = _output_file->size(); - auto st = _output_file->appendv(&data[0], data.size()); - if (!st.ok()) { - LOG(WARNING) << "failed to append data to file, st=" << st.to_string(); - return st; - } - *bytes_written = _output_file->size() - file_size; - _written_size += *bytes_written; + RETURN_IF_ERROR(PageIO::write_page(_output_file, compressed_body, page->footer, &pp)); + _ordinal_index_builder->append_entry(page->footer.data_page_footer().first_ordinal(), pp); return Status::OK(); } Status ColumnWriter::_finish_current_page() { - if (_next_rowid == _last_first_rowid) { + if (_next_rowid == _first_rowid) { return Status::OK(); } - std::unique_ptr page(new Page()); - page->first_rowid = _last_first_rowid; - page->num_rows = _next_rowid - _last_first_rowid; - faststring header; - // 1. first rowid - put_varint32(&header, page->first_rowid); - // 2. 
row count - put_varint32(&header, page->num_rows); - OwnedSlice null_bitmap; - if (_is_nullable) { - null_bitmap = _null_bitmap_builder->finish(); - _null_bitmap_builder->reset(); - put_varint32(&header, null_bitmap.slice().get_size()); - } - page->data.emplace_back(std::move(header.build())); - if (_is_nullable) { - page->data.emplace_back(std::move(null_bitmap)); - } - OwnedSlice data_slice = _page_builder->finish(); - _page_builder->reset(); - page->data.emplace_back(std::move(data_slice)); - - // compressed data - if (_compress_codec != nullptr) { - PageCompressor compressor(_compress_codec); - std::vector data_slices; - size_t origin_size = 0; - for (auto& data : page->data) { - data_slices.push_back(data.slice()); - origin_size += data.slice().size; - } - OwnedSlice compressed_data; - bool compressed = false; - RETURN_IF_ERROR(compressor.compress(data_slices, &compressed_data, &compressed)); - if (compressed) { - page->data.clear(); - page->data.emplace_back(std::move(compressed_data)); - } else { - size_t uncompressed_bytes = Slice::compute_total_size(data_slices); - faststring buf; - buf.resize(4); - encode_fixed32_le((uint8_t*)buf.data(), uncompressed_bytes); - page->data.emplace_back(std::move(buf.build())); - } - } - - // update last first rowid - _last_first_rowid = _next_rowid; - - _push_back_page(page.release()); if (_opts.need_zone_map) { - RETURN_IF_ERROR(_column_zone_map_builder->flush()); + RETURN_IF_ERROR(_zone_map_index_builder->flush()); } if (_opts.need_bloom_filter) { RETURN_IF_ERROR(_bloom_filter_index_builder->flush()); } + + // build data page body : encoded values + [nullmap] + vector body; + OwnedSlice encoded_values = _page_builder->finish(); + _page_builder->reset(); + body.push_back(encoded_values.slice()); + + OwnedSlice nullmap; + if (_is_nullable && _null_bitmap_builder->has_null()) { + nullmap = _null_bitmap_builder->finish(); + _null_bitmap_builder->reset(); + body.push_back(nullmap.slice()); + } + + // prepare data page footer + 
std::unique_ptr page(new Page()); + page->footer.set_type(DATA_PAGE); + page->footer.set_uncompressed_size(Slice::compute_total_size(body)); + auto data_page_footer = page->footer.mutable_data_page_footer(); + data_page_footer->set_first_ordinal(_first_rowid); + data_page_footer->set_num_values(_next_rowid - _first_rowid); + data_page_footer->set_nullmap_size(nullmap.slice().size); + + // trying to compress page body + OwnedSlice compressed_body; + RETURN_IF_ERROR(PageIO::compress_page_body( + _compress_codec, _opts.compression_min_space_saving, body, &compressed_body)); + if (compressed_body.slice().empty()) { + // page body is uncompressed + page->data.emplace_back(std::move(encoded_values)); + page->data.emplace_back(std::move(nullmap)); + } else { + // page body is compressed + page->data.emplace_back(std::move(compressed_body)); + } + + _push_back_page(page.release()); + _first_rowid = _next_rowid; return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 63404d9de3..43039e3edf 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -21,8 +21,7 @@ #include "common/status.h" // for Status #include "gen_cpp/segment_v2.pb.h" // for EncodingTypePB -#include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMapBuilder -#include "olap/rowset/segment_v2/common.h" // for rowid_t +#include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer #include "util/bitmap.h" // for BitmapChange #include "util/slice.h" // for OwnedSlice @@ -36,8 +35,10 @@ class BlockCompressionCodec; namespace segment_v2 { struct ColumnWriterOptions { - EncodingTypePB encoding_type = DEFAULT_ENCODING; - CompressionTypePB compression_type = segment_v2::CompressionTypePB::LZ4F; + // input and output parameter: + // - input: column_id/unique_id/type/length/encoding/compression/is_nullable members + // - 
output: encoding/indexes/dict_page members + ColumnMetaPB* meta; size_t data_page_size = 64 * 1024; // store compressed page only when space saving is above the threshold. // space saving = 1 - compressed_size / uncompressed_size @@ -50,9 +51,10 @@ struct ColumnWriterOptions { class BitmapIndexWriter; class EncodingInfo; class NullBitmapBuilder; -class OrdinalPageIndexBuilder; +class OrdinalIndexWriter; class PageBuilder; class BloomFilterIndexWriter; +class ZoneMapIndexWriter; // Encode one column's data into some memory slice. // Because some columns would be stored in a file, we should wait @@ -62,7 +64,6 @@ class ColumnWriter { public: ColumnWriter(const ColumnWriterOptions& opts, std::unique_ptr field, - bool is_nullable, WritableFile* output_file); ~ColumnWriter(); @@ -102,19 +103,17 @@ public: Status write_zone_map(); Status write_bitmap_index(); Status write_bloom_filter_index(); - void write_meta(ColumnMetaPB* meta); private: // All Pages will be organized into a linked list struct Page { - int32_t first_rowid; - int32_t num_rows; // the data vector may contain: - // 1. one OwnedSlice if the data is compressed - // 2. one OwnedSlice if the data is not compressed and is not nullable - // 3. two OwnedSlice if the data is not compressed and is nullable + // 1. one OwnedSlice if the page body is compressed + // 2. one OwnedSlice if the page body is not compressed and doesn't have nullmap + // 3. 
two OwnedSlice if the page body is not compressed and has nullmap // use vector for easier management for lifetime of OwnedSlice std::vector data; + PageFooterPB footer; Page* next = nullptr; }; @@ -135,45 +134,37 @@ private: for (auto& data_slice : page->data) { _data_size += data_slice.slice().size; } + // estimate (page footer + footer size + checksum) took 20 bytes + _data_size += 20; } Status _append_data(const uint8_t** ptr, size_t num_rows); Status _finish_current_page(); - Status _write_raw_data(const std::vector& data, size_t* bytes_written); - Status _write_data_page(Page* page); - Status _compress_and_write_page(std::vector* origin_data, PagePointer* pp); - Status _write_physical_page(std::vector* origin_data, PagePointer* pp); private: ColumnWriterOptions _opts; + std::unique_ptr _field; + WritableFile* _output_file; bool _is_nullable; - WritableFile* _output_file = nullptr; + // total size of data page list + uint64_t _data_size; // cached generated pages, PageHead _pages; - rowid_t _last_first_rowid = 0; - rowid_t _next_rowid = 0; + ordinal_t _first_rowid = 0; + ordinal_t _next_rowid = 0; const EncodingInfo* _encoding_info = nullptr; const BlockCompressionCodec* _compress_codec = nullptr; std::unique_ptr _page_builder; std::unique_ptr _null_bitmap_builder; - std::unique_ptr _ordinal_index_builder; - std::unique_ptr _column_zone_map_builder; - std::unique_ptr _field; + + std::unique_ptr _ordinal_index_builder; + std::unique_ptr _zone_map_index_builder; std::unique_ptr _bitmap_index_builder; std::unique_ptr _bloom_filter_index_builder; - BitmapIndexColumnPB _bitmap_index_meta; - BloomFilterIndexPB _bloom_filter_index_meta; - - PagePointer _ordinal_index_pp; - PagePointer _zone_map_pp; - PagePointer _dict_page_pp; - // the total data size of page list - uint64_t _data_size; - uint64_t _written_size = 0; }; diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp deleted file mode 100644 index 
659880c219..0000000000 --- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "olap/rowset/segment_v2/column_zone_map.h" - -#include "olap/olap_define.h" - -namespace doris { - -namespace segment_v2 { - -ColumnZoneMapBuilder::ColumnZoneMapBuilder(Field* field) : _field(field), _pool(&_tracker) { - PageBuilderOptions options; - options.data_page_size = 0; - _page_builder.reset(new BinaryPlainPageBuilder(options)); - _zone_map.min_value = _field->allocate_value(&_pool); - _zone_map.max_value = _field->allocate_value(&_pool); - _reset_page_zone_map(); - _segment_zone_map.min_value = _field->allocate_value(&_pool); - _segment_zone_map.max_value = _field->allocate_value(&_pool); - _reset_segment_zone_map(); -} - -Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) { - if (vals != nullptr) { - for (int i = 0; i < count; ++i) { - if (_field->compare(_zone_map.min_value, (char *)vals) > 0) { - _field->type_info()->direct_copy(_zone_map.min_value, (const char *)vals); - } - if (_field->compare(_zone_map.max_value, (char *)vals) < 0) { - _field->type_info()->direct_copy(_zone_map.max_value, (const char *)vals); - } - 
vals += _field->size(); - if (!_zone_map.has_not_null) { - _zone_map.has_not_null = true; - } - } - } - else { - if (!_zone_map.has_null) { - _zone_map.has_null = true; - } - } - return Status::OK(); -} - -void ColumnZoneMapBuilder::fill_segment_zone_map(ZoneMapPB* const to) { - _fill_zone_map_to_pb(_segment_zone_map, to); -} - -Status ColumnZoneMapBuilder::flush() { - // Update segment zone map. - if (_field->compare(_segment_zone_map.min_value, _zone_map.min_value) > 0) { - _field->type_info()->direct_copy(_segment_zone_map.min_value, _zone_map.min_value); - } - if (_field->compare(_segment_zone_map.max_value, _zone_map.max_value) < 0) { - _field->type_info()->direct_copy(_segment_zone_map.max_value, _zone_map.max_value); - } - if (!_segment_zone_map.has_null && _zone_map.has_null) { - _segment_zone_map.has_null = true; - } - if (!_segment_zone_map.has_not_null && _zone_map.has_not_null) { - _segment_zone_map.has_not_null = true; - } - - ZoneMapPB page_zone_map; - _fill_zone_map_to_pb(_zone_map, &page_zone_map); - - std::string serialized_zone_map; - bool ret = page_zone_map.SerializeToString(&serialized_zone_map); - if (!ret) { - return Status::InternalError("serialize zone map failed"); - } - Slice data(serialized_zone_map.data(), serialized_zone_map.size()); - size_t num = 1; - RETURN_IF_ERROR(_page_builder->add((const uint8_t *)&data, &num)); - // reset the variables - // we should allocate max varchar length and set to max for min value - _reset_page_zone_map(); - return Status::OK(); -} - -void ColumnZoneMapBuilder::_reset_zone_map(ZoneMap* zone_map) { - _field->set_to_max(zone_map->min_value); - _field->set_to_min(zone_map->max_value); - zone_map->has_null = false; - zone_map->has_not_null = false; -} - -void ColumnZoneMapBuilder::_fill_zone_map_to_pb(const ZoneMap& from, ZoneMapPB* const to) { - to->set_has_not_null(from.has_not_null); - to->set_has_null(from.has_null); - to->set_max(_field->to_string(from.max_value)); - 
to->set_min(_field->to_string(from.min_value)); -} - -Status ColumnZoneMap::load() { - BinaryPlainPageDecoder page_decoder(_data); - RETURN_IF_ERROR(page_decoder.init()); - _num_pages = page_decoder.count(); - _page_zone_maps.resize(_num_pages); - for (int i = 0; i < _num_pages; ++i) { - Slice data = page_decoder.string_at_index(i); - bool ret = _page_zone_maps[i].ParseFromString(std::string(data.data, data.size)); - if (!ret) { - return Status::Corruption("parse zone map failed"); - } - } - return Status::OK(); -} - -} // namespace segment_v2 -} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/common.h b/be/src/olap/rowset/segment_v2/common.h index 8ec64cb6ac..166aa9fca5 100644 --- a/be/src/olap/rowset/segment_v2/common.h +++ b/be/src/olap/rowset/segment_v2/common.h @@ -25,7 +25,10 @@ namespace doris { namespace segment_v2 { +// One segment file could store at most INT32_MAX rows, +// but due to array type, each column could store more than INT32_MAX values. using rowid_t = uint32_t; +using ordinal_t = uint64_t; } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/frame_of_reference_page.h b/be/src/olap/rowset/segment_v2/frame_of_reference_page.h index 3293931448..0c0b5d730c 100644 --- a/be/src/olap/rowset/segment_v2/frame_of_reference_page.h +++ b/be/src/olap/rowset/segment_v2/frame_of_reference_page.h @@ -20,7 +20,6 @@ #include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder #include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder #include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions -#include "olap/rowset/segment_v2/common.h" // for rowid_t #include "util/frame_of_reference_coding.h" namespace doris { diff --git a/be/src/olap/rowset/segment_v2/index_page.cpp b/be/src/olap/rowset/segment_v2/index_page.cpp index c361afa298..84525131d7 100644 --- a/be/src/olap/rowset/segment_v2/index_page.cpp +++ b/be/src/olap/rowset/segment_v2/index_page.cpp @@ -20,7 +20,6 
@@ #include #include "common/logging.h" -#include "olap/key_coder.h" #include "util/coding.h" namespace doris { @@ -38,17 +37,15 @@ bool IndexPageBuilder::is_full() const { return _buffer.size() + 16 > _index_page_size; } -Slice IndexPageBuilder::finish() { +void IndexPageBuilder::finish(OwnedSlice* body, PageFooterPB* footer) { DCHECK(!_finished) << "already called finish()"; - IndexPageFooterPB footer; - footer.set_num_entries(_count); - footer.set_type(_is_leaf ? IndexPageFooterPB::LEAF : IndexPageFooterPB::INTERNAL); + *body = _buffer.build(); - std::string footer_buf; - footer.SerializeToString(&footer_buf); - _buffer.append(footer_buf); - put_fixed32_le(&_buffer, footer_buf.size()); - return Slice(_buffer); + footer->set_type(INDEX_PAGE); + footer->set_uncompressed_size(body->slice().get_size()); + footer->mutable_index_page_footer()->set_num_entries(_count); + footer->mutable_index_page_footer()->set_type( + _is_leaf ? IndexPageFooterPB::LEAF : IndexPageFooterPB::INTERNAL); } Status IndexPageBuilder::get_first_key(Slice* key) const { @@ -65,15 +62,11 @@ Status IndexPageBuilder::get_first_key(Slice* key) const { /////////////////////////////////////////////////////////////////////////////// -Status IndexPageReader::parse(const Slice& data) { - size_t buffer_len = data.size; - const uint8_t* buffer = (uint8_t*)data.data; - size_t footer_size = decode_fixed32_le(buffer + buffer_len - 4); - std::string footer_buf(data.data + buffer_len - 4 - footer_size, footer_size); - _footer.ParseFromString(footer_buf); +Status IndexPageReader::parse(const Slice& body, const IndexPageFooterPB& footer) { + _footer = footer; size_t num_entries = _footer.num_entries(); - Slice input(data); + Slice input(body); for (int i = 0; i < num_entries; ++i) { Slice key; PagePointer value; diff --git a/be/src/olap/rowset/segment_v2/index_page.h b/be/src/olap/rowset/segment_v2/index_page.h index 1a8923a1fb..2cee7a4fd7 100644 --- a/be/src/olap/rowset/segment_v2/index_page.h +++ 
b/be/src/olap/rowset/segment_v2/index_page.h @@ -31,22 +31,18 @@ namespace doris { namespace segment_v2 { -class IndexPageIterator; // forward decl. - // IndexPage is the building block for IndexedColumn's ordinal index and value index. // It is used to guide searching for a particular key to the data page containing it. // We use the same general format for all index pages, regardless of the data type and node type (leaf or internal) -// IndexPage := IndexEntry^NumEntry, IndexPageFooterPB, IndexPageFooterPBSize(4) -// IndexEntry := IndexKey, PagePointer -// IndexKey := KeyLength(vint32), KeyData(KeyLength bytes) -// PagePointer := PageOffset(vint64), PageSize(vint32) +// IndexPageBody := IndexEntry^NumEntry +// IndexEntry := KeyLength(vint), Byte^KeyLength, PageOffset(vlong), PageSize(vint) // // IndexPageFooterPB records NumEntry and type (leaf/internal) of the index page. // For leaf, IndexKey records the first/smallest key of the data page PagePointer points to. // For internal, IndexKey records the first/smallest key of the next-level index page PagePointer points to. // // All keys are treated as binary string and compared with memcpy. Keys of other data type are encoded first by -// KeyCoder, e.g., ordinal index's original key type is uint32_t but is encoded to binary string. +// KeyCoder, e.g., ordinal index's original key type is uint64_t but is encoded to binary string. 
class IndexPageBuilder { public: explicit IndexPageBuilder(size_t index_page_size, bool is_leaf) @@ -59,7 +55,7 @@ public: size_t count() const { return _count; } - Slice finish(); + void finish(OwnedSlice* body, PageFooterPB* footer); uint64_t size() { return _buffer.size(); @@ -87,9 +83,9 @@ private: class IndexPageIterator; class IndexPageReader { public: - IndexPageReader() : _parsed(false) {}; + IndexPageReader() : _parsed(false) {} - Status parse(const Slice& data); + Status parse(const Slice& body, const IndexPageFooterPB& footer); inline size_t count() const { DCHECK(_parsed); diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index 20661a26f9..0bb5396ca0 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -21,12 +21,7 @@ #include "gutil/strings/substitute.h" // for Substitute #include "olap/key_coder.h" #include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo -#include "olap/rowset/segment_v2/index_page.h" // for IndexPageReader -#include "olap/rowset/segment_v2/options.h" // for PageDecoderOptions -#include "olap/rowset/segment_v2/page_compression.h" -#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer -#include "util/crc32c.h" -#include "util/rle_encoding.h" // for RleDecoder +#include "olap/rowset/segment_v2/page_io.h" #include "util/file_manager.h" namespace doris { @@ -34,7 +29,10 @@ namespace segment_v2 { using strings::Substitute; -Status IndexedColumnReader::load() { +Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory) { + _use_page_cache = use_page_cache; + _kept_in_memory = kept_in_memory; + _type_info = get_type_info((FieldType)_meta.data_type()); if (_type_info == nullptr) { return Status::NotSupported(Substitute("unsupported typeinfo, type=$0", _meta.data_type())); @@ -51,8 +49,10 @@ Status IndexedColumnReader::load() { if 
(_meta.ordinal_index_meta().is_root_data_page()) { _sole_data_page = PagePointer(_meta.ordinal_index_meta().root_page()); } else { - RETURN_IF_ERROR(read_page(input_file, _meta.ordinal_index_meta().root_page(), &_ordinal_index_page_handle)); - RETURN_IF_ERROR(_ordinal_index_reader.parse(_ordinal_index_page_handle.data())); + RETURN_IF_ERROR(load_index_page(input_file, + _meta.ordinal_index_meta().root_page(), + &_ordinal_index_page_handle, + &_ordinal_index_reader)); _has_index_page = true; } } @@ -62,8 +62,10 @@ Status IndexedColumnReader::load() { if (_meta.value_index_meta().is_root_data_page()) { _sole_data_page = PagePointer(_meta.value_index_meta().root_page()); } else { - RETURN_IF_ERROR(read_page(input_file, _meta.value_index_meta().root_page(), &_value_index_page_handle)); - RETURN_IF_ERROR(_value_index_reader.parse(_value_index_page_handle.data())); + RETURN_IF_ERROR(load_index_page(input_file, + _meta.value_index_meta().root_page(), + &_value_index_page_handle, + &_value_index_reader)); _has_index_page = true; } } @@ -71,91 +73,45 @@ Status IndexedColumnReader::load() { return Status::OK(); } -Status IndexedColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp, PageHandle* handle) const { - auto cache = StoragePageCache::instance(); - PageCacheHandle cache_handle; - StoragePageCache::CacheKey cache_key(file->file_name(), pp.offset); - // column index only load once, so we use global config to decide - if (!config::disable_storage_page_cache && cache->lookup(cache_key, &cache_handle)) { - // we find page in cache, use it - *handle = PageHandle(std::move(cache_handle)); - return Status::OK(); - } - // Now we read this from file. 
- size_t page_size = pp.size; - if (page_size < sizeof(uint32_t)) { - return Status::Corruption(Substitute("Bad page, page size is too small, size=$0", page_size)); - } - - // Now we use this buffer to store page from storage, if this page is compressed - // this buffer will assigned uncompressed page, and origin content will be freed. - std::unique_ptr page(new uint8_t[page_size]); - Slice page_slice(page.get(), page_size); - RETURN_IF_ERROR(file->read_at(pp.offset, page_slice)); - - size_t data_size = page_size - 4; - if (_verify_checksum) { - uint32_t expect = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4); - uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4); - if (expect != actual) { - return Status::Corruption( - Substitute("Page checksum mismatch, actual=$0 vs expect=$1", actual, expect)); - } - } - - // remove page's suffix - page_slice.size = data_size; - if (_compress_codec != nullptr) { - PageDecompressor decompressor(page_slice, _compress_codec); - - Slice uncompressed_page; - RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_page)); - - // If decompressor create new heap memory for uncompressed data, - // assign this uncompressed page to page and page slice - if (uncompressed_page.data != page_slice.data) { - page.reset((uint8_t*)uncompressed_page.data); - } - page_slice = uncompressed_page; - } - if (!config::disable_storage_page_cache) { - // insert this into cache and return the cache handle - cache->insert(cache_key, page_slice, &cache_handle, _cache_in_memory); - *handle = PageHandle(std::move(cache_handle)); - } else { - *handle = PageHandle(page_slice); - } - - page.release(); +Status IndexedColumnReader::load_index_page(RandomAccessFile* file, + const PagePointerPB& pp, + PageHandle* handle, + IndexPageReader* reader) { + Slice body; + PageFooterPB footer; + RETURN_IF_ERROR(read_page(file, PagePointer(pp), handle, &body, &footer)); + RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); 
return Status::OK(); } +Status IndexedColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp, + PageHandle* handle, Slice* body, PageFooterPB* footer) const { + PageReadOptions opts; + opts.file = file; + opts.page_pointer = pp; + opts.codec = _compress_codec; + OlapReaderStatistics tmp_stats; + opts.stats = &tmp_stats; + opts.use_page_cache = _use_page_cache; + opts.kept_in_memory = _kept_in_memory; + + return PageIO::read_and_decompress_page(opts, handle, body, footer); +} + /////////////////////////////////////////////////////////////////////////////// -Status IndexedColumnIterator::_read_data_page(const PagePointer& page_pointer, ParsedPage* page) { - RETURN_IF_ERROR(_reader->read_page(_file, page_pointer, &page->page_handle)); - Slice data = page->page_handle.data(); - - // decode first rowid - if (!get_varint32(&data, &page->first_rowid)) { - return Status::Corruption("Bad page, failed to decode first rowid"); - } - - // decode number rows - if (!get_varint32(&data, &page->num_rows)) { - return Status::Corruption("Bad page, failed to decode rows count"); - } - - // create page data decoder - PageDecoderOptions options; - RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder)); - RETURN_IF_ERROR(page->data_decoder->init()); - - page->offset_in_page = 0; - return Status::OK(); +Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) { + PageHandle handle; + Slice body; + PageFooterPB footer; + RETURN_IF_ERROR(_reader->read_page(_file, pp, &handle, &body, &footer)); + // parse data page + // note that page_index is not used in IndexedColumnIterator, so we pass 0 + return ParsedPage::create(std::move(handle), body, footer.data_page_footer(), + _reader->encoding_info(), pp, 0, &_data_page); } -Status IndexedColumnIterator::seek_to_ordinal(rowid_t idx) { +Status IndexedColumnIterator::seek_to_ordinal(ordinal_t idx) { DCHECK(idx >= 0 && idx <= _reader->num_values()); if 
(!_reader->support_ordinal_seek()) { @@ -164,30 +120,29 @@ Status IndexedColumnIterator::seek_to_ordinal(rowid_t idx) { // it's ok to seek past the last value if (idx == _reader->num_values()) { - _current_rowid = idx; + _current_ordinal = idx; _seeked = true; return Status::OK(); } if (_data_page == nullptr || !_data_page->contains(idx)) { // need to read the data page containing row at idx - _data_page.reset(new ParsedPage()); if (_reader->_has_index_page) { std::string key; - KeyCoderTraits::full_encode_ascending(&idx, &key); + KeyCoderTraits::full_encode_ascending(&idx, &key); RETURN_IF_ERROR(_ordinal_iter.seek_at_or_before(key)); - RETURN_IF_ERROR(_read_data_page(_ordinal_iter.current_page_pointer(), _data_page.get())); + RETURN_IF_ERROR(_read_data_page(_ordinal_iter.current_page_pointer())); _current_iter = &_ordinal_iter; } else { - RETURN_IF_ERROR(_read_data_page(_reader->_sole_data_page, _data_page.get())); + RETURN_IF_ERROR(_read_data_page(_reader->_sole_data_page)); } } - rowid_t offset_in_page = idx - _data_page->first_rowid; + ordinal_t offset_in_page = idx - _data_page->first_ordinal; RETURN_IF_ERROR(_data_page->data_decoder->seek_to_position_in_page(offset_in_page)); DCHECK(offset_in_page == _data_page->data_decoder->current_index()); _data_page->offset_in_page = offset_in_page; - _current_rowid = idx; + _current_ordinal = idx; _seeked = true; return Status::OK(); } @@ -221,27 +176,21 @@ Status IndexedColumnIterator::seek_at_or_after(const void* key, bool* exact_matc } if (load_data_page) { - _data_page.reset(new ParsedPage()); - RETURN_IF_ERROR(_read_data_page(data_page_pp, _data_page.get())); + RETURN_IF_ERROR(_read_data_page(data_page_pp)); } // seek inside data page RETURN_IF_ERROR(_data_page->data_decoder->seek_at_or_after_value(key, exact_match)); _data_page->offset_in_page = _data_page->data_decoder->current_index(); - _current_rowid = _data_page->first_rowid + _data_page->offset_in_page; - DCHECK(_data_page->contains(_current_rowid)); + 
_current_ordinal = _data_page->first_ordinal + _data_page->offset_in_page; + DCHECK(_data_page->contains(_current_ordinal)); _seeked = true; return Status::OK(); } -rowid_t IndexedColumnIterator::get_current_ordinal() const { - DCHECK(_seeked); - return _current_rowid; -} - Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view) { DCHECK(_seeked); - if (_current_rowid == _reader->num_values()) { + if (_current_ordinal == _reader->num_values()) { *n = 0; return Status::OK(); } @@ -257,8 +206,7 @@ Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view if (!has_next) { break; // no more data page } - _data_page.reset(new ParsedPage()); - RETURN_IF_ERROR(_read_data_page(_current_iter->current_page_pointer(), _data_page.get())); + RETURN_IF_ERROR(_read_data_page(_current_iter->current_page_pointer())); } size_t rows_to_read = std::min(_data_page->remaining(), remaining); @@ -267,7 +215,7 @@ Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view DCHECK(rows_to_read == rows_read); _data_page->offset_in_page += rows_read; - _current_rowid += rows_read; + _current_ordinal += rows_read; column_view->advance(rows_read); remaining -= rows_read; } diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index 8a874da78f..8e35784571 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -47,34 +47,34 @@ class IndexedColumnIterator; class IndexedColumnReader { public: explicit IndexedColumnReader(const std::string& file_name, - const IndexedColumnMetaPB& meta, - const bool cache_in_memory) - : _file_name(file_name), _meta(meta), _cache_in_memory(cache_in_memory) {}; + const IndexedColumnMetaPB& meta) + : _file_name(file_name), _meta(meta) {}; - Status load(); + Status load(bool use_page_cache, bool kept_in_memory); - // read a page from file into a page handle - // 
use file(usually is RandomAccessFile*) to read page - Status read_page(RandomAccessFile* file, const PagePointer& pp, PageHandle* handle) const; + // read a page specified by `pp' from `file' into `handle' + Status read_page(RandomAccessFile* file, const PagePointer& pp, + PageHandle* handle, Slice* body, PageFooterPB* footer) const; int64_t num_values() const { return _num_values; } - const EncodingInfo* encoding_info() const { return _encoding_info; } - const TypeInfo* type_info() const { return _type_info; } - bool support_ordinal_seek() const { return _meta.has_ordinal_index_meta(); } - bool support_value_seek() const { return _meta.has_value_index_meta(); } private: + Status load_index_page(RandomAccessFile* file, + const PagePointerPB& pp, + PageHandle* handle, + IndexPageReader* reader); + friend class IndexedColumnIterator; std::string _file_name; IndexedColumnMetaPB _meta; - // if _cache_in_memory is true, we will use DURABLE CachePriority in page cache, - // otherwise we use NORMAL CachePriority - bool _cache_in_memory; + + bool _use_page_cache; + bool _kept_in_memory; int64_t _num_values = 0; // whether this column contains any index page. // could be false when the column contains only one data page. @@ -86,7 +86,6 @@ private: PageHandle _ordinal_index_page_handle; PageHandle _value_index_page_handle; - bool _verify_checksum = true; const TypeInfo* _type_info = nullptr; const EncodingInfo* _encoding_info = nullptr; const BlockCompressionCodec* _compress_codec = nullptr; @@ -109,7 +108,7 @@ public: // Seek to the given ordinal entry. Entry 0 is the first entry. // Return NotFound if provided seek point is past the end. // Return NotSupported for column without ordinal index. - Status seek_to_ordinal(rowid_t idx); + Status seek_to_ordinal(ordinal_t idx); // Seek the index to the given key, or to the index entry immediately // before it. 
Then seek the data block to the value matching value or to @@ -123,14 +122,17 @@ public: Status seek_at_or_after(const void* key, bool* exact_match); // Get the ordinal index that the iterator is currently pointed to. - rowid_t get_current_ordinal() const; + ordinal_t get_current_ordinal() const { + DCHECK(_seeked); + return _current_ordinal; + } // After one seek, we can only call this function once to read data // into ColumnBlock. when read string type data, memory will allocated // from Arena Status next_batch(size_t* n, ColumnBlockView* column_view); private: - Status _read_data_page(const PagePointer& page_pointer, ParsedPage* page); + Status _read_data_page(const PagePointer& pp); const IndexedColumnReader* _reader; // iterator for ordinal index page @@ -141,10 +143,10 @@ private: bool _seeked = false; // current in-use index iterator, could be `&_ordinal_iter` or `&_value_iter` or null IndexPageIterator* _current_iter = nullptr; - // seeked data page, containing value at `_current_rowid` + // seeked data page, containing value at `_current_ordinal` std::unique_ptr _data_page; // next_batch() will read from this position - rowid_t _current_rowid = 0; + ordinal_t _current_ordinal = 0; // open file handle OpenedFileHandle _file_handle; // file to read diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp index f40e2eca76..0f5b3df85f 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp @@ -19,18 +19,18 @@ #include +#include "common/logging.h" #include "env/env.h" #include "olap/rowset/segment_v2/encoding_info.h" #include "olap/rowset/segment_v2/index_page.h" #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/page_builder.h" -#include "olap/rowset/segment_v2/page_compression.h" +#include "olap/rowset/segment_v2/page_io.h" #include "olap/rowset/segment_v2/page_pointer.h" #include 
"olap/key_coder.h" #include "olap/types.h" #include "util/block_compression.h" #include "util/coding.h" -#include "util/crc32c.h" namespace doris { namespace segment_v2 { @@ -55,6 +55,10 @@ IndexedColumnWriter::~IndexedColumnWriter() = default; Status IndexedColumnWriter::init() { const EncodingInfo* encoding_info; RETURN_IF_ERROR(EncodingInfo::get(_typeinfo, _options.encoding, &encoding_info)); + _options.encoding = encoding_info->encoding(); + // should store more concrete encoding type instead of DEFAULT_ENCODING + // because the default encoding of a data type can be changed in the future + DCHECK_NE(_options.encoding, DEFAULT_ENCODING); PageBuilder* data_page_builder; RETURN_IF_ERROR(encoding_info->create_page_builder(PageBuilderOptions(), &data_page_builder)); @@ -89,31 +93,31 @@ Status IndexedColumnWriter::add(const void* value) { } Status IndexedColumnWriter::_finish_current_data_page() { - const uint32_t page_row_count = _data_page_builder->count(); - - if (page_row_count == 0) { + auto num_values_in_page = _data_page_builder->count(); + if (num_values_in_page == 0) { return Status::OK(); } + ordinal_t first_ordinal = _num_values - num_values_in_page; - uint32_t first_rowid = _num_values - page_row_count; - faststring page_header; - put_varint32(&page_header, first_rowid); - put_varint32(&page_header, page_row_count); - - OwnedSlice page_data = _data_page_builder->finish(); + // IndexedColumn doesn't have NULLs, thus data page body only contains encoded values + OwnedSlice page_body = _data_page_builder->finish(); _data_page_builder->reset(); - return _append_data_page({Slice(page_header), page_data.slice()}, first_rowid); -} + PageFooterPB footer; + footer.set_type(DATA_PAGE); + footer.set_uncompressed_size(page_body.slice().get_size()); + footer.mutable_data_page_footer()->set_first_ordinal(first_ordinal); + footer.mutable_data_page_footer()->set_num_values(num_values_in_page); + footer.mutable_data_page_footer()->set_nullmap_size(0); -Status 
IndexedColumnWriter::_append_data_page(const std::vector& data_page, rowid_t first_rowid) { - RETURN_IF_ERROR(_append_page(data_page, &_last_data_page)); + RETURN_IF_ERROR(PageIO::compress_and_write_page( + _compress_codec, _options.compression_min_space_saving, _file, { page_body.slice() }, + footer, &_last_data_page)); _num_data_pages++; if (_options.write_ordinal_index) { std::string key; - KeyCoderTraits::full_encode_ascending( - &first_rowid, &key); + KeyCoderTraits::full_encode_ascending(&first_ordinal, &key); _ordinal_index_builder->add(key, _last_data_page); } @@ -127,31 +131,6 @@ Status IndexedColumnWriter::_append_data_page(const std::vector& data_pag return Status::OK(); } -Status IndexedColumnWriter::_append_page(const std::vector& page, PagePointer* pp) { - std::vector output_page; - - // Put compressor out of if block, because we will use compressor's - // content until this function finished. - PageCompressor compressor(_compress_codec); - if (_compress_codec != nullptr) { - RETURN_IF_ERROR(compressor.compress(page, &output_page)); - } else { - output_page = page; - } - - // checksum - uint8_t checksum_buf[sizeof(uint32_t)]; - uint32_t checksum = crc32c::Value(output_page); - encode_fixed32_le(checksum_buf, checksum); - output_page.emplace_back(checksum_buf, sizeof(uint32_t)); - - // append to file - pp->offset = _file->size(); - RETURN_IF_ERROR(_file->appendv(&output_page[0], output_page.size())); - pp->size = _file->size() - pp->offset; - return Status::OK(); -} - Status IndexedColumnWriter::finish(IndexedColumnMetaPB* meta) { RETURN_IF_ERROR(_finish_current_data_page()); if (_options.write_ordinal_index) { @@ -174,9 +153,14 @@ Status IndexedColumnWriter::_flush_index(IndexPageBuilder* index_builder, BTreeM meta->set_is_root_data_page(true); _last_data_page.to_proto(meta->mutable_root_page()); } else { - Slice root_page = index_builder->finish(); + OwnedSlice page_body; + PageFooterPB page_footer; + index_builder->finish(&page_body, &page_footer); 
+ PagePointer pp; - RETURN_IF_ERROR(_append_page({root_page}, &pp)); + RETURN_IF_ERROR(PageIO::compress_and_write_page( + _compress_codec, _options.compression_min_space_saving, _file, + { page_body.slice() }, page_footer, &pp)); meta->set_is_root_data_page(false); pp.to_proto(meta->mutable_root_page()); diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h index c9f143354a..8c81476f23 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h @@ -48,6 +48,7 @@ struct IndexedColumnWriterOptions { bool write_value_index = false; EncodingTypePB encoding = DEFAULT_ENCODING; CompressionTypePB compression = NO_COMPRESSION; + double compression_min_space_saving = 0.1; }; // IndexedColumn is a column with an optional "ordinal index" and an optional "value index". @@ -82,15 +83,6 @@ public: private: Status _finish_current_data_page(); - // Append the given data page, update ordinal index or value index if they're used. - Status _append_data_page(const std::vector& data_page, rowid_t first_rowid); - - // Append the given page into the file. After return, *pp points to the newly - // inserted page. - // Input data will be compressed when compression is enabled. - // We also compute and append checksum for the page. 
- Status _append_page(const std::vector& page, PagePointer* pp); - Status _flush_index(IndexPageBuilder* index_builder, BTreeMetaPB* meta); IndexedColumnWriterOptions _options; @@ -100,7 +92,7 @@ private: MemTracker _mem_tracker; MemPool _mem_pool; - rowid_t _num_values; + ordinal_t _num_values; uint32_t _num_data_pages; // remember the first value in current page faststring _first_value; diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp index 274438cbc0..3050bc42c9 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.cpp @@ -17,58 +17,111 @@ #include "olap/rowset/segment_v2/ordinal_page_index.h" +#include "common/logging.h" +#include "env/env.h" +#include "olap/key_coder.h" +#include "olap/rowset/segment_v2/page_handle.h" +#include "olap/rowset/segment_v2/page_io.h" +#include "util/file_manager.h" + namespace doris { namespace segment_v2 { -OrdinalPageIndex::~OrdinalPageIndex() { - delete[] _rowids; - delete[] _pages; +void OrdinalIndexWriter::append_entry(ordinal_t ordinal, const PagePointer& data_pp) { + std::string key; + KeyCoderTraits::full_encode_ascending(&ordinal, &key); // index key = ascending encoding of the data page's first ordinal + _page_builder->add(key, data_pp); // index value = pointer to the data page + _last_pp = data_pp; } -Status OrdinalPageIndex::load() { - if (UNLIKELY(_data.size < _header_size())) { - return Status::Corruption("block size must greate than header"); - } - const uint8_t* ptr = (const uint8_t*)_data.data; - const uint8_t* limit = (const uint8_t*)_data.data + _data.size; +Status OrdinalIndexWriter::finish(WritableFile* file, ColumnIndexMetaPB* meta) { + CHECK(_page_builder->count() > 0) << "no entry has been added, file=" << file->filename(); + meta->set_type(ORDINAL_INDEX); + BTreeMetaPB* root_page_meta = meta->mutable_ordinal_index()->mutable_root_page(); - _num_pages = decode_fixed32_le(ptr); - ptr += 4; + if (_page_builder->count() == 1) { + // only one data page, no need to write index page: the meta points directly at that data page (_last_pp)
+ root_page_meta->set_is_root_data_page(true); + _last_pp.to_proto(root_page_meta->mutable_root_page()); + } else { + OwnedSlice page_body; + PageFooterPB page_footer; + _page_builder->finish(&page_body, &page_footer); - // add a additional rowid for row id compute convenience - _rowids = new rowid_t[_num_pages + 1]; - _pages = new PagePointer[_num_pages]; - for (int i = 0; i < _num_pages; ++i) { - ptr = decode_varint32_ptr(ptr, limit, &_rowids[i]); - if (ptr == nullptr) { - return Status::InternalError("Data corruption"); - } - ptr = _pages[i].decode_from(ptr, limit); - if (ptr == nullptr) { - return Status::InternalError("Data corruption"); - } + // write index page (currently it's not compressed); the meta then points at the index page + PagePointer pp; + RETURN_IF_ERROR(PageIO::write_page(file, { page_body.slice() }, page_footer, &pp)); + + root_page_meta->set_is_root_data_page(false); + pp.to_proto(root_page_meta->mutable_root_page()); } - // set the additional last row id as number of rows - _rowids[_num_pages] = _num_rows; return Status::OK(); } -OrdinalPageIndexIterator OrdinalPageIndex::seek_at_or_before(rowid_t rid) { +Status OrdinalIndexReader::load(bool use_page_cache, bool kept_in_memory) { + if (_index_meta->root_page().is_root_data_page()) { + // only one data page, no index page: the root page pointer is the data page itself and it covers ordinals [0, _num_values) + _num_pages = 1; + _ordinals.push_back(0); + _ordinals.push_back(_num_values); + _pages.emplace_back(_index_meta->root_page().root_page()); + return Status::OK(); + } + // more than one data page: need to read index page from the file + OpenedFileHandle file_handle; + RETURN_IF_ERROR(FileManager::instance()->open_file(_filename, &file_handle)); + + PageReadOptions opts; + opts.file = file_handle.file(); + opts.page_pointer = PagePointer(_index_meta->root_page().root_page()); + opts.codec = nullptr; // ordinal index page uses NO_COMPRESSION right now + OlapReaderStatistics tmp_stats; // PageReadOptions requires a stats object; this local one is discarded when load() returns + opts.stats = &tmp_stats; + opts.use_page_cache = use_page_cache; + opts.kept_in_memory = kept_in_memory; + + // read index page + PageHandle page_handle; + Slice body; +
PageFooterPB footer; + RETURN_IF_ERROR(PageIO::read_and_decompress_page(opts, &page_handle, &body, &footer)); + + // parse and save all (ordinal, pp) from index page; _ordinals gets one extra trailing entry for the end sentinel + IndexPageReader reader; + RETURN_IF_ERROR(reader.parse(body, footer.index_page_footer())); + + _num_pages = reader.count(); + _ordinals.resize(_num_pages + 1); + _pages.resize(_num_pages); + for (int i = 0; i < _num_pages; i++) { + Slice key = reader.get_key(i); + ordinal_t ordinal; + RETURN_IF_ERROR(KeyCoderTraits::decode_ascending( + &key, sizeof(ordinal_t), (uint8_t*) &ordinal, nullptr)); + + _ordinals[i] = ordinal; + _pages[i] = reader.get_value(i); + } + _ordinals[_num_pages] = _num_values; // end sentinel: get_last_ordinal(i) is computed as _ordinals[i + 1] - 1 + return Status::OK(); +} + +OrdinalPageIndexIterator OrdinalIndexReader::seek_at_or_before(ordinal_t ordinal) { int32_t left = 0; int32_t right = _num_pages - 1; while (left < right) { int32_t mid = (left + right + 1) / 2; - if (_rowids[mid] < rid) { + if (_ordinals[mid] < ordinal) { left = mid; - } else if (_rowids[mid] > rid) { + } else if (_ordinals[mid] > ordinal) { right = mid - 1; } else { left = mid; break; } } - if (_rowids[left] > rid) { + if (_ordinals[left] > ordinal) { return OrdinalPageIndexIterator(this, _num_pages); } return OrdinalPageIndexIterator(this, left); diff --git a/be/src/olap/rowset/segment_v2/ordinal_page_index.h b/be/src/olap/rowset/segment_v2/ordinal_page_index.h index eb66a08892..c132684614 100644 --- a/be/src/olap/rowset/segment_v2/ordinal_page_index.h +++ b/be/src/olap/rowset/segment_v2/ordinal_page_index.h @@ -18,154 +18,112 @@ #pragma once #include +#include #include #include "common/status.h" +#include "gutil/macros.h" #include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/index_page.h" #include "olap/rowset/segment_v2/page_pointer.h" #include "util/coding.h" #include "util/slice.h" namespace doris { + +class WritableFile; + namespace segment_v2 { -// this class encode ordinal page index -// the binary format is like that -// Header | Content -//
Header: -// number of pages (4 Bytes) -// Content: -// array of index_pair -// index_pair: -// Ordinal (4 Bytes) -// PagePointer (8 Bytes) - -static const uint32_t ORDINAL_PAGE_INDEX_HEADER_SIZE = 4; - -class OrdinalPageIndexBuilder { +// Ordinal index is implemented by one IndexPage that stores the first value ordinal +// and the file pointer of each data page. +// If there is only one data page, no index page is needed, so the file pointer +// to that data page is stored directly in the index meta (OrdinalIndexPB). +class OrdinalIndexWriter { public: - OrdinalPageIndexBuilder() : _num_pages(0) { - _buffer.reserve(4 * 1024); - // reserve space for number of pages - _buffer.resize(ORDINAL_PAGE_INDEX_HEADER_SIZE); - } + OrdinalIndexWriter() : _page_builder(new IndexPageBuilder(0, true)) {} - void append_entry(rowid_t rid, const PagePointer& page) { - // rid - put_varint32(&_buffer, rid); - // page pointer - page.encode_to(&_buffer); - _num_pages++; - } + void append_entry(ordinal_t ordinal, const PagePointer& data_pp); // add the (first ordinal, page pointer) entry of one data page - uint64_t size() { - return _buffer.size(); - } + uint64_t size() { return _page_builder->size(); } // current size reported by the index page builder - Slice finish() { - // encoded number of pages - encode_fixed32_le((uint8_t*)_buffer.data(), _num_pages); - return Slice(_buffer); - } + Status finish(WritableFile* file, ColumnIndexMetaPB* meta); private: - std::string _buffer; - uint32_t _num_pages; + DISALLOW_COPY_AND_ASSIGN(OrdinalIndexWriter); + std::unique_ptr _page_builder; // accumulates the entries passed to append_entry() + PagePointer _last_pp; }; -class OrdinalPageIndex; -class OrdinalPageIndexIterator { +class OrdinalPageIndexIterator; + +class OrdinalIndexReader { public: - OrdinalPageIndexIterator() : _index(nullptr), _cur_idx(-1) { } - OrdinalPageIndexIterator(OrdinalPageIndex* index) : _index(index), _cur_idx(0) { } - OrdinalPageIndexIterator(OrdinalPageIndex* index, int cur_idx) : _index(index), _cur_idx(cur_idx) { } - inline bool valid() const; - inline void next(); - inline rowid_t rowid() const; - inline int32_t cur_idx()
const; - inline const PagePointer& page() const; - inline rowid_t cur_page_first_row_id() const; - inline rowid_t cur_page_last_row_id() const; -private: - OrdinalPageIndex* _index; - int32_t _cur_idx; -}; - -// Page index -class OrdinalPageIndex { -public: - OrdinalPageIndex(const Slice& data, uint64_t num_rows) - : _data(data), _num_rows(num_rows), _num_pages(0), _rowids(nullptr), _pages(nullptr) { - } - ~OrdinalPageIndex(); - - Status load(); - - OrdinalPageIndexIterator seek_at_or_before(rowid_t rid); - OrdinalPageIndexIterator begin() { - return OrdinalPageIndexIterator(this); - } - OrdinalPageIndexIterator end() { - return OrdinalPageIndexIterator(this, _num_pages); - } - rowid_t get_first_row_id(int page_index) const { - return _rowids[page_index]; + explicit OrdinalIndexReader(const std::string& filename, + const OrdinalIndexPB* index_meta, + ordinal_t num_values) : + _filename(filename), _index_meta(index_meta), _num_values(num_values) { } - rowid_t get_last_row_id(int page_index) const { - // because add additional number of rows as the last rowid - // so just return next_page_first_id - 1 - int next_page_index = page_index + 1; - return get_first_row_id(next_page_index) - 1; + // load the ordinal index into memory; the index page is read and parsed unless the meta says the root page is the only data page + Status load(bool use_page_cache, bool kept_in_memory); + + OrdinalPageIndexIterator seek_at_or_before(ordinal_t ordinal); // iterator at the last data page whose first ordinal is <= `ordinal`, or end() if there is none + inline OrdinalPageIndexIterator begin(); + inline OrdinalPageIndexIterator end(); + ordinal_t get_first_ordinal(int page_index) const { + return _ordinals[page_index]; } - int32_t num_pages() const { - return _num_pages; + ordinal_t get_last_ordinal(int page_index) const { // works for the last page too: _ordinals has one extra entry holding _num_values + return get_first_ordinal(page_index + 1) - 1; } -private: - uint32_t _header_size() const { return ORDINAL_PAGE_INDEX_HEADER_SIZE; } + // for test + int32_t num_data_pages() const { return _num_pages; } private: friend OrdinalPageIndexIterator; - Slice _data; - uint64_t _num_rows; + std::string _filename; + const OrdinalIndexPB* _index_meta; +
// total number of values (including NULLs) in the indexed column, + // equal to 1 + 'last ordinal of the last data page' + ordinal_t _num_values; - // valid after laod - int32_t _num_pages; - // the last row id is additional, set to number of rows - rowid_t* _rowids; - PagePointer* _pages; + // valid after load + int _num_pages = 0; + // _ordinals[i] = first ordinal of the i-th data page, _ordinals[_num_pages] = _num_values + std::vector _ordinals; + // _pages[i] = page pointer to the i-th data page + std::vector _pages; }; -inline bool OrdinalPageIndexIterator::valid() const { - return _cur_idx < _index->_num_pages; +class OrdinalPageIndexIterator { +public: + OrdinalPageIndexIterator() : _index(nullptr), _cur_idx(-1) { } // not attached to a reader: valid() and the accessors dereference _index, so they must not be called on it + OrdinalPageIndexIterator(OrdinalIndexReader* index) : _index(index), _cur_idx(0) { } + OrdinalPageIndexIterator(OrdinalIndexReader* index, int cur_idx) : _index(index), _cur_idx(cur_idx) { } + bool valid() const { return _cur_idx < _index->_num_pages; } // false once the iterator is at end(), i.e. _cur_idx == _num_pages + void next() { + DCHECK_LT(_cur_idx, _index->_num_pages); + _cur_idx++; + } + int32_t page_index() const { return _cur_idx; }; + const PagePointer& page() const { return _index->_pages[_cur_idx]; }; + ordinal_t first_ordinal() const { return _index->get_first_ordinal(_cur_idx); } + ordinal_t last_ordinal() const { return _index->get_last_ordinal(_cur_idx); } +private: + OrdinalIndexReader* _index; + int32_t _cur_idx; +}; + +OrdinalPageIndexIterator OrdinalIndexReader::begin() { + return OrdinalPageIndexIterator(this); } -inline void OrdinalPageIndexIterator::next() { - DCHECK_LT(_cur_idx, _index->_num_pages); - _cur_idx++; -} - -inline rowid_t OrdinalPageIndexIterator::rowid() const { - return _index->_rowids[_cur_idx]; -} - -int32_t OrdinalPageIndexIterator::cur_idx() const { - return _cur_idx; -} - -inline const PagePointer& OrdinalPageIndexIterator::page() const { - return _index->_pages[_cur_idx]; -} - -rowid_t OrdinalPageIndexIterator::cur_page_first_row_id() const { - return _index->get_first_row_id(_cur_idx); -} - -rowid_t
OrdinalPageIndexIterator::cur_page_last_row_id() const { - return _index->get_last_row_id(_cur_idx); +OrdinalPageIndexIterator OrdinalIndexReader::end() { + return OrdinalPageIndexIterator(this, _num_pages); } } diff --git a/be/src/olap/rowset/segment_v2/page_compression.cpp b/be/src/olap/rowset/segment_v2/page_compression.cpp deleted file mode 100644 index c85b2cd809..0000000000 --- a/be/src/olap/rowset/segment_v2/page_compression.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
- -#include "olap/rowset/segment_v2/page_compression.h" - -#include "gutil/strings/substitute.h" -#include "util/block_compression.h" -#include "util/coding.h" - -namespace doris { -namespace segment_v2 { - -using strings::Substitute; - -Status PageDecompressor::decompress_to(Slice* uncompressed_data) { - if (_data.size < 4) { - return Status::Corruption( - Substitute("Compressed page's size is too small, size=$0, needed=$1", - _data.size, 4)); - } - // decode uncompressed_bytes from footer - uint32_t uncompressed_bytes = decode_fixed32_le((uint8_t*)_data.data + _data.size - 4); - - Slice compressed_slice(_data.data, _data.size - 4); - if (compressed_slice.size == uncompressed_bytes) { - // If compressed_slice's size is equal with _uncompressed_bytes, it means - // compressor store this directly without compression. So we just copy - // this to buf and return. - *uncompressed_data = compressed_slice; - return Status::OK(); - } - std::unique_ptr buf(new char[uncompressed_bytes]); - - Slice uncompressed_slice(buf.get(), uncompressed_bytes); - RETURN_IF_ERROR(_codec->decompress(compressed_slice, &uncompressed_slice)); - if (uncompressed_slice.size != uncompressed_bytes) { - // If size after decompress didn't match recorded size, we think this - // page is corrupt. 
- return Status::Corruption( - Substitute("Uncompressed size not match, record=$0 vs decompress=$1", - uncompressed_bytes, uncompressed_slice.size)); - } - *uncompressed_data = Slice(buf.release(), uncompressed_bytes); - return Status::OK(); -} - -Status PageCompressor::compress(const std::vector& raw_data, - std::vector* compressed_slices) { - size_t uncompressed_bytes = Slice::compute_total_size(raw_data); - size_t max_compressed_bytes = _codec->max_compressed_len(uncompressed_bytes); - _buf.resize(max_compressed_bytes + 4); - Slice compressed_slice(_buf.data(), max_compressed_bytes); - RETURN_IF_ERROR(_codec->compress(raw_data, &compressed_slice)); - - double space_saving = 1.0 - (double)compressed_slice.size / uncompressed_bytes; - if (compressed_slice.size >= uncompressed_bytes || // use integer to make definite - space_saving < _min_space_saving) { - // If space saving is not higher enough we just copy uncompressed - // data to avoid decompression CPU cost - for (auto& slice : raw_data) { - compressed_slices->push_back(slice); - } - - // encode uncompressed_bytes into footer of compressed value - encode_fixed32_le((uint8_t*)_buf.data(), uncompressed_bytes); - compressed_slices->emplace_back(_buf.data(), 4); - return Status::OK(); - } - // encode uncompressed_bytes into footer of compressed value - encode_fixed32_le((uint8_t*)_buf.data() + compressed_slice.size, uncompressed_bytes); - // return compressed data to client - compressed_slices->emplace_back(_buf.data(), 4 + compressed_slice.size); - - return Status::OK(); -} - -Status PageCompressor::compress(const std::vector& raw_data, - OwnedSlice* compressed_data, bool* compressed) { - size_t uncompressed_bytes = Slice::compute_total_size(raw_data); - size_t max_compressed_bytes = _codec->max_compressed_len(uncompressed_bytes); - _buf.resize(max_compressed_bytes + 4); - Slice compression_buffer(_buf.data(), max_compressed_bytes); - RETURN_IF_ERROR(_codec->compress(raw_data, &compression_buffer)); - - double 
space_saving = 1.0 - (double)compression_buffer.size / uncompressed_bytes; - if (compression_buffer.size >= uncompressed_bytes || // use integer to make definite - space_saving < _min_space_saving) { - // If space saving is not higher enough we just copy uncompressed - // data to avoid decompression CPU cost - _buf.resize(0); - *compressed_data = _buf.build(); - *compressed = false; - return Status::OK(); - } - // encode uncompressed_bytes into footer of compressed value - encode_fixed32_le((uint8_t*)_buf.data() + compression_buffer.size, uncompressed_bytes); - // return compressed data to client - _buf.resize(compression_buffer.size + 4); - *compressed_data = _buf.build(); - *compressed = true; - - return Status::OK(); -} - -} -} diff --git a/be/src/olap/rowset/segment_v2/page_compression.h b/be/src/olap/rowset/segment_v2/page_compression.h deleted file mode 100644 index 4e78fa3845..0000000000 --- a/be/src/olap/rowset/segment_v2/page_compression.h +++ /dev/null @@ -1,103 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "common/status.h" -#include "util/slice.h" -#include "util/faststring.h" - -namespace doris { - -class BlockCompressionCodec; - -namespace segment_v2 { - -// Utility class for parsing and decompressing compressed page. -// Format of compressed page := Data, UncompressedSize(fixed32) -// When sizeof(Data) == UncompressedSize, it means Data is stored in uncompressed -// form, thus decompression is not needed. -// Otherwise Data is in compressed form and should be decompressed. -// The type of compression codec for Data is stored elsewhere and should -// be passed into the constructor. -// Usage example: -// // page_slice refers to page read from storage -// PageDecompressor decompressor(page_slice, codec); -// // points to decompressed Data of the page (without footer) -// Slice uncompressed_slice; -// RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_slice)); -// // use uncompressed_slice -// // we have a new buffer for decompressed page -// if (uncompressed_slice.data != page_slice.data) { -// delete[] page_slice.data; -// } -class PageDecompressor { -public: - PageDecompressor(const Slice& compressed_data, const BlockCompressionCodec* codec) - : _data(compressed_data), _codec(codec) { - } - - // This client will set uncompress content to uncompressed_data. - // In normal case(compressed_data.data != uncompressed_data.data) client should - // call delete[] compressed_data.data to free heap memory. However - // when the data is not compressed, this function will return compressed_data - // directly. In this case compressed_data.data == uncompressed_data.data, - // client should not free content. - Status decompress_to(Slice* uncompressed_data); -private: - Slice _data; - const BlockCompressionCodec* _codec; -}; - -// Helper to build a compress page. 
-// Usage: -// std::vector raw_data; -// PageCompressor compressor(codec, 0.1); -// std::vector compressed_data; -// compressor.compress(raw_data, &compressed_data) -class PageCompressor { -public: - PageCompressor(const BlockCompressionCodec* codec, double min_space_saving = 0.1) - : _codec(codec), _min_space_saving(min_space_saving) { - } - - // Try to compress input raw data into compressed page - // according given BlockCompressionCodec. If compressed page is not - // smaller enough than raw data, this class will return uncompressed data. - Status compress(const std::vector& raw_data, - std::vector* compressed_data); - - // Try to compress input raw data into compressed page by returning OwnedSlice - // according given BlockCompressionCodec. If compressed page is not - // smaller enough than raw data, this class will return uncompressed data. - Status compress(const std::vector& raw_data, - OwnedSlice* compressed_data, bool* compressed); -private: - const BlockCompressionCodec* _codec; - - // If space saving is lower than _min_space_saving, compress will return origin data - double _min_space_saving; - - // used to store compressed data - faststring _buf; -}; - -} -} diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index a6aa79862f..ee7e84bc05 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -18,7 +18,6 @@ #pragma once #include "olap/column_block.h" // for ColumnBlockView -#include "olap/rowset/segment_v2/common.h" // for rowid_t #include "common/status.h" // for Status namespace doris { diff --git a/be/src/olap/rowset/segment_v2/page_handle.h b/be/src/olap/rowset/segment_v2/page_handle.h index 04f49bd614..0279a9b2b4 100644 --- a/be/src/olap/rowset/segment_v2/page_handle.h +++ b/be/src/olap/rowset/segment_v2/page_handle.h @@ -65,7 +65,7 @@ public: } } - // This function only valid when assign valid data, either in cache or not + // the return 
slice contains uncompressed page body, page footer, and footer size Slice data() const { if (_is_data_owner) { return _data; diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp new file mode 100644 index 0000000000..6363520349 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include "olap/rowset/segment_v2/page_io.h" + +#include +#include + +#include "common/logging.h" +#include "env/env.h" +#include "gutil/strings/substitute.h" +#include "olap/page_cache.h" +#include "util/block_compression.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/faststring.h" +#include "util/runtime_profile.h" + +namespace doris { +namespace segment_v2 { + +using strings::Substitute; + +Status PageIO::compress_page_body(const BlockCompressionCodec* codec, + double min_space_saving, + const std::vector& body, + OwnedSlice* compressed_body) { + size_t uncompressed_size = Slice::compute_total_size(body); + if (codec != nullptr && uncompressed_size > 0) { + size_t max_compressed_size = codec->max_compressed_len(uncompressed_size); + faststring buf; + buf.resize(max_compressed_size); + Slice compressed_slice(buf); + RETURN_IF_ERROR(codec->compress(body, &compressed_slice)); + buf.resize(compressed_slice.get_size()); + + double space_saving = 1.0 - static_cast(buf.size()) / uncompressed_size; + // return compressed body only when it actually saves space and the saving reaches min_space_saving + if (space_saving > 0 && space_saving >= min_space_saving) { + *compressed_body = buf.build(); + return Status::OK(); + } + } + // otherwise, do not compress: an empty compressed_body tells the caller to write `body` as is + OwnedSlice empty; + *compressed_body = std::move(empty); + return Status::OK(); +} + +Status PageIO::write_page(WritableFile* file, + const std::vector& body, + const PageFooterPB& footer, + PagePointer* result) { + // sanity check of page footer + CHECK(footer.has_type()) << "type must be set"; + CHECK(footer.has_uncompressed_size()) << "uncompressed_size must be set"; + switch (footer.type()) { + case DATA_PAGE: + CHECK(footer.has_data_page_footer()); + break; + case INDEX_PAGE: + CHECK(footer.has_index_page_footer()); + break; + case DICTIONARY_PAGE: + CHECK(footer.has_dict_page_footer()); + break; + case SHORT_KEY_PAGE: + CHECK(footer.has_short_key_page_footer()); + break; + default: + CHECK(false) << "Invalid page
footer type: " << footer.type(); + break; + } + + std::string footer_buf; // serialized footer + footer size + footer.SerializeToString(&footer_buf); + put_fixed32_le(&footer_buf, static_cast(footer_buf.size())); + + std::vector page = body; + page.emplace_back(footer_buf); + + // checksum + uint8_t checksum_buf[sizeof(uint32_t)]; + uint32_t checksum = crc32c::Value(page); + encode_fixed32_le(checksum_buf, checksum); + page.emplace_back(checksum_buf, sizeof(uint32_t)); + + uint64_t offset = file->size(); + RETURN_IF_ERROR(file->appendv(&page[0], page.size())); + + result->offset = offset; + result->size = file->size() - offset; + return Status::OK(); +} + +// The page stored in the cache or handed to `handle' is the uncompressed body followed by +// the footer and the footer size; the checksum is stripped before that. +Status PageIO::read_and_decompress_page(const PageReadOptions& opts, + PageHandle* handle, + Slice* body, + PageFooterPB* footer) { + opts.sanity_check(); + opts.stats->total_pages_num++; + + auto cache = StoragePageCache::instance(); + PageCacheHandle cache_handle; + StoragePageCache::CacheKey cache_key(opts.file->file_name(), opts.page_pointer.offset); + if (opts.use_page_cache && cache->lookup(cache_key, &cache_handle)) { + // we find page in cache, use it (it was validated below before being inserted) + *handle = PageHandle(std::move(cache_handle)); + opts.stats->cached_pages_num++; + // parse body and footer + Slice page_slice = handle->data(); + uint32_t footer_size = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4); + std::string footer_buf(page_slice.data + page_slice.size - 4 - footer_size, footer_size); + if (!footer->ParseFromString(footer_buf)) { + return Status::Corruption("Bad page: invalid footer"); + } + *body = Slice(page_slice.data, page_slice.size - 4 - footer_size); + return Status::OK(); + } + + // every page contains 4 bytes footer length and 4 bytes checksum + const uint32_t page_size = opts.page_pointer.size; + if (page_size < 8) { + return Status::Corruption(Substitute("Bad page: too small size ($0)", page_size)); + } + + // hold compressed page at first, reset to decompressed page later + std::unique_ptr page(new
char[page_size]); + Slice page_slice(page.get(), page_size); + { + SCOPED_RAW_TIMER(&opts.stats->io_ns); + RETURN_IF_ERROR(opts.file->read_at(opts.page_pointer.offset, page_slice)); + opts.stats->compressed_bytes_read += page_size; + } + + if (opts.verify_checksum) { + uint32_t expect = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4); + uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4); + if (expect != actual) { + return Status::Corruption(Substitute( + "Bad page: checksum mismatch (actual=$0 vs expect=$1)", actual, expect)); + } + } + + // remove checksum suffix + page_slice.size -= 4; + // parse and set footer + uint32_t footer_size = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4); + // footer_size is read from the file: validate it before using it to locate the footer + if (footer_size > page_slice.size - 4) { + return Status::Corruption(Substitute( + "Bad page: invalid footer size ($0)", footer_size)); + } + if (!footer->ParseFromArray(page_slice.data + page_slice.size - 4 - footer_size, footer_size)) { + return Status::Corruption("Bad page: invalid footer"); + } + + uint32_t body_size = page_slice.size - 4 - footer_size; + if (body_size != footer->uncompressed_size()) { // need decompress body + if (opts.codec == nullptr) { + return Status::Corruption("Bad page: page is compressed but codec is NO_COMPRESSION"); + } + SCOPED_RAW_TIMER(&opts.stats->decompress_ns); + std::unique_ptr decompressed_page( + new char[footer->uncompressed_size() + footer_size + 4]); + + // decompress page body + Slice compressed_body(page_slice.data, body_size); + Slice decompressed_body(decompressed_page.get(), footer->uncompressed_size()); + RETURN_IF_ERROR(opts.codec->decompress(compressed_body, &decompressed_body)); + if (decompressed_body.size != footer->uncompressed_size()) { + return Status::Corruption(Substitute( + "Bad page: record uncompressed size=$0 vs real decompressed size=$1", + footer->uncompressed_size(), decompressed_body.size)); + } + // append footer and footer size + memcpy(decompressed_body.data + decompressed_body.size, + page_slice.data + body_size, + footer_size + 4); + // free memory of compressed page + page =
std::move(decompressed_page); + page_slice = Slice(page.get(), footer->uncompressed_size() + footer_size + 4); + opts.stats->uncompressed_bytes_read += page_slice.size; + } + + *body = Slice(page_slice.data, page_slice.size - 4 - footer_size); + if (opts.use_page_cache) { + // insert this page into cache and return the cache handle + cache->insert(cache_key, page_slice, &cache_handle, opts.kept_in_memory); + *handle = PageHandle(std::move(cache_handle)); + } else { + *handle = PageHandle(page_slice); + } + page.release(); // memory now managed by handle + return Status::OK(); +} + +} // namespace segment_v2 +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/page_io.h b/be/src/olap/rowset/segment_v2/page_io.h new file mode 100644 index 0000000000..c18bf5c441 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/page_io.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#pragma once + +#include + +#include "common/logging.h" +#include "common/status.h" +#include "gen_cpp/segment_v2.pb.h" +#include "olap/rowset/segment_v2/page_handle.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "util/slice.h" + +namespace doris { + +class BlockCompressionCodec; +struct OlapReaderStatistics; +class RandomAccessFile; +class WritableFile; + +namespace segment_v2 { + +struct PageReadOptions { + // file to read the page from + RandomAccessFile* file = nullptr; + // location of the page + PagePointer page_pointer; + // decompressor for page body (null means page body is not compressed) + const BlockCompressionCodec* codec = nullptr; + // used to collect IO metrics + OlapReaderStatistics* stats = nullptr; + // whether to verify page checksum + bool verify_checksum = true; + // whether to use page cache in read path + bool use_page_cache = true; + // if true, use DURABLE CachePriority in page cache + // currently used for in memory olap table + bool kept_in_memory = false; + + void sanity_check() const { + CHECK_NOTNULL(file); + CHECK_NOTNULL(stats); + } +}; + +// Utility class for reading and writing pages. All types of pages share the same general layout: +// Page := PageBody, PageFooter, FooterSize(4), Checksum(4) +// - PageBody is defined by page type and may be compressed +// - PageFooter is serialized PageFooterPB. It contains page_type, uncompressed_body_size, +// and other custom metadata. PageBody is not compressed when its size is equal to +// uncompressed_body_size +// - FooterSize stores the size of PageFooter +// - Checksum is the crc32c checksum of all previous parts +class PageIO { +public: + + // Compress `body' using `codec' into `compressed_body'. + // The size of returned `compressed_body' is 0 when the body is not compressed, this + // could happen when `codec' is null, `body' is empty or space saving is less than `min_space_saving'.
+ static Status compress_page_body(const BlockCompressionCodec* codec, + double min_space_saving, + const std::vector& body, + OwnedSlice* compressed_body); + + // Encode page from `body' and `footer' and write to `file'. + // `body' could be either uncompressed or compressed. + // On success, the file pointer to the written page is stored in `result'. + static Status write_page(WritableFile* file, + const std::vector& body, + const PageFooterPB& footer, + PagePointer* result); + + // Convenience function to compress page body and write page in one go. + static Status compress_and_write_page(const BlockCompressionCodec* codec, + double min_space_saving, + WritableFile* file, + const std::vector& body, + const PageFooterPB& footer, + PagePointer* result) { + DCHECK_EQ(footer.uncompressed_size(), Slice::compute_total_size(body)); + OwnedSlice compressed_body; + RETURN_IF_ERROR(compress_page_body(codec, min_space_saving, body, &compressed_body)); + if (compressed_body.slice().empty()) { // uncompressed + return write_page(file, body, footer, result); + } + return write_page(file, { compressed_body.slice() }, footer, result); + } + + // Read and parse a page according to `opts'. + // On success + // `handle' holds the memory of page data, + // `body' points to page body, + // `footer' stores the page footer.
+ static Status read_and_decompress_page(const PageReadOptions& opts, + PageHandle* handle, + Slice* body, + PageFooterPB* footer); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h index 2800c7db86..7291ede65b 100644 --- a/be/src/olap/rowset/segment_v2/parsed_page.h +++ b/be/src/olap/rowset/segment_v2/parsed_page.h @@ -17,47 +17,95 @@ #pragma once -#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer -#include "util/rle_encoding.h" // for RleDecoder +#include + +#include "common/status.h" +#include "gen_cpp/segment_v2.pb.h" +#include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/page_decoder.h" +#include "olap/rowset/segment_v2/page_handle.h" +#include "util/rle_encoding.h" namespace doris { namespace segment_v2 { -class PageHandle; -struct PagePointer; - // This contains information when one page is loaded, and ready for read // This struct can be reused, client should call reset first before reusing // this object struct ParsedPage { - ParsedPage() { } + + // Builds a ParsedPage from a loaded data page. The last footer.nullmap_size() bytes of + // `body' are the null bitmap; the bytes before it are given to the page decoder created + // by `encoding'. On success the new page is stored in `result'. + static Status create(PageHandle handle, + const Slice& body, + const DataPageFooterPB& footer, + const EncodingInfo* encoding, + const PagePointer& page_pointer, + uint32_t page_index, + std::unique_ptr* result) { + std::unique_ptr page(new ParsedPage); + page->page_handle = std::move(handle); + + auto null_size = footer.nullmap_size(); + // nullmap_size comes from the page footer: make sure the null bitmap fits in the body + if (null_size > body.size) { + return Status::Corruption("Bad page: nullmap size is larger than page body size"); + } + page->has_null = null_size > 0; + page->null_bitmap = Slice(body.data + body.size - null_size, null_size); + + if (page->has_null) { + page->null_decoder = RleDecoder( + (const uint8_t*) page->null_bitmap.data, null_size, 1); + } + + Slice data_slice(body.data, body.size - null_size); + PageDecoderOptions opts; + RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, &page->data_decoder)); +
RETURN_IF_ERROR(page->data_decoder->init()); + + page->first_ordinal = footer.first_ordinal(); + page->num_rows = footer.num_values(); + page->page_pointer = page_pointer; + page->page_index = page_index; + + *result = std::move(page); + return Status::OK(); + } + ~ParsedPage() { delete data_decoder; } - PagePointer page_pointer; PageHandle page_handle; + bool has_null = false; Slice null_bitmap; RleDecoder null_decoder; PageDecoder* data_decoder = nullptr; - // first rowid for this page - rowid_t first_rowid = 0; - + // ordinal of the first value in this page + ordinal_t first_ordinal = 0; // number of rows including nulls and not-nulls - uint32_t num_rows = 0; + ordinal_t num_rows = 0; + + PagePointer page_pointer; + uint32_t page_index = 0; // current offset when read this page // this means next row we will read - uint32_t offset_in_page = 0; + ordinal_t offset_in_page = 0; - uint32_t page_index = 0; - - bool contains(rowid_t rid) { return rid >= first_rowid && rid < (first_rowid + num_rows); } - rowid_t last_rowid() { return first_rowid + num_rows - 1; } + bool contains(ordinal_t ord) { return ord >= first_ordinal && ord < (first_ordinal + num_rows); } bool has_remaining() const { return offset_in_page < num_rows; } size_t remaining() const { return num_rows - offset_in_page; } + +private: + // client should use create() factory method + ParsedPage() = default; }; } diff --git a/be/src/olap/rowset/segment_v2/rle_page.h b/be/src/olap/rowset/segment_v2/rle_page.h index 027ffff458..a6fa79fb64 100644 --- a/be/src/olap/rowset/segment_v2/rle_page.h +++ b/be/src/olap/rowset/segment_v2/rle_page.h @@ -17,7 +17,6 @@ #pragma once -#include "olap/rowset/segment_v2/common.h" // for rowid_t #include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions #include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder #include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder diff --git a/be/src/olap/rowset/segment_v2/segment.cpp
b/be/src/olap/rowset/segment_v2/segment.cpp index eac07b486a..e4a0c19a87 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -21,6 +21,7 @@ #include "env/env.h" // RandomAccessFile #include "gutil/strings/substitute.h" #include "olap/rowset/segment_v2/column_reader.h" // ColumnReader +#include "olap/rowset/segment_v2/page_io.h" #include "olap/rowset/segment_v2/segment_writer.h" // k_segment_magic_length #include "olap/rowset/segment_v2/segment_iterator.h" #include "olap/rowset/segment_v2/empty_segment_iterator.h" @@ -68,41 +69,10 @@ Status Segment::new_iterator(const Schema& schema, if (read_options.conditions != nullptr) { for (auto& column_condition : read_options.conditions->columns()) { int32_t column_id = column_condition.first; - auto entry = _column_id_to_footer_ordinal.find(column_id); - if (entry == _column_id_to_footer_ordinal.end()) { + if (_column_readers[column_id] == nullptr || !_column_readers[column_id]->has_zone_map()) { continue; } - auto& c_meta = _footer.columns(entry->second); - if (!c_meta.has_zone_map()) { - continue; - } - auto& c_zone_map = c_meta.zone_map(); - if (!c_zone_map.has_not_null() && !c_zone_map.has_null()) { - // no data - iter->reset(new EmptySegmentIterator(schema)); - return Status::OK(); - } - // TODO Logic here and the similar logic in ColumnReader::_get_filtered_pages should be unified. 
- TypeInfo* type_info = get_type_info((FieldType)c_meta.type()); - if (type_info == nullptr) { - return Status::NotSupported(Substitute("unsupported typeinfo, type=$0", c_meta.type())); - } - FieldType type = type_info->type(); - const Field* field = schema.column(column_id); - int32_t var_length = field->length(); - std::unique_ptr min_value(WrapperField::create_by_type(type, var_length)); - std::unique_ptr max_value(WrapperField::create_by_type(type, var_length)); - if (c_zone_map.has_not_null()) { - min_value->from_string(c_zone_map.min()); - max_value->from_string(c_zone_map.max()); - } - if (c_zone_map.has_null()) { - min_value->set_null(); - if (!c_zone_map.has_not_null()) { - max_value->set_null(); - } - } - if (!column_condition.second->eval({min_value.get(), max_value.get()})) { + if (!_column_readers[column_id]->match_condition(column_condition.second)) { // any condition not satisfied, return. iter->reset(new EmptySegmentIterator(schema)); return Status::OK(); @@ -164,18 +134,25 @@ Status Segment::_parse_footer() { Status Segment::_load_index() { return _load_index_once.call([this] { - // read short key index content + // read and parse short key index page OpenedFileHandle file_handle; RETURN_IF_ERROR(FileManager::instance()->open_file(_fname, &file_handle)); - RandomAccessFile* input_file = file_handle.file(); - _sk_index_buf.resize(_footer.short_key_index_page().size()); - Slice slice(_sk_index_buf.data(), _sk_index_buf.size()); - RETURN_IF_ERROR(input_file->read_at(_footer.short_key_index_page().offset(), slice)); - // Parse short key index - _sk_index_decoder.reset(new ShortKeyIndexDecoder(_sk_index_buf)); - RETURN_IF_ERROR(_sk_index_decoder->parse()); - return Status::OK(); + PageReadOptions opts; + opts.file = file_handle.file(); + opts.page_pointer = PagePointer(_footer.short_key_index_page()); + opts.codec = nullptr; // short key index page uses NO_COMPRESSION for now + OlapReaderStatistics tmp_stats; + opts.stats = &tmp_stats; + + Slice body; + 
PageFooterPB footer; + RETURN_IF_ERROR(PageIO::read_and_decompress_page(opts, &_sk_index_handle, &body, &footer)); + DCHECK_EQ(footer.type(), SHORT_KEY_PAGE); + DCHECK(footer.has_short_key_page_footer()); + + _sk_index_decoder.reset(new ShortKeyIndexDecoder); + return _sk_index_decoder->parse(body, footer.short_key_page_footer()); }); } @@ -194,7 +171,7 @@ Status Segment::_create_column_readers() { } ColumnReaderOptions opts; - opts.cache_in_memory = _tablet_schema->is_in_memory(); + opts.kept_in_memory = _tablet_schema->is_in_memory(); std::unique_ptr reader; // pass Descriptor* to column reader RETURN_IF_ERROR(ColumnReader::create( diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 831d8c88e6..afe983fff0 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -26,7 +26,7 @@ #include "gen_cpp/segment_v2.pb.h" #include "gutil/macros.h" #include "olap/iterators.h" -#include "olap/rowset/segment_v2/common.h" // rowid_t +#include "olap/rowset/segment_v2/page_handle.h" #include "olap/short_key_index.h" #include "olap/tablet_schema.h" #include "util/faststring.h" @@ -141,8 +141,8 @@ private: // used to guarantee that short key index will be loaded at most once in a thread-safe way DorisCallOnce _load_index_once; - // used to store short key index - faststring _sk_index_buf; + // used to hold short key index page in memory + PageHandle _sk_index_handle; // short key index decoder std::unique_ptr _sk_index_decoder; }; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index b3a9310237..f8bb7760d1 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -27,8 +27,6 @@ #include "olap/rowset/segment_v2/segment.h" #include "olap/schema.h" #include "olap/rowset/segment_v2/row_ranges.h" -#include "olap/rowset/segment_v2/column_zone_map.h" -#include 
"olap/rowset/segment_v2/ordinal_page_index.h" #include "olap/olap_cond.h" #include "util/file_cache.h" diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 5aeddb34d3..3fc26e432c 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -19,9 +19,9 @@ #include "env/env.h" // Env #include "olap/row.h" // ContiguousRow -#include "olap/row_block.h" // RowBlock #include "olap/row_cursor.h" // RowCursor #include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter +#include "olap/rowset/segment_v2/page_io.h" #include "olap/short_key_index.h" #include "util/crc32c.h" @@ -48,16 +48,20 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { uint32_t column_id = 0; for (auto& column : _tablet_schema->columns()) { - ColumnMetaPB* column_meta = _footer.add_columns(); - // TODO(zc): Do we need this column_id?? - column_meta->set_column_id(column_id++); - column_meta->set_unique_id(column.unique_id()); - bool is_nullable = column.is_nullable(); - column_meta->set_is_nullable(is_nullable); - column_meta->set_length(column.length()); + std::unique_ptr field(FieldFactory::create(column)); + DCHECK(field.get() != nullptr); ColumnWriterOptions opts; - opts.compression_type = segment_v2::CompressionTypePB::LZ4F; + opts.meta = _footer.add_columns(); + // TODO(zc): Do we need this column_id?? 
+ opts.meta->set_column_id(column_id++); + opts.meta->set_unique_id(column.unique_id()); + opts.meta->set_type(field->type()); + opts.meta->set_length(column.length()); + opts.meta->set_encoding(DEFAULT_ENCODING); + opts.meta->set_compression(LZ4F); + opts.meta->set_is_nullable(column.is_nullable()); + // now we create zone map for key columns if (column.is_key()) { opts.need_zone_map = true; @@ -85,9 +89,8 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { } } - std::unique_ptr field(FieldFactory::create(column)); - DCHECK(field.get() != nullptr); - std::unique_ptr writer(new ColumnWriter(opts, std::move(field), is_nullable, _output_file.get())); + std::unique_ptr writer( + new ColumnWriter(opts, std::move(field), _output_file.get())); RETURN_IF_ERROR(writer->init()); _column_writers.push_back(std::move(writer)); } @@ -179,25 +182,18 @@ Status SegmentWriter::_write_bloom_filter_index() { } Status SegmentWriter::_write_short_key_index() { - std::vector slices; - // TODO(zc): we should get segment_size - RETURN_IF_ERROR(_index_builder->finalize(_row_count * 100, _row_count, &slices)); - - uint64_t offset = _output_file->size(); - RETURN_IF_ERROR(_write_raw_data(slices)); - uint32_t written_bytes = _output_file->size() - offset; - - _footer.mutable_short_key_index_page()->set_offset(offset); - _footer.mutable_short_key_index_page()->set_size(written_bytes); + std::vector body; + PageFooterPB footer; + RETURN_IF_ERROR(_index_builder->finalize(_row_count, &body, &footer)); + PagePointer pp; + // short key index page is not compressed right now + RETURN_IF_ERROR(PageIO::write_page(_output_file.get(), body, footer, &pp)); + pp.to_proto(_footer.mutable_short_key_index_page()); return Status::OK(); } Status SegmentWriter::_write_footer() { _footer.set_num_rows(_row_count); - // collect all - for (int i = 0; i < _column_writers.size(); ++i) { - _column_writers[i]->write_meta(_footer.mutable_columns(i)); - } // Footer := SegmentFooterPB, FooterPBSize(4), 
FooterPBChecksum(4), MagicNumber(4) std::string footer_buf; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 33988b34fe..70d332b5aa 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -67,12 +67,6 @@ public: Status finalize(uint64_t* segment_file_size, uint64_t* index_size); - // for ut - // this function should be called after finalize - bool has_bf_index(uint32_t col_id) const { - return _footer.columns(col_id).has_bloom_filter_index(); - } - private: DISALLOW_COPY_AND_ASSIGN(SegmentWriter); Status _write_data(); diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp new file mode 100644 index 0000000000..8dc3a895f4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/zone_map_index.h" + +#include "olap/column_block.h" +#include "olap/olap_define.h" +#include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/indexed_column_reader.h" +#include "olap/rowset/segment_v2/indexed_column_writer.h" +#include "olap/types.h" +#include "runtime/mem_pool.h" +#include "runtime/mem_tracker.h" + +namespace doris { + +namespace segment_v2 { + +ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field) : _field(field), _pool(&_tracker) { + _page_zone_map.min_value = _field->allocate_value(&_pool); + _page_zone_map.max_value = _field->allocate_value(&_pool); + _reset_zone_map(&_page_zone_map); + _segment_zone_map.min_value = _field->allocate_value(&_pool); + _segment_zone_map.max_value = _field->allocate_value(&_pool); + _reset_zone_map(&_segment_zone_map); +} + +void ZoneMapIndexWriter::add_values(const void* values, size_t count) { + if (count > 0) { + _page_zone_map.has_not_null = true; + } + const char* vals = reinterpret_cast(values); + for (int i = 0; i < count; ++i) { + if (_field->compare(_page_zone_map.min_value, vals) > 0) { + _field->type_info()->direct_copy(_page_zone_map.min_value, vals); + } + if (_field->compare(_page_zone_map.max_value, vals) < 0) { + _field->type_info()->direct_copy(_page_zone_map.max_value, vals); + } + vals += _field->size(); + } +} + +Status ZoneMapIndexWriter::flush() { + // Update segment zone map. 
+ if (_field->compare(_segment_zone_map.min_value, _page_zone_map.min_value) > 0) { + _field->type_info()->direct_copy(_segment_zone_map.min_value, _page_zone_map.min_value); + } + if (_field->compare(_segment_zone_map.max_value, _page_zone_map.max_value) < 0) { + _field->type_info()->direct_copy(_segment_zone_map.max_value, _page_zone_map.max_value); + } + if (_page_zone_map.has_null) { + _segment_zone_map.has_null = true; + } + if (_page_zone_map.has_not_null) { + _segment_zone_map.has_not_null = true; + } + + ZoneMapPB zone_map_pb; + _page_zone_map.to_proto(&zone_map_pb, _field); + _reset_zone_map(&_page_zone_map); + + std::string serialized_zone_map; + bool ret = zone_map_pb.SerializeToString(&serialized_zone_map); + if (!ret) { + return Status::InternalError("serialize zone map failed"); + } + _estimated_size += serialized_zone_map.size() + sizeof(uint32_t); + _values.push_back(std::move(serialized_zone_map)); + return Status::OK(); +} + +Status ZoneMapIndexWriter::finish(WritableFile* file, ColumnIndexMetaPB* index_meta) { + index_meta->set_type(ZONE_MAP_INDEX); + ZoneMapIndexPB* meta = index_meta->mutable_zone_map_index(); + // store segment zone map + _segment_zone_map.to_proto(meta->mutable_segment_zone_map(), _field); + + // write out zone map for each data pages + const TypeInfo* typeinfo = get_type_info(OLAP_FIELD_TYPE_OBJECT); + IndexedColumnWriterOptions options; + options.write_ordinal_index = true; + options.write_value_index = false; + options.encoding = EncodingInfo::get_default_encoding(typeinfo, false); + options.compression = NO_COMPRESSION; // currently not compressed + + IndexedColumnWriter writer(options, typeinfo, file); + RETURN_IF_ERROR(writer.init()); + + for (auto& value : _values) { + Slice value_slice(value); + RETURN_IF_ERROR(writer.add(&value_slice)); + } + return writer.finish(meta->mutable_page_zone_maps()); +} + +Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) { + IndexedColumnReader reader(_filename, 
_index_meta->page_zone_maps()); + RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory)); + IndexedColumnIterator iter(&reader); + + MemTracker tracker; + MemPool pool(&tracker); + _page_zone_maps.resize(reader.num_values()); + + // read and cache all page zone maps + for (int i = 0; i < reader.num_values(); ++i) { + Slice value; + uint8_t nullmap; + size_t num_to_read = 1; + ColumnBlock block(reader.type_info(), (uint8_t*) &value, &nullmap, num_to_read, &pool); + ColumnBlockView column_block_view(&block); + + RETURN_IF_ERROR(iter.seek_to_ordinal(i)); + size_t num_read = num_to_read; + RETURN_IF_ERROR(iter.next_batch(&num_read, &column_block_view)); + DCHECK(num_to_read == num_read); + + if (!_page_zone_maps[i].ParseFromArray(value.data, value.size)) { + return Status::Corruption("Failed to parse zone map"); + } + pool.clear(); + } + return Status::OK(); +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/zone_map_index.h similarity index 53% rename from be/src/olap/rowset/segment_v2/column_zone_map.h rename to be/src/olap/rowset/segment_v2/zone_map_index.h index 6989725cdc..6bafa07fca 100644 --- a/be/src/olap/rowset/segment_v2/column_zone_map.h +++ b/be/src/olap/rowset/segment_v2/zone_map_index.h @@ -17,8 +17,9 @@ #pragma once -#include #include +#include +#include #include "common/status.h" #include "util/slice.h" @@ -30,6 +31,8 @@ namespace doris { +class WritableFile; + namespace segment_v2 { struct ZoneMap { @@ -46,66 +49,77 @@ struct ZoneMap { bool has_null = false; // has_not_null means whether zone has none-null value bool has_not_null = false; + + void to_proto(ZoneMapPB* dst, Field* field) { + dst->set_min(field->to_string(min_value)); + dst->set_max(field->to_string(max_value)); + dst->set_has_null(has_null); + dst->set_has_not_null(has_not_null); + } }; -// This class encode column pages' zone map. 
-// The binary is encoded by BinaryPlainPageBuilder -class ColumnZoneMapBuilder { +// Zone map index is represented by an IndexedColumn with ordinal index. +// The IndexedColumn stores serialized ZoneMapPB for each data page. +// It also create and store the segment-level zone map in the index meta so that +// reader can prune an entire segment without reading pages. +class ZoneMapIndexWriter { public: - ColumnZoneMapBuilder(Field* field); + explicit ZoneMapIndexWriter(Field* field); - Status add(const uint8_t* vals, size_t count); + void add_values(const void* values, size_t count); + void add_nulls(uint32_t count) { + _page_zone_map.has_null = true; + } + + // mark the end of one data page so that we can finalize the corresponding zone map Status flush(); - void fill_segment_zone_map(ZoneMapPB* const to); + Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta); - uint64_t size() { - return _page_builder->size(); - } - - OwnedSlice finish() { - return _page_builder->finish(); - } + uint64_t size() { return _estimated_size; } private: - void _reset_zone_map(ZoneMap* zone_map); - void _reset_page_zone_map() { _reset_zone_map(&_zone_map); } - void _reset_segment_zone_map() { _reset_zone_map(&_segment_zone_map); } - void _fill_zone_map_to_pb(const ZoneMap& from, ZoneMapPB* const to); + void _reset_zone_map(ZoneMap* zone_map) { + // we should allocate max varchar length and set to max for min value + _field->set_to_max(zone_map->min_value); + _field->set_to_min(zone_map->max_value); + zone_map->has_null = false; + zone_map->has_not_null = false; + } -private: - std::unique_ptr _page_builder; Field* _field; // memory will be managed by MemPool - ZoneMap _zone_map; + ZoneMap _page_zone_map; ZoneMap _segment_zone_map; // TODO(zc): we should replace this memory pool later, we only allocate min/max // for field. But MemPool allocate 4KB least, it will a waste for most cases. 
MemTracker _tracker; MemPool _pool; + + // serialized ZoneMapPB for each data page + std::vector _values; + uint64_t _estimated_size = 0; }; -// ColumnZoneMap -class ColumnZoneMap { +class ZoneMapIndexReader { public: - ColumnZoneMap(const Slice& data) : _data(data), _num_pages(0) { } - - Status load(); - - const std::vector& get_column_zone_map() const { - return _page_zone_maps; + explicit ZoneMapIndexReader(const std::string& filename, const ZoneMapIndexPB* index_meta) : + _filename(filename), + _index_meta(index_meta) { } - int32_t num_pages() const { - return _num_pages; - } + // load all page zone maps into memory + Status load(bool use_page_cache, bool kept_in_memory); + + const std::vector& page_zone_maps() const { return _page_zone_maps; } + + int32_t num_pages() const { return _page_zone_maps.size(); } private: - Slice _data; + std::string _filename; + const ZoneMapIndexPB* _index_meta; - // valid after load - int32_t _num_pages; std::vector _page_zone_maps; }; diff --git a/be/src/olap/short_key_index.cpp b/be/src/olap/short_key_index.cpp index 03d66dffbf..6c6cf6812c 100644 --- a/be/src/olap/short_key_index.cpp +++ b/be/src/olap/short_key_index.cpp @@ -28,89 +28,63 @@ namespace doris { Status ShortKeyIndexBuilder::add_item(const Slice& key) { put_varint32(&_offset_buf, _key_buf.size()); - _footer.set_num_items(_footer.num_items() + 1); _key_buf.append(key.data, key.size); + _num_items++; return Status::OK(); } -Status ShortKeyIndexBuilder::finalize(uint32_t segment_bytes, - uint32_t num_segment_rows, - std::vector* slices) { - _footer.set_num_segment_rows(num_segment_rows); - _footer.set_segment_bytes(segment_bytes); - _footer.set_key_bytes(_key_buf.size()); - _footer.set_offset_bytes(_offset_buf.size()); +Status ShortKeyIndexBuilder::finalize(uint32_t num_segment_rows, + std::vector* body, + segment_v2::PageFooterPB* page_footer) { + page_footer->set_type(segment_v2::SHORT_KEY_PAGE); + page_footer->set_uncompressed_size(_key_buf.size() + 
_offset_buf.size()); - // encode header - if (!_footer.SerializeToString(&_footer_buf)) { - return Status::InternalError("Failed to serialize index footer"); - } + segment_v2::ShortKeyFooterPB* footer = page_footer->mutable_short_key_page_footer(); + footer->set_num_items(_num_items); + footer->set_key_bytes(_key_buf.size()); + footer->set_offset_bytes(_offset_buf.size()); + footer->set_segment_id(_segment_id); + footer->set_num_rows_per_block(_num_rows_per_block); + footer->set_num_segment_rows(num_segment_rows); - put_fixed32_le(&_footer_buf, _footer_buf.size()); - // TODO(zc): checksum - uint32_t checksum = 0; - put_fixed32_le(&_footer_buf, checksum); - - slices->emplace_back(_key_buf); - slices->emplace_back(_offset_buf); - slices->emplace_back(_footer_buf); + body->emplace_back(_key_buf); + body->emplace_back(_offset_buf); return Status::OK(); } -Status ShortKeyIndexDecoder::parse() { - Slice data = _data; +Status ShortKeyIndexDecoder::parse(const Slice& body, const segment_v2::ShortKeyFooterPB& footer) { + _footer = footer; - // 1. 
parse footer, get checksum and footer length - if (data.size < 2 * sizeof(uint32_t)) { + // check if body size match footer's information + if (body.size != (_footer.key_bytes() + _footer.offset_bytes())) { return Status::Corruption( - Substitute("Short key is too short, need=$0 vs real=$1", - 2 * sizeof(uint32_t), data.size)); - } - size_t offset = data.size - 2 * sizeof(uint32_t); - uint32_t footer_length = decode_fixed32_le((uint8_t*)data.data + offset); - uint32_t checksum = decode_fixed32_le((uint8_t*)data.data + offset + 4); - // TODO(zc): do checksum - if (checksum != 0) { - return Status::Corruption( - Substitute("Checksum not match, need=$0 vs read=$1", 0, checksum)); - } - // move offset to parse footer - offset -= footer_length; - std::string footer_buf(data.data + offset, footer_length); - if (!_footer.ParseFromString(footer_buf)) { - return Status::Corruption("Fail to parse index footer from string"); - } - - // check if real data size match footer's content - if (offset != _footer.key_bytes() + _footer.offset_bytes()) { - return Status::Corruption( - Substitute("Index size not match, need=$0, real=$1", - _footer.key_bytes() + _footer.offset_bytes(), offset)); + Substitute("Index size not match, need=$0, real=$1", + _footer.key_bytes() + _footer.offset_bytes(), body.size)); } // set index buffer - _key_data = Slice(_data.data, _footer.key_bytes()); - + _key_data = Slice(body.data, _footer.key_bytes()); + // parse offset information - Slice offset_slice(_data.data + _footer.key_bytes(), _footer.offset_bytes()); + Slice offset_slice(body.data + _footer.key_bytes(), _footer.offset_bytes()); // +1 for record total length _offsets.resize(_footer.num_items() + 1); - _offsets[_footer.num_items()] = _footer.key_bytes(); for (uint32_t i = 0; i < _footer.num_items(); ++i) { uint32_t offset = 0; if (!get_varint32(&offset_slice, &offset)) { return Status::Corruption("Fail to get varint from index offset buffer"); } DCHECK(offset <= _footer.key_bytes()) - << 
"Offset is larger than total bytes, offset=" << offset - << ", key_bytes=" << _footer.key_bytes(); + << "Offset is larger than total bytes, offset=" << offset + << ", key_bytes=" << _footer.key_bytes(); _offsets[i] = offset; } + _offsets[_footer.num_items()] = _footer.key_bytes(); if (offset_slice.size != 0) { return Status::Corruption("Still has data after parse all key offset"); } - + _parsed = true; return Status::OK(); } diff --git a/be/src/olap/short_key_index.h b/be/src/olap/short_key_index.h index 61a542eda3..2716920d65 100644 --- a/be/src/olap/short_key_index.h +++ b/be/src/olap/short_key_index.h @@ -107,17 +107,12 @@ void encode_key(std::string* buf, const RowType& row, size_t num_keys) { } } -// Used to encode a segment short key indices to binary format. This version +// Encode a segment short key indices to one ShortKeyPage. This version // only accepts binary key, client should assure that input key is sorted, -// otherwise error could happens. This builder would arrange data in following -// format. -// index = encoded_keys + encoded_offsets + footer + footer_size + checksum -// encoded_keys = binary_key + [, ...] -// encoded_offsets = encoded_offset + [, ...] -// encoded_offset = variant32 -// footer = ShortKeyFooterPB -// footer_size = fixed32 -// checksum = fixed32 +// otherwise error could happens. 
This builder would arrange the page body in the +// following format: +// ShortKeyPageBody := KeyContent^NumEntry, KeyOffset(vint)^NumEntry +// NumEntry, KeyBytes, OffsetBytes is stored in ShortKeyFooterPB // Usage: // ShortKeyIndexBuilder builder(segment_id, num_rows_per_block); // builder.add_item(key1); @@ -132,26 +127,25 @@ void encode_key(std::string* buf, const RowType& row, size_t num_keys) { // more than short key class ShortKeyIndexBuilder { public: - ShortKeyIndexBuilder(uint32_t segment_id, - uint32_t num_rows_per_block) { - _footer.set_segment_id(segment_id); - _footer.set_num_rows_per_block(num_rows_per_block); + ShortKeyIndexBuilder(uint32_t segment_id, uint32_t num_rows_per_block) : + _segment_id(segment_id), _num_rows_per_block(num_rows_per_block), _num_items(0) { } Status add_item(const Slice& key); uint64_t size() { - return _key_buf.size() + _offset_buf.size() + _footer_buf.size(); + return _key_buf.size() + _offset_buf.size(); } - Status finalize(uint32_t segment_size, uint32_t num_rows, std::vector* slices); + Status finalize(uint32_t num_rows, std::vector* body, segment_v2::PageFooterPB* footer); private: - segment_v2::ShortKeyFooterPB _footer; + uint32_t _segment_id; + uint32_t _num_rows_per_block; + uint32_t _num_items; faststring _key_buf; faststring _offset_buf; - std::string _footer_buf; }; class ShortKeyIndexDecoder; @@ -214,40 +208,54 @@ private: // Used to decode short key to header and encoded index data. // Usage: -// MemIndex index; -// ShortKeyIndexDecoder decoder(slice) -// decoder.parse(); +// ShortKeyIndexDecoder decoder; +// decoder.parse(body, footer); // auto iter = decoder.lower_bound(key); class ShortKeyIndexDecoder { public: - // Client should assure that data is available when this class - // is used. 
- ShortKeyIndexDecoder(const Slice& data) : _data(data) { } + ShortKeyIndexDecoder() : _parsed(false) {} - Status parse(); + // client should assure that body is available when this class is used + Status parse(const Slice& body, const segment_v2::ShortKeyFooterPB& footer); - ShortKeyIndexIterator begin() const { return {this, 0}; } - ShortKeyIndexIterator end() const { return {this, num_items()}; } + ShortKeyIndexIterator begin() const { + DCHECK(_parsed); + return {this, 0}; + } + + ShortKeyIndexIterator end() const { + DCHECK(_parsed); + return {this, num_items()}; + } // Return an iterator which locates at the first item who is // equal with or greater than the given key. // NOTE: If one key is the prefix of other key, this funciton thinks // that longer key is greater than the shorter key. ShortKeyIndexIterator lower_bound(const Slice& key) const { + DCHECK(_parsed); return seek(key); } // Return the iterator which locates the first item greater than the // input key. ShortKeyIndexIterator upper_bound(const Slice& key) const { + DCHECK(_parsed); return seek(key); } - uint32_t num_items() const { return _footer.num_items(); } + uint32_t num_items() const { + DCHECK(_parsed); + return _footer.num_items(); + } - uint32_t num_rows_per_block() const { return _footer.num_rows_per_block(); } + uint32_t num_rows_per_block() const { + DCHECK(_parsed); + return _footer.num_rows_per_block(); + } Slice key(ssize_t ordinal) const { + DCHECK(_parsed); DCHECK(ordinal >= 0 && ordinal < num_items()); return {_key_data.data + _offsets[ordinal], _offsets[ordinal + 1] - _offsets[ordinal]}; } @@ -266,7 +274,7 @@ private: } private: - Slice _data; + bool _parsed; // All following fields are only valid after parse has been executed successfully segment_v2::ShortKeyFooterPB _footer; diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 09dd37df78..0067092203 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -150,6 +150,10 @@ template<> struct CppTypeTraits { using 
CppType = int64_t; using UnsignedCppType = uint64_t; }; +template<> struct CppTypeTraits { + using CppType = uint64_t; + using UnsignedCppType = uint64_t; +}; template<> struct CppTypeTraits { using CppType = int128_t; using UnsignedCppType = unsigned int128_t; diff --git a/be/test/olap/CMakeLists.txt b/be/test/olap/CMakeLists.txt index 81580c86c5..585a9970a8 100644 --- a/be/test/olap/CMakeLists.txt +++ b/be/test/olap/CMakeLists.txt @@ -51,19 +51,18 @@ ADD_BE_TEST(rowset/segment_v2/bitshuffle_page_test) ADD_BE_TEST(rowset/segment_v2/plain_page_test) ADD_BE_TEST(rowset/segment_v2/binary_plain_page_test) ADD_BE_TEST(rowset/segment_v2/binary_prefix_page_test) -ADD_BE_TEST(rowset/segment_v2/index_column_reader_writer_test) +ADD_BE_TEST(rowset/segment_v2/bitmap_index_test) ADD_BE_TEST(rowset/segment_v2/column_reader_writer_test) ADD_BE_TEST(rowset/segment_v2/encoding_info_test) -ADD_BE_TEST(rowset/segment_v2/page_compression_test) ADD_BE_TEST(rowset/segment_v2/ordinal_page_index_test) ADD_BE_TEST(rowset/segment_v2/rle_page_test) ADD_BE_TEST(rowset/segment_v2/binary_dict_page_test) ADD_BE_TEST(rowset/segment_v2/segment_test) -ADD_BE_TEST(rowset/segment_v2/column_zone_map_test) ADD_BE_TEST(rowset/segment_v2/row_ranges_test) ADD_BE_TEST(rowset/segment_v2/frame_of_reference_page_test) ADD_BE_TEST(rowset/segment_v2/block_bloom_filter_test) ADD_BE_TEST(rowset/segment_v2/bloom_filter_index_reader_writer_test) +ADD_BE_TEST(rowset/segment_v2/zone_map_index_test) ADD_BE_TEST(tablet_meta_manager_test) ADD_BE_TEST(tablet_mgr_test) ADD_BE_TEST(rowset/rowset_meta_manager_test) diff --git a/be/test/olap/key_coder_test.cpp b/be/test/olap/key_coder_test.cpp index 8fb22c9d03..6840e25ce5 100644 --- a/be/test/olap/key_coder_test.cpp +++ b/be/test/olap/key_coder_test.cpp @@ -108,6 +108,7 @@ TEST_F(KeyCoderTest, test_int) { test_integer_encode(); test_integer_encode(); test_integer_encode(); + test_integer_encode(); test_integer_encode(); test_integer_encode(); diff --git 
a/be/test/olap/rowset/segment_v2/index_column_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp similarity index 66% rename from be/test/olap/rowset/segment_v2/index_column_reader_writer_test.cpp rename to be/test/olap/rowset/segment_v2/bitmap_index_test.cpp index 20e65d81af..0067644fbb 100644 --- a/be/test/olap/rowset/segment_v2/index_column_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp @@ -20,12 +20,12 @@ #include "olap/key_coder.h" #include +#include #include "common/logging.h" #include "env/env.h" #include "olap/olap_common.h" #include "olap/types.h" -#include "olap/column_block.h" #include "util/file_utils.h" #include "runtime/mem_tracker.h" #include "runtime/mem_pool.h" @@ -33,68 +33,72 @@ namespace doris { namespace segment_v2 { -class IndexColumnReaderWriterTest : public testing::Test { - public: - IndexColumnReaderWriterTest() : _pool(&_tracker) { } - virtual ~IndexColumnReaderWriterTest() { +class BitmapIndexTest : public testing::Test { +public: + const std::string kTestDir = "./ut_dir/bitmap_index_test"; + BitmapIndexTest() : _pool(&_tracker) { } + + void SetUp() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } + ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok()); } - private: + void TearDown() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } + } + +private: MemTracker _tracker; MemPool _pool; }; -const std::string dname = "./ut_dir/index_column_reader_writer_test"; - template -void wirte_index_file(std::string& file_name, const void* values, +void write_index_file(std::string& filename, const void* values, size_t value_count, size_t null_count, - BitmapIndexColumnPB* bitmap_index_meta) { + ColumnIndexMetaPB* meta) { const TypeInfo* type_info = get_type_info(type); - FileUtils::create_dir(dname); - std::string fname = dname + "/" + file_name; { std::unique_ptr wfile; - 
auto st = Env::Default()->new_writable_file(fname, &wfile); - ASSERT_TRUE(st.ok()); - std::unique_ptr _bitmap_index_builder; - BitmapIndexWriter::create(type_info, &_bitmap_index_builder); - _bitmap_index_builder->add_values(values, value_count); - _bitmap_index_builder->add_nulls(null_count); - st = _bitmap_index_builder->finish(wfile.get(), bitmap_index_meta); - ASSERT_TRUE(st.ok()) << "writer finish status:" << st.to_string(); - wfile.reset(); + ASSERT_TRUE(Env::Default()->new_writable_file(filename, &wfile).ok()); + std::unique_ptr writer; + BitmapIndexWriter::create(type_info, &writer); + writer->add_values(values, value_count); + writer->add_nulls(null_count); + ASSERT_TRUE(writer->finish(wfile.get(), meta).ok()); + ASSERT_EQ(BITMAP_INDEX, meta->type()); } } template -void get_bitmap_reader_iter(std::string& file_name, BitmapIndexColumnPB& bitmap_index_meta, +void get_bitmap_reader_iter(std::string& file_name, const ColumnIndexMetaPB& meta, BitmapIndexReader** reader, BitmapIndexIterator** iter) { - file_name = dname + "/" + file_name; - *reader = new BitmapIndexReader(file_name, bitmap_index_meta); - auto st = (*reader)->load(true); + *reader = new BitmapIndexReader(file_name, &meta.bitmap_index()); + auto st = (*reader)->load(true, false); ASSERT_TRUE(st.ok()); st = (*reader)->new_iterator(iter); ASSERT_TRUE(st.ok()); } -TEST_F(IndexColumnReaderWriterTest, test_invert) { +TEST_F(BitmapIndexTest, test_invert) { size_t num_uint8_rows = 1024 * 10; int* val = new int[num_uint8_rows]; for (int i = 0; i < num_uint8_rows; ++i) { val[i] = i; } - std::string file_name = "invert"; - BitmapIndexColumnPB bitmap_index_meta; - wirte_index_file(file_name, val, num_uint8_rows, 0, - &bitmap_index_meta); + std::string file_name = kTestDir + "/invert"; + ColumnIndexMetaPB meta; + write_index_file(file_name, val, num_uint8_rows, 0, &meta); { std::unique_ptr rfile; BitmapIndexReader* reader = nullptr; BitmapIndexIterator* iter = nullptr; - get_bitmap_reader_iter(file_name, 
bitmap_index_meta, &reader, &iter); + get_bitmap_reader_iter(file_name, meta, &reader, &iter); int value = 2; bool exact_match; @@ -129,7 +133,7 @@ TEST_F(IndexColumnReaderWriterTest, test_invert) { } } -TEST_F(IndexColumnReaderWriterTest, test_invert_2) { +TEST_F(BitmapIndexTest, test_invert_2) { size_t num_uint8_rows = 1024 * 10; int* val = new int[num_uint8_rows]; for (int i = 0; i < 1024; ++i) { @@ -140,15 +144,14 @@ TEST_F(IndexColumnReaderWriterTest, test_invert_2) { val[i] = i * 10; } - std::string file_name = "invert2"; - BitmapIndexColumnPB bitmap_index_meta; - wirte_index_file(file_name, val, num_uint8_rows, 0, - &bitmap_index_meta); + std::string file_name = kTestDir + "/invert2"; + ColumnIndexMetaPB meta; + write_index_file(file_name, val, num_uint8_rows, 0, &meta); { BitmapIndexReader* reader = nullptr; BitmapIndexIterator* iter = nullptr; - get_bitmap_reader_iter(file_name, bitmap_index_meta, &reader, &iter); + get_bitmap_reader_iter(file_name, meta, &reader, &iter); int value = 1026; bool exact_match; @@ -167,7 +170,7 @@ TEST_F(IndexColumnReaderWriterTest, test_invert_2) { } } -TEST_F(IndexColumnReaderWriterTest, test_multi_pages) { +TEST_F(BitmapIndexTest, test_multi_pages) { size_t num_uint8_rows = 1024 * 1024; int64_t* val = new int64_t[num_uint8_rows]; for (int i = 0; i < num_uint8_rows; ++i) { @@ -175,14 +178,13 @@ TEST_F(IndexColumnReaderWriterTest, test_multi_pages) { } val[1024 * 510] = 2019; - std::string file_name = "mul"; - BitmapIndexColumnPB bitmap_index_meta; - wirte_index_file(file_name, val, num_uint8_rows, 0, - &bitmap_index_meta); + std::string file_name = kTestDir + "/mul"; + ColumnIndexMetaPB meta; + write_index_file(file_name, val, num_uint8_rows, 0, &meta); { BitmapIndexReader* reader = nullptr; BitmapIndexIterator* iter = nullptr; - get_bitmap_reader_iter(file_name, bitmap_index_meta, &reader, &iter); + get_bitmap_reader_iter(file_name, meta, &reader, &iter); int64_t value = 2019; bool exact_match; @@ -199,21 +201,20 @@ 
TEST_F(IndexColumnReaderWriterTest, test_multi_pages) { } } -TEST_F(IndexColumnReaderWriterTest, test_null) { +TEST_F(BitmapIndexTest, test_null) { size_t num_uint8_rows = 1024; int64_t* val = new int64_t[num_uint8_rows]; for (int i = 0; i < num_uint8_rows; ++i) { val[i] = i; } - std::string file_name = "null"; - BitmapIndexColumnPB bitmap_index_meta; - wirte_index_file(file_name, val, num_uint8_rows, 30, - &bitmap_index_meta); + std::string file_name = kTestDir + "/null"; + ColumnIndexMetaPB meta; + write_index_file(file_name, val, num_uint8_rows, 30, &meta); { BitmapIndexReader* reader = nullptr; BitmapIndexIterator* iter = nullptr; - get_bitmap_reader_iter(file_name, bitmap_index_meta, &reader, &iter); + get_bitmap_reader_iter(file_name, meta, &reader, &iter); Roaring bitmap; iter->read_null_bitmap(&bitmap); diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 42e9ce3c7d..01110e919e 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -26,10 +26,7 @@ #include "env/env.h" #include "olap/olap_common.h" #include "olap/types.h" -#include "olap/column_block.h" #include "util/file_utils.h" -#include "runtime/mem_tracker.h" -#include "runtime/mem_pool.h" namespace doris { namespace segment_v2 { @@ -46,7 +43,7 @@ const std::string dname = "./ut_dir/bloom_filter_index_reader_writer_test"; template void write_bloom_filter_index_file(const std::string& file_name, const void* values, size_t value_count, size_t null_count, - BloomFilterIndexPB* bloom_filter_index_meta) { + ColumnIndexMetaPB* index_meta) { const TypeInfo* type_info = get_type_info(type); using CppType = typename CppTypeTraits::CppType; FileUtils::create_dir(dname); @@ -70,20 +67,21 @@ void write_bloom_filter_index_file(const std::string& file_name, const void* val 
ASSERT_TRUE(st.ok()); i += 1024; } - st = bloom_filter_index_writer->finish(wfile.get(), bloom_filter_index_meta); + st = bloom_filter_index_writer->finish(wfile.get(), index_meta); ASSERT_TRUE(st.ok()) << "writer finish status:" << st.to_string(); - wfile.reset(); + ASSERT_EQ(BLOOM_FILTER_INDEX, index_meta->type()); + ASSERT_EQ(bf_options.strategy, index_meta->bloom_filter_index().hash_strategy()); } } -void get_bloom_filter_reader_iter(const std::string& file_name, const BloomFilterIndexPB& bloom_filter_index_meta, +void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnIndexMetaPB& meta, std::unique_ptr* rfile, BloomFilterIndexReader** reader, std::unique_ptr* iter) { std::string fname = dname + "/" + file_name; - *reader = new BloomFilterIndexReader(fname, bloom_filter_index_meta); - auto st = (*reader)->load(true); + *reader = new BloomFilterIndexReader(fname, &meta.bloom_filter_index()); + auto st = (*reader)->load(true, false); ASSERT_TRUE(st.ok()); st = (*reader)->new_iterator(iter); @@ -96,15 +94,13 @@ void test_bloom_filter_index_reader_writer_template(const std::string file_name, typename TypeTraits::CppType* not_exist_value, bool is_slice_type = false) { typedef typename TypeTraits::CppType CppType; - BloomFilterIndexPB bloom_filter_index_meta; - write_bloom_filter_index_file(file_name, val, num, null_num, - &bloom_filter_index_meta); + ColumnIndexMetaPB meta; + write_bloom_filter_index_file(file_name, val, num, null_num, &meta); { std::unique_ptr rfile; BloomFilterIndexReader* reader = nullptr; std::unique_ptr iter; - get_bloom_filter_reader_iter(file_name, bloom_filter_index_meta, - &rfile, &reader, &iter); + get_bloom_filter_reader_iter(file_name, meta, &rfile, &reader, &iter); // page 0 std::unique_ptr bf; diff --git a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp index 5b730e6a2e..48b9ddcd53 100644 --- 
a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp @@ -79,8 +79,18 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s ASSERT_TRUE(st.ok()); ColumnWriterOptions writer_opts; - writer_opts.encoding_type = encoding; - writer_opts.compression_type = segment_v2::CompressionTypePB::LZ4F; + writer_opts.meta = &meta; + writer_opts.meta->set_column_id(0); + writer_opts.meta->set_unique_id(0); + writer_opts.meta->set_type(type); + if (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_VARCHAR) { + writer_opts.meta->set_length(10); + } else { + writer_opts.meta->set_length(0); + } + writer_opts.meta->set_encoding(encoding); + writer_opts.meta->set_compression(segment_v2::CompressionTypePB::LZ4F); + writer_opts.meta->set_is_nullable(true); writer_opts.need_zone_map = true; TabletColumn column(OLAP_FIELD_AGGREGATION_NONE, type); @@ -90,7 +100,7 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s column = create_char_key(1); } std::unique_ptr field(FieldFactory::create(column)); - ColumnWriter writer(writer_opts, std::move(field), true, wfile.get()); + ColumnWriter writer(writer_opts, std::move(field), wfile.get()); st = writer.init(); ASSERT_TRUE(st.ok()) << st.to_string(); @@ -109,9 +119,6 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s st = writer.write_zone_map(); ASSERT_TRUE(st.ok()); - writer.write_meta(&meta); - ASSERT_TRUE(meta.has_zone_map_page()); - // close the file wfile.reset(); } diff --git a/be/test/olap/rowset/segment_v2/ordinal_page_index_test.cpp b/be/test/olap/rowset/segment_v2/ordinal_page_index_test.cpp index ca663aee85..e8a101e8c4 100644 --- a/be/test/olap/rowset/segment_v2/ordinal_page_index_test.cpp +++ b/be/test/olap/rowset/segment_v2/ordinal_page_index_test.cpp @@ -19,84 +19,134 @@ #include #include +#include +#include #include "common/logging.h" +#include "env/env.h" 
+#include "util/file_utils.h" namespace doris { namespace segment_v2 { class OrdinalPageIndexTest : public testing::Test { public: - OrdinalPageIndexTest() { } - virtual ~OrdinalPageIndexTest() { + const std::string kTestDir = "./ut_dir/ordinal_page_index_test"; + + void SetUp() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } + ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok()); + } + void TearDown() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } } }; TEST_F(OrdinalPageIndexTest, normal) { - // rowid, page pointer - // 1, (0, 4096) - // 1 + 4096, (1 * 4096, 4096) - // a page have 16KB, and have 4096 rows - OrdinalPageIndexBuilder builder; + std::string filename = kTestDir + "/normal.idx"; - // we test a 16KB page + OrdinalIndexWriter builder; + // generate ordinal index for 16K data pages, + // each data page is 16KB in size and contains 4096 values, + // ordinal starts at 1 instead of 0 for (uint64_t i = 0; i < 16 * 1024; ++i) { builder.append_entry(1 + 4096 * i, {16 * 1024 * i, 16 * 1024}); } + ColumnIndexMetaPB index_meta; + { + std::unique_ptr out_file; + ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok()); + ASSERT_TRUE(builder.finish(out_file.get(), &index_meta).ok()); + ASSERT_EQ(ORDINAL_INDEX, index_meta.type()); + ASSERT_FALSE(index_meta.ordinal_index().root_page().is_root_data_page()); + LOG(INFO) << "index page size=" + << index_meta.ordinal_index().root_page().root_page().size(); + } - auto slice = builder.finish(); - LOG(INFO) << "index block's size=" << slice.size; + OrdinalIndexReader index(filename, &index_meta.ordinal_index(), 16 * 1024 * 4096 + 1); + ASSERT_TRUE(index.load(true, false).ok()); + ASSERT_EQ(16 * 1024, index.num_data_pages()); + ASSERT_EQ(1, index.get_first_ordinal(0)); + ASSERT_EQ(4096, index.get_last_ordinal(0)); + ASSERT_EQ((16 * 1024 - 1) * 4096 + 1, index.get_first_ordinal(16 * 1024 - 
1)); + ASSERT_EQ(16 * 1024 * 4096, index.get_last_ordinal(16 * 1024 - 1)); - OrdinalPageIndex index(slice, 16 * 1024 * 4096 + 1); - auto st = index.load(); - ASSERT_TRUE(st.ok()); - ASSERT_EQ(1, index.get_first_row_id(0)); - ASSERT_EQ(4096, index.get_last_row_id(0)); - ASSERT_EQ((16 * 1024 - 1) * 4096 + 1, index.get_first_row_id(16 * 1024 - 1)); - ASSERT_EQ(16 * 1024 * 4096, index.get_last_row_id(16 * 1024 - 1)); - - PagePointer page; { auto iter = index.seek_at_or_before(1); ASSERT_TRUE(iter.valid()); - ASSERT_EQ(1, iter.rowid()); + ASSERT_EQ(1, iter.first_ordinal()); ASSERT_EQ(PagePointer(0, 16 * 1024), iter.page()); } { auto iter = index.seek_at_or_before(4095); ASSERT_TRUE(iter.valid()); - ASSERT_EQ(1, iter.rowid()); + ASSERT_EQ(1, iter.first_ordinal()); ASSERT_EQ(PagePointer(0, 16 * 1024), iter.page()); } { auto iter = index.seek_at_or_before(4098); ASSERT_TRUE(iter.valid()); - ASSERT_EQ(4097, iter.rowid()); + ASSERT_EQ(4097, iter.first_ordinal()); ASSERT_EQ(PagePointer(1 * 16 * 1024, 16 * 1024), iter.page()); iter.next(); ASSERT_TRUE(iter.valid()); - ASSERT_EQ(4097 + 4096, iter.rowid()); + ASSERT_EQ(4097 + 4096, iter.first_ordinal()); ASSERT_EQ(PagePointer(2 * 16 * 1024, 16 * 1024), iter.page()); } - { auto iter = index.seek_at_or_before(0); ASSERT_FALSE(iter.valid()); } } -TEST_F(OrdinalPageIndexTest, corrupt) { - std::string str; - str.resize(4); +TEST_F(OrdinalPageIndexTest, one_data_page) { + // index one data page with 1024 values + int num_values = 1024; + PagePointer data_page_pointer(0, 4096); - encode_fixed32_le((uint8_t*)str.data(), 1); + OrdinalIndexWriter builder; + builder.append_entry(0, data_page_pointer); // add only one entry + ColumnIndexMetaPB index_meta; + { + // in this case, no index page is written, thus file could be null + ASSERT_TRUE(builder.finish(nullptr, &index_meta).ok()); + ASSERT_EQ(ORDINAL_INDEX, index_meta.type()); + ASSERT_TRUE(index_meta.ordinal_index().root_page().is_root_data_page()); + PagePointer 
root_page_pointer(index_meta.ordinal_index().root_page().root_page()); + ASSERT_EQ(data_page_pointer, root_page_pointer); + } - Slice slice(str); - OrdinalPageIndex index(slice, 10); - auto st = index.load(); - ASSERT_FALSE(st.ok()); + OrdinalIndexReader index("", &index_meta.ordinal_index(), num_values); + ASSERT_TRUE(index.load(true, false).ok()); + ASSERT_EQ(1, index.num_data_pages()); + ASSERT_EQ(0, index.get_first_ordinal(0)); + ASSERT_EQ(num_values - 1, index.get_last_ordinal(0)); + + { + auto iter = index.seek_at_or_before(0); + ASSERT_TRUE(iter.valid()); + ASSERT_EQ(0, iter.first_ordinal()); + ASSERT_EQ(num_values - 1, iter.last_ordinal()); + ASSERT_EQ(data_page_pointer, iter.page()); + } + { + auto iter = index.seek_at_or_before(num_values - 1); + ASSERT_TRUE(iter.valid()); + ASSERT_EQ(0, iter.first_ordinal()); + ASSERT_EQ(data_page_pointer, iter.page()); + } + { + auto iter = index.seek_at_or_before(num_values); + ASSERT_TRUE(iter.valid()); + ASSERT_EQ(0, iter.first_ordinal()); + ASSERT_EQ(data_page_pointer, iter.page()); + } } } diff --git a/be/test/olap/rowset/segment_v2/page_compression_test.cpp b/be/test/olap/rowset/segment_v2/page_compression_test.cpp deleted file mode 100644 index e1f9b8bdb2..0000000000 --- a/be/test/olap/rowset/segment_v2/page_compression_test.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "olap/rowset/segment_v2/page_compression.h" - -#include -#include - -#include "common/logging.h" -#include "util/block_compression.h" - -namespace doris { -namespace segment_v2 { - -class PageCompressionTest : public testing::Test { -public: - PageCompressionTest() { } - virtual ~PageCompressionTest() { - } -}; - -static std::string generate_rand_str(size_t len) { - static char charset[] = "0123456789" - "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - std::string result; - result.resize(len); - for (int i = 0; i < len; ++i) { - result[i] = charset[rand() % sizeof(charset)]; - } - return result; -} - -static std::string generate_str(size_t len) { - static char charset[] = "0123456789" - "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - std::string result; - result.resize(len); - for (int i = 0; i < len; ++i) { - result[i] = charset[i % sizeof(charset)]; - } - return result; -} - -TEST_F(PageCompressionTest, normal) { - const BlockCompressionCodec* codec = nullptr; - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4F, &codec); - - for (int i = 0; i < 2; ++i) { - // compress - PageCompressor compressor(codec); - - std::vector raw_slices; - std::string raw_data; - if (i == 0) { - raw_data = generate_rand_str(102400); - } else { - raw_data = generate_str(102400); - } - - raw_slices.emplace_back(raw_data.data(), 10240); - raw_slices.emplace_back(raw_data.data() + 10240, 10240); - raw_slices.emplace_back(raw_data.data() + 20480, 81920); - - std::vector compressed_slices; - auto st = 
compressor.compress(raw_slices, &compressed_slices); - ASSERT_TRUE(st.ok()); - - std::string compressed_data = Slice::to_string(compressed_slices); - - // decompress - PageDecompressor decompressor(compressed_data, codec); - - { - Slice check_slice; - st = decompressor.decompress_to(&check_slice); - ASSERT_TRUE(st.ok()); - ASSERT_STREQ(raw_data.c_str(), check_slice.data); - if (check_slice.data != compressed_data.data()) { - delete[] check_slice.data; - } - } - } -} - -TEST_F(PageCompressionTest, bad_case) { - const BlockCompressionCodec* codec = nullptr; - get_block_compression_codec(segment_v2::CompressionTypePB::LZ4F, &codec); - - for (int i = 0; i < 2; ++i) { - // compress - PageCompressor compressor(codec); - - std::vector raw_slices; - std::string raw_data; - if (i == 0) { - raw_data = generate_rand_str(102400); - } else { - raw_data = generate_str(102400); - } - raw_slices.emplace_back(raw_data.data(), 102400); - - std::vector compressed_slices; - auto st = compressor.compress(raw_slices, &compressed_slices); - ASSERT_TRUE(st.ok()); - - std::string compressed_data = Slice::to_string(compressed_slices); - - Slice bad_compressed_slice(compressed_data.data(), compressed_data.size() - 1); - // decompress - PageDecompressor decompressor(bad_compressed_slice, codec); - - { - Slice check_slice; - st = decompressor.decompress_to(&check_slice); - ASSERT_FALSE(st.ok()); - } - } -} - -} -} - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 81fcfdf1b3..1305aa73d4 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -57,6 +57,15 @@ static void DefaultIntGenerator(size_t rid, int cid, int block_id, RowCursorCell *(int*)cell.mutable_cell_ptr() = rid * 10 + cid; } +static bool column_contains_index(ColumnMetaPB column_meta, 
ColumnIndexTypePB type) { + for (int i = 0; i < column_meta.indexes_size(); ++i) { + if (column_meta.indexes(i).type() == type) { + return true; + } + } + return false; +} + class SegmentReaderWriterTest : public ::testing::Test { protected: void SetUp() override { @@ -353,7 +362,7 @@ TEST_F(SegmentReaderWriterTest, LazyMaterialization) { shared_ptr segment; SegmentWriterOptions write_opts; build_segment(write_opts, tablet_schema, tablet_schema, 100, data_gen, &segment); - ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index()); + ASSERT_TRUE(column_contains_index(segment->footer().columns(0), BITMAP_INDEX)); { // lazy disabled when all predicates are removed by bitmap index: // select c1, c2 where c2 = 30; @@ -972,8 +981,8 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { SegmentWriterOptions opts; shared_ptr segment; build_segment(opts, tablet_schema, tablet_schema, 4096, DefaultIntGenerator, &segment); - ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index()); - ASSERT_TRUE(segment->footer().columns(1).has_bitmap_index()); + ASSERT_TRUE(column_contains_index(segment->footer().columns(0), BITMAP_INDEX)); + ASSERT_TRUE(column_contains_index(segment->footer().columns(1), BITMAP_INDEX)); { Schema schema(tablet_schema); @@ -1104,14 +1113,14 @@ TEST_F(SegmentReaderWriterTest, TestBloomFilterIndexUniqueModel) { opts1.whether_to_filter_value = false; shared_ptr seg1; build_segment(opts1, schema, schema, 100, DefaultIntGenerator, &seg1); - ASSERT_FALSE(seg1->footer().columns(3).has_bloom_filter_index()); + ASSERT_FALSE(column_contains_index(seg1->footer().columns(3), BLOOM_FILTER_INDEX)); // for base segment SegmentWriterOptions opts2; opts2.whether_to_filter_value = true; shared_ptr seg2; build_segment(opts2, schema, schema, 100, DefaultIntGenerator, &seg2); - ASSERT_TRUE(seg2->footer().columns(3).has_bloom_filter_index()); + ASSERT_TRUE(column_contains_index(seg2->footer().columns(3), BLOOM_FILTER_INDEX)); } } diff --git 
a/be/test/olap/rowset/segment_v2/column_zone_map_test.cpp b/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp similarity index 61% rename from be/test/olap/rowset/segment_v2/column_zone_map_test.cpp rename to be/test/olap/rowset/segment_v2/zone_map_index_test.cpp index 9de21f213c..f7dbfd2ba3 100644 --- a/be/test/olap/rowset/segment_v2/column_zone_map_test.cpp +++ b/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp @@ -16,41 +16,70 @@ // under the License. #include -#include -#include "olap/rowset/segment_v2/column_zone_map.h" +#include +#include + +#include "env/env.h" +#include "olap/rowset/segment_v2/zone_map_index.h" #include "olap/tablet_schema_helper.h" +#include "util/file_utils.h" namespace doris { namespace segment_v2 { class ColumnZoneMapTest : public testing::Test { public: - void test_string(Field* field) { - ColumnZoneMapBuilder builder(field); + const std::string kTestDir = "./ut_dir/zone_map_index_test"; + + void SetUp() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } + ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok()); + } + void TearDown() override { + if (FileUtils::check_exist(kTestDir)) { + ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok()); + } + } + + void test_string(std::string testname, Field* field) { + std::string filename = kTestDir + "/" + testname; + + ZoneMapIndexWriter builder(field); std::vector values1 = {"aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"}; for (auto& value : values1) { Slice slice(value); - builder.add((const uint8_t*)&slice, 1); + builder.add_values((const uint8_t*)&slice, 1); } builder.flush(); std::vector values2 = {"aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "fffff"}; for (auto& value : values2) { Slice slice(value); - builder.add((const uint8_t*)&slice, 1); + builder.add_values((const uint8_t*)&slice, 1); } - builder.add(nullptr, 1); + builder.add_nulls(1); builder.flush(); for (int i = 0; i < 6; ++i) { - builder.add(nullptr, 1); + 
builder.add_nulls(1); } builder.flush(); - OwnedSlice zone_map_page = builder.finish(); - ColumnZoneMap column_zone_map(zone_map_page.slice()); - Status status = column_zone_map.load(); + // write out zone map index + ColumnIndexMetaPB index_meta; + { + std::unique_ptr out_file; + ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok()); + ASSERT_TRUE(builder.finish(out_file.get(), &index_meta).ok()); + ASSERT_EQ(ZONE_MAP_INDEX, index_meta.type()); + } + + + ZoneMapIndexReader column_zone_map(filename, &index_meta.zone_map_index()); + Status status = column_zone_map.load(true, false); ASSERT_TRUE(status.ok()); ASSERT_EQ(3, column_zone_map.num_pages()); - const std::vector& zone_maps = column_zone_map.get_column_zone_map(); + const std::vector& zone_maps = column_zone_map.page_zone_maps(); ASSERT_EQ(3, zone_maps.size()); ASSERT_EQ("aaaa", zone_maps[0].min()); ASSERT_EQ("ffff", zone_maps[0].max()); @@ -69,31 +98,39 @@ public: // Test for int TEST_F(ColumnZoneMapTest, NormalTestIntPage) { + std::string filename = kTestDir + "/NormalTestIntPage"; + TabletColumn int_column = create_int_key(0); Field* field = FieldFactory::create(int_column); - ColumnZoneMapBuilder builder(field); + ZoneMapIndexWriter builder(field); std::vector values1 = {1, 10, 11, 20, 21, 22}; for (auto value : values1) { - builder.add((const uint8_t*)&value, 1); + builder.add_values((const uint8_t*)&value, 1); } builder.flush(); std::vector values2 = {2, 12, 31, 23, 21, 22}; for (auto value : values2) { - builder.add((const uint8_t*)&value, 1); + builder.add_values((const uint8_t*)&value, 1); } - builder.add(nullptr, 1); + builder.add_nulls(1); builder.flush(); - for (int i = 0; i < 6; ++i) { - builder.add(nullptr, 1); + builder.add_nulls(6); + builder.flush(); + // write out zone map index + ColumnIndexMetaPB index_meta; + { + std::unique_ptr out_file; + ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok()); + ASSERT_TRUE(builder.finish(out_file.get(), 
&index_meta).ok()); + ASSERT_EQ(ZONE_MAP_INDEX, index_meta.type()); } - builder.flush(); - OwnedSlice zone_map_page = builder.finish(); - ColumnZoneMap column_zone_map(zone_map_page.slice()); - Status status = column_zone_map.load(); + + ZoneMapIndexReader column_zone_map(filename, &index_meta.zone_map_index()); + Status status = column_zone_map.load(true, false); ASSERT_TRUE(status.ok()); ASSERT_EQ(3, column_zone_map.num_pages()); - const std::vector& zone_maps = column_zone_map.get_column_zone_map(); + const std::vector& zone_maps = column_zone_map.page_zone_maps(); ASSERT_EQ(3, zone_maps.size()); ASSERT_EQ(std::to_string(1), zone_maps[0].min()); @@ -114,14 +151,14 @@ TEST_F(ColumnZoneMapTest, NormalTestIntPage) { TEST_F(ColumnZoneMapTest, NormalTestVarcharPage) { TabletColumn varchar_column = create_varchar_key(0); Field* field = FieldFactory::create(varchar_column); - test_string(field); + test_string("NormalTestVarcharPage", field); } // Test for string TEST_F(ColumnZoneMapTest, NormalTestCharPage) { TabletColumn char_column = create_char_key(0); Field* field = FieldFactory::create(char_column); - test_string(field); + test_string("NormalTestCharPage", field); } } diff --git a/be/test/olap/short_key_index_test.cpp b/be/test/olap/short_key_index_test.cpp index 1b825e9320..6722207436 100644 --- a/be/test/olap/short_key_index_test.cpp +++ b/be/test/olap/short_key_index_test.cpp @@ -35,20 +35,25 @@ public: TEST_F(ShortKeyIndexTest, buider) { ShortKeyIndexBuilder builder(0, 1024); + int num_items = 0; for (int i = 1000; i < 10000; i += 2) { builder.add_item(std::to_string(i)); + num_items++; } std::vector slices; - auto st = builder.finalize(10000, 9000 * 1024, &slices); + segment_v2::PageFooterPB footer; + auto st = builder.finalize(9000 * 1024, &slices, &footer); ASSERT_TRUE(st.ok()); + ASSERT_EQ(segment_v2::SHORT_KEY_PAGE, footer.type()); + ASSERT_EQ(num_items, footer.short_key_page_footer().num_items()); std::string buf; for (auto& slice : slices) { 
buf.append(slice.data, slice.size); } - ShortKeyIndexDecoder decoder(buf); - st = decoder.parse(); + ShortKeyIndexDecoder decoder; + st = decoder.parse(buf, footer.short_key_page_footer()); ASSERT_TRUE(st.ok()); // find 1499 diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto index ad9cb27682..584d409717 100644 --- a/gensrc/proto/segment_v2.proto +++ b/gensrc/proto/segment_v2.proto @@ -20,20 +20,6 @@ syntax="proto2"; package doris.segment_v2; -message ColumnSchemaPB { - optional uint32 column_id = 1; - optional string type = 2; - optional string aggregation = 3; - optional uint32 length = 4; - optional bool is_key = 5; - optional string default_value = 6; - optional uint32 precision = 9 [default = 27]; - optional uint32 frac = 10 [default = 9]; - optional bool is_nullable = 11 [default=false]; - optional bool is_bf_column = 15 [default=false]; // is bloom filter indexed column - optional bool has_bitmap_index = 16 [default=false]; -} - // page position info message PagePointerPB { required uint64 offset = 1; // offset in segment file @@ -67,6 +53,75 @@ enum CompressionTypePB { ZSTD = 7; } +enum PageTypePB { + UNKNOWN_PAGE_TYPE = 0; + DATA_PAGE = 1; + INDEX_PAGE = 2; + DICTIONARY_PAGE = 3; + SHORT_KEY_PAGE = 4; +} + +message DataPageFooterPB { + // required: ordinal of the first value + optional uint64 first_ordinal = 1; + // required: number of values, including NULLs + optional uint64 num_values = 2; + // required: size of nullmap, 0 if the page doesn't contain NULL + optional uint32 nullmap_size = 3; + // only for array column, largest array item ordinal + 1, + // used to calculate the length of last array in this page + optional uint64 next_array_item_ordinal = 4; +} + +message IndexPageFooterPB { + // required: number of index entries in this page + optional uint32 num_entries = 1; + + enum Type { + UNKNOWN_INDEX_PAGE_TYPE = 0; + LEAF = 1; + INTERNAL = 2; + }; + // required: type of the index page + optional Type type = 2; +} + +message 
DictPageFooterPB { + // required: encoding for dictionary + optional EncodingTypePB encoding = 1; +} + +message ShortKeyFooterPB { + // How many index item in this index. + optional uint32 num_items = 1; + // The total bytes occupied by the index key + optional uint32 key_bytes = 2; + // The total bytes occupied by the key offsets + optional uint32 offset_bytes = 3; + // Segment id which this index is belong to + optional uint32 segment_id = 4; + // number rows in each block + optional uint32 num_rows_per_block = 5; + // How many rows in this segment + optional uint32 num_segment_rows = 6; +} + +message PageFooterPB { + // required: indicates which of the *_footer fields is set + optional PageTypePB type = 1; + // required: page body size before compression (exclude footer and crc). + // page body is uncompressed when it's equal to page body size + optional uint32 uncompressed_size = 2; + // present only when type == DATA_PAGE + optional DataPageFooterPB data_page_footer = 7; + // present only when type == INDEX_PAGE + optional IndexPageFooterPB index_page_footer = 8; + // present only when type == DICTIONARY_PAGE + optional DictPageFooterPB dict_page_footer = 9; + // present only when type == SHORT_KEY_PAGE + optional ShortKeyFooterPB short_key_page_footer = 10; +} + message ZoneMapPB { // minimum not-null value, invalid when all values are null(has_not_null==false) optional bytes min = 1; @@ -85,72 +140,17 @@ message ColumnMetaPB { optional uint32 unique_id = 2; // this field is FieldType's value optional int32 type = 3; - optional EncodingTypePB encoding = 4; - // compress type for column - optional CompressionTypePB compression = 5; - // if this column can be nullable - optional bool is_nullable = 6; - // ordinal index page - optional PagePointerPB ordinal_index_page = 7; - // page-level zone map index - optional PagePointerPB zone_map_page = 8; - // segment-level zone map - optional ZoneMapPB zone_map = 9; - // // dictionary page for DICT_ENCODING - optional 
PagePointerPB dict_page = 10; - // bitmap index - optional BitmapIndexColumnPB bitmap_index = 11; // var length for string type - optional int32 length = 12; - // bloom filter index - optional BloomFilterIndexPB bloom_filter_index = 13; - - // // bloom filter pages for bloom filter column - // repeated PagePointerPB bloom_filter_pages = 3; - - // optional PagePointerPB page_zonemap_page = 5; // page zonemap info of column - - // optional PagePointerPB bitmap_index_page = 6; // bitmap index page - - // // data footprint of column after encoding and compress - // optional uint64 data_footprint = 7; - // // index footprint of column after encoding and compress - // optional uint64 index_footprint = 8; - // // raw column data footprint - // optional uint64 raw_data_footprint = 9; - - // optional ZoneMapPB column_zonemap = 11; // column zonemap info - // repeated MetadataPairPB column_meta_datas = 12; -} - -message FileFooterPB { - optional uint32 version = 1 [default = 1]; // file version - repeated ColumnSchemaPB schema = 2; // tablet schema - optional uint64 num_values = 3; // number of values - optional uint64 index_footprint = 4; // total idnex footprint of all columns - optional uint64 data_footprint = 5; // total data footprint of all columns - optional uint64 raw_data_footprint = 6; // raw data footprint - - optional CompressionTypePB compress_type = 7 [default = LZ4F]; // default compression type for file columns - repeated MetadataPairPB file_meta_datas = 8; // meta data of file - optional PagePointerPB key_index_page = 9; // short key index page -} - -message ShortKeyFooterPB { - // How many index item in this index. 
- optional uint32 num_items = 1; - // The total bytes occupied by the index key - optional uint32 key_bytes = 2; - // The total bytes occupied by the key offsets - optional uint32 offset_bytes = 3; - // Segment id which this index is belong to - optional uint32 segment_id = 4; - // number rows in each block - optional uint32 num_rows_per_block = 5; - // How many rows in this segment - optional uint32 num_segment_rows = 6; - // Total bytes for this segment - optional uint32 segment_bytes = 7; + optional int32 length = 4; + optional EncodingTypePB encoding = 5; + // compress type for column + optional CompressionTypePB compression = 6; + // if this column can be nullable + optional bool is_nullable = 7; + // metadata about all the column indexes + repeated ColumnIndexMetaPB indexes = 8; + // pointer to dictionary page when using DICT_ENCODING + optional PagePointerPB dict_page = 9; } message SegmentFooterPB { @@ -168,19 +168,6 @@ message SegmentFooterPB { optional PagePointerPB short_key_index_page = 9; } -message IndexPageFooterPB { - // required: number of entries in this page - optional int32 num_entries = 1; - - enum Type { - UNKNOWN_INDEX_PAGE_TYPE = 0; - LEAF = 1; - INTERNAL = 2; - }; - // required: type of the index page - optional Type type = 2; -} - message BTreeMetaPB { // required: pointer to either root index page or sole data page based on is_root_data_page optional PagePointerPB root_page = 1; @@ -205,22 +192,53 @@ message IndexedColumnMetaPB { optional uint64 size = 7; } -message BitmapIndexColumnPB { +// ------------------------------------------------------------- +// Column Index Metadata +// ------------------------------------------------------------- + +enum ColumnIndexTypePB { + UNKNOWN_INDEX_TYPE = 0; + ORDINAL_INDEX = 1; + ZONE_MAP_INDEX = 2; + BITMAP_INDEX = 3; + BLOOM_FILTER_INDEX = 4; +} + +message ColumnIndexMetaPB { + optional ColumnIndexTypePB type = 1; + optional OrdinalIndexPB ordinal_index = 7; + optional ZoneMapIndexPB zone_map_index 
= 8; + optional BitmapIndexPB bitmap_index = 9; + optional BloomFilterIndexPB bloom_filter_index = 10; +} + +message OrdinalIndexPB { + // required: the root page can be a data page if there is only one data page, + // or the only index page if there is more than one data page. + optional BTreeMetaPB root_page = 1; +} + +message ZoneMapIndexPB { + // required: segment-level zone map + optional ZoneMapPB segment_zone_map = 1; + // required: zone map for each data page is stored in an IndexedColumn with ordinal index + optional IndexedColumnMetaPB page_zone_maps = 2; +} + +message BitmapIndexPB { enum BitmapType { UNKNOWN_BITMAP_TYPE = 0; ROARING_BITMAP = 1; } - optional uint32 column_id = 1; - optional uint32 unique_id = 2; + optional BitmapType bitmap_type = 1 [default=ROARING_BITMAP]; // required: whether the index contains null key. // if true, the last bitmap (ordinal:dict_column.num_values) in bitmap_column is // the bitmap for null key. we don't store null key in dict_column. - optional bool has_null = 3; + optional bool has_null = 2; // required: meta for ordered dictionary part - optional IndexedColumnMetaPB dict_column = 4; + optional IndexedColumnMetaPB dict_column = 3; // required: meta for bitmaps part - optional IndexedColumnMetaPB bitmap_column = 5; - optional BitmapType bitmap_type = 6 [default=ROARING_BITMAP]; + optional IndexedColumnMetaPB bitmap_column = 4; } enum HashStrategyPB { @@ -238,4 +256,4 @@ message BloomFilterIndexPB { optional BloomFilterAlgorithmPB algorithm = 2; // required: meta for bloom filters optional IndexedColumnMetaPB bloom_filter = 3; -} \ No newline at end of file +} diff --git a/run-ut.sh b/run-ut.sh index 5d67c5b760..1216fc4414 100755 --- a/run-ut.sh +++ b/run-ut.sh @@ -282,18 +282,17 @@ ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/ordinal_page_index_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bitshuffle_page_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/plain_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_plain_page_test +${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bitmap_index_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/column_reader_writer_test -${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/index_column_reader_writer_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/rle_page_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_dict_page_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_prefix_page_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/segment_test -${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/page_compression_test -${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/column_zone_map_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/row_ranges_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/frame_of_reference_page_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/block_bloom_filter_test ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test +${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/zone_map_index_test ${DORIS_TEST_BINARY_DIR}/olap/txn_manager_test ${DORIS_TEST_BINARY_DIR}/olap/storage_types_test ${DORIS_TEST_BINARY_DIR}/olap/generic_iterators_test