[segment_v2] Switch to Unified and Extensible Page Format (#2953)

Fixes #2892 

IMPORTANT NOTICE: this CL makes incompatible changes to the V2 storage format; developers need to create new tables for testing.

This CL refactors the metadata and page format for segment_v2 in order to
* make it easy to extend existing page types
* make it easy to add new page types without sacrificing code reuse
* make it possible to use SIMD to speed up page decoding

Here we summarize the main code changes:
* Page and index metadata is redesigned, please see `segment_v2.proto`
* The new class `PageIO` is the single place for reading and writing all pages. This removes lots of duplicated code. `PageCompressor` and `PageDecompressor` are no longer needed and have been removed.
* The type of value ordinal is changed from `rowid_t` to the 64-bit `ordinal_t`; this affects the ordinal index as well.
* Column's ordinal index is now implemented by IndexPage, the same as IndexedColumn.
* Zone map index is now implemented by IndexedColumn
This commit is contained in:
Dayue Gao
2020-02-27 15:09:57 +08:00
committed by GitHub
parent 54b7828c3f
commit d2d95bfa84
55 changed files with 1707 additions and 1818 deletions

View File

@ -93,18 +93,18 @@ add_library(Olap STATIC
rowset/segment_v2/indexed_column_reader.cpp
rowset/segment_v2/indexed_column_writer.cpp
rowset/segment_v2/ordinal_page_index.cpp
rowset/segment_v2/page_compression.cpp
rowset/segment_v2/page_io.cpp
rowset/segment_v2/binary_dict_page.cpp
rowset/segment_v2/binary_prefix_page.cpp
rowset/segment_v2/segment.cpp
rowset/segment_v2/segment_iterator.cpp
rowset/segment_v2/empty_segment_iterator.cpp
rowset/segment_v2/segment_writer.cpp
rowset/segment_v2/column_zone_map.cpp
rowset/segment_v2/block_split_bloom_filter.cpp
rowset/segment_v2/bloom_filter_index_reader.cpp
rowset/segment_v2/bloom_filter_index_writer.cpp
rowset/segment_v2/bloom_filter.cpp
rowset/segment_v2/zone_map_index.cpp
task/engine_batch_load_task.cpp
task/engine_checksum_task.cpp
task/engine_clone_task.cpp

View File

@ -64,6 +64,7 @@ private:
add_mapping<OLAP_FIELD_TYPE_INT>();
add_mapping<OLAP_FIELD_TYPE_UNSIGNED_INT>();
add_mapping<OLAP_FIELD_TYPE_BIGINT>();
add_mapping<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
add_mapping<OLAP_FIELD_TYPE_LARGEINT>();
add_mapping<OLAP_FIELD_TYPE_DATETIME>();

View File

@ -22,15 +22,15 @@
namespace doris {
namespace segment_v2 {
Status BitmapIndexReader::load(bool cache_in_memory) {
const IndexedColumnMetaPB& dict_meta = _bitmap_index_meta.dict_column();
const IndexedColumnMetaPB& bitmap_meta = _bitmap_index_meta.bitmap_column();
_has_null = _bitmap_index_meta.has_null();
Status BitmapIndexReader::load(bool use_page_cache, bool kept_in_memory) {
const IndexedColumnMetaPB& dict_meta = _bitmap_index_meta->dict_column();
const IndexedColumnMetaPB& bitmap_meta = _bitmap_index_meta->bitmap_column();
_has_null = _bitmap_index_meta->has_null();
_dict_column_reader.reset(new IndexedColumnReader(_file_name, dict_meta, cache_in_memory));
_bitmap_column_reader.reset(new IndexedColumnReader(_file_name, bitmap_meta, cache_in_memory));
RETURN_IF_ERROR(_dict_column_reader->load());
RETURN_IF_ERROR(_bitmap_column_reader->load());
_dict_column_reader.reset(new IndexedColumnReader(_file_name, dict_meta));
_bitmap_column_reader.reset(new IndexedColumnReader(_file_name, bitmap_meta));
RETURN_IF_ERROR(_dict_column_reader->load(use_page_cache, kept_in_memory));
RETURN_IF_ERROR(_bitmap_column_reader->load(use_page_cache, kept_in_memory));
return Status::OK();
}

View File

@ -41,13 +41,13 @@ class IndexedColumnIterator;
class BitmapIndexReader {
public:
explicit BitmapIndexReader(const std::string& file_name,
const BitmapIndexColumnPB& bitmap_index_meta)
const BitmapIndexPB* bitmap_index_meta)
: _file_name(file_name),
_bitmap_index_meta(bitmap_index_meta){
_typeinfo = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
}
Status load(bool cache_in_memory);
Status load(bool use_page_cache, bool kept_in_memory);
// create a new column iterator. Client should delete returned iterator
Status new_iterator(BitmapIndexIterator** iterator);
@ -65,7 +65,7 @@ private:
std::string _file_name;
const TypeInfo* _typeinfo;
const BitmapIndexColumnPB& _bitmap_index_meta;
const BitmapIndexPB* _bitmap_index_meta;
bool _has_null = false;
std::unique_ptr<IndexedColumnReader> _dict_column_reader;
std::unique_ptr<IndexedColumnReader> _bitmap_column_reader;

View File

@ -100,8 +100,11 @@ public:
_rid += count;
}
Status finish(WritableFile* file, BitmapIndexColumnPB* meta) override {
meta->set_bitmap_type(BitmapIndexColumnPB::ROARING_BITMAP);
Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) override {
index_meta->set_type(BITMAP_INDEX);
BitmapIndexPB* meta = index_meta->mutable_bitmap_index();
meta->set_bitmap_type(BitmapIndexPB::ROARING_BITMAP);
meta->set_has_null(!_null_bitmap.isEmpty());
{ // write dictionary

View File

@ -42,7 +42,7 @@ public:
virtual void add_nulls(uint32_t count) = 0;
virtual Status finish(WritableFile* file, BitmapIndexColumnPB* meta) = 0;
virtual Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) = 0;
virtual uint64_t size() const = 0;
private:

View File

@ -23,11 +23,11 @@
namespace doris {
namespace segment_v2 {
Status BloomFilterIndexReader::load(bool cache_in_memory) {
const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta.bloom_filter();
Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory) {
const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter();
_bloom_filter_reader.reset(new IndexedColumnReader(_file_name, bf_index_meta, cache_in_memory));
RETURN_IF_ERROR(_bloom_filter_reader->load());
_bloom_filter_reader.reset(new IndexedColumnReader(_file_name, bf_index_meta));
RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory));
return Status::OK();
}
@ -48,8 +48,8 @@ Status BloomFilterIndexIterator::read_bloom_filter(rowid_t ordinal, std::unique_
RETURN_IF_ERROR(_bloom_filter_iter.next_batch(&num_read, &column_block_view));
DCHECK(num_to_read == num_read);
// construct bloom filter
BloomFilter::create(_reader->_bloom_filter_index_meta.algorithm(), bf);
RETURN_IF_ERROR((*bf)->init(value.data, value.size, _reader->_bloom_filter_index_meta.hash_strategy()));
BloomFilter::create(_reader->_bloom_filter_index_meta->algorithm(), bf);
RETURN_IF_ERROR((*bf)->init(value.data, value.size, _reader->_bloom_filter_index_meta->hash_strategy()));
_pool->clear();
return Status::OK();
}

View File

@ -44,13 +44,13 @@ class BloomFilter;
class BloomFilterIndexReader {
public:
explicit BloomFilterIndexReader(const std::string& file_name,
const BloomFilterIndexPB& bloom_filter_index_meta)
const BloomFilterIndexPB* bloom_filter_index_meta)
: _file_name(file_name),
_bloom_filter_index_meta(bloom_filter_index_meta) {
_typeinfo = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
}
Status load(bool cache_in_memory);
Status load(bool use_page_cache, bool kept_in_memory);
// create a new column iterator.
Status new_iterator(std::unique_ptr<BloomFilterIndexIterator>* iterator);
@ -64,7 +64,7 @@ private:
std::string _file_name;
const TypeInfo* _typeinfo;
BloomFilterIndexPB _bloom_filter_index_meta;
const BloomFilterIndexPB* _bloom_filter_index_meta;
std::unique_ptr<IndexedColumnReader> _bloom_filter_reader;
};

View File

@ -104,10 +104,12 @@ public:
return Status::OK();
}
Status finish(WritableFile* file, BloomFilterIndexPB* meta) override {
Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) override {
if (_values.size() > 0) {
RETURN_IF_ERROR(flush());
}
index_meta->set_type(BLOOM_FILTER_INDEX);
BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index();
meta->set_hash_strategy(_bf_options.strategy);
meta->set_algorithm(BLOCK_BLOOM_FILTER);
@ -118,7 +120,7 @@ public:
options.write_value_index = false;
options.encoding = PLAIN_ENCODING;
IndexedColumnWriter bf_writer(options, bf_typeinfo, file);
bf_writer.init();
RETURN_IF_ERROR(bf_writer.init());
for (auto& bf : _bfs) {
Slice data(bf->data(), bf->size());
bf_writer.add(&data);

View File

@ -47,7 +47,7 @@ public:
virtual Status flush() = 0;
virtual Status finish(WritableFile* file, BloomFilterIndexPB* meta) = 0;
virtual Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta) = 0;
virtual uint64_t size() = 0;
private:

View File

@ -21,19 +21,14 @@
#include "env/env.h" // for RandomAccessFile
#include "gutil/strings/substitute.h" // for Substitute
#include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo
#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer
#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer
#include "olap/rowset/segment_v2/page_compression.h"
#include "olap/rowset/segment_v2/options.h" // for PageDecoderOptions
#include "olap/types.h" // for TypeInfo
#include "olap/column_block.h" // for ColumnBlockView
#include "olap/page_cache.h"
#include "util/coding.h" // for get_varint32
#include "util/crc32c.h"
#include "util/rle_encoding.h" // for RleDecoder
#include "util/block_compression.h"
#include "util/file_manager.h"
#include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder
#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
@ -70,6 +65,31 @@ Status ColumnReader::init() {
}
RETURN_IF_ERROR(EncodingInfo::get(_type_info, _meta.encoding(), &_encoding_info));
RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &_compress_codec));
for (int i = 0; i < _meta.indexes_size(); i++) {
auto& index_meta = _meta.indexes(i);
switch (index_meta.type()) {
case ORDINAL_INDEX:
_ordinal_index_meta = &index_meta.ordinal_index();
break;
case ZONE_MAP_INDEX:
_zone_map_index_meta = &index_meta.zone_map_index();
break;
case BITMAP_INDEX:
_bitmap_index_meta = &index_meta.bitmap_index();
break;
case BLOOM_FILTER_INDEX:
_bf_index_meta = &index_meta.bloom_filter_index();
break;
default:
return Status::Corruption(Substitute(
"Bad file $0: invalid column index type $1", _file_name, index_meta.type()));
}
}
if (_ordinal_index_meta == nullptr) {
return Status::Corruption(Substitute(
"Bad file $0: missing ordinal index for column $1", _file_name, _meta.column_id()));
}
return Status::OK();
}
@ -80,85 +100,23 @@ Status ColumnReader::new_iterator(ColumnIterator** iterator) {
Status ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) {
RETURN_IF_ERROR(_ensure_index_loaded());
RETURN_IF_ERROR(_bitmap_index_reader->new_iterator(iterator));
RETURN_IF_ERROR(_bitmap_index->new_iterator(iterator));
return Status::OK();
}
Status ColumnReader::read_page(const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle) {
OpenedFileHandle<RandomAccessFile> file_handle;
RETURN_IF_ERROR(FileManager::instance()->open_file(_file_name, &file_handle));
RandomAccessFile* input_file = file_handle.file();
return read_page(input_file, pp, opts, handle);
}
Status ColumnReader::read_page(const ColumnIteratorOptions& iter_opts, const PagePointer& pp,
PageHandle* handle, Slice* page_body, PageFooterPB* footer) {
iter_opts.sanity_check();
PageReadOptions opts;
opts.file = iter_opts.file;
opts.page_pointer = pp;
opts.codec = _compress_codec;
opts.stats = iter_opts.stats;
opts.verify_checksum = _opts.verify_checksum;
opts.use_page_cache = iter_opts.use_page_cache;
opts.kept_in_memory = _opts.kept_in_memory;
Status ColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp,
const ColumnIteratorOptions& iter_opts, PageHandle* handle) {
iter_opts.stats->total_pages_num++;
auto cache = StoragePageCache::instance();
PageCacheHandle cache_handle;
StoragePageCache::CacheKey cache_key(file->file_name(), pp.offset);
if (iter_opts.use_page_cache && cache->lookup(cache_key, &cache_handle)) {
// we find page in cache, use it
*handle = PageHandle(std::move(cache_handle));
iter_opts.stats->cached_pages_num++;
return Status::OK();
}
// Now we read this from file.
size_t page_size = pp.size;
if (page_size < sizeof(uint32_t)) {
return Status::Corruption(Substitute("Bad page, page size is too small, size=$0", page_size));
}
// Now we use this buffer to store page from storage, if this page is compressed
// this buffer will assigned uncompressed page, and origin content will be freed.
std::unique_ptr<uint8_t[]> page(new uint8_t[page_size]);
Slice page_slice(page.get(), page_size);
{
SCOPED_RAW_TIMER(&iter_opts.stats->io_ns);
RETURN_IF_ERROR(file->read_at(pp.offset, page_slice));
iter_opts.stats->compressed_bytes_read += page_size;
}
size_t data_size = page_size - 4;
if (_opts.verify_checksum) {
uint32_t expect = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4);
uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4);
if (expect != actual) {
return Status::Corruption(
Substitute("Page checksum mismatch, actual=$0 vs expect=$1", actual, expect));
}
}
// remove page's suffix
page_slice.size = data_size;
if (_compress_codec != nullptr) {
PageDecompressor decompressor(page_slice, _compress_codec);
Slice uncompressed_page;
{
SCOPED_RAW_TIMER(&iter_opts.stats->decompress_ns);
RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_page));
}
// If decompressor create new heap memory for uncompressed data,
// assign this uncompressed page to page and page slice
if (uncompressed_page.data != page_slice.data) {
page.reset((uint8_t*)uncompressed_page.data);
}
page_slice = uncompressed_page;
iter_opts.stats->uncompressed_bytes_read += page_slice.size;
}
if (iter_opts.use_page_cache) {
// insert this into cache and return the cache handle
cache->insert(cache_key, page_slice, &cache_handle, _opts.cache_in_memory);
*handle = PageHandle(std::move(cache_handle));
} else {
*handle = PageHandle(page_slice);
}
page.release();
return Status::OK();
return PageIO::read_and_decompress_page(opts, handle, page_body, footer);
}
Status ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column,
@ -173,32 +131,57 @@ Status ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column,
return Status::OK();
}
// Checks `cond` against the segment-level zone map held in the column's
// zone map index metadata.
// Returns true when there is no predicate or no zone map index metadata (nothing
// to prune with); otherwise returns the result of evaluating `cond` on the
// segment zone map via _zone_map_match_condition().
bool ColumnReader::match_condition(CondColumn* cond) const {
    // Without a predicate every segment trivially matches.
    if (cond == nullptr) {
        return true;
    }
    // Without zone map metadata the segment cannot be ruled out.
    if (_zone_map_index_meta == nullptr) {
        return true;
    }

    // Scratch fields that receive the decoded min/max of the segment zone map.
    const FieldType field_type = _type_info->type();
    std::unique_ptr<WrapperField> lower_bound(
            WrapperField::create_by_type(field_type, _meta.length()));
    std::unique_ptr<WrapperField> upper_bound(
            WrapperField::create_by_type(field_type, _meta.length()));

    const ZoneMapPB& segment_zone_map = _zone_map_index_meta->segment_zone_map();
    return _zone_map_match_condition(segment_zone_map, lower_bound.get(), upper_bound.get(), cond);
}
// Decodes `zone_map` into the caller-supplied containers and reports whether
// `cond` could be satisfied by the zone.
//
// zone_map:            min/max strings plus the has_null / has_not_null flags of one zone.
// min_value_container: receives the decoded lower bound (or null, see below).
// max_value_container: receives the decoded upper bound (or null, see below).
// cond:                query predicate; may be nullptr, meaning "no predicate".
//
// The containers are filled BEFORE the `cond == nullptr` early return. Callers such as
// _get_filtered_pages() go on to evaluate delete conditions on the very same containers
// after this call, so they must reflect the current zone even when no predicate is given.
//
// Returns true when `cond` is nullptr or `cond->eval` accepts [min, max];
// returns false when the zone contains no data or `cond->eval` rejects it.
bool ColumnReader::_zone_map_match_condition(const ZoneMapPB& zone_map,
                                             WrapperField* min_value_container,
                                             WrapperField* max_value_container,
                                             CondColumn* cond) const {
    // min value and max value are valid if has_not_null is true
    if (zone_map.has_not_null()) {
        min_value_container->from_string(zone_map.min());
        max_value_container->from_string(zone_map.max());
    }
    // for compatible original Cond eval logic
    // TODO(hkp): optimize OlapCond
    if (zone_map.has_null()) {
        // for compatible, if exist null, original logic treat null as min
        min_value_container->set_null();
        if (!zone_map.has_not_null()) {
            // for compatible OlapCond's 'is not null'
            max_value_container->set_null();
        }
    }
    // No predicate: nothing can reject the zone (return value unchanged from before).
    if (cond == nullptr) {
        return true;
    }
    if (!zone_map.has_not_null() && !zone_map.has_null()) {
        return false; // no data in this zone
    }
    return cond->eval({min_value_container, max_value_container});
}
Status ColumnReader::_get_filtered_pages(CondColumn* cond_column,
const std::vector<CondColumn*>& delete_conditions,
std::vector<uint32_t>* delete_partial_filtered_pages,
std::vector<uint32_t>* page_indexes) {
FieldType type = _type_info->type();
const std::vector<ZoneMapPB>& zone_maps = _column_zone_map->get_column_zone_map();
int32_t page_size = _column_zone_map->num_pages();
const std::vector<ZoneMapPB>& zone_maps = _zone_map_index->page_zone_maps();
int32_t page_size = _zone_map_index->num_pages();
std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, _meta.length()));
std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, _meta.length()));
for (int32_t i = 0; i < page_size; ++i) {
// min value and max value are valid if has_not_null is true
if (zone_maps[i].has_not_null()) {
min_value->from_string(zone_maps[i].min());
max_value->from_string(zone_maps[i].max());
}
// for compatible original Cond eval logic
// TODO(hkp): optimize OlapCond
if (zone_maps[i].has_null()) {
// for compatible, if exist null, original logic treat null as min
min_value->set_null();
if (!zone_maps[i].has_not_null()) {
// for compatible OlapCond's 'is not null'
max_value->set_null();
}
}
if (cond_column == nullptr || cond_column->eval({min_value.get(), max_value.get()})) {
if (_zone_map_match_condition(zone_maps[i], min_value.get(), max_value.get(), cond_column)) {
bool should_read = true;
for (auto& col_cond : delete_conditions) {
int state = col_cond->del_eval({min_value.get(), max_value.get()});
@ -220,8 +203,8 @@ Status ColumnReader::_get_filtered_pages(CondColumn* cond_column,
Status ColumnReader::_calculate_row_ranges(const std::vector<uint32_t>& page_indexes, RowRanges* row_ranges) {
row_ranges->clear();
for (auto i : page_indexes) {
rowid_t page_first_id = _ordinal_index->get_first_row_id(i);
rowid_t page_last_id = _ordinal_index->get_last_row_id(i);
ordinal_t page_first_id = _ordinal_index->get_first_ordinal(i);
ordinal_t page_last_id = _ordinal_index->get_last_ordinal(i);
RowRanges page_row_ranges(RowRanges::create_single(page_first_id, page_last_id + 1));
RowRanges::ranges_union(*row_ranges, page_row_ranges, row_ranges);
}
@ -232,7 +215,7 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row
RETURN_IF_ERROR(_ensure_index_loaded());
RowRanges bf_row_ranges;
std::unique_ptr<BloomFilterIndexIterator> bf_iter;
RETURN_IF_ERROR(_bloom_filter_index_reader->new_iterator(&bf_iter));
RETURN_IF_ERROR(_bloom_filter_index->new_iterator(&bf_iter));
size_t range_size = row_ranges->range_size();
// get covered page ids
std::set<uint32_t> page_ids;
@ -242,8 +225,8 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row
int64_t to = row_ranges->get_range_to(i);
auto iter = _ordinal_index->seek_at_or_before(from);
while (idx < to) {
page_ids.insert(iter.cur_idx());
idx = iter.cur_page_last_row_id() + 1;
page_ids.insert(iter.page_index());
idx = iter.last_ordinal() + 1;
iter.next();
}
}
@ -251,69 +234,40 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(CondColumn* cond_column, Row
std::unique_ptr<BloomFilter> bf;
RETURN_IF_ERROR(bf_iter->read_bloom_filter(pid, &bf));
if (cond_column->eval(bf.get())) {
bf_row_ranges.add(RowRange(_ordinal_index->get_first_row_id(pid),
_ordinal_index->get_last_row_id(pid) + 1));
bf_row_ranges.add(RowRange(_ordinal_index->get_first_ordinal(pid),
_ordinal_index->get_last_ordinal(pid) + 1));
}
}
RowRanges::ranges_intersection(*row_ranges, bf_row_ranges, row_ranges);
return Status::OK();
}
Status ColumnReader::_load_ordinal_index() {
PagePointer pp = _meta.ordinal_index_page();
PageHandle ph;
OlapReaderStatistics stats;
ColumnIteratorOptions opts;
// column index only load once, so we use global config to decide
if (!config::disable_storage_page_cache) {
opts.use_page_cache = true;
}
opts.stats = &stats;
RETURN_IF_ERROR(read_page(pp, opts, &ph));
_ordinal_index.reset(new OrdinalPageIndex(ph.data(), _num_rows));
RETURN_IF_ERROR(_ordinal_index->load());
return Status::OK();
Status ColumnReader::_load_ordinal_index(bool use_page_cache, bool kept_in_memory) {
DCHECK(_ordinal_index_meta != nullptr);
_ordinal_index.reset(new OrdinalIndexReader(_file_name, _ordinal_index_meta, _num_rows));
return _ordinal_index->load(use_page_cache, kept_in_memory);
}
Status ColumnReader::_load_zone_map_index() {
if (_meta.has_zone_map_page()) {
PagePointer pp = _meta.zone_map_page();
PageHandle ph;
OlapReaderStatistics stats;
ColumnIteratorOptions opts;
// column index only load once, so we use global config to decide
if (!config::disable_storage_page_cache) {
opts.use_page_cache = true;
}
opts.stats = &stats;
RETURN_IF_ERROR(read_page(pp, opts, &ph));
_column_zone_map.reset(new ColumnZoneMap(ph.data()));
RETURN_IF_ERROR(_column_zone_map->load());
} else {
_column_zone_map.reset(nullptr);
Status ColumnReader::_load_zone_map_index(bool use_page_cache, bool kept_in_memory) {
if (_zone_map_index_meta != nullptr) {
_zone_map_index.reset(new ZoneMapIndexReader(_file_name, _zone_map_index_meta));
return _zone_map_index->load(use_page_cache, kept_in_memory);
}
return Status::OK();
}
Status ColumnReader::_load_bitmap_index() {
if (_meta.has_bitmap_index()) {
const BitmapIndexColumnPB& bitmap_index_meta = _meta.bitmap_index();
_bitmap_index_reader.reset(new BitmapIndexReader(_file_name, bitmap_index_meta));
RETURN_IF_ERROR(_bitmap_index_reader->load(_opts.cache_in_memory));
} else {
_bitmap_index_reader.reset(nullptr);
Status ColumnReader::_load_bitmap_index(bool use_page_cache, bool kept_in_memory) {
if (_bitmap_index_meta != nullptr) {
_bitmap_index.reset(new BitmapIndexReader(_file_name, _bitmap_index_meta));
return _bitmap_index->load(use_page_cache, kept_in_memory);
}
return Status::OK();
}
Status ColumnReader::_load_bloom_filter_index() {
if (_meta.has_bloom_filter_index()) {
const BloomFilterIndexPB& bloom_filter_index_meta = _meta.bloom_filter_index();
_bloom_filter_index_reader.reset(new BloomFilterIndexReader(_file_name, bloom_filter_index_meta));
RETURN_IF_ERROR(_bloom_filter_index_reader->load(_opts.cache_in_memory));
} else {
_bloom_filter_index_reader.reset(nullptr);
Status ColumnReader::_load_bloom_filter_index(bool use_page_cache, bool kept_in_memory) {
if (_bf_index_meta != nullptr) {
_bloom_filter_index.reset(new BloomFilterIndexReader(_file_name, _bf_index_meta));
return _bloom_filter_index->load(use_page_cache, kept_in_memory);
}
return Status::OK();
}
@ -327,11 +281,11 @@ Status ColumnReader::seek_to_first(OrdinalPageIndexIterator* iter) {
return Status::OK();
}
Status ColumnReader::seek_at_or_before(rowid_t rowid, OrdinalPageIndexIterator* iter) {
Status ColumnReader::seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter) {
RETURN_IF_ERROR(_ensure_index_loaded());
*iter = _ordinal_index->seek_at_or_before(rowid);
*iter = _ordinal_index->seek_at_or_before(ordinal);
if (!iter->valid()) {
return Status::NotFound(Substitute("Failed to seek to rowid $0, ", rowid));
return Status::NotFound(Substitute("Failed to seek to ordinal $0, ", ordinal));
}
return Status::OK();
}
@ -343,38 +297,34 @@ FileColumnIterator::~FileColumnIterator() = default;
Status FileColumnIterator::seek_to_first() {
RETURN_IF_ERROR(_reader->seek_to_first(&_page_iter));
_page.reset(new ParsedPage());
RETURN_IF_ERROR(_read_page(_page_iter, _page.get()));
RETURN_IF_ERROR(_read_data_page(_page_iter));
_seek_to_pos_in_page(_page.get(), 0);
_current_rowid = 0;
_current_ordinal = 0;
return Status::OK();
}
Status FileColumnIterator::seek_to_ordinal(rowid_t rid) {
Status FileColumnIterator::seek_to_ordinal(ordinal_t ord) {
// if current page contains this row, we don't need to seek
if (_page == nullptr || !_page->contains(rid)) {
RETURN_IF_ERROR(_reader->seek_at_or_before(rid, &_page_iter));
_page.reset(new ParsedPage());
RETURN_IF_ERROR(_read_page(_page_iter, _page.get()));
if (_page == nullptr || !_page->contains(ord)) {
RETURN_IF_ERROR(_reader->seek_at_or_before(ord, &_page_iter));
RETURN_IF_ERROR(_read_data_page(_page_iter));
}
_seek_to_pos_in_page(_page.get(), rid - _page->first_rowid);
_current_rowid = rid;
_seek_to_pos_in_page(_page.get(), ord - _page->first_ordinal);
_current_ordinal = ord;
return Status::OK();
}
void FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, uint32_t offset_in_page) {
void FileColumnIterator::_seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page) {
if (page->offset_in_page == offset_in_page) {
// fast path, do nothing
return;
}
uint32_t pos_in_data = offset_in_page;
if (_reader->is_nullable()) {
rowid_t offset_in_data = 0;
rowid_t skips = offset_in_page;
ordinal_t pos_in_data = offset_in_page;
if (_page->has_null) {
ordinal_t offset_in_data = 0;
ordinal_t skips = offset_in_page;
if (offset_in_page > page->offset_in_page) {
// forward, reuse null bitmap
@ -415,8 +365,8 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) {
// number of rows to be read from this page
size_t nrows_in_page = std::min(remaining, _page->remaining());
size_t nrows_to_read = nrows_in_page;
if (_reader->is_nullable()) {
// when this column is nullable we read data in some runs
if (_page->has_null) {
// when this page contains NULLs we read data in some runs
// first we read null bits in the same value, if this is null, we
// don't need to read value from page.
// If this is not null, we read data from page in batch.
@ -438,7 +388,7 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) {
nrows_to_read -= this_run;
_page->offset_in_page += this_run;
dst->advance(this_run);
_current_rowid += this_run;
_current_ordinal += this_run;
}
} else {
RETURN_IF_ERROR(_page->data_decoder->next_batch(&nrows_to_read, dst));
@ -450,7 +400,7 @@ Status FileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst) {
_page->offset_in_page += nrows_to_read;
dst->advance(nrows_to_read);
_current_rowid += nrows_to_read;
_current_ordinal += nrows_to_read;
}
remaining -= nrows_in_page;
}
@ -467,70 +417,46 @@ Status FileColumnIterator::_load_next_page(bool* eos) {
*eos = true;
return Status::OK();
}
_page.reset(new ParsedPage());
RETURN_IF_ERROR(_read_page(_page_iter, _page.get()));
RETURN_IF_ERROR(_read_data_page(_page_iter));
_seek_to_pos_in_page(_page.get(), 0);
*eos = false;
return Status::OK();
}
// read one page from file and parse this page to make
// it ready to read
Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, ParsedPage* page) {
page->page_pointer = iter.page();
RETURN_IF_ERROR(_reader->read_page(_file, page->page_pointer, _opts, &page->page_handle));
// TODO(zc): read page from file
Slice data = page->page_handle.data();
Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter) {
PageHandle handle;
Slice page_body;
PageFooterPB footer;
RETURN_IF_ERROR(_reader->read_page(_opts, iter.page(), &handle, &page_body, &footer));
// parse data page
RETURN_IF_ERROR(ParsedPage::create(
std::move(handle), page_body, footer.data_page_footer(), _reader->encoding_info(),
iter.page(), iter.page_index(), &_page));
// decode first rowid
if (!get_varint32(&data, &page->first_rowid)) {
return Status::Corruption("Bad page, failed to decode first rowid");
}
// decode number rows
if (!get_varint32(&data, &page->num_rows)) {
return Status::Corruption("Bad page, failed to decode rows count");
}
if (_reader->is_nullable()) {
uint32_t null_bitmap_size = 0;
if (!get_varint32(&data, &null_bitmap_size)) {
return Status::Corruption("Bad page, failed to decode null bitmap size");
}
if (null_bitmap_size > data.size) {
return Status::Corruption(
Substitute("Bad page, null bitmap too large $0 vs $1", null_bitmap_size, data.size));
}
page->null_decoder = RleDecoder<bool>((uint8_t*)data.data, null_bitmap_size, 1);
page->null_bitmap = Slice(data.data, null_bitmap_size);
// remove null bitmap
data.remove_prefix(null_bitmap_size);
}
// create page data decoder
PageDecoderOptions options;
RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder));
RETURN_IF_ERROR(page->data_decoder->init());
// lazy init dict_encoding'dict for three reasons
// 1. a column use dictionary encoding still has non-dict-encoded data pages are seeked,load dict when necessary
// 2. ColumnReader which is owned by Segment and Rowset can being alive even when there is no query,it should retain memory as small as possible.
// 3. Iterators of the same column won't repeat load the dict page because of page cache.
// dictionary page is read when the first data page that uses it is read,
// this is to optimize the memory usage: when there is no query on one column, we could
// release the memory of dictionary page.
// note that concurrent iterators for the same column won't repeatedly read dictionary page
// because of page cache.
if (_reader->encoding_info()->encoding() == DICT_ENCODING) {
BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder;
if (binary_dict_page_decoder->is_dict_encoding()) {
auto dict_page_decoder = reinterpret_cast<BinaryDictPageDecoder*>(_page->data_decoder);
if (dict_page_decoder->is_dict_encoding()) {
if (_dict_decoder == nullptr) {
PagePointer pp = _reader->get_dict_page_pointer();
RETURN_IF_ERROR(_reader->read_page(_file, pp, _opts, &_dict_page_handle));
_dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data()));
// read dictionary page
Slice dict_data;
PageFooterPB dict_footer;
RETURN_IF_ERROR(_reader->read_page(
_opts, _reader->get_dict_page_pointer(),
&_dict_page_handle, &dict_data, &dict_footer));
// ignore dict_footer.dict_page_footer().encoding() due to only
// PLAIN_ENCODING is supported for dict page right now
_dict_decoder.reset(new BinaryPlainPageDecoder(dict_data));
RETURN_IF_ERROR(_dict_decoder->init());
}
binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get());
dict_page_decoder->set_dict_decoder(_dict_decoder.get());
}
}
page->offset_in_page = 0;
page->page_index = iter.cur_idx();
return Status::OK();
}

View File

@ -21,17 +21,18 @@
#include <cstddef> // for size_t
#include <memory> // for unique_ptr
#include "common/logging.h"
#include "common/status.h" // for Status
#include "gen_cpp/segment_v2.pb.h" // for ColumnMetaPB
#include "olap/olap_cond.h" // for CondColumn
#include "olap/tablet_schema.h"
#include "olap/rowset/segment_v2/bitmap_index_reader.h" // for BitmapIndexReader
#include "olap/rowset/segment_v2/common.h" // for rowid_t
#include "olap/rowset/segment_v2/common.h"
#include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator
#include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap
#include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges
#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle
#include "olap/rowset/segment_v2/parsed_page.h" // for ParsedPage
#include "olap/rowset/segment_v2/zone_map_index.h"
#include "util/once.h"
#include "util/file_cache.h"
@ -41,12 +42,13 @@ class ColumnBlock;
class RandomAccessFile;
class TypeInfo;
class BlockCompressionCodec;
class WrapperField;
namespace segment_v2 {
class EncodingInfo;
class PageHandle;
class PagePointer;
struct PagePointer;
class ColumnIterator;
class BloomFilterIndexReader;
@ -54,14 +56,19 @@ struct ColumnReaderOptions {
// whether verify checksum when read page
bool verify_checksum = true;
// for in memory olap table, use DURABLE CachePriority in page cache
bool cache_in_memory = false;
bool kept_in_memory = false;
};
struct ColumnIteratorOptions {
RandomAccessFile* file = nullptr;
// reader statistics
OlapReaderStatistics* stats = nullptr;
RandomAccessFile* file = nullptr;
bool use_page_cache = false;
void sanity_check() const {
CHECK_NOTNULL(file);
CHECK_NOTNULL(stats);
}
};
// There will be concurrent users to read the same column. So
@ -87,31 +94,25 @@ public:
// Seek to the first entry in the column.
Status seek_to_first(OrdinalPageIndexIterator* iter);
Status seek_at_or_before(rowid_t rowid, OrdinalPageIndexIterator* iter);
Status seek_at_or_before(ordinal_t ordinal, OrdinalPageIndexIterator* iter);
// read a page from file into a page handle
// use reader owned _file(usually is Descriptor<RandomAccessFile>*) to read page
Status read_page(const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle);
// read a page from file into a page handle
// use file(usually is RandomAccessFile*) to read page
Status read_page(RandomAccessFile* file, const PagePointer& pp, const ColumnIteratorOptions& opts, PageHandle* handle);
Status read_page(const ColumnIteratorOptions& iter_opts, const PagePointer& pp,
PageHandle* handle, Slice* page_body, PageFooterPB* footer);
bool is_nullable() const { return _meta.is_nullable(); }
const EncodingInfo* encoding_info() const { return _encoding_info; }
const TypeInfo* type_info() const { return _type_info; }
bool has_zone_map() const { return _meta.has_zone_map_page(); }
bool has_zone_map() const { return _zone_map_index_meta != nullptr; }
bool has_bitmap_index() const { return _bitmap_index_meta != nullptr; }
bool has_bloom_filter_index() const { return _bf_index_meta != nullptr; }
bool has_bitmap_index() {
return _meta.has_bitmap_index();
}
bool has_bloom_filter_index() {
return _meta.has_bloom_filter_index();
}
// Check if this column could match `cond' using segment zone map.
// Since segment zone map is stored in metadata, this function is fast without I/O.
// Return true if segment zone map is absent or `cond' could be satisfied, false otherwise.
bool match_condition(CondColumn* cond) const;
// get row ranges with zone map
// - cond_column is user's query predicate
@ -137,18 +138,24 @@ private:
// May be called multiple times; subsequent calls are no-ops.
// Loads the zone map, ordinal, bitmap and bloom filter indexes of this column.
// The work is routed through _load_index_once, so the loaders run once and
// later calls just get the status back.
Status _ensure_index_loaded() {
    return _load_index_once.call([this] {
        // the page cache is used unless it has been disabled by configuration
        const bool use_page_cache = !config::disable_storage_page_cache;
        RETURN_IF_ERROR(_load_zone_map_index(use_page_cache, _opts.kept_in_memory));
        RETURN_IF_ERROR(_load_ordinal_index(use_page_cache, _opts.kept_in_memory));
        RETURN_IF_ERROR(_load_bitmap_index(use_page_cache, _opts.kept_in_memory));
        RETURN_IF_ERROR(_load_bloom_filter_index(use_page_cache, _opts.kept_in_memory));
        return Status::OK();
    });
}
Status _load_zone_map_index();
Status _load_ordinal_index();
Status _load_bitmap_index();
Status _load_bloom_filter_index();
Status _load_zone_map_index(bool use_page_cache, bool kept_in_memory);
Status _load_ordinal_index(bool use_page_cache, bool kept_in_memory);
Status _load_bitmap_index(bool use_page_cache, bool kept_in_memory);
Status _load_bloom_filter_index(bool use_page_cache, bool kept_in_memory);
bool _zone_map_match_condition(const ZoneMapPB& zone_map,
WrapperField* min_value_container,
WrapperField* max_value_container,
CondColumn* cond) const;
Status _get_filtered_pages(CondColumn* cond_column,
const std::vector<CondColumn*>& delete_conditions,
@ -167,12 +174,17 @@ private:
const TypeInfo* _type_info = nullptr;
const EncodingInfo* _encoding_info = nullptr;
const BlockCompressionCodec* _compress_codec = nullptr;
// meta for various column indexes (null if the index is absent)
const ZoneMapIndexPB* _zone_map_index_meta = nullptr;
const OrdinalIndexPB* _ordinal_index_meta = nullptr;
const BitmapIndexPB* _bitmap_index_meta = nullptr;
const BloomFilterIndexPB* _bf_index_meta = nullptr;
DorisCallOnce<Status> _load_index_once;
std::unique_ptr<ColumnZoneMap> _column_zone_map;
std::unique_ptr<OrdinalPageIndex> _ordinal_index;
std::unique_ptr<BitmapIndexReader> _bitmap_index_reader;
std::unique_ptr<BloomFilterIndexReader> _bloom_filter_index_reader;
std::unique_ptr<ZoneMapIndexReader> _zone_map_index;
std::unique_ptr<OrdinalIndexReader> _ordinal_index;
std::unique_ptr<BitmapIndexReader> _bitmap_index;
std::unique_ptr<BloomFilterIndexReader> _bloom_filter_index;
};
// Base iterator to read one column data
@ -193,14 +205,14 @@ public:
// Entry 0 is the first entry written to the column.
// If provided seek point is past the end of the file,
// then returns false.
virtual Status seek_to_ordinal(rowid_t ord_idx) = 0;
virtual Status seek_to_ordinal(ordinal_t ord) = 0;
// After one seek, this function can be called many times to read data
// into ColumnBlockView. When reading string type data, memory will be
// allocated from MemPool.
virtual Status next_batch(size_t* n, ColumnBlockView* dst) = 0;
virtual rowid_t get_current_ordinal() const = 0;
virtual ordinal_t get_current_ordinal() const = 0;
virtual Status get_row_ranges_by_zone_map(CondColumn* cond_column,
const std::vector<CondColumn*>& delete_conditions,
@ -238,20 +250,13 @@ public:
FileColumnIterator(ColumnReader* reader);
~FileColumnIterator() override;
Status init(const ColumnIteratorOptions& opts) override {
RETURN_IF_ERROR(ColumnIterator::init(opts));
DCHECK(_opts.file != nullptr);
_file = _opts.file;
return Status::OK();
}
Status seek_to_first() override;
Status seek_to_ordinal(rowid_t ord_idx) override;
Status seek_to_ordinal(ordinal_t ord) override;
Status next_batch(size_t* n, ColumnBlockView* dst) override;
rowid_t get_current_ordinal() const override { return _current_rowid; }
ordinal_t get_current_ordinal() const override { return _current_ordinal; }
// get row ranges by zone map
// - cond_column is user's query predicate
@ -263,9 +268,9 @@ public:
Status get_row_ranges_by_bloom_filter(CondColumn* cond_column, RowRanges* row_ranges) override;
private:
void _seek_to_pos_in_page(ParsedPage* page, uint32_t offset_in_page);
void _seek_to_pos_in_page(ParsedPage* page, ordinal_t offset_in_page);
Status _load_next_page(bool* eos);
Status _read_page(const OrdinalPageIndexIterator& iter, ParsedPage* page);
Status _read_data_page(const OrdinalPageIndexIterator& iter);
private:
ColumnReader* _reader;
@ -286,13 +291,11 @@ private:
// This value will be reset when a new seek is issued
OrdinalPageIndexIterator _page_iter;
// current rowid
rowid_t _current_rowid = 0;
// current value ordinal
ordinal_t _current_ordinal = 0;
// indexes of the pages that are DEL_PARTIAL_SATISFIED
std::vector<uint32_t> _delete_partial_statisfied_pages;
RandomAccessFile* _file;
};
// This iterator is used to read default value column
@ -315,14 +318,14 @@ public:
return Status::OK();
}
// Moves the iterator to `ord_idx`. Only the current position is recorded,
// so the seek always succeeds.
Status seek_to_ordinal(ordinal_t ord_idx) override {
    _current_rowid = ord_idx;
    return Status::OK();
}
Status next_batch(size_t* n, ColumnBlockView* dst) override;
rowid_t get_current_ordinal() const override { return _current_rowid; }
ordinal_t get_current_ordinal() const override { return _current_rowid; }
private:
bool _has_default_value;
@ -337,7 +340,7 @@ private:
std::unique_ptr<MemPool> _pool;
// current rowid
rowid_t _current_rowid = 0;
ordinal_t _current_rowid = 0;
};
}

View File

@ -19,22 +19,21 @@
#include <cstddef>
#include "common/logging.h" // for LOG
#include "env/env.h" // for LOG
#include "gutil/strings/substitute.h" // for Substitute
#include "common/logging.h"
#include "env/env.h"
#include "gutil/strings/substitute.h"
#include "olap/rowset/segment_v2/bitmap_index_writer.h"
#include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo
#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions
#include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexBuilder
#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder
#include "olap/rowset/segment_v2/page_compression.h"
#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
#include "olap/rowset/segment_v2/bloom_filter.h"
#include "olap/types.h" // for TypeInfo
#include "util/crc32c.h"
#include "util/faststring.h" // for fastring
#include "util/rle_encoding.h" // for RleEncoder
#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/options.h"
#include "olap/rowset/segment_v2/ordinal_page_index.h"
#include "olap/rowset/segment_v2/page_builder.h"
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/zone_map_index.h"
#include "util/block_compression.h"
#include "util/faststring.h"
#include "util/rle_encoding.h"
namespace doris {
namespace segment_v2 {
@ -43,23 +42,28 @@ using strings::Substitute;
class NullBitmapBuilder {
public:
NullBitmapBuilder() : _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) {
NullBitmapBuilder() : _has_null(false), _bitmap_buf(512), _rle_encoder(&_bitmap_buf, 1) {
}
explicit NullBitmapBuilder(size_t reserve_bits)
: _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) {
: _has_null(false), _bitmap_buf(BitmapSize(reserve_bits)), _rle_encoder(&_bitmap_buf, 1) {
}
// Appends `run` repetitions of `value` to the RLE-encoded bitmap.
// A true `value` also flags the bitmap under construction as containing NULL.
void add_run(bool value, size_t run) {
    if (value) {
        _has_null = true;
    }
    _rle_encoder.Put(value, run);
}
// Returns whether the building nullmap contains NULL
bool has_null() const { return _has_null; }
// Flushes the RLE encoder, then builds and returns the bitmap buffer as an
// OwnedSlice. The encoder was constructed on top of _bitmap_buf.
OwnedSlice finish() {
_rle_encoder.Flush();
return _bitmap_buf.build();
}
// Clears the has-null flag and the RLE encoder so a new bitmap can be started.
void reset() {
_has_null = false;
_rle_encoder.Clear();
}
@ -67,19 +71,27 @@ public:
return _bitmap_buf.size();
}
private:
bool _has_null;
faststring _bitmap_buf;
RleEncoder<bool> _rle_encoder;
};
ColumnWriter::ColumnWriter(const ColumnWriterOptions& opts,
std::unique_ptr<Field> field,
bool is_nullable,
WritableFile* output_file)
: _opts(opts),
_is_nullable(is_nullable),
_output_file(output_file),
WritableFile* output_file) :
_opts(opts),
_field(std::move(field)),
_output_file(output_file),
_is_nullable(_opts.meta->is_nullable()),
_data_size(0) {
// these opts.meta fields should be set by client
DCHECK(opts.meta->has_column_id());
DCHECK(opts.meta->has_unique_id());
DCHECK(opts.meta->has_type());
DCHECK(opts.meta->has_length());
DCHECK(opts.meta->has_encoding());
DCHECK(opts.meta->has_compression());
DCHECK(opts.meta->has_is_nullable());
}
ColumnWriter::~ColumnWriter() {
@ -93,10 +105,13 @@ ColumnWriter::~ColumnWriter() {
}
Status ColumnWriter::init() {
RETURN_IF_ERROR(EncodingInfo::get(_field->type_info(), _opts.encoding_type, &_encoding_info));
if (_opts.compression_type != NO_COMPRESSION) {
RETURN_IF_ERROR(get_block_compression_codec(_opts.compression_type, &_compress_codec));
}
RETURN_IF_ERROR(EncodingInfo::get(_field->type_info(), _opts.meta->encoding(), &_encoding_info));
_opts.meta->set_encoding(_encoding_info->encoding());
// should store more concrete encoding type instead of DEFAULT_ENCODING
// because the default encoding of a data type can be changed in the future
DCHECK_NE(_opts.meta->encoding(), DEFAULT_ENCODING);
RETURN_IF_ERROR(get_block_compression_codec(_opts.meta->compression(), &_compress_codec));
// create page builder
PageBuilder* page_builder = nullptr;
@ -106,17 +121,17 @@ Status ColumnWriter::init() {
if (page_builder == nullptr) {
return Status::NotSupported(
Substitute("Failed to create page builder for type $0 and encoding $1",
_field->type(), _opts.encoding_type));
_field->type(), _opts.meta->encoding()));
}
_page_builder.reset(page_builder);
// create ordinal builder
_ordinal_index_builder.reset(new OrdinalPageIndexBuilder());
_ordinal_index_builder.reset(new OrdinalIndexWriter());
// create null bitmap builder
if (_is_nullable) {
_null_bitmap_builder.reset(new NullBitmapBuilder());
}
if (_opts.need_zone_map) {
_column_zone_map_builder.reset(new ColumnZoneMapBuilder(_field.get()));
_zone_map_index_builder.reset(new ZoneMapIndexWriter(_field.get()));
}
if (_opts.need_bitmap_index) {
RETURN_IF_ERROR(BitmapIndexWriter::create(_field->type_info(), &_bitmap_index_builder));
@ -132,7 +147,7 @@ Status ColumnWriter::append_nulls(size_t num_rows) {
_null_bitmap_builder->add_run(true, num_rows);
_next_rowid += num_rows;
if (_opts.need_zone_map) {
RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1));
_zone_map_index_builder->add_nulls(num_rows);
}
if (_opts.need_bitmap_index) {
_bitmap_index_builder->add_nulls(num_rows);
@ -156,7 +171,7 @@ Status ColumnWriter::_append_data(const uint8_t** ptr, size_t num_rows) {
size_t num_written = remaining;
RETURN_IF_ERROR(_page_builder->add(*ptr, &num_written));
if (_opts.need_zone_map) {
RETURN_IF_ERROR(_column_zone_map_builder->add(*ptr, num_written));
_zone_map_index_builder->add_values(*ptr, num_written);
}
if (_opts.need_bitmap_index) {
_bitmap_index_builder->add_values(*ptr, num_written);
@ -193,7 +208,7 @@ Status ColumnWriter::append_nullable(
_null_bitmap_builder->add_run(true, this_run);
_next_rowid += this_run;
if (_opts.need_zone_map) {
RETURN_IF_ERROR(_column_zone_map_builder->add(nullptr, 1));
_zone_map_index_builder->add_nulls(this_run);
}
if (_opts.need_bitmap_index) {
_bitmap_index_builder->add_nulls(this_run);
@ -216,7 +231,7 @@ uint64_t ColumnWriter::estimate_buffer_size() {
}
size += _ordinal_index_builder->size();
if (_opts.need_zone_map) {
size += _column_zone_map_builder->size();
size += _zone_map_index_builder->size();
}
if (_opts.need_bitmap_index) {
size += _bitmap_index_builder->size();
@ -239,189 +254,110 @@ Status ColumnWriter::write_data() {
}
// write column dict
if (_encoding_info->encoding() == DICT_ENCODING) {
OwnedSlice dict_page;
_page_builder->get_dictionary_page(&dict_page);
std::vector<Slice> origin_data;
origin_data.push_back(dict_page.slice());
RETURN_IF_ERROR(_compress_and_write_page(&origin_data, &_dict_page_pp));
OwnedSlice dict_body;
RETURN_IF_ERROR(_page_builder->get_dictionary_page(&dict_body));
PageFooterPB footer;
footer.set_type(DICTIONARY_PAGE);
footer.set_uncompressed_size(dict_body.slice().get_size());
footer.mutable_dict_page_footer()->set_encoding(PLAIN_ENCODING);
PagePointer dict_pp;
RETURN_IF_ERROR(PageIO::compress_and_write_page(
_compress_codec, _opts.compression_min_space_saving, _output_file,
{ dict_body.slice() }, footer, &dict_pp));
dict_pp.to_proto(_opts.meta->mutable_dict_page());
}
return Status::OK();
}
// Finishes the ordinal index through the ordinal index writer, passing it the
// output file and a freshly added `indexes` entry of the column meta to fill in.
// Returns the writer's status.
Status ColumnWriter::write_ordinal_index() {
    return _ordinal_index_builder->finish(_output_file, _opts.meta->add_indexes());
}
// Finishes the zone map index when one was requested (need_zone_map), passing
// the writer the output file and a freshly added `indexes` entry of the column
// meta. Returns OK without doing anything when no zone map is needed.
Status ColumnWriter::write_zone_map() {
    if (_opts.need_zone_map) {
        return _zone_map_index_builder->finish(_output_file, _opts.meta->add_indexes());
    }
    return Status::OK();
}
// Finishes the bitmap index when one was requested (need_bitmap_index), passing
// the writer the output file and a freshly added `indexes` entry of the column
// meta. Returns OK without doing anything when no bitmap index is needed.
Status ColumnWriter::write_bitmap_index() {
    if (_opts.need_bitmap_index) {
        return _bitmap_index_builder->finish(_output_file, _opts.meta->add_indexes());
    }
    return Status::OK();
}
Status ColumnWriter::write_bloom_filter_index() {
if (!_opts.need_bloom_filter) {
return Status::OK();
}
return _bloom_filter_index_builder->finish(_output_file, &_bloom_filter_index_meta);
}
void ColumnWriter::write_meta(ColumnMetaPB* meta) {
meta->set_type(_field->type());
meta->set_encoding(_encoding_info->encoding());
// should store more concrete encoding type instead of DEFAULT_ENCODING
// because the default encoding of a data type can be changed in the future
DCHECK_NE(meta->encoding(), DEFAULT_ENCODING);
meta->set_compression(_opts.compression_type);
meta->set_is_nullable(_is_nullable);
_ordinal_index_pp.to_proto(meta->mutable_ordinal_index_page());
if (_opts.need_zone_map) {
_zone_map_pp.to_proto(meta->mutable_zone_map_page());
_column_zone_map_builder->fill_segment_zone_map(meta->mutable_zone_map());
}
if (_encoding_info->encoding() == DICT_ENCODING) {
_dict_page_pp.to_proto(meta->mutable_dict_page());
}
if (_opts.need_bitmap_index) {
meta->mutable_bitmap_index()->CopyFrom(_bitmap_index_meta);
}
if (_opts.need_bloom_filter) {
meta->mutable_bloom_filter_index()->CopyFrom(_bloom_filter_index_meta);
return _bloom_filter_index_builder->finish(_output_file, _opts.meta->add_indexes());
}
return Status::OK();
}
// write a page into file and update ordinal index
// this function will call _write_physical_page to write data
// write a data page into file and update ordinal index
Status ColumnWriter::_write_data_page(Page* page) {
PagePointer pp;
std::vector<Slice> origin_data;
std::vector<Slice> compressed_body;
for (auto& data : page->data) {
origin_data.push_back(data.slice());
compressed_body.push_back(data.slice());
}
RETURN_IF_ERROR(_write_physical_page(&origin_data, &pp));
_ordinal_index_builder->append_entry(page->first_rowid, pp);
return Status::OK();
}
Status ColumnWriter::_compress_and_write_page(std::vector<Slice>* origin_data, PagePointer* pp) {
std::vector<Slice>* output_data = origin_data;
std::vector<Slice> compressed_data;
// Put compressor out of if block, because we will use compressor's
// content until this function finished.
PageCompressor compressor(_compress_codec);
if (_compress_codec != nullptr) {
RETURN_IF_ERROR(compressor.compress(*origin_data, &compressed_data));
output_data = &compressed_data;
}
return _write_physical_page(output_data, pp);
}
// write a physical page in to files
Status ColumnWriter::_write_physical_page(std::vector<Slice>* origin_data, PagePointer* pp) {
// checksum
uint8_t checksum_buf[sizeof(uint32_t)];
uint32_t checksum = crc32c::Value(*origin_data);
encode_fixed32_le(checksum_buf, checksum);
origin_data->emplace_back(checksum_buf, sizeof(uint32_t));
// remember the offset
pp->offset = _output_file->size();
// write content to file
size_t bytes_written = 0;
RETURN_IF_ERROR(_write_raw_data(*origin_data, &bytes_written));
pp->size = bytes_written;
return Status::OK();
}
// write raw data into file, this is the only place to write data
Status ColumnWriter::_write_raw_data(const std::vector<Slice>& data, size_t* bytes_written) {
auto file_size = _output_file->size();
auto st = _output_file->appendv(&data[0], data.size());
if (!st.ok()) {
LOG(WARNING) << "failed to append data to file, st=" << st.to_string();
return st;
}
*bytes_written = _output_file->size() - file_size;
_written_size += *bytes_written;
RETURN_IF_ERROR(PageIO::write_page(_output_file, compressed_body, page->footer, &pp));
_ordinal_index_builder->append_entry(page->footer.data_page_footer().first_ordinal(), pp);
return Status::OK();
}
// Cuts the values buffered since the previous page into a new data page and
// appends it to the in-memory page list.
//
// Steps:
//   1. flush the per-page state of the zone map / bloom filter writers (if enabled)
//   2. build the page body: encoded values followed by an optional nullmap
//   3. fill in the data page footer (first ordinal, value count, nullmap size)
//   4. try to compress the body; keep the uncompressed slices when
//      compress_page_body hands back an empty slice
//
// Returns OK immediately when no value has been added since the last page.
Status ColumnWriter::_finish_current_page() {
    if (_next_rowid == _first_rowid) {
        return Status::OK();
    }
    if (_opts.need_zone_map) {
        RETURN_IF_ERROR(_zone_map_index_builder->flush());
    }
    if (_opts.need_bloom_filter) {
        RETURN_IF_ERROR(_bloom_filter_index_builder->flush());
    }

    // build data page body : encoded values + [nullmap]
    std::vector<Slice> body;
    OwnedSlice encoded_values = _page_builder->finish();
    _page_builder->reset();
    body.push_back(encoded_values.slice());

    OwnedSlice nullmap;
    if (_is_nullable) {
        // the nullmap is only stored when the page really contains NULL
        if (_null_bitmap_builder->has_null()) {
            nullmap = _null_bitmap_builder->finish();
            body.push_back(nullmap.slice());
        }
        // Reset unconditionally: add_run() feeds the RLE encoder for non-null
        // runs as well, so skipping the reset for a null-free page would carry
        // this page's runs over into the next page's nullmap.
        _null_bitmap_builder->reset();
    }

    // prepare data page footer
    std::unique_ptr<Page> page(new Page());
    page->footer.set_type(DATA_PAGE);
    page->footer.set_uncompressed_size(Slice::compute_total_size(body));
    auto data_page_footer = page->footer.mutable_data_page_footer();
    data_page_footer->set_first_ordinal(_first_rowid);
    data_page_footer->set_num_values(_next_rowid - _first_rowid);
    data_page_footer->set_nullmap_size(nullmap.slice().size);

    // trying to compress page body
    OwnedSlice compressed_body;
    RETURN_IF_ERROR(PageIO::compress_page_body(
            _compress_codec, _opts.compression_min_space_saving, body, &compressed_body));
    if (compressed_body.slice().empty()) {
        // page body is uncompressed
        page->data.emplace_back(std::move(encoded_values));
        page->data.emplace_back(std::move(nullmap));
    } else {
        // page body is compressed
        page->data.emplace_back(std::move(compressed_body));
    }

    _push_back_page(page.release());
    // the next page starts where this one ended
    _first_rowid = _next_rowid;
    return Status::OK();
}

View File

@ -21,8 +21,7 @@
#include "common/status.h" // for Status
#include "gen_cpp/segment_v2.pb.h" // for EncodingTypePB
#include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMapBuilder
#include "olap/rowset/segment_v2/common.h" // for rowid_t
#include "olap/rowset/segment_v2/common.h"
#include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer
#include "util/bitmap.h" // for BitmapChange
#include "util/slice.h" // for OwnedSlice
@ -36,8 +35,10 @@ class BlockCompressionCodec;
namespace segment_v2 {
struct ColumnWriterOptions {
EncodingTypePB encoding_type = DEFAULT_ENCODING;
CompressionTypePB compression_type = segment_v2::CompressionTypePB::LZ4F;
// input and output parameter:
// - input: column_id/unique_id/type/length/encoding/compression/is_nullable members
// - output: encoding/indexes/dict_page members
ColumnMetaPB* meta;
size_t data_page_size = 64 * 1024;
// store compressed page only when space saving is above the threshold.
// space saving = 1 - compressed_size / uncompressed_size
@ -50,9 +51,10 @@ struct ColumnWriterOptions {
class BitmapIndexWriter;
class EncodingInfo;
class NullBitmapBuilder;
class OrdinalPageIndexBuilder;
class OrdinalIndexWriter;
class PageBuilder;
class BloomFilterIndexWriter;
class ZoneMapIndexWriter;
// Encode one column's data into some memory slice.
// Because some columns would be stored in a file, we should wait
@ -62,7 +64,6 @@ class ColumnWriter {
public:
ColumnWriter(const ColumnWriterOptions& opts,
std::unique_ptr<Field> field,
bool is_nullable,
WritableFile* output_file);
~ColumnWriter();
@ -102,19 +103,17 @@ public:
Status write_zone_map();
Status write_bitmap_index();
Status write_bloom_filter_index();
void write_meta(ColumnMetaPB* meta);
private:
// All Pages will be organized into a linked list
struct Page {
int32_t first_rowid;
int32_t num_rows;
// the data vector may contain:
// 1. one OwnedSlice if the data is compressed
// 2. one OwnedSlice if the data is not compressed and is not nullable
// 3. two OwnedSlice if the data is not compressed and is nullable
// 1. one OwnedSlice if the page body is compressed
// 2. one OwnedSlice if the page body is not compressed and doesn't have nullmap
// 3. two OwnedSlice if the page body is not compressed and has nullmap
// use vector for easier management for lifetime of OwnedSlice
std::vector<OwnedSlice> data;
PageFooterPB footer;
Page* next = nullptr;
};
@ -135,45 +134,37 @@ private:
for (auto& data_slice : page->data) {
_data_size += data_slice.slice().size;
}
// estimate that (page footer + footer size + checksum) takes 20 bytes
_data_size += 20;
}
Status _append_data(const uint8_t** ptr, size_t num_rows);
Status _finish_current_page();
Status _write_raw_data(const std::vector<Slice>& data, size_t* bytes_written);
Status _write_data_page(Page* page);
Status _compress_and_write_page(std::vector<Slice>* origin_data, PagePointer* pp);
Status _write_physical_page(std::vector<Slice>* origin_data, PagePointer* pp);
private:
ColumnWriterOptions _opts;
std::unique_ptr<Field> _field;
WritableFile* _output_file;
bool _is_nullable;
WritableFile* _output_file = nullptr;
// total size of data page list
uint64_t _data_size;
// cached generated pages
PageHead _pages;
rowid_t _last_first_rowid = 0;
rowid_t _next_rowid = 0;
ordinal_t _first_rowid = 0;
ordinal_t _next_rowid = 0;
const EncodingInfo* _encoding_info = nullptr;
const BlockCompressionCodec* _compress_codec = nullptr;
std::unique_ptr<PageBuilder> _page_builder;
std::unique_ptr<NullBitmapBuilder> _null_bitmap_builder;
std::unique_ptr<OrdinalPageIndexBuilder> _ordinal_index_builder;
std::unique_ptr<ColumnZoneMapBuilder> _column_zone_map_builder;
std::unique_ptr<Field> _field;
std::unique_ptr<OrdinalIndexWriter> _ordinal_index_builder;
std::unique_ptr<ZoneMapIndexWriter> _zone_map_index_builder;
std::unique_ptr<BitmapIndexWriter> _bitmap_index_builder;
std::unique_ptr<BloomFilterIndexWriter> _bloom_filter_index_builder;
BitmapIndexColumnPB _bitmap_index_meta;
BloomFilterIndexPB _bloom_filter_index_meta;
PagePointer _ordinal_index_pp;
PagePointer _zone_map_pp;
PagePointer _dict_page_pp;
// the total data size of page list
uint64_t _data_size;
uint64_t _written_size = 0;
};

View File

@ -1,127 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/column_zone_map.h"
#include "olap/olap_define.h"
namespace doris {
namespace segment_v2 {
// Creates a zone map builder for `field`.
// Sets up a BinaryPlainPageBuilder that receives one serialized zone map per
// flush(), allocates min/max value storage from the builder's pool for both the
// page-level and the segment-level zone map, and resets each of them.
ColumnZoneMapBuilder::ColumnZoneMapBuilder(Field* field) : _field(field), _pool(&_tracker) {
PageBuilderOptions options;
// NOTE(review): data_page_size is 0 here; presumably the size limit is not
// meant to apply to this builder -- confirm against BinaryPlainPageBuilder
options.data_page_size = 0;
_page_builder.reset(new BinaryPlainPageBuilder(options));
// page-level zone map: storage must exist before it can be reset
_zone_map.min_value = _field->allocate_value(&_pool);
_zone_map.max_value = _field->allocate_value(&_pool);
_reset_page_zone_map();
// segment-level zone map
_segment_zone_map.min_value = _field->allocate_value(&_pool);
_segment_zone_map.max_value = _field->allocate_value(&_pool);
_reset_segment_zone_map();
}
// Folds values into the zone map of the page currently being built.
//   - vals != nullptr: `vals` points at `count` consecutive values, each
//     _field->size() bytes wide; min/max are widened to cover them and the
//     zone map is marked as holding non-null data (when count > 0).
//   - vals == nullptr: marks the zone map as holding NULL; `count` is not used.
// Always returns Status::OK().
Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) {
    if (vals == nullptr) {
        _zone_map.has_null = true;
        return Status::OK();
    }
    // size_t index: avoids comparing a signed int against the unsigned count
    for (size_t i = 0; i < count; ++i) {
        if (_field->compare(_zone_map.min_value, (char *)vals) > 0) {
            _field->type_info()->direct_copy(_zone_map.min_value, (const char *)vals);
        }
        if (_field->compare(_zone_map.max_value, (char *)vals) < 0) {
            _field->type_info()->direct_copy(_zone_map.max_value, (const char *)vals);
        }
        vals += _field->size();
    }
    // set once instead of re-testing the flag for every value
    if (count > 0) {
        _zone_map.has_not_null = true;
    }
    return Status::OK();
}
// Copies the segment-level zone map accumulated so far into the protobuf `to`.
void ColumnZoneMapBuilder::fill_segment_zone_map(ZoneMapPB* const to) {
_fill_zone_map_to_pb(_segment_zone_map, to);
}
// Closes the zone map of the current page:
//   1. merges it into the segment-level zone map,
//   2. serializes it and appends it as one entry to the page builder,
//   3. resets the page-level zone map for the next page.
// Returns InternalError when protobuf serialization fails, or the page
// builder's error when the entry cannot be added.
Status ColumnZoneMapBuilder::flush() {
    // widen the segment range so that it covers this page
    const bool page_min_is_smaller =
            _field->compare(_segment_zone_map.min_value, _zone_map.min_value) > 0;
    if (page_min_is_smaller) {
        _field->type_info()->direct_copy(_segment_zone_map.min_value, _zone_map.min_value);
    }
    const bool page_max_is_larger =
            _field->compare(_segment_zone_map.max_value, _zone_map.max_value) < 0;
    if (page_max_is_larger) {
        _field->type_info()->direct_copy(_segment_zone_map.max_value, _zone_map.max_value);
    }
    _segment_zone_map.has_null = _segment_zone_map.has_null || _zone_map.has_null;
    _segment_zone_map.has_not_null = _segment_zone_map.has_not_null || _zone_map.has_not_null;

    // serialize the page zone map
    ZoneMapPB page_pb;
    _fill_zone_map_to_pb(_zone_map, &page_pb);
    std::string encoded;
    if (!page_pb.SerializeToString(&encoded)) {
        return Status::InternalError("serialize zone map failed");
    }

    // the page builder takes the entry as a pointer to a Slice
    Slice entry(encoded.data(), encoded.size());
    size_t entry_count = 1;
    RETURN_IF_ERROR(_page_builder->add((const uint8_t *)&entry, &entry_count));

    // start over for the next page
    _reset_page_zone_map();
    return Status::OK();
}
// Puts `zone_map` back into its empty state: min is set to the field's maximum
// and max to the field's minimum, so the comparisons in add()/flush() replace
// them with the first real value; both null flags are cleared.
void ColumnZoneMapBuilder::_reset_zone_map(ZoneMap* zone_map) {
_field->set_to_max(zone_map->min_value);
_field->set_to_min(zone_map->max_value);
zone_map->has_null = false;
zone_map->has_not_null = false;
}
// Converts the in-memory zone map `from` into its protobuf form `to`:
// the null flags are copied as-is, min/max are stored as the field's string
// representation of the values.
void ColumnZoneMapBuilder::_fill_zone_map_to_pb(const ZoneMap& from, ZoneMapPB* const to) {
    to->set_has_not_null(from.has_not_null);
    to->set_has_null(from.has_null);

    auto max_text = _field->to_string(from.max_value);
    to->set_max(max_text);

    auto min_text = _field->to_string(from.min_value);
    to->set_min(min_text);
}
// Decodes the zone map page held in _data: every entry of the plain binary
// page is one serialized ZoneMapPB, parsed into _page_zone_maps in page order.
// Returns the decoder's error when the page cannot be initialized, or
// Corruption when an entry is not a valid ZoneMapPB.
Status ColumnZoneMap::load() {
    BinaryPlainPageDecoder page_decoder(_data);
    RETURN_IF_ERROR(page_decoder.init());
    _num_pages = page_decoder.count();
    _page_zone_maps.resize(_num_pages);
    for (size_t i = 0; i < _page_zone_maps.size(); ++i) {
        Slice data = page_decoder.string_at_index(i);
        // parse straight from the slice instead of copying it into a temporary string
        if (!_page_zone_maps[i].ParseFromArray(data.data, static_cast<int>(data.size))) {
            return Status::Corruption("parse zone map failed");
        }
    }
    return Status::OK();
}
} // namespace segment_v2
} // namespace doris

View File

@ -25,7 +25,10 @@
namespace doris {
namespace segment_v2 {
// One segment file could store at most INT32_MAX rows,
// but due to array type, each column could store more than INT32_MAX values.
using rowid_t = uint32_t;
using ordinal_t = uint64_t;
} // namespace segment_v2
} // namespace doris

View File

@ -20,7 +20,6 @@
#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder
#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder
#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions
#include "olap/rowset/segment_v2/common.h" // for rowid_t
#include "util/frame_of_reference_coding.h"
namespace doris {

View File

@ -20,7 +20,6 @@
#include <string>
#include "common/logging.h"
#include "olap/key_coder.h"
#include "util/coding.h"
namespace doris {
@ -38,17 +37,15 @@ bool IndexPageBuilder::is_full() const {
return _buffer.size() + 16 > _index_page_size;
}
Slice IndexPageBuilder::finish() {
void IndexPageBuilder::finish(OwnedSlice* body, PageFooterPB* footer) {
DCHECK(!_finished) << "already called finish()";
IndexPageFooterPB footer;
footer.set_num_entries(_count);
footer.set_type(_is_leaf ? IndexPageFooterPB::LEAF : IndexPageFooterPB::INTERNAL);
*body = _buffer.build();
std::string footer_buf;
footer.SerializeToString(&footer_buf);
_buffer.append(footer_buf);
put_fixed32_le(&_buffer, footer_buf.size());
return Slice(_buffer);
footer->set_type(INDEX_PAGE);
footer->set_uncompressed_size(body->slice().get_size());
footer->mutable_index_page_footer()->set_num_entries(_count);
footer->mutable_index_page_footer()->set_type(
_is_leaf ? IndexPageFooterPB::LEAF : IndexPageFooterPB::INTERNAL);
}
Status IndexPageBuilder::get_first_key(Slice* key) const {
@ -65,15 +62,11 @@ Status IndexPageBuilder::get_first_key(Slice* key) const {
///////////////////////////////////////////////////////////////////////////////
Status IndexPageReader::parse(const Slice& data) {
size_t buffer_len = data.size;
const uint8_t* buffer = (uint8_t*)data.data;
size_t footer_size = decode_fixed32_le(buffer + buffer_len - 4);
std::string footer_buf(data.data + buffer_len - 4 - footer_size, footer_size);
_footer.ParseFromString(footer_buf);
Status IndexPageReader::parse(const Slice& body, const IndexPageFooterPB& footer) {
_footer = footer;
size_t num_entries = _footer.num_entries();
Slice input(data);
Slice input(body);
for (int i = 0; i < num_entries; ++i) {
Slice key;
PagePointer value;

View File

@ -31,22 +31,18 @@
namespace doris {
namespace segment_v2 {
class IndexPageIterator; // forward decl.
// IndexPage is the building block for IndexedColumn's ordinal index and value index.
// It is used to guide searching for a particular key to the data page containing it.
// We use the same general format for all index pages, regardless of the data type and node type (leaf or internal)
// IndexPage := IndexEntry^NumEntry, IndexPageFooterPB, IndexPageFooterPBSize(4)
// IndexEntry := IndexKey, PagePointer
// IndexKey := KeyLength(vint32), KeyData(KeyLength bytes)
// PagePointer := PageOffset(vint64), PageSize(vint32)
// IndexPageBody := IndexEntry^NumEntry
// IndexEntry := KeyLength(vint), Byte^KeyLength, PageOffset(vlong), PageSize(vint)
//
// IndexPageFooterPB records NumEntry and type (leaf/internal) of the index page.
// For leaf, IndexKey records the first/smallest key of the data page PagePointer points to.
// For internal, IndexKey records the first/smallest key of the next-level index page PagePointer points to.
//
// All keys are treated as binary strings and compared with memcmp. Keys of other data types are first encoded by
// KeyCoder, e.g., ordinal index's original key type is uint32_t but is encoded to binary string.
// KeyCoder, e.g., ordinal index's original key type is uint64_t but is encoded to binary string.
class IndexPageBuilder {
public:
explicit IndexPageBuilder(size_t index_page_size, bool is_leaf)
@ -59,7 +55,7 @@ public:
size_t count() const { return _count; }
Slice finish();
void finish(OwnedSlice* body, PageFooterPB* footer);
uint64_t size() {
return _buffer.size();
@ -87,9 +83,9 @@ private:
class IndexPageIterator;
class IndexPageReader {
public:
IndexPageReader() : _parsed(false) {};
IndexPageReader() : _parsed(false) {}
Status parse(const Slice& data);
Status parse(const Slice& body, const IndexPageFooterPB& footer);
inline size_t count() const {
DCHECK(_parsed);

View File

@ -21,12 +21,7 @@
#include "gutil/strings/substitute.h" // for Substitute
#include "olap/key_coder.h"
#include "olap/rowset/segment_v2/encoding_info.h" // for EncodingInfo
#include "olap/rowset/segment_v2/index_page.h" // for IndexPageReader
#include "olap/rowset/segment_v2/options.h" // for PageDecoderOptions
#include "olap/rowset/segment_v2/page_compression.h"
#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer
#include "util/crc32c.h"
#include "util/rle_encoding.h" // for RleDecoder
#include "olap/rowset/segment_v2/page_io.h"
#include "util/file_manager.h"
namespace doris {
@ -34,7 +29,10 @@ namespace segment_v2 {
using strings::Substitute;
Status IndexedColumnReader::load() {
Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory) {
_use_page_cache = use_page_cache;
_kept_in_memory = kept_in_memory;
_type_info = get_type_info((FieldType)_meta.data_type());
if (_type_info == nullptr) {
return Status::NotSupported(Substitute("unsupported typeinfo, type=$0", _meta.data_type()));
@ -51,8 +49,10 @@ Status IndexedColumnReader::load() {
if (_meta.ordinal_index_meta().is_root_data_page()) {
_sole_data_page = PagePointer(_meta.ordinal_index_meta().root_page());
} else {
RETURN_IF_ERROR(read_page(input_file, _meta.ordinal_index_meta().root_page(), &_ordinal_index_page_handle));
RETURN_IF_ERROR(_ordinal_index_reader.parse(_ordinal_index_page_handle.data()));
RETURN_IF_ERROR(load_index_page(input_file,
_meta.ordinal_index_meta().root_page(),
&_ordinal_index_page_handle,
&_ordinal_index_reader));
_has_index_page = true;
}
}
@ -62,8 +62,10 @@ Status IndexedColumnReader::load() {
if (_meta.value_index_meta().is_root_data_page()) {
_sole_data_page = PagePointer(_meta.value_index_meta().root_page());
} else {
RETURN_IF_ERROR(read_page(input_file, _meta.value_index_meta().root_page(), &_value_index_page_handle));
RETURN_IF_ERROR(_value_index_reader.parse(_value_index_page_handle.data()));
RETURN_IF_ERROR(load_index_page(input_file,
_meta.value_index_meta().root_page(),
&_value_index_page_handle,
&_value_index_reader));
_has_index_page = true;
}
}
@ -71,91 +73,45 @@ Status IndexedColumnReader::load() {
return Status::OK();
}
Status IndexedColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp, PageHandle* handle) const {
auto cache = StoragePageCache::instance();
PageCacheHandle cache_handle;
StoragePageCache::CacheKey cache_key(file->file_name(), pp.offset);
// column index only load once, so we use global config to decide
if (!config::disable_storage_page_cache && cache->lookup(cache_key, &cache_handle)) {
// we find page in cache, use it
*handle = PageHandle(std::move(cache_handle));
return Status::OK();
}
// Now we read this from file.
size_t page_size = pp.size;
if (page_size < sizeof(uint32_t)) {
return Status::Corruption(Substitute("Bad page, page size is too small, size=$0", page_size));
}
// Now we use this buffer to store page from storage, if this page is compressed
// this buffer will assigned uncompressed page, and origin content will be freed.
std::unique_ptr<uint8_t[]> page(new uint8_t[page_size]);
Slice page_slice(page.get(), page_size);
RETURN_IF_ERROR(file->read_at(pp.offset, page_slice));
size_t data_size = page_size - 4;
if (_verify_checksum) {
uint32_t expect = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4);
uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4);
if (expect != actual) {
return Status::Corruption(
Substitute("Page checksum mismatch, actual=$0 vs expect=$1", actual, expect));
}
}
// remove page's suffix
page_slice.size = data_size;
if (_compress_codec != nullptr) {
PageDecompressor decompressor(page_slice, _compress_codec);
Slice uncompressed_page;
RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_page));
// If decompressor create new heap memory for uncompressed data,
// assign this uncompressed page to page and page slice
if (uncompressed_page.data != page_slice.data) {
page.reset((uint8_t*)uncompressed_page.data);
}
page_slice = uncompressed_page;
}
if (!config::disable_storage_page_cache) {
// insert this into cache and return the cache handle
cache->insert(cache_key, page_slice, &cache_handle, _cache_in_memory);
*handle = PageHandle(std::move(cache_handle));
} else {
*handle = PageHandle(page_slice);
}
page.release();
// Reads the index page referenced by `pp` from `file` and lets `reader` parse it.
// `handle` receives the page handle filled in by read_page().
// NOTE(review): presumably `handle` must outlive `reader`, because parse() is given
// a Slice of the page body rather than a copy -- confirm against IndexPageReader.
// Returns the first non-OK status from read_page() or IndexPageReader::parse().
Status IndexedColumnReader::load_index_page(RandomAccessFile* file,
                                            const PagePointerPB& pp,
                                            PageHandle* handle,
                                            IndexPageReader* reader) {
    PageFooterPB page_footer;
    Slice page_body;
    RETURN_IF_ERROR(read_page(file, PagePointer(pp), handle, &page_body, &page_footer));
    // parse() already yields OK on success, so its status is the result of the load
    return reader->parse(page_body, page_footer.index_page_footer());
}
// Reads the page referenced by `pp` from `file` through PageIO, using this reader's
// compression codec and the page-cache settings remembered by load().
// On success `handle`, `body` and `footer` are filled in by PageIO.
Status IndexedColumnReader::read_page(RandomAccessFile* file, const PagePointer& pp,
                                      PageHandle* handle, Slice* body, PageFooterPB* footer) const {
    // throwaway statistics object: whatever is collected for this read is dropped on return
    OlapReaderStatistics discarded_stats;

    PageReadOptions read_opts;
    read_opts.file = file;
    read_opts.page_pointer = pp;
    read_opts.codec = _compress_codec;
    read_opts.stats = &discarded_stats;
    read_opts.use_page_cache = _use_page_cache;
    read_opts.kept_in_memory = _kept_in_memory;

    return PageIO::read_and_decompress_page(read_opts, handle, body, footer);
}
///////////////////////////////////////////////////////////////////////////////
Status IndexedColumnIterator::_read_data_page(const PagePointer& page_pointer, ParsedPage* page) {
RETURN_IF_ERROR(_reader->read_page(_file, page_pointer, &page->page_handle));
Slice data = page->page_handle.data();
// decode first rowid
if (!get_varint32(&data, &page->first_rowid)) {
return Status::Corruption("Bad page, failed to decode first rowid");
}
// decode number rows
if (!get_varint32(&data, &page->num_rows)) {
return Status::Corruption("Bad page, failed to decode rows count");
}
// create page data decoder
PageDecoderOptions options;
RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder));
RETURN_IF_ERROR(page->data_decoder->init());
page->offset_in_page = 0;
return Status::OK();
// Reads the data page referenced by `pp` and installs it as the current page
// (`_data_page`). Returns the first non-OK status from the read or from
// ParsedPage::create().
Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) {
    PageHandle page_handle;
    Slice page_body;
    PageFooterPB page_footer;
    RETURN_IF_ERROR(_reader->read_page(_file, pp, &page_handle, &page_body, &page_footer));

    // IndexedColumnIterator never looks at the page index, so 0 is passed for it
    const auto& data_footer = page_footer.data_page_footer();
    return ParsedPage::create(std::move(page_handle), page_body, data_footer,
                              _reader->encoding_info(), pp, 0, &_data_page);
}
Status IndexedColumnIterator::seek_to_ordinal(rowid_t idx) {
Status IndexedColumnIterator::seek_to_ordinal(ordinal_t idx) {
DCHECK(idx >= 0 && idx <= _reader->num_values());
if (!_reader->support_ordinal_seek()) {
@ -164,30 +120,29 @@ Status IndexedColumnIterator::seek_to_ordinal(rowid_t idx) {
// it's ok to seek past the last value
if (idx == _reader->num_values()) {
_current_rowid = idx;
_current_ordinal = idx;
_seeked = true;
return Status::OK();
}
if (_data_page == nullptr || !_data_page->contains(idx)) {
// need to read the data page containing row at idx
_data_page.reset(new ParsedPage());
if (_reader->_has_index_page) {
std::string key;
KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_INT>::full_encode_ascending(&idx, &key);
KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::full_encode_ascending(&idx, &key);
RETURN_IF_ERROR(_ordinal_iter.seek_at_or_before(key));
RETURN_IF_ERROR(_read_data_page(_ordinal_iter.current_page_pointer(), _data_page.get()));
RETURN_IF_ERROR(_read_data_page(_ordinal_iter.current_page_pointer()));
_current_iter = &_ordinal_iter;
} else {
RETURN_IF_ERROR(_read_data_page(_reader->_sole_data_page, _data_page.get()));
RETURN_IF_ERROR(_read_data_page(_reader->_sole_data_page));
}
}
rowid_t offset_in_page = idx - _data_page->first_rowid;
ordinal_t offset_in_page = idx - _data_page->first_ordinal;
RETURN_IF_ERROR(_data_page->data_decoder->seek_to_position_in_page(offset_in_page));
DCHECK(offset_in_page == _data_page->data_decoder->current_index());
_data_page->offset_in_page = offset_in_page;
_current_rowid = idx;
_current_ordinal = idx;
_seeked = true;
return Status::OK();
}
@ -221,27 +176,21 @@ Status IndexedColumnIterator::seek_at_or_after(const void* key, bool* exact_matc
}
if (load_data_page) {
_data_page.reset(new ParsedPage());
RETURN_IF_ERROR(_read_data_page(data_page_pp, _data_page.get()));
RETURN_IF_ERROR(_read_data_page(data_page_pp));
}
// seek inside data page
RETURN_IF_ERROR(_data_page->data_decoder->seek_at_or_after_value(key, exact_match));
_data_page->offset_in_page = _data_page->data_decoder->current_index();
_current_rowid = _data_page->first_rowid + _data_page->offset_in_page;
DCHECK(_data_page->contains(_current_rowid));
_current_ordinal = _data_page->first_ordinal + _data_page->offset_in_page;
DCHECK(_data_page->contains(_current_ordinal));
_seeked = true;
return Status::OK();
}
// Returns the ordinal the iterator is currently positioned at.
// Must only be called after a successful seek; this is enforced by a DCHECK.
rowid_t IndexedColumnIterator::get_current_ordinal() const {
DCHECK(_seeked);
return _current_rowid;
}
Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view) {
DCHECK(_seeked);
if (_current_rowid == _reader->num_values()) {
if (_current_ordinal == _reader->num_values()) {
*n = 0;
return Status::OK();
}
@ -257,8 +206,7 @@ Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view
if (!has_next) {
break; // no more data page
}
_data_page.reset(new ParsedPage());
RETURN_IF_ERROR(_read_data_page(_current_iter->current_page_pointer(), _data_page.get()));
RETURN_IF_ERROR(_read_data_page(_current_iter->current_page_pointer()));
}
size_t rows_to_read = std::min(_data_page->remaining(), remaining);
@ -267,7 +215,7 @@ Status IndexedColumnIterator::next_batch(size_t* n, ColumnBlockView* column_view
DCHECK(rows_to_read == rows_read);
_data_page->offset_in_page += rows_read;
_current_rowid += rows_read;
_current_ordinal += rows_read;
column_view->advance(rows_read);
remaining -= rows_read;
}

View File

@ -47,34 +47,34 @@ class IndexedColumnIterator;
class IndexedColumnReader {
public:
explicit IndexedColumnReader(const std::string& file_name,
const IndexedColumnMetaPB& meta,
const bool cache_in_memory)
: _file_name(file_name), _meta(meta), _cache_in_memory(cache_in_memory) {};
const IndexedColumnMetaPB& meta)
: _file_name(file_name), _meta(meta) {};
Status load();
Status load(bool use_page_cache, bool kept_in_memory);
// read a page from file into a page handle
// use file(usually is RandomAccessFile*) to read page
Status read_page(RandomAccessFile* file, const PagePointer& pp, PageHandle* handle) const;
// read a page specified by `pp' from `file' into `handle'
Status read_page(RandomAccessFile* file, const PagePointer& pp,
PageHandle* handle, Slice* body, PageFooterPB* footer) const;
int64_t num_values() const { return _num_values; }
const EncodingInfo* encoding_info() const { return _encoding_info; }
const TypeInfo* type_info() const { return _type_info; }
bool support_ordinal_seek() const { return _meta.has_ordinal_index_meta(); }
bool support_value_seek() const { return _meta.has_value_index_meta(); }
private:
Status load_index_page(RandomAccessFile* file,
const PagePointerPB& pp,
PageHandle* handle,
IndexPageReader* reader);
friend class IndexedColumnIterator;
std::string _file_name;
IndexedColumnMetaPB _meta;
// if _cache_in_memory is true, we will use DURABLE CachePriority in page cache,
// otherwise we use NORMAL CachePriority
bool _cache_in_memory;
bool _use_page_cache;
bool _kept_in_memory;
int64_t _num_values = 0;
// whether this column contains any index page.
// could be false when the column contains only one data page.
@ -86,7 +86,6 @@ private:
PageHandle _ordinal_index_page_handle;
PageHandle _value_index_page_handle;
bool _verify_checksum = true;
const TypeInfo* _type_info = nullptr;
const EncodingInfo* _encoding_info = nullptr;
const BlockCompressionCodec* _compress_codec = nullptr;
@ -109,7 +108,7 @@ public:
// Seek to the given ordinal entry. Entry 0 is the first entry.
// Return NotFound if provided seek point is past the end.
// Return NotSupported for column without ordinal index.
Status seek_to_ordinal(rowid_t idx);
Status seek_to_ordinal(ordinal_t idx);
// Seek the index to the given key, or to the index entry immediately
// before it. Then seek the data block to the value matching value or to
@ -123,14 +122,17 @@ public:
Status seek_at_or_after(const void* key, bool* exact_match);
// Get the ordinal index that the iterator is currently pointed to.
rowid_t get_current_ordinal() const;
ordinal_t get_current_ordinal() const {
DCHECK(_seeked);
return _current_ordinal;
}
// After one seek, this function can only be called once to read data
// into ColumnBlock. When reading string type data, memory will be allocated
// from Arena.
Status next_batch(size_t* n, ColumnBlockView* column_view);
private:
Status _read_data_page(const PagePointer& page_pointer, ParsedPage* page);
Status _read_data_page(const PagePointer& pp);
const IndexedColumnReader* _reader;
// iterator for ordinal index page
@ -141,10 +143,10 @@ private:
bool _seeked = false;
// current in-use index iterator, could be `&_ordinal_iter` or `&_value_iter` or null
IndexPageIterator* _current_iter = nullptr;
// seeked data page, containing value at `_current_rowid`
// seeked data page, containing value at `_current_ordinal`
std::unique_ptr<ParsedPage> _data_page;
// next_batch() will read from this position
rowid_t _current_rowid = 0;
ordinal_t _current_ordinal = 0;
// open file handle
OpenedFileHandle<RandomAccessFile> _file_handle;
// file to read

View File

@ -19,18 +19,18 @@
#include <string>
#include "common/logging.h"
#include "env/env.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/index_page.h"
#include "olap/rowset/segment_v2/options.h"
#include "olap/rowset/segment_v2/page_builder.h"
#include "olap/rowset/segment_v2/page_compression.h"
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/page_pointer.h"
#include "olap/key_coder.h"
#include "olap/types.h"
#include "util/block_compression.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace doris {
namespace segment_v2 {
@ -55,6 +55,10 @@ IndexedColumnWriter::~IndexedColumnWriter() = default;
Status IndexedColumnWriter::init() {
const EncodingInfo* encoding_info;
RETURN_IF_ERROR(EncodingInfo::get(_typeinfo, _options.encoding, &encoding_info));
_options.encoding = encoding_info->encoding();
// should store more concrete encoding type instead of DEFAULT_ENCODING
// because the default encoding of a data type can be changed in the future
DCHECK_NE(_options.encoding, DEFAULT_ENCODING);
PageBuilder* data_page_builder;
RETURN_IF_ERROR(encoding_info->create_page_builder(PageBuilderOptions(), &data_page_builder));
@ -89,31 +93,31 @@ Status IndexedColumnWriter::add(const void* value) {
}
Status IndexedColumnWriter::_finish_current_data_page() {
const uint32_t page_row_count = _data_page_builder->count();
if (page_row_count == 0) {
auto num_values_in_page = _data_page_builder->count();
if (num_values_in_page == 0) {
return Status::OK();
}
ordinal_t first_ordinal = _num_values - num_values_in_page;
uint32_t first_rowid = _num_values - page_row_count;
faststring page_header;
put_varint32(&page_header, first_rowid);
put_varint32(&page_header, page_row_count);
OwnedSlice page_data = _data_page_builder->finish();
// IndexedColumn doesn't have NULLs, thus data page body only contains encoded values
OwnedSlice page_body = _data_page_builder->finish();
_data_page_builder->reset();
return _append_data_page({Slice(page_header), page_data.slice()}, first_rowid);
}
PageFooterPB footer;
footer.set_type(DATA_PAGE);
footer.set_uncompressed_size(page_body.slice().get_size());
footer.mutable_data_page_footer()->set_first_ordinal(first_ordinal);
footer.mutable_data_page_footer()->set_num_values(num_values_in_page);
footer.mutable_data_page_footer()->set_nullmap_size(0);
Status IndexedColumnWriter::_append_data_page(const std::vector<Slice>& data_page, rowid_t first_rowid) {
RETURN_IF_ERROR(_append_page(data_page, &_last_data_page));
RETURN_IF_ERROR(PageIO::compress_and_write_page(
_compress_codec, _options.compression_min_space_saving, _file, { page_body.slice() },
footer, &_last_data_page));
_num_data_pages++;
if (_options.write_ordinal_index) {
std::string key;
KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_INT>::full_encode_ascending(
&first_rowid, &key);
KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::full_encode_ascending(&first_ordinal, &key);
_ordinal_index_builder->add(key, _last_data_page);
}
@ -127,31 +131,6 @@ Status IndexedColumnWriter::_append_data_page(const std::vector<Slice>& data_pag
return Status::OK();
}
// Appends `page` to the output file, compressing it first when a codec is configured,
// and always followed by a 4-byte little-endian CRC32C of the written slices.
// On success `*pp` holds the offset and total size of the bytes appended.
Status IndexedColumnWriter::_append_page(const std::vector<Slice>& page, PagePointer* pp) {
    std::vector<Slice> slices_to_write;
    // The compressor is declared at function scope on purpose: its content is
    // used through `slices_to_write` until this function returns.
    PageCompressor compressor(_compress_codec);
    if (_compress_codec == nullptr) {
        slices_to_write = page;
    } else {
        RETURN_IF_ERROR(compressor.compress(page, &slices_to_write));
    }

    // checksum over everything that is about to be written
    uint8_t crc_buf[sizeof(uint32_t)];
    const uint32_t crc = crc32c::Value(slices_to_write);
    encode_fixed32_le(crc_buf, crc);
    slices_to_write.emplace_back(crc_buf, sizeof(uint32_t));

    // append to file; the page pointer covers payload plus checksum
    pp->offset = _file->size();
    RETURN_IF_ERROR(_file->appendv(slices_to_write.data(), slices_to_write.size()));
    pp->size = _file->size() - pp->offset;
    return Status::OK();
}
Status IndexedColumnWriter::finish(IndexedColumnMetaPB* meta) {
RETURN_IF_ERROR(_finish_current_data_page());
if (_options.write_ordinal_index) {
@ -174,9 +153,14 @@ Status IndexedColumnWriter::_flush_index(IndexPageBuilder* index_builder, BTreeM
meta->set_is_root_data_page(true);
_last_data_page.to_proto(meta->mutable_root_page());
} else {
Slice root_page = index_builder->finish();
OwnedSlice page_body;
PageFooterPB page_footer;
index_builder->finish(&page_body, &page_footer);
PagePointer pp;
RETURN_IF_ERROR(_append_page({root_page}, &pp));
RETURN_IF_ERROR(PageIO::compress_and_write_page(
_compress_codec, _options.compression_min_space_saving, _file,
{ page_body.slice() }, page_footer, &pp));
meta->set_is_root_data_page(false);
pp.to_proto(meta->mutable_root_page());

View File

@ -48,6 +48,7 @@ struct IndexedColumnWriterOptions {
bool write_value_index = false;
EncodingTypePB encoding = DEFAULT_ENCODING;
CompressionTypePB compression = NO_COMPRESSION;
double compression_min_space_saving = 0.1;
};
// IndexedColumn is a column with an optional "ordinal index" and an optional "value index".
@ -82,15 +83,6 @@ public:
private:
Status _finish_current_data_page();
// Append the given data page, update ordinal index or value index if they're used.
Status _append_data_page(const std::vector<Slice>& data_page, rowid_t first_rowid);
// Append the given page into the file. After return, *pp points to the newly
// inserted page.
// Input data will be compressed when compression is enabled.
// We also compute and append checksum for the page.
Status _append_page(const std::vector<Slice>& page, PagePointer* pp);
Status _flush_index(IndexPageBuilder* index_builder, BTreeMetaPB* meta);
IndexedColumnWriterOptions _options;
@ -100,7 +92,7 @@ private:
MemTracker _mem_tracker;
MemPool _mem_pool;
rowid_t _num_values;
ordinal_t _num_values;
uint32_t _num_data_pages;
// remember the first value in current page
faststring _first_value;

View File

@ -17,58 +17,111 @@
#include "olap/rowset/segment_v2/ordinal_page_index.h"
#include "common/logging.h"
#include "env/env.h"
#include "olap/key_coder.h"
#include "olap/rowset/segment_v2/page_handle.h"
#include "olap/rowset/segment_v2/page_io.h"
#include "util/file_manager.h"
namespace doris {
namespace segment_v2 {
OrdinalPageIndex::~OrdinalPageIndex() {
delete[] _rowids;
delete[] _pages;
// Adds one (first ordinal -> data page pointer) entry to the index page being built.
// The ordinal is encoded as an ascending binary key before being added.
// The page pointer is also remembered in `_last_pp`.
void OrdinalIndexWriter::append_entry(ordinal_t ordinal, const PagePointer& data_pp) {
    using OrdinalKeyCoder = KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>;

    std::string encoded_ordinal;
    OrdinalKeyCoder::full_encode_ascending(&ordinal, &encoded_ordinal);
    _page_builder->add(encoded_ordinal, data_pp);
    _last_pp = data_pp;
}
Status OrdinalPageIndex::load() {
if (UNLIKELY(_data.size < _header_size())) {
return Status::Corruption("block size must greate than header");
}
const uint8_t* ptr = (const uint8_t*)_data.data;
const uint8_t* limit = (const uint8_t*)_data.data + _data.size;
Status OrdinalIndexWriter::finish(WritableFile* file, ColumnIndexMetaPB* meta) {
CHECK(_page_builder->count() > 0) << "no entry has been added, file=" << file->filename();
meta->set_type(ORDINAL_INDEX);
BTreeMetaPB* root_page_meta = meta->mutable_ordinal_index()->mutable_root_page();
_num_pages = decode_fixed32_le(ptr);
ptr += 4;
if (_page_builder->count() == 1) {
// only one data page, no need to write index page
root_page_meta->set_is_root_data_page(true);
_last_pp.to_proto(root_page_meta->mutable_root_page());
} else {
OwnedSlice page_body;
PageFooterPB page_footer;
_page_builder->finish(&page_body, &page_footer);
// add an additional rowid to make row id computation convenient
_rowids = new rowid_t[_num_pages + 1];
_pages = new PagePointer[_num_pages];
for (int i = 0; i < _num_pages; ++i) {
ptr = decode_varint32_ptr(ptr, limit, &_rowids[i]);
if (ptr == nullptr) {
return Status::InternalError("Data corruption");
}
ptr = _pages[i].decode_from(ptr, limit);
if (ptr == nullptr) {
return Status::InternalError("Data corruption");
}
// write index page (currently it's not compressed)
PagePointer pp;
RETURN_IF_ERROR(PageIO::write_page(file, { page_body.slice() }, page_footer, &pp));
root_page_meta->set_is_root_data_page(false);
pp.to_proto(root_page_meta->mutable_root_page());
}
// set the additional last row id as number of rows
_rowids[_num_pages] = _num_rows;
return Status::OK();
}
OrdinalPageIndexIterator OrdinalPageIndex::seek_at_or_before(rowid_t rid) {
// Loads the ordinal index into memory.
// If the index meta points directly at a data page, a one-page index is synthesized
// and no file is read. Otherwise the index page is read through PageIO and every
// (first ordinal, page pointer) entry is decoded into `_ordinals` / `_pages`.
// `_ordinals` always gets one extra trailing entry equal to `_num_values`.
// `use_page_cache` and `kept_in_memory` are forwarded to the page read options.
Status OrdinalIndexReader::load(bool use_page_cache, bool kept_in_memory) {
    const auto& root_meta = _index_meta->root_page();
    if (root_meta.is_root_data_page()) {
        // only one data page, no index page
        _num_pages = 1;
        _ordinals.push_back(0);
        _ordinals.push_back(_num_values);
        _pages.emplace_back(root_meta.root_page());
        return Status::OK();
    }

    // need to read index page
    OpenedFileHandle<RandomAccessFile> file_handle;
    RETURN_IF_ERROR(FileManager::instance()->open_file(_filename, &file_handle));

    // throwaway statistics object: whatever is collected for this read is dropped on return
    OlapReaderStatistics discarded_stats;
    PageReadOptions read_opts;
    read_opts.file = file_handle.file();
    read_opts.page_pointer = PagePointer(root_meta.root_page());
    read_opts.codec = nullptr; // ordinal index page uses NO_COMPRESSION right now
    read_opts.stats = &discarded_stats;
    read_opts.use_page_cache = use_page_cache;
    read_opts.kept_in_memory = kept_in_memory;

    // read index page
    PageHandle page_handle;
    Slice page_body;
    PageFooterPB page_footer;
    RETURN_IF_ERROR(PageIO::read_and_decompress_page(read_opts, &page_handle, &page_body, &page_footer));

    // parse and save all (ordinal, pp) from index page
    IndexPageReader index_reader;
    RETURN_IF_ERROR(index_reader.parse(page_body, page_footer.index_page_footer()));

    _num_pages = index_reader.count();
    _ordinals.resize(_num_pages + 1);
    _pages.resize(_num_pages);
    for (int page_idx = 0; page_idx < _num_pages; ++page_idx) {
        Slice encoded_key = index_reader.get_key(page_idx);
        ordinal_t first_ordinal;
        RETURN_IF_ERROR(KeyCoderTraits<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::decode_ascending(
                &encoded_key, sizeof(ordinal_t), (uint8_t*) &first_ordinal, nullptr));
        _ordinals[page_idx] = first_ordinal;
        _pages[page_idx] = index_reader.get_value(page_idx);
    }
    // sentinel entry: lets get_last_ordinal() of the last page be computed like any other
    _ordinals[_num_pages] = _num_values;
    return Status::OK();
}
OrdinalPageIndexIterator OrdinalIndexReader::seek_at_or_before(ordinal_t ordinal) {
int32_t left = 0;
int32_t right = _num_pages - 1;
while (left < right) {
int32_t mid = (left + right + 1) / 2;
if (_rowids[mid] < rid) {
if (_ordinals[mid] < ordinal) {
left = mid;
} else if (_rowids[mid] > rid) {
} else if (_ordinals[mid] > ordinal) {
right = mid - 1;
} else {
left = mid;
break;
}
}
if (_rowids[left] > rid) {
if (_ordinals[left] > ordinal) {
return OrdinalPageIndexIterator(this, _num_pages);
}
return OrdinalPageIndexIterator(this, left);

View File

@ -18,154 +18,112 @@
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include "common/status.h"
#include "gutil/macros.h"
#include "olap/rowset/segment_v2/common.h"
#include "olap/rowset/segment_v2/index_page.h"
#include "olap/rowset/segment_v2/page_pointer.h"
#include "util/coding.h"
#include "util/slice.h"
namespace doris {
class WritableFile;
namespace segment_v2 {
// this class encode ordinal page index
// the binary format is like that
// Header | Content
// Header:
// number of pages (4 Bytes)
// Content:
// array of index_pair
// index_pair:
// Ordinal (4 Bytes)
// PagePointer (8 Bytes)
static const uint32_t ORDINAL_PAGE_INDEX_HEADER_SIZE = 4;
class OrdinalPageIndexBuilder {
// Ordinal index is implemented by one IndexPage that stores the first value ordinal
// and file pointer for each data page.
// But if there is only one data page, there is no need for index page. So we store
// the file pointer to that data page directly in index meta (OrdinalIndexPB).
class OrdinalIndexWriter {
public:
OrdinalPageIndexBuilder() : _num_pages(0) {
_buffer.reserve(4 * 1024);
// reserve space for number of pages
_buffer.resize(ORDINAL_PAGE_INDEX_HEADER_SIZE);
}
OrdinalIndexWriter() : _page_builder(new IndexPageBuilder(0, true)) {}
void append_entry(rowid_t rid, const PagePointer& page) {
// rid
put_varint32(&_buffer, rid);
// page pointer
page.encode_to(&_buffer);
_num_pages++;
}
void append_entry(ordinal_t ordinal, const PagePointer& data_pp);
uint64_t size() {
return _buffer.size();
}
uint64_t size() { return _page_builder->size(); }
Slice finish() {
// encoded number of pages
encode_fixed32_le((uint8_t*)_buffer.data(), _num_pages);
return Slice(_buffer);
}
Status finish(WritableFile* file, ColumnIndexMetaPB* meta);
private:
std::string _buffer;
uint32_t _num_pages;
DISALLOW_COPY_AND_ASSIGN(OrdinalIndexWriter);
std::unique_ptr<IndexPageBuilder> _page_builder;
PagePointer _last_pp;
};
class OrdinalPageIndex;
class OrdinalPageIndexIterator {
class OrdinalPageIndexIterator;
class OrdinalIndexReader {
public:
OrdinalPageIndexIterator() : _index(nullptr), _cur_idx(-1) { }
OrdinalPageIndexIterator(OrdinalPageIndex* index) : _index(index), _cur_idx(0) { }
OrdinalPageIndexIterator(OrdinalPageIndex* index, int cur_idx) : _index(index), _cur_idx(cur_idx) { }
inline bool valid() const;
inline void next();
inline rowid_t rowid() const;
inline int32_t cur_idx() const;
inline const PagePointer& page() const;
inline rowid_t cur_page_first_row_id() const;
inline rowid_t cur_page_last_row_id() const;
private:
OrdinalPageIndex* _index;
int32_t _cur_idx;
};
// Page index
class OrdinalPageIndex {
public:
OrdinalPageIndex(const Slice& data, uint64_t num_rows)
: _data(data), _num_rows(num_rows), _num_pages(0), _rowids(nullptr), _pages(nullptr) {
}
~OrdinalPageIndex();
Status load();
OrdinalPageIndexIterator seek_at_or_before(rowid_t rid);
OrdinalPageIndexIterator begin() {
return OrdinalPageIndexIterator(this);
}
OrdinalPageIndexIterator end() {
return OrdinalPageIndexIterator(this, _num_pages);
}
rowid_t get_first_row_id(int page_index) const {
return _rowids[page_index];
explicit OrdinalIndexReader(const std::string& filename,
const OrdinalIndexPB* index_meta,
ordinal_t num_values) :
_filename(filename), _index_meta(index_meta), _num_values(num_values) {
}
rowid_t get_last_row_id(int page_index) const {
// because add additional number of rows as the last rowid
// so just return next_page_first_id - 1
int next_page_index = page_index + 1;
return get_first_row_id(next_page_index) - 1;
// load and parse the index page into memory
Status load(bool use_page_cache, bool kept_in_memory);
OrdinalPageIndexIterator seek_at_or_before(ordinal_t ordinal);
inline OrdinalPageIndexIterator begin();
inline OrdinalPageIndexIterator end();
ordinal_t get_first_ordinal(int page_index) const {
return _ordinals[page_index];
}
int32_t num_pages() const {
return _num_pages;
ordinal_t get_last_ordinal(int page_index) const {
return get_first_ordinal(page_index + 1) - 1;
}
private:
uint32_t _header_size() const { return ORDINAL_PAGE_INDEX_HEADER_SIZE; }
// for test
int32_t num_data_pages() const { return _num_pages; }
private:
friend OrdinalPageIndexIterator;
Slice _data;
uint64_t _num_rows;
std::string _filename;
const OrdinalIndexPB* _index_meta;
// total number of values (including NULLs) in the indexed column,
// equal to 1 + 'last ordinal of the last data page'
ordinal_t _num_values;
// valid after load
int32_t _num_pages;
// the last row id is additional, set to number of rows
rowid_t* _rowids;
PagePointer* _pages;
// valid after load
int _num_pages = 0;
// _ordinals[i] = first ordinal of the i-th data page;
// one extra trailing entry, _ordinals[_num_pages], holds _num_values
std::vector<ordinal_t> _ordinals;
// _pages[i] = page pointer to the i-th data page
std::vector<PagePointer> _pages;
};
inline bool OrdinalPageIndexIterator::valid() const {
return _cur_idx < _index->_num_pages;
// Cursor over the data pages described by an OrdinalIndexReader.
// The iterator does not own the reader; it stores a raw pointer to it together
// with the index of the current data page.
class OrdinalPageIndexIterator {
public:
    // Creates an iterator that is not bound to any index; valid() returns false for it.
    OrdinalPageIndexIterator() : _index(nullptr), _cur_idx(-1) { }
    // Creates an iterator positioned at the first data page of `index`.
    OrdinalPageIndexIterator(OrdinalIndexReader* index) : _index(index), _cur_idx(0) { }
    // Creates an iterator positioned at data page `cur_idx` of `index`.
    OrdinalPageIndexIterator(OrdinalIndexReader* index, int cur_idx) : _index(index), _cur_idx(cur_idx) { }

    // Returns true while the iterator points at an existing data page.
    // A default-constructed iterator has no index; the null check keeps this
    // query from dereferencing a null pointer in that state.
    bool valid() const { return _index != nullptr && _cur_idx < _index->_num_pages; }

    // Advances to the next data page. Must only be called on a valid iterator.
    void next() {
        DCHECK_LT(_cur_idx, _index->_num_pages);
        _cur_idx++;
    }

    // Index of the current data page within the ordinal index.
    int32_t page_index() const { return _cur_idx; }
    // Pointer to the current data page. Must only be called on a valid iterator.
    const PagePointer& page() const { return _index->_pages[_cur_idx]; }
    // First ordinal stored in the current data page.
    ordinal_t first_ordinal() const { return _index->get_first_ordinal(_cur_idx); }
    // Last ordinal stored in the current data page.
    ordinal_t last_ordinal() const { return _index->get_last_ordinal(_cur_idx); }
private:
    OrdinalIndexReader* _index;
    int32_t _cur_idx;
};
// Returns an iterator positioned at the first data page (page index 0).
OrdinalPageIndexIterator OrdinalIndexReader::begin() {
return OrdinalPageIndexIterator(this);
}
inline void OrdinalPageIndexIterator::next() {
DCHECK_LT(_cur_idx, _index->_num_pages);
_cur_idx++;
}
inline rowid_t OrdinalPageIndexIterator::rowid() const {
return _index->_rowids[_cur_idx];
}
int32_t OrdinalPageIndexIterator::cur_idx() const {
return _cur_idx;
}
inline const PagePointer& OrdinalPageIndexIterator::page() const {
return _index->_pages[_cur_idx];
}
rowid_t OrdinalPageIndexIterator::cur_page_first_row_id() const {
return _index->get_first_row_id(_cur_idx);
}
rowid_t OrdinalPageIndexIterator::cur_page_last_row_id() const {
return _index->get_last_row_id(_cur_idx);
// Returns the past-the-end iterator: its page index equals _num_pages,
// so it does not refer to any data page.
OrdinalPageIndexIterator OrdinalIndexReader::end() {
return OrdinalPageIndexIterator(this, _num_pages);
}
}

View File

@ -1,120 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/page_compression.h"
#include "gutil/strings/substitute.h"
#include "util/block_compression.h"
#include "util/coding.h"
namespace doris {
namespace segment_v2 {
using strings::Substitute;
// Decodes a page laid out as `Data, UncompressedSize(fixed32)`.
//
// When sizeof(Data) equals the recorded uncompressed size the data was stored
// verbatim, and `uncompressed_data` is set to a view into the input page.
// Otherwise Data is decompressed into a freshly allocated `new char[]` buffer
// whose ownership is handed to the caller through `uncompressed_data`.
//
// Returns Status::Corruption when the page is shorter than the 4-byte size
// footer or when the decompressed size differs from the recorded size.
Status PageDecompressor::decompress_to(Slice* uncompressed_data) {
    if (_data.size < 4) {
        return Status::Corruption(
            Substitute("Compressed page's size is too small, size=$0, needed=$1",
                       _data.size, 4));
    }

    // The last four bytes hold the uncompressed size; everything before is the payload.
    const size_t payload_size = _data.size - 4;
    const uint32_t recorded_size =
            decode_fixed32_le(reinterpret_cast<const uint8_t*>(_data.data + payload_size));
    Slice payload(_data.data, payload_size);

    if (payload.size == recorded_size) {
        // Stored without compression: hand back a view into the input.
        *uncompressed_data = payload;
        return Status::OK();
    }

    std::unique_ptr<char[]> output(new char[recorded_size]);
    Slice decompressed(output.get(), recorded_size);
    RETURN_IF_ERROR(_codec->decompress(payload, &decompressed));
    if (decompressed.size != recorded_size) {
        // A size mismatch after decompression means the page is corrupt.
        return Status::Corruption(
            Substitute("Uncompressed size not match, record=$0 vs decompress=$1",
                       recorded_size, decompressed.size));
    }

    // Ownership of the buffer moves to the caller.
    *uncompressed_data = Slice(output.release(), recorded_size);
    return Status::OK();
}
// Builds a page `Data, UncompressedSize(fixed32)` from `raw_data` and appends
// the slices that make it up to `compressed_slices`.
//
// When compression does not shrink the data, or saves less than
// `_min_space_saving`, the input slices are appended unchanged followed by the
// 4-byte size footer. Otherwise a single slice holding the compressed data plus
// the footer is appended. Appended slices reference `raw_data` and/or this
// object's internal buffer.
Status PageCompressor::compress(const std::vector<Slice>& raw_data,
                                std::vector<Slice>* compressed_slices) {
    const size_t raw_size = Slice::compute_total_size(raw_data);
    const size_t capacity = _codec->max_compressed_len(raw_size);

    // Reserve four extra bytes for the size footer.
    _buf.resize(capacity + 4);
    Slice packed(_buf.data(), capacity);
    RETURN_IF_ERROR(_codec->compress(raw_data, &packed));

    const double saving = 1.0 - static_cast<double>(packed.size) / raw_size;
    // The integer comparison comes first so the decision is exact when sizes are equal.
    const bool store_raw = packed.size >= raw_size || saving < _min_space_saving;

    if (!store_raw) {
        // The footer goes right behind the compressed bytes; hand out both as one slice.
        encode_fixed32_le(reinterpret_cast<uint8_t*>(_buf.data()) + packed.size, raw_size);
        compressed_slices->emplace_back(_buf.data(), 4 + packed.size);
        return Status::OK();
    }

    // Not worth compressing: pass the input through and append only the footer,
    // which avoids the CPU cost of decompression on read.
    compressed_slices->insert(compressed_slices->end(), raw_data.begin(), raw_data.end());
    encode_fixed32_le(reinterpret_cast<uint8_t*>(_buf.data()), raw_size);
    compressed_slices->emplace_back(_buf.data(), 4);
    return Status::OK();
}
// Compresses `raw_data` into `compressed_data`, laid out as
// `Data, UncompressedSize(fixed32)`, and reports through `compressed` whether
// compression was applied.
//
// When compression does not shrink the data, or saves less than
// `_min_space_saving`, `compressed_data` is built from an empty buffer and
// `*compressed` is set to false; the caller keeps using `raw_data`.
Status PageCompressor::compress(const std::vector<Slice>& raw_data,
                                OwnedSlice* compressed_data, bool* compressed) {
    const size_t raw_size = Slice::compute_total_size(raw_data);
    const size_t capacity = _codec->max_compressed_len(raw_size);

    // Reserve four extra bytes for the size footer.
    _buf.resize(capacity + 4);
    Slice packed(_buf.data(), capacity);
    RETURN_IF_ERROR(_codec->compress(raw_data, &packed));

    const double saving = 1.0 - static_cast<double>(packed.size) / raw_size;
    // The integer comparison comes first so the decision is exact when sizes are equal.
    const bool store_raw = packed.size >= raw_size || saving < _min_space_saving;

    if (store_raw) {
        // Not worth compressing: return an empty buffer and flag it as uncompressed.
        _buf.resize(0);
        *compressed = false;
    } else {
        // Write the footer behind the compressed bytes and trim the buffer to fit.
        encode_fixed32_le(reinterpret_cast<uint8_t*>(_buf.data()) + packed.size, raw_size);
        _buf.resize(packed.size + 4);
        *compressed = true;
    }
    *compressed_data = _buf.build();
    return Status::OK();
}
}
}

View File

@ -1,103 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstddef>
#include <vector>
#include "common/status.h"
#include "util/slice.h"
#include "util/faststring.h"
namespace doris {
class BlockCompressionCodec;
namespace segment_v2 {
// Utility class for parsing and decompressing compressed page.
// Format of compressed page := Data, UncompressedSize(fixed32)
// When sizeof(Data) == UncompressedSize, it means Data is stored in uncompressed
// form, thus decompression is not needed.
// Otherwise Data is in compressed form and should be decompressed.
// The type of compression codec for Data is stored elsewhere and should
// be passed into the constructor.
// Usage example:
// // page_slice refers to page read from storage
// PageDecompressor decompressor(page_slice, codec);
// // points to decompressed Data of the page (without footer)
// Slice uncompressed_slice;
// RETURN_IF_ERROR(decompressor.decompress_to(&uncompressed_slice));
// // use uncompressed_slice
// // we have a new buffer for decompressed page
// if (uncompressed_slice.data != page_slice.data) {
// delete[] page_slice.data;
// }
class PageDecompressor {
public:
// `compressed_data` is the whole page as read from storage (Data followed by
// the 4-byte uncompressed size); `codec` is used to decompress Data.
// Neither argument is owned by this object.
PageDecompressor(const Slice& compressed_data, const BlockCompressionCodec* codec)
: _data(compressed_data), _codec(codec) {
}
// Sets `uncompressed_data` to the uncompressed content of the page (without footer).
// - If the page was stored uncompressed, `uncompressed_data` points into the input
//   page (uncompressed_data.data == compressed_data.data). No new memory is
//   allocated and the caller must not free the result separately.
// - Otherwise `uncompressed_data` points to a newly allocated char[] buffer that
//   the caller owns; the input page buffer is no longer needed and can be freed
//   by the caller.
// Returns Status::Corruption if the page is too small or the decompressed size
// does not match the recorded size.
Status decompress_to(Slice* uncompressed_data);
private:
// the page to decode: Data, UncompressedSize(fixed32)
Slice _data;
// codec used to decompress Data, not owned
const BlockCompressionCodec* _codec;
};
// Helper to build a compress page.
// Usage:
// std::vector<Slice> raw_data;
// PageCompressor compressor(codec, 0.1);
// std::vector<Slice> compressed_data;
// compressor.compress(raw_data, &compressed_data)
class PageCompressor {
public:
// `codec` performs the compression and is not owned by this object.
// `min_space_saving` is the minimal fraction of space (1 - compressed/uncompressed)
// that compression must save for the compressed form to be used.
PageCompressor(const BlockCompressionCodec* codec, double min_space_saving = 0.1)
: _codec(codec), _min_space_saving(min_space_saving) {
}
// Try to compress input raw data into a compressed page using the given
// BlockCompressionCodec and append the resulting slices to `compressed_data`.
// If the compressed page is not sufficiently smaller than the raw data, the raw
// slices are appended unchanged, followed by the 4-byte size footer.
// The appended slices reference `raw_data` and/or this object's internal buffer.
Status compress(const std::vector<Slice>& raw_data,
std::vector<Slice>* compressed_data);
// Try to compress input raw data into a compressed page returned as an OwnedSlice.
// If the compressed page is not sufficiently smaller than the raw data,
// `*compressed` is set to false and `compressed_data` is built from an empty
// buffer; otherwise `*compressed` is set to true and `compressed_data` holds the
// compressed data followed by the 4-byte size footer.
Status compress(const std::vector<Slice>& raw_data,
OwnedSlice* compressed_data, bool* compressed);
private:
// codec used for compression, not owned
const BlockCompressionCodec* _codec;
// If space saving is lower than _min_space_saving, compress will return origin data
double _min_space_saving;
// used to store compressed data
faststring _buf;
};
}
}

View File

@ -18,7 +18,6 @@
#pragma once
#include "olap/column_block.h" // for ColumnBlockView
#include "olap/rowset/segment_v2/common.h" // for rowid_t
#include "common/status.h" // for Status
namespace doris {

View File

@ -65,7 +65,7 @@ public:
}
}
// This function only valid when assign valid data, either in cache or not
// the return slice contains uncompressed page body, page footer, and footer size
Slice data() const {
if (_is_data_owner) {
return _data;

View File

@ -0,0 +1,208 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/page_io.h"
#include <cstring>
#include <string>
#include "common/logging.h"
#include "env/env.h"
#include "gutil/strings/substitute.h"
#include "olap/page_cache.h"
#include "util/block_compression.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/faststring.h"
#include "util/runtime_profile.h"
namespace doris {
namespace segment_v2 {
using strings::Substitute;
// Compresses `body` with `codec` into `compressed_body`.
//
// `compressed_body` is left empty (size 0) when the body should be stored
// uncompressed: `codec` is null, the body is empty, or compression saves
// nothing or less than `min_space_saving`.
// Returns the codec's error status when compression itself fails.
Status PageIO::compress_page_body(const BlockCompressionCodec* codec,
                                  double min_space_saving,
                                  const std::vector<Slice>& body,
                                  OwnedSlice* compressed_body) {
    const size_t raw_size = Slice::compute_total_size(body);
    const bool try_compress = (codec != nullptr) && (raw_size > 0);
    if (try_compress) {
        faststring scratch;
        scratch.resize(codec->max_compressed_len(raw_size));
        Slice packed(scratch);
        RETURN_IF_ERROR(codec->compress(body, &packed));
        // shrink the buffer to the number of bytes actually produced
        scratch.resize(packed.get_size());

        const double saving = 1.0 - static_cast<double>(scratch.size()) / raw_size;
        const bool worthwhile = saving > 0 && saving >= min_space_saving;
        if (worthwhile) {
            *compressed_body = scratch.build();
            return Status::OK();
        }
    }
    // an empty slice tells the caller to keep the uncompressed body
    *compressed_body = OwnedSlice();
    return Status::OK();
}
// Appends one page to `file` using the layout
//   PageBody, PageFooter, FooterSize(4), Checksum(4)
// where Checksum is the crc32c of everything before it.
//
// `body` may be compressed or uncompressed; `footer` must have its type,
// uncompressed_size and the type-specific sub-footer set (enforced by CHECK).
// On success `result` receives the offset and total size of the written page.
Status PageIO::write_page(WritableFile* file,
                          const std::vector<Slice>& body,
                          const PageFooterPB& footer,
                          PagePointer* result) {
    // sanity check of page footer
    CHECK(footer.has_type()) << "type must be set";
    CHECK(footer.has_uncompressed_size()) << "uncompressed_size must be set";
    switch (footer.type()) {
    case DATA_PAGE:
        CHECK(footer.has_data_page_footer());
        break;
    case INDEX_PAGE:
        CHECK(footer.has_index_page_footer());
        break;
    case DICTIONARY_PAGE:
        CHECK(footer.has_dict_page_footer());
        break;
    case SHORT_KEY_PAGE:
        CHECK(footer.has_short_key_page_footer());
        break;
    default:
        CHECK(false) << "Invalid page footer type: " << footer.type();
        break;
    }

    // serialized footer followed by its own length
    std::string footer_buf;
    footer.SerializeToString(&footer_buf);
    put_fixed32_le(&footer_buf, static_cast<uint32_t>(footer_buf.size()));

    // assemble the parts covered by the checksum: body + footer + footer size
    std::vector<Slice> parts;
    parts.reserve(body.size() + 2);
    parts.insert(parts.end(), body.begin(), body.end());
    parts.emplace_back(footer_buf);

    // checksum over everything assembled so far, appended as the last part
    uint8_t checksum_buf[sizeof(uint32_t)];
    encode_fixed32_le(checksum_buf, crc32c::Value(parts));
    parts.emplace_back(checksum_buf, sizeof(uint32_t));

    const uint64_t start_offset = file->size();
    RETURN_IF_ERROR(file->appendv(parts.data(), parts.size()));

    result->offset = start_offset;
    result->size = file->size() - start_offset;
    return Status::OK();
}
// Reads the page located by `opts.page_pointer`, either from the page cache or
// from `opts.file`, and parses it.
//
// On-disk layout:   PageBody, PageFooter, FooterSize(4), Checksum(4)
// In-memory layout: uncompressed PageBody, PageFooter, FooterSize(4)
// (the in-memory layout is what `handle` and the page cache hold).
//
// On success `handle` owns the page memory, `body` points at the uncompressed
// page body inside that memory, and `footer` holds the parsed PageFooterPB.
//
// Returns Status::Corruption when the page is too small, the checksum does not
// match (only checked if `opts.verify_checksum`), the recorded footer size does
// not fit inside the page, the footer cannot be parsed, the body is compressed
// but no codec was given, or the decompressed size disagrees with the footer.
// Errors from file IO and from the codec are propagated unchanged.
Status PageIO::read_and_decompress_page(const PageReadOptions& opts,
                                        PageHandle* handle,
                                        Slice* body,
                                        PageFooterPB* footer) {
    opts.sanity_check();
    opts.stats->total_pages_num++;

    auto cache = StoragePageCache::instance();
    PageCacheHandle cache_handle;
    StoragePageCache::CacheKey cache_key(opts.file->file_name(), opts.page_pointer.offset);
    if (opts.use_page_cache && cache->lookup(cache_key, &cache_handle)) {
        // we find page in cache, use it
        *handle = PageHandle(std::move(cache_handle));
        opts.stats->cached_pages_num++;
        // parse body and footer
        Slice page_slice = handle->data();
        if (page_slice.size < 4) {
            return Status::Corruption(
                Substitute("Bad page: too small size ($0)", page_slice.size));
        }
        uint32_t footer_size = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4);
        // the footer must fit in front of the 4 bytes that store its size
        if (footer_size > page_slice.size - 4) {
            return Status::Corruption(
                Substitute("Bad page: invalid footer size ($0)", footer_size));
        }
        std::string footer_buf(page_slice.data + page_slice.size - 4 - footer_size, footer_size);
        if (!footer->ParseFromString(footer_buf)) {
            return Status::Corruption("Bad page: invalid footer");
        }
        *body = Slice(page_slice.data, page_slice.size - 4 - footer_size);
        return Status::OK();
    }

    // every page contains 4 bytes footer length and 4 bytes checksum
    const uint32_t page_size = opts.page_pointer.size;
    if (page_size < 8) {
        return Status::Corruption(Substitute("Bad page: too small size ($0)", page_size));
    }

    // hold compressed page at first, reset to decompressed page later
    std::unique_ptr<char[]> page(new char[page_size]);
    Slice page_slice(page.get(), page_size);
    {
        SCOPED_RAW_TIMER(&opts.stats->io_ns);
        RETURN_IF_ERROR(opts.file->read_at(opts.page_pointer.offset, page_slice));
        opts.stats->compressed_bytes_read += page_size;
    }

    if (opts.verify_checksum) {
        uint32_t expect = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4);
        uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4);
        if (expect != actual) {
            return Status::Corruption(Substitute(
                "Bad page: checksum mismatch (actual=$0 vs expect=$1)", actual, expect));
        }
    }

    // remove checksum suffix
    page_slice.size -= 4;
    // parse and set footer
    uint32_t footer_size = decode_fixed32_le((uint8_t*) page_slice.data + page_slice.size - 4);
    // footer_size comes from the file and is untrusted (in particular when the
    // checksum is not verified): reject values that do not fit inside the page,
    // otherwise the pointer/size arithmetic below would read out of bounds.
    if (footer_size > page_slice.size - 4) {
        return Status::Corruption(Substitute(
            "Bad page: invalid footer size ($0), page size=$1", footer_size, page_size));
    }
    if (!footer->ParseFromArray(page_slice.data + page_slice.size - 4 - footer_size, footer_size)) {
        return Status::Corruption("Bad page: invalid footer");
    }

    uint32_t body_size = page_slice.size - 4 - footer_size;
    if (body_size != footer->uncompressed_size()) { // need decompress body
        if (opts.codec == nullptr) {
            return Status::Corruption("Bad page: page is compressed but codec is NO_COMPRESSION");
        }
        SCOPED_RAW_TIMER(&opts.stats->decompress_ns);
        std::unique_ptr<char[]> decompressed_page(
                new char[footer->uncompressed_size() + footer_size + 4]);

        // decompress page body
        Slice compressed_body(page_slice.data, body_size);
        Slice decompressed_body(decompressed_page.get(), footer->uncompressed_size());
        RETURN_IF_ERROR(opts.codec->decompress(compressed_body, &decompressed_body));
        if (decompressed_body.size != footer->uncompressed_size()) {
            return Status::Corruption(Substitute(
                "Bad page: record uncompressed size=$0 vs real decompressed size=$1",
                footer->uncompressed_size(), decompressed_body.size));
        }
        // append footer and footer size
        memcpy(decompressed_body.data + decompressed_body.size,
               page_slice.data + body_size,
               footer_size + 4);
        // free memory of compressed page
        page = std::move(decompressed_page);
        page_slice = Slice(page.get(), footer->uncompressed_size() + footer_size + 4);
        opts.stats->uncompressed_bytes_read += page_slice.size;
    }

    *body = Slice(page_slice.data, page_slice.size - 4 - footer_size);
    if (opts.use_page_cache) {
        // insert this page into cache and return the cache handle
        cache->insert(cache_key, page_slice, &cache_handle, opts.kept_in_memory);
        *handle = PageHandle(std::move(cache_handle));
    } else {
        *handle = PageHandle(page_slice);
    }
    page.release(); // memory now managed by handle
    return Status::OK();
}
} // namespace segment_v2
} // namespace doris

View File

@ -0,0 +1,116 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <vector>
#include "common/logging.h"
#include "common/status.h"
#include "gen_cpp/segment_v2.pb.h"
#include "olap/rowset/segment_v2/page_handle.h"
#include "olap/rowset/segment_v2/page_pointer.h"
#include "util/slice.h"
namespace doris {
class BlockCompressionCodec;
struct OlapReaderStatistics;
class RandomAccessFile;
class WritableFile;
namespace segment_v2 {
// Options describing which page to read and how to read it.
// `file` and `stats` are mandatory (see sanity_check()); pointer members are not
// owned by this struct.
struct PageReadOptions {
// file to read page
RandomAccessFile* file = nullptr;
// location of the page
PagePointer page_pointer;
// decompressor for page body (null means page body is not compressed)
const BlockCompressionCodec* codec = nullptr;
// used to collect IO metrics
OlapReaderStatistics* stats = nullptr;
// whether to verify page checksum
bool verify_checksum = true;
// whether to use page cache in read path
bool use_page_cache = true;
// if true, use DURABLE CachePriority in page cache
// currently used for in memory olap table
bool kept_in_memory = false;
// Aborts (CHECK) when a mandatory option, `file` or `stats`, is missing.
void sanity_check() const {
CHECK_NOTNULL(file);
CHECK_NOTNULL(stats);
}
};
// Utility class for read and write page. All types of page share the same general layout:
// Page := PageBody, PageFooter, FooterSize(4), Checksum(4)
// - PageBody is defined by page type and may be compressed
// - PageFooter is serialized PageFooterPB. It contains the page type, uncompressed_size
// (the size of the uncompressed PageBody), and other custom metadata. PageBody is not
// compressed when its size is equal to uncompressed_size
// - FooterSize stores the size of PageFooter
// - Checksum is the crc32c checksum of all previous part
class PageIO {
public:
    // Compresses `body` with `codec` and stores the outcome in `compressed_body`.
    // `compressed_body` has size 0 when the body is left uncompressed, which
    // happens when `codec' is null or the space saving is below `min_space_saving'.
    static Status compress_page_body(const BlockCompressionCodec* codec,
                                     double min_space_saving,
                                     const std::vector<Slice>& body,
                                     OwnedSlice* compressed_body);

    // Encodes a page from `body' and `footer' and appends it to `file'.
    // `body' may be either compressed or uncompressed.
    // On success `result' holds the file pointer to the written page.
    static Status write_page(WritableFile* file,
                             const std::vector<Slice>& body,
                             const PageFooterPB& footer,
                             PagePointer* result);

    // Convenience wrapper: compress the page body, then write the page.
    // `footer.uncompressed_size()` is expected to equal the total size of `body`.
    static Status compress_and_write_page(const BlockCompressionCodec* codec,
                                          double min_space_saving,
                                          WritableFile* file,
                                          const std::vector<Slice>& body,
                                          const PageFooterPB& footer,
                                          PagePointer* result) {
        DCHECK_EQ(footer.uncompressed_size(), Slice::compute_total_size(body));
        OwnedSlice compressed_body;
        RETURN_IF_ERROR(compress_page_body(codec, min_space_saving, body, &compressed_body));
        const Slice packed = compressed_body.slice();
        if (!packed.empty()) {
            // compression paid off: write the compressed bytes as the page body
            return write_page(file, { packed }, footer, result);
        }
        // an empty result means "keep the body uncompressed"
        return write_page(file, body, footer, result);
    }

    // Reads and parses the page described by `opts'.
    // On success
    //     `handle' holds the memory of page data,
    //     `body' points to page body,
    //     `footer' stores the page footer.
    static Status read_and_decompress_page(const PageReadOptions& opts,
                                           PageHandle* handle,
                                           Slice* body,
                                           PageFooterPB* footer);
};
} // namespace segment_v2
} // namespace doris

View File

@ -17,47 +17,88 @@
#pragma once
#include "olap/rowset/segment_v2/page_decoder.h" // for PagePointer
#include "util/rle_encoding.h" // for RleDecoder
#include <memory>
#include "common/status.h"
#include "gen_cpp/segment_v2.pb.h"
#include "olap/rowset/segment_v2/common.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/options.h"
#include "olap/rowset/segment_v2/page_decoder.h"
#include "olap/rowset/segment_v2/page_handle.h"
#include "util/rle_encoding.h"
namespace doris {
namespace segment_v2 {
class PageHandle;
struct PagePointer;
// This contains information when one page is loaded, and ready for read
// This struct can be reused, client should call reset first before reusing
// this object
struct ParsedPage {
ParsedPage() { }
// Builds a ParsedPage from an already read (and uncompressed) data page.
//   handle       - keeps the page memory alive; moved into the result
//   body         - page body: encoded values followed by the null bitmap
//   footer       - data page footer (null bitmap size, first ordinal, value count)
//   encoding     - used to create the decoder for the value part of the body
//   page_pointer - location of the page, stored as-is
//   page_index   - index of the page, stored as-is
//   result       - receives the new page on success
// Returns the error of decoder creation/initialization on failure.
// NOTE(review): footer.nullmap_size() is not checked against body.size before
// it is used to split the body - confirm the footer is trusted here.
static Status create(PageHandle handle,
const Slice& body,
const DataPageFooterPB& footer,
const EncodingInfo* encoding,
const PagePointer& page_pointer,
uint32_t page_index,
std::unique_ptr<ParsedPage>* result) {
std::unique_ptr<ParsedPage> page(new ParsedPage);
page->page_handle = std::move(handle);
// the null bitmap occupies the last `null_size` bytes of the body
auto null_size = footer.nullmap_size();
page->has_null = null_size > 0;
page->null_bitmap = Slice(body.data + body.size - null_size, null_size);
if (page->has_null) {
// the bitmap is RLE encoded with a bit width of 1
page->null_decoder = RleDecoder<bool>(
(const uint8_t*) page->null_bitmap.data, null_size, 1);
}
// everything before the null bitmap is the encoded values
Slice data_slice(body.data, body.size - null_size);
PageDecoderOptions opts;
RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, &page->data_decoder));
RETURN_IF_ERROR(page->data_decoder->init());
page->first_ordinal = footer.first_ordinal();
page->num_rows = footer.num_values();
page->page_pointer = page_pointer;
page->page_index = page_index;
*result = std::move(page);
return Status::OK();
}
// data_decoder is a raw owning pointer and is released here
~ParsedPage() {
delete data_decoder;
}
PagePointer page_pointer;
PageHandle page_handle;
bool has_null;
Slice null_bitmap;
RleDecoder<bool> null_decoder;
PageDecoder* data_decoder = nullptr;
// first rowid for this page
rowid_t first_rowid = 0;
// ordinal of the first value in this page
ordinal_t first_ordinal = 0;
// number of rows including nulls and not-nulls
uint32_t num_rows = 0;
ordinal_t num_rows = 0;
PagePointer page_pointer;
uint32_t page_index = 0;
// current offset when read this page
// this means next row we will read
uint32_t offset_in_page = 0;
ordinal_t offset_in_page = 0;
uint32_t page_index = 0;
// true when `rid` lies in [first_rowid, first_rowid + num_rows)
bool contains(rowid_t rid) { return rid >= first_rowid && rid < (first_rowid + num_rows); }
// rowid of the last row of this page
rowid_t last_rowid() { return first_rowid + num_rows - 1; }
// true when `ord` lies in [first_ordinal, first_ordinal + num_rows)
bool contains(ordinal_t ord) { return ord >= first_ordinal && ord < (first_ordinal + num_rows); }
// true while offset_in_page has not reached num_rows
bool has_remaining() const { return offset_in_page < num_rows; }
// number of rows between offset_in_page and the end of the page
size_t remaining() const { return num_rows - offset_in_page; }
private:
// client should use create() factory method
ParsedPage() = default;
};
}

View File

@ -17,7 +17,6 @@
#pragma once
#include "olap/rowset/segment_v2/common.h" // for rowid_t
#include "olap/rowset/segment_v2/options.h" // for PageBuilderOptions/PageDecoderOptions
#include "olap/rowset/segment_v2/page_builder.h" // for PageBuilder
#include "olap/rowset/segment_v2/page_decoder.h" // for PageDecoder

View File

@ -21,6 +21,7 @@
#include "env/env.h" // RandomAccessFile
#include "gutil/strings/substitute.h"
#include "olap/rowset/segment_v2/column_reader.h" // ColumnReader
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/segment_writer.h" // k_segment_magic_length
#include "olap/rowset/segment_v2/segment_iterator.h"
#include "olap/rowset/segment_v2/empty_segment_iterator.h"
@ -68,41 +69,10 @@ Status Segment::new_iterator(const Schema& schema,
if (read_options.conditions != nullptr) {
for (auto& column_condition : read_options.conditions->columns()) {
int32_t column_id = column_condition.first;
auto entry = _column_id_to_footer_ordinal.find(column_id);
if (entry == _column_id_to_footer_ordinal.end()) {
if (_column_readers[column_id] == nullptr || !_column_readers[column_id]->has_zone_map()) {
continue;
}
auto& c_meta = _footer.columns(entry->second);
if (!c_meta.has_zone_map()) {
continue;
}
auto& c_zone_map = c_meta.zone_map();
if (!c_zone_map.has_not_null() && !c_zone_map.has_null()) {
// no data
iter->reset(new EmptySegmentIterator(schema));
return Status::OK();
}
// TODO Logic here and the similar logic in ColumnReader::_get_filtered_pages should be unified.
TypeInfo* type_info = get_type_info((FieldType)c_meta.type());
if (type_info == nullptr) {
return Status::NotSupported(Substitute("unsupported typeinfo, type=$0", c_meta.type()));
}
FieldType type = type_info->type();
const Field* field = schema.column(column_id);
int32_t var_length = field->length();
std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, var_length));
std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, var_length));
if (c_zone_map.has_not_null()) {
min_value->from_string(c_zone_map.min());
max_value->from_string(c_zone_map.max());
}
if (c_zone_map.has_null()) {
min_value->set_null();
if (!c_zone_map.has_not_null()) {
max_value->set_null();
}
}
if (!column_condition.second->eval({min_value.get(), max_value.get()})) {
if (!_column_readers[column_id]->match_condition(column_condition.second)) {
// any condition not satisfied, return.
iter->reset(new EmptySegmentIterator(schema));
return Status::OK();
@ -164,18 +134,25 @@ Status Segment::_parse_footer() {
// Loads the short key index of this segment. The work is wrapped in
// _load_index_once, so the body below is meant to run at most once.
Status Segment::_load_index() {
return _load_index_once.call([this] {
// read short key index content
// read and parse short key index page
OpenedFileHandle<RandomAccessFile> file_handle;
RETURN_IF_ERROR(FileManager::instance()->open_file(_fname, &file_handle));
RandomAccessFile* input_file = file_handle.file();
_sk_index_buf.resize(_footer.short_key_index_page().size());
Slice slice(_sk_index_buf.data(), _sk_index_buf.size());
RETURN_IF_ERROR(input_file->read_at(_footer.short_key_index_page().offset(), slice));
// Parse short key index
_sk_index_decoder.reset(new ShortKeyIndexDecoder(_sk_index_buf));
RETURN_IF_ERROR(_sk_index_decoder->parse());
return Status::OK();
PageReadOptions opts;
opts.file = file_handle.file();
// the page location comes from the segment footer
opts.page_pointer = PagePointer(_footer.short_key_index_page());
opts.codec = nullptr; // short key index page uses NO_COMPRESSION for now
// PageReadOptions requires a stats object; the collected values are not used here
OlapReaderStatistics tmp_stats;
opts.stats = &tmp_stats;
Slice body;
PageFooterPB footer;
// _sk_index_handle keeps the page memory alive for the decoder
RETURN_IF_ERROR(PageIO::read_and_decompress_page(opts, &_sk_index_handle, &body, &footer));
// NOTE(review): the page type is only verified by DCHECK - confirm that is
// sufficient for builds where DCHECK is compiled out
DCHECK_EQ(footer.type(), SHORT_KEY_PAGE);
DCHECK(footer.has_short_key_page_footer());
_sk_index_decoder.reset(new ShortKeyIndexDecoder);
return _sk_index_decoder->parse(body, footer.short_key_page_footer());
});
}
@ -194,7 +171,7 @@ Status Segment::_create_column_readers() {
}
ColumnReaderOptions opts;
opts.cache_in_memory = _tablet_schema->is_in_memory();
opts.kept_in_memory = _tablet_schema->is_in_memory();
std::unique_ptr<ColumnReader> reader;
// pass Descriptor<RandomAccessFile>* to column reader
RETURN_IF_ERROR(ColumnReader::create(

View File

@ -26,7 +26,7 @@
#include "gen_cpp/segment_v2.pb.h"
#include "gutil/macros.h"
#include "olap/iterators.h"
#include "olap/rowset/segment_v2/common.h" // rowid_t
#include "olap/rowset/segment_v2/page_handle.h"
#include "olap/short_key_index.h"
#include "olap/tablet_schema.h"
#include "util/faststring.h"
@ -141,8 +141,8 @@ private:
// used to guarantee that short key index will be loaded at most once in a thread-safe way
DorisCallOnce<Status> _load_index_once;
// used to store short key index
faststring _sk_index_buf;
// used to hold short key index page in memory
PageHandle _sk_index_handle;
// short key index decoder
std::unique_ptr<ShortKeyIndexDecoder> _sk_index_decoder;
};

View File

@ -27,8 +27,6 @@
#include "olap/rowset/segment_v2/segment.h"
#include "olap/schema.h"
#include "olap/rowset/segment_v2/row_ranges.h"
#include "olap/rowset/segment_v2/column_zone_map.h"
#include "olap/rowset/segment_v2/ordinal_page_index.h"
#include "olap/olap_cond.h"
#include "util/file_cache.h"

View File

@ -19,9 +19,9 @@
#include "env/env.h" // Env
#include "olap/row.h" // ContiguousRow
#include "olap/row_block.h" // RowBlock
#include "olap/row_cursor.h" // RowCursor
#include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/short_key_index.h"
#include "util/crc32c.h"
@ -48,16 +48,20 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) {
uint32_t column_id = 0;
for (auto& column : _tablet_schema->columns()) {
ColumnMetaPB* column_meta = _footer.add_columns();
// TODO(zc): Do we need this column_id??
column_meta->set_column_id(column_id++);
column_meta->set_unique_id(column.unique_id());
bool is_nullable = column.is_nullable();
column_meta->set_is_nullable(is_nullable);
column_meta->set_length(column.length());
std::unique_ptr<Field> field(FieldFactory::create(column));
DCHECK(field.get() != nullptr);
ColumnWriterOptions opts;
opts.compression_type = segment_v2::CompressionTypePB::LZ4F;
opts.meta = _footer.add_columns();
// TODO(zc): Do we need this column_id??
opts.meta->set_column_id(column_id++);
opts.meta->set_unique_id(column.unique_id());
opts.meta->set_type(field->type());
opts.meta->set_length(column.length());
opts.meta->set_encoding(DEFAULT_ENCODING);
opts.meta->set_compression(LZ4F);
opts.meta->set_is_nullable(column.is_nullable());
// now we create zone map for key columns
if (column.is_key()) {
opts.need_zone_map = true;
@ -85,9 +89,8 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) {
}
}
std::unique_ptr<Field> field(FieldFactory::create(column));
DCHECK(field.get() != nullptr);
std::unique_ptr<ColumnWriter> writer(new ColumnWriter(opts, std::move(field), is_nullable, _output_file.get()));
std::unique_ptr<ColumnWriter> writer(
new ColumnWriter(opts, std::move(field), _output_file.get()));
RETURN_IF_ERROR(writer->init());
_column_writers.push_back(std::move(writer));
}
@ -179,25 +182,18 @@ Status SegmentWriter::_write_bloom_filter_index() {
}
// Finalizes the short key index builder, writes the resulting page to the
// output file and records the page location in the segment footer.
Status SegmentWriter::_write_short_key_index() {
std::vector<Slice> slices;
// TODO(zc): we should get segment_size
RETURN_IF_ERROR(_index_builder->finalize(_row_count * 100, _row_count, &slices));
uint64_t offset = _output_file->size();
RETURN_IF_ERROR(_write_raw_data(slices));
uint32_t written_bytes = _output_file->size() - offset;
_footer.mutable_short_key_index_page()->set_offset(offset);
_footer.mutable_short_key_index_page()->set_size(written_bytes);
std::vector<Slice> body;
PageFooterPB footer;
// the builder produces both the page body and its page footer
RETURN_IF_ERROR(_index_builder->finalize(_row_count, &body, &footer));
PagePointer pp;
// short key index page is not compressed right now
RETURN_IF_ERROR(PageIO::write_page(_output_file.get(), body, footer, &pp));
// remember where the page was written so readers can locate it
pp.to_proto(_footer.mutable_short_key_index_page());
return Status::OK();
}
Status SegmentWriter::_write_footer() {
_footer.set_num_rows(_row_count);
// collect all
for (int i = 0; i < _column_writers.size(); ++i) {
_column_writers[i]->write_meta(_footer.mutable_columns(i));
}
// Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), MagicNumber(4)
std::string footer_buf;

View File

@ -67,12 +67,6 @@ public:
Status finalize(uint64_t* segment_file_size, uint64_t* index_size);
// For unit tests only.
// Returns whether the footer metadata of column `col_id` has a bloom filter
// index. This function should be called after finalize().
bool has_bf_index(uint32_t col_id) const {
return _footer.columns(col_id).has_bloom_filter_index();
}
private:
DISALLOW_COPY_AND_ASSIGN(SegmentWriter);
Status _write_data();

View File

@ -0,0 +1,142 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/zone_map_index.h"
#include "olap/column_block.h"
#include "olap/olap_define.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/rowset/segment_v2/indexed_column_reader.h"
#include "olap/rowset/segment_v2/indexed_column_writer.h"
#include "olap/types.h"
#include "runtime/mem_pool.h"
#include "runtime/mem_tracker.h"
namespace doris {
namespace segment_v2 {
// Builds a writer for the column described by `field`.
// Storage for the min/max of both the per-page and the per-segment zone map is
// taken from the writer-owned MemPool, and both zone maps start out reset.
ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field) : _field(field), _pool(&_tracker) {
    // Same steps for either zone map: allocate min, allocate max, then reset.
    auto prepare = [this](ZoneMap* zone_map) {
        zone_map->min_value = _field->allocate_value(&_pool);
        zone_map->max_value = _field->allocate_value(&_pool);
        _reset_zone_map(zone_map);
    };
    prepare(&_page_zone_map);
    prepare(&_segment_zone_map);
}
// Folds `count` values into the zone map of the data page currently being built.
//
// values: start of the value array; consecutive values are read `_field->size()`
//         bytes apart.
// count:  number of values to read; when it is 0 the zone map is left untouched.
//
// Any call with count > 0 marks the page as containing non-null data.
void ZoneMapIndexWriter::add_values(const void* values, size_t count) {
    if (count == 0) {
        return;
    }
    _page_zone_map.has_not_null = true;
    const char* vals = reinterpret_cast<const char*>(values);
    // The index has the same type as `count`: an `int` index would be compared
    // against an unsigned value and could overflow for very large counts.
    for (size_t i = 0; i < count; ++i) {
        // Lower the page minimum if this value is smaller.
        if (_field->compare(_page_zone_map.min_value, vals) > 0) {
            _field->type_info()->direct_copy(_page_zone_map.min_value, vals);
        }
        // Raise the page maximum if this value is larger.
        if (_field->compare(_page_zone_map.max_value, vals) < 0) {
            _field->type_info()->direct_copy(_page_zone_map.max_value, vals);
        }
        vals += _field->size();
    }
}
// Seals the zone map of the data page that just ended.
//
// In order:
//   1. merges the page's min/max and null flags into the segment-level zone map;
//   2. converts the page zone map to a ZoneMapPB and resets it for the next page;
//   3. serializes the ZoneMapPB and queues it in _values for finish().
//
// Returns InternalError when serialization fails (the page zone map has already
// been reset at that point), OK otherwise.
Status ZoneMapIndexWriter::flush() {
    // 1. Widen the segment range so that it covers this page.
    if (_field->compare(_segment_zone_map.min_value, _page_zone_map.min_value) > 0) {
        _field->type_info()->direct_copy(_segment_zone_map.min_value, _page_zone_map.min_value);
    }
    if (_field->compare(_segment_zone_map.max_value, _page_zone_map.max_value) < 0) {
        _field->type_info()->direct_copy(_segment_zone_map.max_value, _page_zone_map.max_value);
    }
    // The segment flags are sticky: once any page sets one it stays set.
    _segment_zone_map.has_null = _segment_zone_map.has_null || _page_zone_map.has_null;
    _segment_zone_map.has_not_null =
            _segment_zone_map.has_not_null || _page_zone_map.has_not_null;

    // 2. Snapshot the page zone map, then start the next page from a clean state.
    ZoneMapPB page_pb;
    _page_zone_map.to_proto(&page_pb, _field);
    _reset_zone_map(&_page_zone_map);

    // 3. Serialize and remember the entry.
    std::string encoded;
    if (!page_pb.SerializeToString(&encoded)) {
        return Status::InternalError("serialize zone map failed");
    }
    // Size estimate: payload bytes plus a fixed sizeof(uint32_t) per entry.
    _estimated_size += encoded.size() + sizeof(uint32_t);
    _values.push_back(std::move(encoded));
    return Status::OK();
}
// Writes the whole zone map index and fills in its metadata.
//
// file:       passed to the IndexedColumnWriter that writes the per-page zone maps.
// index_meta: receives the index type (ZONE_MAP_INDEX), the segment-level zone
//             map, and the metadata produced by the IndexedColumnWriter.
//
// Returns the first error reported by the IndexedColumnWriter, or its finish()
// status.
Status ZoneMapIndexWriter::finish(WritableFile* file, ColumnIndexMetaPB* index_meta) {
    index_meta->set_type(ZONE_MAP_INDEX);
    ZoneMapIndexPB* zone_map_meta = index_meta->mutable_zone_map_index();

    // The segment-level zone map is stored directly in the index metadata.
    _segment_zone_map.to_proto(zone_map_meta->mutable_segment_zone_map(), _field);

    // Each serialized per-page ZoneMapPB becomes one value of an IndexedColumn
    // that has an ordinal index and no value index.
    const TypeInfo* typeinfo = get_type_info(OLAP_FIELD_TYPE_OBJECT);
    IndexedColumnWriterOptions writer_opts;
    writer_opts.compression = NO_COMPRESSION; // currently not compressed
    writer_opts.encoding = EncodingInfo::get_default_encoding(typeinfo, false);
    writer_opts.write_value_index = false;
    writer_opts.write_ordinal_index = true;

    IndexedColumnWriter column_writer(writer_opts, typeinfo, file);
    RETURN_IF_ERROR(column_writer.init());
    for (size_t idx = 0; idx < _values.size(); ++idx) {
        Slice value_slice(_values[idx]);
        RETURN_IF_ERROR(column_writer.add(&value_slice));
    }
    return column_writer.finish(zone_map_meta->mutable_page_zone_maps());
}
// Reads every per-page zone map of the index into _page_zone_maps.
//
// use_page_cache / kept_in_memory: forwarded unchanged to IndexedColumnReader::load().
//
// Returns the first error from the underlying reader or iterator, or Corruption
// when a value cannot be read or cannot be parsed as a ZoneMapPB.
Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) {
    IndexedColumnReader reader(_filename, _index_meta->page_zone_maps());
    RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory));
    IndexedColumnIterator iter(&reader);
    MemTracker tracker;
    MemPool pool(&tracker);
    _page_zone_maps.resize(reader.num_values());
    // read and cache all page zone maps
    for (int i = 0; i < reader.num_values(); ++i) {
        // One-value column block that decodes directly into `value`.
        Slice value;
        uint8_t nullmap = 0;
        size_t num_to_read = 1;
        ColumnBlock block(reader.type_info(), reinterpret_cast<uint8_t*>(&value), &nullmap,
                          num_to_read, &pool);
        ColumnBlockView column_block_view(&block);
        RETURN_IF_ERROR(iter.seek_to_ordinal(i));
        size_t num_read = num_to_read;
        RETURN_IF_ERROR(iter.next_batch(&num_read, &column_block_view));
        // Enforced at runtime instead of with DCHECK: in builds where DCHECK is
        // disabled, a short read would otherwise go on to parse `value` even
        // though nothing was decoded into it.
        if (num_read != num_to_read) {
            return Status::Corruption("Failed to read zone map");
        }
        if (!_page_zone_maps[i].ParseFromArray(value.data, value.size)) {
            return Status::Corruption("Failed to parse zone map");
        }
        // The decoded bytes are no longer needed once parsed; clear the pool
        // before the next iteration.
        pool.clear();
    }
    return Status::OK();
}
} // namespace segment_v2
} // namespace doris

View File

@ -17,8 +17,9 @@
#pragma once
#include <vector>
#include <memory>
#include <string>
#include <vector>
#include "common/status.h"
#include "util/slice.h"
@ -30,6 +31,8 @@
namespace doris {
class WritableFile;
namespace segment_v2 {
struct ZoneMap {
@ -46,66 +49,77 @@ struct ZoneMap {
bool has_null = false;
// has_not_null means whether the zone has any non-null value
bool has_not_null = false;
// Copies this zone map into `dst`.
// The null flags are copied as-is; min and max are stored in the string form
// produced by `field->to_string()`.
void to_proto(ZoneMapPB* dst, Field* field) {
    // flags
    dst->set_has_not_null(has_not_null);
    dst->set_has_null(has_null);
    // value range, rendered through the field
    dst->set_max(field->to_string(max_value));
    dst->set_min(field->to_string(min_value));
}
};
// This class encode column pages' zone map.
// The binary is encoded by BinaryPlainPageBuilder
class ColumnZoneMapBuilder {
// Zone map index is represented by an IndexedColumn with ordinal index.
// The IndexedColumn stores serialized ZoneMapPB for each data page.
// It also creates and stores the segment-level zone map in the index meta so that
// reader can prune an entire segment without reading pages.
class ZoneMapIndexWriter {
public:
ColumnZoneMapBuilder(Field* field);
explicit ZoneMapIndexWriter(Field* field);
Status add(const uint8_t* vals, size_t count);
void add_values(const void* values, size_t count);
// Records that `count` null values were appended to the current data page.
// Only the presence of nulls is tracked, so a call with count == 0 leaves the
// zone map unchanged (consistent with add_values, which ignores count == 0).
void add_nulls(uint32_t count) {
    if (count > 0) {
        _page_zone_map.has_null = true;
    }
}
// mark the end of one data page so that we can finalize the corresponding zone map
Status flush();
void fill_segment_zone_map(ZoneMapPB* const to);
Status finish(WritableFile* file, ColumnIndexMetaPB* index_meta);
uint64_t size() {
return _page_builder->size();
}
OwnedSlice finish() {
return _page_builder->finish();
}
uint64_t size() { return _estimated_size; }
private:
void _reset_zone_map(ZoneMap* zone_map);
void _reset_page_zone_map() { _reset_zone_map(&_zone_map); }
void _reset_segment_zone_map() { _reset_zone_map(&_segment_zone_map); }
void _fill_zone_map_to_pb(const ZoneMap& from, ZoneMapPB* const to);
// Puts `zone_map` back into its empty state: no nulls seen, no non-null values
// seen, and an inverted range (min at the type's maximum, max at the type's
// minimum) so that the first value added replaces both bounds.
void _reset_zone_map(ZoneMap* zone_map) {
    zone_map->has_not_null = false;
    zone_map->has_null = false;
    // NOTE: per the original comment, the value storage should be allocated at
    // max varchar length so that min_value can hold the maximum value.
    _field->set_to_max(zone_map->min_value);
    _field->set_to_min(zone_map->max_value);
}
private:
std::unique_ptr<BinaryPlainPageBuilder> _page_builder;
Field* _field;
// memory will be managed by MemPool
ZoneMap _zone_map;
ZoneMap _page_zone_map;
ZoneMap _segment_zone_map;
// TODO(zc): we should replace this memory pool later. We only allocate the min/max
// values for the field, but MemPool allocates at least 4KB, which is a waste in most cases.
MemTracker _tracker;
MemPool _pool;
// serialized ZoneMapPB for each data page
std::vector<std::string> _values;
uint64_t _estimated_size = 0;
};
// ColumnZoneMap
class ColumnZoneMap {
class ZoneMapIndexReader {
public:
ColumnZoneMap(const Slice& data) : _data(data), _num_pages(0) { }
Status load();
const std::vector<ZoneMapPB>& get_column_zone_map() const {
return _page_zone_maps;
explicit ZoneMapIndexReader(const std::string& filename, const ZoneMapIndexPB* index_meta) :
_filename(filename),
_index_meta(index_meta) {
}
int32_t num_pages() const {
return _num_pages;
}
// load all page zone maps into memory
Status load(bool use_page_cache, bool kept_in_memory);
const std::vector<ZoneMapPB>& page_zone_maps() const { return _page_zone_maps; }
int32_t num_pages() const { return _page_zone_maps.size(); }
private:
Slice _data;
std::string _filename;
const ZoneMapIndexPB* _index_meta;
// valid after load
int32_t _num_pages;
std::vector<ZoneMapPB> _page_zone_maps;
};

View File

@ -28,89 +28,63 @@ namespace doris {
Status ShortKeyIndexBuilder::add_item(const Slice& key) {
put_varint32(&_offset_buf, _key_buf.size());
_footer.set_num_items(_footer.num_items() + 1);
_key_buf.append(key.data, key.size);
_num_items++;
return Status::OK();
}
Status ShortKeyIndexBuilder::finalize(uint32_t segment_bytes,
uint32_t num_segment_rows,
std::vector<Slice>* slices) {
_footer.set_num_segment_rows(num_segment_rows);
_footer.set_segment_bytes(segment_bytes);
_footer.set_key_bytes(_key_buf.size());
_footer.set_offset_bytes(_offset_buf.size());
Status ShortKeyIndexBuilder::finalize(uint32_t num_segment_rows,
std::vector<Slice>* body,
segment_v2::PageFooterPB* page_footer) {
page_footer->set_type(segment_v2::SHORT_KEY_PAGE);
page_footer->set_uncompressed_size(_key_buf.size() + _offset_buf.size());
// encode header
if (!_footer.SerializeToString(&_footer_buf)) {
return Status::InternalError("Failed to serialize index footer");
}
segment_v2::ShortKeyFooterPB* footer = page_footer->mutable_short_key_page_footer();
footer->set_num_items(_num_items);
footer->set_key_bytes(_key_buf.size());
footer->set_offset_bytes(_offset_buf.size());
footer->set_segment_id(_segment_id);
footer->set_num_rows_per_block(_num_rows_per_block);
footer->set_num_segment_rows(num_segment_rows);
put_fixed32_le(&_footer_buf, _footer_buf.size());
// TODO(zc): checksum
uint32_t checksum = 0;
put_fixed32_le(&_footer_buf, checksum);
slices->emplace_back(_key_buf);
slices->emplace_back(_offset_buf);
slices->emplace_back(_footer_buf);
body->emplace_back(_key_buf);
body->emplace_back(_offset_buf);
return Status::OK();
}
Status ShortKeyIndexDecoder::parse() {
Slice data = _data;
Status ShortKeyIndexDecoder::parse(const Slice& body, const segment_v2::ShortKeyFooterPB& footer) {
_footer = footer;
// 1. parse footer, get checksum and footer length
if (data.size < 2 * sizeof(uint32_t)) {
// check if body size match footer's information
if (body.size != (_footer.key_bytes() + _footer.offset_bytes())) {
return Status::Corruption(
Substitute("Short key is too short, need=$0 vs real=$1",
2 * sizeof(uint32_t), data.size));
}
size_t offset = data.size - 2 * sizeof(uint32_t);
uint32_t footer_length = decode_fixed32_le((uint8_t*)data.data + offset);
uint32_t checksum = decode_fixed32_le((uint8_t*)data.data + offset + 4);
// TODO(zc): do checksum
if (checksum != 0) {
return Status::Corruption(
Substitute("Checksum not match, need=$0 vs read=$1", 0, checksum));
}
// move offset to parse footer
offset -= footer_length;
std::string footer_buf(data.data + offset, footer_length);
if (!_footer.ParseFromString(footer_buf)) {
return Status::Corruption("Fail to parse index footer from string");
}
// check if real data size match footer's content
if (offset != _footer.key_bytes() + _footer.offset_bytes()) {
return Status::Corruption(
Substitute("Index size not match, need=$0, real=$1",
_footer.key_bytes() + _footer.offset_bytes(), offset));
Substitute("Index size not match, need=$0, real=$1",
_footer.key_bytes() + _footer.offset_bytes(), body.size));
}
// set index buffer
_key_data = Slice(_data.data, _footer.key_bytes());
_key_data = Slice(body.data, _footer.key_bytes());
// parse offset information
Slice offset_slice(_data.data + _footer.key_bytes(), _footer.offset_bytes());
Slice offset_slice(body.data + _footer.key_bytes(), _footer.offset_bytes());
// +1 for record total length
_offsets.resize(_footer.num_items() + 1);
_offsets[_footer.num_items()] = _footer.key_bytes();
for (uint32_t i = 0; i < _footer.num_items(); ++i) {
uint32_t offset = 0;
if (!get_varint32(&offset_slice, &offset)) {
return Status::Corruption("Fail to get varint from index offset buffer");
}
DCHECK(offset <= _footer.key_bytes())
<< "Offset is larger than total bytes, offset=" << offset
<< ", key_bytes=" << _footer.key_bytes();
<< "Offset is larger than total bytes, offset=" << offset
<< ", key_bytes=" << _footer.key_bytes();
_offsets[i] = offset;
}
_offsets[_footer.num_items()] = _footer.key_bytes();
if (offset_slice.size != 0) {
return Status::Corruption("Still has data after parse all key offset");
}
_parsed = true;
return Status::OK();
}

View File

@ -107,17 +107,12 @@ void encode_key(std::string* buf, const RowType& row, size_t num_keys) {
}
}
// Used to encode a segment short key indices to binary format. This version
// Encode a segment short key indices to one ShortKeyPage. This version
// only accepts binary key, client should assure that input key is sorted,
// otherwise error could happens. This builder would arrange data in following
// format.
// index = encoded_keys + encoded_offsets + footer + footer_size + checksum
// encoded_keys = binary_key + [, ...]
// encoded_offsets = encoded_offset + [, ...]
// encoded_offset = variant32
// footer = ShortKeyFooterPB
// footer_size = fixed32
// checksum = fixed32
// otherwise errors could happen. This builder would arrange the page body in the
// following format:
// ShortKeyPageBody := KeyContent^NumEntry, KeyOffset(vint)^NumEntry
// NumEntry, KeyBytes, OffsetBytes is stored in ShortKeyFooterPB
// Usage:
// ShortKeyIndexBuilder builder(segment_id, num_rows_per_block);
// builder.add_item(key1);
@ -132,26 +127,25 @@ void encode_key(std::string* buf, const RowType& row, size_t num_keys) {
// more than short key
class ShortKeyIndexBuilder {
public:
ShortKeyIndexBuilder(uint32_t segment_id,
uint32_t num_rows_per_block) {
_footer.set_segment_id(segment_id);
_footer.set_num_rows_per_block(num_rows_per_block);
ShortKeyIndexBuilder(uint32_t segment_id, uint32_t num_rows_per_block) :
_segment_id(segment_id), _num_rows_per_block(num_rows_per_block), _num_items(0) {
}
Status add_item(const Slice& key);
uint64_t size() {
return _key_buf.size() + _offset_buf.size() + _footer_buf.size();
return _key_buf.size() + _offset_buf.size();
}
Status finalize(uint32_t segment_size, uint32_t num_rows, std::vector<Slice>* slices);
Status finalize(uint32_t num_rows, std::vector<Slice>* body, segment_v2::PageFooterPB* footer);
private:
segment_v2::ShortKeyFooterPB _footer;
uint32_t _segment_id;
uint32_t _num_rows_per_block;
uint32_t _num_items;
faststring _key_buf;
faststring _offset_buf;
std::string _footer_buf;
};
class ShortKeyIndexDecoder;
@ -214,40 +208,54 @@ private:
// Used to decode short key to header and encoded index data.
// Usage:
// MemIndex index;
// ShortKeyIndexDecoder decoder(slice)
// decoder.parse();
// ShortKeyIndexDecoder decoder;
// decoder.parse(body, footer);
// auto iter = decoder.lower_bound(key);
class ShortKeyIndexDecoder {
public:
// Client should assure that data is available when this class
// is used.
ShortKeyIndexDecoder(const Slice& data) : _data(data) { }
ShortKeyIndexDecoder() : _parsed(false) {}
Status parse();
// client should assure that body is available when this class is used
Status parse(const Slice& body, const segment_v2::ShortKeyFooterPB& footer);
ShortKeyIndexIterator begin() const { return {this, 0}; }
ShortKeyIndexIterator end() const { return {this, num_items()}; }
ShortKeyIndexIterator begin() const {
DCHECK(_parsed);
return {this, 0};
}
ShortKeyIndexIterator end() const {
DCHECK(_parsed);
return {this, num_items()};
}
// Return an iterator which locates at the first item who is
// equal with or greater than the given key.
// NOTE: If one key is the prefix of another key, this function treats
// that longer key is greater than the shorter key.
ShortKeyIndexIterator lower_bound(const Slice& key) const {
DCHECK(_parsed);
return seek<true>(key);
}
// Return the iterator which locates the first item greater than the
// input key.
ShortKeyIndexIterator upper_bound(const Slice& key) const {
DCHECK(_parsed);
return seek<false>(key);
}
uint32_t num_items() const { return _footer.num_items(); }
uint32_t num_items() const {
DCHECK(_parsed);
return _footer.num_items();
}
uint32_t num_rows_per_block() const { return _footer.num_rows_per_block(); }
uint32_t num_rows_per_block() const {
DCHECK(_parsed);
return _footer.num_rows_per_block();
}
Slice key(ssize_t ordinal) const {
DCHECK(_parsed);
DCHECK(ordinal >= 0 && ordinal < num_items());
return {_key_data.data + _offsets[ordinal], _offsets[ordinal + 1] - _offsets[ordinal]};
}
@ -266,7 +274,7 @@ private:
}
private:
Slice _data;
bool _parsed;
// All following fields are only valid after parse has been executed successfully
segment_v2::ShortKeyFooterPB _footer;

View File

@ -150,6 +150,10 @@ template<> struct CppTypeTraits<OLAP_FIELD_TYPE_BIGINT> {
using CppType = int64_t;
using UnsignedCppType = uint64_t;
};
// Type traits for OLAP_FIELD_TYPE_UNSIGNED_BIGINT.
// The type is already unsigned, so CppType and UnsignedCppType are both uint64_t.
template<> struct CppTypeTraits<OLAP_FIELD_TYPE_UNSIGNED_BIGINT> {
// in-memory C++ representation of a value of this field type
using CppType = uint64_t;
// unsigned counterpart of CppType (identical here)
using UnsignedCppType = uint64_t;
};
template<> struct CppTypeTraits<OLAP_FIELD_TYPE_LARGEINT> {
using CppType = int128_t;
using UnsignedCppType = unsigned int128_t;

View File

@ -51,19 +51,18 @@ ADD_BE_TEST(rowset/segment_v2/bitshuffle_page_test)
ADD_BE_TEST(rowset/segment_v2/plain_page_test)
ADD_BE_TEST(rowset/segment_v2/binary_plain_page_test)
ADD_BE_TEST(rowset/segment_v2/binary_prefix_page_test)
ADD_BE_TEST(rowset/segment_v2/index_column_reader_writer_test)
ADD_BE_TEST(rowset/segment_v2/bitmap_index_test)
ADD_BE_TEST(rowset/segment_v2/column_reader_writer_test)
ADD_BE_TEST(rowset/segment_v2/encoding_info_test)
ADD_BE_TEST(rowset/segment_v2/page_compression_test)
ADD_BE_TEST(rowset/segment_v2/ordinal_page_index_test)
ADD_BE_TEST(rowset/segment_v2/rle_page_test)
ADD_BE_TEST(rowset/segment_v2/binary_dict_page_test)
ADD_BE_TEST(rowset/segment_v2/segment_test)
ADD_BE_TEST(rowset/segment_v2/column_zone_map_test)
ADD_BE_TEST(rowset/segment_v2/row_ranges_test)
ADD_BE_TEST(rowset/segment_v2/frame_of_reference_page_test)
ADD_BE_TEST(rowset/segment_v2/block_bloom_filter_test)
ADD_BE_TEST(rowset/segment_v2/bloom_filter_index_reader_writer_test)
ADD_BE_TEST(rowset/segment_v2/zone_map_index_test)
ADD_BE_TEST(tablet_meta_manager_test)
ADD_BE_TEST(tablet_mgr_test)
ADD_BE_TEST(rowset/rowset_meta_manager_test)

View File

@ -108,6 +108,7 @@ TEST_F(KeyCoderTest, test_int) {
test_integer_encode<OLAP_FIELD_TYPE_INT>();
test_integer_encode<OLAP_FIELD_TYPE_UNSIGNED_INT>();
test_integer_encode<OLAP_FIELD_TYPE_BIGINT>();
test_integer_encode<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
test_integer_encode<OLAP_FIELD_TYPE_LARGEINT>();
test_integer_encode<OLAP_FIELD_TYPE_DATETIME>();

View File

@ -20,12 +20,12 @@
#include "olap/key_coder.h"
#include <gtest/gtest.h>
#include <string>
#include "common/logging.h"
#include "env/env.h"
#include "olap/olap_common.h"
#include "olap/types.h"
#include "olap/column_block.h"
#include "util/file_utils.h"
#include "runtime/mem_tracker.h"
#include "runtime/mem_pool.h"
@ -33,68 +33,72 @@
namespace doris {
namespace segment_v2 {
class IndexColumnReaderWriterTest : public testing::Test {
public:
IndexColumnReaderWriterTest() : _pool(&_tracker) { }
virtual ~IndexColumnReaderWriterTest() {
class BitmapIndexTest : public testing::Test {
public:
const std::string kTestDir = "./ut_dir/bitmap_index_test";
BitmapIndexTest() : _pool(&_tracker) { }
void SetUp() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok());
}
private:
void TearDown() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
}
private:
MemTracker _tracker;
MemPool _pool;
};
const std::string dname = "./ut_dir/index_column_reader_writer_test";
template<FieldType type>
void wirte_index_file(std::string& file_name, const void* values,
void write_index_file(std::string& filename, const void* values,
size_t value_count, size_t null_count,
BitmapIndexColumnPB* bitmap_index_meta) {
ColumnIndexMetaPB* meta) {
const TypeInfo* type_info = get_type_info(type);
FileUtils::create_dir(dname);
std::string fname = dname + "/" + file_name;
{
std::unique_ptr<WritableFile> wfile;
auto st = Env::Default()->new_writable_file(fname, &wfile);
ASSERT_TRUE(st.ok());
std::unique_ptr<BitmapIndexWriter> _bitmap_index_builder;
BitmapIndexWriter::create(type_info, &_bitmap_index_builder);
_bitmap_index_builder->add_values(values, value_count);
_bitmap_index_builder->add_nulls(null_count);
st = _bitmap_index_builder->finish(wfile.get(), bitmap_index_meta);
ASSERT_TRUE(st.ok()) << "writer finish status:" << st.to_string();
wfile.reset();
ASSERT_TRUE(Env::Default()->new_writable_file(filename, &wfile).ok());
std::unique_ptr<BitmapIndexWriter> writer;
BitmapIndexWriter::create(type_info, &writer);
writer->add_values(values, value_count);
writer->add_nulls(null_count);
ASSERT_TRUE(writer->finish(wfile.get(), meta).ok());
ASSERT_EQ(BITMAP_INDEX, meta->type());
}
}
template<FieldType type>
void get_bitmap_reader_iter(std::string& file_name, BitmapIndexColumnPB& bitmap_index_meta,
void get_bitmap_reader_iter(std::string& file_name, const ColumnIndexMetaPB& meta,
BitmapIndexReader** reader,
BitmapIndexIterator** iter) {
file_name = dname + "/" + file_name;
*reader = new BitmapIndexReader(file_name, bitmap_index_meta);
auto st = (*reader)->load(true);
*reader = new BitmapIndexReader(file_name, &meta.bitmap_index());
auto st = (*reader)->load(true, false);
ASSERT_TRUE(st.ok());
st = (*reader)->new_iterator(iter);
ASSERT_TRUE(st.ok());
}
TEST_F(IndexColumnReaderWriterTest, test_invert) {
TEST_F(BitmapIndexTest, test_invert) {
size_t num_uint8_rows = 1024 * 10;
int* val = new int[num_uint8_rows];
for (int i = 0; i < num_uint8_rows; ++i) {
val[i] = i;
}
std::string file_name = "invert";
BitmapIndexColumnPB bitmap_index_meta;
wirte_index_file<OLAP_FIELD_TYPE_INT>(file_name, val, num_uint8_rows, 0,
&bitmap_index_meta);
std::string file_name = kTestDir + "/invert";
ColumnIndexMetaPB meta;
write_index_file<OLAP_FIELD_TYPE_INT>(file_name, val, num_uint8_rows, 0, &meta);
{
std::unique_ptr<RandomAccessFile> rfile;
BitmapIndexReader* reader = nullptr;
BitmapIndexIterator* iter = nullptr;
get_bitmap_reader_iter<OLAP_FIELD_TYPE_INT>(file_name, bitmap_index_meta, &reader, &iter);
get_bitmap_reader_iter<OLAP_FIELD_TYPE_INT>(file_name, meta, &reader, &iter);
int value = 2;
bool exact_match;
@ -129,7 +133,7 @@ TEST_F(IndexColumnReaderWriterTest, test_invert) {
}
}
TEST_F(IndexColumnReaderWriterTest, test_invert_2) {
TEST_F(BitmapIndexTest, test_invert_2) {
size_t num_uint8_rows = 1024 * 10;
int* val = new int[num_uint8_rows];
for (int i = 0; i < 1024; ++i) {
@ -140,15 +144,14 @@ TEST_F(IndexColumnReaderWriterTest, test_invert_2) {
val[i] = i * 10;
}
std::string file_name = "invert2";
BitmapIndexColumnPB bitmap_index_meta;
wirte_index_file<OLAP_FIELD_TYPE_INT>(file_name, val, num_uint8_rows, 0,
&bitmap_index_meta);
std::string file_name = kTestDir + "/invert2";
ColumnIndexMetaPB meta;
write_index_file<OLAP_FIELD_TYPE_INT>(file_name, val, num_uint8_rows, 0, &meta);
{
BitmapIndexReader* reader = nullptr;
BitmapIndexIterator* iter = nullptr;
get_bitmap_reader_iter<OLAP_FIELD_TYPE_INT>(file_name, bitmap_index_meta, &reader, &iter);
get_bitmap_reader_iter<OLAP_FIELD_TYPE_INT>(file_name, meta, &reader, &iter);
int value = 1026;
bool exact_match;
@ -167,7 +170,7 @@ TEST_F(IndexColumnReaderWriterTest, test_invert_2) {
}
}
TEST_F(IndexColumnReaderWriterTest, test_multi_pages) {
TEST_F(BitmapIndexTest, test_multi_pages) {
size_t num_uint8_rows = 1024 * 1024;
int64_t* val = new int64_t[num_uint8_rows];
for (int i = 0; i < num_uint8_rows; ++i) {
@ -175,14 +178,13 @@ TEST_F(IndexColumnReaderWriterTest, test_multi_pages) {
}
val[1024 * 510] = 2019;
std::string file_name = "mul";
BitmapIndexColumnPB bitmap_index_meta;
wirte_index_file<OLAP_FIELD_TYPE_BIGINT>(file_name, val, num_uint8_rows, 0,
&bitmap_index_meta);
std::string file_name = kTestDir + "/mul";
ColumnIndexMetaPB meta;
write_index_file<OLAP_FIELD_TYPE_BIGINT>(file_name, val, num_uint8_rows, 0, &meta);
{
BitmapIndexReader* reader = nullptr;
BitmapIndexIterator* iter = nullptr;
get_bitmap_reader_iter<OLAP_FIELD_TYPE_BIGINT>(file_name, bitmap_index_meta, &reader, &iter);
get_bitmap_reader_iter<OLAP_FIELD_TYPE_BIGINT>(file_name, meta, &reader, &iter);
int64_t value = 2019;
bool exact_match;
@ -199,21 +201,20 @@ TEST_F(IndexColumnReaderWriterTest, test_multi_pages) {
}
}
TEST_F(IndexColumnReaderWriterTest, test_null) {
TEST_F(BitmapIndexTest, test_null) {
size_t num_uint8_rows = 1024;
int64_t* val = new int64_t[num_uint8_rows];
for (int i = 0; i < num_uint8_rows; ++i) {
val[i] = i;
}
std::string file_name = "null";
BitmapIndexColumnPB bitmap_index_meta;
wirte_index_file<OLAP_FIELD_TYPE_BIGINT>(file_name, val, num_uint8_rows, 30,
&bitmap_index_meta);
std::string file_name = kTestDir + "/null";
ColumnIndexMetaPB meta;
write_index_file<OLAP_FIELD_TYPE_BIGINT>(file_name, val, num_uint8_rows, 30, &meta);
{
BitmapIndexReader* reader = nullptr;
BitmapIndexIterator* iter = nullptr;
get_bitmap_reader_iter<OLAP_FIELD_TYPE_BIGINT>(file_name, bitmap_index_meta, &reader, &iter);
get_bitmap_reader_iter<OLAP_FIELD_TYPE_BIGINT>(file_name, meta, &reader, &iter);
Roaring bitmap;
iter->read_null_bitmap(&bitmap);

View File

@ -26,10 +26,7 @@
#include "env/env.h"
#include "olap/olap_common.h"
#include "olap/types.h"
#include "olap/column_block.h"
#include "util/file_utils.h"
#include "runtime/mem_tracker.h"
#include "runtime/mem_pool.h"
namespace doris {
namespace segment_v2 {
@ -46,7 +43,7 @@ const std::string dname = "./ut_dir/bloom_filter_index_reader_writer_test";
template<FieldType type>
void write_bloom_filter_index_file(const std::string& file_name, const void* values,
size_t value_count, size_t null_count,
BloomFilterIndexPB* bloom_filter_index_meta) {
ColumnIndexMetaPB* index_meta) {
const TypeInfo* type_info = get_type_info(type);
using CppType = typename CppTypeTraits<type>::CppType;
FileUtils::create_dir(dname);
@ -70,20 +67,21 @@ void write_bloom_filter_index_file(const std::string& file_name, const void* val
ASSERT_TRUE(st.ok());
i += 1024;
}
st = bloom_filter_index_writer->finish(wfile.get(), bloom_filter_index_meta);
st = bloom_filter_index_writer->finish(wfile.get(), index_meta);
ASSERT_TRUE(st.ok()) << "writer finish status:" << st.to_string();
wfile.reset();
ASSERT_EQ(BLOOM_FILTER_INDEX, index_meta->type());
ASSERT_EQ(bf_options.strategy, index_meta->bloom_filter_index().hash_strategy());
}
}
void get_bloom_filter_reader_iter(const std::string& file_name, const BloomFilterIndexPB& bloom_filter_index_meta,
void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnIndexMetaPB& meta,
std::unique_ptr<RandomAccessFile>* rfile,
BloomFilterIndexReader** reader,
std::unique_ptr<BloomFilterIndexIterator>* iter) {
std::string fname = dname + "/" + file_name;
*reader = new BloomFilterIndexReader(fname, bloom_filter_index_meta);
auto st = (*reader)->load(true);
*reader = new BloomFilterIndexReader(fname, &meta.bloom_filter_index());
auto st = (*reader)->load(true, false);
ASSERT_TRUE(st.ok());
st = (*reader)->new_iterator(iter);
@ -96,15 +94,13 @@ void test_bloom_filter_index_reader_writer_template(const std::string file_name,
typename TypeTraits<Type>::CppType* not_exist_value,
bool is_slice_type = false) {
typedef typename TypeTraits<Type>::CppType CppType;
BloomFilterIndexPB bloom_filter_index_meta;
write_bloom_filter_index_file<Type>(file_name, val, num, null_num,
&bloom_filter_index_meta);
ColumnIndexMetaPB meta;
write_bloom_filter_index_file<Type>(file_name, val, num, null_num, &meta);
{
std::unique_ptr<RandomAccessFile> rfile;
BloomFilterIndexReader* reader = nullptr;
std::unique_ptr<BloomFilterIndexIterator> iter;
get_bloom_filter_reader_iter(file_name, bloom_filter_index_meta,
&rfile, &reader, &iter);
get_bloom_filter_reader_iter(file_name, meta, &rfile, &reader, &iter);
// page 0
std::unique_ptr<BloomFilter> bf;

View File

@ -79,8 +79,18 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s
ASSERT_TRUE(st.ok());
ColumnWriterOptions writer_opts;
writer_opts.encoding_type = encoding;
writer_opts.compression_type = segment_v2::CompressionTypePB::LZ4F;
writer_opts.meta = &meta;
writer_opts.meta->set_column_id(0);
writer_opts.meta->set_unique_id(0);
writer_opts.meta->set_type(type);
if (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_VARCHAR) {
writer_opts.meta->set_length(10);
} else {
writer_opts.meta->set_length(0);
}
writer_opts.meta->set_encoding(encoding);
writer_opts.meta->set_compression(segment_v2::CompressionTypePB::LZ4F);
writer_opts.meta->set_is_nullable(true);
writer_opts.need_zone_map = true;
TabletColumn column(OLAP_FIELD_AGGREGATION_NONE, type);
@ -90,7 +100,7 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s
column = create_char_key(1);
}
std::unique_ptr<Field> field(FieldFactory::create(column));
ColumnWriter writer(writer_opts, std::move(field), true, wfile.get());
ColumnWriter writer(writer_opts, std::move(field), wfile.get());
st = writer.init();
ASSERT_TRUE(st.ok()) << st.to_string();
@ -109,9 +119,6 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s
st = writer.write_zone_map();
ASSERT_TRUE(st.ok());
writer.write_meta(&meta);
ASSERT_TRUE(meta.has_zone_map_page());
// close the file
wfile.reset();
}

View File

@ -19,84 +19,134 @@
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <string>
#include "common/logging.h"
#include "env/env.h"
#include "util/file_utils.h"
namespace doris {
namespace segment_v2 {
class OrdinalPageIndexTest : public testing::Test {
public:
OrdinalPageIndexTest() { }
virtual ~OrdinalPageIndexTest() {
const std::string kTestDir = "./ut_dir/ordinal_page_index_test";
void SetUp() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok());
}
void TearDown() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
}
};
TEST_F(OrdinalPageIndexTest, normal) {
// rowid, page pointer
// 1, (0, 4096)
// 1 + 4096, (1 * 4096, 4096)
// a page have 16KB, and have 4096 rows
OrdinalPageIndexBuilder builder;
std::string filename = kTestDir + "/normal.idx";
// we test a 16KB page
OrdinalIndexWriter builder;
// generate ordinal index for 16K data pages,
// each data page is 16KB in size and contains 4096 values,
// ordinal starts at 1 instead of 0
for (uint64_t i = 0; i < 16 * 1024; ++i) {
builder.append_entry(1 + 4096 * i, {16 * 1024 * i, 16 * 1024});
}
ColumnIndexMetaPB index_meta;
{
std::unique_ptr<WritableFile> out_file;
ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok());
ASSERT_TRUE(builder.finish(out_file.get(), &index_meta).ok());
ASSERT_EQ(ORDINAL_INDEX, index_meta.type());
ASSERT_FALSE(index_meta.ordinal_index().root_page().is_root_data_page());
LOG(INFO) << "index page size="
<< index_meta.ordinal_index().root_page().root_page().size();
}
auto slice = builder.finish();
LOG(INFO) << "index block's size=" << slice.size;
OrdinalIndexReader index(filename, &index_meta.ordinal_index(), 16 * 1024 * 4096 + 1);
ASSERT_TRUE(index.load(true, false).ok());
ASSERT_EQ(16 * 1024, index.num_data_pages());
ASSERT_EQ(1, index.get_first_ordinal(0));
ASSERT_EQ(4096, index.get_last_ordinal(0));
ASSERT_EQ((16 * 1024 - 1) * 4096 + 1, index.get_first_ordinal(16 * 1024 - 1));
ASSERT_EQ(16 * 1024 * 4096, index.get_last_ordinal(16 * 1024 - 1));
OrdinalPageIndex index(slice, 16 * 1024 * 4096 + 1);
auto st = index.load();
ASSERT_TRUE(st.ok());
ASSERT_EQ(1, index.get_first_row_id(0));
ASSERT_EQ(4096, index.get_last_row_id(0));
ASSERT_EQ((16 * 1024 - 1) * 4096 + 1, index.get_first_row_id(16 * 1024 - 1));
ASSERT_EQ(16 * 1024 * 4096, index.get_last_row_id(16 * 1024 - 1));
PagePointer page;
{
auto iter = index.seek_at_or_before(1);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(1, iter.rowid());
ASSERT_EQ(1, iter.first_ordinal());
ASSERT_EQ(PagePointer(0, 16 * 1024), iter.page());
}
{
auto iter = index.seek_at_or_before(4095);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(1, iter.rowid());
ASSERT_EQ(1, iter.first_ordinal());
ASSERT_EQ(PagePointer(0, 16 * 1024), iter.page());
}
{
auto iter = index.seek_at_or_before(4098);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(4097, iter.rowid());
ASSERT_EQ(4097, iter.first_ordinal());
ASSERT_EQ(PagePointer(1 * 16 * 1024, 16 * 1024), iter.page());
iter.next();
ASSERT_TRUE(iter.valid());
ASSERT_EQ(4097 + 4096, iter.rowid());
ASSERT_EQ(4097 + 4096, iter.first_ordinal());
ASSERT_EQ(PagePointer(2 * 16 * 1024, 16 * 1024), iter.page());
}
{
auto iter = index.seek_at_or_before(0);
ASSERT_FALSE(iter.valid());
}
}
TEST_F(OrdinalPageIndexTest, corrupt) {
std::string str;
str.resize(4);
TEST_F(OrdinalPageIndexTest, one_data_page) {
// index one data page with 1024 values
int num_values = 1024;
PagePointer data_page_pointer(0, 4096);
encode_fixed32_le((uint8_t*)str.data(), 1);
OrdinalIndexWriter builder;
builder.append_entry(0, data_page_pointer); // add only one entry
ColumnIndexMetaPB index_meta;
{
// in this case, no index page is written, thus file could be null
ASSERT_TRUE(builder.finish(nullptr, &index_meta).ok());
ASSERT_EQ(ORDINAL_INDEX, index_meta.type());
ASSERT_TRUE(index_meta.ordinal_index().root_page().is_root_data_page());
PagePointer root_page_pointer(index_meta.ordinal_index().root_page().root_page());
ASSERT_EQ(data_page_pointer, root_page_pointer);
}
Slice slice(str);
OrdinalPageIndex index(slice, 10);
auto st = index.load();
ASSERT_FALSE(st.ok());
OrdinalIndexReader index("", &index_meta.ordinal_index(), num_values);
ASSERT_TRUE(index.load(true, false).ok());
ASSERT_EQ(1, index.num_data_pages());
ASSERT_EQ(0, index.get_first_ordinal(0));
ASSERT_EQ(num_values - 1, index.get_last_ordinal(0));
{
auto iter = index.seek_at_or_before(0);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(0, iter.first_ordinal());
ASSERT_EQ(num_values - 1, iter.last_ordinal());
ASSERT_EQ(data_page_pointer, iter.page());
}
{
auto iter = index.seek_at_or_before(num_values - 1);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(0, iter.first_ordinal());
ASSERT_EQ(data_page_pointer, iter.page());
}
{
auto iter = index.seek_at_or_before(num_values);
ASSERT_TRUE(iter.valid());
ASSERT_EQ(0, iter.first_ordinal());
ASSERT_EQ(data_page_pointer, iter.page());
}
}
}

View File

@ -1,143 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/page_compression.h"
#include <gtest/gtest.h>
#include <iostream>
#include "common/logging.h"
#include "util/block_compression.h"
namespace doris {
namespace segment_v2 {
class PageCompressionTest : public testing::Test {
public:
PageCompressionTest() { }
virtual ~PageCompressionTest() {
}
};
// Builds a string of exactly `len` characters, each picked with rand() from the
// alphanumeric set [0-9a-zA-Z].
static std::string generate_rand_str(size_t len) {
    static const char charset[] = "0123456789"
                                  "abcdefghijklmnopqrstuvwxyz"
                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // sizeof(charset) also counts the terminating '\0'; leave it out so the
    // generated data never contains an embedded NUL, which would silently
    // truncate any C-string based comparison of the result.
    static const size_t charset_len = sizeof(charset) - 1;
    std::string result;
    result.resize(len);
    for (size_t i = 0; i < len; ++i) {
        result[i] = charset[rand() % charset_len];
    }
    return result;
}
// Builds a deterministic string of exactly `len` characters by cycling through
// the alphanumeric set [0-9a-zA-Z].
static std::string generate_str(size_t len) {
    static const char charset[] = "0123456789"
                                  "abcdefghijklmnopqrstuvwxyz"
                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // sizeof(charset) also counts the terminating '\0'; leave it out so the
    // pattern wraps from 'Z' back to '0' instead of emitting a NUL byte every
    // 63rd character.
    static const size_t charset_len = sizeof(charset) - 1;
    std::string result;
    result.resize(len);
    for (size_t i = 0; i < len; ++i) {
        result[i] = charset[i % charset_len];
    }
    return result;
}
// Round-trip test: compresses 102400 bytes with PageCompressor (LZ4F codec) and
// checks that PageDecompressor gives the original data back. Runs twice: with
// rand()-generated input (i == 0) and with a repeating pattern (i == 1).
TEST_F(PageCompressionTest, normal) {
const BlockCompressionCodec* codec = nullptr;
// NOTE(review): the result of get_block_compression_codec is not checked here.
get_block_compression_codec(segment_v2::CompressionTypePB::LZ4F, &codec);
for (int i = 0; i < 2; ++i) {
// compress
PageCompressor compressor(codec);
std::vector<Slice> raw_slices;
std::string raw_data;
if (i == 0) {
raw_data = generate_rand_str(102400);
} else {
raw_data = generate_str(102400);
}
// Hand the input over as three slices (10240 + 10240 + 81920 = 102400 bytes)
// so the multi-slice compress path is exercised.
raw_slices.emplace_back(raw_data.data(), 10240);
raw_slices.emplace_back(raw_data.data() + 10240, 10240);
raw_slices.emplace_back(raw_data.data() + 20480, 81920);
std::vector<Slice> compressed_slices;
auto st = compressor.compress(raw_slices, &compressed_slices);
ASSERT_TRUE(st.ok());
// Flatten the compressed slices into one contiguous buffer for the decompressor.
std::string compressed_data = Slice::to_string(compressed_slices);
// decompress
PageDecompressor decompressor(compressed_data, codec);
{
Slice check_slice;
st = decompressor.decompress_to(&check_slice);
ASSERT_TRUE(st.ok());
// ASSERT_STREQ compares as C strings, i.e. only up to the first NUL byte.
// NOTE(review): check_slice.data is presumably not guaranteed to be
// NUL-terminated; a size + memcmp comparison would be stricter. Confirm.
ASSERT_STREQ(raw_data.c_str(), check_slice.data);
// Free the result only when it does not alias the input buffer.
// Presumably decompress_to allocates with new[] in that case (TODO confirm).
if (check_slice.data != compressed_data.data()) {
delete[] check_slice.data;
}
}
}
}
// Negative test: compresses 102400 bytes, drops the last byte of the compressed
// output and expects PageDecompressor::decompress_to to report a failure.
// Runs twice: with rand()-generated input (i == 0) and a repeating pattern (i == 1).
TEST_F(PageCompressionTest, bad_case) {
const BlockCompressionCodec* codec = nullptr;
// NOTE(review): the result of get_block_compression_codec is not checked here.
get_block_compression_codec(segment_v2::CompressionTypePB::LZ4F, &codec);
for (int i = 0; i < 2; ++i) {
// compress
PageCompressor compressor(codec);
std::vector<Slice> raw_slices;
std::string raw_data;
if (i == 0) {
raw_data = generate_rand_str(102400);
} else {
raw_data = generate_str(102400);
}
// The whole input is passed as a single slice.
raw_slices.emplace_back(raw_data.data(), 102400);
std::vector<Slice> compressed_slices;
auto st = compressor.compress(raw_slices, &compressed_slices);
ASSERT_TRUE(st.ok());
std::string compressed_data = Slice::to_string(compressed_slices);
// Corrupt the page by cutting off its final byte.
Slice bad_compressed_slice(compressed_data.data(), compressed_data.size() - 1);
// decompress
PageDecompressor decompressor(bad_compressed_slice, codec);
{
Slice check_slice;
st = decompressor.decompress_to(&check_slice);
// The truncated page must be rejected.
ASSERT_FALSE(st.ok());
}
}
}
}
}
// Test binary entry point: gtest parses its own command-line flags first, then
// every registered test is run and the outcome becomes the process exit code.
int main(int argc, char** argv) {
    ::testing::InitGoogleTest(&argc, argv);
    const int exit_code = RUN_ALL_TESTS();
    return exit_code;
}

View File

@ -57,6 +57,15 @@ static void DefaultIntGenerator(size_t rid, int cid, int block_id, RowCursorCell
*(int*)cell.mutable_cell_ptr() = rid * 10 + cid;
}
static bool column_contains_index(ColumnMetaPB column_meta, ColumnIndexTypePB type) {
for (int i = 0; i < column_meta.indexes_size(); ++i) {
if (column_meta.indexes(i).type() == type) {
return true;
}
}
return false;
}
class SegmentReaderWriterTest : public ::testing::Test {
protected:
void SetUp() override {
@ -353,7 +362,7 @@ TEST_F(SegmentReaderWriterTest, LazyMaterialization) {
shared_ptr<Segment> segment;
SegmentWriterOptions write_opts;
build_segment(write_opts, tablet_schema, tablet_schema, 100, data_gen, &segment);
ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index());
ASSERT_TRUE(column_contains_index(segment->footer().columns(0), BITMAP_INDEX));
{
// lazy disabled when all predicates are removed by bitmap index:
// select c1, c2 where c2 = 30;
@ -972,8 +981,8 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) {
SegmentWriterOptions opts;
shared_ptr<Segment> segment;
build_segment(opts, tablet_schema, tablet_schema, 4096, DefaultIntGenerator, &segment);
ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index());
ASSERT_TRUE(segment->footer().columns(1).has_bitmap_index());
ASSERT_TRUE(column_contains_index(segment->footer().columns(0), BITMAP_INDEX));
ASSERT_TRUE(column_contains_index(segment->footer().columns(1), BITMAP_INDEX));
{
Schema schema(tablet_schema);
@ -1104,14 +1113,14 @@ TEST_F(SegmentReaderWriterTest, TestBloomFilterIndexUniqueModel) {
opts1.whether_to_filter_value = false;
shared_ptr<Segment> seg1;
build_segment(opts1, schema, schema, 100, DefaultIntGenerator, &seg1);
ASSERT_FALSE(seg1->footer().columns(3).has_bloom_filter_index());
ASSERT_FALSE(column_contains_index(seg1->footer().columns(3), BLOOM_FILTER_INDEX));
// for base segment
SegmentWriterOptions opts2;
opts2.whether_to_filter_value = true;
shared_ptr<Segment> seg2;
build_segment(opts2, schema, schema, 100, DefaultIntGenerator, &seg2);
ASSERT_TRUE(seg2->footer().columns(3).has_bloom_filter_index());
ASSERT_TRUE(column_contains_index(seg2->footer().columns(3), BLOOM_FILTER_INDEX));
}
}

View File

@ -16,41 +16,70 @@
// under the License.
#include <gtest/gtest.h>
#include <memory>
#include "olap/rowset/segment_v2/column_zone_map.h"
#include <memory>
#include <string>
#include "env/env.h"
#include "olap/rowset/segment_v2/zone_map_index.h"
#include "olap/tablet_schema_helper.h"
#include "util/file_utils.h"
namespace doris {
namespace segment_v2 {
class ColumnZoneMapTest : public testing::Test {
public:
void test_string(Field* field) {
ColumnZoneMapBuilder builder(field);
const std::string kTestDir = "./ut_dir/zone_map_index_test";
void SetUp() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
ASSERT_TRUE(FileUtils::create_dir(kTestDir).ok());
}
void TearDown() override {
if (FileUtils::check_exist(kTestDir)) {
ASSERT_TRUE(FileUtils::remove_all(kTestDir).ok());
}
}
void test_string(std::string testname, Field* field) {
std::string filename = kTestDir + "/" + testname;
ZoneMapIndexWriter builder(field);
std::vector<std::string> values1 = {"aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"};
for (auto& value : values1) {
Slice slice(value);
builder.add((const uint8_t*)&slice, 1);
builder.add_values((const uint8_t*)&slice, 1);
}
builder.flush();
std::vector<std::string> values2 = {"aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "fffff"};
for (auto& value : values2) {
Slice slice(value);
builder.add((const uint8_t*)&slice, 1);
builder.add_values((const uint8_t*)&slice, 1);
}
builder.add(nullptr, 1);
builder.add_nulls(1);
builder.flush();
for (int i = 0; i < 6; ++i) {
builder.add(nullptr, 1);
builder.add_nulls(1);
}
builder.flush();
OwnedSlice zone_map_page = builder.finish();
ColumnZoneMap column_zone_map(zone_map_page.slice());
Status status = column_zone_map.load();
// write out zone map index
ColumnIndexMetaPB index_meta;
{
std::unique_ptr<WritableFile> out_file;
ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok());
ASSERT_TRUE(builder.finish(out_file.get(), &index_meta).ok());
ASSERT_EQ(ZONE_MAP_INDEX, index_meta.type());
}
ZoneMapIndexReader column_zone_map(filename, &index_meta.zone_map_index());
Status status = column_zone_map.load(true, false);
ASSERT_TRUE(status.ok());
ASSERT_EQ(3, column_zone_map.num_pages());
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.get_column_zone_map();
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
ASSERT_EQ(3, zone_maps.size());
ASSERT_EQ("aaaa", zone_maps[0].min());
ASSERT_EQ("ffff", zone_maps[0].max());
@ -69,31 +98,39 @@ public:
// Test for int
TEST_F(ColumnZoneMapTest, NormalTestIntPage) {
std::string filename = kTestDir + "/NormalTestIntPage";
TabletColumn int_column = create_int_key(0);
Field* field = FieldFactory::create(int_column);
ColumnZoneMapBuilder builder(field);
ZoneMapIndexWriter builder(field);
std::vector<int> values1 = {1, 10, 11, 20, 21, 22};
for (auto value : values1) {
builder.add((const uint8_t*)&value, 1);
builder.add_values((const uint8_t*)&value, 1);
}
builder.flush();
std::vector<int> values2 = {2, 12, 31, 23, 21, 22};
for (auto value : values2) {
builder.add((const uint8_t*)&value, 1);
builder.add_values((const uint8_t*)&value, 1);
}
builder.add(nullptr, 1);
builder.add_nulls(1);
builder.flush();
for (int i = 0; i < 6; ++i) {
builder.add(nullptr, 1);
builder.add_nulls(6);
builder.flush();
// write out zone map index
ColumnIndexMetaPB index_meta;
{
std::unique_ptr<WritableFile> out_file;
ASSERT_TRUE(Env::Default()->new_writable_file(filename, &out_file).ok());
ASSERT_TRUE(builder.finish(out_file.get(), &index_meta).ok());
ASSERT_EQ(ZONE_MAP_INDEX, index_meta.type());
}
builder.flush();
OwnedSlice zone_map_page = builder.finish();
ColumnZoneMap column_zone_map(zone_map_page.slice());
Status status = column_zone_map.load();
ZoneMapIndexReader column_zone_map(filename, &index_meta.zone_map_index());
Status status = column_zone_map.load(true, false);
ASSERT_TRUE(status.ok());
ASSERT_EQ(3, column_zone_map.num_pages());
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.get_column_zone_map();
const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
ASSERT_EQ(3, zone_maps.size());
ASSERT_EQ(std::to_string(1), zone_maps[0].min());
@ -114,14 +151,14 @@ TEST_F(ColumnZoneMapTest, NormalTestIntPage) {
TEST_F(ColumnZoneMapTest, NormalTestVarcharPage) {
TabletColumn varchar_column = create_varchar_key(0);
Field* field = FieldFactory::create(varchar_column);
test_string(field);
test_string("NormalTestVarcharPage", field);
}
// Test for string
TEST_F(ColumnZoneMapTest, NormalTestCharPage) {
TabletColumn char_column = create_char_key(0);
Field* field = FieldFactory::create(char_column);
test_string(field);
test_string("NormalTestCharPage", field);
}
}

View File

@ -35,20 +35,25 @@ public:
TEST_F(ShortKeyIndexTest, buider) {
ShortKeyIndexBuilder builder(0, 1024);
int num_items = 0;
for (int i = 1000; i < 10000; i += 2) {
builder.add_item(std::to_string(i));
num_items++;
}
std::vector<Slice> slices;
auto st = builder.finalize(10000, 9000 * 1024, &slices);
segment_v2::PageFooterPB footer;
auto st = builder.finalize(9000 * 1024, &slices, &footer);
ASSERT_TRUE(st.ok());
ASSERT_EQ(segment_v2::SHORT_KEY_PAGE, footer.type());
ASSERT_EQ(num_items, footer.short_key_page_footer().num_items());
std::string buf;
for (auto& slice : slices) {
buf.append(slice.data, slice.size);
}
ShortKeyIndexDecoder decoder(buf);
st = decoder.parse();
ShortKeyIndexDecoder decoder;
st = decoder.parse(buf, footer.short_key_page_footer());
ASSERT_TRUE(st.ok());
// find 1499

View File

@ -20,20 +20,6 @@ syntax="proto2";
package doris.segment_v2;
// Schema description of a single column. FileFooterPB.schema holds a repeated
// list of these entries (the tablet schema).
message ColumnSchemaPB {
optional uint32 column_id = 1;
optional string type = 2;
optional string aggregation = 3;
optional uint32 length = 4;
optional bool is_key = 5;
optional string default_value = 6;
optional uint32 precision = 9 [default = 27];
optional uint32 frac = 10 [default = 9];
optional bool is_nullable = 11 [default=false];
optional bool is_bf_column = 15 [default=false]; // whether the column is bloom filter indexed
optional bool has_bitmap_index = 16 [default=false];
}
// page position info
message PagePointerPB {
required uint64 offset = 1; // offset in segment file
@ -67,6 +53,75 @@ enum CompressionTypePB {
ZSTD = 7;
}
// Kind of a page. PageFooterPB.type carries one of these values to indicate
// which of its *_footer fields is set.
enum PageTypePB {
UNKNOWN_PAGE_TYPE = 0;
DATA_PAGE = 1;
INDEX_PAGE = 2;
DICTIONARY_PAGE = 3;
SHORT_KEY_PAGE = 4;
}
// Footer of a page whose type is DATA_PAGE (see PageFooterPB.data_page_footer).
// Fields marked "required" are declared optional in proto2 terms but are
// expected to be set.
message DataPageFooterPB {
// required: ordinal of the first value
optional uint64 first_ordinal = 1;
// required: number of values, including NULLs
optional uint64 num_values = 2;
// required: size of nullmap, 0 if the page doesn't contain NULL
optional uint32 nullmap_size = 3;
// only for array columns: largest array item ordinal + 1,
// used to calculate the length of the last array in this page
optional uint64 next_array_item_ordinal = 4;
}
// Footer of a page whose type is INDEX_PAGE (see PageFooterPB.index_page_footer).
message IndexPageFooterPB {
// required: number of index entries in this page
optional uint32 num_entries = 1;
// Kind of index page: LEAF or INTERNAL.
enum Type {
UNKNOWN_INDEX_PAGE_TYPE = 0;
LEAF = 1;
INTERNAL = 2;
};
// required: type of the index page
optional Type type = 2;
}
// Footer of a page whose type is DICTIONARY_PAGE (see PageFooterPB.dict_page_footer).
message DictPageFooterPB {
// required: encoding used for the dictionary
optional EncodingTypePB encoding = 1;
}
// Footer of a page whose type is SHORT_KEY_PAGE (see PageFooterPB.short_key_page_footer).
message ShortKeyFooterPB {
// Number of index items in this index
optional uint32 num_items = 1;
// Total bytes occupied by the index keys
optional uint32 key_bytes = 2;
// Total bytes occupied by the key offsets
optional uint32 offset_bytes = 3;
// Id of the segment this index belongs to
optional uint32 segment_id = 4;
// Number of rows in each block
optional uint32 num_rows_per_block = 5;
// Number of rows in this segment
optional uint32 num_segment_rows = 6;
}
// Footer common to all page types. `type` selects which one of the
// type-specific *_footer fields below is present.
message PageFooterPB {
// required: indicates which of the *_footer fields is set
optional PageTypePB type = 1;
// required: page body size before compression (excluding footer and crc).
// the page body is not compressed when this value equals the page body size
optional uint32 uncompressed_size = 2;
// present only when type == DATA_PAGE
optional DataPageFooterPB data_page_footer = 7;
// present only when type == INDEX_PAGE
optional IndexPageFooterPB index_page_footer = 8;
// present only when type == DICTIONARY_PAGE
optional DictPageFooterPB dict_page_footer = 9;
// present only when type == SHORT_KEY_PAGE
optional ShortKeyFooterPB short_key_page_footer = 10;
}
message ZoneMapPB {
// minimum not-null value, invalid when all values are null(has_not_null==false)
optional bytes min = 1;
@ -85,72 +140,17 @@ message ColumnMetaPB {
optional uint32 unique_id = 2;
// this field is FieldType's value
optional int32 type = 3;
optional EncodingTypePB encoding = 4;
// compress type for column
optional CompressionTypePB compression = 5;
// if this column can be nullable
optional bool is_nullable = 6;
// ordinal index page
optional PagePointerPB ordinal_index_page = 7;
// page-level zone map index
optional PagePointerPB zone_map_page = 8;
// segment-level zone map
optional ZoneMapPB zone_map = 9;
// // dictionary page for DICT_ENCODING
optional PagePointerPB dict_page = 10;
// bitmap index
optional BitmapIndexColumnPB bitmap_index = 11;
// var length for string type
optional int32 length = 12;
// bloom filter index
optional BloomFilterIndexPB bloom_filter_index = 13;
// // bloom filter pages for bloom filter column
// repeated PagePointerPB bloom_filter_pages = 3;
// optional PagePointerPB page_zonemap_page = 5; // page zonemap info of column
// optional PagePointerPB bitmap_index_page = 6; // bitmap index page
// // data footprint of column after encoding and compress
// optional uint64 data_footprint = 7;
// // index footprint of column after encoding and compress
// optional uint64 index_footprint = 8;
// // raw column data footprint
// optional uint64 raw_data_footprint = 9;
// optional ZoneMapPB column_zonemap = 11; // column zonemap info
// repeated MetadataPairPB column_meta_datas = 12;
}
// File-level footer: format version, schema, value count, footprint
// statistics, default compression, metadata pairs and the short key index pointer.
message FileFooterPB {
optional uint32 version = 1 [default = 1]; // file version
repeated ColumnSchemaPB schema = 2; // tablet schema
optional uint64 num_values = 3; // number of values
optional uint64 index_footprint = 4; // total index footprint of all columns
optional uint64 data_footprint = 5; // total data footprint of all columns
optional uint64 raw_data_footprint = 6; // raw data footprint
optional CompressionTypePB compress_type = 7 [default = LZ4F]; // default compression type for file columns
repeated MetadataPairPB file_meta_datas = 8; // metadata of the file
optional PagePointerPB key_index_page = 9; // short key index page
}
message ShortKeyFooterPB {
// How many index item in this index.
optional uint32 num_items = 1;
// The total bytes occupied by the index key
optional uint32 key_bytes = 2;
// The total bytes occupied by the key offsets
optional uint32 offset_bytes = 3;
// Segment id which this index is belong to
optional uint32 segment_id = 4;
// number rows in each block
optional uint32 num_rows_per_block = 5;
// How many rows in this segment
optional uint32 num_segment_rows = 6;
// Total bytes for this segment
optional uint32 segment_bytes = 7;
optional int32 length = 4;
optional EncodingTypePB encoding = 5;
// compress type for column
optional CompressionTypePB compression = 6;
// if this column can be nullable
optional bool is_nullable = 7;
// metadata about all the column indexes
repeated ColumnIndexMetaPB indexes = 8;
// pointer to dictionary page when using DICT_ENCODING
optional PagePointerPB dict_page = 9;
}
message SegmentFooterPB {
@ -168,19 +168,6 @@ message SegmentFooterPB {
optional PagePointerPB short_key_index_page = 9;
}
// Footer of an index page: how many entries it holds and whether it is a
// LEAF or an INTERNAL page.
message IndexPageFooterPB {
// required: number of entries in this page
optional int32 num_entries = 1;
// Kind of index page: LEAF or INTERNAL.
enum Type {
UNKNOWN_INDEX_PAGE_TYPE = 0;
LEAF = 1;
INTERNAL = 2;
};
// required: type of the index page
optional Type type = 2;
}
message BTreeMetaPB {
// required: pointer to either root index page or sole data page based on is_root_data_page
optional PagePointerPB root_page = 1;
@ -205,22 +192,53 @@ message IndexedColumnMetaPB {
optional uint64 size = 7;
}
message BitmapIndexColumnPB {
// -------------------------------------------------------------
// Column Index Metadata
// -------------------------------------------------------------
// Kinds of per-column index; used as the value of ColumnIndexMetaPB.type.
enum ColumnIndexTypePB {
UNKNOWN_INDEX_TYPE = 0;
ORDINAL_INDEX = 1;
ZONE_MAP_INDEX = 2;
BITMAP_INDEX = 3;
BLOOM_FILTER_INDEX = 4;
}
// Metadata of one column index. It holds the index type plus one optional
// sub-message per index kind.
// NOTE(review): presumably exactly one of the sub-messages is set, the one
// matching `type` (mirroring PageFooterPB) -- confirm against the writers.
message ColumnIndexMetaPB {
optional ColumnIndexTypePB type = 1;
optional OrdinalIndexPB ordinal_index = 7;
optional ZoneMapIndexPB zone_map_index = 8;
optional BitmapIndexPB bitmap_index = 9;
optional BloomFilterIndexPB bloom_filter_index = 10;
}
// Metadata of a column's ordinal index.
message OrdinalIndexPB {
// required: the root page is the data page itself if there is only one data page,
// or the only index page if there is more than one data page.
optional BTreeMetaPB root_page = 1;
}
// Metadata of a column's zone map index: one segment-level zone map plus the
// per-page zone maps.
message ZoneMapIndexPB {
// required: segment-level zone map
optional ZoneMapPB segment_zone_map = 1;
// required: the zone map of each data page is stored in an IndexedColumn with ordinal index
optional IndexedColumnMetaPB page_zone_maps = 2;
}
message BitmapIndexPB {
enum BitmapType {
UNKNOWN_BITMAP_TYPE = 0;
ROARING_BITMAP = 1;
}
optional uint32 column_id = 1;
optional uint32 unique_id = 2;
optional BitmapType bitmap_type = 1 [default=ROARING_BITMAP];
// required: whether the index contains null key.
// if true, the last bitmap (ordinal:dict_column.num_values) in bitmap_column is
// the bitmap for null key. we don't store null key in dict_column.
optional bool has_null = 3;
optional bool has_null = 2;
// required: meta for ordered dictionary part
optional IndexedColumnMetaPB dict_column = 4;
optional IndexedColumnMetaPB dict_column = 3;
// required: meta for bitmaps part
optional IndexedColumnMetaPB bitmap_column = 5;
optional BitmapType bitmap_type = 6 [default=ROARING_BITMAP];
optional IndexedColumnMetaPB bitmap_column = 4;
}
enum HashStrategyPB {
@ -238,4 +256,4 @@ message BloomFilterIndexPB {
optional BloomFilterAlgorithmPB algorithm = 2;
// required: meta for bloom filters
optional IndexedColumnMetaPB bloom_filter = 3;
}
}

View File

@ -282,18 +282,17 @@ ${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/ordinal_page_index_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bitshuffle_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/plain_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_plain_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bitmap_index_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/column_reader_writer_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/index_column_reader_writer_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/rle_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_dict_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/binary_prefix_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/segment_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/page_compression_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/column_zone_map_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/row_ranges_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/frame_of_reference_page_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/block_bloom_filter_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test
${DORIS_TEST_BINARY_DIR}/olap/rowset/segment_v2/zone_map_index_test
${DORIS_TEST_BINARY_DIR}/olap/txn_manager_test
${DORIS_TEST_BINARY_DIR}/olap/storage_types_test
${DORIS_TEST_BINARY_DIR}/olap/generic_iterators_test