From fad9237d3050fa47db27beceb8bb2b614c9619ba Mon Sep 17 00:00:00 2001 From: zhannngchen <48427519+zhannngchen@users.noreply.github.com> Date: Mon, 15 May 2023 17:16:31 +0800 Subject: [PATCH] [fix](storage) consider file size on page cache key (#19619) The core dump is caused by a DCHECK failure: F0513 22:48:56.059758 3996895 tablet.cpp:2690] Check failed: num_to_read == num_read Finally, we found that the DCHECK failure is due to the page cache: 1. At first we have 20 segments, whose ids are 0-19. 2. For a MoW (merge-on-write) table, the memtable flush process will calculate the delete bitmap. During this procedure, the index pages and data pages of the PrimaryKeyIndex are loaded into the cache. 3. Segment compaction compacts all these 20 segments into 2 segments, and renames them to ids 0 and 1. 4. Finally, before the load commits, we calculate the delete bitmap between segments in the current rowset. This procedure needs to iterate over the primary key index of each segment, but when we access data of the newly compacted segments, we actually read data of the old segments from the page cache. To fix this issue, the best policy is: 1. Add a crc32 or last-modified time to CacheKey. 2. Or invalidate the related cache keys after segment compaction. For policy 1, we don't have a crc32 in the segment footer, and getting the last-modified time requires 1 additional disk IO. For policy 2, we would need to add extra page cache invalidation methods, which may make the page cache unstable. So I think we can simply add the file size to the key to identify that the file has changed. In an LSM-Tree, every modification generates new files, so such file-name reuse is not a normal case (as far as I know, only segment compaction does it); the file size is therefore enough to identify the file change. 
--- be/src/olap/page_cache.h | 5 +- be/src/olap/rowset/segment_v2/page_io.cpp | 2 +- .../olap/rowset/segment_v2/segment_writer.cpp | 5 ++ be/src/olap/tablet.cpp | 7 ++- be/test/olap/page_cache_test.cpp | 52 ++++++++++++------- 5 files changed, 50 insertions(+), 21 deletions(-) diff --git a/be/src/olap/page_cache.h b/be/src/olap/page_cache.h index cab0da2dd7..faeff28d43 100644 --- a/be/src/olap/page_cache.h +++ b/be/src/olap/page_cache.h @@ -45,13 +45,16 @@ public: // TODO(zc): Now we use file name(std::string) as a part of // key, which is not efficient. We should make it better later struct CacheKey { - CacheKey(std::string fname_, int64_t offset_) : fname(std::move(fname_)), offset(offset_) {} + CacheKey(std::string fname_, size_t fsize_, int64_t offset_) + : fname(std::move(fname_)), fsize(fsize_), offset(offset_) {} std::string fname; + size_t fsize; int64_t offset; // Encode to a flat binary which can be used as LRUCache's key std::string encode() const { std::string key_buf(fname); + key_buf.append((char*)&fsize, sizeof(fsize)); key_buf.append((char*)&offset, sizeof(offset)); return key_buf; } diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp index ee8b761053..f9b1fb3408 100644 --- a/be/src/olap/rowset/segment_v2/page_io.cpp +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -119,7 +119,7 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* auto cache = StoragePageCache::instance(); PageCacheHandle cache_handle; StoragePageCache::CacheKey cache_key(opts.file_reader->path().native(), - opts.page_pointer.offset); + opts.file_reader->size(), opts.page_pointer.offset); if (opts.use_page_cache && cache->is_cache_available(opts.type) && cache->lookup(cache_key, &cache_handle, opts.type)) { // we find page in cache, use it diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index bb6f47d759..d57406dfdd 100644 --- 
a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -577,13 +577,18 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po if (_has_key) { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { // create primary indexes + std::string last_key; for (size_t pos = 0; pos < num_rows; pos++) { std::string key = _full_encode_keys(key_columns, pos); if (_tablet_schema->has_sequence_col()) { _encode_seq_column(seq_column, pos, &key); } + DCHECK(key.compare(last_key) > 0) + << "found duplicate key or key is not sorted! current key: " << key + << ", last key" << last_key; RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); _maybe_invalid_row_cache(key); + last_key = std::move(key); } } else { // create short key indexes' diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 47fe3a1bb8..6a2fc1553d 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -2684,10 +2684,15 @@ Status Tablet::calc_delete_bitmap(RowsetSharedPtr rowset, auto index_column = index_type->create_column(); Slice last_key_slice(last_key); RETURN_IF_ERROR(iter->seek_at_or_after(&last_key_slice, &exact_match)); + auto current_ordinal = iter->get_current_ordinal(); + DCHECK(total == remaining + current_ordinal) + << "total: " << total << ", remaining: " << remaining + << ", current_ordinal: " << current_ordinal; size_t num_read = num_to_read; RETURN_IF_ERROR(iter->next_batch(&num_read, index_column)); - DCHECK(num_to_read == num_read); + DCHECK(num_to_read == num_read) + << "num_to_read: " << num_to_read << ", num_read: " << num_read; last_key = index_column->get_data_at(num_read - 1).to_string(); // exclude last_key, last_key will be read in next batch. 
diff --git a/be/test/olap/page_cache_test.cpp b/be/test/olap/page_cache_test.cpp index 9ec0a9c4a2..8c9a37183e 100644 --- a/be/test/olap/page_cache_test.cpp +++ b/be/test/olap/page_cache_test.cpp @@ -36,8 +36,8 @@ public: TEST(StoragePageCacheTest, data_page_only) { StoragePageCache cache(kNumShards * 2048, 0, kNumShards); - StoragePageCache::CacheKey key("abc", 0); - StoragePageCache::CacheKey memory_key("mem", 0); + StoragePageCache::CacheKey key("abc", 0, 0); + StoragePageCache::CacheKey memory_key("mem", 0, 0); segment_v2::PageTypePB page_type = segment_v2::DATA_PAGE; @@ -70,16 +70,24 @@ TEST(StoragePageCacheTest, data_page_only) { // put too many page to eliminate first page for (int i = 0; i < 10 * kNumShards; ++i) { - StoragePageCache::CacheKey key("bcd", i); + StoragePageCache::CacheKey key("bcd", 0, i); PageCacheHandle handle; Slice data(new char[1024], 1024); cache.insert(key, data, &handle, page_type, false); } - // cache miss + // cache miss, different offset { PageCacheHandle handle; - StoragePageCache::CacheKey miss_key("abc", 1); + StoragePageCache::CacheKey miss_key("abc", 0, 1); + auto found = cache.lookup(miss_key, &handle, page_type); + EXPECT_FALSE(found); + } + + // cache miss, different file size + { + PageCacheHandle handle; + StoragePageCache::CacheKey miss_key("abc", 1, 0); auto found = cache.lookup(miss_key, &handle, page_type); EXPECT_FALSE(found); } @@ -96,8 +104,8 @@ TEST(StoragePageCacheTest, data_page_only) { TEST(StoragePageCacheTest, index_page_only) { StoragePageCache cache(kNumShards * 2048, 100, kNumShards); - StoragePageCache::CacheKey key("abc", 0); - StoragePageCache::CacheKey memory_key("mem", 0); + StoragePageCache::CacheKey key("abc", 0, 0); + StoragePageCache::CacheKey memory_key("mem", 0, 0); segment_v2::PageTypePB page_type = segment_v2::INDEX_PAGE; @@ -130,16 +138,24 @@ TEST(StoragePageCacheTest, index_page_only) { // put too many page to eliminate first page for (int i = 0; i < 10 * kNumShards; ++i) { - 
StoragePageCache::CacheKey key("bcd", i); + StoragePageCache::CacheKey key("bcd", 0, i); PageCacheHandle handle; Slice data(new char[1024], 1024); cache.insert(key, data, &handle, page_type, false); } - // cache miss + // cache miss, different offset { PageCacheHandle handle; - StoragePageCache::CacheKey miss_key("abc", 1); + StoragePageCache::CacheKey miss_key("abc", 1, 1); + auto found = cache.lookup(miss_key, &handle, page_type); + EXPECT_FALSE(found); + } + + // cache miss, different file size + { + PageCacheHandle handle; + StoragePageCache::CacheKey miss_key("abc", 1, 0); auto found = cache.lookup(miss_key, &handle, page_type); EXPECT_FALSE(found); } @@ -156,10 +172,10 @@ TEST(StoragePageCacheTest, index_page_only) { TEST(StoragePageCacheTest, mixed_pages) { StoragePageCache cache(kNumShards * 2048, 10, kNumShards); - StoragePageCache::CacheKey data_key("data", 0); - StoragePageCache::CacheKey index_key("index", 0); - StoragePageCache::CacheKey data_key_mem("data_mem", 0); - StoragePageCache::CacheKey index_key_mem("index_mem", 0); + StoragePageCache::CacheKey data_key("data", 0, 0); + StoragePageCache::CacheKey index_key("index", 0, 0); + StoragePageCache::CacheKey data_key_mem("data_mem", 0, 0); + StoragePageCache::CacheKey index_key_mem("index_mem", 0, 0); segment_v2::PageTypePB page_type_data = segment_v2::DATA_PAGE; segment_v2::PageTypePB page_type_index = segment_v2::INDEX_PAGE; @@ -204,7 +220,7 @@ TEST(StoragePageCacheTest, mixed_pages) { // put too many page to eliminate first page of both cache for (int i = 0; i < 10 * kNumShards; ++i) { - StoragePageCache::CacheKey key("bcd", i); + StoragePageCache::CacheKey key("bcd", 0, i); PageCacheHandle handle; Slice data(new char[1024], 1024), index(new char[1024], 1024); cache.insert(key, data, &handle, page_type_data, false); @@ -214,7 +230,7 @@ TEST(StoragePageCacheTest, mixed_pages) { // cache miss by key { PageCacheHandle data_handle, index_handle; - StoragePageCache::CacheKey miss_key("abc", 1); + 
StoragePageCache::CacheKey miss_key("abc", 0, 1); auto found_data = cache.lookup(miss_key, &data_handle, page_type_data); auto found_index = cache.lookup(miss_key, &index_handle, page_type_index); EXPECT_FALSE(found_data); @@ -224,8 +240,8 @@ TEST(StoragePageCacheTest, mixed_pages) { // cache miss by page type { PageCacheHandle data_handle, index_handle; - StoragePageCache::CacheKey miss_key_data("data_miss", 1); - StoragePageCache::CacheKey miss_key_index("index_miss", 1); + StoragePageCache::CacheKey miss_key_data("data_miss", 0, 1); + StoragePageCache::CacheKey miss_key_index("index_miss", 0, 1); char* buf_data = new char[1024]; char* buf_index = new char[1024]; Slice data(buf_data, 1024), index(buf_index, 1024);