[fix](storage) consider file size on page cache key (#19619)

The core dump is caused by a DCHECK failure:

F0513 22:48:56.059758 3996895 tablet.cpp:2690] Check failed: num_to_read == num_read
Finally, we found that the DCHECK failure is due to page cache:

1. At first we have 20 segments, whose ids are 0-19.
2. For a MoW (merge-on-write) table, the memtable flush process will calculate the delete bitmap. In this procedure, the index pages and data pages of the PrimaryKeyIndex are loaded into the page cache.
3. Segment compaction compacts all these 20 segments into 2 segments and renames them to ids 0 and 1.
4. Finally, before the load commit, we'll calculate the delete bitmap between segments in the current rowset. This procedure needs to iterate the primary key index of each segment, but when we access data of the newly compacted segments, we actually read data of the old segments from the page cache.
To fix this issue, the best policy is:

1. Add a crc32 checksum or last-modified time to the CacheKey.
2. Or invalidate the related cache keys after segment compaction.
For policy 1, we don't have a crc32 in the segment footer, and getting the last-modified time needs 1 additional disk IO.
For policy 2, we need to add additional page cache invalidation methods, which may make the page cache unstable.

So I think we can simply add the file size to the key to identify that the file has changed.
In an LSM-Tree, every modification generates new files, so reusing a file name is not a normal case (as far as I know, only segment compaction does it); therefore the file size is enough to identify a file change.
This commit is contained in:
zhannngchen
2023-05-15 17:16:31 +08:00
committed by GitHub
parent c87e78dc35
commit fad9237d30
5 changed files with 50 additions and 21 deletions

View File

@ -45,13 +45,16 @@ public:
// TODO(zc): Now we use file name(std::string) as a part of
// key, which is not efficient. We should make it better later
struct CacheKey {
CacheKey(std::string fname_, int64_t offset_) : fname(std::move(fname_)), offset(offset_) {}
CacheKey(std::string fname_, size_t fsize_, int64_t offset_)
: fname(std::move(fname_)), fsize(fsize_), offset(offset_) {}
std::string fname;
size_t fsize;
int64_t offset;
// Encode to a flat binary which can be used as LRUCache's key
std::string encode() const {
std::string key_buf(fname);
key_buf.append((char*)&fsize, sizeof(fsize));
key_buf.append((char*)&offset, sizeof(offset));
return key_buf;
}

View File

@ -119,7 +119,7 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle*
auto cache = StoragePageCache::instance();
PageCacheHandle cache_handle;
StoragePageCache::CacheKey cache_key(opts.file_reader->path().native(),
opts.page_pointer.offset);
opts.file_reader->size(), opts.page_pointer.offset);
if (opts.use_page_cache && cache->is_cache_available(opts.type) &&
cache->lookup(cache_key, &cache_handle, opts.type)) {
// we find page in cache, use it

View File

@ -577,13 +577,18 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po
if (_has_key) {
if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) {
// create primary indexes
std::string last_key;
for (size_t pos = 0; pos < num_rows; pos++) {
std::string key = _full_encode_keys(key_columns, pos);
if (_tablet_schema->has_sequence_col()) {
_encode_seq_column(seq_column, pos, &key);
}
DCHECK(key.compare(last_key) > 0)
<< "found duplicate key or key is not sorted! current key: " << key
<< ", last key" << last_key;
RETURN_IF_ERROR(_primary_key_index_builder->add_item(key));
_maybe_invalid_row_cache(key);
last_key = std::move(key);
}
} else {
// create short key indexes'

View File

@ -2684,10 +2684,15 @@ Status Tablet::calc_delete_bitmap(RowsetSharedPtr rowset,
auto index_column = index_type->create_column();
Slice last_key_slice(last_key);
RETURN_IF_ERROR(iter->seek_at_or_after(&last_key_slice, &exact_match));
auto current_ordinal = iter->get_current_ordinal();
DCHECK(total == remaining + current_ordinal)
<< "total: " << total << ", remaining: " << remaining
<< ", current_ordinal: " << current_ordinal;
size_t num_read = num_to_read;
RETURN_IF_ERROR(iter->next_batch(&num_read, index_column));
DCHECK(num_to_read == num_read);
DCHECK(num_to_read == num_read)
<< "num_to_read: " << num_to_read << ", num_read: " << num_read;
last_key = index_column->get_data_at(num_read - 1).to_string();
// exclude last_key, last_key will be read in next batch.

View File

@ -36,8 +36,8 @@ public:
TEST(StoragePageCacheTest, data_page_only) {
StoragePageCache cache(kNumShards * 2048, 0, kNumShards);
StoragePageCache::CacheKey key("abc", 0);
StoragePageCache::CacheKey memory_key("mem", 0);
StoragePageCache::CacheKey key("abc", 0, 0);
StoragePageCache::CacheKey memory_key("mem", 0, 0);
segment_v2::PageTypePB page_type = segment_v2::DATA_PAGE;
@ -70,16 +70,24 @@ TEST(StoragePageCacheTest, data_page_only) {
// put too many page to eliminate first page
for (int i = 0; i < 10 * kNumShards; ++i) {
StoragePageCache::CacheKey key("bcd", i);
StoragePageCache::CacheKey key("bcd", 0, i);
PageCacheHandle handle;
Slice data(new char[1024], 1024);
cache.insert(key, data, &handle, page_type, false);
}
// cache miss
// cache miss, different offset
{
PageCacheHandle handle;
StoragePageCache::CacheKey miss_key("abc", 1);
StoragePageCache::CacheKey miss_key("abc", 0, 1);
auto found = cache.lookup(miss_key, &handle, page_type);
EXPECT_FALSE(found);
}
// cache miss, different file size
{
PageCacheHandle handle;
StoragePageCache::CacheKey miss_key("abc", 1, 0);
auto found = cache.lookup(miss_key, &handle, page_type);
EXPECT_FALSE(found);
}
@ -96,8 +104,8 @@ TEST(StoragePageCacheTest, data_page_only) {
TEST(StoragePageCacheTest, index_page_only) {
StoragePageCache cache(kNumShards * 2048, 100, kNumShards);
StoragePageCache::CacheKey key("abc", 0);
StoragePageCache::CacheKey memory_key("mem", 0);
StoragePageCache::CacheKey key("abc", 0, 0);
StoragePageCache::CacheKey memory_key("mem", 0, 0);
segment_v2::PageTypePB page_type = segment_v2::INDEX_PAGE;
@ -130,16 +138,24 @@ TEST(StoragePageCacheTest, index_page_only) {
// put too many page to eliminate first page
for (int i = 0; i < 10 * kNumShards; ++i) {
StoragePageCache::CacheKey key("bcd", i);
StoragePageCache::CacheKey key("bcd", 0, i);
PageCacheHandle handle;
Slice data(new char[1024], 1024);
cache.insert(key, data, &handle, page_type, false);
}
// cache miss
// cache miss, different offset
{
PageCacheHandle handle;
StoragePageCache::CacheKey miss_key("abc", 1);
StoragePageCache::CacheKey miss_key("abc", 1, 1);
auto found = cache.lookup(miss_key, &handle, page_type);
EXPECT_FALSE(found);
}
// cache miss, different file size
{
PageCacheHandle handle;
StoragePageCache::CacheKey miss_key("abc", 1, 0);
auto found = cache.lookup(miss_key, &handle, page_type);
EXPECT_FALSE(found);
}
@ -156,10 +172,10 @@ TEST(StoragePageCacheTest, index_page_only) {
TEST(StoragePageCacheTest, mixed_pages) {
StoragePageCache cache(kNumShards * 2048, 10, kNumShards);
StoragePageCache::CacheKey data_key("data", 0);
StoragePageCache::CacheKey index_key("index", 0);
StoragePageCache::CacheKey data_key_mem("data_mem", 0);
StoragePageCache::CacheKey index_key_mem("index_mem", 0);
StoragePageCache::CacheKey data_key("data", 0, 0);
StoragePageCache::CacheKey index_key("index", 0, 0);
StoragePageCache::CacheKey data_key_mem("data_mem", 0, 0);
StoragePageCache::CacheKey index_key_mem("index_mem", 0, 0);
segment_v2::PageTypePB page_type_data = segment_v2::DATA_PAGE;
segment_v2::PageTypePB page_type_index = segment_v2::INDEX_PAGE;
@ -204,7 +220,7 @@ TEST(StoragePageCacheTest, mixed_pages) {
// put too many page to eliminate first page of both cache
for (int i = 0; i < 10 * kNumShards; ++i) {
StoragePageCache::CacheKey key("bcd", i);
StoragePageCache::CacheKey key("bcd", 0, i);
PageCacheHandle handle;
Slice data(new char[1024], 1024), index(new char[1024], 1024);
cache.insert(key, data, &handle, page_type_data, false);
@ -214,7 +230,7 @@ TEST(StoragePageCacheTest, mixed_pages) {
// cache miss by key
{
PageCacheHandle data_handle, index_handle;
StoragePageCache::CacheKey miss_key("abc", 1);
StoragePageCache::CacheKey miss_key("abc", 0, 1);
auto found_data = cache.lookup(miss_key, &data_handle, page_type_data);
auto found_index = cache.lookup(miss_key, &index_handle, page_type_index);
EXPECT_FALSE(found_data);
@ -224,8 +240,8 @@ TEST(StoragePageCacheTest, mixed_pages) {
// cache miss by page type
{
PageCacheHandle data_handle, index_handle;
StoragePageCache::CacheKey miss_key_data("data_miss", 1);
StoragePageCache::CacheKey miss_key_index("index_miss", 1);
StoragePageCache::CacheKey miss_key_data("data_miss", 0, 1);
StoragePageCache::CacheKey miss_key_index("index_miss", 0, 1);
char* buf_data = new char[1024];
char* buf_index = new char[1024];
Slice data(buf_data, 1024), index(buf_index, 1024);