Files
doris/be/test/io/cache/file_block_cache_test.cpp
Tiewei Fang f17d69e450 [feature](file cache)Import file cache for remote file reader (#15622)
The main purpose of this pr is to import `fileCache` for lakehouse reading remote files.
Use the local disk as the cache for reading remote file, so the next time this file is read,
the data can be obtained directly from the local disk.
In addition, this pr includes a few other minor changes

Import File Cache:
1. The imported `fileCache` is called `block_file_cache`, which uses lru replacement policy.
2. Implement a new FileRereader `CachedRemoteFilereader`, so that the logic of `file cache` is hidden under `CachedRemoteFilereader`.

Other changes:
1. Add a new interface `fs()` for `FileReader`.
2. `IOContext` adds some statistical information to count the situation of `FileCache`

Co-authored-by: Lightman <31928846+Lchangliang@users.noreply.github.com>
2023-01-10 12:23:56 +08:00

545 lines
23 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/tests/gtest_lru_file_cache.cpp
// and modified by Doris
#include <gtest/gtest.h>
#include <filesystem>
#include <thread>
#include "common/config.h"
#include "io/cache/block/block_file_cache.h"
#include "io/cache/block/block_file_cache_settings.h"
#include "io/cache/block/block_file_segment.h"
#include "io/cache/block/block_lru_file_cache.h"
#include "olap/options.h"
#include "util/slice.h"
namespace doris::io {
namespace fs = std::filesystem;
fs::path caches_dir = fs::current_path() / "lru_cache_test";
std::string cache_base_path = caches_dir / "cache1" / "";
void assert_range([[maybe_unused]] size_t assert_n, io::FileBlockSPtr file_segment,
const io::FileBlock::Range& expected_range, io::FileBlock::State expected_state) {
auto range = file_segment->range();
ASSERT_EQ(range.left, expected_range.left);
ASSERT_EQ(range.right, expected_range.right);
ASSERT_EQ(file_segment->state(), expected_state);
}
std::vector<io::FileBlockSPtr> fromHolder(const io::FileBlocksHolder& holder) {
return std::vector<io::FileBlockSPtr>(holder.file_segments.begin(), holder.file_segments.end());
}
std::string getFileBlockPath(const std::string& base_path, const io::IFileCache::Key& key,
size_t offset) {
auto key_str = key.to_string();
return fs::path(base_path) / key_str / std::to_string(offset);
}
void download(io::FileBlockSPtr file_segment) {
const auto& key = file_segment->key();
size_t size = file_segment->range().size();
auto key_str = key.to_string();
auto subdir = fs::path(cache_base_path) / key_str;
ASSERT_TRUE(fs::exists(subdir));
std::string data(size, '0');
Slice result(data.data(), size);
file_segment->append(result);
file_segment->finalize_write();
}
void complete(const io::FileBlocksHolder& holder) {
for (const auto& file_segment : holder.file_segments) {
ASSERT_TRUE(file_segment->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(file_segment);
}
}
TEST(LRUFileCache, init) {
std::string string = std::string(R"(
[
{
"path" : "/mnt/ssd01/clickbench/hot/be/file_cache",
"normal" : 193273528320,
"persistent" : 193273528320,
"query_limit" : 38654705664
},
{
"path" : "/mnt/ssd01/clickbench/hot/be/file_cache",
"normal" : 193273528320,
"persistent" : 193273528320,
"query_limit" : 38654705664
}
]
)");
config::enable_file_cache_query_limit = true;
std::vector<CachePath> cache_paths;
parse_conf_cache_paths(string, cache_paths);
EXPECT_EQ(cache_paths.size(), 2);
for (const auto& cache_path : cache_paths) {
io::FileCacheSettings settings = cache_path.init_settings();
EXPECT_EQ(settings.max_size, 193273528320);
EXPECT_EQ(settings.persistent_max_size, 193273528320);
EXPECT_EQ(settings.max_query_cache_size, 38654705664);
}
}
void test_file_cache(bool is_persistent) {
TUniqueId query_id;
query_id.hi = 1;
query_id.lo = 1;
TUniqueId other_query_id;
other_query_id.hi = 2;
other_query_id.lo = 2;
io::FileCacheSettings settings;
settings.max_size = 30;
settings.max_elements = 5;
settings.persistent_max_size = 30;
settings.persistent_max_elements = 5;
settings.max_file_segment_size = 100;
auto key = io::IFileCache::hash("key1");
{
io::LRUFileCache cache(cache_base_path, settings);
cache.initialize();
{
auto holder =
cache.get_or_set(key, 0, 10, is_persistent, query_id); /// Add range [0, 9]
auto segments = fromHolder(holder);
/// Range was not present in cache. It should be added in cache as one while file segment.
ASSERT_GE(segments.size(), 1);
assert_range(1, segments[0], io::FileBlock::Range(0, 9), io::FileBlock::State::EMPTY);
/// Exception because space not reserved.
/// EXPECT_THROW(download(segments[0]), DB::Exception);
/// Exception because space can be reserved only by downloader
/// EXPECT_THROW(segments[0]->reserve(segments[0]->range().size()), DB::Exception);
ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id());
assert_range(2, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADING);
download(segments[0]);
assert_range(3, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADED);
}
/// Current cache: [__________]
/// ^ ^
/// 0 9
ASSERT_EQ(cache.get_file_segments_num(is_persistent), 1);
ASSERT_EQ(cache.get_used_cache_size(is_persistent), 10);
{
/// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache.
auto holder = cache.get_or_set(key, 5, 10, is_persistent, query_id);
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 2);
assert_range(4, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADED);
assert_range(5, segments[1], io::FileBlock::Range(10, 14), io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[1]->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(segments[1]);
assert_range(6, segments[1], io::FileBlock::Range(10, 14),
io::FileBlock::State::DOWNLOADED);
}
/// Current cache: [__________][_____]
/// ^ ^^ ^
/// 0 910 14
ASSERT_EQ(cache.get_file_segments_num(is_persistent), 2);
ASSERT_EQ(cache.get_used_cache_size(is_persistent), 15);
{
auto holder = cache.get_or_set(key, 9, 1, is_persistent, query_id); /// Get [9, 9]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assert_range(7, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADED);
}
{
auto holder = cache.get_or_set(key, 9, 2, is_persistent, query_id); /// Get [9, 10]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 2);
assert_range(8, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADED);
assert_range(9, segments[1], io::FileBlock::Range(10, 14),
io::FileBlock::State::DOWNLOADED);
}
{
auto holder = cache.get_or_set(key, 10, 1, is_persistent, query_id); /// Get [10, 10]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assert_range(10, segments[0], io::FileBlock::Range(10, 14),
io::FileBlock::State::DOWNLOADED);
}
complete(cache.get_or_set(key, 17, 4, is_persistent, query_id)); /// Get [17, 20]
complete(cache.get_or_set(key, 24, 3, is_persistent, query_id)); /// Get [24, 26]
/// Current cache: [__________][_____] [____] [___]
/// ^ ^^ ^ ^ ^ ^ ^
/// 0 910 14 17 20 24 26
///
ASSERT_EQ(cache.get_file_segments_num(is_persistent), 4);
ASSERT_EQ(cache.get_used_cache_size(is_persistent), 22);
{
auto holder = cache.get_or_set(key, 0, 26, is_persistent, query_id); /// Get [0, 25]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 6);
assert_range(11, segments[0], io::FileBlock::Range(0, 9),
io::FileBlock::State::DOWNLOADED);
assert_range(12, segments[1], io::FileBlock::Range(10, 14),
io::FileBlock::State::DOWNLOADED);
/// Missing [15, 16] should be added in cache.
assert_range(13, segments[2], io::FileBlock::Range(15, 16),
io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[2]->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(segments[2]);
assert_range(14, segments[3], io::FileBlock::Range(17, 20),
io::FileBlock::State::DOWNLOADED);
/// New [21, 23], but will not be added in cache because of elements limit (5)
assert_range(15, segments[4], io::FileBlock::Range(21, 23),
io::FileBlock::State::SKIP_CACHE);
assert_range(16, segments[5], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
/// Current cache: [__________][_____][ ][____] [___]
/// ^ ^ ^
/// 0 20 24
///
/// Range [27, 27] must be evicted in previous getOrSet [0, 25].
/// Let's not invalidate pointers to returned segments from range [0, 25] and
/// as max elements size is reached, next attempt to put something in cache should fail.
/// This will also check that [27, 27] was indeed evicted.
auto holder1 = cache.get_or_set(key, 27, 1, is_persistent, query_id);
auto segments_1 = fromHolder(holder1); /// Get [27, 27]
ASSERT_EQ(segments_1.size(), 1);
assert_range(17, segments_1[0], io::FileBlock::Range(27, 27),
io::FileBlock::State::SKIP_CACHE);
}
{
auto holder = cache.get_or_set(key, 12, 10, is_persistent, query_id); /// Get [12, 21]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 4);
assert_range(18, segments[0], io::FileBlock::Range(10, 14),
io::FileBlock::State::DOWNLOADED);
assert_range(19, segments[1], io::FileBlock::Range(15, 16),
io::FileBlock::State::DOWNLOADED);
assert_range(20, segments[2], io::FileBlock::Range(17, 20),
io::FileBlock::State::DOWNLOADED);
assert_range(21, segments[3], io::FileBlock::Range(21, 21),
io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[3]->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(segments[3]);
ASSERT_TRUE(segments[3]->state() == io::FileBlock::State::DOWNLOADED);
}
/// Current cache: [_____][__][____][_] [___]
/// ^ ^ ^ ^ ^
/// 10 17 21 24 26
ASSERT_EQ(cache.get_file_segments_num(is_persistent), 5);
{
auto holder = cache.get_or_set(key, 23, 5, is_persistent, query_id); /// Get [23, 28]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 3);
assert_range(22, segments[0], io::FileBlock::Range(23, 23),
io::FileBlock::State::EMPTY);
assert_range(23, segments[1], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
assert_range(24, segments[2], io::FileBlock::Range(27, 27),
io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[0]->get_or_set_downloader() == io::FileBlock::get_caller_id());
ASSERT_TRUE(segments[2]->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(segments[0]);
download(segments[2]);
}
/// Current cache: [____][_] [][___][__]
/// ^ ^ ^^^ ^^ ^
/// 17 21 2324 26 27
{
auto holder5 = cache.get_or_set(key, 2, 3, is_persistent, query_id); /// Get [2, 4]
auto s5 = fromHolder(holder5);
ASSERT_EQ(s5.size(), 1);
assert_range(25, s5[0], io::FileBlock::Range(2, 4), io::FileBlock::State::EMPTY);
auto holder1 = cache.get_or_set(key, 30, 2, is_persistent, query_id); /// Get [30, 31]
auto s1 = fromHolder(holder1);
ASSERT_EQ(s1.size(), 1);
assert_range(26, s1[0], io::FileBlock::Range(30, 31), io::FileBlock::State::EMPTY);
ASSERT_TRUE(s5[0]->get_or_set_downloader() == io::FileBlock::get_caller_id());
ASSERT_TRUE(s1[0]->get_or_set_downloader() == io::FileBlock::get_caller_id());
download(s5[0]);
download(s1[0]);
/// Current cache: [___] [_][___][_] [__]
/// ^ ^ ^ ^ ^ ^ ^ ^
/// 2 4 23 24 26 27 30 31
auto holder2 = cache.get_or_set(key, 23, 1, is_persistent, query_id); /// Get [23, 23]
auto s2 = fromHolder(holder2);
ASSERT_EQ(s2.size(), 1);
auto holder3 = cache.get_or_set(key, 24, 3, is_persistent, query_id); /// Get [24, 26]
auto s3 = fromHolder(holder3);
ASSERT_EQ(s3.size(), 1);
auto holder4 = cache.get_or_set(key, 27, 1, is_persistent, query_id); /// Get [27, 27]
auto s4 = fromHolder(holder4);
ASSERT_EQ(s4.size(), 1);
/// All cache is now unreleasable because pointers are still hold
auto holder6 = cache.get_or_set(key, 0, 40, is_persistent, query_id);
auto f = fromHolder(holder6);
ASSERT_EQ(f.size(), 9);
assert_range(27, f[0], io::FileBlock::Range(0, 1), io::FileBlock::State::SKIP_CACHE);
assert_range(28, f[2], io::FileBlock::Range(5, 22), io::FileBlock::State::SKIP_CACHE);
assert_range(29, f[6], io::FileBlock::Range(28, 29), io::FileBlock::State::SKIP_CACHE);
assert_range(30, f[8], io::FileBlock::Range(32, 39), io::FileBlock::State::SKIP_CACHE);
}
{
auto holder = cache.get_or_set(key, 2, 3, is_persistent, query_id); /// Get [2, 4]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assert_range(31, segments[0], io::FileBlock::Range(2, 4),
io::FileBlock::State::DOWNLOADED);
}
/// Current cache: [___] [_][___][_] [__]
/// ^ ^ ^ ^ ^ ^ ^ ^
/// 2 4 23 24 26 27 30 31
{
auto holder = cache.get_or_set(key, 25, 5, is_persistent, query_id); /// Get [25, 29]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 3);
assert_range(32, segments[0], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
assert_range(33, segments[1], io::FileBlock::Range(27, 27),
io::FileBlock::State::DOWNLOADED);
assert_range(34, segments[2], io::FileBlock::Range(28, 29),
io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[2]->get_or_set_downloader() == io::FileBlock::get_caller_id());
ASSERT_TRUE(segments[2]->state() == io::FileBlock::State::DOWNLOADING);
bool lets_start_download = false;
std::mutex mutex;
std::condition_variable cv;
std::thread other_1([&] {
auto holder_2 = cache.get_or_set(key, 25, 5, is_persistent,
other_query_id); /// Get [25, 29] once again.
auto segments_2 = fromHolder(holder_2);
ASSERT_EQ(segments.size(), 3);
assert_range(35, segments_2[0], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
assert_range(36, segments_2[1], io::FileBlock::Range(27, 27),
io::FileBlock::State::DOWNLOADED);
assert_range(37, segments_2[2], io::FileBlock::Range(28, 29),
io::FileBlock::State::DOWNLOADING);
ASSERT_TRUE(segments[2]->get_or_set_downloader() != io::FileBlock::get_caller_id());
ASSERT_TRUE(segments[2]->state() == io::FileBlock::State::DOWNLOADING);
{
std::lock_guard lock(mutex);
lets_start_download = true;
}
cv.notify_one();
while (segments_2[2]->wait() == io::FileBlock::State::DOWNLOADING) {
}
ASSERT_TRUE(segments_2[2]->state() == io::FileBlock::State::DOWNLOADED);
});
{
std::unique_lock lock(mutex);
cv.wait(lock, [&] { return lets_start_download; });
}
download(segments[2]);
ASSERT_TRUE(segments[2]->state() == io::FileBlock::State::DOWNLOADED);
other_1.join();
}
/// Current cache: [___] [___][_][__][__]
/// ^ ^ ^ ^ ^^ ^^ ^
/// 2 4 24 26 27 2930 31
{
/// Now let's check the similar case but getting ERROR state after segment->wait(), when
/// state is changed not manually via segment->complete(state) but from destructor of holder
/// and notify_all() is also called from destructor of holder.
std::optional<io::FileBlocksHolder> holder;
holder.emplace(cache.get_or_set(key, 3, 23, is_persistent, query_id)); /// Get [3, 25]
auto segments = fromHolder(*holder);
ASSERT_EQ(segments.size(), 3);
assert_range(38, segments[0], io::FileBlock::Range(2, 4),
io::FileBlock::State::DOWNLOADED);
assert_range(39, segments[1], io::FileBlock::Range(5, 23), io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments[1]->get_or_set_downloader() == io::FileBlock::get_caller_id());
ASSERT_TRUE(segments[1]->state() == io::FileBlock::State::DOWNLOADING);
assert_range(40, segments[2], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
bool lets_start_download = false;
std::mutex mutex;
std::condition_variable cv;
std::thread other_1([&] {
auto holder_2 = cache.get_or_set(key, 3, 23, is_persistent,
other_query_id); /// Get [3, 25] once again
auto segments_2 = fromHolder(*holder);
ASSERT_EQ(segments_2.size(), 3);
assert_range(41, segments_2[0], io::FileBlock::Range(2, 4),
io::FileBlock::State::DOWNLOADED);
assert_range(42, segments_2[1], io::FileBlock::Range(5, 23),
io::FileBlock::State::DOWNLOADING);
assert_range(43, segments_2[2], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
ASSERT_TRUE(segments_2[1]->get_downloader() != io::FileBlock::get_caller_id());
ASSERT_TRUE(segments_2[1]->state() == io::FileBlock::State::DOWNLOADING);
{
std::lock_guard lock(mutex);
lets_start_download = true;
}
cv.notify_one();
while (segments_2[1]->wait() == io::FileBlock::State::DOWNLOADING) {
}
ASSERT_TRUE(segments_2[1]->state() == io::FileBlock::State::EMPTY);
ASSERT_TRUE(segments_2[1]->get_or_set_downloader() ==
io::FileBlock::get_caller_id());
download(segments_2[1]);
});
{
std::unique_lock lock(mutex);
cv.wait(lock, [&] { return lets_start_download; });
}
holder.reset();
other_1.join();
ASSERT_TRUE(segments[1]->state() == io::FileBlock::State::DOWNLOADED);
}
}
/// Current cache: [___][ ][___][_][__]
/// ^ ^^ ^ ^^ ^ ^
/// 2 45 24 2627 28 29
{
/// Test LRUCache::restore().
io::LRUFileCache cache2(cache_base_path, settings);
cache2.initialize();
auto holder1 = cache2.get_or_set(key, 2, 28, is_persistent, query_id); /// Get [2, 29]
auto segments1 = fromHolder(holder1);
ASSERT_EQ(segments1.size(), 5);
assert_range(44, segments1[0], io::FileBlock::Range(2, 4),
io::FileBlock::State::DOWNLOADED);
assert_range(45, segments1[1], io::FileBlock::Range(5, 23),
io::FileBlock::State::DOWNLOADED);
assert_range(45, segments1[2], io::FileBlock::Range(24, 26),
io::FileBlock::State::DOWNLOADED);
assert_range(46, segments1[3], io::FileBlock::Range(27, 27),
io::FileBlock::State::DOWNLOADED);
assert_range(47, segments1[4], io::FileBlock::Range(28, 29),
io::FileBlock::State::DOWNLOADED);
}
{
/// Test max file segment size
auto settings2 = settings;
settings2.max_size = 30;
settings2.max_elements = 5;
settings2.persistent_max_size = 30;
settings2.persistent_max_elements = 5;
settings2.max_file_segment_size = 10;
io::LRUFileCache cache2(caches_dir / "cache2", settings2);
auto holder1 = cache2.get_or_set(key, 0, 25, is_persistent, query_id); /// Get [0, 24]
auto segments1 = fromHolder(holder1);
ASSERT_EQ(segments1.size(), 3);
assert_range(48, segments1[0], io::FileBlock::Range(0, 9), io::FileBlock::State::EMPTY);
assert_range(49, segments1[1], io::FileBlock::Range(10, 19), io::FileBlock::State::EMPTY);
assert_range(50, segments1[2], io::FileBlock::Range(20, 24), io::FileBlock::State::EMPTY);
}
}
TEST(LRUFileCache, normal) {
if (fs::exists(cache_base_path)) {
fs::remove_all(cache_base_path);
}
fs::create_directories(cache_base_path);
test_file_cache(false);
test_file_cache(true);
}
} // namespace doris::io