[refactor](file-system)(step-1) refactor file system on BE and remove storage_backend (#17586)

See #17764 for details
I have tested:
- Unit test for local/s3/hdfs/broker file system: be/test/io/fs/file_system_test.cpp
- Outfile to local/s3/hdfs/broker.
- Load from local/s3/hdfs/broker.
- Query file on local/s3/hdfs/broker file system, with table value function and catalog.
- Backup/Restore with local/s3/hdfs/broker file system

Not tested:
- cold & hot data separation case.
This commit is contained in:
Mingyu Chen
2023-03-21 21:08:38 +08:00
committed by GitHub
parent 82716ec99d
commit cb79e42e5c
171 changed files with 3523 additions and 5288 deletions

View File

@ -371,8 +371,7 @@ Status RoutineLoadTaskExecutor::_execute_plan_for_test(std::shared_ptr<StreamLoa
int64_t len = 1;
size_t read_bytes = 0;
Slice result((uint8_t*)&one, len);
IOContext io_ctx;
Status st = pipe->read_at(0, result, io_ctx, &read_bytes);
Status st = pipe->read_at(0, result, &read_bytes);
if (!st.ok()) {
LOG(WARNING) << "read failed";
ctx->promise.set_value(st);

View File

@ -26,16 +26,17 @@
#include "gen_cpp/HeartbeatService_types.h"
#include "gen_cpp/PaloBrokerService_types.h"
#include "gen_cpp/TPaloBrokerService.h"
#include "olap/file_helper.h"
#include "io/fs/broker_file_system.h"
#include "io/fs/hdfs_file_system.h"
#include "io/fs/s3_file_system.h"
#include "io/hdfs_builder.h"
#include "olap/snapshot_manager.h"
#include "olap/storage_engine.h"
#include "olap/tablet.h"
#include "runtime/broker_mgr.h"
#include "runtime/exec_env.h"
#include "util/broker_storage_backend.h"
#include "util/file_utils.h"
#include "util/hdfs_storage_backend.h"
#include "util/s3_storage_backend.h"
#include "util/s3_uri.h"
#include "util/thrift_rpc_helper.h"
namespace doris {
@ -46,29 +47,42 @@ SnapshotLoader::SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id)
_task_id(task_id),
_broker_addr(TNetworkAddress()),
_prop(std::map<std::string, std::string>()),
_storage_backend(nullptr) {}
_remote_fs(nullptr) {}
SnapshotLoader::SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id,
const TNetworkAddress& broker_addr,
const std::map<std::string, std::string>& prop,
TStorageBackendType::type type)
: _env(env), _job_id(job_id), _task_id(task_id), _broker_addr(broker_addr), _prop(prop) {
const std::map<std::string, std::string>& prop)
: _env(env), _job_id(job_id), _task_id(task_id), _broker_addr(broker_addr), _prop(prop) {}
// Creates the remote file system that upload()/download() operate on and
// stores it in `_remote_fs`.
//
// `type` selects the implementation:
//   - S3:     `location` is parsed as an S3 URI and, together with `_prop`,
//             converted into an S3Conf used to create an io::S3FileSystem.
//   - HDFS:   `_prop` is parsed into THdfsParams used to create an
//             io::HdfsFileSystem.
//   - BROKER: an io::BrokerFileSystem is created from `_broker_addr` and `_prop`.
// Any other type yields Status::InternalError. Errors from URI parsing,
// property conversion or file system creation are returned unchanged.
//
// NOTE(review): the `_storage_backend` statements below look like the
// pre-refactor lines that this change replaces with `_remote_fs` — confirm
// against the applied file.
Status SnapshotLoader::init(TStorageBackendType::type type, const std::string& location) {
if (TStorageBackendType::type::S3 == type) {
_storage_backend.reset(new S3StorageBackend(_prop));
S3Conf s3_conf;
S3URI s3_uri(location);
RETURN_IF_ERROR(s3_uri.parse());
RETURN_IF_ERROR(S3ClientFactory::convert_properties_to_s3_conf(_prop, s3_uri, &s3_conf));
std::shared_ptr<io::S3FileSystem> fs;
RETURN_IF_ERROR(io::S3FileSystem::create(std::move(s3_conf), "", &fs));
_remote_fs = std::move(fs);
} else if (TStorageBackendType::type::HDFS == type) {
_storage_backend.reset(new HDFSStorageBackend(_prop));
THdfsParams hdfs_params = parse_properties(_prop);
std::shared_ptr<io::HdfsFileSystem> fs;
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
_remote_fs = std::move(fs);
} else if (TStorageBackendType::type::BROKER == type) {
_storage_backend.reset(new BrokerStorageBackend(_env, _broker_addr, _prop));
std::shared_ptr<io::BrokerFileSystem> fs;
RETURN_IF_ERROR(io::BrokerFileSystem::create(_broker_addr, _prop, 0, &fs));
_remote_fs = std::move(fs);
} else {
_storage_backend = nullptr;
// NOTE(review): "tpye" in the message below is a typo for "type".
return Status::InternalError("Unknown storage tpye: {}", type);
}
return Status::OK();
}
SnapshotLoader::~SnapshotLoader() = default;
Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_dest_path,
std::map<int64_t, std::vector<std::string>>* tablet_files) {
if (!_storage_backend) {
if (!_remote_fs) {
return Status::InternalError("Storage backend not initialized.");
}
LOG(INFO) << "begin to upload snapshot files. num: " << src_to_dest_path.size()
@ -99,7 +113,7 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
// 2.1 get existing files from remote path
std::map<std::string, FileStat> remote_files;
RETURN_IF_ERROR(_storage_backend->list(dest_path, true, false, &remote_files));
RETURN_IF_ERROR(_list_with_checksum(dest_path, &remote_files));
for (auto& tmp : remote_files) {
VLOG_CRITICAL << "get remote file: " << tmp.first << ", checksum: " << tmp.second.md5;
@ -151,8 +165,8 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
// upload
std::string full_remote_file = dest_path + "/" + local_file;
std::string full_local_file = src_path + "/" + local_file;
RETURN_IF_ERROR(_storage_backend->upload_with_checksum(full_local_file,
full_remote_file, md5sum));
RETURN_IF_ERROR(
_remote_fs->upload_with_checksum(full_local_file, full_remote_file, md5sum));
} // end for each tablet's local files
tablet_files->emplace(tablet_id, local_files_with_checksum);
@ -172,7 +186,7 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
*/
Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to_dest_path,
std::vector<int64_t>* downloaded_tablet_ids) {
if (!_storage_backend) {
if (!_remote_fs) {
return Status::InternalError("Storage backend not initialized.");
}
LOG(INFO) << "begin to download snapshot files. num: " << src_to_dest_path.size()
@ -213,7 +227,7 @@ Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to
// 2.2. get remote files
std::map<std::string, FileStat> remote_files;
RETURN_IF_ERROR(_storage_backend->list(remote_path, true, false, &remote_files));
RETURN_IF_ERROR(_list_with_checksum(remote_path, &remote_files));
if (remote_files.empty()) {
std::stringstream ss;
ss << "get nothing from remote path: " << remote_path;
@ -287,7 +301,7 @@ Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to
// remove file which will be downloaded now.
// this file will be added to local_files if it be downloaded successfully.
local_files.erase(find);
RETURN_IF_ERROR(_storage_backend->download(full_remote_file, full_local_file));
RETURN_IF_ERROR(_remote_fs->download(full_remote_file, full_local_file));
// 3. check md5 of the downloaded file
std::string downloaded_md5sum;
@ -628,4 +642,25 @@ Status SnapshotLoader::_report_every(int report_threshold, int* counter, int32_t
return Status::OK();
}
// Lists `dir` on the remote file system and indexes the entries whose names
// have the form "<name>.<checksum>".
//
// For every such entry a FileStat {name, checksum, file size} is stored in
// `md5_files` under the key <name>. Entries whose file name has no '.' or has
// nothing after the last '.' are skipped. emplace() is used, so a key that is
// already present in `md5_files` is left untouched.
//
// Returns the error of the remote list call if it fails, Status::OK() otherwise.
Status SnapshotLoader::_list_with_checksum(const std::string& dir,
                                           std::map<std::string, FileStat>* md5_files) {
    bool exists = true;
    std::vector<io::FileInfo> listed;
    // NOTE(review): the `true` flag presumably restricts the listing to files — confirm.
    RETURN_IF_ERROR(_remote_fs->list(dir, true, &listed, &exists));

    for (auto& info : listed) {
        // Only the last path component carries the "<name>.<checksum>" pattern.
        const std::string base_name = io::Path(info.file_name).filename();
        const size_t dot = base_name.find_last_of(".");
        const bool has_checksum = dot != std::string::npos && dot + 1 != base_name.size();
        if (has_checksum) {
            const std::string key = base_name.substr(0, dot);
            FileStat entry = {key, base_name.substr(dot + 1), info.file_size};
            md5_files->emplace(key, entry);
        }
    }
    return Status::OK();
}
} // end namespace doris

View File

@ -25,14 +25,19 @@
#include "common/status.h"
#include "gen_cpp/Types_types.h"
#include "io/fs/remote_file_system.h"
#include "olap/tablet.h"
#include "runtime/client_cache.h"
namespace doris {
// Describes one snapshot file together with its checksum.
//
// SnapshotLoader::_list_with_checksum fills this from a remote entry named
// "<name>.<md5>": the part before the last '.' becomes `name`, the part after
// it becomes `md5`.
struct FileStat {
    // File name with the checksum suffix stripped.
    std::string name;
    // Checksum taken from the file name suffix (logged as "checksum" by the loader).
    std::string md5;
    // File size as reported by the file system (presumably bytes — confirm).
    // Zero-initialized so a default-constructed FileStat never carries an
    // indeterminate value; aggregate initialization still works.
    size_t size = 0;
};
class ExecEnv;
class StorageBackend;
struct FileStat;
/*
* Upload:
@ -59,11 +64,12 @@ public:
SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id);
SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id,
const TNetworkAddress& broker_addr,
const std::map<std::string, std::string>& broker_prop,
TStorageBackendType::type type);
const std::map<std::string, std::string>& broker_prop);
~SnapshotLoader();
Status init(TStorageBackendType::type type, const std::string& location);
Status upload(const std::map<std::string, std::string>& src_to_dest_path,
std::map<int64_t, std::vector<std::string>>* tablet_files);
@ -92,13 +98,15 @@ private:
Status _report_every(int report_threshold, int* counter, int finished_num, int total_num,
TTaskType::type type);
Status _list_with_checksum(const std::string& dir, std::map<std::string, FileStat>* md5_files);
private:
ExecEnv* _env;
int64_t _job_id;
int64_t _task_id;
const TNetworkAddress _broker_addr;
const std::map<std::string, std::string> _prop;
std::unique_ptr<StorageBackend> _storage_backend;
std::shared_ptr<io::RemoteFileSystem> _remote_fs;
};
} // end namespace doris