[refactor](file-system)(step-1) refactor file system on BE and remove storage_backend (#17586)
See #17764 for details. I have tested: - Unit tests for the local/s3/hdfs/broker file systems: be/test/io/fs/file_system_test.cpp - Outfile to local/s3/hdfs/broker. - Load from local/s3/hdfs/broker. - Query files on local/s3/hdfs/broker file systems, with table-valued functions and catalogs. - Backup/Restore with local/s3/hdfs/broker file systems. Not tested: - cold & hot data separation case.
This commit is contained in:
@ -371,8 +371,7 @@ Status RoutineLoadTaskExecutor::_execute_plan_for_test(std::shared_ptr<StreamLoa
|
||||
int64_t len = 1;
|
||||
size_t read_bytes = 0;
|
||||
Slice result((uint8_t*)&one, len);
|
||||
IOContext io_ctx;
|
||||
Status st = pipe->read_at(0, result, io_ctx, &read_bytes);
|
||||
Status st = pipe->read_at(0, result, &read_bytes);
|
||||
if (!st.ok()) {
|
||||
LOG(WARNING) << "read failed";
|
||||
ctx->promise.set_value(st);
|
||||
|
||||
@ -26,16 +26,17 @@
|
||||
#include "gen_cpp/HeartbeatService_types.h"
|
||||
#include "gen_cpp/PaloBrokerService_types.h"
|
||||
#include "gen_cpp/TPaloBrokerService.h"
|
||||
#include "olap/file_helper.h"
|
||||
#include "io/fs/broker_file_system.h"
|
||||
#include "io/fs/hdfs_file_system.h"
|
||||
#include "io/fs/s3_file_system.h"
|
||||
#include "io/hdfs_builder.h"
|
||||
#include "olap/snapshot_manager.h"
|
||||
#include "olap/storage_engine.h"
|
||||
#include "olap/tablet.h"
|
||||
#include "runtime/broker_mgr.h"
|
||||
#include "runtime/exec_env.h"
|
||||
#include "util/broker_storage_backend.h"
|
||||
#include "util/file_utils.h"
|
||||
#include "util/hdfs_storage_backend.h"
|
||||
#include "util/s3_storage_backend.h"
|
||||
#include "util/s3_uri.h"
|
||||
#include "util/thrift_rpc_helper.h"
|
||||
|
||||
namespace doris {
|
||||
@ -46,29 +47,42 @@ SnapshotLoader::SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id)
|
||||
_task_id(task_id),
|
||||
_broker_addr(TNetworkAddress()),
|
||||
_prop(std::map<std::string, std::string>()),
|
||||
_storage_backend(nullptr) {}
|
||||
_remote_fs(nullptr) {}
|
||||
|
||||
SnapshotLoader::SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id,
|
||||
const TNetworkAddress& broker_addr,
|
||||
const std::map<std::string, std::string>& prop,
|
||||
TStorageBackendType::type type)
|
||||
: _env(env), _job_id(job_id), _task_id(task_id), _broker_addr(broker_addr), _prop(prop) {
|
||||
const std::map<std::string, std::string>& prop)
|
||||
: _env(env), _job_id(job_id), _task_id(task_id), _broker_addr(broker_addr), _prop(prop) {}
|
||||
|
||||
Status SnapshotLoader::init(TStorageBackendType::type type, const std::string& location) {
|
||||
if (TStorageBackendType::type::S3 == type) {
|
||||
_storage_backend.reset(new S3StorageBackend(_prop));
|
||||
S3Conf s3_conf;
|
||||
S3URI s3_uri(location);
|
||||
RETURN_IF_ERROR(s3_uri.parse());
|
||||
RETURN_IF_ERROR(S3ClientFactory::convert_properties_to_s3_conf(_prop, s3_uri, &s3_conf));
|
||||
std::shared_ptr<io::S3FileSystem> fs;
|
||||
RETURN_IF_ERROR(io::S3FileSystem::create(std::move(s3_conf), "", &fs));
|
||||
_remote_fs = std::move(fs);
|
||||
} else if (TStorageBackendType::type::HDFS == type) {
|
||||
_storage_backend.reset(new HDFSStorageBackend(_prop));
|
||||
THdfsParams hdfs_params = parse_properties(_prop);
|
||||
std::shared_ptr<io::HdfsFileSystem> fs;
|
||||
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
|
||||
_remote_fs = std::move(fs);
|
||||
} else if (TStorageBackendType::type::BROKER == type) {
|
||||
_storage_backend.reset(new BrokerStorageBackend(_env, _broker_addr, _prop));
|
||||
std::shared_ptr<io::BrokerFileSystem> fs;
|
||||
RETURN_IF_ERROR(io::BrokerFileSystem::create(_broker_addr, _prop, 0, &fs));
|
||||
_remote_fs = std::move(fs);
|
||||
} else {
|
||||
_storage_backend = nullptr;
|
||||
return Status::InternalError("Unknown storage tpye: {}", type);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
SnapshotLoader::~SnapshotLoader() = default;
|
||||
|
||||
Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_dest_path,
|
||||
std::map<int64_t, std::vector<std::string>>* tablet_files) {
|
||||
if (!_storage_backend) {
|
||||
if (!_remote_fs) {
|
||||
return Status::InternalError("Storage backend not initialized.");
|
||||
}
|
||||
LOG(INFO) << "begin to upload snapshot files. num: " << src_to_dest_path.size()
|
||||
@ -99,7 +113,7 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
|
||||
|
||||
// 2.1 get existing files from remote path
|
||||
std::map<std::string, FileStat> remote_files;
|
||||
RETURN_IF_ERROR(_storage_backend->list(dest_path, true, false, &remote_files));
|
||||
RETURN_IF_ERROR(_list_with_checksum(dest_path, &remote_files));
|
||||
|
||||
for (auto& tmp : remote_files) {
|
||||
VLOG_CRITICAL << "get remote file: " << tmp.first << ", checksum: " << tmp.second.md5;
|
||||
@ -151,8 +165,8 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
|
||||
// upload
|
||||
std::string full_remote_file = dest_path + "/" + local_file;
|
||||
std::string full_local_file = src_path + "/" + local_file;
|
||||
RETURN_IF_ERROR(_storage_backend->upload_with_checksum(full_local_file,
|
||||
full_remote_file, md5sum));
|
||||
RETURN_IF_ERROR(
|
||||
_remote_fs->upload_with_checksum(full_local_file, full_remote_file, md5sum));
|
||||
} // end for each tablet's local files
|
||||
|
||||
tablet_files->emplace(tablet_id, local_files_with_checksum);
|
||||
@ -172,7 +186,7 @@ Status SnapshotLoader::upload(const std::map<std::string, std::string>& src_to_d
|
||||
*/
|
||||
Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to_dest_path,
|
||||
std::vector<int64_t>* downloaded_tablet_ids) {
|
||||
if (!_storage_backend) {
|
||||
if (!_remote_fs) {
|
||||
return Status::InternalError("Storage backend not initialized.");
|
||||
}
|
||||
LOG(INFO) << "begin to download snapshot files. num: " << src_to_dest_path.size()
|
||||
@ -213,7 +227,7 @@ Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to
|
||||
|
||||
// 2.2. get remote files
|
||||
std::map<std::string, FileStat> remote_files;
|
||||
RETURN_IF_ERROR(_storage_backend->list(remote_path, true, false, &remote_files));
|
||||
RETURN_IF_ERROR(_list_with_checksum(remote_path, &remote_files));
|
||||
if (remote_files.empty()) {
|
||||
std::stringstream ss;
|
||||
ss << "get nothing from remote path: " << remote_path;
|
||||
@ -287,7 +301,7 @@ Status SnapshotLoader::download(const std::map<std::string, std::string>& src_to
|
||||
// remove file which will be downloaded now.
|
||||
// this file will be added to local_files if it be downloaded successfully.
|
||||
local_files.erase(find);
|
||||
RETURN_IF_ERROR(_storage_backend->download(full_remote_file, full_local_file));
|
||||
RETURN_IF_ERROR(_remote_fs->download(full_remote_file, full_local_file));
|
||||
|
||||
// 3. check md5 of the downloaded file
|
||||
std::string downloaded_md5sum;
|
||||
@ -628,4 +642,25 @@ Status SnapshotLoader::_report_every(int report_threshold, int* counter, int32_t
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Lists `dir` through the remote file system and records every entry whose
// file name has the form "<name>.<checksum>" into `md5_files`, keyed by <name>.
//
// Entries whose file name contains no '.', or ends with '.', are skipped.
// Returns the error from the remote list() call if it fails, otherwise OK.
Status SnapshotLoader::_list_with_checksum(const std::string& dir,
                                           std::map<std::string, FileStat>* md5_files) {
    std::vector<io::FileInfo> listed;
    // list() reports existence through this flag; it is not consulted here.
    bool dir_exists = true;
    RETURN_IF_ERROR(_remote_fs->list(dir, true, &listed, &dir_exists));

    for (auto& entry : listed) {
        // Only the last path component takes part in the name/checksum split.
        io::Path entry_path(entry.file_name);
        std::string base_name = entry_path.filename();

        // The checksum is whatever follows the last '.' in the file name.
        const size_t dot = base_name.rfind('.');
        const bool has_checksum = dot != std::string::npos && dot != base_name.size() - 1;
        if (!has_checksum) {
            // No checksum separator (or nothing after it): ignore this file.
            continue;
        }

        std::string stem = base_name.substr(0, dot);
        FileStat stat = {stem, base_name.substr(dot + 1), entry.file_size};
        // emplace() keeps an already-present entry for the same name.
        md5_files->emplace(std::move(stem), stat);
    }

    return Status::OK();
}
|
||||
|
||||
} // end namespace doris
|
||||
|
||||
@ -25,14 +25,19 @@
|
||||
|
||||
#include "common/status.h"
|
||||
#include "gen_cpp/Types_types.h"
|
||||
#include "io/fs/remote_file_system.h"
|
||||
#include "olap/tablet.h"
|
||||
#include "runtime/client_cache.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
// Describes one checksum-suffixed file found on remote storage.
// SnapshotLoader::_list_with_checksum fills it from a file name of the form
// "<name>.<md5>" plus the size reported by the file system listing.
struct FileStat {
    // File name with the trailing ".<md5>" suffix removed.
    std::string name;
    // Text following the last '.' of the remote file name.
    std::string md5;
    // File size as reported by the listing; zero until explicitly set, so a
    // default-constructed FileStat never carries an indeterminate value.
    size_t size = 0;
};
|
||||
class ExecEnv;
|
||||
class StorageBackend;
|
||||
struct FileStat;
|
||||
|
||||
/*
|
||||
* Upload:
|
||||
@ -59,11 +64,12 @@ public:
|
||||
SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id);
|
||||
SnapshotLoader(ExecEnv* env, int64_t job_id, int64_t task_id,
|
||||
const TNetworkAddress& broker_addr,
|
||||
const std::map<std::string, std::string>& broker_prop,
|
||||
TStorageBackendType::type type);
|
||||
const std::map<std::string, std::string>& broker_prop);
|
||||
|
||||
~SnapshotLoader();
|
||||
|
||||
Status init(TStorageBackendType::type type, const std::string& location);
|
||||
|
||||
Status upload(const std::map<std::string, std::string>& src_to_dest_path,
|
||||
std::map<int64_t, std::vector<std::string>>* tablet_files);
|
||||
|
||||
@ -92,13 +98,15 @@ private:
|
||||
Status _report_every(int report_threshold, int* counter, int finished_num, int total_num,
|
||||
TTaskType::type type);
|
||||
|
||||
Status _list_with_checksum(const std::string& dir, std::map<std::string, FileStat>* md5_files);
|
||||
|
||||
private:
|
||||
ExecEnv* _env;
|
||||
int64_t _job_id;
|
||||
int64_t _task_id;
|
||||
const TNetworkAddress _broker_addr;
|
||||
const std::map<std::string, std::string> _prop;
|
||||
std::unique_ptr<StorageBackend> _storage_backend;
|
||||
std::shared_ptr<io::RemoteFileSystem> _remote_fs;
|
||||
};
|
||||
|
||||
} // end namespace doris
|
||||
|
||||
Reference in New Issue
Block a user