Files
doris/be/src/util/hdfs_storage_backend.cpp
2022-06-19 10:26:20 +08:00

309 lines
11 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/hdfs_storage_backend.h"
#include "io/hdfs_file_reader.h"
#include "io/hdfs_reader_writer.h"
#include "io/hdfs_writer.h"
#include "olap/file_helper.h"
#include "util/hdfs_util.h"
namespace doris {
// Makes the enclosing function return Status::InternalError when `client`
// evaluates to false (e.g. a null hdfs handle).
// The body is wrapped in do { ... } while (false) so that the macro expands to a
// single statement: `CHECK_HDFS_CLIENT(x);` then behaves correctly inside an
// unbraced if/else. The argument is parenthesized so that arbitrary expressions
// can be passed safely.
#ifndef CHECK_HDFS_CLIENT
#define CHECK_HDFS_CLIENT(client)                                    \
    do {                                                             \
        if (!(client)) {                                             \
            return Status::InternalError("init hdfs client error."); \
        }                                                            \
    } while (false)
#endif
// URI scheme prefix that parse_path() looks for when normalizing remote paths.
static const std::string hdfs_file_prefix = "hdfs://";
// Builds a backend from the given connection properties.
// Copies `prop` into _properties, derives _builder from it via createHDFSBuilder()
// and asks HDFSHandle to create the filesystem handle stored in _hdfs_fs.
// NOTE(review): _builder is initialized from _properties, which relies on
// _properties being declared before _builder in the class — confirm in the header.
HDFSStorageBackend::HDFSStorageBackend(const std::map<std::string, std::string>& prop)
: _properties(prop), _builder(createHDFSBuilder(_properties)) {
_hdfs_fs = HDFSHandle::instance().create_hdfs_fs(_builder);
// A failed connection is only flagged through DCHECK here; the operations that use
// _hdfs_fs directly re-check the handle with CHECK_HDFS_CLIENT.
DCHECK(_hdfs_fs) << "init hdfs client error.";
}
// Releases the hdfs connection by delegating to close(); the returned Status is ignored.
HDFSStorageBackend::~HDFSStorageBackend() {
close();
}
// Disconnects from HDFS if a handle is currently held.
// Safe to call repeatedly: the handle is reset to nullptr after the first
// disconnect, so later calls are no-ops. Always returns Status::OK().
Status HDFSStorageBackend::close() {
    if (_hdfs_fs == nullptr) {
        // Nothing to release.
        return Status::OK();
    }
    hdfsDisconnect(_hdfs_fs);
    _hdfs_fs = nullptr;
    return Status::OK();
}
// If the path has the form hdfs://ip:port/path, strip the scheme and authority and
// return only /path; a path like hdfs://ip:port/path can't be used by libhdfs3.
//
// Any other input is returned unchanged. This includes:
//   - paths that merely contain "hdfs://" somewhere other than at the start, and
//   - URIs such as "hdfs://ip:port" that carry no path component (previously this
//     case threw std::out_of_range from substr(npos)).
std::string HDFSStorageBackend::parse_path(const std::string& path) {
    const std::size_t prefix_len = hdfs_file_prefix.size();
    // Only treat the input as a URI when the scheme is an actual prefix.
    if (path.compare(0, prefix_len, hdfs_file_prefix) != 0) {
        return path;
    }
    // The first '/' after the scheme terminates the authority ("ip:port").
    const std::size_t path_start = path.find('/', prefix_len);
    if (path_start == std::string::npos) {
        // No path component to extract; leave the input untouched.
        return path;
    }
    return path.substr(path_start);
}
// Uploads the local file `local` to the HDFS path `remote`.
//
// The file is streamed in chunks of at most 1 MiB. The chunk buffer lives on the
// heap rather than the stack so that the call is safe on threads with small stacks.
//
// Returns InternalError when the local file cannot be opened, its length cannot be
// obtained, a read fails, or the writer reports fewer bytes written than requested.
// Errors returned by HDFSWriter::open()/write() are propagated unchanged.
Status HDFSStorageBackend::upload(const std::string& local, const std::string& remote) {
    FileHandler file_handler;
    Status ost = file_handler.open(local, O_RDONLY);
    if (!ost.ok()) {
        return Status::InternalError("failed to open file: " + local);
    }

    size_t file_len = file_handler.length();
    // A length of -1 is treated as failure; make the unsigned comparison explicit.
    if (file_len == static_cast<size_t>(-1)) {
        return Status::InternalError("failed to get length of file: " + local);
    }

    std::unique_ptr<HDFSWriter> hdfs_writer(new HDFSWriter(_properties, remote));
    RETURN_IF_ERROR(hdfs_writer->open());

    constexpr size_t buf_sz = 1024 * 1024;
    std::unique_ptr<char[]> read_buf = std::make_unique<char[]>(buf_sz);
    size_t left_len = file_len;
    size_t read_offset = 0;
    while (left_len > 0) {
        const size_t read_len = left_len > buf_sz ? buf_sz : left_len;
        ost = file_handler.pread(read_buf.get(), read_len, read_offset);
        if (!ost.ok()) {
            return Status::InternalError("failed to read file: " + local);
        }

        size_t write_len = 0;
        RETURN_IF_ERROR(hdfs_writer->write(reinterpret_cast<const uint8_t*>(read_buf.get()),
                                           read_len, &write_len));
        // Checked at runtime (not only via DCHECK) so that a short write cannot
        // silently truncate the remote file in release builds.
        if (write_len != read_len) {
            return Status::InternalError("short write to remote file: " + remote);
        }

        read_offset += read_len;
        left_len -= read_len;
    }

    LOG(INFO) << "finished to write file: " << local << ", length: " << file_len;
    return Status::OK();
}
// Writes the in-memory buffer `content` to the HDFS path `remote` in a single
// write() call.
//
// Returns InternalError when the writer reports fewer bytes written than
// content.size(); errors from HDFSWriter::open()/write() are propagated unchanged.
Status HDFSStorageBackend::direct_upload(const std::string& remote, const std::string& content) {
    std::unique_ptr<HDFSWriter> hdfs_writer(new HDFSWriter(_properties, remote));
    RETURN_IF_ERROR(hdfs_writer->open());

    size_t write_len = 0;
    RETURN_IF_ERROR(hdfs_writer->write(reinterpret_cast<const uint8_t*>(content.data()),
                                       content.size(), &write_len));
    // Checked at runtime (not only via DCHECK) so that a short write cannot
    // silently truncate the remote file in release builds.
    if (write_len != content.size()) {
        return Status::InternalError("short write to remote file: " + remote);
    }
    return Status::OK();
}
// Lists the regular files directly under `remote_path` and records them in `files`.
//
// Remote file names are expected to look like "<name>.<checksum>": the part before
// the last '.' becomes both the map key and FileStat's first field, the part after
// it becomes the second field, and the hdfs-reported size the third. Directories
// and names without a non-empty suffix after the last '.' are skipped.
//
// `contain_md5` and `recursion` are not consulted by this implementation.
//
// Returns OK with `files` untouched when the path does not exist, and InternalError
// when the client is not initialized or the directory listing fails.
Status HDFSStorageBackend::list(const std::string& remote_path, bool contain_md5, bool recursion,
                                std::map<std::string, FileStat>* files) {
    CHECK_HDFS_CLIENT(_hdfs_fs);
    std::string normal_str = parse_path(remote_path);
    int exists = hdfsExists(_hdfs_fs, normal_str.c_str());
    if (exists != 0) {
        LOG(INFO) << "path does not exist: " << normal_str << ", err: " << strerror(errno);
        return Status::OK();
    }

    int file_num = 0;
    hdfsFileInfo* files_info = hdfsListDirectory(_hdfs_fs, normal_str.c_str(), &file_num);
    if (files_info == nullptr) {
        std::stringstream ss;
        ss << "failed to list files from remote path: " << normal_str
           << ", err: " << strerror(errno);
        LOG(WARNING) << ss.str();
        return Status::InternalError(ss.str());
    }
    // Hand the listing to a scope guard so it is released on every exit path,
    // including an exception thrown while building strings or inserting into the map.
    auto free_infos = [file_num](hdfsFileInfo* infos) { hdfsFreeFileInfo(infos, file_num); };
    std::unique_ptr<hdfsFileInfo, decltype(free_infos)> infos_guard(files_info, free_infos);

    LOG(INFO) << "finished to list files from remote path:" << normal_str
              << ", file num:" << file_num;
    for (int i = 0; i < file_num; ++i) {
        // Bind by reference: no need to copy the whole struct for each entry.
        const hdfsFileInfo& file_info = files_info[i];
        if (file_info.mKind == kObjectKindDirectory) {
            continue;
        }

        // get filename
        const std::string file_name_with_path(file_info.mName);
        std::filesystem::path file_path(file_name_with_path);
        std::string file_name = file_path.filename();

        size_t pos = file_name.find_last_of(".");
        if (pos == std::string::npos || pos == file_name.size() - 1) {
            // Not found checksum separator, ignore this file
            continue;
        }

        const std::string name(file_name, 0, pos);
        const std::string checksum(file_name, pos + 1);
        FileStat stat = {name, checksum, file_info.mSize};
        files->emplace(name, stat);
        VLOG(2) << "split remote file: " << name << ", checksum: " << checksum;
    }

    LOG(INFO) << "finished to split files. valid file num: " << files->size();
    return Status::OK();
}
// Downloads the HDFS file `remote` into the local file `local`.
//
// Any existing local file is removed first, then the target is created with mode
// 0600 (S_IRUSR | S_IWUSR) and filled in chunks of at most 1 MiB. The chunk buffer
// lives on the heap rather than the stack so that the call is safe on threads with
// small stacks.
//
// Returns InternalError when the local file cannot be opened or written; errors
// from HdfsFileReader::open()/read() are propagated unchanged.
Status HDFSStorageBackend::download(const std::string& remote, const std::string& local) {
    // 1. open remote file for read
    std::unique_ptr<HdfsFileReader> hdfs_reader(new HdfsFileReader(_properties, remote, 0));
    RETURN_IF_ERROR(hdfs_reader->open());

    // 2. remove the existing local file if exist
    // Use the non-throwing overload: a removal failure must not escape as an
    // exception from a Status-returning function. The open below reports the
    // real error if the stale file actually gets in the way.
    std::error_code ec;
    if (std::filesystem::remove(local, ec)) {
        LOG(INFO) << "remove the previously exist local file: " << local;
    } else if (ec) {
        LOG(WARNING) << "failed to remove local file: " << local << ", err: " << ec.message();
    }

    // 3. open local file for write
    FileHandler file_handler;
    Status ost =
            file_handler.open_with_mode(local, O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR);
    if (!ost.ok()) {
        return Status::InternalError("failed to open file: " + local);
    }

    // 4. read remote and write to local
    LOG(INFO) << "read remote file: " << remote << " to local: " << local;
    constexpr size_t buf_sz = 1024 * 1024;
    std::unique_ptr<char[]> read_buf = std::make_unique<char[]>(buf_sz);
    size_t write_offset = 0;
    bool eof = false;
    while (!eof) {
        int64_t read_len = 0;
        RETURN_IF_ERROR(hdfs_reader->read(reinterpret_cast<uint8_t*>(read_buf.get()), buf_sz,
                                          &read_len, &eof));
        // Flush whatever was read before honoring eof, so that bytes delivered
        // together with the end-of-file flag are not dropped.
        if (read_len > 0) {
            ost = file_handler.pwrite(read_buf.get(), read_len, write_offset);
            if (!ost.ok()) {
                return Status::InternalError("failed to write file: " + local);
            }
            write_offset += read_len;
        }
    }
    return Status::OK();
}
// Reads the whole HDFS file `remote` into `*content`.
//
// Data is inserted into `*content` starting at offset 0 and advancing by the number
// of bytes read, so any text already present in `*content` ends up after the
// downloaded bytes. The file is read in chunks of at most 1 MiB through a heap
// buffer (rather than a stack array) so that the call is safe on threads with
// small stacks.
//
// Errors from HdfsFileReader::open()/read() are propagated unchanged.
Status HDFSStorageBackend::direct_download(const std::string& remote, std::string* content) {
    std::unique_ptr<HdfsFileReader> hdfs_reader(new HdfsFileReader(_properties, remote, 0));
    RETURN_IF_ERROR(hdfs_reader->open());

    constexpr size_t buf_sz = 1024 * 1024;
    std::unique_ptr<char[]> read_buf = std::make_unique<char[]>(buf_sz);
    size_t write_offset = 0;
    bool eof = false;
    while (!eof) {
        int64_t read_len = 0;
        RETURN_IF_ERROR(hdfs_reader->read(reinterpret_cast<uint8_t*>(read_buf.get()), buf_sz,
                                          &read_len, &eof));
        // Consume whatever was read before honoring eof, so that bytes delivered
        // together with the end-of-file flag are not dropped.
        if (read_len > 0) {
            content->insert(write_offset, read_buf.get(), read_len);
            write_offset += read_len;
        }
    }
    return Status::OK();
}
// Uploads `local` to "<remote>.part" and, once the upload succeeds, renames it to
// "<remote>.<checksum>". An upload error is returned as-is and no rename is
// attempted; otherwise the result of rename() is returned.
Status HDFSStorageBackend::upload_with_checksum(const std::string& local, const std::string& remote,
                                                const std::string& checksum) {
    const std::string staging_path = remote + ".part";
    const std::string published_path = remote + "." + checksum;

    // Stage under the temporary name first, then publish under the checksummed name.
    RETURN_IF_ERROR(upload(local, staging_path));
    return rename(staging_path, published_path);
}
// Renames `orig_name` to `new_name` on HDFS.
// Both arguments are normalized with parse_path() before being handed to
// hdfsRename(). Returns InternalError when the client is not initialized or when
// hdfsRename() reports a non-zero result.
Status HDFSStorageBackend::rename(const std::string& orig_name, const std::string& new_name) {
    CHECK_HDFS_CLIENT(_hdfs_fs);

    const std::string src = parse_path(orig_name);
    const std::string dst = parse_path(new_name);

    if (hdfsRename(_hdfs_fs, src.c_str(), dst.c_str()) != 0) {
        // Failure path: log and report the same message.
        const std::string msg = "Fail to rename file: " + src + " to: " + dst;
        LOG(WARNING) << msg;
        return Status::InternalError(msg);
    }

    LOG(INFO) << "finished to rename file. orig: " << src << ", new: " << dst;
    return Status::OK();
}
// Renames a directory; directories take the same path as files, so this simply
// forwards to rename().
Status HDFSStorageBackend::rename_dir(const std::string& orig_name, const std::string& new_name) {
return rename(orig_name, new_name);
}
// Copy is not implemented for this backend: always returns NotSupported and
// ignores both arguments.
Status HDFSStorageBackend::copy(const std::string& src, const std::string& dst) {
return Status::NotSupported("copy not implemented!");
}
// Forwards to copy(), which currently always returns NotSupported.
Status HDFSStorageBackend::copy_dir(const std::string& src, const std::string& dst) {
return copy(src, dst);
}
// Directory creation is not implemented for this backend: always returns
// NotSupported and ignores `path`.
Status HDFSStorageBackend::mkdir(const std::string& path) {
return Status::NotSupported("mkdir not implemented!");
}
// Recursive directory creation is not implemented for this backend: always returns
// NotSupported and ignores `path`.
Status HDFSStorageBackend::mkdirs(const std::string& path) {
return Status::NotSupported("mkdirs not implemented!");
}
// Checks whether `path` exists on HDFS.
//
// The path is normalized with parse_path() first, like list() and rename() do, so
// that inputs of the form hdfs://ip:port/path are reduced to /path before being
// passed to the hdfs client.
//
// Returns OK when hdfsExists() reports 0, NotFound otherwise, and InternalError
// when the client is not initialized.
Status HDFSStorageBackend::exist(const std::string& path) {
    CHECK_HDFS_CLIENT(_hdfs_fs);
    const std::string normal_path = parse_path(path);
    if (hdfsExists(_hdfs_fs, normal_path.c_str()) != 0) {
        return Status::NotFound(path + " not exists!");
    }
    return Status::OK();
}
// Directory existence check; forwards to exist(), which does not distinguish
// between files and directories.
Status HDFSStorageBackend::exist_dir(const std::string& path) {
return exist(path);
}
// Deletes `remote` from HDFS.
//
// The path is normalized with parse_path() first, like list() and rename() do, so
// that inputs of the form hdfs://ip:port/path are reduced to /path before being
// passed to the hdfs client. hdfsDelete() is called with 1 as its last argument
// (the recursive flag in the libhdfs API), so directories are removed together
// with their contents.
//
// Returns InternalError when the client is not initialized or the delete fails.
Status HDFSStorageBackend::rm(const std::string& remote) {
    CHECK_HDFS_CLIENT(_hdfs_fs);
    const std::string normal_path = parse_path(remote);
    if (hdfsDelete(_hdfs_fs, normal_path.c_str(), 1) != 0) {
        std::stringstream ss;
        ss << "failed to delete from remote path: " << remote;
        LOG(WARNING) << ss.str();
        return Status::InternalError(ss.str());
    }
    return Status::OK();
}
// Removes a directory by forwarding to rm(), which passes 1 as the last argument
// of hdfsDelete() (the recursive flag in the libhdfs API).
Status HDFSStorageBackend::rmdir(const std::string& remote) {
return rm(remote);
}
} // end namespace doris