Files
doris/be/src/io/fs/hdfs_file_reader.cpp
Mingyu Chen (Rayner) 5d3f0a267a [opt](scan) unify the local and remote scan bytes stats for all scanners for 2.1 (#45167)
pick part of #40493

TODO: not working with s3 reader
2024-12-10 14:19:19 +08:00

230 lines
9.2 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "io/fs/hdfs_file_reader.h"
#include <stdint.h>
#include <algorithm>
#include <filesystem>
#include <ostream>
#include <utility>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/logging.h"
#include "io/fs/err_utils.h"
// #include "io/fs/hdfs_file_system.h"
#include "runtime/thread_context.h"
#include "runtime/workload_management/io_throttle.h"
#include "service/backend_options.h"
#include "util/doris_metrics.h"
#include "util/hdfs_util.h"
namespace doris {
namespace io {
// Constructs a reader on top of the file handle provided by `accessor`.
//
// Besides storing its arguments, the constructor bumps the "open reading" and "reader total"
// HDFS metrics and, when a profile is supplied and `name_node` is recognised by is_hdfs(),
// registers the "HdfsIO" counter group on that profile (only in USE_HADOOP_HDFS builds).
//
// @param path      path of the file being read
// @param name_node name node string; used for the is_hdfs() check and in error/log messages
// @param accessor  accessor from which the underlying file handle is obtained
// @param profile   optional runtime profile; may be nullptr, in which case no counters are added
HdfsFileReader::HdfsFileReader(Path path, const std::string& name_node,
                               FileHandleCache::Accessor accessor, RuntimeProfile* profile)
        : _path(std::move(path)),
          _name_node(name_node),
          _accessor(std::move(accessor)),
          _profile(profile) {
    _handle = _accessor.get();

    auto* metrics = DorisMetrics::instance();
    metrics->hdfs_file_open_reading->increment(1);
    metrics->hdfs_file_reader_total->increment(1);

    // Profile counters are only registered for HDFS name nodes with a profile attached.
    if (_profile == nullptr || !is_hdfs(_name_node)) {
        return;
    }
#ifdef USE_HADOOP_HDFS
    // Every counter below is registered as a child of this timer entry.
    const char* io_section = "HdfsIO";
    ADD_TIMER(_profile, io_section);

    // Byte counters.
    _hdfs_profile.total_bytes_read =
            ADD_CHILD_COUNTER(_profile, "TotalBytesRead", TUnit::BYTES, io_section);
    _hdfs_profile.total_local_bytes_read =
            ADD_CHILD_COUNTER(_profile, "TotalLocalBytesRead", TUnit::BYTES, io_section);
    _hdfs_profile.total_short_circuit_bytes_read =
            ADD_CHILD_COUNTER(_profile, "TotalShortCircuitBytesRead", TUnit::BYTES, io_section);
    _hdfs_profile.total_total_zero_copy_bytes_read =
            ADD_CHILD_COUNTER(_profile, "TotalZeroCopyBytesRead", TUnit::BYTES, io_section);

    // Hedged-read counters.
    _hdfs_profile.total_hedged_read =
            ADD_CHILD_COUNTER(_profile, "TotalHedgedRead", TUnit::UNIT, io_section);
    _hdfs_profile.hedged_read_in_cur_thread =
            ADD_CHILD_COUNTER(_profile, "HedgedReadInCurThread", TUnit::UNIT, io_section);
    _hdfs_profile.hedged_read_wins =
            ADD_CHILD_COUNTER(_profile, "HedgedReadWins", TUnit::UNIT, io_section);
#endif
}
// Closes the reader on destruction. The status returned by close() is intentionally dropped
// because a destructor has no way to report it.
HdfsFileReader::~HdfsFileReader() {
    [[maybe_unused]] const Status ignored = close();
}
// Marks the reader as closed.
//
// The `_closed` flag is flipped with a compare-and-swap, so the "open reading" metric is
// decremented only by the call that actually performs the false -> true transition; any later
// call is a no-op. Always returns Status::OK().
Status HdfsFileReader::close() {
    bool still_open = false; // value `_closed` must currently hold for the swap to succeed
    const bool closed_by_this_call =
            _closed.compare_exchange_strong(still_open, true, std::memory_order_acq_rel);
    if (closed_by_this_call) {
        DorisMetrics::instance()->hdfs_file_open_reading->increment(-1);
    }
    return Status::OK();
}
#ifdef USE_HADOOP_HDFS
// Positional read (USE_HADOOP_HDFS build): reads up to `result.size` bytes starting at `offset`
// using hdfsPread().
//
// The request is clamped to the number of bytes between `offset` and the handle's file size,
// and hdfsPread() is called in a loop until that many bytes have been read or it returns 0.
//
// @param offset     absolute file position to start reading from
// @param result     destination buffer; `result.size` is the maximum number of bytes wanted
// @param bytes_read out: number of bytes actually written into `result.data`
// @param io_ctx     optional; when it carries `file_cache_stats`, the number of bytes read is
//                   added to its `bytes_read_from_remote`
// @return OK on success (including short and empty reads); IOError when `offset` is beyond the
//         file size; NotFound when the HDFS error text contains "No such file or directory";
//         InternalError for any other read failure
Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read,
                                    const IOContext* io_ctx) {
    DCHECK(!closed());
    if (offset > _handle->file_size()) {
        return Status::IOError("offset exceeds file size(offset: {}, file size: {}, path: {})",
                               offset, _handle->file_size(), _path.native());
    }

    char* to = result.data;
    // Never ask for more than what is left in the file after `offset`.
    const size_t bytes_req =
            std::min(result.size, static_cast<size_t>(_handle->file_size() - offset));
    *bytes_read = 0;
    if (UNLIKELY(bytes_req == 0)) {
        return Status::OK();
    }

    // NOTE(review): macro from the IO-throttle header; it is handed the out-pointer, so it
    // presumably accounts the final *bytes_read against the remote-scan limit -- confirm.
    LIMIT_REMOTE_SCAN_IO(bytes_read);

    size_t has_read = 0;
    while (has_read < bytes_req) {
        // libhdfs declares the read length as tSize (a 32-bit signed integer). Clamp every
        // request so a remaining size above INT32_MAX is not narrowed into a negative or
        // zero length; the loop simply issues another call for the rest.
        const size_t chunk = std::min<size_t>(bytes_req - has_read, INT32_MAX);
        tSize loop_read = hdfsPread(_handle->fs(), _handle->file(), offset + has_read,
                                    to + has_read, static_cast<tSize>(chunk));
        if (loop_read < 0) {
            // invoker maybe just skip Status.NotFound and continue
            // so we need distinguish between it and other kinds of errors
            std::string _err_msg = hdfs_error();
            if (_err_msg.find("No such file or directory") != std::string::npos) {
                return Status::NotFound(_err_msg);
            }
            return Status::InternalError(
                    "Read hdfs file failed. (BE: {}) namenode:{}, path:{}, err: {}",
                    BackendOptions::get_localhost(), _name_node, _path.string(), _err_msg);
        }
        if (loop_read == 0) {
            // Nothing more to read: stop with a short read.
            break;
        }
        has_read += loop_read;
    }

    *bytes_read = has_read;
    if (io_ctx && io_ctx->file_cache_stats) {
        // Count what was actually transferred, not what was requested, so a short read
        // does not inflate the remote-bytes statistic.
        io_ctx->file_cache_stats->bytes_read_from_remote += has_read;
    }
    return Status::OK();
}
#else
// The hedged read only supports hdfsPread().
// TODO: rethink here to see if there is any difference between hdfsPread() and hdfsRead()
//
// Positional read (non-USE_HADOOP_HDFS build): seeks to `offset` with hdfsSeek() and then reads
// up to `result.size` bytes with hdfsRead().
//
// The request is clamped to the number of bytes between `offset` and the handle's file size,
// and hdfsRead() is called in a loop until that many bytes have been read or it returns 0.
//
// @param offset     absolute file position to start reading from
// @param result     destination buffer; `result.size` is the maximum number of bytes wanted
// @param bytes_read out: number of bytes actually written into `result.data`; left untouched
//                   when the offset check or the seek fails
// @param io_ctx     optional; when it carries `file_cache_stats`, the number of bytes read is
//                   added to its `bytes_read_from_remote`
// @return OK on success (including short and empty reads); IOError when `offset` is beyond the
//         file size; NotFound when the HDFS error text contains "No such file or directory";
//         InternalError for any other seek or read failure
Status HdfsFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read,
                                    const IOContext* io_ctx) {
    DCHECK(!closed());
    if (offset > _handle->file_size()) {
        return Status::IOError("offset exceeds file size(offset: {}, file size: {}, path: {})",
                               offset, _handle->file_size(), _path.native());
    }

    int res = hdfsSeek(_handle->fs(), _handle->file(), offset);
    if (res != 0) {
        // invoker maybe just skip Status.NotFound and continue
        // so we need distinguish between it and other kinds of errors
        std::string _err_msg = hdfs_error();
        if (_err_msg.find("No such file or directory") != std::string::npos) {
            return Status::NotFound(_err_msg);
        }
        return Status::InternalError("Seek to offset failed. (BE: {}) offset={}, err: {}",
                                     BackendOptions::get_localhost(), offset, _err_msg);
    }

    char* to = result.data;
    // Never ask for more than what is left in the file after `offset`.
    const size_t bytes_req =
            std::min(result.size, static_cast<size_t>(_handle->file_size() - offset));
    *bytes_read = 0;
    if (UNLIKELY(bytes_req == 0)) {
        return Status::OK();
    }

    // NOTE(review): macro from the IO-throttle header; it is handed the out-pointer, so it
    // presumably accounts the final *bytes_read against the remote-scan limit -- confirm.
    LIMIT_REMOTE_SCAN_IO(bytes_read);

    size_t has_read = 0;
    while (has_read < bytes_req) {
        // The hdfs C API declares the read length as a 32-bit signed integer. Clamp every
        // request so a remaining size above INT32_MAX is not narrowed into a negative or
        // zero length; the loop simply issues another call for the rest.
        const size_t chunk = std::min<size_t>(bytes_req - has_read, INT32_MAX);
        int64_t loop_read = hdfsRead(_handle->fs(), _handle->file(), to + has_read,
                                     static_cast<int32_t>(chunk));
        if (loop_read < 0) {
            // invoker maybe just skip Status.NotFound and continue
            // so we need distinguish between it and other kinds of errors
            std::string _err_msg = hdfs_error();
            if (_err_msg.find("No such file or directory") != std::string::npos) {
                return Status::NotFound(_err_msg);
            }
            return Status::InternalError(
                    "Read hdfs file failed. (BE: {}) namenode:{}, path:{}, err: {}",
                    BackendOptions::get_localhost(), _name_node, _path.string(), _err_msg);
        }
        if (loop_read == 0) {
            // Nothing more to read: stop with a short read.
            break;
        }
        has_read += loop_read;
    }

    *bytes_read = has_read;
    if (io_ctx && io_ctx->file_cache_stats) {
        // Count what was actually transferred, not what was requested, so a short read
        // does not inflate the remote-bytes statistic.
        io_ctx->file_cache_stats->bytes_read_from_remote += has_read;
    }
    return Status::OK();
}
#endif
// Copies the libhdfs read statistics into the "HdfsIO" profile counters.
//
// Does nothing when no profile is attached, when `_name_node` is not recognised by is_hdfs(),
// or in builds without USE_HADOOP_HDFS. Two sources are read: the per-file read statistics of
// the underlying file handle and the hedged-read metrics of the file system. A failure to
// fetch either one is logged as a warning and ends the collection early.
void HdfsFileReader::_collect_profile_before_close() {
    if (_profile != nullptr && is_hdfs(_name_node)) {
#ifdef USE_HADOOP_HDFS
        // Per-file byte statistics.
        struct hdfsReadStatistics* hdfs_statistics = nullptr;
        auto r = hdfsFileGetReadStatistics(_handle->file(), &hdfs_statistics);
        if (r != 0) {
            LOG(WARNING) << "Failed to run hdfsFileGetReadStatistics(): " << r
                         << ", name node: " << _name_node;
            return;
        }
        COUNTER_UPDATE(_hdfs_profile.total_bytes_read, hdfs_statistics->totalBytesRead);
        COUNTER_UPDATE(_hdfs_profile.total_local_bytes_read, hdfs_statistics->totalLocalBytesRead);
        COUNTER_UPDATE(_hdfs_profile.total_short_circuit_bytes_read,
                       hdfs_statistics->totalShortCircuitBytesRead);
        COUNTER_UPDATE(_hdfs_profile.total_total_zero_copy_bytes_read,
                       hdfs_statistics->totalZeroCopyBytesRead);
        hdfsFileFreeReadStatistics(hdfs_statistics);
        // Reset the per-file statistics as soon as they have been consumed. Doing it here,
        // rather than after the hedged-read query, guarantees the reset also happens when
        // that query fails, so the totals just added cannot be added a second time by a
        // later collection on the same file handle.
        hdfsFileClearReadStatistics(_handle->file());

        // File-system level hedged-read metrics.
        struct hdfsHedgedReadMetrics* hdfs_hedged_read_statistics = nullptr;
        r = hdfsGetHedgedReadMetrics(_handle->fs(), &hdfs_hedged_read_statistics);
        if (r != 0) {
            LOG(WARNING) << "Failed to run hdfsGetHedgedReadMetrics(): " << r
                         << ", name node: " << _name_node;
            return;
        }
        COUNTER_UPDATE(_hdfs_profile.total_hedged_read, hdfs_hedged_read_statistics->hedgedReadOps);
        COUNTER_UPDATE(_hdfs_profile.hedged_read_in_cur_thread,
                       hdfs_hedged_read_statistics->hedgedReadOpsInCurThread);
        COUNTER_UPDATE(_hdfs_profile.hedged_read_wins,
                       hdfs_hedged_read_statistics->hedgedReadOpsWin);
        hdfsFreeHedgedReadMetrics(hdfs_hedged_read_statistics);
#endif
    }
}
} // namespace io
} // namespace doris