// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "runtime/disk_io_mgr.h"

#include "runtime/disk_io_mgr_internal.h"
#include "util/error_util.h"
// #include "util/hdfs-util.h"

using std::string;
using std::stringstream;
using std::vector;
using std::list;
using std::endl;

using boost::lock_guard;
using boost::unique_lock;
using boost::mutex;

namespace doris {

// A very large max value to prevent things from going out of control. Not
// expected to ever hit this value (1GB of buffered data per range).
const int MAX_QUEUE_CAPACITY = 128;
const int MIN_QUEUE_CAPACITY = 2;
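// The capacity of each range's ready-buffer queue adapts between these bounds:
// get_next() grows it by one whenever the consumer finds the queue empty, and
// enqueue_buffer() shrinks it by one whenever the producer fills the queue.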

// Implementation of the ScanRange functionality. Each ScanRange contains a queue
// of ready buffers. For each ScanRange, there is only a single producer and
// consumer thread, i.e. only one disk thread will push to a scan range at
// any time and only one thread will remove from the queue. This is to guarantee
// that buffers are queued and read in file order.

// This must be called with the reader lock taken.
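// Returns true if the queue is now at capacity, i.e. the producer side should
// stop scheduling reads for this range until the consumer drains some buffers.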
bool DiskIoMgr::ScanRange::enqueue_buffer(BufferDescriptor* buffer) {
    {
        unique_lock<mutex> scan_range_lock(_lock);
        DCHECK(validate()) << debug_string();
        DCHECK(!_eosr_returned);
        DCHECK(!_eosr_queued);
        if (_is_cancelled) {
            // Return the buffer, this range has been cancelled
            if (buffer->_buffer != NULL) {
                ++_io_mgr->_num_buffers_in_readers;
                ++_reader->_num_buffers_in_reader;
            }
            --_reader->_num_used_buffers;
            buffer->return_buffer();
            return false;
        }
        ++_reader->_num_ready_buffers;
        _ready_buffers.push_back(buffer);
        _eosr_queued = buffer->eosr();

        _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
        if (_blocked_on_queue && _ready_buffers_capacity > MIN_QUEUE_CAPACITY) {
            // We have filled the queue, indicating we need back pressure on
            // the producer side (i.e. we are pushing buffers faster than they
            // are pulled off, throttle this range more).
            --_ready_buffers_capacity;
        }
    }

    _buffer_ready_cv.notify_one();

    return _blocked_on_queue;
}
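
// Blocks until a buffer is ready or the range is cancelled. On success the
// caller owns *buffer and hands it back with BufferDescriptor::return_buffer().
//
// Illustrative consumer loop (a sketch; 'process' is a hypothetical callback,
// not part of this API):
//
//   DiskIoMgr::BufferDescriptor* buffer;
//   bool eosr = false;
//   while (!eosr) {
//       RETURN_IF_ERROR(range->get_next(&buffer));
//       eosr = buffer->eosr();
//       process(buffer->buffer(), buffer->len());
//       buffer->return_buffer();
//   }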
Status DiskIoMgr::ScanRange::get_next(BufferDescriptor** buffer) {
    *buffer = NULL;

    {
        unique_lock<mutex> scan_range_lock(_lock);
        if (_eosr_returned) {
            return Status::OK();
        }
        DCHECK(validate()) << debug_string();

        if (_ready_buffers.empty()) {
            // The queue is empty indicating this thread could use more
            // IO. Increase the capacity to allow for more queueing.
            ++_ready_buffers_capacity;
            _ready_buffers_capacity = std::min(_ready_buffers_capacity, MAX_QUEUE_CAPACITY);
        }

        while (_ready_buffers.empty() && !_is_cancelled) {
            _buffer_ready_cv.wait(scan_range_lock);
        }

        if (_is_cancelled) {
            DCHECK(!_status.ok());
            return _status;
        }

        // Remove the first ready buffer from the queue and return it
        DCHECK(!_ready_buffers.empty());
        *buffer = _ready_buffers.front();
        _ready_buffers.pop_front();
        _eosr_returned = (*buffer)->eosr();
    }

    // Update tracking counters. The buffer has now moved from the IoMgr to the
    // caller.
    ++_io_mgr->_num_buffers_in_readers;
    ++_reader->_num_buffers_in_reader;
    --_reader->_num_ready_buffers;
    --_reader->_num_used_buffers;

    Status status = (*buffer)->_status;
    if (!status.ok()) {
        (*buffer)->return_buffer();
        *buffer = NULL;
        return status;
    }
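
    // The reader lock protects the reader-wide statistics below: once this
    // range has returned its last buffer (_eosr_returned), fold its final
    // queue capacity into the running average used to seed new ranges.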
    unique_lock<mutex> reader_lock(_reader->_lock);
    if (_eosr_returned) {
        _reader->_total_range_queue_capacity += _ready_buffers_capacity;
        ++_reader->_num_finished_ranges;
        _reader->_initial_queue_capacity =
                _reader->_total_range_queue_capacity / _reader->_num_finished_ranges;
    }

    DCHECK(_reader->validate()) << endl << _reader->debug_string();
    if (_reader->_state == RequestContext::Cancelled) {
        _reader->_blocked_ranges.remove(this);
        cancel(_reader->_status);
        (*buffer)->return_buffer();
        *buffer = NULL;
        return _status;
    }

    bool was_blocked = _blocked_on_queue;
    _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
    if (was_blocked && !_blocked_on_queue && !_eosr_queued) {
        // This scan range was blocked and is no longer, add it to the reader
        // queue again.
        _reader->_blocked_ranges.remove(this);
        _reader->schedule_scan_range(this);
    }
    return Status::OK();
}
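
// Marks the range cancelled with 'status', wakes any consumer blocked in
// get_next(), and returns all queued buffers. Idempotent.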
void DiskIoMgr::ScanRange::cancel(const Status& status) {
    // Cancelling a range that was never started, ignore.
    if (_io_mgr == NULL) {
        return;
    }

    DCHECK(!status.ok());
    {
        // Grab both locks to make sure that all working threads see _is_cancelled.
        unique_lock<mutex> scan_range_lock(_lock);
        unique_lock<mutex> hdfs_lock(_hdfs_lock);
        DCHECK(validate()) << debug_string();
        if (_is_cancelled) {
            return;
        }
        _is_cancelled = true;
        _status = status;
    }
    _buffer_ready_cv.notify_all();
    cleanup_queued_buffers();

    // For cached buffers, we can't close the range until the cached buffer is returned.
    // close() is called from DiskIoMgr::return_buffer().
    if (_cached_buffer == NULL) {
        close();
    }
}
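
// Drains the ready-buffer queue after cancellation: transfers the buffers'
// accounting from the IoMgr to the reader, then returns each buffer.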
void DiskIoMgr::ScanRange::cleanup_queued_buffers() {
    DCHECK(_is_cancelled);
    _io_mgr->_num_buffers_in_readers += _ready_buffers.size();
    _reader->_num_buffers_in_reader += _ready_buffers.size();
    _reader->_num_used_buffers -= _ready_buffers.size();
    _reader->_num_ready_buffers -= _ready_buffers.size();

    while (!_ready_buffers.empty()) {
        BufferDescriptor* buffer = _ready_buffers.front();
        buffer->return_buffer();
        _ready_buffers.pop_front();
    }
}

string DiskIoMgr::ScanRange::debug_string() const {
    stringstream ss;
    ss << "file=" << _file << " disk_id=" << _disk_id << " offset=" << _offset
       << " len=" << _len << " bytes_read=" << _bytes_read
       << " buffer_queue=" << _ready_buffers.size()
       << " capacity=" << _ready_buffers_capacity
       << " hdfs_file=" << _hdfs_file;
    return ss.str();
}

bool DiskIoMgr::ScanRange::validate() {
    if (_bytes_read > _len) {
        LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
                     << " _bytes_read=" << _bytes_read << " _len=" << _len;
        return false;
    }
    if (_eosr_returned && !_eosr_queued) {
        LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
                     << " _eosr_returned=" << _eosr_returned
                     << " _eosr_queued=" << _eosr_queued;
        return false;
    }
    return true;
}

DiskIoMgr::ScanRange::ScanRange(int capacity) : _ready_buffers_capacity(capacity) {
    _request_type = RequestType::READ;
    reset(NULL, "", -1, -1, -1, false, false, NEVER_CACHE);
}

DiskIoMgr::ScanRange::~ScanRange() {
    DCHECK(_hdfs_file == NULL) << "File was not closed.";
    DCHECK(_cached_buffer == NULL) << "Cached buffer was not released.";
}

void DiskIoMgr::ScanRange::reset(
        hdfsFS fs, const char* file, int64_t len,
        int64_t offset, int disk_id, bool try_cache,
        bool expected_local, int64_t mtime, void* meta_data) {
    DCHECK(_ready_buffers.empty());
    _fs = fs;
    _file = file;
    _len = len;
    _offset = offset;
    _disk_id = disk_id;
    _try_cache = try_cache;
    _expected_local = expected_local;
    _meta_data = meta_data;
    _cached_buffer = NULL;
    _io_mgr = NULL;
    _reader = NULL;
    _hdfs_file = NULL;
    _mtime = mtime;
}
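
// Binds the range to the IoMgr and its owning reader and resets all per-read
// state. Called by the IoMgr before any disk thread touches the range.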
void DiskIoMgr::ScanRange::init_internal(DiskIoMgr* io_mgr, RequestContext* reader) {
    DCHECK(_hdfs_file == NULL);
    _io_mgr = io_mgr;
    _reader = reader;
    _local_file = NULL;
    _hdfs_file = NULL;
    _bytes_read = 0;
    _is_cancelled = false;
    _eosr_queued = false;
    _eosr_returned = false;
    _blocked_on_queue = false;
    if (_ready_buffers_capacity <= 0) {
        _ready_buffers_capacity = reader->initial_scan_range_queue_capacity();
        DCHECK_GE(_ready_buffers_capacity, MIN_QUEUE_CAPACITY);
    }
    DCHECK(validate()) << debug_string();
}
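
// Opens the file for this range and seeks to _offset. The HDFS branch is
// currently disabled (kept below as commented-out reference); only local
// files are handled.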
Status DiskIoMgr::ScanRange::open() {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }

    // if (_fs != NULL) {
    //     if (_hdfs_file != NULL) {
    //         return Status::OK();
    //     }
    //     _hdfs_file = _io_mgr->OpenHdfsFile(_fs, file(), mtime());
    //     if (_hdfs_file == NULL) {
    //         return Status::InternalError(GetHdfsErrorMsg("Failed to open HDFS file ", _file));
    //     }

    //     if (hdfsSeek(_fs, _hdfs_file->file(), _offset) != 0) {
    //         _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
    //         _hdfs_file = NULL;
    //         string error_msg = GetHdfsErrorMsg("");
    //         stringstream ss;
    //         ss << "Error seeking to " << _offset << " in file: " << _file << " " << error_msg;
    //         return Status::InternalError(ss.str());
    //     }
    // } else {
    if (_local_file != NULL) {
        return Status::OK();
    }

    _local_file = fopen(file(), "r");
    if (_local_file == NULL) {
        string error_msg = get_str_err_msg();
        stringstream ss;
        ss << "Could not open file: " << _file << ": " << error_msg;
        return Status::InternalError(ss.str());
    }
    if (fseek(_local_file, _offset, SEEK_SET) == -1) {
        fclose(_local_file);
        _local_file = NULL;
        string error_msg = get_str_err_msg();
        stringstream ss;
        ss << "Could not seek to " << _offset << " for file: " << _file
           << ": " << error_msg;
        return Status::InternalError(ss.str());
    }
    // }
#if 0
    if (DorisMetrics::io_mgr_num_open_files() != NULL) {
        DorisMetrics::io_mgr_num_open_files()->increment(1L);
    }
#endif
    return Status::OK();
}

void DiskIoMgr::ScanRange::close() {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    /*
     * if (_fs != NULL) {
     *     if (_hdfs_file == NULL) return;
     *
     *     struct hdfsReadStatistics* stats;
     *     if (IsDfsPath(file())) {
     *         int success = hdfsFileGetReadStatistics(_hdfs_file->file(), &stats);
     *         if (success == 0) {
     *             _reader->_bytes_read_local += stats->totalLocalBytesRead;
     *             _reader->_bytes_read_short_circuit += stats->totalShortCircuitBytesRead;
     *             _reader->_bytes_read_dn_cache += stats->totalZeroCopyBytesRead;
     *             if (stats->totalLocalBytesRead != stats->totalBytesRead) {
     *                 ++_reader->_num_remote_ranges;
     *                 if (_expected_local) {
     *                     int remote_bytes = stats->totalBytesRead - stats->totalLocalBytesRead;
     *                     _reader->_unexpected_remote_bytes += remote_bytes;
     *                     VLOG_FILE << "Unexpected remote HDFS read of "
     *                               << PrettyPrinter::Print(remote_bytes, TUnit::BYTES)
     *                               << " for file '" << _file << "'";
     *                 }
     *             }
     *             hdfsFileFreeReadStatistics(stats);
     *         }
     *     }
     *     if (_cached_buffer != NULL) {
     *         hadoopRzBufferFree(_hdfs_file->file(), _cached_buffer);
     *         _cached_buffer = NULL;
     *     }
     *     _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
     *     VLOG_FILE << "Cache HDFS file handle file=" << file();
     *     _hdfs_file = NULL;
     * } else {
     */
    {
        if (_local_file == NULL) {
            return;
        }
        fclose(_local_file);
        _local_file = NULL;
    }
#if 0
    if (DorisMetrics::io_mgr_num_open_files() != NULL) {
        DorisMetrics::io_mgr_num_open_files()->increment(-1L);
    }
#endif
}

/*
 * int64_t DiskIoMgr::ScanRange::max_read_chunk_size() const {
 *     // S3 InputStreams don't support DIRECT_READ (i.e. the java.nio.ByteBuffer read()
 *     // interface). So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
 *     // Profiles show that both the JNI array allocation and the memcpy add much more
 *     // overhead for larger buffers, so limit the size of each read request. 128K was
 *     // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
 *     // utilization and higher S3 throughput.
 *     if (_disk_id == _io_mgr->RemoteS3DiskId()) {
 *         DCHECK(IsS3APath(file()));
 *         return 128 * 1024;
 *     }
 *     return numeric_limits<int64_t>::max();
 * }
 */

// TODO: how do we best use the disk here. e.g. is it good to break up a
// 1MB read into 8 128K reads?
// TODO: look at linux disk scheduling
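// Reads up to _max_buffer_size bytes at the current file position into
// 'buffer', sets *bytes_read to the count actually read, and sets *eosr once
// the range's _len bytes have been consumed or the file ends early.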
Status DiskIoMgr::ScanRange::read(char* buffer, int64_t* bytes_read, bool* eosr) {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }

    *eosr = false;
    *bytes_read = 0;
    // hdfsRead() length argument is an int. Since _max_buffer_size type is no bigger
    // than an int, this min() will ensure that we don't overflow the length argument.
    DCHECK_LE(sizeof(_io_mgr->_max_buffer_size), sizeof(int));
    int bytes_to_read =
            std::min(static_cast<int64_t>(_io_mgr->_max_buffer_size), _len - _bytes_read);
    DCHECK_GE(bytes_to_read, 0);

    /*
     * if (_fs != NULL) {
     *     DCHECK(_hdfs_file != NULL);
     *     int64_t max_chunk_size = max_read_chunk_size();
     *     while (*bytes_read < bytes_to_read) {
     *         int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
     *         int last_read = hdfsRead(_fs, _hdfs_file->file(), buffer + *bytes_read, chunk_size);
     *         if (last_read == -1) {
     *             return Status::InternalError(GetHdfsErrorMsg("Error reading from HDFS file: ", _file));
     *         } else if (last_read == 0) {
     *             // No more bytes in the file. The scan range went past the end.
     *             *eosr = true;
     *             break;
     *         }
     *         *bytes_read += last_read;
     *     }
     * } else {
     */
    DCHECK(_local_file != NULL);
    *bytes_read = fread(buffer, 1, bytes_to_read, _local_file);
    DCHECK_GE(*bytes_read, 0);
    DCHECK_LE(*bytes_read, bytes_to_read);
    if (*bytes_read < bytes_to_read) {
        if (ferror(_local_file) != 0) {
            string error_msg = get_str_err_msg();
            stringstream ss;
            ss << "Error reading from " << _file << " at byte offset: "
               << (_offset + _bytes_read) << ": " << error_msg;
            return Status::InternalError(ss.str());
        } else {
            // On Linux, we should only get partial reads from block devices on error or eof.
            DCHECK(feof(_local_file) != 0);
            *eosr = true;
        }
    }
    // }
    _bytes_read += *bytes_read;
    DCHECK_LE(_bytes_read, _len);
    if (_bytes_read == _len) {
        *eosr = true;
    }
    return Status::OK();
}

/*
 * Status DiskIoMgr::ScanRange::read_from_cache(bool* read_succeeded) {
 *     DCHECK(_try_cache);
 *     DCHECK_EQ(_bytes_read, 0);
 *     *read_succeeded = false;
 *     Status status = open();
 *     if (!status.ok()) return status;
 *
 *     // Cached reads not supported on local filesystem.
 *     if (_fs == NULL) return Status::OK();
 *
 *     {
 *         unique_lock<mutex> hdfs_lock(_hdfs_lock);
 *         if (_is_cancelled) return Status::Cancelled("Cancelled");
 *
 *         DCHECK(_hdfs_file != NULL);
 *         DCHECK(_cached_buffer == NULL);
 *         _cached_buffer = hadoopReadZero(_hdfs_file->file(),
 *                 _io_mgr->_cached_read_options, len());
 *
 *         // Data was not cached, caller will fall back to normal read path.
 *         if (_cached_buffer == NULL) return Status::OK();
 *     }
 *
 *     // Cached read succeeded.
 *     void* buffer = const_cast<void*>(hadoopRzBufferGet(_cached_buffer));
 *     int32_t bytes_read = hadoopRzBufferLength(_cached_buffer);
 *     // For now, the entire block is cached or none of it.
 *     // TODO: if HDFS ever changes this, we'll have to handle the case where half
 *     // the block is cached.
 *     DCHECK_EQ(bytes_read, len());
 *
 *     // Create a single buffer desc for the entire scan range and enqueue that.
 *     BufferDescriptor* desc = _io_mgr->get_buffer_desc(
 *             _reader, this, reinterpret_cast<char*>(buffer), 0);
 *     desc->_len = bytes_read;
 *     desc->_scan_range_offset = 0;
 *     desc->_eosr = true;
 *     _bytes_read = bytes_read;
 *     enqueue_buffer(desc);
 *     if (_reader->_bytes_read_counter != NULL) {
 *         COUNTER_ADD(_reader->_bytes_read_counter, bytes_read);
 *     }
 *     *read_succeeded = true;
 *     ++_reader->_num_used_buffers;
 *     return Status::OK();
 * }
 */

} // namespace doris