// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/disk_io_mgr.h"
#include "runtime/disk_io_mgr_internal.h"
#include "util/error_util.h"
// #include "util/hdfs-util.h"
using std::string;
using std::stringstream;
using std::vector;
using std::list;
using std::endl;
using boost::lock_guard;
using boost::unique_lock;
using boost::mutex;
namespace doris {
// A very large max value to prevent things from going out of control. Not
// expected to ever hit this value (1GB of buffered data per range).
const int MAX_QUEUE_CAPACITY = 128;
const int MIN_QUEUE_CAPACITY = 2;
// Implementation of the ScanRange functionality. Each ScanRange contains a queue
// of ready buffers. For each ScanRange, there is only a single producer and
// consumer thread, i.e. only one disk thread will push to a scan range at
// any time and only one thread will remove from the queue. This is to guarantee
// that buffers are queued and read in file order.
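//
// In outline (as implemented below): a disk thread produces buffers via
// enqueue_buffer(), the consumer drains them via get_next(), and the queue
// capacity adapts between MIN_QUEUE_CAPACITY and MAX_QUEUE_CAPACITY to
// throttle ranges that fill their queue faster than they are drained.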
// This must be called with the reader lock taken.
bool DiskIoMgr::ScanRange::enqueue_buffer(BufferDescriptor* buffer) {
    {
        unique_lock<mutex> scan_range_lock(_lock);
        DCHECK(validate()) << debug_string();
        DCHECK(!_eosr_returned);
        DCHECK(!_eosr_queued);
        if (_is_cancelled) {
            // Return the buffer, this range has been cancelled
            if (buffer->_buffer != NULL) {
                ++_io_mgr->_num_buffers_in_readers;
                ++_reader->_num_buffers_in_reader;
            }
            --_reader->_num_used_buffers;
            buffer->return_buffer();
            return false;
        }
        ++_reader->_num_ready_buffers;
        _ready_buffers.push_back(buffer);
        _eosr_queued = buffer->eosr();
        _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
        if (_blocked_on_queue && _ready_buffers_capacity > MIN_QUEUE_CAPACITY) {
            // We have filled the queue, indicating we need back pressure on
            // the producer side (i.e. we are pushing buffers faster than they
            // are pulled off, throttle this range more).
            --_ready_buffers_capacity;
        }
    }
    _buffer_ready_cv.notify_one();
    return _blocked_on_queue;
}
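// Returns the next ready buffer for this scan range, blocking until a buffer
// is queued or the range is cancelled. Ownership of the buffer passes to the
// caller; the tracking counters are updated and the range is rescheduled on
// the reader if it had been blocked on a full queue.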
Status DiskIoMgr::ScanRange::get_next(BufferDescriptor** buffer) {
    *buffer = NULL;
    {
        unique_lock<mutex> scan_range_lock(_lock);
        if (_eosr_returned) {
            return Status::OK();
        }
        DCHECK(validate()) << debug_string();
        if (_ready_buffers.empty()) {
            // The queue is empty indicating this thread could use more
            // IO. Increase the capacity to allow for more queueing.
            ++_ready_buffers_capacity;
            _ready_buffers_capacity = std::min(_ready_buffers_capacity, MAX_QUEUE_CAPACITY);
        }
        while (_ready_buffers.empty() && !_is_cancelled) {
            _buffer_ready_cv.wait(scan_range_lock);
        }
        if (_is_cancelled) {
            DCHECK(!_status.ok());
            return _status;
        }
        // Remove the first ready buffer from the queue and return it
        DCHECK(!_ready_buffers.empty());
        *buffer = _ready_buffers.front();
        _ready_buffers.pop_front();
        _eosr_returned = (*buffer)->eosr();
    }
    // Update tracking counters. The buffer has now moved from the IoMgr to the
    // caller.
    ++_io_mgr->_num_buffers_in_readers;
    ++_reader->_num_buffers_in_reader;
    --_reader->_num_ready_buffers;
    --_reader->_num_used_buffers;
    Status status = (*buffer)->_status;
    if (!status.ok()) {
        (*buffer)->return_buffer();
        *buffer = NULL;
        return status;
    }
    unique_lock<mutex> reader_lock(_reader->_lock);
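    // This range is finished: fold its final queue capacity into the reader's
    // running average, which seeds the initial capacity of future scan ranges.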
    if (_eosr_returned) {
        _reader->_total_range_queue_capacity += _ready_buffers_capacity;
        ++_reader->_num_finished_ranges;
        _reader->_initial_queue_capacity =
            _reader->_total_range_queue_capacity / _reader->_num_finished_ranges;
    }
    DCHECK(_reader->validate()) << endl << _reader->debug_string();
    if (_reader->_state == RequestContext::Cancelled) {
        _reader->_blocked_ranges.remove(this);
        cancel(_reader->_status);
        (*buffer)->return_buffer();
        *buffer = NULL;
        return _status;
    }
    bool was_blocked = _blocked_on_queue;
    _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
    if (was_blocked && !_blocked_on_queue && !_eosr_queued) {
        // This scan range was blocked and is no longer, add it to the reader
        // queue again.
        _reader->_blocked_ranges.remove(this);
        _reader->schedule_scan_range(this);
    }
    return Status::OK();
}
void DiskIoMgr::ScanRange::cancel(const Status& status) {
    // Cancelling a range that was never started, ignore.
    if (_io_mgr == NULL) {
        return;
    }
    DCHECK(!status.ok());
    {
        // Grab both locks to make sure that all working threads see _is_cancelled.
        unique_lock<mutex> scan_range_lock(_lock);
        unique_lock<mutex> hdfs_lock(_hdfs_lock);
        DCHECK(validate()) << debug_string();
        if (_is_cancelled) {
            return;
        }
        _is_cancelled = true;
        _status = status;
    }
    _buffer_ready_cv.notify_all();
    cleanup_queued_buffers();
    // For cached buffers, we can't close the range until the cached buffer is returned.
    // close() is called from DiskIoMgr::return_buffer().
    if (_cached_buffer == NULL) {
        close();
    }
}
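// Returns all queued buffers to the IoMgr and updates the reader's tracking
// counters accordingly. Must only be called after _is_cancelled has been set.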
void DiskIoMgr::ScanRange::cleanup_queued_buffers() {
    DCHECK(_is_cancelled);
    _io_mgr->_num_buffers_in_readers += _ready_buffers.size();
    _reader->_num_buffers_in_reader += _ready_buffers.size();
    _reader->_num_used_buffers -= _ready_buffers.size();
    _reader->_num_ready_buffers -= _ready_buffers.size();
    while (!_ready_buffers.empty()) {
        BufferDescriptor* buffer = _ready_buffers.front();
        buffer->return_buffer();
        _ready_buffers.pop_front();
    }
}
string DiskIoMgr::ScanRange::debug_string() const {
    stringstream ss;
    ss << "file=" << _file << " disk_id=" << _disk_id << " offset=" << _offset
       << " len=" << _len << " bytes_read=" << _bytes_read
       << " buffer_queue=" << _ready_buffers.size()
       << " capacity=" << _ready_buffers_capacity
       << " hdfs_file=" << _hdfs_file;
    return ss.str();
}
bool DiskIoMgr::ScanRange::validate() {
    if (_bytes_read > _len) {
        LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
                     << " _bytes_read=" << _bytes_read << " _len=" << _len;
        return false;
    }
    if (_eosr_returned && !_eosr_queued) {
        LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
                     << " _eosr_returned=" << _eosr_returned
                     << " _eosr_queued=" << _eosr_queued;
        return false;
    }
    return true;
}
DiskIoMgr::ScanRange::ScanRange(int capacity) : _ready_buffers_capacity(capacity) {
    _request_type = RequestType::READ;
    reset(NULL, "", -1, -1, -1, false, false, NEVER_CACHE);
}
DiskIoMgr::ScanRange::~ScanRange() {
    DCHECK(_hdfs_file == NULL) << "File was not closed.";
    DCHECK(_cached_buffer == NULL) << "Cached buffer was not released.";
}
void DiskIoMgr::ScanRange::reset(
        hdfsFS fs, const char* file, int64_t len,
        int64_t offset, int disk_id, bool try_cache,
        bool expected_local, int64_t mtime, void* meta_data) {
    DCHECK(_ready_buffers.empty());
    _fs = fs;
    _file = file;
    _len = len;
    _offset = offset;
    _disk_id = disk_id;
    _try_cache = try_cache;
    _expected_local = expected_local;
    _meta_data = meta_data;
    _cached_buffer = NULL;
    _io_mgr = NULL;
    _reader = NULL;
    _hdfs_file = NULL;
    _mtime = mtime;
}
void DiskIoMgr::ScanRange::init_internal(DiskIoMgr* io_mgr, RequestContext* reader) {
    DCHECK(_hdfs_file == NULL);
    _io_mgr = io_mgr;
    _reader = reader;
    _local_file = NULL;
    _hdfs_file = NULL;
    _bytes_read = 0;
    _is_cancelled = false;
    _eosr_queued = false;
    _eosr_returned = false;
    _blocked_on_queue = false;
    if (_ready_buffers_capacity <= 0) {
        _ready_buffers_capacity = reader->initial_scan_range_queue_capacity();
        DCHECK_GE(_ready_buffers_capacity, MIN_QUEUE_CAPACITY);
    }
    DCHECK(validate()) << debug_string();
}
Status DiskIoMgr::ScanRange::open() {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }
    // if (_fs != NULL) {
    //     if (_hdfs_file != NULL) {
    //         return Status::OK();
    //     }
    //     _hdfs_file = _io_mgr->OpenHdfsFile(_fs, file(), mtime());
    //     if (_hdfs_file == NULL) {
    //         return Status::InternalError(GetHdfsErrorMsg("Failed to open HDFS file ", _file));
    //     }
    //     if (hdfsSeek(_fs, _hdfs_file->file(), _offset) != 0) {
    //         _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
    //         _hdfs_file = NULL;
    //         string error_msg = GetHdfsErrorMsg("");
    //         stringstream ss;
    //         ss << "Error seeking to " << _offset << " in file: " << _file << " " << error_msg;
    //         return Status::InternalError(ss.str());
    //     }
    // } else {
    if (_local_file != NULL) {
        return Status::OK();
    }
    _local_file = fopen(file(), "r");
    if (_local_file == NULL) {
        string error_msg = get_str_err_msg();
        stringstream ss;
        ss << "Could not open file: " << _file << ": " << error_msg;
        return Status::InternalError(ss.str());
    }
    if (fseek(_local_file, _offset, SEEK_SET) == -1) {
        fclose(_local_file);
        _local_file = NULL;
        string error_msg = get_str_err_msg();
        stringstream ss;
        ss << "Could not seek to " << _offset << " for file: " << _file
           << ": " << error_msg;
        return Status::InternalError(ss.str());
    }
    // }
#if 0
    if (DorisMetrics::io_mgr_num_open_files() != NULL) {
        DorisMetrics::io_mgr_num_open_files()->increment(1L);
    }
#endif
    return Status::OK();
}
void DiskIoMgr::ScanRange::close() {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    /*
     * if (_fs != NULL) {
     *     if (_hdfs_file == NULL) return;
     *
     *     struct hdfsReadStatistics* stats;
     *     if (IsDfsPath(file())) {
     *         int success = hdfsFileGetReadStatistics(_hdfs_file->file(), &stats);
     *         if (success == 0) {
     *             _reader->_bytes_read_local += stats->totalLocalBytesRead;
     *             _reader->_bytes_read_short_circuit += stats->totalShortCircuitBytesRead;
     *             _reader->_bytes_read_dn_cache += stats->totalZeroCopyBytesRead;
     *             if (stats->totalLocalBytesRead != stats->totalBytesRead) {
     *                 ++_reader->_num_remote_ranges;
     *                 if (_expected_local) {
     *                     int remote_bytes = stats->totalBytesRead - stats->totalLocalBytesRead;
     *                     _reader->_unexpected_remote_bytes += remote_bytes;
     *                     VLOG_FILE << "Unexpected remote HDFS read of "
     *                               << PrettyPrinter::Print(remote_bytes, TUnit::BYTES)
     *                               << " for file '" << _file << "'";
     *                 }
     *             }
     *             hdfsFileFreeReadStatistics(stats);
     *         }
     *     }
     *     if (_cached_buffer != NULL) {
     *         hadoopRzBufferFree(_hdfs_file->file(), _cached_buffer);
     *         _cached_buffer = NULL;
     *     }
     *     _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
     *     VLOG_FILE << "Cache HDFS file handle file=" << file();
     *     _hdfs_file = NULL;
     * } else {
     */
    {
        if (_local_file == NULL) {
            return;
        }
        fclose(_local_file);
        _local_file = NULL;
    }
#if 0
    if (DorisMetrics::io_mgr_num_open_files() != NULL) {
        DorisMetrics::io_mgr_num_open_files()->increment(-1L);
    }
#endif
}
/*
 * int64_t DiskIoMgr::ScanRange::max_read_chunk_size() const {
 *     // S3 InputStreams don't support DIRECT_READ (i.e. java.nio.ByteBuffer read()
 *     // interface). So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
 *     // Profiles show that both the JNI array allocation and the memcpy add much more
 *     // overhead for larger buffers, so limit the size of each read request. 128K was
 *     // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
 *     // utilization and higher S3 throughput.
 *     if (_disk_id == _io_mgr->RemoteS3DiskId()) {
 *         DCHECK(IsS3APath(file()));
 *         return 128 * 1024;
 *     }
 *     return numeric_limits<int64_t>::max();
 * }
 */
// TODO: how do we best use the disk here. e.g. is it good to break up a
// 1MB read into 8 128K reads?
// TODO: look at linux disk scheduling
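// Reads up to min(_io_mgr->_max_buffer_size, _len - _bytes_read) bytes from the
// current position into 'buffer'. Sets *eosr when the end of the scan range or
// the end of the file is reached.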
Status DiskIoMgr::ScanRange::read(char* buffer, int64_t* bytes_read, bool* eosr) {
    unique_lock<mutex> hdfs_lock(_hdfs_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }
    *eosr = false;
    *bytes_read = 0;
    // hdfsRead()'s length argument is an int. Since the type of _max_buffer_size is no
    // bigger than an int, this min() ensures that we don't overflow the length argument.
    DCHECK_LE(sizeof(_io_mgr->_max_buffer_size), sizeof(int));
    int bytes_to_read =
        std::min(static_cast<int64_t>(_io_mgr->_max_buffer_size), _len - _bytes_read);
    DCHECK_GE(bytes_to_read, 0);
    /*
     * if (_fs != NULL) {
     *     DCHECK(_hdfs_file != NULL);
     *     int64_t max_chunk_size = max_read_chunk_size();
     *     while (*bytes_read < bytes_to_read) {
     *         int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
     *         int last_read = hdfsRead(_fs, _hdfs_file->file(), buffer + *bytes_read, chunk_size);
     *         if (last_read == -1) {
     *             return Status::InternalError(GetHdfsErrorMsg("Error reading from HDFS file: ", _file));
     *         } else if (last_read == 0) {
     *             // No more bytes in the file. The scan range went past the end.
     *             *eosr = true;
     *             break;
     *         }
     *         *bytes_read += last_read;
     *     }
     * } else {
     */
    DCHECK(_local_file != NULL);
    *bytes_read = fread(buffer, 1, bytes_to_read, _local_file);
    DCHECK_GE(*bytes_read, 0);
    DCHECK_LE(*bytes_read, bytes_to_read);
    if (*bytes_read < bytes_to_read) {
        if (ferror(_local_file) != 0) {
            string error_msg = get_str_err_msg();
            stringstream ss;
            ss << "Error reading from " << _file << " at byte offset: "
               << (_offset + _bytes_read) << ": " << error_msg;
            return Status::InternalError(ss.str());
        } else {
            // On Linux, we should only get partial reads from block devices on error or eof.
            DCHECK(feof(_local_file) != 0);
            *eosr = true;
        }
    }
    // }
    _bytes_read += *bytes_read;
    DCHECK_LE(_bytes_read, _len);
    if (_bytes_read == _len) {
        *eosr = true;
    }
    return Status::OK();
}
/*
 * Status DiskIoMgr::ScanRange::read_from_cache(bool* read_succeeded) {
 *     DCHECK(_try_cache);
 *     DCHECK_EQ(_bytes_read, 0);
 *     *read_succeeded = false;
 *     Status status = open();
 *     if (!status.ok()) return status;
 *
 *     // Cached reads not supported on local filesystem.
 *     if (_fs == NULL) return Status::OK();
 *
 *     {
 *         unique_lock<mutex> hdfs_lock(_hdfs_lock);
 *         if (_is_cancelled) return Status::Cancelled("Cancelled");
 *
 *         DCHECK(_hdfs_file != NULL);
 *         DCHECK(_cached_buffer == NULL);
 *         _cached_buffer = hadoopReadZero(_hdfs_file->file(),
 *                 _io_mgr->_cached_read_options, len());
 *
 *         // Data was not cached, caller will fall back to normal read path.
 *         if (_cached_buffer == NULL) return Status::OK();
 *     }
 *
 *     // Cached read succeeded.
 *     void* buffer = const_cast<void*>(hadoopRzBufferGet(_cached_buffer));
 *     int32_t bytes_read = hadoopRzBufferLength(_cached_buffer);
 *     // For now, either the entire block is cached or none of it is.
 *     // TODO: if HDFS ever changes this, we'll have to handle the case where half
 *     // the block is cached.
 *     DCHECK_EQ(bytes_read, len());
 *
 *     // Create a single buffer desc for the entire scan range and enqueue that.
 *     BufferDescriptor* desc = _io_mgr->get_buffer_desc(
 *             _reader, this, reinterpret_cast<char*>(buffer), 0);
 *     desc->_len = bytes_read;
 *     desc->_scan_range_offset = 0;
 *     desc->_eosr = true;
 *     _bytes_read = bytes_read;
 *     enqueue_buffer(desc);
 *     if (_reader->_bytes_read_counter != NULL) {
 *         COUNTER_ADD(_reader->_bytes_read_counter, bytes_read);
 *     }
 *     *read_succeeded = true;
 *     ++_reader->_num_used_buffers;
 *     return Status::OK();
 * }
 */
} // namespace doris