1. Replace all boost::shared_ptr with std::shared_ptr. 2. Replace all boost::scoped_ptr with std::unique_ptr. 3. Replace all boost::scoped_array with std::unique_ptr<T[]>. 4. Replace all boost::thread with std::thread.
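
For example, the boost::scoped_array case (an illustrative before/after, not taken from the diff):

    // before
    boost::scoped_array<char> buffer(new char[buffer_len]);
    // after
    std::unique_ptr<char[]> buffer(new char[buffer_len]);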
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H
#define DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <list>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>

#include "common/atomic.h"
#include "common/config.h"
#include "common/hdfs.h"
#include "common/object_pool.h"
#include "common/status.h"
#include "runtime/mem_tracker.h"
#include "util/error_util.h"
#include "util/internal_queue.h"
#include "util/metrics.h"
#include "util/runtime_profile.h"
#include "util/thread_group.h"

namespace doris {

class MemTracker;

// Manager object that schedules IO for all queries on all disks and remote filesystems
// (such as S3). Each query maps to one or more RequestContext objects, each of which
// has its own queue of scan ranges and/or write ranges.
//
// The API splits up requesting scan/write ranges (non-blocking) and reading the data
// (blocking). The DiskIoMgr has worker threads that will read from and write to
// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
// keep all disks and all cores as busy as possible.
//
// All public APIs are thread-safe. It is not valid to call any of the APIs after
// unregister_context() returns.
//
// For Readers:
// We can model this problem as a multiple producer (threads for each disk), multiple
// consumer (scan ranges) problem. There are multiple queues that need to be
// synchronized. Conceptually, there are two queues:
//   1. The per disk queue: this contains a queue of readers that need reads.
//   2. The per scan range ready-buffer queue: this contains buffers that have been
//      read and are ready for the caller.
// The disk queue contains a queue of readers and is scheduled in a round-robin fashion.
// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
// asks the IoMgr for the next range to process. The IoMgr then selects the best range
// to read based on disk activity and begins reading and queuing buffers for that range.
// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
// that have multiple scan nodes shouldn't have more 'turns'.
//
// For Writers:
// Data is written via add_write_range(). This is non-blocking and adds a WriteRange to a
// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
// No memory is allocated within IoMgr for writes and no copies are made. It is the
// responsibility of the client to ensure that the data to be written is valid and that
// the file to be written to exists until the callback is invoked.
//
// The IoMgr provides three key APIs:
//   1. add_scan_ranges: this is non-blocking and tells the IoMgr all the ranges that
//      will eventually need to be read.
//   2. get_next_range: returns to the caller the next scan range it should process.
//      This is based on disk load. This also begins reading the data in this scan
//      range. This is blocking.
//   3. ScanRange::get_next: returns the next buffer for this range. This is blocking.
//
// The disk threads do not synchronize with each other. The readers and writers don't
// synchronize with each other. There is a lock and condition variable for each request
// context queue and each disk queue.
// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
// before the disk lock.
//
// Scheduling: If there are multiple request contexts with work for a single disk, the
// request contexts are scheduled in round-robin order. Multiple disk threads can
// operate on the same request context. Exactly one request range is processed by a
// disk thread at a time. If there are multiple scan ranges scheduled via
// get_next_range() for a single context, these are processed in round-robin order.
// If there are multiple scan and write ranges for a disk, a read is always followed
// by a write, and a write is followed by a read, i.e. reads and writes alternate.
// If multiple write ranges are enqueued for a single disk, they will be processed
// by the disk threads in order, but may complete in any order. No guarantees are made
// on ordering of writes across disks.
//
// Resource Management: effective resource management in the IoMgr is key to good
// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
// spinning up too many threads causes thrashing.
// Memory usage in the IoMgr comes from queued read buffers. If we queue the minimum
// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
// limit the queue, then it is possible we end up queueing the entire data set (i.e. CPU
// is slower than disks) and run out of memory.
// For both CPU and memory, we want to model the machine as having a fixed amount of
// resources. If a single query is running, it should saturate either CPU or disk
// as well as using as little memory as possible. With multiple queries, each query
// should get less CPU. In that case each query will need fewer queued buffers and
// therefore have less memory usage.
//
// The IoMgr defers CPU management to the caller. The IoMgr provides a get_next_range
// API which will return the next scan range the caller should process. The caller
// can call this from the desired number of reading threads. Once a scan range
// has been returned via get_next_range, the IoMgr will start to buffer reads for
// that range and it is expected the caller will pull those buffers promptly. For
// example, if the caller would like to have 1 scanner thread, the read loop
// would look like:
//   while (more_ranges)
//     range = get_next_range()
//     while (!range.eosr)
//       buffer = range.get_next()
// To have multiple reading threads, the caller would simply spin up the threads
// and each would process the loops above.
//
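// As a more concrete sketch against this header's API (a minimal illustration,
// not taken from the implementation; 'io_mgr' and 'reader' stand for an
// initialized DiskIoMgr and a context registered via register_context(), and
// error handling is elided):
//   DiskIoMgr::ScanRange* range = nullptr;
//   while (io_mgr->get_next_range(reader, &range).ok() && range != nullptr) {
//     bool eosr = false;
//     while (!eosr) {
//       DiskIoMgr::BufferDescriptor* buffer = nullptr;
//       if (!range->get_next(&buffer).ok()) break;
//       eosr = buffer->eosr();
//       // process buffer->buffer() / buffer->len() here
//       buffer->return_buffer();  // must be called for every buffer
//     }
//   }
//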
// To control the number of IO buffers, each scan range has a soft max capacity for
// the number of queued buffers. If the number of buffers is at capacity, the IoMgr
// will no longer read for that scan range until the caller has processed a buffer.
// This capacity does not need to be fixed, and the caller can dynamically adjust
// it if necessary.
//
// As an example: If we allowed 5 buffers per range on a 24 core, 72 thread
// (we default to allowing 3x threads) machine, we should see at most
// 72 * 5 * 8MB = 2.8GB in io buffers memory usage. This should remain roughly constant
// regardless of how many concurrent readers are running.
//
// Buffer Management:
// Buffers are allocated by the IoMgr as necessary to service reads. These buffers
// are directly returned to the caller. The caller must call return_buffer() on the
// buffer when it is done, at which point the buffer will be recycled for another read.
// In error cases, the IoMgr will recycle the buffers more promptly, but regardless,
// the caller must always call return_buffer().
//
// Caching support:
// Scan ranges contain metadata on whether or not the range is cached on the DN. In that
// case, we use the HDFS APIs to read the cached data without doing any copies. For these
// ranges, the reads happen on the caller thread (as opposed to the disk threads).
// It is possible for the cached read APIs to fail, in which case the ranges are then
// queued on the disk threads and behave identically to the case where the range
// is not cached.
// Resources for these ranges are also not accounted against the reader because none
// are consumed.
// While a cached block is being processed, the block is mlocked. We want to minimize
// the time the mlock is held.
//   - HDFS will time us out if we hold onto the mlock for too long
//   - Holding the lock prevents uncaching this file due to a caching policy change.
// Therefore, we only issue the cached read when the caller is ready to process the
// range (get_next_range()) instead of when the ranges are issued. This guarantees that
// there will be a CPU available to process the buffer and any throttling we do with
// the number of scanner threads properly controls the number of files we mlock.
// With cached scan ranges, we cannot close the scan range until the cached buffer
// is returned (HDFS does not allow this). We therefore need to defer the close until
// the cached buffer is returned (BufferDescriptor::return_buffer()).
//
// Remote filesystem support (e.g. S3):
// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
// queue for each supported remote filesystem type. In order to maximize throughput,
// multiple connections are opened in parallel by having multiple threads running per
// queue. Also note that reading from a remote filesystem service can be more CPU
// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
// be CPU bottlenecked especially if not enough I/O threads for these queues are
// started.
//
// TODO: IoMgr should be able to request additional scan ranges from the coordinator
//       to help deal with stragglers.
// TODO: look into using a lock-free queue
// TODO: simplify the common path (less locking, memory allocations).
// TODO: break up the .h/.cc into multiple files under an /io subdirectory.
//
// Structure of the Implementation:
//   - All client APIs are defined in this file
//   - Internal classes are defined in disk-io-mgr-internal.h
//   - ScanRange APIs are implemented in disk-io-mgr-scan-range.cc
//     This contains the ready buffer queue logic
//   - RequestContext APIs are implemented in disk-io-mgr-reader-context.cc
//     This contains the logic for picking scan ranges for a reader.
//   - Disk Thread and general APIs are implemented in disk-io-mgr.cc.

typedef void* hdfsFS;
typedef void* hdfsFile;

class DiskIoMgr {
public:
    class RequestContext;
    class ScanRange;

    // This class is a small wrapper around the hdfsFile handle and the file system
    // instance which is needed to close the file handle in case of eviction. It
    // additionally encapsulates the last modified time of the associated file when it was
    // last opened.
    class HdfsCachedFileHandle {
    public:
        // Constructor will open the file.
        HdfsCachedFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);

        // Destructor will close the file handle.
        ~HdfsCachedFileHandle();

        hdfsFile file() const { return _hdfs_file; }

        int64_t mtime() const { return _mtime; }

        // This method is called to release acquired resources by the cached handle when it
        // is evicted.
        static void release(HdfsCachedFileHandle** h);

        bool ok() const { return _hdfs_file != nullptr; }

    private:
        hdfsFS _fs;
        hdfsFile _hdfs_file;
        int64_t _mtime;
    };

    // Buffer struct that is used by the caller and IoMgr to pass read buffers.
    // It is expected that only one thread has ownership of this object at a
    // time.
    class BufferDescriptor {
    public:
        // a null dtor to pass codestyle check
        ~BufferDescriptor() {}

        ScanRange* scan_range() { return _scan_range; }
        char* buffer() { return _buffer; }
        int64_t buffer_len() { return _buffer_len; }
        int64_t len() { return _len; }
        bool eosr() { return _eosr; }

        // Returns the offset within the scan range that this buffer starts at.
        int64_t scan_range_offset() const { return _scan_range_offset; }

        // Updates this buffer to be owned by the new tracker. Consumption is
        // released from the current tracker and added to the new one.
        void set_mem_tracker(std::shared_ptr<MemTracker> tracker);

        // Returns the buffer to the IoMgr. This must be called for every buffer
        // returned by get_next()/read() that did not return an error. This is non-blocking.
        // After calling this, the buffer descriptor is invalid and cannot be accessed.
        void return_buffer();

    private:
        friend class DiskIoMgr;
        BufferDescriptor(DiskIoMgr* io_mgr);

        // Resets the buffer descriptor state for a new reader, range and data buffer.
        void reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len);

        DiskIoMgr* _io_mgr;

        // Reader that this buffer is for.
        RequestContext* _reader;

        // The current tracker this buffer is associated with.
        std::shared_ptr<MemTracker> _mem_tracker;

        // Scan range that this buffer is for.
        ScanRange* _scan_range;

        // Buffer with the read contents.
        char* _buffer;

        // Length of _buffer. For buffers from cached reads, the length is 0.
        int64_t _buffer_len;

        // Length of read contents.
        int64_t _len;

        // True if the current scan range is complete.
        bool _eosr;

        // Status of the read to this buffer. If the status is not ok, '_buffer' is nullptr.
        Status _status;

        int64_t _scan_range_offset;
    };

    // The request type, read or write associated with a request range.
    struct RequestType {
        enum type {
            READ,
            WRITE,
        };
    };

    // Represents a contiguous sequence of bytes in a single file.
    // This is the common base class for read and write IO requests - ScanRange and
    // WriteRange. Each disk thread processes exactly one RequestRange at a time.
    class RequestRange : public InternalQueue<RequestRange>::Node {
    public:
        // hdfsFS fs() const { return _fs; }
        const char* file() const { return _file.c_str(); }
        int64_t offset() const { return _offset; }
        int64_t len() const { return _len; }
        int disk_id() const { return _disk_id; }
        RequestType::type request_type() const { return _request_type; }

    protected:
        // Hadoop filesystem that contains _file, or set to nullptr for local filesystem.
        hdfsFS _fs;

        // Path to file being read or written.
        std::string _file;

        // Offset within _file being read or written.
        int64_t _offset;

        // Length of data read or written.
        int64_t _len;

        // Id of disk containing byte range.
        int _disk_id;

        // The type of IO request, READ or WRITE.
        RequestType::type _request_type;
    };

    // ScanRange description. The caller must call reset() to initialize the fields
    // before calling add_scan_ranges(). The private fields are used internally by
    // the IoMgr.
    class ScanRange : public RequestRange {
    public:
        // If the mtime is set to NEVER_CACHE, the file handle should never be cached.
        const static int64_t NEVER_CACHE = -1;

        // The initial queue capacity for this scan range. Specify -1 to use the IoMgr
        // default.
        ScanRange() : ScanRange(-1) {}
        ScanRange(int initial_capacity);

        virtual ~ScanRange();

        // Resets this scan range object with the scan range description. The scan range
        // must fall within the file bounds (offset >= 0 and offset + len <= file_length).
        void reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
                   bool try_cache, bool expected_local, int64_t mtime, void* metadata = nullptr);

        void* meta_data() const { return _meta_data; }
        // bool try_cache() const { return _try_cache; }
        bool expected_local() const { return _expected_local; }
        int ready_buffers_capacity() const { return _ready_buffers_capacity; }

        // Returns the next buffer for this scan range. buffer is an output parameter.
        // This function blocks until a buffer is ready or an error occurred. If this is
        // called when all buffers have been returned, *buffer is set to nullptr and
        // Status::OK() is returned.
        // Only one thread can be in get_next() at any time.
        Status get_next(BufferDescriptor** buffer);

        // Cancel this scan range. This cleans up all queued buffers and
        // wakes up any threads blocked on get_next().
        // Status is the reason the range was cancelled. Must not be ok().
        // Status is returned to the user in get_next().
        void cancel(const Status& status);

        // Returns a descriptive string for debugging.
        std::string debug_string() const;

        int64_t mtime() const { return _mtime; }

    private:
        friend class DiskIoMgr;

        // Initialize internal fields.
        void init_internal(DiskIoMgr* io_mgr, RequestContext* reader);

        // Enqueues a buffer for this range. This does not block.
        // Returns true if this scan range has hit the queue capacity, false otherwise.
        // The caller passes ownership of buffer to the scan range and it is not
        // valid to access buffer after this call.
        bool enqueue_buffer(BufferDescriptor* buffer);

        // Cleanup any queued buffers (i.e. due to cancellation). This cannot
        // be called with any locks taken.
        void cleanup_queued_buffers();

        // Validates the internal state of this range. _lock must be taken
        // before calling this.
        bool validate();

        // Maximum length in bytes for hdfsRead() calls.
        int64_t max_read_chunk_size() const;

        // Opens the file for this range. This function only modifies state in this range.
        Status open();

        // Closes the file for this range. This function only modifies state in this range.
        void close();

        // Reads from this range into 'buffer'. Buffer is preallocated. Returns the number
        // of bytes read. Updates range to keep track of where in the file we are.
        Status read(char* buffer, int64_t* bytes_read, bool* eosr);

        // Reads from the DN cache. On success, sets _cached_buffer to the DN buffer
        // and *read_succeeded to true.
        // If the data is not cached, returns ok() and *read_succeeded is set to false.
        // Returns a non-ok status if it ran into a non-continuable error.
        Status read_from_cache(bool* read_succeeded);

        // Pointer to caller specified metadata. This is untouched by the io manager
        // and the caller can put whatever auxiliary data in here.
        void* _meta_data;

        // If true, this scan range is expected to be cached. Note that this might be wrong
        // since the block could have been uncached. In that case, the cached path
        // will fail and we'll just put the scan range on the normal read path.
        bool _try_cache;

        // If true, we expect this scan range to be a local read. Note that if this is
        // false, it does not necessarily mean we expect the read to be remote; we never
        // create scan ranges where some of the range is expected to be remote and some of
        // it local.
        // TODO: we can do more with this
        bool _expected_local;

        DiskIoMgr* _io_mgr;

        // Reader/owner of the scan range.
        RequestContext* _reader;

        // File handle either to hdfs or local fs (FILE*).
        //
        // TODO: The pointer to HdfsCachedFileHandle is manually managed and should be
        // replaced by std::unique_ptr.
        union {
            FILE* _local_file;
            HdfsCachedFileHandle* _hdfs_file;
        };

        // If non-null, this is the DN cached buffer. This means the cached read succeeded
        // and all the bytes for the range are in this buffer.
        struct hadoopRzBuffer* _cached_buffer;

        // Lock protecting fields below.
        // This lock should not be taken during Open/Read/Close.
        std::mutex _lock;

        // Number of bytes read so far for this scan range.
        int _bytes_read;

        // Status for this range. This is non-ok if _is_cancelled is true.
        // Note: an individual range can fail without the RequestContext being
        // cancelled. This allows us to skip individual ranges.
        Status _status;

        // If true, the last buffer for this scan range has been queued.
        bool _eosr_queued;

        // If true, the last buffer for this scan range has been returned.
        bool _eosr_returned;

        // If true, this scan range has been removed from the reader's in_flight_ranges
        // queue because the _ready_buffers queue is full.
        bool _blocked_on_queue;

        // IO buffers that are queued for this scan range.
        // Condition variable for get_next.
        std::condition_variable _buffer_ready_cv;
        std::list<BufferDescriptor*> _ready_buffers;

        // The soft capacity limit for _ready_buffers. _ready_buffers can exceed
        // the limit temporarily as the capacity is adjusted dynamically.
        // In that case, the capacity is only realized when the caller removes buffers
        // from _ready_buffers.
        int _ready_buffers_capacity;

        // Lock that should be taken during hdfs calls. Only one thread (the disk reading
        // thread) calls into hdfs at a time so this lock does not have performance impact.
        // This lock only serves to coordinate cleanup. Specifically it serves to ensure
        // that the disk threads are finished with HDFS calls before _is_cancelled is set
        // to true and cleanup starts.
        // If this lock and _lock need to be taken, _lock must be taken first.
        std::mutex _hdfs_lock;

        // If true, this scan range has been cancelled.
        bool _is_cancelled;

        // Last modified time of the file associated with the scan range.
        int64_t _mtime;
    };

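    // A minimal usage sketch for ScanRange (illustrative, not from the
    // implementation; 'io_mgr', 'reader' and the range parameters are assumed
    // to be supplied by the caller). Note the (len, offset) argument order in
    // reset(), and that the caller must keep the range alive until
    // unregister_context():
    //   DiskIoMgr::ScanRange range;
    //   range.reset(nullptr /* local fs */, "/path/to/file", len, offset, disk_id,
    //               false /* try_cache */, true /* expected_local */, mtime);
    //   Status status = io_mgr->add_scan_ranges(reader, {&range});
    //   // ... then drive reads via get_next_range() / ScanRange::get_next().
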
    // Used to specify data to be written to a file at a given offset.
    // It is the responsibility of the client to ensure that the data to be written is
    // valid and that the file to be written to exists until the callback is invoked.
    // A callback is invoked to inform the client when the write is done.
    class WriteRange : public RequestRange {
    public:
        // a null dtor to pass codestyle check
        ~WriteRange() {}

        // This callback is invoked on each WriteRange after the write is complete or the
        // context is cancelled. The status passed to the callback indicates whether the
        // write was successful (i.e. Status::OK()), whether there was an error
        // (TStatusCode::RUNTIME_ERROR), or whether the context was cancelled
        // (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
        // successfully added (i.e. add_write_range() succeeded). No locks are held while
        // the callback is invoked.
        typedef std::function<void(const Status&)> WriteDoneCallback;
        WriteRange(const std::string& file, int64_t file_offset, int disk_id,
                   WriteDoneCallback callback);

        // Set the data and number of bytes to be written for this WriteRange.
        // File data can be over-written by calling set_data() and add_write_range().
        void set_data(const uint8_t* buffer, int64_t len);

    private:
        friend class DiskIoMgr;

        // Data to be written. RequestRange::_len contains the length of data
        // to be written.
        const uint8_t* _data;

        // Callback to invoke after the write is complete.
        WriteDoneCallback _callback;
    };

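    // A minimal sketch of a write using the std::function-based callback
    // (illustrative; 'io_mgr', 'writer', 'path', 'offset', 'disk_id', 'data'
    // and 'data_len' are assumed):
    //   DiskIoMgr::WriteRange write_range(path, offset, disk_id,
    //       [](const Status& write_status) {
    //         // invoked exactly once when the write completes or is cancelled
    //       });
    //   write_range.set_data(data, data_len);
    //   Status status = io_mgr->add_write_range(writer, &write_range);
    //   // 'data' and the target file must stay valid until the callback runs.
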
    // Create a DiskIoMgr object.
    //  - num_disks: The number of disks the IoMgr should use. This is used for testing.
    //    Specify 0 to have the disk IoMgr query the OS for the number of disks.
    //  - threads_per_disk: number of read threads to create per disk. This is also
    //    the max queue depth.
    //  - min_buffer_size: minimum io buffer size (in bytes)
    //  - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
    DiskIoMgr(int num_disks, int threads_per_disk, int min_buffer_size, int max_buffer_size);

    // Create DiskIoMgr with default configs.
    DiskIoMgr();

    // Clean up all threads and resources. This is mostly useful for testing since
    // for impalad, this object is never destroyed.
    ~DiskIoMgr();

    // Initialize the IoMgr. Must be called once before any of the other APIs.
    Status init(const std::shared_ptr<MemTracker>& process_mem_tracker);

    // Allocates tracking structure for a request context.
    // Register a new request context which is returned in *request_context.
    // The IoMgr owns the allocated RequestContext object. The caller must call
    // unregister_context() for each context.
    //  - reader_mem_tracker: Is non-null only for readers. IO buffers
    //    used for this reader will be tracked by this. If the limit is exceeded
    //    the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
    //    get_next().
    Status register_context(
            RequestContext** request_context,
            std::shared_ptr<MemTracker> reader_mem_tracker = std::shared_ptr<MemTracker>());

    // Unregisters context from the disk IoMgr. This must be called for every
    // register_context() regardless of cancellation, and must be called in the
    // same thread as get_next().
    // The 'context' cannot be used after this call.
    // This call blocks until all the disk threads have finished cleaning up.
    // unregister_context also cancels the reader/writer from the disk IoMgr.
    void unregister_context(RequestContext* context);

    // This function cancels the context asynchronously. All outstanding requests
    // are aborted and tracking structures cleaned up. This does not need to be
    // called if the context finishes normally.
    // This will also fail any outstanding get_next()/Read requests.
    // If wait_for_disks_completion is true, wait for the number of active disks for this
    // context to reach 0. After calling with wait_for_disks_completion = true, the only
    // valid API is returning IO buffers that have already been returned.
    // Takes context->_lock if wait_for_disks_completion is true.
    void cancel_context(RequestContext* context, bool wait_for_disks_completion = false);

    // Adds the scan ranges to the queues. This call is non-blocking. The caller must
    // not deallocate the scan range pointers before unregister_context().
    // If schedule_immediately, the ranges are immediately put on the read queue
    // (i.e. the caller should not/cannot call get_next_range for these ranges).
    // This can be used to do synchronous reads as well as schedule dependent ranges,
    // as in the case for columnar formats.
    Status add_scan_ranges(RequestContext* reader, const std::vector<ScanRange*>& ranges,
                           bool schedule_immediately = false);

    // Add a WriteRange for the writer. This is non-blocking and schedules the context
    // on the IoMgr disk queue. Does not create any files.
    Status add_write_range(RequestContext* writer, WriteRange* write_range);

    // Returns the next unstarted scan range for this reader. When the range is returned,
    // the disk threads in the IoMgr will already have started reading from it. The
    // caller is expected to call ScanRange::get_next on the returned range.
    // If there are no more unstarted ranges, nullptr is returned.
    // This call is blocking.
    Status get_next_range(RequestContext* reader, ScanRange** range);

    // Reads the range and returns the result in buffer.
    // This behaves like the typical synchronous read() api, blocking until the data
    // is read. This can be called while there are outstanding ScanRanges and is
    // thread safe. Multiple threads can be calling read() per reader at a time.
    // range *cannot* have already been added via add_scan_ranges.
    Status read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer);

    // Determine which disk queue this file should be assigned to. Returns an index into
    // _disk_queues. The disk_id is the volume ID for the local disk that holds the
    // files, or -1 if unknown. Flag expected_local is true iff this impalad is
    // co-located with the datanode for this file.
    /*
     * int AssignQueue(const char* file, int disk_id, bool expected_local);
     */

    // TODO: The functions below can be moved to RequestContext.
    // Returns the current status of the context.
    Status context_status(RequestContext* context) const;

    // Returns the number of unstarted scan ranges for this reader.
    int num_unstarted_ranges(RequestContext* reader) const;

    void set_bytes_read_counter(RequestContext*, RuntimeProfile::Counter*);
    void set_read_timer(RequestContext*, RuntimeProfile::Counter*);
    void set_active_read_thread_counter(RequestContext*, RuntimeProfile::Counter*);
    void set_disks_access_bitmap(RequestContext*, RuntimeProfile::Counter*);

    int64_t queue_size(RequestContext* reader) const;
    int64_t bytes_read_local(RequestContext* reader) const;
    int64_t bytes_read_short_circuit(RequestContext* reader) const;
    int64_t bytes_read_dn_cache(RequestContext* reader) const;
    int num_remote_ranges(RequestContext* reader) const;
    int64_t unexpected_remote_bytes(RequestContext* reader) const;

    // Returns the read throughput across all readers.
    // TODO: should this be a sliding window? This should report metrics for the
    // last minute, hour and since the beginning.
    int64_t get_read_throughput();

    // Returns the maximum read buffer size.
    int max_read_buffer_size() const { return _max_buffer_size; }

    // Returns the total number of disk queues (both local and remote).
    int num_total_disks() const { return _disk_queues.size(); }

    // Returns the total number of remote "disk" queues.
    int num_remote_disks() const { return REMOTE_NUM_DISKS; }

    // Returns the number of local disks attached to the system.
    int num_local_disks() const { return num_total_disks() - num_remote_disks(); }

    // The disk ID (and therefore _disk_queues index) used for DFS accesses.
    // int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }

    // The disk ID (and therefore _disk_queues index) used for S3 accesses.
    // int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }

    // Returns the number of allocated buffers.
    int num_allocated_buffers() const { return _num_allocated_buffers; }

    // Returns the number of buffers currently owned by all readers.
    int num_buffers_in_readers() const { return _num_buffers_in_readers; }

    // Dumps the disk IoMgr queues (for readers and disks).
    std::string debug_string();

    // Validates the internal state is consistent. This is intended to only be used
    // for debugging.
    bool validate() const;

    // Given a FS handle, name and last modified time of the file, tries to open that file
    // and return an instance of HdfsCachedFileHandle. In case of an error returns nullptr.
    // HdfsCachedFileHandle* OpenHdfsFile(const hdfsFS& fs, const char* fname, int64_t mtime);

    // When the file handle is no longer in use by the scan range, return it and try to
    // unbuffer the handle. If unbuffering (closing sockets and dropping buffers in the
    // libhdfs client) is not supported, close the file handle. If the unbuffer operation
    // is supported, put the file handle together with the mtime in the LRU cache for
    // later reuse.
    // void cache_or_close_file_handle(const char* fname, HdfsCachedFileHandle* fid, bool close);

    // Default ready buffer queue capacity. This constant doesn't matter too much
    // since the system dynamically adjusts.
    static const int DEFAULT_QUEUE_CAPACITY;

    // "Disk" queue offsets for remote accesses. Offset 0 corresponds to
    // disk ID (i.e. _disk_queue index) of num_local_disks().
    enum { REMOTE_DFS_DISK_OFFSET = 0, REMOTE_S3_DISK_OFFSET, REMOTE_NUM_DISKS };

private:
    friend class BufferDescriptor;
    struct DiskQueue;
    class RequestContextCache;

    // Pool to allocate BufferDescriptors.
    ObjectPool _pool;

    // Process memory tracker; needed to account for io buffers.
    std::shared_ptr<MemTracker> _process_mem_tracker;

    // Number of worker (read) threads per disk. Also the max depth of queued
    // work to the disk.
    const int _num_threads_per_disk;

    // Maximum read size. This is also the maximum size of each allocated buffer.
    const int _max_buffer_size;

    // The minimum size of each read buffer.
    const int _min_buffer_size;

    // Thread group containing all the worker threads.
    ThreadGroup _disk_thread_group;

    // Options object for cached hdfs reads. Set on startup and never modified.
    struct hadoopRzOptions* _cached_read_options;

    // True if the IoMgr should be torn down. Worker threads watch for this to
    // know to terminate. This variable is read/written to by different threads.
    std::atomic<bool> _shut_down;

    // Total bytes read by the IoMgr.
    RuntimeProfile::Counter _total_bytes_read_counter;

    // Total time spent in hdfs reading.
    RuntimeProfile::Counter _read_timer;

    // Contains all contexts that the IoMgr is tracking. This includes contexts that are
    // active as well as those in the process of being cancelled. This is a cache
    // of context objects that get recycled to minimize object allocations and lock
    // contention.
    std::unique_ptr<RequestContextCache> _request_context_cache;

    // Protects _free_buffers and _free_buffer_descs.
    std::mutex _free_buffers_lock;

    // Free buffers that can be handed out to clients. There is one list for each buffer
    // size, indexed by the Log2 of the buffer size in units of _min_buffer_size. The
    // maximum buffer size is _max_buffer_size, so the maximum index is
    // Log2(_max_buffer_size / _min_buffer_size).
    //
    // E.g. if _min_buffer_size = 1024 bytes:
    //   _free_buffers[0]  => list of free buffers with size 1024 B
    //   _free_buffers[1]  => list of free buffers with size 2048 B
    //   _free_buffers[10] => list of free buffers with size 1 MB
    //   _free_buffers[13] => list of free buffers with size 8 MB
    //   _free_buffers[n]  => list of free buffers with size 2^n * 1024 B
    std::vector<std::list<char*>> _free_buffers;

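    // A minimal sketch of the index computation described above (illustrative;
    // the real mapping is implemented by free_buffers_idx(), declared below),
    // assuming buffer_size is a power-of-two multiple of _min_buffer_size:
    //   int idx = 0;
    //   for (int64_t size = buffer_size / _min_buffer_size; size > 1; size >>= 1) ++idx;
    //   // e.g. 8MB / 1024B = 8192 = 2^13 => idx 13, matching the table above.
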
    // List of free buffer desc objects that can be handed out to clients.
    std::list<BufferDescriptor*> _free_buffer_descs;

    // Total number of allocated buffers, used for debugging.
    AtomicInt<int> _num_allocated_buffers;

    // Total number of buffers in readers.
    AtomicInt<int> _num_buffers_in_readers;

    // Per disk queues. This is static and created once at init() time. One queue is
    // allocated for each local disk on the system and for each remote filesystem type.
    // It is indexed by disk id.
    std::vector<DiskQueue*> _disk_queues;

    // Caching structure that maps file names to cached file handles. The cache has an upper
    // limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
    // handles are closed.
    // FifoMultimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
    std::multimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;

    // Returns the index into _free_buffers for a given buffer size.
    int free_buffers_idx(int64_t buffer_size);

    // Gets a buffer description object, initialized for this reader, allocating one as
    // necessary. buffer_size / _min_buffer_size should be a power of 2, and buffer_size
    // should be <= _max_buffer_size. These constraints will be met if buffer was acquired
    // via get_free_buffer() (which it should have been).
    BufferDescriptor* get_buffer_desc(RequestContext* reader, ScanRange* range, char* buffer,
                                      int64_t buffer_size);

    // Returns a buffer desc object which can now be used for another reader.
    void return_buffer_desc(BufferDescriptor* desc);

    // Returns the buffer desc and underlying buffer to the disk IoMgr. This also updates
    // the reader and disk queue state.
    void return_buffer(BufferDescriptor* buffer);

    // Returns a buffer to read into with size between *buffer_size and _max_buffer_size,
    // and *buffer_size is set to the size of the buffer. If there is an
    // appropriately-sized free buffer in the '_free_buffers', that is returned, otherwise
    // a new one is allocated. *buffer_size must be between 0 and _max_buffer_size.
    char* get_free_buffer(int64_t* buffer_size);

    // Garbage collect all unused io buffers. This is currently only triggered when the
    // process wide limit is hit. This is not good enough. While it is sufficient for
    // the IoMgr, other components do not trigger this GC.
    // TODO: make this run periodically?
    void gc_io_buffers();

    // Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power
    // of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met
    // if buffer was acquired via get_free_buffer() (which it should have been).
    void return_free_buffer(char* buffer, int64_t buffer_size);

    // Returns the buffer in desc (cannot be nullptr), sets buffer to nullptr and clears the
    // mem tracker.
    void return_free_buffer(BufferDescriptor* desc);

    // Disk worker thread loop. This function retrieves the next range to process on
    // the disk queue and invokes read_range() or write() depending on the type of range.
    // There can be multiple threads per disk running this loop.
    void work_loop(DiskQueue* queue);

    // This is called from the disk thread to get the next range to process. It will
    // wait until a scan range and buffer are available, or a write range is available.
    // This function returns the range to process.
    // Only returns false if the disk thread should be shut down.
    // No locks should be taken before this function call and none are left taken after.
    bool get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
                                RequestContext** request_context);

    // Updates disk queue and reader state after a read is complete. The read result
    // is captured in the buffer descriptor.
    void handle_read_finished(DiskQueue*, RequestContext*, BufferDescriptor*);

    // Invokes write_range->_callback after the range has been written and
    // updates per-disk state and handle state. The status of the write (OK, RUNTIME_ERROR,
    // etc.) is passed via write_status to the callback.
    // The write_status does not affect the writer->_status. That is, a write error does
    // not cancel the writer context - that decision is left to the callback handler.
    // TODO: On the read path, consider not canceling the reader context on error.
    void handle_write_finished(RequestContext* writer, WriteRange* write_range,
                               const Status& write_status);

    // Validates that range is correctly initialized.
    Status validate_scan_range(ScanRange* range);

    // Writes the specified range to disk and calls handle_write_finished when done.
    // Responsible for opening and closing the file that is written.
    void write(RequestContext* writer_context, WriteRange* write_range);

    // Helper method to write a range using the specified FILE handle. Returns Status::OK
    // if the write succeeded, or a RUNTIME_ERROR with an appropriate message otherwise.
    // Does not open or close the file that is written.
    Status write_range_helper(FILE* file_handle, WriteRange* write_range);

    // Reads the specified scan range and calls handle_read_finished when done.
    void read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range);
};

} // end namespace doris

#endif // DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H