doris/be/src/runtime/disk_io_mgr.h
Zhengguo Yang 6c6380969b [refactor] replace boost smart ptr with stl (#6856)
1. replace all boost::shared_ptr with std::shared_ptr
2. replace all boost::scoped_ptr with std::unique_ptr
3. replace all boost::scoped_array with std::unique_ptr<T[]>
4. replace all boost::thread with std::thread
2021-11-17 10:18:35 +08:00


// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H
#define DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H
#include <condition_variable>
#include <list>
#include <mutex>
#include <thread>
#include <unordered_set>
#include <vector>
#include "common/atomic.h"
#include "common/config.h"
#include "common/hdfs.h"
#include "common/object_pool.h"
#include "common/status.h"
#include "runtime/mem_tracker.h"
#include "util/error_util.h"
#include "util/internal_queue.h"
#include "util/metrics.h"
#include "util/runtime_profile.h"
#include "util/thread_group.h"
namespace doris {
class MemTracker;
// Manager object that schedules IO for all queries on all disks and remote filesystems
// (such as S3). Each query maps to one or more RequestContext objects, each of which
// has its own queue of scan ranges and/or write ranges.
//
// The API splits up requesting scan/write ranges (non-blocking) and reading the data
// (blocking). The DiskIoMgr has worker threads that will read from and write to
// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
// keep all disks and all cores as busy as possible.
//
// All public APIs are thread-safe. It is not valid to call any of the APIs after
// unregister_context() returns.
//
// For Readers:
// We can model this problem as a multiple producer (threads for each disk), multiple
// consumer (scan ranges) problem. There are multiple queues that need to be
// synchronized. Conceptually, there are two queues:
// 1. The per disk queue: this contains a queue of readers that need reads.
// 2. The per scan range ready-buffer queue: this contains buffers that have been
// read and are ready for the caller.
// The disk queue contains a queue of readers and is scheduled in a round robin fashion.
// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
// asks the IoMgr for the next range to process. The IoMgr then selects the best range
// to read based on disk activity and begins reading and queuing buffers for that range.
// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
// that have multiple scan nodes shouldn't have more 'turns'.
//
// For Writers:
// Data is written via add_write_range(). This is non-blocking and adds a WriteRange to a
// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
// No memory is allocated within IoMgr for writes and no copies are made. It is the
// responsibility of the client to ensure that the data to be written is valid and that
// the file to be written to exists until the callback is invoked.
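//
// A minimal write sketch (names and values hypothetical; error handling elided;
// assumes an 'io_mgr' instance and a 'writer_context' from register_context()):
//   DiskIoMgr::WriteRange* write_range = new DiskIoMgr::WriteRange(
//       "/path/to/file", /* file_offset */ 0, /* disk_id */ 0,
//       [](const Status& status) { /* write done; data/file may be released */ });
//   write_range->set_data(data, len);
//   RETURN_IF_ERROR(io_mgr->add_write_range(writer_context, write_range));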
//
// The IoMgr provides three key APIs.
// 1. add_scan_ranges: this is non-blocking and tells the IoMgr all the ranges that
// will eventually need to be read.
// 2. get_next_range: returns to the caller the next scan range it should process.
// This is based on disk load. This also begins reading the data in this scan
// range. This is blocking.
// 3. ScanRange::get_next: returns the next buffer for this range. This is blocking.
//
// The disk threads do not synchronize with each other. The readers and writers don't
// synchronize with each other. There is a lock and condition variable for each request
// context queue and each disk queue.
// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
// before the disk lock.
//
// Scheduling: If there are multiple request contexts with work for a single disk, the
// request contexts are scheduled in round-robin order. Multiple disk threads can
// operate on the same request context. Exactly one request range is processed by a
// disk thread at a time. If there are multiple scan ranges scheduled via
// get_next_range() for a single context, these are processed in round-robin order.
// If there are multiple scan and write ranges for a disk, a read is always followed
// by a write, and a write is followed by a read, i.e. reads and writes alternate.
// If multiple write ranges are enqueued for a single disk, they will be processed
// by the disk threads in order, but may complete in any order. No guarantees are made
// on ordering of writes across disks.
//
// Resource Management: effective resource management in the IoMgr is key to good
// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
// spinning up too many threads causes thrashing.
// Memory usage in the IoMgr comes from queued read buffers. If we queue the minimum
// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
// limit the queue, then it is possible we end up queueing the entire data set (i.e. CPU
// is slower than disks) and run out of memory.
// For both CPU and memory, we want to model the machine as having a fixed amount of
// resources. If a single query is running, it should saturate either CPU or Disk
// as well as using as little memory as possible. With multiple queries, each query
// should get less CPU. In that case each query will need fewer queued buffers and
// therefore have less memory usage.
//
// The IoMgr defers CPU management to the caller. The IoMgr provides a get_next_range
// API which will return the next scan range the caller should process. The caller
// can call this from the desired number of reading threads. Once a scan range
// has been returned via get_next_range, the IoMgr will start to buffer reads for
// that range and it is expected the caller will pull those buffers promptly. For
// example, if the caller would like to have 1 scanner thread, the read loop
// would look like:
// while (more_ranges)
// range = get_next_range()
// while (!range.eosr)
// buffer = range.get_next()
// To have multiple reading threads, the caller would simply spin up the threads
// and each would process the loops above.
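//
// Expressed concretely against the APIs declared below (a sketch; error handling
// elided; assumes an 'io_mgr' instance and a registered 'reader' context):
//   DiskIoMgr::ScanRange* range = nullptr;
//   while (io_mgr->get_next_range(reader, &range).ok() && range != nullptr) {
//     bool eosr = false;
//     while (!eosr) {
//       DiskIoMgr::BufferDescriptor* buffer = nullptr;
//       if (!range->get_next(&buffer).ok()) break;
//       eosr = buffer->eosr();
//       // ... process buffer->buffer() / buffer->len() ...
//       buffer->return_buffer(); // required for every returned buffer
//     }
//   }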
//
// To control the number of IO buffers, each scan range has a soft max capacity for
// the number of queued buffers. If the number of buffers is at capacity, the IoMgr
// will no longer read for that scan range until the caller has processed a buffer.
// This capacity does not need to be fixed, and the caller can dynamically adjust
// it if necessary.
//
// As an example: If we allowed 5 buffers per range on a 24 core, 72 thread
// (we default to allowing 3x threads) machine, we should see at most
// 72 * 5 * 8MB = 2.8GB of IO buffer memory usage. This should remain roughly constant
// regardless of how many concurrent readers are running.
//
// Buffer Management:
// Buffers are allocated by the IoMgr as necessary to service reads. These buffers
// are directly returned to the caller. The caller must call return_buffer() on the
// buffer when it is done, at which point the buffer will be recycled for another read.
// In error cases, the IoMgr will recycle the buffers more promptly, but regardless,
// the caller must always call return_buffer().
//
// Caching support:
// Scan ranges contain metadata on whether or not the range is cached on the DN. In that
// case, we use the HDFS APIs to read the cached data without doing any copies. For these
// ranges, the reads happen on the caller thread (as opposed to the disk threads).
// It is possible for the cached read APIs to fail, in which case the ranges are then
// queued on the disk threads and behave identically to the case where the range
// is not cached.
// Resources for these ranges are also not accounted against the reader because none
// are consumed.
// While a cached block is being processed, the block is mlocked. We want to minimize
// the time the mlock is held.
// - HDFS will time us out if we hold onto the mlock for too long
// - Holding the lock prevents uncaching this file due to a caching policy change.
// Therefore, we only issue the cached read when the caller is ready to process the
// range (get_next_range()) instead of when the ranges are issued. This guarantees that
// there will be a CPU available to process the buffer and any throttling we do with
// the number of scanner threads properly controls the number of files we mlock.
// With cached scan ranges, we cannot close the scan range until the cached buffer
// is returned (HDFS does not allow this). We therefore need to defer the close until
// the cached buffer is returned (BufferDescriptor::return_buffer()).
//
// Remote filesystem support (e.g. S3):
// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
// queue for each supported remote filesystem type. In order to maximize throughput,
// multiple connections are opened in parallel by having multiple threads running per
// queue. Also note that reading from a remote filesystem service can be more CPU
// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
// be CPU bottlenecked especially if not enough I/O threads for these queues are
// started.
//
// TODO: IoMgr should be able to request additional scan ranges from the coordinator
// to help deal with stragglers.
// TODO: look into using a lock free queue
// TODO: simplify the common path (less locking, memory allocations).
// TODO: Break up the .h/.cc into multiple files under an /io subdirectory.
//
// Structure of the Implementation:
// - All client APIs are defined in this file
// - Internal classes are defined in disk-io-mgr-internal.h
// - ScanRange APIs are implemented in disk-io-mgr-scan-range.cc
// This contains the ready buffer queue logic
// - RequestContext APIs are implemented in disk-io-mgr-reader-context.cc
// This contains the logic for picking scan ranges for a reader.
// - Disk Thread and general APIs are implemented in disk-io-mgr.cc.
typedef void* hdfsFS;
typedef void* hdfsFile;
class DiskIoMgr {
public:
class RequestContext;
class ScanRange;
// This class is a small wrapper around the hdfsFile handle and the file system
// instance which is needed to close the file handle in case of eviction. It
// additionally encapsulates the last modified time of the associated file when it was
// last opened.
class HdfsCachedFileHandle {
public:
// Constructor will open the file
HdfsCachedFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);
// Destructor will close the file handle
~HdfsCachedFileHandle();
hdfsFile file() const { return _hdfs_file; }
int64_t mtime() const { return _mtime; }
// This method is called to release acquired resources by the cached handle when it
// is evicted.
static void release(HdfsCachedFileHandle** h);
bool ok() const { return _hdfs_file != nullptr; }
private:
hdfsFS _fs;
hdfsFile _hdfs_file;
int64_t _mtime;
};
// Buffer struct that is used by the caller and IoMgr to pass read buffers.
// It is expected that only one thread has ownership of this object at a
// time.
class BufferDescriptor {
public:
// a null dtor to pass codestyle check
~BufferDescriptor() {}
ScanRange* scan_range() { return _scan_range; }
char* buffer() { return _buffer; }
int64_t buffer_len() { return _buffer_len; }
int64_t len() { return _len; }
bool eosr() { return _eosr; }
// Returns the offset within the scan range that this buffer starts at
int64_t scan_range_offset() const { return _scan_range_offset; }
// Updates this buffer to be owned by the new tracker. Consumption is
// released from the current tracker and added to the new one.
void set_mem_tracker(std::shared_ptr<MemTracker> tracker);
// Returns the buffer to the IoMgr. This must be called for every buffer
// returned by get_next()/read() that did not return an error. This is non-blocking.
// After calling this, the buffer descriptor is invalid and cannot be accessed.
void return_buffer();
private:
friend class DiskIoMgr;
BufferDescriptor(DiskIoMgr* io_mgr);
// Resets the buffer descriptor state for a new reader, range and data buffer.
void reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len);
DiskIoMgr* _io_mgr;
// Reader that this buffer is for
RequestContext* _reader;
// The current tracker this buffer is associated with.
std::shared_ptr<MemTracker> _mem_tracker;
// Scan range that this buffer is for.
ScanRange* _scan_range;
// buffer with the read contents
char* _buffer;
// length of _buffer. For buffers from cached reads, the length is 0.
int64_t _buffer_len;
// length of read contents
int64_t _len;
// true if the current scan range is complete
bool _eosr;
// Status of the read into this buffer. If the status is not ok, 'buffer' is nullptr.
Status _status;
int64_t _scan_range_offset;
};
// The request type, read or write associated with a request range.
struct RequestType {
enum type {
READ,
WRITE,
};
};
// Represents a contiguous sequence of bytes in a single file.
// This is the common base class for read and write IO requests - ScanRange and
// WriteRange. Each disk thread processes exactly one RequestRange at a time.
class RequestRange : public InternalQueue<RequestRange>::Node {
public:
// hdfsFS fs() const { return _fs; }
const char* file() const { return _file.c_str(); }
int64_t offset() const { return _offset; }
int64_t len() const { return _len; }
int disk_id() const { return _disk_id; }
RequestType::type request_type() const { return _request_type; }
protected:
// Hadoop filesystem that contains _file, or set to nullptr for local filesystem.
hdfsFS _fs;
// Path to file being read or written.
std::string _file;
// Offset within _file being read or written.
int64_t _offset;
// Length of data read or written.
int64_t _len;
// Id of disk containing byte range.
int _disk_id;
// The type of IO request, READ or WRITE.
RequestType::type _request_type;
};
// ScanRange description. The caller must call reset() to initialize the fields
// before calling add_scan_ranges(). The private fields are used internally by
// the IoMgr.
class ScanRange : public RequestRange {
public:
// If the mtime is set to NEVER_CACHE, the file handle should never be cached.
const static int64_t NEVER_CACHE = -1;
// The initial queue capacity for this scan range. Specify -1 to use the IoMgr default.
ScanRange() : ScanRange(-1) {}
ScanRange(int initial_capacity);
virtual ~ScanRange();
// Resets this scan range object with the scan range description. The scan range
// must fall within the file bounds (offset >= 0 and offset + len <= file_length).
void reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
bool try_cache, bool expected_local, int64_t mtime, void* metadata = nullptr);
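// A minimal sketch of resetting a range over a local file (values hypothetical;
// a null fs means the local filesystem, and NEVER_CACHE skips handle caching):
//   ScanRange* range = new ScanRange();
//   range->reset(nullptr, "/path/to/file", /* len */ 1024, /* offset */ 0,
//                /* disk_id */ 0, /* try_cache */ false, /* expected_local */ true,
//                /* mtime */ ScanRange::NEVER_CACHE);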
void* meta_data() const { return _meta_data; }
// bool try_cache() const { return _try_cache; }
bool expected_local() const { return _expected_local; }
int ready_buffers_capacity() const { return _ready_buffers_capacity; }
// Returns the next buffer for this scan range. buffer is an output parameter.
// This function blocks until a buffer is ready or an error occurs. If this is
// called when all buffers have been returned, *buffer is set to nullptr and Status::OK()
// is returned.
// Only one thread can be in get_next() at any time.
Status get_next(BufferDescriptor** buffer);
// Cancel this scan range. This cleans up all queued buffers and
// wakes up any threads blocked on get_next().
// Status is the reason the range was cancelled. Must not be ok().
// Status is returned to the user in get_next().
void cancel(const Status& status);
// Returns a descriptive string for debugging.
std::string debug_string() const;
int64_t mtime() const { return _mtime; }
private:
friend class DiskIoMgr;
// Initialize internal fields
void init_internal(DiskIoMgr* io_mgr, RequestContext* reader);
// Enqueues a buffer for this range. This does not block.
// Returns true if this scan range has hit the queue capacity, false otherwise.
// The caller passes ownership of buffer to the scan range and it is not
// valid to access buffer after this call.
bool enqueue_buffer(BufferDescriptor* buffer);
// Cleanup any queued buffers (i.e. due to cancellation). This cannot
// be called with any locks taken.
void cleanup_queued_buffers();
// Validates the internal state of this range. _lock must be taken
// before calling this.
bool validate();
// Maximum length in bytes for hdfsRead() calls.
int64_t max_read_chunk_size() const;
// Opens the file for this range. This function only modifies state in this range.
Status open();
// Closes the file for this range. This function only modifies state in this range.
void close();
// Reads from this range into 'buffer'. Buffer is preallocated. Returns the number
// of bytes read. Updates range to keep track of where in the file we are.
Status read(char* buffer, int64_t* bytes_read, bool* eosr);
// Reads from the DN cache. On success, sets _cached_buffer to the DN buffer
// and *read_succeeded to true.
// If the data is not cached, returns ok() and *read_succeeded is set to false.
// Returns a non-ok status if it ran into a non-continuable error.
Status read_from_cache(bool* read_succeeded);
// Pointer to caller specified metadata. This is untouched by the io manager
// and the caller can put whatever auxiliary data in here.
void* _meta_data;
// If true, this scan range is expected to be cached. Note that this might be wrong
// since the block could have been uncached. In that case, the cached path
// will fail and we'll just put the scan range on the normal read path.
bool _try_cache;
// If true, we expect this scan range to be a local read. Note that if this is false,
// it does not necessarily mean we expect the read to be remote. Also note that we
// never create scan ranges where some of the range is expected to be remote and
// some of it local.
// TODO: we can do more with this
bool _expected_local;
DiskIoMgr* _io_mgr;
// Reader/owner of the scan range
RequestContext* _reader;
// File handle either to hdfs or local fs (FILE*)
//
// TODO: The pointer to HdfsCachedFileHandle is manually managed and should be
// replaced by unique_ptr in C++11
union {
FILE* _local_file;
HdfsCachedFileHandle* _hdfs_file;
};
// If non-null, this is the DN cached buffer. This means the cached read succeeded
// and all the bytes for the range are in this buffer.
struct hadoopRzBuffer* _cached_buffer;
// Lock protecting fields below.
// This lock should not be taken during Open/Read/Close.
std::mutex _lock;
// Number of bytes read so far for this scan range
int _bytes_read;
// Status for this range. This is non-ok if _is_cancelled is true.
// Note: an individual range can fail without the RequestContext being
// cancelled. This allows us to skip individual ranges.
Status _status;
// If true, the last buffer for this scan range has been queued.
bool _eosr_queued;
// If true, the last buffer for this scan range has been returned.
bool _eosr_returned;
// If true, this scan range has been removed from the reader's in_flight_ranges
// queue because the _ready_buffers queue is full.
bool _blocked_on_queue;
// Condition variable for get_next()
std::condition_variable _buffer_ready_cv;
// IO buffers that are queued for this scan range.
std::list<BufferDescriptor*> _ready_buffers;
// The soft capacity limit for _ready_buffers. _ready_buffers can exceed
// the limit temporarily as the capacity is adjusted dynamically.
// In that case, the capacity is only realized when the caller removes buffers
// from _ready_buffers.
int _ready_buffers_capacity;
// Lock that should be taken during hdfs calls. Only one thread (the disk reading
// thread) calls into hdfs at a time so this lock does not have performance impact.
// This lock only serves to coordinate cleanup. Specifically it serves to ensure
// that the disk threads are finished with HDFS calls before _is_cancelled is set
// to true and cleanup starts.
// If this lock and _lock need to be taken, _lock must be taken first.
std::mutex _hdfs_lock;
// If true, this scan range has been cancelled.
bool _is_cancelled;
// Last modified time of the file associated with the scan range
int64_t _mtime;
};
// Used to specify data to be written to a file and offset.
// It is the responsibility of the client to ensure that the data to be written is
// valid and that the file to be written to exists until the callback is invoked.
// A callback is invoked to inform the client when the write is done.
class WriteRange : public RequestRange {
public:
// a null dtor to pass codestyle check
~WriteRange() {}
// This callback is invoked on each WriteRange after the write is complete or the
// context is cancelled. The status returned by the callback parameter indicates
// if the write was successful (i.e. Status::OK()), if there was an error
// (TStatusCode::RUNTIME_ERROR) or if the context was cancelled
// (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
// successfully added (i.e. add_write_range() succeeded). No locks are held while
// the callback is invoked.
typedef std::function<void(const Status&)> WriteDoneCallback;
WriteRange(const std::string& file, int64_t file_offset, int disk_id,
WriteDoneCallback callback);
// Set the data and number of bytes to be written for this WriteRange.
// File data can be over-written by calling set_data() and add_write_range().
void set_data(const uint8_t* buffer, int64_t len);
private:
friend class DiskIoMgr;
// Data to be written. RequestRange::_len contains the length of data
// to be written.
const uint8_t* _data;
// Callback to invoke after the write is complete.
WriteDoneCallback _callback;
};
// Create a DiskIoMgr object.
// - num_disks: The number of disks the IoMgr should use. This is used for testing.
// Specify 0 to have the IoMgr query the OS for the number of disks.
// - threads_per_disk: number of read threads to create per disk. This is also
// the max queue depth.
// - min_buffer_size: minimum io buffer size (in bytes)
// - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
DiskIoMgr(int num_disks, int threads_per_disk, int min_buffer_size, int max_buffer_size);
// Create DiskIoMgr with default configs.
DiskIoMgr();
// Clean up all threads and resources. This is mostly useful for testing since
// for the backend process, this object is never destroyed.
~DiskIoMgr();
// Initialize the IoMgr. Must be called once before any of the other APIs.
Status init(const std::shared_ptr<MemTracker>& process_mem_tracker);
// Allocates tracking structure for a request context.
// Register a new request context which is returned in *request_context.
// The IoMgr owns the allocated RequestContext object. The caller must call
// unregister_context() for each context.
// reader_mem_tracker: Is non-null only for readers. IO buffers
// used for this reader will be tracked by this. If the limit is exceeded
// the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
// get_next().
Status register_context(
RequestContext** request_context,
std::shared_ptr<MemTracker> reader_mem_tracker = std::shared_ptr<MemTracker>());
// Unregisters context from the disk IoMgr. This must be called for every
// register_context() regardless of cancellation and must be called in the
// same thread as get_next().
// The 'context' cannot be used after this call.
// This call blocks until all the disk threads have finished cleaning up.
// unregister_context also cancels the reader/writer from the disk IoMgr.
void unregister_context(RequestContext* context);
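// Typical context lifecycle (a sketch; error handling elided, 'reader_mem_tracker'
// hypothetical):
//   RequestContext* context = nullptr;
//   RETURN_IF_ERROR(io_mgr->register_context(&context, reader_mem_tracker));
//   // ... add_scan_ranges() / get_next_range() / ScanRange::get_next() ...
//   io_mgr->unregister_context(context);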
// This function cancels the context asynchronously. All outstanding requests
// are aborted and tracking structures cleaned up. This does not need to be
// called if the context finishes normally.
// This will also fail any outstanding get_next()/read() requests.
// If wait_for_disks_completion is true, wait for the number of active disks for this
// context to reach 0. After calling with wait_for_disks_completion = true, the only
// valid API is returning IO buffers that were already returned by get_next().
// Takes context->_lock if wait_for_disks_completion is true.
void cancel_context(RequestContext* context, bool wait_for_disks_completion = false);
// Adds the scan ranges to the queues. This call is non-blocking. The caller must
// not deallocate the scan range pointers before unregister_context().
// If schedule_immediately, the ranges are immediately put on the read queue
// (i.e. the caller should not/cannot call get_next_range for these ranges).
// This can be used to do synchronous reads as well as schedule dependent ranges,
// as in the case for columnar formats.
Status add_scan_ranges(RequestContext* reader, const std::vector<ScanRange*>& ranges,
bool schedule_immediately = false);
// Add a WriteRange for the writer. This is non-blocking and schedules the context
// on the IoMgr disk queue. Does not create any files.
Status add_write_range(RequestContext* writer, WriteRange* write_range);
// Returns the next unstarted scan range for this reader. When the range is returned,
// the disk threads in the IoMgr will already have started reading from it. The
// caller is expected to call ScanRange::get_next on the returned range.
// If there are no more unstarted ranges, nullptr is returned.
// This call is blocking.
Status get_next_range(RequestContext* reader, ScanRange** range);
// Reads the range and returns the result in buffer.
// This behaves like the typical synchronous read() api, blocking until the data
// is read. This can be called while there are outstanding ScanRanges and is
// thread-safe. Multiple threads can call read() concurrently for the same reader.
// range *cannot* have already been added via add_scan_ranges.
Status read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer);
// Determine which disk queue this file should be assigned to. Returns an index into
// _disk_queues. The disk_id is the volume ID for the local disk that holds the
// files, or -1 if unknown. Flag expected_local is true iff this backend is
// co-located with the datanode for this file.
/*
* int AssignQueue(const char* file, int disk_id, bool expected_local);
*/
// TODO: The functions below can be moved to RequestContext.
// Returns the current status of the context.
Status context_status(RequestContext* context) const;
// Returns the number of unstarted scan ranges for this reader.
int num_unstarted_ranges(RequestContext* reader) const;
void set_bytes_read_counter(RequestContext*, RuntimeProfile::Counter*);
void set_read_timer(RequestContext*, RuntimeProfile::Counter*);
void set_active_read_thread_counter(RequestContext*, RuntimeProfile::Counter*);
void set_disks_access_bitmap(RequestContext*, RuntimeProfile::Counter*);
int64_t queue_size(RequestContext* reader) const;
int64_t bytes_read_local(RequestContext* reader) const;
int64_t bytes_read_short_circuit(RequestContext* reader) const;
int64_t bytes_read_dn_cache(RequestContext* reader) const;
int num_remote_ranges(RequestContext* reader) const;
int64_t unexpected_remote_bytes(RequestContext* reader) const;
// Returns the read throughput across all readers.
// TODO: should this be a sliding window? This should report metrics for the
// last minute, hour and since the beginning.
int64_t get_read_throughput();
// Returns the maximum read buffer size
int max_read_buffer_size() const { return _max_buffer_size; }
// Returns the total number of disk queues (both local and remote).
int num_total_disks() const { return _disk_queues.size(); }
// Returns the total number of remote "disk" queues.
int num_remote_disks() const { return REMOTE_NUM_DISKS; }
// Returns the number of local disks attached to the system.
int num_local_disks() const { return num_total_disks() - num_remote_disks(); }
// The disk ID (and therefore _disk_queues index) used for DFS accesses.
// int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }
// The disk ID (and therefore _disk_queues index) used for S3 accesses.
// int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }
// Returns the number of allocated buffers.
int num_allocated_buffers() const { return _num_allocated_buffers; }
// Returns the number of buffers currently owned by all readers.
int num_buffers_in_readers() const { return _num_buffers_in_readers; }
// Dumps the disk IoMgr queues (for readers and disks)
std::string debug_string();
// Validates the internal state is consistent. This is intended to only be used
// for debugging.
bool validate() const;
// Given a FS handle, name and last modified time of the file, tries to open that file
// and return an instance of HdfsCachedFileHandle. In case of an error returns nullptr.
// HdfsCachedFileHandle* OpenHdfsFile(const hdfsFS& fs, const char* fname, int64_t mtime);
// When the file handle is no longer in use by the scan range, return it and try to
// unbuffer the handle. If unbuffering, closing sockets and dropping buffers in the
// libhdfs client, is not supported, close the file handle. If the unbuffer operation
// is supported, put the file handle together with the mtime in the LRU cache for
// later reuse.
// void cache_or_close_file_handle(const char* fname, HdfsCachedFileHandle* fid, bool close);
// Default ready buffer queue capacity. This constant doesn't matter too much
// since the system dynamically adjusts.
static const int DEFAULT_QUEUE_CAPACITY;
// "Disk" queue offsets for remote accesses. Offset 0 corresponds to
// disk ID (i.e. _disk_queues index) of num_local_disks().
enum { REMOTE_DFS_DISK_OFFSET = 0, REMOTE_S3_DISK_OFFSET, REMOTE_NUM_DISKS };
private:
friend class BufferDescriptor;
struct DiskQueue;
class RequestContextCache;
// Pool to allocate BufferDescriptors.
ObjectPool _pool;
// Process memory tracker; needed to account for io buffers.
std::shared_ptr<MemTracker> _process_mem_tracker;
// Number of worker (read) threads per disk. Also the max depth of queued
// work to the disk.
const int _num_threads_per_disk;
// Maximum read size. This is also the maximum size of each allocated buffer.
const int _max_buffer_size;
// The minimum size of each read buffer.
const int _min_buffer_size;
// Thread group containing all the worker threads.
ThreadGroup _disk_thread_group;
// Options object for cached hdfs reads. Set on startup and never modified.
struct hadoopRzOptions* _cached_read_options;
// True if the IoMgr should be torn down. Worker threads watch for this to
// know to terminate. This variable is read/written to by different threads.
std::atomic<bool> _shut_down;
// Total bytes read by the IoMgr.
RuntimeProfile::Counter _total_bytes_read_counter;
// Total time spent in hdfs reading
RuntimeProfile::Counter _read_timer;
// Contains all contexts that the IoMgr is tracking. This includes contexts that are
// active as well as those in the process of being cancelled. This is a cache
// of context objects that get recycled to minimize object allocations and lock
// contention.
std::unique_ptr<RequestContextCache> _request_context_cache;
// Protects _free_buffers and _free_buffer_descs
std::mutex _free_buffers_lock;
// Free buffers that can be handed out to clients. There is one list for each buffer
// size, indexed by the Log2 of the buffer size in units of _min_buffer_size. The
// maximum buffer size is _max_buffer_size, so the maximum index is
// Log2(_max_buffer_size / _min_buffer_size).
//
// E.g. if _min_buffer_size = 1024 bytes:
// _free_buffers[0] => list of free buffers with size 1024 B
// _free_buffers[1] => list of free buffers with size 2048 B
// _free_buffers[10] => list of free buffers with size 1 MB
// _free_buffers[13] => list of free buffers with size 8 MB
// _free_buffers[n] => list of free buffers with size 2^n * 1024 B
std::vector<std::list<char*>> _free_buffers;
// List of free buffer desc objects that can be handed out to clients
std::list<BufferDescriptor*> _free_buffer_descs;
// Total number of allocated buffers, used for debugging.
AtomicInt<int> _num_allocated_buffers;
// Total number of buffers in readers
AtomicInt<int> _num_buffers_in_readers;
// Per disk queues. This is static and created once at init() time. One queue is
// allocated for each local disk on the system and for each remote filesystem type.
// It is indexed by disk id.
std::vector<DiskQueue*> _disk_queues;
// Caching structure that maps file names to cached file handles. The cache has an upper
// limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
// handles are closed.
// FifoMultimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
std::multimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
// Returns the index into _free_buffers for a given buffer size
int free_buffers_idx(int64_t buffer_size);
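// E.g., assuming _min_buffer_size = 1024 and buffer_size a power-of-two multiple
// of it (matching the _free_buffers table above):
//   free_buffers_idx(1024) == 0, free_buffers_idx(2048) == 1,
//   free_buffers_idx(8 * 1024 * 1024) == 13.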
// Gets a buffer description object, initialized for this reader, allocating one as
// necessary. buffer_size / _min_buffer_size should be a power of 2, and buffer_size
// should be <= _max_buffer_size. These constraints will be met if buffer was acquired
// via get_free_buffer() (which it should have been).
BufferDescriptor* get_buffer_desc(RequestContext* reader, ScanRange* range, char* buffer,
int64_t buffer_size);
// Returns a buffer desc object which can now be used for another reader.
void return_buffer_desc(BufferDescriptor* desc);
// Returns the buffer desc and underlying buffer to the disk IoMgr. This also updates
// the reader and disk queue state.
void return_buffer(BufferDescriptor* buffer);
// Returns a buffer to read into with size between *buffer_size and _max_buffer_size,
// and *buffer_size is set to the size of the buffer. If there is an
// appropriately-sized free buffer in the '_free_buffers', that is returned, otherwise
// a new one is allocated. *buffer_size must be between 0 and _max_buffer_size.
char* get_free_buffer(int64_t* buffer_size);
// Garbage collect all unused io buffers. This is currently only triggered when the
// process wide limit is hit. This is not good enough. While it is sufficient for
// the IoMgr, other components do not trigger this GC.
// TODO: make this run periodically?
void gc_io_buffers();
// Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power
// of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met
// if buffer was acquired via get_free_buffer() (which it should have been).
void return_free_buffer(char* buffer, int64_t buffer_size);
// Returns the buffer in desc (cannot be nullptr), sets buffer to nullptr and clears the
// mem tracker.
void return_free_buffer(BufferDescriptor* desc);
// Disk worker thread loop. This function retrieves the next range to process on
// the disk queue and invokes read_range() or write() depending on the request type.
// There can be multiple threads per disk running this loop.
void work_loop(DiskQueue* queue);
// This is called from the disk thread to get the next range to process. It will
// wait until a scan range and buffer are available, or a write range is available.
// This function returns the range to process.
// Only returns false if the disk thread should be shut down.
// No locks should be taken before this function call and none are left taken after.
bool get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
RequestContext** request_context);
// Updates disk queue and reader state after a read is complete. The read result
// is captured in the buffer descriptor.
void handle_read_finished(DiskQueue*, RequestContext*, BufferDescriptor*);
// Invokes write_range->_callback after the range has been written and
// updates per-disk state and handle state. The status of the write OK/RUNTIME_ERROR
// etc. is passed via write_status and to the callback.
// The write_status does not affect the writer->_status. That is, a write error does
// not cancel the writer context - that decision is left to the callback handler.
// TODO: On the read path, consider not canceling the reader context on error.
void handle_write_finished(RequestContext* writer, WriteRange* write_range,
const Status& write_status);
// Validates that range is correctly initialized
Status validate_scan_range(ScanRange* range);
// Write the specified range to disk and calls handle_write_finished when done.
// Responsible for opening and closing the file that is written.
void write(RequestContext* writer_context, WriteRange* write_range);
// Helper method to write a range using the specified FILE handle. Returns Status::OK()
// if the write succeeded, or a RUNTIME_ERROR with an appropriate message otherwise.
// Does not open or close the file that is written.
Status write_range_helper(FILE* file_handle, WriteRange* write_range);
// Reads the specified scan range and calls handle_read_finished when done.
void read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range);
};
} // end namespace doris
#endif // DORIS_BE_SRC_QUERY_RUNTIME_DISK_IO_MGR_H