// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/buffered-block-mgr2.cc
// and modified by Doris

#include "runtime/buffered_block_mgr2.h"

#include "exec/exec_node.h"
#include "runtime/exec_env.h"
#include "runtime/mem_tracker.h"
#include "runtime/runtime_state.h"
#include "runtime/tmp_file_mgr.h"
#include "util/bit_util.h"
#include "util/debug_util.h"
#include "util/disk_info.h"
#include "util/doris_metrics.h"
#include "util/pretty_printer.h"
#include "util/runtime_profile.h"
#include "util/stack_util.h"
#include "util/uid_util.h"

using std::string;
using std::stringstream;
using std::vector;
using std::list;
using std::endl;

using std::bind;
using std::mem_fn;
using std::lock_guard;
using std::mutex;
using std::shared_ptr;
using std::unique_lock;

namespace doris {

BufferedBlockMgr2::BlockMgrsMap BufferedBlockMgr2::_s_query_to_block_mgrs;
SpinLock BufferedBlockMgr2::_s_block_mgrs_lock;

class BufferedBlockMgr2::Client {
public:
    Client(BufferedBlockMgr2* mgr, int num_reserved_buffers,
           const std::shared_ptr<MemTracker>& tracker, RuntimeState* state)
            : _mgr(mgr),
              _state(state),
              _tracker(
                      MemTracker::create_virtual_tracker(-1, "BufferedBlockMgr2::Client", tracker)),
              _num_reserved_buffers(num_reserved_buffers),
              _num_tmp_reserved_buffers(0),
              _num_pinned_buffers(0) {
        DCHECK(tracker != nullptr);
    }

    // A null dtor to pass codestyle check
    ~Client() {}

    // Unowned.
    BufferedBlockMgr2* _mgr;

    // Unowned.
    RuntimeState* _state;

    // Tracker for this client. Unowned.
    // When the client gets a buffer, we update the consumption on this tracker. However,
    // we don't want to transfer the buffer from the block mgr to the client (i.e. release
    // from the block mgr), since the block mgr is where the block mem usage limit is
    // enforced. Even when we give a buffer to a client, the buffer is still owned and
    // counts against the block mgr tracker (i.e. there is a fixed pool of buffers
    // regardless of if they are in the block mgr or the clients).
    std::shared_ptr<MemTracker> _tracker;

    // Number of buffers reserved by this client.
    int _num_reserved_buffers;

    // Number of buffers temporarily reserved.
    int _num_tmp_reserved_buffers;

    // Number of buffers pinned by this client.
    int _num_pinned_buffers;

    void pin_buffer(BufferDescriptor* buffer) {
        DCHECK(buffer != nullptr);
        if (buffer->len == _mgr->max_block_size()) {
            ++_num_pinned_buffers;
            _tracker->consume(buffer->len);
        }
    }

    void unpin_buffer(BufferDescriptor* buffer) {
        DCHECK(buffer != nullptr);
        if (buffer->len == _mgr->max_block_size()) {
            DCHECK_GT(_num_pinned_buffers, 0);
            --_num_pinned_buffers;
            _tracker->release(buffer->len);
        }
    }

    string debug_string() const {
        stringstream ss;
        ss << "Client " << this << endl
           << " num_reserved_buffers=" << _num_reserved_buffers << endl
           << " num_tmp_reserved_buffers=" << _num_tmp_reserved_buffers << endl
           << " num_pinned_buffers=" << _num_pinned_buffers;
        return ss.str();
    }
};

// BufferedBlockMgr2::Block methods.
BufferedBlockMgr2::Block::Block(BufferedBlockMgr2* block_mgr)
        : _buffer_desc(nullptr),
          _block_mgr(block_mgr),
          _client(nullptr),
          _write_range(nullptr),
          _tmp_file(nullptr),
          _valid_data_len(0),
          _num_rows(0) {}

Status BufferedBlockMgr2::Block::pin(bool* pinned, Block* release_block, bool unpin) {
    return _block_mgr->pin_block(this, pinned, release_block, unpin);
}

Status BufferedBlockMgr2::Block::unpin() {
    return _block_mgr->unpin_block(this);
}

void BufferedBlockMgr2::Block::del() {
    _block_mgr->delete_block(this);
}

void BufferedBlockMgr2::Block::init() {
    // No locks are taken because the block is new or has previously been deleted.
    _is_pinned = false;
    _in_write = false;
    _is_deleted = false;
    _valid_data_len = 0;
    _client = nullptr;
    _num_rows = 0;
}

bool BufferedBlockMgr2::Block::validate() const {
    if (_is_deleted && (_is_pinned || (!_in_write && _buffer_desc != nullptr))) {
        LOG(ERROR) << "Deleted block in use - " << debug_string();
        return false;
    }

    if (_buffer_desc == nullptr && (_is_pinned || _in_write)) {
        LOG(ERROR) << "Block without buffer in use - " << debug_string();
        return false;
    }

    if (_buffer_desc == nullptr && _block_mgr->_unpinned_blocks.contains(this)) {
        LOG(ERROR) << "Unpersisted block without buffer - " << debug_string();
        return false;
    }

    if (_buffer_desc != nullptr && (_buffer_desc->block != this)) {
        LOG(ERROR) << "Block buffer inconsistency - " << debug_string();
        return false;
    }

    return true;
}

string BufferedBlockMgr2::Block::tmp_file_path() const {
    if (_tmp_file == nullptr) {
        return "";
    }
    return _tmp_file->path();
}

string BufferedBlockMgr2::Block::debug_string() const {
    stringstream ss;
    ss << "Block: " << this << endl
       << " Buffer Desc: " << _buffer_desc << endl
       << " Data Len: " << _valid_data_len << endl
       << " Num Rows: " << _num_rows << endl;
    if (_is_pinned) {
        ss << " Buffer Len: " << buffer_len() << endl;
    }
    ss << " Deleted: " << _is_deleted << endl
       << " Pinned: " << _is_pinned << endl
       << " Write Issued: " << _in_write << endl
       << " Client Local: " << _client_local;
    return ss.str();
}

BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr,
                                     int64_t block_size)
        : _max_block_size(block_size),
          // Keep two writes in flight per scratch disk so the disks can stay busy.
          _block_write_threshold(tmp_file_mgr->num_active_tmp_devices() * 2),
          _enable_spill(state->enable_spill()),
          _query_id(state->query_id()),
          _tmp_file_mgr(tmp_file_mgr),
          _initialized(false),
          _unfullfilled_reserved_buffers(0),
          _total_pinned_buffers(0),
          _non_local_outstanding_writes(0),
          _io_mgr(state->exec_env()->disk_io_mgr()),
          _is_cancelled(false),
          _writes_issued(0),
          _state(state) {}

Status BufferedBlockMgr2::create(RuntimeState* state, RuntimeProfile* profile,
                                 TmpFileMgr* tmp_file_mgr, int64_t mem_limit, int64_t block_size,
                                 std::shared_ptr<BufferedBlockMgr2>* block_mgr) {
    block_mgr->reset();
    {
        // We do not use the global BlockMgrsMap for now, to avoid different fragments
        // running on the same machine causing each other to exceed the memory limit.
        // TODO(lingbin): enable it later. Note that when enabling it, the query mem
        // limit in RuntimeState must be used at the same time.

        // lock_guard<SpinLock> lock(_s_block_mgrs_lock);
        // BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(state->query_id());
        // if (it != _s_query_to_block_mgrs.end()) {
        //     *block_mgr = it->second.lock();
        // }
        if (*block_mgr == nullptr) {
            // weak_ptr::lock returns nullptr if the weak_ptr is expired. This means
            // all shared_ptr references have gone to 0 and it is in the process of
            // being deleted. This can happen if the last shared reference is released
            // but before the weak ptr is removed from the map.
            block_mgr->reset(new BufferedBlockMgr2(state, tmp_file_mgr, block_size));
            // _s_query_to_block_mgrs[state->query_id()] = *block_mgr;
        }
    }
    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile, mem_limit);
    return Status::OK();
}
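
// Illustrative usage of create()/register_client() (a sketch, not code from this
// file; 'state', 'profile', 'tracker', 'mem_limit', 'block_size', and 'min_buffers'
// are assumed caller-side variables):
//
//   std::shared_ptr<BufferedBlockMgr2> block_mgr;
//   RETURN_IF_ERROR(BufferedBlockMgr2::create(state, profile,
//           state->exec_env()->tmp_file_mgr(), mem_limit, block_size, &block_mgr));
//   BufferedBlockMgr2::Client* client = nullptr;
//   RETURN_IF_ERROR(block_mgr->register_client(min_buffers, tracker, state, &client));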

int64_t BufferedBlockMgr2::available_buffers(Client* client) const {
    int64_t unused_reserved = client->_num_reserved_buffers + client->_num_tmp_reserved_buffers -
                              client->_num_pinned_buffers;
    return std::max<int64_t>(0, remaining_unreserved_buffers()) +
           std::max<int64_t>(0, unused_reserved);
}

int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const {
    int64_t num_buffers =
            _free_io_buffers.size() + _unpinned_blocks.size() + _non_local_outstanding_writes;
    num_buffers += _mem_tracker->spare_capacity() / max_block_size();
    num_buffers -= _unfullfilled_reserved_buffers;
    return num_buffers;
}
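
// A worked example of the formula above (the numbers are hypothetical): with 2 free
// io buffers, 3 unpinned blocks, 1 outstanding non-local write, 32 MB of spare
// tracker capacity and 8 MB blocks (4 more buffers), and 5 unfulfilled reservations,
// the result is 2 + 3 + 1 + 4 - 5 = 5 buffers that can still be handed out without
// eating into any client's reservation.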

Status BufferedBlockMgr2::register_client(int num_reserved_buffers,
                                          const std::shared_ptr<MemTracker>& tracker,
                                          RuntimeState* state, Client** client) {
    DCHECK_GE(num_reserved_buffers, 0);
    Client* a_client = new Client(this, num_reserved_buffers, tracker, state);
    lock_guard<mutex> lock(_lock);
    *client = _obj_pool.add(a_client);
    _unfullfilled_reserved_buffers += num_reserved_buffers;
    return Status::OK();
}

void BufferedBlockMgr2::clear_reservations(Client* client) {
    lock_guard<mutex> lock(_lock);
    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
        _unfullfilled_reserved_buffers -=
                client->_num_reserved_buffers - client->_num_pinned_buffers;
    }
    client->_num_reserved_buffers = 0;

    _unfullfilled_reserved_buffers -= client->_num_tmp_reserved_buffers;
    client->_num_tmp_reserved_buffers = 0;
}

bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buffers) {
    lock_guard<mutex> lock(_lock);
    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
    DCHECK_EQ(client->_num_tmp_reserved_buffers, 0);
    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
        // If client has unused reserved buffers, we use those first.
        num_buffers -= client->_num_reserved_buffers - client->_num_pinned_buffers;
    }
    if (num_buffers < 0) {
        return true;
    }
    if (available_buffers(client) < num_buffers) {
        return false;
    }

    client->_num_tmp_reserved_buffers = num_buffers;
    _unfullfilled_reserved_buffers += num_buffers;
    return true;
}
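
// Example of the accounting above (hypothetical numbers): a client that reserved 4
// buffers, has 1 pinned, and asks for 5 temporary buffers first nets out its 3 unused
// reserved buffers, so only 2 additional buffers must be available. If they are, the
// client's tmp reservation becomes 2 and _unfullfilled_reserved_buffers grows by 2.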

bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
    // Later, we will use this interface to manage the memory consumption of hash tables
    // instead of ReservationTracker. So it is possible to allocate 0 bytes, which has no
    // additional impact on the behavior of BufferedBlockMgr. The actual memory allocation
    // is still done by BufferPool, because BufferPool has done a lot of optimization of
    // memory allocation, which is better than using the new operator directly.
    if (size == 0) return true;
    // Workaround IMPALA-1619. Return immediately if the allocation size will cause
    // an arithmetic overflow.
    if (UNLIKELY(size >= (1LL << 31))) {
        LOG(WARNING) << "Trying to allocate memory >=2GB (" << size << ")B." << get_stack_trace();
        return false;
    }
    int buffers_needed = BitUtil::ceil(size, max_block_size());
    unique_lock<mutex> lock(_lock);
    if (size < max_block_size() && _mem_tracker->try_consume(size)) {
        // For small allocations (less than a block size), just let the allocation through.
        client->_tracker->consume(size);
        return true;
    }

    if (available_buffers(client) + client->_num_tmp_reserved_buffers < buffers_needed) {
        return false;
    }
    Status st = _mem_tracker->try_consume(size);
    WARN_IF_ERROR(st, "consume failed");
    if (st) {
        // There was still unallocated memory, don't need to recycle allocated blocks.
        client->_tracker->consume(size);
        return true;
    }

    // Bump up client->_num_tmp_reserved_buffers to satisfy this request. We don't want
    // another client to grab the buffer.
    int additional_tmp_reservations = 0;
    if (client->_num_tmp_reserved_buffers < buffers_needed) {
        additional_tmp_reservations = buffers_needed - client->_num_tmp_reserved_buffers;
        client->_num_tmp_reserved_buffers += additional_tmp_reservations;
        _unfullfilled_reserved_buffers += additional_tmp_reservations;
    }

    // Loop until we have freed enough memory.
    // We free all the memory at the end. We don't want another component to steal the
    // memory.
    int buffers_acquired = 0;
    do {
        BufferDescriptor* buffer_desc = nullptr;
        Status s = find_buffer(lock, &buffer_desc); // This waits on the lock.
        if (buffer_desc == nullptr) {
            break;
        }
        DCHECK(s.ok());
        _all_io_buffers.erase(buffer_desc->all_buffers_it);
        if (buffer_desc->block != nullptr) {
            buffer_desc->block->_buffer_desc = nullptr;
        }
        delete[] buffer_desc->buffer;
        ++buffers_acquired;
    } while (buffers_acquired != buffers_needed);

    Status status = Status::OK();
    if (buffers_acquired == buffers_needed) {
        status = write_unpinned_blocks();
    }
    // If we either couldn't acquire enough buffers or write_unpinned_blocks() failed, undo
    // the reservation.
    if (buffers_acquired != buffers_needed || !status.ok()) {
        if (!status.ok()) {
            VLOG_QUERY << "Query: " << _query_id << " write unpinned buffers failed.";
            client->_state->log_error(status);
        }
        client->_num_tmp_reserved_buffers -= additional_tmp_reservations;
        _unfullfilled_reserved_buffers -= additional_tmp_reservations;
        _mem_tracker->release(buffers_acquired * max_block_size());
        return false;
    }

    client->_num_tmp_reserved_buffers -= buffers_acquired;
    _unfullfilled_reserved_buffers -= buffers_acquired;

    DCHECK_GE(buffers_acquired * max_block_size(), size);
    _mem_tracker->release(buffers_acquired * max_block_size());
    st = _mem_tracker->try_consume(size);
    WARN_IF_ERROR(st, "consume failed");
    if (!st) {
        return false;
    }
    client->_tracker->consume(size);
    DCHECK(validate()) << endl << debug_internal();
    return true;
}
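
// A worked example for consume_memory() (hypothetical numbers): with an 8 MB
// max_block_size, a 20 MB request needs BitUtil::ceil(20MB, 8MB) = 3 buffers. If the
// tracker has no spare capacity, the manager temporarily reserves up to 3 buffers for
// the client, frees 3 existing io buffers (24 MB), and only then re-consumes the
// 20 MB against the tracker, so the freed memory cannot race with other clients.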

void BufferedBlockMgr2::release_memory(Client* client, int64_t size) {
    _mem_tracker->release(size);
    client->_tracker->release(size);
}

void BufferedBlockMgr2::cancel() {
    {
        lock_guard<mutex> lock(_lock);
        if (_is_cancelled) {
            return;
        }
        _is_cancelled = true;
    }
    // Cancel the underlying io mgr to unblock any waiting threads.
    _io_mgr->cancel_context(_io_request_context);
}

bool BufferedBlockMgr2::is_cancelled() {
    lock_guard<mutex> lock(_lock);
    return _is_cancelled;
}

Status BufferedBlockMgr2::mem_limit_too_low_error(Client* client, int node_id) {
    VLOG_QUERY << "Query: " << _query_id << ". Node=" << node_id << " ran out of memory: " << endl
               << debug_internal() << endl
               << client->debug_string();

    // TODO: what to print here. We can't know the value of the entire query here.
    stringstream error_msg;
    error_msg << "The memory limit is set too low to initialize spilling operator (id=" << node_id
              << "). The minimum required memory to spill this operator is "
              << PrettyPrinter::print(client->_num_reserved_buffers * max_block_size(),
                                      TUnit::BYTES)
              << ".";
    return add_exec_msg(error_msg.str());
}

Status BufferedBlockMgr2::add_exec_msg(const std::string& msg) const {
    stringstream str;
    str << msg << " ";
    str << "Backend: " << BackendOptions::get_localhost() << ", ";
    str << "fragment: " << print_id(_state->fragment_instance_id()) << " ";
    return Status::MemoryLimitExceeded(str.str());
}

Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Block** block,
                                        int64_t len) {
    DCHECK_LE(len, _max_block_size) << "Cannot request block bigger than max_len";
    DCHECK_NE(len, 0) << "Cannot request block of zero size";
    *block = nullptr;
    Block* new_block = nullptr;

    {
        lock_guard<mutex> lock(_lock);
        if (_is_cancelled) {
            return Status::Cancelled("Cancelled");
        }
        new_block = get_unused_block(client);
        DCHECK(new_block->validate()) << endl << new_block->debug_string();
        DCHECK_EQ(new_block->_client, client);

        if (len > 0 && len < _max_block_size) {
            DCHECK(unpin_block == nullptr);
            Status st = client->_tracker->try_consume(len);
            WARN_IF_ERROR(st, "get_new_block failed");
            if (st) {
                // TODO: Have a cache of unused blocks of size 'len' (0, _max_block_size)
                uint8_t* buffer = new uint8_t[len];
                // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
                new_block->_buffer_desc = new BufferDescriptor(buffer, len);
                new_block->_buffer_desc->block = new_block;
                new_block->_is_pinned = true;
                client->pin_buffer(new_block->_buffer_desc);
                ++_total_pinned_buffers;
                *block = new_block;
            } else {
                new_block->_is_deleted = true;
                return_unused_block(new_block);
            }
            return Status::OK();
        }
    }

    bool in_mem = true;
    RETURN_IF_ERROR(find_buffer_for_block(new_block, &in_mem));
    DCHECK(!in_mem) << "A new block cannot start in mem.";
    DCHECK(!new_block->is_pinned() || new_block->_buffer_desc != nullptr)
            << new_block->debug_string();

    if (!new_block->is_pinned()) {
        if (unpin_block == nullptr) {
            // We couldn't get a new block and no unpin block was provided. Can't return
            // a block.
            new_block->_is_deleted = true;
            return_unused_block(new_block);
            new_block = nullptr;
        } else {
            // We need to transfer the buffer from unpin_block to new_block.
            RETURN_IF_ERROR(transfer_buffer(new_block, unpin_block, true));
        }
    } else if (unpin_block != nullptr) {
        // Got a new block without needing to transfer. Just unpin this block.
        RETURN_IF_ERROR(unpin_block->unpin());
    }

    DCHECK(new_block == nullptr || new_block->is_pinned());
    *block = new_block;
    return Status::OK();
}
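
// Typical block lifecycle from a client's perspective (a hedged sketch; it assumes
// the default arguments of Block::pin() declared in the header, and elides the
// row-writing step whose API is not shown in this file):
//
//   BufferedBlockMgr2::Block* block = nullptr;
//   RETURN_IF_ERROR(block_mgr->get_new_block(client, nullptr, &block,
//           block_mgr->max_block_size()));
//   // ... fill block->buffer() and advance _valid_data_len ...
//   RETURN_IF_ERROR(block->unpin());      // now eligible to spill to scratch
//   bool pinned = false;
//   RETURN_IF_ERROR(block->pin(&pinned)); // re-read from the tmp file if evicted
//   block->del();                         // return the block to the manager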

Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) {
    Status status = Status::OK();
    DCHECK(dst != nullptr);
    DCHECK(src != nullptr);

    // First write out the src block.
    DCHECK(src->_is_pinned);
    DCHECK(!dst->_is_pinned);
    DCHECK(dst->_buffer_desc == nullptr);
    DCHECK_EQ(src->_buffer_desc->len, _max_block_size);
    src->_is_pinned = false;

    if (unpin) {
        unique_lock<mutex> lock(_lock);
        src->_client_local = true;
        status = write_unpinned_block(src);
        if (!status.ok()) {
            // The transfer failed, return the buffer to src.
            src->_is_pinned = true;
            return status;
        }
        // Wait for the write to complete.
        while (src->_in_write && !_is_cancelled) {
            src->_write_complete_cv.wait(lock);
        }
        if (_is_cancelled) {
            // We can't be sure the write succeeded, so return the buffer to src.
            src->_is_pinned = true;
            return Status::Cancelled("Cancelled");
        }
        DCHECK(!src->_in_write);
    }
    // Assign the buffer to the new block.
    dst->_buffer_desc = src->_buffer_desc;
    dst->_buffer_desc->block = dst;
    src->_buffer_desc = nullptr;
    dst->_is_pinned = true;
    if (!unpin) {
        src->_is_deleted = true;
        return_unused_block(src);
    }
    return Status::OK();
}

BufferedBlockMgr2::~BufferedBlockMgr2() {
    {
        lock_guard<SpinLock> lock(_s_block_mgrs_lock);
        BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(_query_id);
        // IMPALA-2286: Another fragment may have called create() for this _query_id and
        // saw that this BufferedBlockMgr2 is being destructed. That fragment will
        // overwrite the map entry for _query_id, pointing it to a different
        // BufferedBlockMgr2 object. We should let that object's destructor remove the
        // entry. On the other hand, if the second BufferedBlockMgr2 is destructed before
        // this thread acquires the lock, then we'll remove the entry (because we can't
        // distinguish between the two expired pointers), and when the other
        // ~BufferedBlockMgr2() call occurs, it won't find an entry for this _query_id.
        if (it != _s_query_to_block_mgrs.end()) {
            std::shared_ptr<BufferedBlockMgr2> mgr = it->second.lock();
            if (mgr.get() == nullptr) {
                // The BufferedBlockMgr2 object referenced by this entry is being deconstructed.
                _s_query_to_block_mgrs.erase(it);
            } else {
                // The map references another (still valid) BufferedBlockMgr2.
                DCHECK_NE(mgr.get(), this);
            }
        }
    }

    if (_io_request_context != nullptr) {
        _io_mgr->unregister_context(_io_request_context);
    }

    // If there are any outstanding writes and we are here it means that when the
    // write_complete() callback gets executed it is going to access invalid memory.
    // See IMPALA-1890.
    DCHECK_EQ(_non_local_outstanding_writes, 0) << endl << debug_internal();
    // Delete tmp files.
    for (auto& file : _tmp_files) {
        file->remove();
    }
    _tmp_files.clear();

    // Free memory resources.
    for (BufferDescriptor* buffer : _all_io_buffers) {
        _mem_tracker->release(buffer->len);
        delete[] buffer->buffer;
    }
    _mem_tracker.reset();
}

int64_t BufferedBlockMgr2::bytes_allocated() const {
    return _mem_tracker->consumption();
}

int BufferedBlockMgr2::num_pinned_buffers(Client* client) const {
    return client->_num_pinned_buffers;
}

int BufferedBlockMgr2::num_reserved_buffers_remaining(Client* client) const {
    return std::max(client->_num_reserved_buffers - client->_num_pinned_buffers, 0);
}

std::shared_ptr<MemTracker> BufferedBlockMgr2::get_tracker(Client* client) const {
    return client->_tracker;
}

// TODO: It would be good if we had a sync primitive that supports is_mine() calls, see
// IMPALA-1884.
Status BufferedBlockMgr2::delete_or_unpin_block(Block* block, bool unpin) {
    if (block == nullptr) {
        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
    }
    if (unpin) {
        return block->unpin();
    } else {
        block->del();
        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
    }
}

Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_block, bool unpin) {
    DCHECK(block != nullptr);
    DCHECK(!block->_is_deleted);
    *pinned = false;
    if (block->_is_pinned) {
        *pinned = true;
        return delete_or_unpin_block(release_block, unpin);
    }

    bool in_mem = false;
    RETURN_IF_ERROR(find_buffer_for_block(block, &in_mem));
    *pinned = block->_is_pinned;

    // Block was not evicted or had no data, nothing left to do.
    if (in_mem || block->_valid_data_len == 0) {
        return delete_or_unpin_block(release_block, unpin);
    }

    if (!block->_is_pinned) {
        if (release_block == nullptr) {
            return Status::OK();
        }

        if (block->_buffer_desc != nullptr) {
            {
                lock_guard<mutex> lock(_lock);
                if (_free_io_buffers.contains(block->_buffer_desc)) {
                    DCHECK(!block->_is_pinned && !block->_in_write &&
                           !_unpinned_blocks.contains(block))
                            << endl
                            << block->debug_string();
                    _free_io_buffers.remove(block->_buffer_desc);
                } else if (_unpinned_blocks.contains(block)) {
                    _unpinned_blocks.remove(block);
                } else {
                    DCHECK(block->_in_write);
                }
                block->_is_pinned = true;
                *pinned = true;
                block->_client->pin_buffer(block->_buffer_desc);
                ++_total_pinned_buffers;
                RETURN_IF_ERROR(write_unpinned_blocks());
            }
            return delete_or_unpin_block(release_block, unpin);
        }

        RETURN_IF_ERROR(transfer_buffer(block, release_block, unpin));
        DCHECK(!release_block->_is_pinned);
        release_block = nullptr; // Handled by transfer.
        DCHECK(block->_is_pinned);
        *pinned = true;
    }

    // Read the block from disk if it was not in memory.
    DCHECK(block->_write_range != nullptr) << block->debug_string() << endl << release_block;
    SCOPED_TIMER(_disk_read_timer);
    // Create a ScanRange to perform the read.
    DiskIoMgr::ScanRange* scan_range = _obj_pool.add(new DiskIoMgr::ScanRange());
    scan_range->reset(nullptr, block->_write_range->file(), block->_write_range->len(),
                      block->_write_range->offset(), block->_write_range->disk_id(), false, block,
                      DiskIoMgr::ScanRange::NEVER_CACHE);
    vector<DiskIoMgr::ScanRange*> ranges(1, scan_range);
    RETURN_IF_ERROR(_io_mgr->add_scan_ranges(_io_request_context, ranges, true));

    // Read from the io mgr buffer into the block's assigned buffer.
    int64_t offset = 0;
    bool buffer_eosr = false;
    do {
        DiskIoMgr::BufferDescriptor* io_mgr_buffer;
        RETURN_IF_ERROR(scan_range->get_next(&io_mgr_buffer));
        memcpy(block->buffer() + offset, io_mgr_buffer->buffer(), io_mgr_buffer->len());
        offset += io_mgr_buffer->len();
        buffer_eosr = io_mgr_buffer->eosr();
        io_mgr_buffer->return_buffer();
    } while (!buffer_eosr);
    DCHECK_EQ(offset, block->_write_range->len());

    return delete_or_unpin_block(release_block, unpin);
}

Status BufferedBlockMgr2::unpin_block(Block* block) {
    DCHECK(!block->_is_deleted) << "Unpin for deleted block.";

    lock_guard<mutex> unpinned_lock(_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }
    DCHECK(block->validate()) << endl << block->debug_string();
    if (!block->_is_pinned) {
        return Status::OK();
    }
    DCHECK_EQ(block->_buffer_desc->len, _max_block_size) << "Can only unpin io blocks.";
    DCHECK(validate()) << endl << debug_internal();
    // Add 'block' to the list of unpinned blocks and set _is_pinned to false.
    // Cache its position in the list for later removal.
    block->_is_pinned = false;
    DCHECK(!_unpinned_blocks.contains(block)) << " Unpin for block in unpinned list";
    if (!block->_in_write) {
        _unpinned_blocks.enqueue(block);
    }
    block->_client->unpin_buffer(block->_buffer_desc);
    if (block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
        ++_unfullfilled_reserved_buffers;
    }
    --_total_pinned_buffers;
    RETURN_IF_ERROR(write_unpinned_blocks());
    DCHECK(validate()) << endl << debug_internal();
    DCHECK(block->validate()) << endl << block->debug_string();
    return Status::OK();
}

Status BufferedBlockMgr2::write_unpinned_blocks() {
    if (!_enable_spill) {
        return Status::OK();
    }

    // Assumes block manager lock is already taken.
    while (_non_local_outstanding_writes + _free_io_buffers.size() < _block_write_threshold &&
           !_unpinned_blocks.empty()) {
        // Pop a block from the back of the list (LIFO).
        Block* write_block = _unpinned_blocks.pop_back();
        write_block->_client_local = false;
        RETURN_IF_ERROR(write_unpinned_block(write_block));
        ++_non_local_outstanding_writes;
    }
    DCHECK(validate()) << endl << debug_internal();
    return Status::OK();
}
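
// A numeric illustration of the threshold above (hypothetical): with 2 active scratch
// devices, _block_write_threshold is 4. If 1 write is outstanding and 1 io buffer is
// free, the loop issues writes for up to 2 more unpinned blocks (LIFO, so the most
// recently unpinned blocks spill first), keeping roughly two writes in flight per disk.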

Status BufferedBlockMgr2::write_unpinned_block(Block* block) {
    // Assumes block manager lock is already taken.
    DCHECK(!block->_is_pinned) << block->debug_string();
    DCHECK(!block->_in_write) << block->debug_string();
    DCHECK_EQ(block->_buffer_desc->len, _max_block_size);

    if (block->_write_range == nullptr) {
        if (_tmp_files.empty()) {
            RETURN_IF_ERROR(init_tmp_files());
        }

        // First time the block is being persisted - need to allocate tmp file space.
        TmpFileMgr::File* tmp_file;
        int64_t file_offset;
        RETURN_IF_ERROR(allocate_scratch_space(_max_block_size, &tmp_file, &file_offset));
        int disk_id = tmp_file->disk_id();
        if (disk_id < 0) {
            // Assign a valid disk id to the write range if the tmp file was not assigned one.
            static unsigned int next_disk_id = 0;
            disk_id = ++next_disk_id;
        }
        disk_id %= _io_mgr->num_local_disks();
        DiskIoMgr::WriteRange::WriteDoneCallback callback = bind(
                mem_fn(&BufferedBlockMgr2::write_complete), this, block, std::placeholders::_1);
        block->_write_range = _obj_pool.add(
                new DiskIoMgr::WriteRange(tmp_file->path(), file_offset, disk_id, callback));
        block->_tmp_file = tmp_file;
    }

    uint8_t* outbuf = block->buffer();

    block->_write_range->set_data(outbuf, block->_valid_data_len);

    // Issue write through DiskIoMgr.
    RETURN_IF_ERROR(_io_mgr->add_write_range(_io_request_context, block->_write_range));
    block->_in_write = true;
    DCHECK(block->validate()) << endl << block->debug_string();
    _outstanding_writes_counter->update(1);
    _bytes_written_counter->update(block->_valid_data_len);
    ++_writes_issued;
    return Status::OK();
}

Status BufferedBlockMgr2::allocate_scratch_space(int64_t block_size, TmpFileMgr::File** tmp_file,
                                                 int64_t* file_offset) {
    // Assumes block manager lock is already taken.
    vector<std::string> errs;
    // Find the next physical file in round-robin order and create a write range for it.
    for (int attempt = 0; attempt < _tmp_files.size(); ++attempt) {
        *tmp_file = _tmp_files[_next_block_index].get();
        _next_block_index = (_next_block_index + 1) % _tmp_files.size();
        if ((*tmp_file)->is_blacklisted()) {
            continue;
        }
        Status status = (*tmp_file)->allocate_space(_max_block_size, file_offset);
        if (status.ok()) {
            return Status::OK();
        }
        // Log error and try other files if there was a problem. Problematic files will be
        // blacklisted so we will not repeatedly log the same error.
        LOG(WARNING) << "Error while allocating temporary file range: " << status.get_error_msg()
                     << ". Will try another temporary file.";
        errs.emplace_back(status.message().data, status.message().size);
    }
    Status err_status = Status::InternalError(
            "No usable temporary files: space could not be allocated on any temporary device.");
    for (int i = 0; i < errs.size(); ++i) {
        err_status = err_status.clone_and_append(errs[i]);
    }
    return err_status;
}
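
// Round-robin example (hypothetical): with three tmp files and _next_block_index == 1,
// successive allocations try files 1, 2, 0, 1, ... A blacklisted file is skipped but
// still consumes an attempt, so after _tmp_files.size() failed attempts the function
// gives up and returns an InternalError that aggregates the per-file errors.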

void BufferedBlockMgr2::write_complete(Block* block, const Status& write_status) {
    Status status = Status::OK();
    lock_guard<mutex> lock(_lock);
    _outstanding_writes_counter->update(-1);
    DCHECK(validate()) << endl << debug_internal();
    DCHECK(_is_cancelled || block->_in_write) << "write_complete() for block not in write." << endl
                                              << block->debug_string();
    if (!block->_client_local) {
        DCHECK_GT(_non_local_outstanding_writes, 0) << block->debug_string();
        --_non_local_outstanding_writes;
    }
    block->_in_write = false;

    // Explicitly release our temporarily allocated buffer here so that it doesn't
    // hang around needlessly.

    // return_unused_block() will clear the block, so save the client pointer.
    // We have to be careful while touching the state because it may have been cleaned up by
    // another thread.
    RuntimeState* state = block->_client->_state;
    // If the block was re-pinned when it was in the IOMgr queue, don't free it.
    if (block->_is_pinned) {
        // The number of outstanding writes has decreased but the number of free buffers
        // hasn't.
        DCHECK(!block->_client_local)
                << "Client should be waiting. No one should have pinned this block.";
        if (write_status.ok() && !_is_cancelled && !state->is_cancelled()) {
            status = write_unpinned_blocks();
        }
    } else if (block->_client_local) {
        DCHECK(!block->_is_deleted)
                << "Client should be waiting. No one should have deleted this block.";
        block->_write_complete_cv.notify_one();
    } else {
        DCHECK_EQ(block->_buffer_desc->len, _max_block_size)
                << "Only io sized buffers should spill";
        _free_io_buffers.enqueue(block->_buffer_desc);
        // Finish the delete_block() work.
        if (block->_is_deleted) {
            block->_buffer_desc->block = nullptr;
            block->_buffer_desc = nullptr;
            return_unused_block(block);
        }
        // Multiple threads may be waiting for the same block in find_buffer(). Wake them
        // all up. One thread will get this block, and the others will re-evaluate whether
        // they should continue waiting and if another write needs to be initiated.
        _buffer_available_cv.notify_all();
    }
    DCHECK(validate()) << endl << debug_internal();

    if (!write_status.ok() || !status.ok() || _is_cancelled) {
        VLOG_FILE << "Query: " << _query_id
                  << ". Write did not complete successfully: "
                     "write_status="
                  << write_status.get_error_msg() << ", status=" << status.get_error_msg()
                  << ". _is_cancelled=" << _is_cancelled;

        // If the instance is already cancelled, don't confuse things with these errors.
        if (!write_status.is_cancelled() && !state->is_cancelled()) {
            if (!write_status.ok()) {
                // Report but do not attempt to recover from write error.
                DCHECK(block->_tmp_file != nullptr);
                block->_tmp_file->report_io_error(write_status.get_error_msg());
                VLOG_QUERY << "Query: " << _query_id << " write complete callback with error.";
                state->log_error(write_status.get_error_msg());
            }
            if (!status.ok()) {
                VLOG_QUERY << "Query: " << _query_id << " error while writing unpinned blocks.";
                state->log_error(status.get_error_msg());
            }
        }
        // Set cancelled and wake up waiting threads if an error occurred. Note that in
        // the case of _client_local, that thread was woken up above.
        _is_cancelled = true;
        _buffer_available_cv.notify_all();
    }
}

void BufferedBlockMgr2::delete_block(Block* block) {
    DCHECK(!block->_is_deleted);

    lock_guard<mutex> lock(_lock);
    DCHECK(block->validate()) << endl << debug_internal();
    block->_is_deleted = true;

    if (block->_is_pinned) {
        if (block->is_max_size()) {
            --_total_pinned_buffers;
        }
        block->_client->unpin_buffer(block->_buffer_desc);
        // Only if the block is io-sized do we need to change _unfullfilled_reserved_buffers.
        if (block->is_max_size() &&
            block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
            ++_unfullfilled_reserved_buffers;
        }
        block->_is_pinned = false;
    } else if (_unpinned_blocks.contains(block)) {
        // Remove block from unpinned list.
        _unpinned_blocks.remove(block);
    }

    if (block->_in_write) {
        DCHECK(block->_buffer_desc != nullptr && block->_buffer_desc->len == _max_block_size)
                << "Should never be writing a small buffer";
        // If a write is still pending, return. Cleanup will be done in write_complete().
        DCHECK(block->validate()) << endl << block->debug_string();
        return;
    }

    if (block->_buffer_desc != nullptr) {
        if (block->_buffer_desc->len != _max_block_size) {
            // Just delete the block for now.
            delete[] block->_buffer_desc->buffer;
            block->_client->_tracker->release(block->_buffer_desc->len);
            delete block->_buffer_desc;
            block->_buffer_desc = nullptr;
        } else {
            if (!_free_io_buffers.contains(block->_buffer_desc)) {
                _free_io_buffers.enqueue(block->_buffer_desc);
                _buffer_available_cv.notify_one();
            }
            block->_buffer_desc->block = nullptr;
            block->_buffer_desc = nullptr;
        }
    }
    return_unused_block(block);
    DCHECK(block->validate()) << endl << block->debug_string();
    DCHECK(validate()) << endl << debug_internal();
}

void BufferedBlockMgr2::return_unused_block(Block* block) {
    DCHECK(block->_is_deleted) << block->debug_string();
    DCHECK(!block->_is_pinned) << block->debug_string();
    DCHECK(block->_buffer_desc == nullptr);
    block->init();
    _unused_blocks.enqueue(block);
}

Status BufferedBlockMgr2::find_buffer_for_block(Block* block, bool* in_mem) {
    DCHECK(block != nullptr);
    Client* client = block->_client;
    DCHECK(client != nullptr);
    DCHECK(!block->_is_pinned && !block->_is_deleted) << "Pinned or deleted block " << endl
                                                      << block->debug_string();
    *in_mem = false;

    unique_lock<mutex> l(_lock);
    if (_is_cancelled) {
        return Status::Cancelled("Cancelled");
    }

    // First check if there is enough reserved memory to satisfy this request.
    bool is_reserved_request = false;
    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
        is_reserved_request = true;
    } else if (client->_num_tmp_reserved_buffers > 0) {
        is_reserved_request = true;
        --client->_num_tmp_reserved_buffers;
    }

    DCHECK(validate()) << endl << debug_internal();
    if (is_reserved_request) {
        --_unfullfilled_reserved_buffers;
    }

    if (!is_reserved_request && remaining_unreserved_buffers() < 1) {
        // The client already has its quota and there are no unreserved blocks left.
        // Note that even if this passes, it is still possible for the path below to
        // see OOM because another query consumed memory from the process tracker. This
        // only happens if the buffer has not already been allocated by the block mgr.
        // This check should ensure that the memory cannot be consumed by another client
        // of the block mgr.
        return Status::OK();
    }

    if (block->_buffer_desc != nullptr) {
        // The block is in memory. It may be in 3 states:
        // 1. In the unpinned list. The buffer will not be in the free list.
        // 2. _in_write == true. The buffer will not be in the free list.
        // 3. The buffer is free, but hasn't yet been reassigned to a different block.
        DCHECK_EQ(block->_buffer_desc->len, max_block_size()) << "Non-I/O blocks are always pinned";
        DCHECK(_unpinned_blocks.contains(block) || block->_in_write ||
               _free_io_buffers.contains(block->_buffer_desc));
        if (_unpinned_blocks.contains(block)) {
            _unpinned_blocks.remove(block);
            DCHECK(!_free_io_buffers.contains(block->_buffer_desc));
        } else if (block->_in_write) {
            DCHECK(block->_in_write && !_free_io_buffers.contains(block->_buffer_desc));
        } else {
            _free_io_buffers.remove(block->_buffer_desc);
        }
        _buffered_pin_counter->update(1);
        *in_mem = true;
    } else {
        BufferDescriptor* buffer_desc = nullptr;
        RETURN_IF_ERROR(find_buffer(l, &buffer_desc));

        if (buffer_desc == nullptr) {
            // There are no free buffers or blocks we can evict. We need to fail this request.
            // If this is an optional request, return OK. If it is required, return OOM.
            if (!is_reserved_request) {
                return Status::OK();
            }

            if (VLOG_QUERY_IS_ON) {
                stringstream ss;
                ss << "Query id=" << _query_id << " was unable to get minimum required buffers."
                   << endl
                   << debug_internal() << endl
                   << client->debug_string();
                VLOG_QUERY << ss.str();
            }
            return add_exec_msg(
                    "Query did not have enough memory to get the minimum required "
                    "buffers in the block manager.");
        }

        DCHECK(buffer_desc != nullptr);
        DCHECK_EQ(buffer_desc->len, max_block_size()) << "Non-I/O buffer";
        if (buffer_desc->block != nullptr) {
            // This buffer was assigned to a block but now we are reusing it. Reset the
            // previous block->buffer link.
            DCHECK(buffer_desc->block->validate()) << endl << buffer_desc->block->debug_string();
            buffer_desc->block->_buffer_desc = nullptr;
        }
        buffer_desc->block = block;
        block->_buffer_desc = buffer_desc;
    }
    DCHECK(block->_buffer_desc != nullptr);
    DCHECK(block->_buffer_desc->len < max_block_size() || !block->_is_pinned)
            << "Trying to pin already pinned block. " << block->_buffer_desc->len << " "
            << block->_is_pinned;
    block->_is_pinned = true;
    client->pin_buffer(block->_buffer_desc);
    ++_total_pinned_buffers;

    DCHECK(block->validate()) << endl << block->debug_string();
    // The number of free buffers has decreased. Write unpinned blocks if the number
    // of free buffers falls below the threshold.
    RETURN_IF_ERROR(write_unpinned_blocks());
    DCHECK(validate()) << endl << debug_internal();
    return Status::OK();
}

// We need to find a new buffer. We prefer getting this buffer in this order:
// 1. Allocate a new buffer if the number of free buffers is less than the write
//    threshold, until we run out of memory.
// 2. Pick a buffer from the free list.
// 3. Wait and evict an unpinned buffer.
Status BufferedBlockMgr2::find_buffer(unique_lock<mutex>& lock, BufferDescriptor** buffer_desc) {
    *buffer_desc = nullptr;

    // First, try to allocate a new buffer.
    if (_free_io_buffers.size() < _block_write_threshold &&
        _mem_tracker->try_consume(_max_block_size)) {
        uint8_t* new_buffer = new uint8_t[_max_block_size];
        *buffer_desc = _obj_pool.add(new BufferDescriptor(new_buffer, _max_block_size));
        (*buffer_desc)->all_buffers_it =
                _all_io_buffers.insert(_all_io_buffers.end(), *buffer_desc);
        return Status::OK();
    }

    // Second, try to pick a buffer from the free list.
    if (_free_io_buffers.empty()) {
        // There are no free buffers. If spilling is disabled or there are no unpinned
        // blocks we can write, return. We can't get a buffer.
        if (!_enable_spill) {
            return add_exec_msg(
                    "Spilling has been disabled for this plan, "
                    "and current memory usage has reached the bottleneck. "
                    "You can avoid this behavior by increasing the memory limit "
                    "via the session variable exec_mem_limit, or by enabling "
                    "spilling via enable_spilling.");
        }

        // Third, this block needs to use a buffer that was unpinned from another block.
        // Get a free buffer from the front of the queue and assign it to the block.
        do {
            if (_unpinned_blocks.empty() && _non_local_outstanding_writes == 0) {
                return Status::OK();
            }
            SCOPED_TIMER(_buffer_wait_timer);
            // Try to evict unpinned blocks before waiting.
            RETURN_IF_ERROR(write_unpinned_blocks());
            DCHECK_GT(_non_local_outstanding_writes, 0) << endl << debug_internal();
            _buffer_available_cv.wait(lock);
            if (_is_cancelled) {
                return Status::Cancelled("Cancelled");
            }
        } while (_free_io_buffers.empty());
    }
    *buffer_desc = _free_io_buffers.dequeue();
    return Status::OK();
}

BufferedBlockMgr2::Block* BufferedBlockMgr2::get_unused_block(Client* client) {
    DCHECK(client != nullptr);
    Block* new_block = nullptr;
    if (_unused_blocks.empty()) {
        new_block = _obj_pool.add(new Block(this));
        new_block->init();
        _created_block_counter->update(1);
    } else {
        new_block = _unused_blocks.dequeue();
        _recycled_blocks_counter->update(1);
    }
    DCHECK(new_block != nullptr);
    new_block->_client = client;
    return new_block;
}

bool BufferedBlockMgr2::validate() const {
    int num_free_io_buffers = 0;

    if (_total_pinned_buffers < 0) {
        LOG(ERROR) << "_total_pinned_buffers < 0: " << _total_pinned_buffers;
        return false;
    }

    for (BufferDescriptor* buffer : _all_io_buffers) {
        bool is_free = _free_io_buffers.contains(buffer);
        num_free_io_buffers += is_free;

        if (*buffer->all_buffers_it != buffer) {
            LOG(ERROR) << "All buffers list is corrupt. Buffer iterator is not valid.";
            return false;
        }

        if (buffer->block == nullptr && !is_free) {
            LOG(ERROR) << "Buffer with no block not in free list." << endl << debug_internal();
            return false;
        }

        if (buffer->len != _max_block_size) {
            LOG(ERROR) << "Non-io sized buffers should not end up on free list.";
            return false;
        }

        if (buffer->block != nullptr) {
            if (buffer->block->_buffer_desc != buffer) {
                LOG(ERROR) << "buffer<->block pointers inconsistent. Buffer: " << buffer << endl
                           << buffer->block->debug_string();
                return false;
            }

            if (!buffer->block->validate()) {
                LOG(ERROR) << "buffer->block inconsistent." << endl
                           << buffer->block->debug_string();
                return false;
            }

            if (is_free && (buffer->block->_is_pinned || buffer->block->_in_write ||
                            _unpinned_blocks.contains(buffer->block))) {
                LOG(ERROR) << "Block with buffer in free list and"
                           << " _is_pinned = " << buffer->block->_is_pinned
                           << " _in_write = " << buffer->block->_in_write
                           << " _unpinned_blocks.contains = "
                           << _unpinned_blocks.contains(buffer->block) << endl
                           << buffer->block->debug_string();
                return false;
            }
        }
    }

    if (_free_io_buffers.size() != num_free_io_buffers) {
        LOG(ERROR) << "_free_buffer_list inconsistency."
                   << " num_free_io_buffers = " << num_free_io_buffers
                   << " _free_io_buffers.size() = " << _free_io_buffers.size() << endl
                   << debug_internal();
        return false;
    }

    Block* block = _unpinned_blocks.head();
    while (block != nullptr) {
        if (!block->validate()) {
            LOG(ERROR) << "Block inconsistent in unpinned list." << endl << block->debug_string();
            return false;
        }

        if (block->_in_write || _free_io_buffers.contains(block->_buffer_desc)) {
            LOG(ERROR) << "Block in unpinned list with"
                       << " _in_write = " << block->_in_write << " _free_io_buffers.contains = "
                       << _free_io_buffers.contains(block->_buffer_desc) << endl
                       << block->debug_string();
            return false;
        }
        block = block->next();
    }

    // Check that we're writing blocks when the number of free buffers falls below the
    // threshold. We don't write blocks after cancellation.
    if (!_is_cancelled && !_unpinned_blocks.empty() && _enable_spill &&
        (_free_io_buffers.size() + _non_local_outstanding_writes < _block_write_threshold)) {
        // TODO: this isn't correct when write_unpinned_blocks() fails during the call to
        // write_unpinned_block(), so just log the condition but don't return false. Figure
        // out a way to re-enable this check.
        LOG(ERROR) << "Missed writing unpinned blocks";
    }
    return true;
}

string BufferedBlockMgr2::debug_string(Client* client) {
    stringstream ss;
    unique_lock<mutex> l(_lock);
    ss << debug_internal();
    if (client != nullptr) {
        ss << endl << client->debug_string();
    }
    return ss.str();
}

string BufferedBlockMgr2::debug_internal() const {
    stringstream ss;
    ss << "Buffered block mgr" << endl
       << " Num writes outstanding: " << _outstanding_writes_counter->value() << endl
       << " Num free io buffers: " << _free_io_buffers.size() << endl
       << " Num unpinned blocks: " << _unpinned_blocks.size() << endl
       << " Num available buffers: " << remaining_unreserved_buffers() << endl
       << " Total pinned buffers: " << _total_pinned_buffers << endl
       << " Unfullfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl
       << " Remaining memory: " << _mem_tracker->spare_capacity()
       << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" << endl
       << " Block write threshold: " << _block_write_threshold;
    return ss.str();
}

void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile, int64_t mem_limit) {
    unique_lock<mutex> l(_lock);
    if (_initialized) {
        return;
    }

    io_mgr->register_context(&_io_request_context);

    _profile.reset(new RuntimeProfile("BlockMgr"));
    parent_profile->add_child(_profile.get(), true, nullptr);

    _block_size_counter = ADD_COUNTER(_profile.get(), "MaxBlockSize", TUnit::BYTES);
    _block_size_counter->set(_max_block_size);
    _created_block_counter = ADD_COUNTER(_profile.get(), "BlocksCreated", TUnit::UNIT);
    _recycled_blocks_counter = ADD_COUNTER(_profile.get(), "BlocksRecycled", TUnit::UNIT);
    _bytes_written_counter = ADD_COUNTER(_profile.get(), "BytesWritten", TUnit::BYTES);
    _outstanding_writes_counter =
            ADD_COUNTER(_profile.get(), "BlockWritesOutstanding", TUnit::UNIT);
    _buffered_pin_counter = ADD_COUNTER(_profile.get(), "BufferedPins", TUnit::UNIT);
    _disk_read_timer = ADD_TIMER(_profile.get(), "TotalReadBlockTime");
    _buffer_wait_timer = ADD_TIMER(_profile.get(), "TotalBufferWaitTime");
    _encryption_timer = ADD_TIMER(_profile.get(), "TotalEncryptionTime");
    _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime");

    // Create a new mem_tracker and allocate buffers.
    _mem_tracker = MemTracker::create_virtual_tracker(mem_limit, "BufferedBlockMgr2");

    _initialized = true;
}

Status BufferedBlockMgr2::init_tmp_files() {
    DCHECK(_tmp_files.empty());
    DCHECK(_tmp_file_mgr != nullptr);

    vector<TmpFileMgr::DeviceId> tmp_devices = _tmp_file_mgr->active_tmp_devices();
    // Initialize the tmp files and the initial file to use.
    _tmp_files.reserve(tmp_devices.size());
    for (int i = 0; i < tmp_devices.size(); ++i) {
        TmpFileMgr::File* tmp_file;
        TmpFileMgr::DeviceId tmp_device_id = tmp_devices[i];
        // It is possible for a device to be blacklisted after it was returned
        // by active_tmp_devices() - handle this gracefully.
        Status status = _tmp_file_mgr->get_file(tmp_device_id, _query_id, &tmp_file);
        if (status.ok()) {
            _tmp_files.emplace_back(tmp_file);
        }
    }
    if (_tmp_files.empty()) {
        return Status::InternalError(
                "No spilling directories configured. Cannot spill. Set --scratch_dirs"
                " or see log for previous errors that prevented use of provided directories");
    }
    _next_block_index = rand() % _tmp_files.size();
    return Status::OK();
}

} // namespace doris