// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "runtime/buffered_tuple_stream2.h"
|
|
|
|
#include "runtime/descriptors.h"
|
|
#include "runtime/row_batch.h"
|
|
#include "runtime/tuple_row.h"
|
|
#include "util/bit_util.h"
|
|
#include "util/debug_util.h"
|
|
#include "util/pretty_printer.h"
|
|
|
|
using std::stringstream;
|
|
using std::string;
|
|
using std::vector;
|
|
using std::list;
|
|
|
|
using std::unique_ptr;
|
|
|
|
namespace doris {

// The first NUM_SMALL_BLOCKS of the tuple stream are made of blocks less than the
// IO size. These blocks never spill.
// TODO: Consider adding a 4MB in-memory buffer that would split the gap between the
// 512KB in-memory buffer and the 8MB (IO-sized) spillable buffer.
static const int64_t INITIAL_BLOCK_SIZES[] = {64 * 1024, 512 * 1024};
static const int NUM_SMALL_BLOCKS = sizeof(INITIAL_BLOCK_SIZES) / sizeof(int64_t);

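// Typical usage (illustrative sketch only, not taken from any real caller; real callers
// normally go through higher-level add-row helpers and this ignores access control).
// Only functions defined in this file are referenced: the stream is init()'d, rows are
// appended with deep_copy(), growing via new_block_for_write() when the current write
// block is full, then read back with prepare_for_read() followed by repeated get_next()
// calls, and finally close()'d.
//
//   BufferedTupleStream2 stream(state, row_desc, block_mgr, client,
//                               true /* use_initial_small_buffers */, false /* read_write */);
//   RETURN_IF_ERROR(stream.init(node_id, profile, true /* pinned */));
//   if (!stream.deep_copy(row)) {
//       bool got_block = false;
//       RETURN_IF_ERROR(stream.new_block_for_write(stream.compute_row_size(row), &got_block));
//       if (got_block) stream.deep_copy(row);
//   }
//   RETURN_IF_ERROR(stream.prepare_for_read(true /* delete_on_read */, nullptr));
//   bool eos = false;
//   while (!eos) RETURN_IF_ERROR(stream.get_next(batch.get(), &eos));
//   stream.close();
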
string BufferedTupleStream2::RowIdx::debug_string() const {
    stringstream ss;
    ss << "RowIdx block=" << block() << " offset=" << offset() << " idx=" << idx();
    return ss.str();
}

BufferedTupleStream2::BufferedTupleStream2(RuntimeState* state, const RowDescriptor& row_desc,
                                           BufferedBlockMgr2* block_mgr,
                                           BufferedBlockMgr2::Client* client,
                                           bool use_initial_small_buffers, bool read_write)
        : _use_small_buffers(use_initial_small_buffers),
          _delete_on_read(false),
          _read_write(read_write),
          _state(state),
          _desc(row_desc),
          _nullable_tuple(row_desc.is_any_tuple_nullable()),
          _block_mgr(block_mgr),
          _block_mgr_client(client),
          _total_byte_size(0),
          _read_ptr(nullptr),
          _read_tuple_idx(0),
          _read_bytes(0),
          _rows_returned(0),
          _read_block_idx(-1),
          _write_block(nullptr),
          _num_pinned(0),
          _num_small_blocks(0),
          _closed(false),
          _num_rows(0),
          _pinned(true),
          _pin_timer(nullptr),
          _unpin_timer(nullptr),
          _get_new_block_timer(nullptr) {
    _null_indicators_read_block = _null_indicators_write_block = -1;
    _read_block = _blocks.end();
    _fixed_tuple_row_size = 0;
    for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) {
        const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i];
        const int tuple_byte_size = tuple_desc->byte_size();
        _fixed_tuple_row_size += tuple_byte_size;
        if (!tuple_desc->string_slots().empty()) {
            _string_slots.push_back(make_pair(i, tuple_desc->string_slots()));
        }
        // if (!tuple_desc->collection_slots().empty()) {
        //     _collection_slots.push_back(make_pair(i, tuple_desc->collection_slots()));
        // }
    }
}

// Returns the number of pinned blocks in the list.
// Only called in DCHECKs to validate _num_pinned.
int num_pinned(const list<BufferedBlockMgr2::Block*>& blocks) {
    int num_pinned = 0;
    for (list<BufferedBlockMgr2::Block*>::const_iterator it = blocks.begin(); it != blocks.end();
         ++it) {
        if ((*it)->is_pinned() && (*it)->is_max_size()) {
            ++num_pinned;
        }
    }
    return num_pinned;
}

string BufferedTupleStream2::debug_string() const {
    stringstream ss;
    ss << "BufferedTupleStream2 num_rows=" << _num_rows << " rows_returned=" << _rows_returned
       << " pinned=" << (_pinned ? "true" : "false")
       << " delete_on_read=" << (_delete_on_read ? "true" : "false")
       << " closed=" << (_closed ? "true" : "false") << " num_pinned=" << _num_pinned
       << " write_block=" << _write_block << " _read_block=";
    if (_read_block == _blocks.end()) {
        ss << "<end>";
    } else {
        ss << *_read_block;
    }
    ss << " blocks=[\n";
    for (list<BufferedBlockMgr2::Block*>::const_iterator it = _blocks.begin(); it != _blocks.end();
         ++it) {
        ss << "{" << (*it)->debug_string() << "}";
        if (*it != _blocks.back()) {
            ss << ",\n";
        }
    }
    ss << "]";
    return ss.str();
}

Status BufferedTupleStream2::init(int node_id, RuntimeProfile* profile, bool pinned) {
    if (profile != nullptr) {
        _pin_timer = ADD_TIMER(profile, "PinTime");
        _unpin_timer = ADD_TIMER(profile, "UnpinTime");
        _get_new_block_timer = ADD_TIMER(profile, "GetNewBlockTime");
    }

    if (_block_mgr->max_block_size() < INITIAL_BLOCK_SIZES[0]) {
        _use_small_buffers = false;
    }

    bool got_block = false;
    RETURN_IF_ERROR(new_block_for_write(_fixed_tuple_row_size, &got_block));
    if (!got_block) {
        return _block_mgr->mem_limit_too_low_error(_block_mgr_client, node_id);
    }
    DCHECK(_write_block != nullptr);
    if (!pinned) {
        RETURN_IF_ERROR(unpin_stream());
    }
    return Status::OK();
}

Status BufferedTupleStream2::switch_to_io_buffers(bool* got_buffer) {
    if (!_use_small_buffers) {
        *got_buffer = (_write_block != nullptr);
        return Status::OK();
    }
    _use_small_buffers = false;
    Status status = new_block_for_write(_block_mgr->max_block_size(), got_buffer);
    // IMPALA-2330: If we failed to get an IO-sized buffer, flip the small-buffers flag
    // back on so the stream keeps using small buffers.
    DCHECK(status.ok() || !*got_buffer) << status.ok() << " " << *got_buffer;
    _use_small_buffers = !*got_buffer;
    return status;
}

void BufferedTupleStream2::close() {
    for (list<BufferedBlockMgr2::Block*>::iterator it = _blocks.begin(); it != _blocks.end();
         ++it) {
        (*it)->del();
    }
    _blocks.clear();
    _num_pinned = 0;
    DCHECK_EQ(_num_pinned, num_pinned(_blocks));
    _closed = true;
}

int64_t BufferedTupleStream2::bytes_in_mem(bool ignore_current) const {
    int64_t result = 0;
    for (list<BufferedBlockMgr2::Block*>::const_iterator it = _blocks.begin(); it != _blocks.end();
         ++it) {
        if (!(*it)->is_pinned()) {
            continue;
        }
        if (!(*it)->is_max_size()) {
            continue;
        }
        if (*it == _write_block && ignore_current) {
            continue;
        }
        result += (*it)->buffer_len();
    }
    return result;
}

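// Unpins 'block' if it is an IO-sized block and updates _num_pinned accordingly.
// Small blocks are left pinned: they never spill, so unpinning is a no-op for them.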
Status BufferedTupleStream2::unpin_block(BufferedBlockMgr2::Block* block) {
    SCOPED_TIMER(_unpin_timer);
    DCHECK(block->is_pinned());
    if (!block->is_max_size()) {
        return Status::OK();
    }
    RETURN_IF_ERROR(block->unpin());
    --_num_pinned;
    DCHECK_EQ(_num_pinned, num_pinned(_blocks));
    return Status::OK();
}

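// Gets a new block of at least 'min_size' bytes to write to, appends it to _blocks and
// makes it the current _write_block. The previous write block may be handed to the block
// manager to be unpinned (unless the stream is pinned, that block is still being read, or
// it is a small block). Sets *got_block to false, without error, if no block was obtained.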
Status BufferedTupleStream2::new_block_for_write(int64_t min_size, bool* got_block) {
    DCHECK(!_closed);
    *got_block = false;
    if (min_size > _block_mgr->max_block_size()) {
        std::stringstream error_msg;
        error_msg << "Cannot process row that is bigger than the IO size (row_size="
                  << PrettyPrinter::print(min_size, TUnit::BYTES)
                  << "). To run this query, increase the IO size (--read_size option).";
        return Status::InternalError(error_msg.str());
    }

    BufferedBlockMgr2::Block* unpin_block = _write_block;
    if (_write_block != nullptr) {
        DCHECK(_write_block->is_pinned());
        if (_pinned || _write_block == *_read_block || !_write_block->is_max_size()) {
            // In these cases, don't unpin the current write block.
            unpin_block = nullptr;
        }
    }

    int64_t block_len = _block_mgr->max_block_size();
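    // While the stream is still in small-buffer mode, try the next entry of
    // INITIAL_BLOCK_SIZES. If an IO-sized block would be required (because the row is too
    // large for the small sizes, or the small sizes are exhausted), return without a
    // block; the caller decides whether to call switch_to_io_buffers().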
    if (_use_small_buffers) {
        if (_blocks.size() < NUM_SMALL_BLOCKS) {
            block_len = std::min(block_len, INITIAL_BLOCK_SIZES[_blocks.size()]);
            if (block_len < min_size) {
                block_len = _block_mgr->max_block_size();
            }
        }
        if (block_len == _block_mgr->max_block_size()) {
            // Do not switch to IO-buffers automatically. Do not get a buffer.
            *got_block = false;
            return Status::OK();
        }
    }

    BufferedBlockMgr2::Block* new_block = nullptr;
    {
        SCOPED_TIMER(_get_new_block_timer);
        RETURN_IF_ERROR(
                _block_mgr->get_new_block(_block_mgr_client, unpin_block, &new_block, block_len));
    }
    *got_block = (new_block != nullptr);

    if (!*got_block) {
        DCHECK(unpin_block == nullptr);
        return Status::OK();
    }

    if (unpin_block != nullptr) {
        DCHECK(unpin_block == _write_block);
        DCHECK(!_write_block->is_pinned());
        --_num_pinned;
        DCHECK_EQ(_num_pinned, num_pinned(_blocks));
    }

    // Compute and allocate the block header with the null indicators.
    _null_indicators_write_block = compute_num_null_indicator_bytes(block_len);
    new_block->allocate<uint8_t>(_null_indicators_write_block);
    _write_tuple_idx = 0;

    _blocks.push_back(new_block);
    _block_start_idx.push_back(new_block->buffer());
    _write_block = new_block;
    DCHECK(_write_block->is_pinned());
    DCHECK_EQ(_write_block->num_rows(), 0);
    if (_write_block->is_max_size()) {
        ++_num_pinned;
        DCHECK_EQ(_num_pinned, num_pinned(_blocks));
    } else {
        ++_num_small_blocks;
    }
    _total_byte_size += block_len;
    return Status::OK();
}

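// Advances _read_block to the next block in the stream. Depending on _pinned and
// _delete_on_read, the block that was just finished is unpinned or deleted, the new
// block is pinned if it is not already, and _read_ptr is repositioned past the new
// block's null-indicator header.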
Status BufferedTupleStream2::next_block_for_read() {
    DCHECK(!_closed);
    DCHECK(_read_block != _blocks.end());
    DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << _pinned;

    // If non-nullptr, this will be the current block if we are going to free it while
    // grabbing the next block. This will stay nullptr if we don't want to free the
    // current block.
    BufferedBlockMgr2::Block* block_to_free =
            (!_pinned || _delete_on_read) ? *_read_block : nullptr;
    if (_delete_on_read) {
        // TODO: this is weird. We are deleting even if it is pinned. The analytic
        // eval node needs this.
        DCHECK(_read_block == _blocks.begin());
        DCHECK(*_read_block != _write_block);
        _blocks.pop_front();
        _read_block = _blocks.begin();
        _read_block_idx = 0;
        if (block_to_free != nullptr && !block_to_free->is_max_size()) {
            block_to_free->del();
            block_to_free = nullptr;
            DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string();
        }
    } else {
        ++_read_block;
        ++_read_block_idx;
        if (block_to_free != nullptr && !block_to_free->is_max_size()) {
            block_to_free = nullptr;
        }
    }

    _read_ptr = nullptr;
    _read_tuple_idx = 0;
    _read_bytes = 0;

    bool pinned = false;
    if (_read_block == _blocks.end() || (*_read_block)->is_pinned()) {
        // End of the blocks or already pinned, just handle block_to_free.
        if (block_to_free != nullptr) {
            SCOPED_TIMER(_unpin_timer);
            if (_delete_on_read) {
                block_to_free->del();
                --_num_pinned;
            } else {
                RETURN_IF_ERROR(unpin_block(block_to_free));
            }
        }
    } else {
        // Call into the block mgr to atomically unpin/delete the old block and pin the
        // new block.
        SCOPED_TIMER(_pin_timer);
        RETURN_IF_ERROR((*_read_block)->pin(&pinned, block_to_free, !_delete_on_read));
        if (!pinned) {
            DCHECK(block_to_free == nullptr) << "Should have been able to pin." << std::endl
                                             << _block_mgr->debug_string(_block_mgr_client);
        }
        if (block_to_free == nullptr && pinned) {
            ++_num_pinned;
        }
    }

    if (_read_block != _blocks.end() && (*_read_block)->is_pinned()) {
        _null_indicators_read_block =
                compute_num_null_indicator_bytes((*_read_block)->buffer_len());
        _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block;
    }
    DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string();
    return Status::OK();
}

Status BufferedTupleStream2::prepare_for_read(bool delete_on_read, bool* got_buffer) {
    DCHECK(!_closed);
    if (_blocks.empty()) {
        return Status::OK();
    }

    if (!_read_write && _write_block != nullptr) {
        DCHECK(_write_block->is_pinned());
        if (!_pinned && _write_block != _blocks.front()) {
            RETURN_IF_ERROR(unpin_block(_write_block));
        }
        _write_block = nullptr;
    }

    // Walk the blocks and pin the first IO-sized block (small buffers are always
    // pinned, so they never need to be pinned again).
    for (list<BufferedBlockMgr2::Block*>::iterator it = _blocks.begin(); it != _blocks.end();
         ++it) {
        if (!(*it)->is_pinned()) {
            SCOPED_TIMER(_pin_timer);
            bool current_pinned = false;
            RETURN_IF_ERROR((*it)->pin(&current_pinned));
            if (!current_pinned) {
                DCHECK(got_buffer != nullptr) << "Should have reserved enough blocks";
                *got_buffer = false;
                return Status::OK();
            }
            ++_num_pinned;
            DCHECK_EQ(_num_pinned, num_pinned(_blocks));
        }
        if ((*it)->is_max_size()) {
            break;
        }
    }

    _read_block = _blocks.begin();
    DCHECK(_read_block != _blocks.end());
    _null_indicators_read_block = compute_num_null_indicator_bytes((*_read_block)->buffer_len());
    _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block;
    _read_tuple_idx = 0;
    _read_bytes = 0;
    _rows_returned = 0;
    _read_block_idx = 0;
    _delete_on_read = delete_on_read;
    if (got_buffer != nullptr) {
        *got_buffer = true;
    }
    return Status::OK();
}

Status BufferedTupleStream2::pin_stream(bool already_reserved, bool* pinned) {
    DCHECK(!_closed);
    DCHECK(pinned != nullptr);
    if (!already_reserved) {
        // If we can't get all the blocks, don't try at all.
        if (!_block_mgr->try_acquire_tmp_reservation(_block_mgr_client, blocks_unpinned())) {
            *pinned = false;
            return Status::OK();
        }
    }

    for (list<BufferedBlockMgr2::Block*>::iterator it = _blocks.begin(); it != _blocks.end();
         ++it) {
        if ((*it)->is_pinned()) {
            continue;
        }
        {
            SCOPED_TIMER(_pin_timer);
            RETURN_IF_ERROR((*it)->pin(pinned));
        }
        if (!*pinned) {
            VLOG_QUERY << "Should have been reserved." << std::endl
                       << _block_mgr->debug_string(_block_mgr_client);
            return Status::OK();
        }
        ++_num_pinned;
        DCHECK_EQ(_num_pinned, num_pinned(_blocks));
    }

    if (!_delete_on_read) {
        // Populate _block_start_idx on pin.
        DCHECK_EQ(_block_start_idx.size(), _blocks.size());
        _block_start_idx.clear();
        for (list<BufferedBlockMgr2::Block*>::iterator it = _blocks.begin(); it != _blocks.end();
             ++it) {
            _block_start_idx.push_back((*it)->buffer());
        }
    }
    *pinned = true;
    _pinned = true;
    return Status::OK();
}

Status BufferedTupleStream2::unpin_stream(bool all) {
    DCHECK(!_closed);
    SCOPED_TIMER(_unpin_timer);

    for (BufferedBlockMgr2::Block* block : _blocks) {
        if (!block->is_pinned()) {
            continue;
        }
        if (!all && (block == _write_block || (_read_write && block == *_read_block))) {
            continue;
        }
        RETURN_IF_ERROR(unpin_block(block));
    }
    if (all) {
        _read_block = _blocks.end();
        _write_block = nullptr;
    }
    _pinned = false;
    return Status::OK();
}

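// Worked example (hypothetical numbers, not taken from the code): with nullable tuples,
// tuples_per_row = 2 and _fixed_tuple_row_size = 16 bytes in an 8 MB block,
// min_row_size_in_bits = 8 * 16 + 2 = 130 and block_size_in_bits = 67108864, so
// max_num_rows = 516222 and the header reserves
// round_up_numi64(516222 * 2) * 8 = 16132 * 8 = 129056 bytes (~126 KB) for null
// indicators, one bit per tuple slot.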
int BufferedTupleStream2::compute_num_null_indicator_bytes(int block_size) const {
    if (_nullable_tuple) {
        // We assume that all rows will use their max size, so we may be underutilizing
        // the space, i.e. we may have some unused space in case of rows with null tuples.
        const uint32_t tuples_per_row = _desc.tuple_descriptors().size();
        const uint32_t min_row_size_in_bits = 8 * _fixed_tuple_row_size + tuples_per_row;
        const uint32_t block_size_in_bits = 8 * block_size;
        const uint32_t max_num_rows = block_size_in_bits / min_row_size_in_bits;
        return BitUtil::round_up_numi64(max_num_rows * tuples_per_row) * 8;
    } else {
        // If there are no nullable tuples, there is no need to reserve space for
        // null indicators.
        return 0;
    }
}

Status BufferedTupleStream2::get_rows(unique_ptr<RowBatch>* batch, bool* got_rows) {
    RETURN_IF_ERROR(pin_stream(false, got_rows));
    if (!*got_rows) {
        return Status::OK();
    }
    RETURN_IF_ERROR(prepare_for_read(false));
    batch->reset(new RowBatch(_desc, num_rows()));
    bool eos = false;
    // Loop until get_next fills the entire batch. Each call can stop at block
    // boundaries. We generally want it to stop, so that blocks can be freed
    // as we read. It is safe in this case because we pin the entire stream.
    while (!eos) {
        RETURN_IF_ERROR(get_next(batch->get(), &eos));
    }
    return Status::OK();
}

Status BufferedTupleStream2::get_next(RowBatch* batch, bool* eos, vector<RowIdx>* indices) {
    if (_nullable_tuple) {
        return get_next_internal<true>(batch, eos, indices);
    } else {
        return get_next_internal<false>(batch, eos, indices);
    }
}

template <bool HasNullableTuple>
Status BufferedTupleStream2::get_next_internal(RowBatch* batch, bool* eos,
                                               vector<RowIdx>* indices) {
    DCHECK(!_closed);
    DCHECK(batch->row_desc().equals(_desc));
    *eos = (_rows_returned == _num_rows);
    if (*eos) {
        return Status::OK();
    }
    DCHECK_GE(_null_indicators_read_block, 0);

    const uint64_t tuples_per_row = _desc.tuple_descriptors().size();
    DCHECK_LE(_read_tuple_idx / tuples_per_row, (*_read_block)->num_rows());
    DCHECK_EQ(_read_tuple_idx % tuples_per_row, 0);
    int rows_returned_curr_block = _read_tuple_idx / tuples_per_row;

    int64_t data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block;
    if (UNLIKELY(rows_returned_curr_block == (*_read_block)->num_rows())) {
        // Get the next block in the stream. We need to do this at the beginning of
        // the get_next() call to ensure the buffer management semantics. next_block_for_read()
        // will recycle the memory for the rows returned from the *previous* call to
        // get_next().
        RETURN_IF_ERROR(next_block_for_read());
        DCHECK(_read_block != _blocks.end()) << debug_string();
        DCHECK_GE(_null_indicators_read_block, 0);
        data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block;
        rows_returned_curr_block = 0;
    }

    DCHECK(_read_block != _blocks.end());
    DCHECK((*_read_block)->is_pinned()) << debug_string();
    DCHECK(_read_ptr != nullptr);

    int64_t rows_left = _num_rows - _rows_returned;
    int rows_to_fill =
            std::min(static_cast<int64_t>(batch->capacity() - batch->num_rows()), rows_left);
    DCHECK_GE(rows_to_fill, 1);
    batch->add_rows(rows_to_fill);
    uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->get_row(batch->num_rows()));

    // Produce tuple rows from the current block and the corresponding position on the
    // null tuple indicator.
    vector<RowIdx> local_indices;
    if (indices == nullptr) {
        // A hack so that we do not need to check whether 'indices' is not null in the
        // tight loop.
        indices = &local_indices;
    } else {
        DCHECK(is_pinned());
        DCHECK(!_delete_on_read);
        DCHECK_EQ(batch->num_rows(), 0);
        indices->clear();
    }
    indices->reserve(rows_to_fill);

    int i = 0;
    uint8_t* null_word = nullptr;
    uint32_t null_pos = 0;
    // Start reading from position _read_tuple_idx in the block.
    uint64_t last_read_ptr = 0;
    // IMPALA-2256: Special case if there are no materialized slots.
    bool increment_row = has_tuple_footprint();
    uint64_t last_read_row = increment_row * (_read_tuple_idx / tuples_per_row);
    while (i < rows_to_fill) {
        // Check if current block is done.
        if (UNLIKELY(rows_returned_curr_block + i == (*_read_block)->num_rows())) {
            break;
        }

        // Copy the row into the output batch.
        TupleRow* row = reinterpret_cast<TupleRow*>(tuple_row_mem);
        last_read_ptr = reinterpret_cast<uint64_t>(_read_ptr);
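        // Record where this row starts so callers can fetch it again by RowIdx: the block
        // index, the byte offset within the block (including the null-indicator header),
        // and the row's ordinal position in the block.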
        indices->push_back(RowIdx());
        DCHECK_EQ(indices->size(), i + 1);
        (*indices)[i].set(_read_block_idx, _read_bytes + _null_indicators_read_block,
                          last_read_row);
        if (HasNullableTuple) {
            for (int j = 0; j < tuples_per_row; ++j) {
                // Stitch together the tuples from the block and the null ones.
                null_word = (*_read_block)->buffer() + (_read_tuple_idx >> 3);
                null_pos = _read_tuple_idx & 7;
                ++_read_tuple_idx;
                const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
                // Copy the tuple pointer and advance _read_ptr. If it is a null tuple,
                // set_tuple() is called with a Tuple* of 0x0; we achieve that by
                // multiplying the current _read_ptr by is_not_null (0 or 1).
                row->set_tuple(j, reinterpret_cast<Tuple*>(reinterpret_cast<uint64_t>(_read_ptr) *
                                                           is_not_null));
                _read_ptr += _desc.tuple_descriptors()[j]->byte_size() * is_not_null;
            }
            const uint64_t row_read_bytes = reinterpret_cast<uint64_t>(_read_ptr) - last_read_ptr;
            DCHECK_GE(_fixed_tuple_row_size, row_read_bytes);
            _read_bytes += row_read_bytes;
            last_read_ptr = reinterpret_cast<uint64_t>(_read_ptr);
        } else {
            // When we know that there are no nullable tuples we can safely copy them
            // without checking for nullability.
            for (int j = 0; j < tuples_per_row; ++j) {
                row->set_tuple(j, reinterpret_cast<Tuple*>(_read_ptr));
                _read_ptr += _desc.tuple_descriptors()[j]->byte_size();
            }
            _read_bytes += _fixed_tuple_row_size;
            _read_tuple_idx += tuples_per_row;
        }
        tuple_row_mem += sizeof(Tuple*) * tuples_per_row;

        // Update string slot ptrs.
        for (int j = 0; j < _string_slots.size(); ++j) {
            Tuple* tuple = row->get_tuple(_string_slots[j].first);
            if (HasNullableTuple && tuple == nullptr) {
                continue;
            }
            read_strings(_string_slots[j].second, data_len, tuple);
        }

        // Update collection slot ptrs. We traverse the collection structure in the same
        // order as it was written to the stream, allowing us to infer the data layout
        // based on the length of collections and strings.
        // for (int j = 0; j < _collection_slots.size(); ++j) {
        //     Tuple* tuple = row->get_tuple(_collection_slots[j].first);
        //     if (HasNullableTuple && tuple == nullptr) {
        //         continue;
        //     }
        //     ReadCollections(_collection_slots[j].second, data_len, tuple);
        // }
        last_read_row += increment_row;
        ++i;
    }

    batch->commit_rows(i);
    _rows_returned += i;
    *eos = (_rows_returned == _num_rows);
    if ((!_pinned || _delete_on_read) &&
        rows_returned_curr_block + i == (*_read_block)->num_rows()) {
        // No more data in this block. Mark this batch as needing to return so
        // the caller can pass the rows up the operator tree.
        batch->mark_need_to_return();
    }
    DCHECK_EQ(indices->size(), i);
    return Status::OK();
}

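// On the read path, a row's variable-length string data sits immediately after its
// fixed-length tuple data in the block, so string pointers are patched in place from
// the stored lengths as _read_ptr advances past them.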
void BufferedTupleStream2::read_strings(const vector<SlotDescriptor*>& string_slots, int data_len,
                                        Tuple* tuple) {
    DCHECK(tuple != nullptr);
    for (int i = 0; i < string_slots.size(); ++i) {
        const SlotDescriptor* slot_desc = string_slots[i];
        if (tuple->is_null(slot_desc->null_indicator_offset())) {
            continue;
        }

        StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset());
        DCHECK_LE(sv->len, data_len - _read_bytes);
        sv->ptr = reinterpret_cast<char*>(_read_ptr);
        _read_ptr += sv->len;
        _read_bytes += sv->len;
    }
}

int64_t BufferedTupleStream2::compute_row_size(TupleRow* row) const {
    int64_t size = 0;
    for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) {
        const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i];
        Tuple* tuple = row->get_tuple(i);
        DCHECK(_nullable_tuple || tuple_desc->byte_size() == 0 || tuple != nullptr);
        if (tuple == nullptr) {
            continue;
        }
        size += tuple->total_byte_size(*tuple_desc);
    }
    return size;
}

bool BufferedTupleStream2::deep_copy(TupleRow* row) {
    if (_nullable_tuple) {
        return deep_copy_internal<true>(row);
    } else {
        return deep_copy_internal<false>(row);
    }
}

// TODO: this really needs codegen
// TODO: in case of duplicate tuples, this can redundantly serialize data.
template <bool HasNullableTuple>
bool BufferedTupleStream2::deep_copy_internal(TupleRow* row) {
    if (UNLIKELY(_write_block == nullptr)) {
        return false;
    }
    DCHECK_GE(_null_indicators_write_block, 0);
    DCHECK(_write_block->is_pinned()) << debug_string() << std::endl
                                      << _write_block->debug_string();

    const uint64_t tuples_per_row = _desc.tuple_descriptors().size();
    if (UNLIKELY((_write_block->bytes_remaining() < _fixed_tuple_row_size) ||
                 (HasNullableTuple &&
                  (_write_tuple_idx + tuples_per_row > _null_indicators_write_block * 8)))) {
        return false;
    }
    // Allocate the maximum possible buffer for the fixed portion of the tuple.
    uint8_t* tuple_buf = _write_block->allocate<uint8_t>(_fixed_tuple_row_size);
    // Total bytes allocated in _write_block for this row. Saved so we can roll back
    // if this row doesn't fit.
    int bytes_allocated = _fixed_tuple_row_size;

    // Copy the non-null fixed-length tuples. For null tuples, just set the
    // corresponding null-indicator bit.
    if (HasNullableTuple) {
        DCHECK_GT(_null_indicators_write_block, 0);
        uint8_t* null_word = nullptr;
        uint32_t null_pos = 0;
        // Calculate how much of the allocation should be returned for null tuples.
        int to_return = 0;
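        // Null indicators live in the block header, one bit per tuple slot: the bit for
        // the t-th tuple written to this block is bit (7 - (t % 8)) of byte (t / 8),
        // where t is the running _write_tuple_idx.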
        for (int i = 0; i < tuples_per_row; ++i) {
            null_word = _write_block->buffer() + (_write_tuple_idx >> 3); // / 8
            null_pos = _write_tuple_idx & 7;
            ++_write_tuple_idx;
            const int tuple_size = _desc.tuple_descriptors()[i]->byte_size();
            Tuple* t = row->get_tuple(i);
            const uint8_t mask = 1 << (7 - null_pos);
            if (t != nullptr) {
                *null_word &= ~mask;
                memcpy(tuple_buf, t, tuple_size);
                tuple_buf += tuple_size;
            } else {
                *null_word |= mask;
                to_return += tuple_size;
            }
        }
        DCHECK_LE(_write_tuple_idx - 1, _null_indicators_write_block * 8);
        _write_block->return_allocation(to_return);
        bytes_allocated -= to_return;
    } else {
        // If there are no nullable tuples, there is no need to set nullability flags.
        DCHECK_EQ(_null_indicators_write_block, 0);
        for (int i = 0; i < tuples_per_row; ++i) {
            const int tuple_size = _desc.tuple_descriptors()[i]->byte_size();
            Tuple* t = row->get_tuple(i);
            // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
            // is delivered, the check below should become DCHECK(t != nullptr).
            DCHECK(t != nullptr || tuple_size == 0);
            memcpy(tuple_buf, t, tuple_size);
            tuple_buf += tuple_size;
        }
    }

    // Copy string slots. Note: we do not need to convert the string ptrs to offsets
    // on the write path, only on the read. The tuple data is immediately followed
    // by the string data so only the len information is necessary.
    for (int i = 0; i < _string_slots.size(); ++i) {
        Tuple* tuple = row->get_tuple(_string_slots[i].first);
        if (HasNullableTuple && tuple == nullptr) {
            continue;
        }
        if (UNLIKELY(!copy_strings(tuple, _string_slots[i].second, &bytes_allocated))) {
            _write_block->return_allocation(bytes_allocated);
            return false;
        }
    }

    // Copy collection slots. We copy collection data in a well-defined order so we do not
    // need to convert pointers to offsets on the write path.
    // for (int i = 0; i < _collection_slots.size(); ++i) {
    //     Tuple* tuple = row->get_tuple(_collection_slots[i].first);
    //     if (HasNullableTuple && tuple == nullptr) continue;
    //     if (UNLIKELY(!copy_collections(tuple, _collection_slots[i].second,
    //                                    &bytes_allocated))) {
    //         _write_block->return_allocation(bytes_allocated);
    //         return false;
    //     }
    // }

    _write_block->add_row();
    ++_num_rows;
    return true;
}

bool BufferedTupleStream2::copy_strings(const Tuple* tuple,
                                        const vector<SlotDescriptor*>& string_slots,
                                        int* bytes_allocated) {
    for (int i = 0; i < string_slots.size(); ++i) {
        const SlotDescriptor* slot_desc = string_slots[i];
        if (tuple->is_null(slot_desc->null_indicator_offset())) {
            continue;
        }
        const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset());
        if (LIKELY(sv->len > 0)) {
            if (UNLIKELY(_write_block->bytes_remaining() < sv->len)) {
                return false;
            }
            uint8_t* buf = _write_block->allocate<uint8_t>(sv->len);
            (*bytes_allocated) += sv->len;
            memcpy(buf, sv->ptr, sv->len);
        }
    }
    return true;
}

} // end namespace doris