673 lines
27 KiB
C++
673 lines
27 KiB
C++
// Modifications copyright (C) 2017, Baidu.com, Inc.
|
|
// Copyright 2017 The Apache Software Foundation
|
|
|
|
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "runtime/merge_sorter.h"
|
|
#include "runtime/buffered_block_mgr.h"
|
|
#include "runtime/row_batch.h"
|
|
#include "runtime/runtime_state.h"
|
|
#include "util/runtime_profile.h"
|
|
#include <string>
|
|
#include <boost/foreach.hpp>
|
|
#include "runtime/mem_tracker.h"
|
|
|
|
namespace palo {
|
|
// A run is a sequence of blocks containing tuples that are or will eventually be in
|
|
// sorted order.
|
|
// A run may maintain two sequences of blocks - one containing the tuples themselves,
|
|
// (i.e. fixed-len slots and ptrs to var-len data), and the other for the var-length
|
|
// column data pointed to by those tuples.
|
|
// Tuples in a run may be sorted in place (in-memory) and merged using a merger.
|
|
class MergeSorter::Run {
public:
    // Creates an empty run belonging to 'parent' whose tuples follow the layout of
    // 'sort_tuple_desc'. Pass materialize_slots == true for a run that is filled from
    // input rows (each row is materialized into a single sort tuple using the parent's
    // _sort_tuple_slot_expr_ctxs); pass false for a run whose tuples are already
    // materialized. A run created with materialize_slots == false starts out sorted.
    Run(MergeSorter* parent, TupleDescriptor* sort_tuple_desc, bool materialize_slots);

    ~Run() {}

    // Obtains the run's first blocks from the block manager: one block for the
    // fixed-length tuple data and, if the sort tuple has string slots, one block for
    // the var-length data. An unsorted run also bumps the parent's initial-runs
    // counter. Returns the block manager's error if a block cannot be obtained.
    Status init();

    // Appends rows of 'batch', beginning at 'start_index', to this run and reports in
    // 'num_processed' how many rows were actually consumed. When the run cannot be
    // extended by another block, fewer rows than remain in the batch are consumed.
    // Rows are materialized with the parent's _sort_tuple_slot_expr_ctxs.
    // 'has_var_len_data' selects whether string data is copied into _var_len_blocks.
    template <bool has_var_len_data>
    Status add_batch(RowBatch* batch, int start_index, int* num_processed);

    // Merger-facing accessor for the next batch of rows of this run. The run keeps
    // ownership of the returned batch.
    // NOTE(review): only the declaration is visible in this file.
    Status get_next_batch(RowBatch** sorted_batch);

private:
    friend class MergeSorter;
    friend class TupleSorter;

    // Emits tuples of the current fixed-length block into 'output_batch' until the
    // batch is full or the block is exhausted. Sets '*eos' to true once every block
    // has been consumed.
    template <bool convert_offset_to_ptr>
    Status get_next(RowBatch* output_batch, bool* eos);

    // Adds the valid bytes of the last block in 'block_sequence' to the parent's
    // sort-bytes counter, then asks the block manager for a new block. On success the
    // block is appended and '*added' is true; if no block was handed out '*added' is
    // false.
    Status try_add_block(std::vector<BufferedBlockMgr::Block*>* block_sequence, bool* added);

    // Rewinds the read position to the first tuple of the first block and clears the
    // count of tuples returned so far.
    Status prepare_read();

    // Copies the bytes of every StringValue in 'var_values' back-to-back into 'dest'
    // (in vector order) and repoints each StringValue at its copy.
    void copy_var_len_data(char* dest, const std::vector<StringValue*>& var_values);

    // The sorter that owns this run.
    const MergeSorter* _sorter;

    // Descriptor of the single materialized tuple that each input row is turned into
    // before sorting.
    const TupleDescriptor* _sort_tuple_desc;

    // Byte size of one sort tuple, and the block manager's maximum block size.
    const int _sort_tuple_size;
    const int _block_size;

    // Whether the sort tuple contains any string slots.
    const bool _has_var_len_slots;

    // Whether add_batch() has to materialize sort tuples from the input rows. True for
    // runs built from input batches, false for runs holding already-built tuples.
    const bool _materialize_slots;

    // Whether the tuples are in sorted order. Starts as !_materialize_slots and is
    // switched on by TupleSorter::sort().
    bool _is_sorted;

    // Blocks holding the fixed-length part of the run's sort tuples, in run order.
    // The bytes referenced by their string slots live in _var_len_blocks.
    // NOTE(review): the original comment states that an entry is NULL iff the block
    // has been deleted - confirm against the callers that delete blocks.
    std::vector<BufferedBlockMgr::Block*> _fixed_len_blocks;

    // Blocks holding the var-length (string) data referenced from _fixed_len_blocks.
    // NOTE(review): the original comment states that an entry is NULL iff the block
    // has been deleted - confirm against the callers that delete blocks.
    std::vector<BufferedBlockMgr::Block*> _var_len_blocks;

    // Count of tuples added to the run so far.
    int64_t _num_tuples;

    // Count of tuples handed out by get_next(); kept for debug checks.
    int64_t _num_tuples_returned;

    // Read cursor used by get_next(): index of the fixed-length block currently being
    // read ...
    int _fixed_len_blocks_index;

    // ... and the byte offset of the next tuple inside that block.
    int _fixed_len_block_offset;
}; // class MergeSorter::Run
|
|
|
|
// Sorts a sequence of tuples from a run in place using a provided tuple comparator.
|
|
// Quick sort is used for sequences of tuples larger than 16 elements, and
|
|
// insertion sort is used for smaller sequences.
|
|
// The TupleSorter is initialized with a RuntimeState instance to check for
|
|
// cancellation during an in-memory sort.
|
|
class MergeSorter::TupleSorter {
public:
    // 'less_than_comp' orders two single-tuple rows, 'block_size' is the size in bytes
    // of a run block, 'tuple_size' the size in bytes of one sort tuple and 'state' is
    // consulted for cancellation while sorting (not owned).
    TupleSorter(const TupleRowComparator& less_than_comp, int64_t block_size,
                int tuple_size, RuntimeState* state);

    ~TupleSorter() {
        delete[] _temp_tuple_buffer;
        delete[] _swap_buffer;
    }

    // Sorts the tuples of 'run' in place: quicksort on large ranges, finished by an
    // insertion sort on small ones, then marks the run as sorted.
    // Returns early if _state->is_cancelled() is true. No status is returned - the
    // caller must check for cancellation.
    void sort(Run* run) {
        _run = run;
        // An empty run has nothing to order and may not even own a fixed-length block
        // any more, so do not build iterators over it at all.
        if (run->_num_tuples > 0) {
            sort_helper(TupleIterator(this, 0), TupleIterator(this, run->_num_tuples));
        }
        run->_is_sorted = true;
    }

private:
    // Ranges of at most this many tuples are handled by insertion sort.
    static const int INSERTION_THRESHOLD = 16;

    // Helper class used to iterate over tuples in a run during quick sort and insertion
    // sort.
    class TupleIterator {
    public:
        // Positions the iterator on tuple 'index' of the parent's current run.
        // 'index' may equal the number of tuples (one past the end).
        TupleIterator(TupleSorter* parent, int64_t index)
            : _parent(parent),
              _index(index),
              _current_tuple(NULL),
              _buffer_start(NULL),
              _block_index(0) {
            DCHECK_GE(index, 0);
            DCHECK_LE(index, _parent->_run->_num_tuples);
            // If the run is empty there is no block to point into; the pointer
            // members keep their NULL/0 defaults instead of indeterminate values.
            if (_parent->_run->_num_tuples == 0) {
                return;
            }
            // If the iterator is initialized to past the end, set up _buffer_start and
            // _block_index as if it were pointing to the last tuple. Add _tuple_size
            // bytes to _current_tuple, so everything is correct when prev() is invoked.
            int past_end_bytes = 0;
            if (UNLIKELY(index >= _parent->_run->_num_tuples)) {
                past_end_bytes = parent->_tuple_size;
                _index = _parent->_run->_num_tuples;
                index = _index - 1;
            }
            _block_index = index / parent->_block_capacity;
            _buffer_start = parent->_run->_fixed_len_blocks[_block_index]->buffer();
            int block_offset = (index % parent->_block_capacity) * parent->_tuple_size;
            _current_tuple = _buffer_start + block_offset + past_end_bytes;
        }

        ~TupleIterator() {}

        // Sets _current_tuple to point to the next tuple in the run. Increments
        // _block_index and resets the buffer if the next tuple is in the next block.
        void next() {
            _current_tuple += _parent->_tuple_size;
            ++_index;
            if (UNLIKELY(_current_tuple > _buffer_start + _parent->_last_tuple_block_offset &&
                         _index < _parent->_run->_num_tuples)) {
                // Don't increment block index, etc. past the end.
                ++_block_index;
                DCHECK_LT(_block_index, _parent->_run->_fixed_len_blocks.size());
                _buffer_start = _parent->_run->_fixed_len_blocks[_block_index]->buffer();
                _current_tuple = _buffer_start;
            }
        }

        // Sets _current_tuple to point to the previous tuple in the run. Decrements
        // _block_index and resets the buffer if the new tuple is in the previous block.
        void prev() {
            _current_tuple -= _parent->_tuple_size;
            --_index;
            if (UNLIKELY(_current_tuple < _buffer_start && _index >= 0)) {
                --_block_index;
                DCHECK_GE(_block_index, 0);
                _buffer_start = _parent->_run->_fixed_len_blocks[_block_index]->buffer();
                _current_tuple = _buffer_start + _parent->_last_tuple_block_offset;
            }
        }

    private:
        friend class TupleSorter;

        // Pointer to the tuple sorter.
        TupleSorter* _parent;

        // Index of the current tuple in the run.
        int64_t _index;

        // Pointer to the current tuple.
        uint8_t* _current_tuple;

        // Start of the buffer containing current tuple.
        uint8_t* _buffer_start;

        // Index into _run._fixed_len_blocks of the block containing the current tuple.
        int _block_index;
    };

    // Size of the tuples in memory.
    const int _tuple_size;

    // Number of tuples per block in a run.
    const int _block_capacity;

    // Offset in bytes of the last tuple in a block, calculated from block and tuple sizes.
    const int _last_tuple_block_offset;

    // Tuple comparator that returns true if lhs < rhs.
    const TupleRowComparator _less_than_comp;

    // Runtime state instance to check for cancellation. Not owned.
    RuntimeState* const _state;

    // The run to be sorted.
    Run* _run;

    // Temporarily allocated space to copy and swap tuples (both are used in
    // partition()). _temp_tuple_row is a one-tuple row view over _temp_tuple_buffer.
    // The buffers are owned by this TupleSorter instance.
    TupleRow* _temp_tuple_row;
    uint8_t* _temp_tuple_buffer;
    uint8_t* _swap_buffer;

    // Perform an insertion sort for rows in the range [first, last) in a run.
    void insertion_sort(const TupleIterator& first, const TupleIterator& last);

    // Partitions the sequence of tuples in the range [first, last) in a run into two
    // groups around the pivot tuple - i.e. tuples in first group are <= the pivot, and
    // tuples in the second group are >= pivot. Tuples are swapped in place to create the
    // groups and the index to the first element in the second group is returned.
    TupleIterator partition(TupleIterator first, TupleIterator last, Tuple* pivot);

    // Performs a quicksort of rows in the range [first, last),
    // followed by insertion sort for smaller groups of elements.
    // Checks _state->is_cancelled() and returns early if true.
    void sort_helper(TupleIterator first, TupleIterator last);

    // Swaps tuples pointed to by left and right using the swap buffer.
    void swap(uint8_t* left, uint8_t* right);
}; // class TupleSorter
|
|
|
|
// MergeSorter::Run methods
|
|
// Builds an empty run for 'parent'. Tuple and block sizes are taken from
// 'sort_tuple_desc' and the parent's block manager. A run that does not materialize
// its slots is considered sorted from the start. The read cursor and the
// returned-tuple count start at zero so that they never hold indeterminate values,
// even if get_next() is reached before prepare_read().
MergeSorter::Run::Run(MergeSorter* parent, TupleDescriptor* sort_tuple_desc,
                      bool materialize_slots)
    : _sorter(parent),
      _sort_tuple_desc(sort_tuple_desc),
      _sort_tuple_size(sort_tuple_desc->byte_size()),
      _block_size(parent->_block_mgr->max_block_size()),
      _has_var_len_slots(sort_tuple_desc->string_slots().size() > 0),
      _materialize_slots(materialize_slots),
      _is_sorted(!materialize_slots),
      _num_tuples(0),
      _num_tuples_returned(0),
      _fixed_len_blocks_index(0),
      _fixed_len_block_offset(0) {
}
|
|
|
|
// Obtains the first fixed-length block and, when the sort tuple has string slots, the
// first var-length block. An unsorted run is also counted as an initial run. Any
// error from the block manager is returned unchanged.
Status MergeSorter::Run::init() {
    BufferedBlockMgr::Block* fresh_block = NULL;

    // Fixed-length tuple storage.
    RETURN_IF_ERROR(_sorter->_block_mgr->get_new_block(&fresh_block));
    DCHECK(fresh_block != NULL);
    _fixed_len_blocks.push_back(fresh_block);

    // String storage is only needed if the tuple has var-length slots.
    if (_has_var_len_slots) {
        RETURN_IF_ERROR(_sorter->_block_mgr->get_new_block(&fresh_block));
        DCHECK(fresh_block != NULL);
        _var_len_blocks.push_back(fresh_block);
    }

    if (_is_sorted) {
        return Status::OK;
    }
    _sorter->_initial_runs_counter->update(1);
    return Status::OK;
}
|
|
|
|
template <bool has_var_len_data>
|
|
Status MergeSorter::Run::add_batch(RowBatch* batch, int start_index, int* num_processed) {
|
|
*num_processed = 0;
|
|
BufferedBlockMgr::Block* cur_fixed_len_block = _fixed_len_blocks.back();
|
|
|
|
DCHECK_EQ(_materialize_slots, !_is_sorted);
|
|
DCHECK_EQ(_materialize_slots, true);
|
|
|
|
// Input rows are copied/materialized into tuples allocated in _fixed_len_blocks.
|
|
// The variable length column data are copied into blocks stored in _var_len_blocks.
|
|
// Input row processing is split into two loops.
|
|
// The inner loop processes as many input rows as will fit in cur_fixed_len_block.
|
|
// The outer loop allocates a new block for fixed-len data if the input batch is
|
|
// not exhausted.
|
|
|
|
// cur_input_index is the index into the input 'batch' of the current
|
|
// input row being processed.
|
|
int cur_input_index = start_index;
|
|
std::vector<StringValue*> var_values;
|
|
var_values.reserve(_sort_tuple_desc->string_slots().size());
|
|
while (cur_input_index < batch->num_rows()) {
|
|
// tuples_remaining is the number of tuples to copy/materialize into
|
|
// cur_fixed_len_block.
|
|
int tuples_remaining = cur_fixed_len_block->bytes_remaining() / _sort_tuple_size;
|
|
tuples_remaining = std::min(batch->num_rows() - cur_input_index, tuples_remaining);
|
|
|
|
for (int i = 0; i < tuples_remaining; ++i) {
|
|
int total_var_len = 0;
|
|
TupleRow* input_row = batch->get_row(cur_input_index);
|
|
Tuple* new_tuple = cur_fixed_len_block->allocate<Tuple>(_sort_tuple_size);
|
|
if (_materialize_slots) {
|
|
new_tuple->materialize_exprs<has_var_len_data>(input_row, *_sort_tuple_desc,
|
|
_sorter->_sort_tuple_slot_expr_ctxs, NULL, &var_values, &total_var_len);
|
|
if (total_var_len > _sorter->_block_mgr->max_block_size()) {
|
|
std::stringstream ss;
|
|
ss << "Variable length data in a single tuple larger than block size ";
|
|
ss << total_var_len;
|
|
ss << " > " << _sorter->_block_mgr->max_block_size();
|
|
return Status(TStatusCode::INTERNAL_ERROR, ss.str(), false);
|
|
}
|
|
}
|
|
|
|
if (has_var_len_data) {
|
|
BufferedBlockMgr::Block* cur_var_len_block = _var_len_blocks.back();
|
|
if (cur_var_len_block->bytes_remaining() < total_var_len) {
|
|
bool added;
|
|
RETURN_IF_ERROR(try_add_block(&_var_len_blocks, &added));
|
|
if (added) {
|
|
cur_var_len_block = _var_len_blocks.back();
|
|
} else {
|
|
// There wasn't enough space in the last var-len block for this tuple, and
|
|
// the run could not be extended. Return the fixed-len allocation and exit.
|
|
// dhc: we can't get here, because we can get the new block. If we can't get new block,
|
|
// we will exit in tryAddBlock(MemTracker exceed).
|
|
cur_fixed_len_block->return_allocation(_sort_tuple_size);
|
|
return Status::OK;
|
|
}
|
|
}
|
|
|
|
char* var_data_ptr = cur_var_len_block->allocate<char>(total_var_len);
|
|
if (_materialize_slots) {
|
|
copy_var_len_data(var_data_ptr, var_values);
|
|
}
|
|
}
|
|
|
|
++_num_tuples;
|
|
++*num_processed;
|
|
++cur_input_index;
|
|
}
|
|
|
|
// If there are still rows left to process, get a new block for the fixed-length
|
|
// tuples. If the run is already too long, return.
|
|
if (cur_input_index < batch->num_rows()) {
|
|
bool added;
|
|
RETURN_IF_ERROR(try_add_block(&_fixed_len_blocks, &added));
|
|
if (added) {
|
|
cur_fixed_len_block = _fixed_len_blocks.back();
|
|
} else {
|
|
return Status::OK;
|
|
}
|
|
}
|
|
}
|
|
|
|
return Status::OK;
|
|
}
|
|
|
|
// Rewinds the run for reading: the cursor goes back to the first tuple of the first
// fixed-length block and the count of returned tuples is cleared. Always succeeds.
Status MergeSorter::Run::prepare_read() {
    _num_tuples_returned = 0;
    _fixed_len_block_offset = 0;
    _fixed_len_blocks_index = 0;
    return Status::OK;
}
|
|
|
|
template <bool convert_offset_to_ptr>
|
|
Status MergeSorter::Run::get_next(RowBatch* output_batch, bool* eos) {
|
|
if (_fixed_len_blocks_index == _fixed_len_blocks.size()) {
|
|
*eos = true;
|
|
DCHECK_EQ(_num_tuples_returned, _num_tuples);
|
|
return Status::OK;
|
|
} else {
|
|
*eos = false;
|
|
}
|
|
|
|
BufferedBlockMgr::Block* fixed_len_block = _fixed_len_blocks[_fixed_len_blocks_index];
|
|
|
|
// get_next fills rows into the output batch until a block boundary is reached.
|
|
while (!output_batch->is_full() &&
|
|
_fixed_len_block_offset < fixed_len_block->valid_data_len()) {
|
|
Tuple* input_tuple = reinterpret_cast<Tuple*>(
|
|
fixed_len_block->buffer() + _fixed_len_block_offset);
|
|
|
|
int output_row_idx = output_batch->add_row();
|
|
output_batch->get_row(output_row_idx)->set_tuple(0, input_tuple);
|
|
output_batch->commit_last_row();
|
|
_fixed_len_block_offset += _sort_tuple_size;
|
|
++_num_tuples_returned;
|
|
}
|
|
|
|
if (_fixed_len_block_offset >= fixed_len_block->valid_data_len()) {
|
|
++_fixed_len_blocks_index;
|
|
_fixed_len_block_offset = 0;
|
|
}
|
|
|
|
return Status::OK;
|
|
}
|
|
|
|
// Tries to extend 'block_sequence' (which must not be empty) by one block.
// The valid bytes of the current last block are first added to the parent's
// sorted-data-size counter. On return '*added' is true and the new block is the last
// element of 'block_sequence', or '*added' is false if the block manager did not hand
// out a block. Errors from the block manager are returned unchanged.
Status MergeSorter::Run::try_add_block(std::vector<BufferedBlockMgr::Block*>* block_sequence,
                                       bool* added) {
    DCHECK(!block_sequence->empty());

    BufferedBlockMgr::Block* last_block = block_sequence->back();
    _sorter->_sorted_data_size->update(last_block->valid_data_len());

    // Start from NULL (as Run::init() does) so the check below never reads an
    // indeterminate pointer if get_new_block() succeeds without assigning a block.
    BufferedBlockMgr::Block* new_block = NULL;
    RETURN_IF_ERROR(_sorter->_block_mgr->get_new_block(&new_block));
    if (new_block != NULL) {
        *added = true;
        block_sequence->push_back(new_block);
    } else {
        *added = false;
    }
    return Status::OK;
}
|
|
|
|
void MergeSorter::Run::copy_var_len_data(char* dest, const std::vector<StringValue*>& var_values) {
|
|
BOOST_FOREACH(StringValue* var_val, var_values) {
|
|
memcpy(dest, var_val->ptr, var_val->len);
|
|
var_val->ptr = dest;
|
|
dest += var_val->len;
|
|
}
|
|
}
|
|
|
|
|
|
// MergeSorter::TupleSorter methods.
|
|
// Sets up a sorter for tuples of 'tuple_size' bytes stored in blocks of 'block_size'
// bytes, ordered by 'comp'. 'state' is consulted for cancellation and is not owned.
// Allocates two scratch buffers of one tuple each (pivot/insert copy and swap space).
// No run is attached until sort() is called, so _run starts out as NULL.
MergeSorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t block_size,
                                      int tuple_size, RuntimeState* state)
    : _tuple_size(tuple_size),
      _block_capacity(block_size / tuple_size),
      _last_tuple_block_offset(tuple_size * ((block_size / tuple_size) - 1)),
      _less_than_comp(comp),
      _state(state),
      _run(NULL) {
    _temp_tuple_buffer = new uint8_t[tuple_size];
    // The address of the buffer pointer is passed to the comparator as a row holding a
    // single tuple.
    _temp_tuple_row = reinterpret_cast<TupleRow*>(&_temp_tuple_buffer);
    _swap_buffer = new uint8_t[tuple_size];
}
|
|
|
|
|
|
// Sort the sequence of tuples from [first, last).
|
|
// Begin with a sorted sequence of size 1 [first, first+1).
|
|
// During each pass of the outermost loop, add the next tuple (at position 'i') to
|
|
// the sorted sequence by comparing it to each element of the sorted sequence
|
|
// (reverse order) to find its correct place in the sorted sequence, copying tuples
|
|
// along the way.
|
|
void MergeSorter::TupleSorter::insertion_sort(const TupleIterator& first,
|
|
const TupleIterator& last) {
|
|
TupleIterator insert_iter = first;
|
|
insert_iter.next();
|
|
for (; insert_iter._index < last._index; insert_iter.next()) {
|
|
// insert_iter points to the tuple after the currently sorted sequence that must
|
|
// be inserted into the sorted sequence. Copy to _temp_tuple_row since it may be
|
|
// overwritten by the one at position 'insert_iter - 1'
|
|
memcpy(_temp_tuple_buffer, insert_iter._current_tuple, _tuple_size);
|
|
|
|
// 'iter' points to the tuple that _temp_tuple_row will be compared to.
|
|
// 'copy_to' is the where iter should be copied to if it is >= _temp_tuple_row.
|
|
// copy_to always to the next row after 'iter'
|
|
TupleIterator iter = insert_iter;
|
|
iter.prev();
|
|
uint8_t* copy_to = insert_iter._current_tuple;
|
|
while (_less_than_comp(_temp_tuple_row,
|
|
reinterpret_cast<TupleRow*>(&iter._current_tuple))) {
|
|
memcpy(copy_to, iter._current_tuple, _tuple_size);
|
|
copy_to = iter._current_tuple;
|
|
// Break if 'iter' has reached the first row, meaning that _temp_tuple_row
|
|
// will be inserted in position 'first'
|
|
if (iter._index <= first._index) break;
|
|
iter.prev();
|
|
}
|
|
|
|
memcpy(copy_to, _temp_tuple_buffer, _tuple_size);
|
|
}
|
|
}
|
|
|
|
// Splits the tuples in [first, last) around the value of '*pivot': 'first' moves
// forward over tuples that compare less than the pivot, 'last' moves backward over
// tuples the pivot compares less than, and the two out-of-place tuples found are
// swapped. Returns the position of 'first' once the two cursors have met or crossed.
MergeSorter::TupleSorter::TupleIterator MergeSorter::TupleSorter::partition(TupleIterator first,
        TupleIterator last, Tuple* pivot) {
    // The pivot lives inside the range being rearranged, so work on a private copy.
    memcpy(_temp_tuple_buffer, pivot, _tuple_size);

    // One-tuple row views of the two cursors; they track the cursors as they move.
    TupleRow* const low_row = reinterpret_cast<TupleRow*>(&first._current_tuple);
    TupleRow* const high_row = reinterpret_cast<TupleRow*>(&last._current_tuple);

    last.prev();
    for (;;) {
        // Advance to the first and last out-of-place elements.
        for (; _less_than_comp(low_row, _temp_tuple_row); first.next()) {
        }
        for (; _less_than_comp(_temp_tuple_row, high_row); last.prev()) {
        }

        if (first._index >= last._index) {
            return first;
        }

        swap(first._current_tuple, last._current_tuple);
        first.next();
        last.prev();
    }
}
|
|
|
|
// Quicksort driver for the tuples in [first, last). While the range holds more than
// INSERTION_THRESHOLD tuples it is partitioned around its middle tuple; the upper
// part is sorted recursively and the loop continues with the lower part. Whatever
// remains is finished with insertion sort. Returns without finishing as soon as
// _state reports cancellation.
void MergeSorter::TupleSorter::sort_helper(TupleIterator first, TupleIterator last) {
    if (UNLIKELY(_state->is_cancelled())) {
        return;
    }

    for (int64_t span = last._index - first._index; span > INSERTION_THRESHOLD;
            span = last._index - first._index) {
        TupleIterator middle(this, first._index + span / 2);
        // partition() rearranges [first, last) in place into a "<= pivot" group
        // followed by a ">= pivot" group; 'cut' is where the second group begins.
        TupleIterator cut = partition(first, last,
                reinterpret_cast<Tuple*>(middle._current_tuple));
        sort_helper(cut, last);
        last = cut;
        if (UNLIKELY(_state->is_cancelled())) {
            return;
        }
    }

    // Small ranges are cheaper to finish with insertion sort.
    insertion_sort(first, last);
}
|
|
|
|
// Exchanges the _tuple_size bytes at 'left' with those at 'right', staging the left
// tuple in _swap_buffer.
inline void MergeSorter::TupleSorter::swap(uint8_t* left, uint8_t* right) {
    uint8_t* const scratch = _swap_buffer;
    memcpy(scratch, left, _tuple_size);   // scratch <- left
    memcpy(left, right, _tuple_size);     // left    <- right
    memcpy(right, scratch, _tuple_size);  // right   <- old left
}
|
|
|
|
// MergeSorter methods
|
|
// Wires the sorter to the query's block manager and profile, creates the in-memory
// tuple sorter for the first tuple descriptor of 'output_row_desc', registers the
// profile counters and sets up the initial unsorted run.
MergeSorter::MergeSorter(const TupleRowComparator& compare_less_than,
                         const std::vector<ExprContext*>& slot_materialize_expr_ctxs,
                         RowDescriptor* output_row_desc,
                         RuntimeProfile* profile, RuntimeState* state)
    : _state(state),
      _compare_less_than(compare_less_than),
      _block_mgr(state->block_mgr()),
      _output_row_desc(output_row_desc),
      _sort_tuple_slot_expr_ctxs(slot_materialize_expr_ctxs),
      _profile(profile) {
    // The sort operates on a single materialized tuple per row.
    TupleDescriptor* tuple_desc = output_row_desc->tuple_descriptors()[0];
    _has_var_len_slots = tuple_desc->string_slots().size() > 0;

    // Profile counters. They must exist before the first run is initialized because
    // Run::init() updates _initial_runs_counter.
    _initial_runs_counter = ADD_COUNTER(_profile, "InitialRunsCreated", TUnit::UNIT);
    _num_merges_counter = ADD_COUNTER(_profile, "TotalMergesPerformed", TUnit::UNIT);
    _in_mem_sort_timer = ADD_TIMER(_profile, "InMemorySortTime");
    _sorted_data_size = ADD_COUNTER(_profile, "SortDataSize", TUnit::BYTES);

    _in_mem_tuple_sorter.reset(new TupleSorter(compare_less_than,
            _block_mgr->max_block_size(), tuple_desc->byte_size(), state));

    // The run is owned by _obj_pool.
    // NOTE(review): the Status returned by init() is discarded here because a
    // constructor cannot report it - confirm that a failure to get the first blocks is
    // surfaced elsewhere.
    _unsorted_run = _obj_pool.add(new Run(this, tuple_desc, true));
    _unsorted_run->init();
}
|
|
|
|
// Nothing to release explicitly; the members clean up after themselves.
MergeSorter::~MergeSorter() {}
|
|
|
|
// Adds all rows of 'batch' to the current unsorted run.
// Returns the run's error unchanged if materializing the rows fails (for example when
// a tuple's string data exceeds the block size, or a block cannot be obtained), and a
// "run is full" error if the run could not take every row of the batch.
Status MergeSorter::add_batch(RowBatch* batch) {
    int num_processed = 0;
    int cur_batch_index = 0;

    while (cur_batch_index < batch->num_rows()) {
        // Propagate failures from the run instead of masking them with the generic
        // "run is full" status below.
        if (_has_var_len_slots) {
            RETURN_IF_ERROR(
                    _unsorted_run->add_batch<true>(batch, cur_batch_index, &num_processed));
        } else {
            RETURN_IF_ERROR(
                    _unsorted_run->add_batch<false>(batch, cur_batch_index, &num_processed));
        }

        cur_batch_index += num_processed;
        if (cur_batch_index < batch->num_rows()) {
            // The run accepted only part of the batch and cannot be extended.
            return Status("run is full");
        }
    }
    return Status::OK;
}
|
|
|
|
// Called once all input has been added: sorts the accumulated run in memory and
// positions it for reading through get_next(). Returns the first error encountered
// (including cancellation reported by sort_run()).
Status MergeSorter::input_done() {
    // Sort the tuples accumulated so far in the current run.
    RETURN_IF_ERROR(sort_run());

    DCHECK(_sorted_runs.size() == 1);

    // The entire input fit in one run. Read sorted rows in get_next() directly
    // from the sorted run. The status is checked like every other Status in this
    // function rather than being dropped.
    RETURN_IF_ERROR(_sorted_runs.back()->prepare_read());

    return Status::OK;
}
|
|
|
|
// Fills 'output_batch' with the next sorted rows and sets '*eos' once the run is
// exhausted. Only row entries are written to the batch; the sorted tuples themselves
// remain in the blocks of the single sorted run.
Status MergeSorter::get_next(RowBatch* output_batch, bool* eos) {
    DCHECK(_sorted_runs.size() == 1);
    return _sorted_runs.back()->get_next<false>(output_batch, eos);
}
|
|
|
|
// Finishes the current unsorted run: accounts for the bytes in its last blocks (or
// drops a last block that holds no data), sorts its tuples in memory under
// _in_mem_sort_timer, and moves it to _sorted_runs. Returns a cancellation status if
// the query was cancelled during the sort.
Status MergeSorter::sort_run() {
    std::vector<BufferedBlockMgr::Block*>& fixed_blocks = _unsorted_run->_fixed_len_blocks;
    BufferedBlockMgr::Block* tail_block = fixed_blocks.back();
    if (tail_block->valid_data_len() <= 0) {
        // NOTE(review): the block is only removed from the run, not released - the
        // original code asks "need to delete block?"; confirm who frees it.
        fixed_blocks.pop_back();
    } else {
        _sorted_data_size->update(tail_block->valid_data_len());
    }

    if (_has_var_len_slots) {
        std::vector<BufferedBlockMgr::Block*>& var_blocks = _unsorted_run->_var_len_blocks;
        tail_block = var_blocks.back();
        if (tail_block->valid_data_len() <= 0) {
            // NOTE(review): same open question as above about releasing the block.
            var_blocks.pop_back();
        } else {
            _sorted_data_size->update(tail_block->valid_data_len());
        }
    }

    {
        // Time only the in-memory sort itself.
        SCOPED_TIMER(_in_mem_sort_timer);
        _in_mem_tuple_sorter->sort(_unsorted_run);
        RETURN_IF_CANCELLED(_state);
    }

    // Hand the run over to the sorted set; there is no unsorted run any more.
    _sorted_runs.push_back(_unsorted_run);
    _unsorted_run = NULL;
    return Status::OK;
}
|
|
} // namespace palo
|