doris/be/src/runtime/spill_sorter.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <deque>

#include "runtime/buffered_block_mgr2.h"
#include "util/tuple_row_compare.h"

namespace doris {

class SortedRunMerger;
class RuntimeProfile;
class RowBatch;

// SpillSorter contains the external sort implementation. Its purpose is to sort arbitrarily
// large input data sets with a fixed memory budget by spilling data to disk if
// necessary. BufferedBlockMgr2 is used to allocate and manage blocks of data to be
// sorted.
//
// The client API for SpillSorter is as follows:
// add_batch() is used to add input rows to be sorted. Multiple tuples in an input row are
// materialized into a row with a single tuple (the sort tuple) using the materialization
// exprs in _sort_tuple_slot_expr_ctxs. The sort tuples are sorted according to the sort
// parameters and output by the sorter.
// add_batch() can be called multiple times.
//
// input_done() is called to indicate the end of input. If multiple sorted runs were
// created, it triggers intermediate merge steps (if necessary) and creates the final
// merger that returns results via get_next().
//
// get_next() is used to retrieve sorted rows. It can be called multiple times.
// add_batch(), input_done() and get_next() must be called in that order.
//
// Batches of input rows are collected into a sequence of pinned BufferedBlockMgr2 blocks
// called a run. The maximum size of a run is determined by the maximum available buffers
// in the block manager. After the run is full, it is sorted in memory, unpinned and the
// next run is collected.  The variable-length column data (e.g. string slots) in the
// materialized sort tuples are stored in a separate sequence of blocks from the tuples
// themselves.
// When the blocks containing tuples in a run are unpinned, the var-len slot pointers are
// converted to offsets from the start of the first var-len data block. When a block is
// read back, these offsets are converted back to pointers.
// The in-memory sorter sorts the fixed-length tuples in-place. The output rows have the
// same schema as the materialized sort tuples.
//
// After the input is consumed, the sorter is left with one or more sorted runs. The
// client calls get_next(output_batch) to retrieve batches of sorted rows. If there are
// multiple runs, the runs are merged using SortedRunMerger to produce a stream of sorted
// tuples. At least one block per run (two if there are var-length slots) must be pinned
// in memory during a merge, so multiple merges may be necessary if the number of runs is
// too large. During a merge, rows from multiple sorted input runs are compared and copied
// into a single larger run. One input batch is created to hold tuple rows for each
// input run, and one batch is created to hold deep copied rows (i.e. ptrs + data) from
// the output of the merge.
//
// If there is a single sorted run (i.e. no merge required), only tuple rows are
// copied into the output batch supplied by get_next, and the data itself is left in
// pinned blocks held by the sorter.
//
// Note that init() must be called right after the constructor.
//
// During a merge, one row batch is created for each input run, and one batch is created
// for the output of the merge (if it is not the final merge). It is assumed that the
// memory for these batches has already been accounted for in the memory budget for the
// sort. That is, the memory for these batches does not come out of the block buffer
// manager.
//
// TODO: Not necessary to actually copy var-len data - instead take ownership of the
// var-length data in the input batch. Copying can be deferred until a run is unpinned.
// TODO: When the first run is constructed, create a sequence of pointers to materialized
// tuples. If the input fits in memory, the pointers can be sorted instead of sorting the
// tuples in place.
class SpillSorter {
public:
    // Constructs a sorter. init() must be called before any other method.
    //
    // compare_less_than is a comparator for the sort tuples (returns true if lhs < rhs).
    // sort_tuple_slot_expr_ctxs are the slot exprs used to materialize the tuple to be
    // sorted (one expr per slot in the sort tuple).
    // output_row_desc describes the materialized sort tuple.
    // mem_tracker, profile and state: see the comments on _mem_tracker, _profile and
    // _state below (presumably the constructor stores them there; the definition is not
    // in this header).
    //
    // NOTE(review): this comment used to describe a `_merge_batch_size` argument. No such
    // parameter or member exists in this declaration.
    SpillSorter(const TupleRowComparator& compare_less_than,
                const std::vector<ExprContext*>& sort_tuple_slot_expr_ctxs,
                RowDescriptor* output_row_desc, const std::shared_ptr<MemTracker>& mem_tracker,
                RuntimeProfile* profile, RuntimeState* state);

    ~SpillSorter();

    // Initialization code, including registration to the block_mgr and the initialization
    // of the _unsorted_run, both of these may fail.
    Status init();

    // Adds a batch of input rows to the current unsorted run.
    Status add_batch(RowBatch* batch);

    // Called to indicate there is no more input. Triggers the creation of merger(s) if
    // necessary.
    Status input_done();

    // Get the next batch of sorted output rows from the sorter.
    // 'eos' is an output parameter (presumably set to true once all rows have been
    // returned -- confirm against the definition).
    Status get_next(RowBatch* batch, bool* eos);

    // Resets all internal state like ExecNode::reset().
    // init() must have been called, add_batch()/get_next()/input_done()
    // may or may not have been called.
    Status reset();

    // Returns the current value of the _spilled flag. Read-only, hence const so that it
    // can be queried through a const SpillSorter reference or pointer.
    bool is_spilled() const { return _spilled; }

    // Estimate the memory overhead in bytes for an intermediate merge, based on the
    // maximum number of memory buffers available for the sort, the row descriptor for
    // the sorted tuples and the batch size used (in rows).
    // This is a pessimistic estimate of the memory needed by the sorter in addition to the
    // memory used by the block buffer manager. The memory overhead is 0 if the input fits
    // in memory. Merges incur additional memory overhead because row batches are created
    // to hold tuple rows from the input runs, and the merger itself deep-copies
    // sort-merged rows into its output batch.
    static uint64_t estimate_merge_mem(uint64_t available_blocks, RowDescriptor* row_desc,
                                       int merge_batch_size);

private:
    class Run;
    class TupleSorter;

    // Create a SortedRunMerger from the first 'num_runs' sorted runs in _sorted_runs and
    // assign it to _merger. The runs to be merged are removed from _sorted_runs.
    // The SpillSorter sets the deep_copy_input flag to true for the merger, since the blocks
    // containing input run data will be unpinned as input runs are read.
    Status create_merger(int num_runs);

    // Repeatedly replaces multiple smaller runs in _sorted_runs with a single larger
    // merged run until the number of remaining runs is small enough for a single merge.
    // At least 1 (2 if var-len slots) block from each sorted run must be pinned for
    // a merge. If the number of sorted runs is too large, merge sets of smaller runs
    // into large runs until a final merge can be performed. An intermediate row batch
    // containing deep copied rows is used for the output of each intermediate merge.
    Status merge_intermediate_runs();

    // Sorts _unsorted_run and appends it to the list of sorted runs. Deletes any empty
    // blocks at the end of the run. Updates the sort bytes counter if necessary.
    Status sort_run();

    // Runtime state instance used to check for cancellation. Not owned.
    RuntimeState* const _state;

    // In memory sorter and less-than comparator.
    TupleRowComparator _compare_less_than;
    std::unique_ptr<TupleSorter> _in_mem_tuple_sorter;

    // Block manager object used to allocate, pin and release runs. Not owned by SpillSorter.
    BufferedBlockMgr2* _block_mgr;

    // Handle to block mgr to make allocations from.
    BufferedBlockMgr2::Client* _block_mgr_client;

    // True if the tuples to be sorted have var-length slots.
    bool _has_var_len_slots;

    // Expressions used to materialize the sort tuple. Contains one expr per slot in the tuple.
    std::vector<ExprContext*> _sort_tuple_slot_expr_ctxs;

    // Mem tracker for batches created during merge. Shared ownership via shared_ptr.
    std::shared_ptr<MemTracker> _mem_tracker;

    // Descriptor for the sort tuple. Input rows are materialized into 1 tuple before
    // sorting. Not owned by the SpillSorter.
    RowDescriptor* _output_row_desc;

    /////////////////////////////////////////
    // BEGIN: Members that must be reset()

    // The current unsorted run that is being collected. Is sorted and added to
    // _sorted_runs after it is full (i.e. number of blocks allocated == max available
    // buffers) or after the input is complete. Owned and placed in _obj_pool.
    // When it is added to _sorted_runs, it is set to nullptr.
    Run* _unsorted_run;

    // List of sorted runs that have been produced but not merged. _unsorted_run is added
    // to this list after an in-memory sort. Sorted runs produced by intermediate merges
    // are also added to this list. Runs are added to the object pool.
    std::deque<Run*> _sorted_runs;

    // Merger object (intermediate or final) currently used to produce sorted runs.
    // Only one merge is performed at a time. Will never be used if the input fits in
    // memory.
    std::unique_ptr<SortedRunMerger> _merger;

    // Runs that are currently processed by the _merger.
    // These runs can be deleted when we are done with the current merge.
    std::deque<Run*> _merging_runs;

    // Pool of owned Run objects. Maintains Runs objects across non-freeing reset() calls.
    ObjectPool _obj_pool;

    // END: Members that must be reset()
    /////////////////////////////////////////

    // Runtime profile and counters for this sorter instance.
    // NOTE(review): the counters are populated in the .cpp, which is not visible here.
    // Going by their names they presumably track the number of initial runs, the number
    // of merges, time spent in the in-memory sort, and the bytes sorted (the "sort bytes
    // counter" mentioned on sort_run()) -- confirm against the implementation.
    RuntimeProfile* _profile;
    RuntimeProfile::Counter* _initial_runs_counter;
    RuntimeProfile::Counter* _num_merges_counter;
    RuntimeProfile::Counter* _in_mem_sort_timer;
    RuntimeProfile::Counter* _sorted_data_size;

    // Flag exposed through is_spilled(). Presumably set once the sorter has had to spill
    // data to disk -- the code that writes it is not in this header; confirm in the .cpp.
    bool _spilled;
};

} // namespace doris