[Enhancement](topn) support two phase read for topn query (#15642)

This PR optimizes TopN queries such as `SELECT * FROM tableX ORDER BY columnA ASC/DESC LIMIT N`. A TopN query is composed of a SortNode and a ScanNode. When the user table is wide (e.g. 100+ columns), the ORDER BY clause references just a few columns, but the ScanNode still needs to scan all data from the storage engine even if the limit is very small. This may lead to a lot of read amplification. So in this PR I divide the TopN query into two phases: 1. In the first phase we just need to read `columnA`'s data from the storage engine, along with an extra RowId column called `__DORIS_ROWID_COL__`. The other columns are pruned from the ScanNode. 2. The second phase is placed in the ExchangeNode, because it is the central node for the TopN nodes in the cluster. The ExchangeNode will spawn an RPC to other nodes using the RowIds (sorted and limited by the SortNode) read in the first phase, and read row by row from the storage engine. After the second-phase read, the Block will contain all the data needed for the query.
2023-01-19 10:01:33 +08:00
parent c7a72436e6
commit 3894de49d2
53 changed files with 829 additions and 33 deletions
--- a/be/src/vec/common/sort/heap_sorter.cpp
+++ b/be/src/vec/common/sort/heap_sorter.cpp
@ -45,6 +45,9 @@ Status HeapSorter::append_block(Block* block) {
            int i = 0;
            const auto& convert_nullable_flags = _vsort_exec_exprs.get_convert_nullable_flags();
            for (auto column_id : valid_column_ids) {
+                if (column_id < 0) {
+                    continue;
+                }
                if (convert_nullable_flags[i]) {
                    auto column_ptr = make_nullable(block->get_by_position(column_id).column);
                    new_block.insert({column_ptr,
--- a/be/src/vec/common/sort/sorter.cpp
+++ b/be/src/vec/common/sort/sorter.cpp
@ -247,6 +247,9 @@ Status Sorter::partial_sort(Block& src_block, Block& dest_block) {
        int i = 0;
        const auto& convert_nullable_flags = _vsort_exec_exprs.get_convert_nullable_flags();
        for (auto column_id : valid_column_ids) {
+            if (column_id < 0) {
+                continue;
+            }
            if (convert_nullable_flags[i]) {
                auto column_ptr = make_nullable(src_block.get_by_position(column_id).column);
                new_block.insert(
--- a/be/src/vec/common/sort/sorter.h
+++ b/be/src/vec/common/sort/sorter.h
@ -18,6 +18,7 @@
 #pragma once
 #include <queue>

+#include "common/consts.h"
 #include "common/status.h"
 #include "vec/common/sort/vsort_exec_exprs.h"
 #include "vec/core/block.h"
@ -34,7 +35,11 @@ class MergeSorterState {
 public:
    MergeSorterState(const RowDescriptor& row_desc, int64_t offset, int64_t limit,
                     RuntimeState* state, RuntimeProfile* profile)
-            : unsorted_block_(new Block(VectorizedUtils::create_empty_block(row_desc))),
+            // create_empty_block should ignore invalid slots: unsorted_block
+            // should have the same structure as the block arriving from the child
+            // node, since the child node's block may have ignored these slots
+            : unsorted_block_(new Block(
+                      VectorizedUtils::create_empty_block(row_desc, true /*ignore invalid slot*/))),
              offset_(offset),
              limit_(limit),
              profile_(profile) {
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@ -54,8 +54,12 @@ Block::Block(const ColumnsWithTypeAndName& data_) : data {data_} {
    initialize_index_by_name();
 }

-Block::Block(const std::vector<SlotDescriptor*>& slots, size_t block_size) {
+Block::Block(const std::vector<SlotDescriptor*>& slots, size_t block_size,
+             bool ignore_trivial_slot) {
    for (const auto slot_desc : slots) {
+        if (ignore_trivial_slot && !slot_desc->need_materialize()) {
+            continue;
+        }
        auto column_ptr = slot_desc->get_empty_mutable_column();
        column_ptr->reserve(block_size);
        insert(ColumnWithTypeAndName(std::move(column_ptr), slot_desc->get_data_type_ptr(),
@ -919,9 +923,13 @@ void Block::deep_copy_slot(void* dst, MemPool* pool, const doris::TypeDescriptor
    }
 }

-MutableBlock::MutableBlock(const std::vector<TupleDescriptor*>& tuple_descs, int reserve_size) {
+MutableBlock::MutableBlock(const std::vector<TupleDescriptor*>& tuple_descs, int reserve_size,
+                           bool ignore_trivial_slot) {
    for (auto tuple_desc : tuple_descs) {
        for (auto slot_desc : tuple_desc->slots()) {
+            if (ignore_trivial_slot && !slot_desc->need_materialize()) {
+                continue;
+            }
            _data_types.emplace_back(slot_desc->get_data_type_ptr());
            _columns.emplace_back(_data_types.back()->create_column());
            if (reserve_size != 0) {
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@ -74,7 +74,8 @@ public:
    Block(std::initializer_list<ColumnWithTypeAndName> il);
    Block(const ColumnsWithTypeAndName& data_);
    Block(const PBlock& pblock);
-    Block(const std::vector<SlotDescriptor*>& slots, size_t block_size);
+    Block(const std::vector<SlotDescriptor*>& slots, size_t block_size,
+          bool ignore_trivial_slot = false);

    /// insert the column at the specified position
    void insert(size_t position, const ColumnWithTypeAndName& elem);
@ -391,7 +392,8 @@ public:
    MutableBlock() = default;
    ~MutableBlock() = default;

-    MutableBlock(const std::vector<TupleDescriptor*>& tuple_descs, int reserve_size = 0);
+    MutableBlock(const std::vector<TupleDescriptor*>& tuple_descs, int reserve_size = 0,
+                 bool igore_trivial_slot = false);

    MutableBlock(Block* block)
            : _columns(block->mutate_columns()), _data_types(block->get_data_types()) {}
--- a/be/src/vec/exec/scan/new_olap_scanner.cpp
+++ b/be/src/vec/exec/scan/new_olap_scanner.cpp
@ -91,6 +91,22 @@ Status NewOlapScanner::prepare(const TPaloScanRange& scan_range,
                _tablet_schema->append_column(TabletColumn(column_desc));
            }
        }
+
+        {
+            if (_output_tuple_desc->slots().back()->col_name() == BeConsts::ROWID_COL) {
+                // inject ROWID_COL
+                TabletColumn rowid_column;
+                rowid_column.set_is_nullable(false);
+                rowid_column.set_name(BeConsts::ROWID_COL);
+                // avoid column reader init error
+                rowid_column.set_has_default_value(true);
+                // fake unique id
+                rowid_column.set_unique_id(INT32_MAX);
+                rowid_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
+                _tablet_schema->append_column(rowid_column);
+            }
+        }
+
        {
            std::shared_lock rdlock(_tablet->get_header_lock());
            const RowsetSharedPtr rowset = _tablet->rowset_with_max_version();
@ -333,7 +349,9 @@ Status NewOlapScanner::_init_return_columns() {
        if (!slot->is_materialized()) {
            continue;
        }
-
+        if (!slot->need_materialize()) {
+            continue;
+        }
        int32_t index = slot->col_unique_id() >= 0
                                ? _tablet_schema->field_index(slot->col_unique_id())
                                : _tablet_schema->field_index(slot->col_name());
--- a/be/src/vec/exec/scan/scanner_context.cpp
+++ b/be/src/vec/exec/scan/scanner_context.cpp
@ -55,7 +55,8 @@ Status ScannerContext::init() {
    // So use _output_tuple_desc;
    int64_t free_blocks_memory_usage = 0;
    for (int i = 0; i < pre_alloc_block_count; ++i) {
-        auto block = new vectorized::Block(_output_tuple_desc->slots(), real_block_size);
+        auto block = new vectorized::Block(_output_tuple_desc->slots(), real_block_size,
+                                           true /*ignore invalid slots*/);
        free_blocks_memory_usage += block->allocated_bytes();
        _free_blocks.emplace_back(block);
    }
@ -93,7 +94,8 @@ vectorized::Block* ScannerContext::get_free_block(bool* get_free_block) {
    *get_free_block = false;

    COUNTER_UPDATE(_parent->_newly_create_free_blocks_num, 1);
-    return new vectorized::Block(_real_tuple_desc->slots(), _state->batch_size());
+    return new vectorized::Block(_real_tuple_desc->slots(), _state->batch_size(),
+                                 true /*ignore invalid slots*/);
 }

 void ScannerContext::return_free_block(vectorized::Block* block) {
--- a/be/src/vec/exec/scan/vscanner.cpp
+++ b/be/src/vec/exec/scan/vscanner.cpp
@ -40,6 +40,10 @@ Status VScanner::get_block(RuntimeState* state, Block* block, bool* eof) {
    int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num;
    if (!block->mem_reuse()) {
        for (const auto slot_desc : _output_tuple_desc->slots()) {
+            if (!slot_desc->need_materialize()) {
+                // should be ignored when reading
+                continue;
+            }
            block->insert(ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(),
                                                slot_desc->get_data_type_ptr(),
                                                slot_desc->col_name()));
@ -80,8 +84,7 @@ Status VScanner::get_block(RuntimeState* state, Block* block, bool* eof) {

 Status VScanner::_filter_output_block(Block* block) {
    auto old_rows = block->rows();
-    Status st =
-            VExprContext::filter_block(_vconjunct_ctx, block, _output_tuple_desc->slots().size());
+    Status st = VExprContext::filter_block(_vconjunct_ctx, block, block->columns());
    _counter.num_rows_unselected += old_rows - block->rows();
    return st;
 }
--- a/be/src/vec/exec/vexchange_node.cpp
+++ b/be/src/vec/exec/vexchange_node.cpp
@ -17,12 +17,15 @@

 #include "vec/exec/vexchange_node.h"

+#include "common/consts.h"
+#include "exec/rowid_fetcher.h"
 #include "pipeline/exec/exchange_source_operator.h"
 #include "pipeline/pipeline.h"
 #include "pipeline/pipeline_fragment_context.h"
 #include "runtime/exec_env.h"
 #include "runtime/runtime_state.h"
 #include "runtime/thread_context.h"
+#include "util/defer_op.h"
 #include "vec/runtime/vdata_stream_mgr.h"
 #include "vec/runtime/vdata_stream_recvr.h"

@ -45,10 +48,15 @@ Status VExchangeNode::init(const TPlanNode& tnode, RuntimeState* state) {
    if (!_is_merging) {
        return Status::OK();
    }
-
    RETURN_IF_ERROR(_vsort_exec_exprs.init(tnode.exchange_node.sort_info, _pool));
    _is_asc_order = tnode.exchange_node.sort_info.is_asc_order;
    _nulls_first = tnode.exchange_node.sort_info.nulls_first;
+
+    if (tnode.exchange_node.__isset.nodes_info) {
+        _nodes_info = _pool->add(new DorisNodesInfo(tnode.exchange_node.nodes_info));
+    }
+    _use_two_phase_read = tnode.exchange_node.sort_info.__isset.use_two_phase_read &&
+                          tnode.exchange_node.sort_info.use_two_phase_read;
    return Status::OK();
 }

@ -87,6 +95,19 @@ Status VExchangeNode::open(RuntimeState* state) {
    return Status::OK();
 }

+Status VExchangeNode::_second_phase_fetch_data(RuntimeState* state, Block* final_block) { // Replaces final_block's contents with the block materialized by RowIDFetcher from the row ids in its last column; returns early with the error if init() or fetch() fails.
+    auto row_id_col = final_block->get_by_position(final_block->columns() - 1); // row ids are read from the last column; presumably the ROWID_COL appended by the scan -- TODO confirm
+    auto tuple_desc = _row_descriptor.tuple_descriptors()[0]; // only the first tuple descriptor is handed to the fetcher
+    RowIDFetcher id_fetcher(tuple_desc, state);
+    RETURN_IF_ERROR(id_fetcher.init(_nodes_info));
+    MutableBlock materialized_block(_row_descriptor.tuple_descriptors(), final_block->rows()); // reserve size is the current row count of final_block
+    // Fetch the rows identified by the row-id column into materialized_block; fetch() is noted to sort the block by the sequence of ROWID_COL.
+    RETURN_IF_ERROR(id_fetcher.fetch(row_id_col.column, &materialized_block));
+    // Note: after the swap final_block holds the materialized block's columns, so its structure may differ from what the caller passed in.
+    final_block->swap(materialized_block.to_block());
+    return Status::OK();
+}
+
 Status VExchangeNode::get_next(RuntimeState* state, Block* block, bool* eos) {
    INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VExchangeNode::get_next");
    SCOPED_TIMER(runtime_profile()->total_time_counter());
@ -97,6 +118,12 @@ Status VExchangeNode::get_next(RuntimeState* state, Block* block, bool* eos) {
        _is_ready = true;
        return Status::OK();
    }
+    if (_use_two_phase_read) {
+        // The block structure may have been changed by an earlier call to
+        // _second_phase_fetch_data(), so we should clear the block before _stream_recvr->get_next,
+        // since blocks in VSortedRunMerger may not be compatible with this block.
+        block->clear();
+    }
    auto status = _stream_recvr->get_next(block, eos);
    if (block != nullptr) {
        if (!_is_merging) {
@ -119,6 +146,9 @@ Status VExchangeNode::get_next(RuntimeState* state, Block* block, bool* eos) {
        }
        COUNTER_SET(_rows_returned_counter, _num_rows_returned);
    }
+    if (_use_two_phase_read && block->rows() > 0) {
+        RETURN_IF_ERROR(_second_phase_fetch_data(state, block));
+    }
    return status;
 }

--- a/be/src/vec/exec/vexchange_node.h
+++ b/be/src/vec/exec/vexchange_node.h
@ -20,6 +20,8 @@
 #include <memory>

 #include "exec/exec_node.h"
+#include "exec/tablet_info.h" // DorisNodesInfo
+#include "runtime/descriptors.h"
 #include "vec/common/sort/vsort_exec_exprs.h"

 namespace doris {
@ -47,6 +49,9 @@ public:
    // Status collect_query_statistics(QueryStatistics* statistics) override;
    void set_num_senders(int num_senders) { _num_senders = num_senders; }

+    // final materialization, used only in topn node
+    Status _second_phase_fetch_data(RuntimeState* state, Block* final_block);
+
 private:
    int _num_senders;
    bool _is_merging;
@ -61,6 +66,10 @@ private:
    VSortExecExprs _vsort_exec_exprs;
    std::vector<bool> _is_asc_order;
    std::vector<bool> _nulls_first;
+
+    // for fetching data by rowids
+    DorisNodesInfo* _nodes_info = nullptr;
+    bool _use_two_phase_read = false;
 };
 } // namespace vectorized
 } // namespace doris
--- a/be/src/vec/exec/vsort_node.cpp
+++ b/be/src/vec/exec/vsort_node.cpp
@ -82,7 +82,6 @@ Status VSortNode::init(const TPlanNode& tnode, RuntimeState* state) {
    }

    _sorter->init_profile(_runtime_profile.get());
-
    return Status::OK();
 }

@ -127,7 +126,6 @@ Status VSortNode::sink(RuntimeState* state, vectorized::Block* input_block, bool
                old_top = std::move(new_top);
            }
        }
-
        if (!_reuse_mem) {
            input_block->clear();
        }
--- a/be/src/vec/exprs/vslot_ref.cpp
+++ b/be/src/vec/exprs/vslot_ref.cpp
@ -50,12 +50,17 @@ Status VSlotRef::prepare(doris::RuntimeState* state, const doris::RowDescriptor&
    if (slot_desc == nullptr) {
        return Status::InternalError("couldn't resolve slot descriptor {}", _slot_id);
    }
+    _column_name = &slot_desc->col_name();
+    if (!slot_desc->need_materialize()) {
+        // slot should be ignored manually
+        _column_id = -1;
+        return Status::OK();
+    }
    _column_id = desc.get_column_id(_slot_id);
    if (_column_id < 0) {
        LOG(INFO) << "VSlotRef - invalid slot id: " << _slot_id << " desc:" << desc.debug_string();
        return Status::InternalError("VSlotRef - invalid slot id {}", _slot_id);
    }
-    _column_name = &slot_desc->col_name();
    return Status::OK();
 }

--- a/be/src/vec/utils/util.hpp
+++ b/be/src/vec/utils/util.hpp
@ -34,10 +34,14 @@ public:
        return create_columns_with_type_and_name(row_desc);
    }

-    static ColumnsWithTypeAndName create_columns_with_type_and_name(const RowDescriptor& row_desc) {
+    static ColumnsWithTypeAndName create_columns_with_type_and_name(
+            const RowDescriptor& row_desc, bool ignore_trivial_slot = false) {
        ColumnsWithTypeAndName columns_with_type_and_name;
        for (const auto& tuple_desc : row_desc.tuple_descriptors()) {
            for (const auto& slot_desc : tuple_desc->slots()) {
+                if (ignore_trivial_slot && !slot_desc->need_materialize()) {
+                    continue;
+                }
                columns_with_type_and_name.emplace_back(nullptr, slot_desc->get_data_type_ptr(),
                                                        slot_desc->col_name());
            }
@ -45,10 +49,14 @@ public:
        return columns_with_type_and_name;
    }

-    static ColumnsWithTypeAndName create_empty_block(const RowDescriptor& row_desc) {
+    static ColumnsWithTypeAndName create_empty_block(const RowDescriptor& row_desc,
+                                                     bool ignore_trivial_slot = false) {
        ColumnsWithTypeAndName columns_with_type_and_name;
        for (const auto& tuple_desc : row_desc.tuple_descriptors()) {
            for (const auto& slot_desc : tuple_desc->slots()) {
+                if (ignore_trivial_slot && !slot_desc->need_materialize()) {
+                    continue;
+                }
                columns_with_type_and_name.emplace_back(
                        slot_desc->get_data_type_ptr()->create_column(),
                        slot_desc->get_data_type_ptr(), slot_desc->col_name());