[Enhancement](topn) support two phase read for topn query (#15642)

This PR optimizes TopN queries such as `SELECT * FROM tableX ORDER BY columnA ASC/DESC LIMIT N`. A TopN query is composed of a SortNode and a ScanNode. When the user table is wide (e.g. 100+ columns), the ORDER BY clause usually references just a few columns, but the ScanNode still needs to scan all data from the storage engine even if the limit is very small. This may lead to a lot of read amplification. So in this PR I divide the TopN query into two phases: 1. In the first phase we only need to read `columnA`'s data from the storage engine, along with an extra RowId column called `__DORIS_ROWID_COL__`. The other columns are pruned from the ScanNode. 2. The second phase is placed in the ExchangeNode because it is the central node for TopN nodes in the cluster. The ExchangeNode issues an RPC to other nodes using the RowIds (sorted and limited by the SortNode) read in the first phase, and reads row by row from the storage engine. After the second-phase read, the Block contains all the data needed for the query.
2023-01-19 10:01:33 +08:00
parent c7a72436e6
commit 3894de49d2
53 changed files with 829 additions and 33 deletions
--- a/gensrc/proto/descriptors.proto
+++ b/gensrc/proto/descriptors.proto
@ -34,6 +34,8 @@ message PSlotDescriptor {
    required string col_name = 8;
    required int32 slot_idx = 9;
    required bool is_materialized = 10;
+    required int32 col_unique_id = 11;
+    required bool is_key = 12;
 };

 message PTupleDescriptor {
--- a/gensrc/proto/internal_service.proto
+++ b/gensrc/proto/internal_service.proto
@ -538,6 +538,25 @@ message PFetchTableSchemaResult {
  optional int32 column_nums = 2;
  repeated string column_names = 3;
  repeated PTypeDesc column_types = 4;
+}
+
+message PMultiGetRequest {
+    message RowId {
+        optional int64 tablet_id = 1;
+        optional string rowset_id = 2;
+        optional uint64 segment_id = 3;
+        optional uint64 ordinal_id = 4;
+    };
+    repeated RowId rowids = 1;
+    optional PTupleDescriptor desc = 2;
+    repeated PSlotDescriptor slots = 3;
+    // for compatibility
+    optional int32 be_exec_version = 4;
+};
+
+message PMultiGetResponse {
+    optional PBlock block = 1;
+    optional PStatus status = 2;
 };

 service PBackendService {
@ -572,5 +591,6 @@ service PBackendService {
    rpc request_slave_tablet_pull_rowset(PTabletWriteSlaveRequest) returns (PTabletWriteSlaveResult);
    rpc response_slave_tablet_pull_rowset(PTabletWriteSlaveDoneRequest) returns (PTabletWriteSlaveDoneResult);
    rpc fetch_table_schema(PFetchTableSchemaRequest) returns (PFetchTableSchemaResult);
+    rpc multiget_data(PMultiGetRequest) returns (PMultiGetResponse);
 };

--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@ -51,6 +51,10 @@ struct TSlotDescriptor {
  9: required i32 slotIdx
  10: required bool isMaterialized
  11: optional i32 col_unique_id = -1
+  12: optional bool is_key = false
+  // If set to false, then such slots will be ignored during
+  // materialization. Used to optimize by reading less data and using less memory
+  13: optional bool need_materialize = true
 }

 struct TTupleDescriptor {
--- a/gensrc/thrift/Exprs.thrift
+++ b/gensrc/thrift/Exprs.thrift
@ -143,6 +143,7 @@ struct TTupleIsNullPredicate {
 struct TSlotRef {
  1: required Types.TSlotId slot_id
  2: required Types.TTupleId tuple_id
+  3: optional i32 col_unique_id
 }

 struct TStringLiteral {
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@ -543,6 +543,8 @@ struct TSortInfo {

  // Indicates the nullable info of sort_tuple_slot_exprs is changed after substitute by child's smap
  5: optional list<bool> slot_exprs_nullability_changed_flags   
+  // Indicates whether the topn query uses two-phase read
+  6: optional bool use_two_phase_read
 }

 enum TPushAggOp {
@ -891,6 +893,8 @@ struct TExchangeNode {
  2: optional TSortInfo sort_info
  // This is tHe number of rows to skip before returning results
  3: optional i64 offset
+  // Nodes in this cluster, used for second phase fetch
+  4: optional Descriptors.TPaloNodesInfo nodes_info
 }

 struct TOlapRewriteNode {