From b085ff49f0e36309d0bb15b25f1a27370e38854f Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 23 Dec 2022 14:10:47 +0800 Subject: [PATCH] [refactor](non-vec) delete non-vec data sink (#15283) * [refactor](non-vec) delete non-vec data sink Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- be/src/common/status.h | 3 + be/src/exec/CMakeLists.txt | 2 +- be/src/exec/base_scanner.h | 4 +- be/src/exec/data_sink.cpp | 22 +- be/src/exec/data_sink.h | 4 - be/src/exec/exec_node.cpp | 50 +- be/src/exec/exec_node.h | 1 - be/src/exec/parquet_scanner.cpp | 140 ++ be/src/exec/parquet_scanner.h | 84 + be/src/exec/tablet_sink.cpp | 1497 ----------------- be/src/exec/tablet_sink.h | 583 ------- be/src/olap/push_handler.cpp | 8 +- be/src/runtime/CMakeLists.txt | 14 - be/src/runtime/buffered_block_mgr2.cc | 1216 ------------- be/src/runtime/buffered_block_mgr2.h | 614 ------- be/src/runtime/buffered_tuple_stream2.cc | 805 --------- be/src/runtime/buffered_tuple_stream2.h | 412 ----- .../runtime/buffered_tuple_stream2.inline.h | 90 - be/src/runtime/buffered_tuple_stream3.cc | 867 ---------- be/src/runtime/buffered_tuple_stream3.h | 647 ------- .../runtime/buffered_tuple_stream3.inline.h | 55 - be/src/runtime/export_sink.cpp | 276 --- be/src/runtime/export_sink.h | 83 - be/src/runtime/memory_scratch_sink.cpp | 97 -- be/src/runtime/memory_scratch_sink.h | 82 - be/src/runtime/mysql_result_writer.cpp | 282 ---- be/src/runtime/mysql_result_writer.h | 79 - be/src/runtime/mysql_table_sink.cpp | 86 - be/src/runtime/mysql_table_sink.h | 73 - be/src/runtime/mysql_table_writer.cpp | 182 -- be/src/runtime/mysql_table_writer.h | 68 - be/src/runtime/odbc_table_sink.cpp | 105 -- be/src/runtime/odbc_table_sink.h | 74 - be/src/runtime/plan_fragment_executor.cpp | 134 +- be/src/runtime/plan_fragment_executor.h | 10 - be/src/runtime/result_writer.h | 6 - be/src/runtime/row_batch.cpp | 48 - be/src/runtime/row_batch.h | 17 +- be/src/runtime/runtime_state.cpp | 9 - be/src/runtime/runtime_state.h | 3 - be/src/runtime/sorter.h | 53 - be/src/vec/exec/join/vhash_join_node.cpp | 4 - be/src/vec/exec/join/vhash_join_node.h | 1 - be/src/vec/exec/join/vnested_loop_join_node.h | 4 - be/src/vec/exec/scan/vscan_node.h | 4 - be/src/vec/exec/vaggregation_node.cpp | 4 - be/src/vec/exec/vaggregation_node.h | 1 - be/src/vec/exec/vanalytic_eval_node.cpp | 4 - be/src/vec/exec/vanalytic_eval_node.h | 1 - be/src/vec/exec/varrow_scanner.h | 5 - be/src/vec/exec/vassert_num_rows_node.h | 4 - be/src/vec/exec/vbroker_scan_node.h | 5 - be/src/vec/exec/vbroker_scanner.h | 5 - be/src/vec/exec/vdata_gen_scan_node.cpp | 5 - be/src/vec/exec/vdata_gen_scan_node.h | 4 - be/src/vec/exec/vempty_set_node.h | 3 - be/src/vec/exec/vexchange_node.cpp | 3 - be/src/vec/exec/vexchange_node.h | 1 - be/src/vec/exec/vjson_scanner.h | 4 - be/src/vec/exec/vmysql_scan_node.h | 3 - be/src/vec/exec/vschema_scan_node.h | 3 - be/src/vec/exec/vselect_node.cpp | 4 - be/src/vec/exec/vselect_node.h | 1 - be/src/vec/exec/vset_operation_node.h | 3 - be/src/vec/exec/vsort_node.cpp | 5 - be/src/vec/exec/vsort_node.h | 24 +- be/src/vec/exec/vunion_node.h | 3 - be/src/vec/runtime/vfile_result_writer.h | 3 - be/src/vec/runtime/vsorted_run_merger.cpp | 1 - be/src/vec/sink/vdata_stream_sender.cpp | 4 - be/src/vec/sink/vdata_stream_sender.h | 1 - be/src/vec/sink/vmysql_result_writer.cpp | 4 - be/src/vec/sink/vmysql_result_writer.h | 2 - be/src/vec/sink/vmysql_table_writer.cpp | 8 + be/src/vec/sink/vmysql_table_writer.h | 12 +- 
be/src/vec/sink/vresult_file_sink.cpp | 4 - be/src/vec/sink/vresult_file_sink.h | 1 - be/src/vec/sink/vresult_sink.cpp | 4 - be/src/vec/sink/vresult_sink.h | 2 - be/src/vec/sink/vtable_sink.cpp | 5 - be/src/vec/sink/vtable_sink.h | 2 - be/src/vec/sink/vtablet_sink.cpp | 785 ++++++++- be/src/vec/sink/vtablet_sink.h | 459 ++++- be/test/CMakeLists.txt | 3 - be/test/runtime/buffered_block_mgr2_test.cpp | 1246 -------------- .../runtime/buffered_tuple_stream2_test.cpp | 821 --------- be/test/runtime/test_env.cc | 28 - be/test/runtime/test_env.h | 12 +- 88 files changed, 1441 insertions(+), 10959 deletions(-) create mode 100644 be/src/exec/parquet_scanner.cpp create mode 100644 be/src/exec/parquet_scanner.h delete mode 100644 be/src/exec/tablet_sink.cpp delete mode 100644 be/src/exec/tablet_sink.h delete mode 100644 be/src/runtime/buffered_block_mgr2.cc delete mode 100644 be/src/runtime/buffered_block_mgr2.h delete mode 100644 be/src/runtime/buffered_tuple_stream2.cc delete mode 100644 be/src/runtime/buffered_tuple_stream2.h delete mode 100644 be/src/runtime/buffered_tuple_stream2.inline.h delete mode 100644 be/src/runtime/buffered_tuple_stream3.cc delete mode 100644 be/src/runtime/buffered_tuple_stream3.h delete mode 100644 be/src/runtime/buffered_tuple_stream3.inline.h delete mode 100644 be/src/runtime/export_sink.cpp delete mode 100644 be/src/runtime/export_sink.h delete mode 100644 be/src/runtime/memory_scratch_sink.cpp delete mode 100644 be/src/runtime/memory_scratch_sink.h delete mode 100644 be/src/runtime/mysql_result_writer.cpp delete mode 100644 be/src/runtime/mysql_result_writer.h delete mode 100644 be/src/runtime/mysql_table_sink.cpp delete mode 100644 be/src/runtime/mysql_table_sink.h delete mode 100644 be/src/runtime/mysql_table_writer.cpp delete mode 100644 be/src/runtime/mysql_table_writer.h delete mode 100644 be/src/runtime/odbc_table_sink.cpp delete mode 100644 be/src/runtime/odbc_table_sink.h delete mode 100644 be/src/runtime/sorter.h delete mode 100644 be/test/runtime/buffered_block_mgr2_test.cpp delete mode 100644 be/test/runtime/buffered_tuple_stream2_test.cpp diff --git a/be/src/common/status.h b/be/src/common/status.h index 0970447a45..cd514305b3 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -488,6 +488,9 @@ inline std::string Status::to_string() const { } \ } while (false) +#define RETURN_ERROR_IF_NON_VEC \ + return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + // End _get_next_span after last call to get_next method #define RETURN_IF_ERROR_AND_CHECK_SPAN(stmt, get_next_span, done) \ do { \ diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt index 3684b005d5..04580ba622 100644 --- a/be/src/exec/CMakeLists.txt +++ b/be/src/exec/CMakeLists.txt @@ -34,7 +34,6 @@ set(EXEC_FILES text_converter.cpp olap_common.cpp tablet_info.cpp - tablet_sink.cpp plain_binary_line_reader.cpp plain_text_line_reader.cpp es/es_predicate.cpp @@ -64,6 +63,7 @@ set(EXEC_FILES odbc_connector.cpp table_connector.cpp schema_scanner.cpp + parquet_scanner.cpp ) if (WITH_MYSQL) set(EXEC_FILES diff --git a/be/src/exec/base_scanner.h b/be/src/exec/base_scanner.h index c6bcde2f67..f00bfe3641 100644 --- a/be/src/exec/base_scanner.h +++ b/be/src/exec/base_scanner.h @@ -72,7 +72,9 @@ public: virtual Status open(); // Get next tuple - virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) = 0; + virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) { + return 
Status::NotSupported("Not Implemented get block"); + } // Get next block virtual Status get_next(vectorized::Block* block, bool* eof) { diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index ecb9329f6b..6c9068744d 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -25,7 +25,6 @@ #include #include "gen_cpp/PaloInternalService_types.h" -#include "runtime/memory_scratch_sink.h" #include "runtime/runtime_state.h" #include "vec/sink/vdata_stream_sender.h" #include "vec/sink/vjdbc_table_sink.h" @@ -59,7 +58,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink state, pool, params.sender_id, row_desc, thrift_sink.stream_sink, params.destinations, 16 * 1024, send_query_statistics_with_every_batch); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } // RETURN_IF_ERROR(sender->prepare(state->obj_pool(), thrift_sink.stream_sink)); sink->reset(tmp_sink); @@ -75,7 +74,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink tmp_sink = new doris::vectorized::VResultSink(row_desc, output_exprs, thrift_sink.result_sink, 4096); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } sink->reset(tmp_sink); break; @@ -103,19 +102,14 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink send_query_statistics_with_every_batch, output_exprs); } } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } sink->reset(tmp_sink); break; } case TDataSinkType::MEMORY_SCRATCH_SINK: { - if (!thrift_sink.__isset.memory_scratch_sink) { - return Status::InternalError("Missing data buffer sink."); - } - - tmp_sink = new MemoryScratchSink(row_desc, output_exprs, thrift_sink.memory_scratch_sink); - sink->reset(tmp_sink); + RETURN_ERROR_IF_NON_VEC; break; } case TDataSinkType::MYSQL_TABLE_SINK: { @@ -128,7 +122,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink new doris::vectorized::VMysqlTableSink(pool, row_desc, output_exprs); sink->reset(vmysql_tbl_sink); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } break; #else @@ -143,7 +137,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink if (state->enable_vectorized_exec()) { sink->reset(new vectorized::VOdbcTableSink(pool, row_desc, output_exprs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } break; } @@ -167,7 +161,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink } case TDataSinkType::EXPORT_SINK: { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; break; } case TDataSinkType::OLAP_TABLE_SINK: { @@ -176,7 +170,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink if (state->enable_vectorized_exec()) { sink->reset(new stream_load::VOlapTableSink(pool, row_desc, output_exprs, &status)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } RETURN_IF_ERROR(status); break; diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index 9f21bcf4a1..299e1c5376 100644 --- 
a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -57,10 +57,6 @@ public: // Setup. Call before send() or close(). virtual Status open(RuntimeState* state) = 0; - // Send a row batch into this sink. - // eos should be true when the last batch is passed to send() - virtual Status send(RuntimeState* state, RowBatch* batch) = 0; - // Send a Block into this sink. virtual Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) { return Status::NotSupported("Not support send block"); diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 7d35db7ad9..616bf9f5e0 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -427,7 +427,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VMysqlScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); #else @@ -438,7 +438,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewOdbcScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -460,7 +460,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewEsScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -468,7 +468,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VSchemaScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -476,7 +476,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewOlapScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -484,7 +484,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::AggregationNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -499,7 +499,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN } *node = pool->add(new vectorized::HashJoinNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -507,7 +507,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VNestedLoopJoinNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -515,7 
+515,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VEmptySetNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -523,7 +523,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new doris::vectorized::VExchangeNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -531,7 +531,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new doris::vectorized::VSelectNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -539,7 +539,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VSortNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -547,18 +547,18 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VAnalyticEvalNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); case TPlanNodeType::MERGE_NODE: - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; case TPlanNodeType::UNION_NODE: if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VUnionNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -566,7 +566,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VIntersectNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -574,7 +574,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VExceptNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -582,7 +582,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VBrokerScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -590,7 +590,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewFileScanNode(pool, tnode, descs)); } else { - return 
Status::InternalError("Not support file scan node in non-vec engine"); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -598,7 +598,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VRepeatNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -606,7 +606,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VAssertNumRowsNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -614,7 +614,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VTableFunctionNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -623,7 +623,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN *node = pool->add(new vectorized::VDataGenFunctionScanNode(pool, tnode, descs)); return Status::OK(); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } default: @@ -781,10 +781,6 @@ Status ExecNode::QueryMaintenance(RuntimeState* state, const std::string& msg) { return state->check_query_state(msg); } -Status ExecNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented get batch"); -} - Status ExecNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { return Status::NotSupported("Not Implemented get block"); } diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index 1b089b4788..f5af72ac61 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -109,7 +109,6 @@ public: // row_batch's tuple_data_pool. // Caller must not be holding any io buffers. This will cause deadlock. // TODO: AggregationNode and HashJoinNode cannot be "re-opened" yet. - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); virtual Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos); // new interface to compatible new optimizers in FE Status get_next_after_projects( diff --git a/be/src/exec/parquet_scanner.cpp b/be/src/exec/parquet_scanner.cpp new file mode 100644 index 0000000000..074f7d35a7 --- /dev/null +++ b/be/src/exec/parquet_scanner.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/parquet_scanner.h" + +#include "exec/arrow/parquet_reader.h" +#include "io/file_factory.h" +#include "runtime/descriptors.h" +#include "runtime/exec_env.h" +#include "runtime/stream_load/stream_load_pipe.h" + +namespace doris { +using namespace ErrorCode; + +ParquetScanner::ParquetScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, + const std::vector<TBrokerRangeDesc>& ranges, + const std::vector<TNetworkAddress>& broker_addresses, + const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter) + : BaseScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter), + // _splittable(params.splittable), + _cur_file_reader(nullptr), + _cur_file_eof(false) {} + +ParquetScanner::~ParquetScanner() { + close(); +} + +Status ParquetScanner::open() { + return BaseScanner::open(); +} + +Status ParquetScanner::get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) { + SCOPED_TIMER(_read_timer); + // Get one row + while (!_scanner_eof) { + if (_cur_file_reader == nullptr || _cur_file_eof) { + RETURN_IF_ERROR(open_next_reader()); + // If there is no more reader, _scanner_eof is set and the loop exits + if (_scanner_eof) { + continue; + } + _cur_file_eof = false; + } + RETURN_IF_ERROR(_cur_file_reader->read(_src_tuple, tuple_pool, &_cur_file_eof)); + // range of current file + const TBrokerRangeDesc& range = _ranges.at(_next_range - 1); + if (range.__isset.num_of_columns_from_file) { + fill_slots_of_columns_from_path(range.num_of_columns_from_file, + range.columns_from_path); + } + + COUNTER_UPDATE(_rows_read_counter, 1); + SCOPED_TIMER(_materialize_timer); + RETURN_IF_ERROR(fill_dest_tuple(tuple, tuple_pool, fill_tuple)); + break; // break always + } + + *eof = _scanner_eof; + return Status::OK(); +} + +Status ParquetScanner::open_next_reader() { + // open_file_reader + if (_cur_file_reader != nullptr) { + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } + } + + while (true) { + if (_next_range >= _ranges.size()) { + _scanner_eof = true; + return Status::OK(); + } + const TBrokerRangeDesc& range = _ranges[_next_range++]; + std::unique_ptr<FileReader> file_reader; + RETURN_IF_ERROR(FileFactory::create_file_reader( + range.file_type, _state->exec_env(), _profile, _broker_addresses, + _params.properties, range, range.start_offset, file_reader)); + RETURN_IF_ERROR(file_reader->open()); + + if (file_reader->size() == 0) { + file_reader->close(); + continue; + } + int32_t num_of_columns_from_file = _src_slot_descs.size(); + if (range.__isset.num_of_columns_from_file) { + num_of_columns_from_file = range.num_of_columns_from_file; + } + _cur_file_reader = new ParquetReaderWrap(_state, _src_slot_descs, file_reader.release(), + num_of_columns_from_file, 0, 0); + auto tuple_desc = _state->desc_tbl().get_tuple_descriptor(_tupleId); + Status status = + _cur_file_reader->init_reader(tuple_desc, _conjunct_ctxs, _state->timezone()); + if (status.is<END_OF_FILE>()) { + continue; + } else { + if (!status.ok()) { + return Status::InternalError("file: {}, error:{}", range.path, status.to_string()); + } else { + RETURN_IF_ERROR(_cur_file_reader->init_parquet_type()); + return status; + } + } + } +} + +void ParquetScanner::close() { + BaseScanner::close(); + if (_cur_file_reader != nullptr) { + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } + } +} + +} // namespace doris
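// A minimal sketch of the multi-file read loop implemented by
// ParquetScanner::get_next()/open_next_reader() above: keep one current
// reader, and when it reports end-of-file, advance to the next range until
// every range is exhausted. All types here are hypothetical stand-ins for
// illustration, not Doris classes.
#include <cstddef>
#include <vector>

struct ToyReader {
    int rows_left = 2; // toy data: each "file" yields two rows
    bool read_row(bool* eof) {
        *eof = (rows_left <= 0);
        if (!*eof) --rows_left;
        return !*eof;
    }
};

bool get_next_row(std::vector<ToyReader>& files, std::size_t& next_range,
                  ToyReader*& cur, bool& cur_eof, bool& scanner_eof) {
    while (!scanner_eof) {
        if (cur == nullptr || cur_eof) {
            if (next_range >= files.size()) {
                scanner_eof = true; // no more ranges: overall EOF
                continue;
            }
            cur = &files[next_range++]; // mirrors open_next_reader()
            cur_eof = false;
        }
        if (cur->read_row(&cur_eof)) {
            return true; // produced one row, like the "break always" above
        }
        // cur_eof is now true; loop around and open the next reader
    }
    return false; // overall EOF
}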
diff --git a/be/src/exec/parquet_scanner.h b/be/src/exec/parquet_scanner.h new file mode 100644 index 0000000000..3c0ca48eae --- /dev/null +++ b/be/src/exec/parquet_scanner.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "exec/base_scanner.h" +#include "gen_cpp/PlanNodes_types.h" +#include "gen_cpp/Types_types.h" +#include "runtime/mem_pool.h" +#include "util/runtime_profile.h" +#include "util/slice.h" + +namespace doris { + +class Tuple; +class SlotDescriptor; +struct Slice; +class ParquetReaderWrap; +class RuntimeState; +class ExprContext; +class TupleDescriptor; +class TupleRow; +class RowDescriptor; +class RuntimeProfile; +class StreamLoadPipe; + +// Parquet scanner converts the data read from Parquet files into Doris tuples. +class ParquetScanner : public BaseScanner { +public: + ParquetScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, + const std::vector<TBrokerRangeDesc>& ranges, + const std::vector<TNetworkAddress>& broker_addresses, + const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter); + + ~ParquetScanner() override; + + // Open this scanner; initializes the information needed to read + Status open() override; + + // Get next tuple + Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) override; + + Status get_next(vectorized::Block* block, bool* eof) override { + return Status::NotSupported("Not Implemented get block"); + } + + // Close this scanner + void close() override; + +protected: + // Open the reader for the next range + Status open_next_reader(); + + // Reader + ParquetReaderWrap* _cur_file_reader; + bool _cur_file_eof; // whether the current file has been fully read + + // used to hold current StreamLoadPipe + std::shared_ptr<StreamLoadPipe> _stream_load_pipe; +}; + +} // namespace doris diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp deleted file mode 100644 index 106623fa93..0000000000 --- a/be/src/exec/tablet_sink.cpp +++ /dev/null @@ -1,1497 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "exec/tablet_sink.h" - -#include - -#include -#include -#include - -#include "exprs/expr.h" -#include "exprs/expr_context.h" -#include "olap/hll.h" -#include "runtime/exec_env.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/thread_context.h" -#include "runtime/tuple_row.h" -#include "service/backend_options.h" -#include "util/brpc_client_cache.h" -#include "util/debug/sanitizer_scopes.h" -#include "util/defer_op.h" -#include "util/proto_util.h" -#include "util/threadpool.h" -#include "util/time.h" -#include "util/uid_util.h" -#include "vec/sink/vtablet_sink.h" - -namespace doris { -namespace stream_load { - -NodeChannel::NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id) - : _parent(parent), _index_channel(index_channel), _node_id(node_id) { - _node_channel_tracker = std::make_shared(fmt::format( - "NodeChannel:indexID={}:threadId={}", std::to_string(_index_channel->_index_id), - thread_context()->get_thread_id())); -} - -NodeChannel::~NodeChannel() noexcept { - if (_open_closure != nullptr) { - if (_open_closure->unref()) { - delete _open_closure; - } - _open_closure = nullptr; - } - if (_add_batch_closure != nullptr) { - // it's safe to delete, but may take some time to wait until brpc joined - delete _add_batch_closure; - _add_batch_closure = nullptr; - } - if (!_is_vectorized) { - _cur_add_batch_request.release_id(); - } -} - -// if "_cancelled" is set to true, -// no need to set _cancel_msg because the error will be -// returned directly via "TabletSink::prepare()" method. 
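// A minimal sketch (types hypothetical) of the ref-counting protocol behind
// _open_closure in the deleted code above: the NodeChannel and the in-flight
// brpc call each hold one reference, and whichever side drops the last
// reference deletes the closure, so destruction is safe regardless of which
// of the two finishes first.
#include <atomic>

template <typename Result>
class RefCountedClosureSketch {
public:
    void ref() { _refs.fetch_add(1, std::memory_order_relaxed); }
    // Returns true when the caller released the last reference and must
    // delete the object, mirroring `if (_open_closure->unref()) delete ...`.
    bool unref() { return _refs.fetch_sub(1, std::memory_order_acq_rel) == 1; }

    Result result; // filled in by the RPC framework before done->Run()

private:
    std::atomic<int> _refs {0};
};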
-Status NodeChannel::init(RuntimeState* state) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - _tuple_desc = _parent->_output_tuple_desc; - _state = state; - auto node = _parent->_nodes_info->find_node(_node_id); - if (node == nullptr) { - _cancelled = true; - return Status::InternalError("unknown node id, id={}", _node_id); - } - - _node_info = *node; - - _load_info = "load_id=" + print_id(_parent->_load_id) + - ", txn_id=" + std::to_string(_parent->_txn_id); - - _row_desc.reset(new RowDescriptor(_tuple_desc, false)); - _batch_size = state->batch_size(); - - _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, - _node_info.brpc_port); - if (_stub == nullptr) { - LOG(WARNING) << "Get rpc stub failed, host=" << _node_info.host - << ", port=" << _node_info.brpc_port << ", " << channel_info(); - _cancelled = true; - return Status::InternalError("get rpc stub failed"); - } - - if (!_is_vectorized) { - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - - // Initialize _cur_add_batch_request - _cur_add_batch_request.set_allocated_id(&_parent->_load_id); - _cur_add_batch_request.set_index_id(_index_channel->_index_id); - _cur_add_batch_request.set_sender_id(_parent->_sender_id); - _cur_add_batch_request.set_backend_id(_node_id); - _cur_add_batch_request.set_eos(false); - - _name = fmt::format("NodeChannel[{}-{}]", _index_channel->_index_id, _node_id); - } - - _rpc_timeout_ms = state->query_options().query_timeout * 1000; - _timeout_watch.start(); - - return Status::OK(); -} - -void NodeChannel::open() { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - PTabletWriterOpenRequest request; - request.set_allocated_id(&_parent->_load_id); - request.set_index_id(_index_channel->_index_id); - request.set_txn_id(_parent->_txn_id); - request.set_allocated_schema(_parent->_schema->to_protobuf()); - for (auto& tablet : _all_tablets) { - auto ptablet = request.add_tablets(); - ptablet->set_partition_id(tablet.partition_id); - ptablet->set_tablet_id(tablet.tablet_id); - } - request.set_num_senders(_parent->_num_senders); - request.set_need_gen_rollup(false); // Useless but it is a required field in pb - request.set_load_mem_limit(_parent->_load_mem_limit); - request.set_load_channel_timeout_s(_parent->_load_channel_timeout_s); - request.set_is_high_priority(_parent->_is_high_priority); - request.set_sender_ip(BackendOptions::get_localhost()); - request.set_is_vectorized(_is_vectorized); - - _open_closure = new RefCountClosure(); - _open_closure->ref(); - - // This ref is for RPC's reference - _open_closure->ref(); - _open_closure->cntl.set_timeout_ms(config::tablet_writer_open_rpc_timeout_sec * 1000); - if (config::tablet_writer_ignore_eovercrowded) { - _open_closure->cntl.ignore_eovercrowded(); - } - _stub->tablet_writer_open(&_open_closure->cntl, &request, &_open_closure->result, - _open_closure); - request.release_id(); - request.release_schema(); -} - -void NodeChannel::_cancel_with_msg(const std::string& msg) { - LOG(WARNING) << "cancel node channel " << channel_info() << ", error message: " << msg; - { - std::lock_guard l(_cancel_msg_lock); - if (_cancel_msg == "") { - _cancel_msg = msg; - } - } - _cancelled = true; -} - -Status NodeChannel::open_wait() { - _open_closure->join(); - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - if (_open_closure->cntl.Failed()) { - if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( - _stub, _node_info.host, _node_info.brpc_port)) { - 
ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( - _open_closure->cntl.remote_side()); - } - std::stringstream ss; - ss << "failed to open tablet writer, error=" << berror(_open_closure->cntl.ErrorCode()) - << ", error_text=" << _open_closure->cntl.ErrorText(); - _cancelled = true; - LOG(WARNING) << ss.str() << " " << channel_info(); - return Status::InternalError("failed to open tablet writer, error={}, error_text={}", - berror(_open_closure->cntl.ErrorCode()), - _open_closure->cntl.ErrorText()); - } - Status status(_open_closure->result.status()); - if (_open_closure->unref()) { - delete _open_closure; - } - _open_closure = nullptr; - - if (!status.ok()) { - _cancelled = true; - return status; - } - - if (!_is_vectorized) { - // add batch closure - _add_batch_closure = ReusableClosure::create(); - _add_batch_closure->addFailedHandler([this](bool is_last_rpc) { - SCOPED_ATTACH_TASK(_state); - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call `mark_as_failed`, - // and notice that _index_channel may already be destroyed. - return; - } - // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), - fmt::format("rpc failed, error coed:{}, error text:{}", - _add_batch_closure->cntl.ErrorCode(), - _add_batch_closure->cntl.ErrorText()), - -1); - Status st = _index_channel->check_intolerable_failure(); - if (!st.ok()) { - _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.to_string())); - } else if (is_last_rpc) { - // if this is last rpc, will must set _add_batches_finished. otherwise, node channel's close_wait - // will be blocked. - _add_batches_finished = true; - VLOG_PROGRESS << "node channel " << channel_info() << "add_batches_finished"; - } - }); - - _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, - bool is_last_rpc) { - SCOPED_ATTACH_TASK(_state); - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call the following logic, - // and notice that _index_channel may already be destroyed. 
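// The guard pattern at work in these callbacks, as a minimal standalone
// sketch (types hypothetical): an RPC completion may race with
// close_wait()/cancel(), so the callback re-checks an _is_closed flag under
// the same mutex that the closing thread sets it under, and returns early
// instead of touching state (such as _index_channel) that may already be gone.
#include <mutex>

class ClosedGuardSketch {
public:
    void on_rpc_done() {
        std::lock_guard<std::mutex> l(_closed_lock);
        if (_is_closed) {
            return; // owner already tore the channel down; do nothing
        }
        // ... safe to update per-channel bookkeeping here ...
    }
    void close() {
        std::lock_guard<std::mutex> l(_closed_lock);
        _is_closed = true;
    }

private:
    std::mutex _closed_lock;
    bool _is_closed = false;
};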
- return; - } - Status status(result.status()); - if (status.ok()) { - // if has error tablet, handle them first - for (auto& error : result.tablet_errors()) { - _index_channel->mark_as_failed(this->node_id(), this->host(), - "tablet error: " + error.msg(), - error.tablet_id()); - } - - Status st = _index_channel->check_intolerable_failure(); - if (!st.ok()) { - _cancel_with_msg(st.to_string()); - } else if (is_last_rpc) { - for (auto& tablet : result.tablet_vec()) { - TTabletCommitInfo commit_info; - commit_info.tabletId = tablet.tablet_id(); - commit_info.backendId = _node_id; - _tablet_commit_infos.emplace_back(std::move(commit_info)); - if (tablet.has_received_rows()) { - _tablets_received_rows.emplace_back(tablet.tablet_id(), - tablet.received_rows()); - } - VLOG_CRITICAL - << "master replica commit info: tabletId=" << tablet.tablet_id() - << ", backendId=" << _node_id - << ", master node id: " << this->node_id() - << ", host: " << this->host() << ", txn_id=" << _parent->_txn_id; - } - - if (_parent->_write_single_replica) { - for (auto& tablet_slave_node_ids : result.success_slave_tablet_node_ids()) { - for (auto slave_node_id : - tablet_slave_node_ids.second.slave_node_ids()) { - TTabletCommitInfo commit_info; - commit_info.tabletId = tablet_slave_node_ids.first; - commit_info.backendId = slave_node_id; - _tablet_commit_infos.emplace_back(std::move(commit_info)); - VLOG_CRITICAL << "slave replica commit info: tabletId=" - << tablet_slave_node_ids.first - << ", backendId=" << slave_node_id - << ", master node id: " << this->node_id() - << ", host: " << this->host() - << ", txn_id=" << _parent->_txn_id; - } - } - } - _add_batches_finished = true; - VLOG_PROGRESS << "node channel " << channel_info() - << "add_batches_finished and handled " - << result.tablet_errors().size() << " tablets errors"; - } - } else { - _cancel_with_msg( - fmt::format("{}, add batch req success but status isn't ok, err: {}", - channel_info(), status.to_string())); - } - - if (result.has_execution_time_us()) { - _add_batch_counter.add_batch_execution_time_us += result.execution_time_us(); - _add_batch_counter.add_batch_wait_execution_time_us += - result.wait_execution_time_us(); - _add_batch_counter.add_batch_num++; - } - }); - } - return status; -} - -Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("add row failed. {}", _cancel_msg); - } else { - return std::move(st.prepend("already stopped, can't add row. cancelled/eos: ")); - } - } - - // We use OlapTableSink mem_tracker which has the same ancestor of _plan node, - // so in the ideal case, mem limit is a matter for _plan node. - // But there is still some unfinished things, we do mem limit here temporarily. - // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. - // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). 
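// The throttling loop that follows is a simple producer-side backpressure
// scheme; a standalone sketch (names and values illustrative, not the sink's
// real defaults): block the row-appending thread while the RPC sender still
// has too many serialized bytes queued, rather than queueing without bound.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

void wait_for_queue_room(const std::atomic<bool>& cancelled,
                         const std::atomic<int>& pending_batches,
                         const std::atomic<int64_t>& pending_bytes,
                         int64_t max_pending_bytes) {
    while (!cancelled && pending_batches > 0 && pending_bytes > max_pending_bytes) {
        // Sleep briefly instead of spinning; the sender thread drains the queue.
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
}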
- while (!_cancelled && _pending_batches_num > 0 && - _pending_batches_bytes > _max_pending_batches_bytes) { - SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - auto row_no = _cur_batch->add_row(); - if (row_no == RowBatch::INVALID_ROW_INDEX) { - { - SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - //To simplify the add_row logic, postpone adding batch into req until the time of sending req - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - VLOG_DEBUG << "OlapTableSink:" << _parent << " NodeChannel:" << this - << " pending_batches_bytes:" << _pending_batches_bytes - << " jobid:" << std::to_string(_state->load_job_id()) - << " tabletid:" << tablet_id << " loadinfo:" << _load_info; - } - - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - _cur_add_batch_request.clear_tablet_ids(); - - row_no = _cur_batch->add_row(); - } - DCHECK_NE(row_no, RowBatch::INVALID_ROW_INDEX); - auto tuple = input_tuple->deep_copy(*_tuple_desc, _cur_batch->tuple_data_pool()); - - _cur_batch->get_row(row_no)->set_tuple(0, tuple); - _cur_batch->commit_last_row(); - _cur_add_batch_request.add_tablet_ids(tablet_id); - return Status::OK(); -} - -// Used for vectorized engine. -// TODO(cmy): deprecated, need refactor -Status NodeChannel::add_row(const BlockRow& block_row, int64_t tablet_id) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("add row failed. " + _cancel_msg); - } else { - return std::move(st.prepend("already stopped, can't add row. 
cancelled/eos: ")); - } - } - - while (!_cancelled && _pending_batches_num > 0 && - _pending_batches_bytes > _max_pending_batches_bytes) { - SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - constexpr size_t BATCH_SIZE_FOR_SEND = 2 * 1024 * 1024; //2M - auto row_no = _cur_batch->add_row(); - if (row_no == RowBatch::INVALID_ROW_INDEX || - _cur_batch->tuple_data_pool()->total_allocated_bytes() > BATCH_SIZE_FOR_SEND) { - { - SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - //To simplify the add_row logic, postpone adding batch into req until the time of sending req - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - } - - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - _cur_add_batch_request.clear_tablet_ids(); - - row_no = _cur_batch->add_row(); - } - DCHECK_NE(row_no, RowBatch::INVALID_ROW_INDEX); - - _cur_batch->get_row(row_no)->set_tuple( - 0, block_row.first->deep_copy_tuple(*_tuple_desc, _cur_batch->tuple_data_pool(), - block_row.second, 0, true)); - _cur_batch->commit_last_row(); - _cur_add_batch_request.add_tablet_ids(tablet_id); - return Status::OK(); -} - -void NodeChannel::mark_close() { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - return; - } - - _cur_add_batch_request.set_eos(true); - { - debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - DCHECK(_pending_batches.back().second.eos()); - _close_time_ms = UnixMillis(); - LOG(INFO) << channel_info() - << " mark closed, left pending batch size: " << _pending_batches.size() - << " left pending batch size: " << _pending_batches_bytes; - } - - _eos_is_produced = true; - return; -} - -void NodeChannel::_close_check() { - std::lock_guard lg(_pending_batches_lock); - CHECK(_pending_batches.empty()) << name(); - CHECK(_cur_batch == nullptr) << name(); -} -Status NodeChannel::close_wait(RuntimeState* state) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // set _is_closed to true finally - Defer set_closed {[&]() { - std::lock_guard l(_closed_lock); - _is_closed = true; - }}; - - auto st = none_of({_cancelled, !_eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("wait close failed. {}", _cancel_msg); - } else { - return std::move( - st.prepend("already stopped, skip waiting for close. 
cancelled/!eos: ")); - } - } - - // waiting for finished, it may take a long time, so we couldn't set a timeout - while (!_add_batches_finished && !_cancelled) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - _close_time_ms = UnixMillis() - _close_time_ms; - - if (_add_batches_finished) { - _close_check(); - state->tablet_commit_infos().insert(state->tablet_commit_infos().end(), - std::make_move_iterator(_tablet_commit_infos.begin()), - std::make_move_iterator(_tablet_commit_infos.end())); - - _index_channel->set_error_tablet_in_state(state); - _index_channel->set_tablets_received_rows(_tablets_received_rows, _node_id); - return Status::OK(); - } - - std::stringstream ss; - ss << "close wait failed coz rpc error"; - { - std::lock_guard l(_cancel_msg_lock); - if (_cancel_msg != "") { - ss << ". " << _cancel_msg; - } - } - return Status::InternalError(ss.str()); -} - -void NodeChannel::cancel(const std::string& cancel_msg) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // set _is_closed to true finally - Defer set_closed {[&]() { - std::lock_guard l(_closed_lock); - _is_closed = true; - }}; - // we don't need to wait last rpc finished, cause closure's release/reset will join. - // But do we need brpc::StartCancel(call_id)? - _cancel_with_msg(cancel_msg); - - PTabletWriterCancelRequest request; - request.set_allocated_id(&_parent->_load_id); - request.set_index_id(_index_channel->_index_id); - request.set_sender_id(_parent->_sender_id); - - auto closure = new RefCountClosure(); - - closure->ref(); - int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; - if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { - remain_ms = config::min_load_rpc_timeout_ms; - } - closure->cntl.set_timeout_ms(remain_ms); - if (config::tablet_writer_ignore_eovercrowded) { - closure->cntl.ignore_eovercrowded(); - } - _stub->tablet_writer_cancel(&closure->cntl, &request, &closure->result, closure); - request.release_id(); -} - -int NodeChannel::try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token) { - auto st = none_of({_cancelled, _send_finished}); - if (!st.ok()) { - return 0; - } - - if (!_add_batch_closure->try_set_in_flight()) { - return _send_finished ? 0 : 1; - } - - // We are sure that try_send_batch is not running - if (_pending_batches_num > 0) { - auto s = thread_pool_token->submit_func( - std::bind(&NodeChannel::try_send_batch, this, state)); - if (!s.ok()) { - _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); - // clear in flight - _add_batch_closure->clear_in_flight(); - } - // in_flight is cleared in closure::Run - } else { - // clear in flight - _add_batch_closure->clear_in_flight(); - } - return _send_finished ? 
0 : 1; -} - -void NodeChannel::try_send_batch(RuntimeState* state) { - SCOPED_ATOMIC_TIMER(&_actual_consume_ns); - SCOPED_ATTACH_TASK(state); - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker); - AddBatchReq send_batch; - { - debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; - std::lock_guard l(_pending_batches_lock); - DCHECK(!_pending_batches.empty()); - send_batch = std::move(_pending_batches.front()); - _pending_batches.pop(); - _pending_batches_num--; - _pending_batches_bytes -= send_batch.first->tuple_data_pool()->total_reserved_bytes(); - } - - auto row_batch = std::move(send_batch.first); - auto request = std::move(send_batch.second); // doesn't need to be saved in heap - - // tablet_ids has already set when add row - request.set_packet_seq(_next_packet_seq); - if (row_batch->num_rows() > 0) { - SCOPED_ATOMIC_TIMER(&_serialize_batch_ns); - size_t uncompressed_bytes = 0, compressed_bytes = 0; - Status st = row_batch->serialize(request.mutable_row_batch(), &uncompressed_bytes, - &compressed_bytes, _parent->_transfer_large_data_by_brpc); - if (!st.ok()) { - cancel(fmt::format("{}, err: {}", channel_info(), st.to_string())); - _add_batch_closure->clear_in_flight(); - return; - } - if (compressed_bytes >= double(config::brpc_max_body_size) * 0.95f) { - LOG(WARNING) << "send batch too large, this rpc may failed. send size: " - << compressed_bytes << ", threshold: " << config::brpc_max_body_size - << ", " << channel_info(); - } - } - - int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; - if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { - if (remain_ms <= 0 && !request.eos()) { - cancel(fmt::format("{}, err: timeout", channel_info())); - _add_batch_closure->clear_in_flight(); - return; - } else { - remain_ms = config::min_load_rpc_timeout_ms; - } - } - - // After calling reset(), make sure that the rpc will be called finally. - // Otherwise, when calling _add_batch_closure->join(), it will be blocked forever. - // and _add_batch_closure->join() will be called in ~NodeChannel(). 
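// A minimal sketch (types hypothetical) of the single-flight gate used by
// try_send_and_fetch_status() above and re-armed by the reset() call below:
// each NodeChannel allows at most one add-batch RPC in flight, won by a
// compare-and-swap, and cleared again when the closure's callback runs.
#include <atomic>

class SingleFlightGateSketch {
public:
    // Returns true if the caller won the right to issue the next RPC.
    bool try_set_in_flight() {
        bool expected = false;
        return _in_flight.compare_exchange_strong(expected, true);
    }
    // Called from the RPC callback (or on submit failure) to allow the next send.
    void clear_in_flight() { _in_flight.store(false); }

private:
    std::atomic<bool> _in_flight {false};
};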
- _add_batch_closure->reset(); - _add_batch_closure->cntl.set_timeout_ms(remain_ms); - if (config::tablet_writer_ignore_eovercrowded) { - _add_batch_closure->cntl.ignore_eovercrowded(); - } - - if (request.eos()) { - for (auto pid : _parent->_partition_ids) { - request.add_partition_ids(pid); - } - - request.set_write_single_replica(false); - if (_parent->_write_single_replica) { - request.set_write_single_replica(true); - for (std::unordered_map>::iterator iter = - _slave_tablet_nodes.begin(); - iter != _slave_tablet_nodes.end(); iter++) { - PSlaveTabletNodes slave_tablet_nodes; - for (auto node_id : iter->second) { - auto node = _parent->_nodes_info->find_node(node_id); - if (node == nullptr) { - return; - } - PNodeInfo* pnode = slave_tablet_nodes.add_slave_nodes(); - pnode->set_id(node->id); - pnode->set_option(node->option); - pnode->set_host(node->host); - pnode->set_async_internal_port(config::single_replica_load_brpc_port); - } - request.mutable_slave_tablet_nodes()->insert({iter->first, slave_tablet_nodes}); - } - } - - // eos request must be the last request - _add_batch_closure->end_mark(); - _send_finished = true; - CHECK(_pending_batches_num == 0) << _pending_batches_num; - } - - if (_parent->_transfer_large_data_by_brpc && request.has_row_batch() && - request.row_batch().has_tuple_data() && request.ByteSizeLong() > MIN_HTTP_BRPC_SIZE) { - Status st = request_embed_attachment_contain_tuple< - PTabletWriterAddBatchRequest, ReusableClosure>( - &request, _add_batch_closure); - if (!st.ok()) { - cancel(fmt::format("{}, err: {}", channel_info(), st.to_string())); - _add_batch_closure->clear_in_flight(); - return; - } - std::string brpc_url = fmt::format("http://{}:{}", _node_info.host, _node_info.brpc_port); - std::shared_ptr _brpc_http_stub = - _state->exec_env()->brpc_internal_client_cache()->get_new_client_no_cache(brpc_url, - "http"); - _add_batch_closure->cntl.http_request().uri() = - brpc_url + "/PInternalServiceImpl/tablet_writer_add_batch_by_http"; - _add_batch_closure->cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - _add_batch_closure->cntl.http_request().set_content_type("application/json"); - { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - _brpc_http_stub->tablet_writer_add_batch_by_http(&_add_batch_closure->cntl, NULL, - &_add_batch_closure->result, - _add_batch_closure); - } - } else { - _add_batch_closure->cntl.http_request().Clear(); - { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - _stub->tablet_writer_add_batch(&_add_batch_closure->cntl, &request, - &_add_batch_closure->result, _add_batch_closure); - } - } - _next_packet_seq++; -} - -Status NodeChannel::none_of(std::initializer_list vars) { - bool none = std::none_of(vars.begin(), vars.end(), [](bool var) { return var; }); - Status st = Status::OK(); - if (!none) { - std::string vars_str; - std::for_each(vars.begin(), vars.end(), - [&vars_str](bool var) -> void { vars_str += (var ? 
"1/" : "0/"); }); - if (!vars_str.empty()) { - vars_str.pop_back(); // 0/1/0/ -> 0/1/0 - } - st = Status::InternalError(vars_str); - } - - return st; -} - -void NodeChannel::clear_all_batches() { - std::lock_guard lg(_pending_batches_lock); - std::queue empty; - std::swap(_pending_batches, empty); - _cur_batch.reset(); -} - -Status IndexChannel::init(RuntimeState* state, const std::vector& tablets) { - SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get()); - for (auto& tablet : tablets) { - auto location = _parent->_location->find_tablet(tablet.tablet_id); - if (location == nullptr) { - LOG(WARNING) << "unknown tablet, tablet_id=" << tablet.tablet_id; - return Status::InternalError("unknown tablet"); - } - std::vector> channels; - for (auto& node_id : location->node_ids) { - std::shared_ptr channel; - auto it = _node_channels.find(node_id); - if (it == _node_channels.end()) { - // NodeChannel is not added to the _parent->_pool. - // Because the deconstruction of NodeChannel may take a long time to wait rpc finish. - // but the ObjectPool will hold a spin lock to delete objects. - if (!_is_vectorized) { - channel = std::make_shared(_parent, this, node_id); - } else { - channel = std::make_shared(_parent, this, node_id); - } - _node_channels.emplace(node_id, channel); - } else { - channel = it->second; - } - channel->add_tablet(tablet); - if (_parent->_write_single_replica) { - auto slave_location = _parent->_slave_location->find_tablet(tablet.tablet_id); - if (slave_location != nullptr) { - channel->add_slave_tablet_nodes(tablet.tablet_id, slave_location->node_ids); - } - } - channels.push_back(channel); - _tablets_by_channel[node_id].insert(tablet.tablet_id); - } - _channels_by_tablet.emplace(tablet.tablet_id, std::move(channels)); - } - for (auto& it : _node_channels) { - RETURN_IF_ERROR(it.second->init(state)); - } - return Status::OK(); -} - -void IndexChannel::mark_as_failed(int64_t node_id, const std::string& host, const std::string& err, - int64_t tablet_id) { - VLOG_PROGRESS << "mark node_id:" << node_id << " tablet_id: " << tablet_id - << " as failed, err: " << err; - const auto& it = _tablets_by_channel.find(node_id); - if (it == _tablets_by_channel.end()) { - return; - } - - { - std::lock_guard l(_fail_lock); - if (tablet_id == -1) { - for (const auto the_tablet_id : it->second) { - _failed_channels[the_tablet_id].insert(node_id); - _failed_channels_msgs.emplace(the_tablet_id, err + ", host: " + host); - if (_failed_channels[the_tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) { - _intolerable_failure_status = - Status::InternalError(_failed_channels_msgs[the_tablet_id]); - } - } - } else { - _failed_channels[tablet_id].insert(node_id); - _failed_channels_msgs.emplace(tablet_id, err + ", host: " + host); - if (_failed_channels[tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) { - _intolerable_failure_status = - Status::InternalError(_failed_channels_msgs[tablet_id]); - } - } - } -} - -Status IndexChannel::check_intolerable_failure() { - std::lock_guard l(_fail_lock); - return _intolerable_failure_status; -} - -void IndexChannel::set_error_tablet_in_state(RuntimeState* state) { - std::vector& error_tablet_infos = state->error_tablet_infos(); - - std::lock_guard l(_fail_lock); - for (const auto& it : _failed_channels_msgs) { - TErrorTabletInfo error_info; - error_info.__set_tabletId(it.first); - error_info.__set_msg(it.second); - error_tablet_infos.emplace_back(error_info); - } -} - -void IndexChannel::set_tablets_received_rows( - const std::vector>& 
tablets_received_rows, int64_t node_id) { - for (const auto& [tablet_id, rows_num] : tablets_received_rows) { - _tablets_received_rows[tablet_id].emplace_back(node_id, rows_num); - } -} - -Status IndexChannel::check_tablet_received_rows_consistency() { - for (auto& tablet : _tablets_received_rows) { - for (size_t i = 0; i < tablet.second.size(); i++) { - VLOG_NOTICE << "check_tablet_received_rows_consistency, load_id: " << _parent->_load_id - << ", txn_id: " << std::to_string(_parent->_txn_id) - << ", tablet_id: " << tablet.first - << ", node_id: " << tablet.second[i].first - << ", rows_num: " << tablet.second[i].second; - if (i == 0) { - continue; - } - if (tablet.second[i].second != tablet.second[0].second) { - LOG(WARNING) << "rows num doest't match, load_id: " << _parent->_load_id - << ", txn_id: " << std::to_string(_parent->_txn_id) - << ", tablt_id: " << tablet.first - << ", node_id: " << tablet.second[i].first - << ", rows_num: " << tablet.second[i].second - << ", node_id: " << tablet.second[0].first - << ", rows_num: " << tablet.second[0].second; - return Status::InternalError("rows num written by multi replicas doest't match"); - } - } - } - return Status::OK(); -} - -OlapTableSink::OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& texprs, Status* status) - : _pool(pool), - _input_row_desc(row_desc), - _filter_bitmap(1024), - _stop_background_threads_latch(1) { - if (!_is_vectorized) { - if (!texprs.empty()) { - *status = Expr::create_expr_trees(_pool, texprs, &_output_expr_ctxs); - } - _name = "OlapTableSink"; - } else { - *status = Status::OK(); - } - _transfer_large_data_by_brpc = config::transfer_large_data_by_brpc; -} - -OlapTableSink::~OlapTableSink() { - // We clear NodeChannels' batches here, cuz NodeChannels' batches destruction will use - // OlapTableSink::_mem_tracker and its parents. - // But their destructions are after OlapTableSink's. 
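// The destructor body that follows works around a destruction-order hazard;
// a minimal sketch (types hypothetical) of the same idea: queued batches
// report freed bytes to a tracker they do not own, so the owner must drain
// them explicitly while that tracker is still alive instead of relying on
// implicit member destruction order.
#include <deque>
#include <memory>

struct TrackerSketch {
    long consumed = 0;
    void release(long bytes) { consumed -= bytes; }
};

struct BatchSketch {
    TrackerSketch* tracker; // not owned; must stay valid while the batch lives
    long bytes;
    ~BatchSketch() { tracker->release(bytes); }
};

struct SinkSketch {
    TrackerSketch tracker;
    std::shared_ptr<std::deque<BatchSketch>> channel_batches =
            std::make_shared<std::deque<BatchSketch>>();
    ~SinkSketch() {
        // Channels (the other owners of channel_batches) may outlive this
        // sink, but every queued batch points at `tracker`, so drain the
        // queue here while the tracker is still alive.
        channel_batches->clear();
    }
};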
- for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->clear_all_batches(); }); - } -} - -Status OlapTableSink::init(const TDataSink& t_sink) { - DCHECK(t_sink.__isset.olap_table_sink); - auto& table_sink = t_sink.olap_table_sink; - _load_id.set_hi(table_sink.load_id.hi); - _load_id.set_lo(table_sink.load_id.lo); - _txn_id = table_sink.txn_id; - _num_replicas = table_sink.num_replicas; - _tuple_desc_id = table_sink.tuple_id; - _schema.reset(new OlapTableSchemaParam()); - RETURN_IF_ERROR(_schema->init(table_sink.schema)); - _partition = _pool->add(new OlapTablePartitionParam(_schema, table_sink.partition)); - RETURN_IF_ERROR(_partition->init()); - _location = _pool->add(new OlapTableLocationParam(table_sink.location)); - _nodes_info = _pool->add(new DorisNodesInfo(table_sink.nodes_info)); - if (table_sink.__isset.write_single_replica && table_sink.write_single_replica) { - _write_single_replica = true; - _slave_location = _pool->add(new OlapTableLocationParam(table_sink.slave_location)); - if (!config::enable_single_replica_load) { - return Status::InternalError("single replica load is disabled on BE."); - } - } - - if (table_sink.__isset.load_channel_timeout_s) { - _load_channel_timeout_s = table_sink.load_channel_timeout_s; - } else { - _load_channel_timeout_s = config::streaming_load_rpc_max_alive_time_sec; - } - if (table_sink.__isset.send_batch_parallelism && table_sink.send_batch_parallelism > 1) { - _send_batch_parallelism = table_sink.send_batch_parallelism; - } - // if distributed column list is empty, we can ensure that tablet is with random distribution info - // and if load_to_single_tablet is set and set to true, we should find only one tablet in one partition - // for the whole olap table sink - if (table_sink.partition.distributed_columns.empty()) { - if (table_sink.__isset.load_to_single_tablet && table_sink.load_to_single_tablet) { - findTabletMode = FindTabletMode::FIND_TABLET_EVERY_SINK; - } else { - findTabletMode = FindTabletMode::FIND_TABLET_EVERY_BATCH; - } - } - return Status::OK(); -} - -Status OlapTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - - _sender_id = state->per_fragment_instance_idx(); - _num_senders = state->num_per_fragment_instances(); - _is_high_priority = (state->query_options().query_timeout <= - config::load_task_high_priority_threshold_second); - - // profile must add to state's object pool - _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); - _mem_tracker = - std::make_shared("OlapTableSink:" + std::to_string(state->load_job_id())); - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - - if (!_is_vectorized) { - // Prepare the exprs to run. 
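// Sketch of the routing policy chosen in init() above. With hash distribution the
// tablet index is computed per row; with random distribution it is cached per
// batch, or once per sink when load_to_single_tablet is set. The enum mirrors the
// removed code, but this function is an illustration, not the original logic.
enum class FindTabletMode { EVERY_ROW, EVERY_BATCH, EVERY_SINK };

FindTabletMode pick_find_tablet_mode(bool distributed_columns_empty,
                                     bool load_to_single_tablet) {
    if (!distributed_columns_empty) {
        return FindTabletMode::EVERY_ROW; // hash distribution: route each row
    }
    return load_to_single_tablet ? FindTabletMode::EVERY_SINK   // one tablet per sink
                                 : FindTabletMode::EVERY_BATCH; // one tablet per batch
}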
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _input_row_desc)); - } - - // get table's tuple descriptor - _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); - if (_output_tuple_desc == nullptr) { - LOG(WARNING) << "unknown destination tuple descriptor, id=" << _tuple_desc_id; - return Status::InternalError("unknown destination tuple descriptor"); - } - - _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - - if (!_is_vectorized) { - if (!_output_expr_ctxs.empty()) { - if (_output_expr_ctxs.size() != _output_tuple_desc->slots().size()) { - LOG(WARNING) << "number of exprs is not same with slots, num_exprs=" - << _output_expr_ctxs.size() - << ", num_slots=" << _output_tuple_desc->slots().size(); - return Status::InternalError("number of exprs is not same with slots"); - } - for (int i = 0; i < _output_expr_ctxs.size(); ++i) { - if (!is_type_compatible(_output_expr_ctxs[i]->root()->type().type, - _output_tuple_desc->slots()[i]->type().type)) { - LOG(WARNING) << "type of exprs is not match slot's, expr_type=" - << _output_expr_ctxs[i]->root()->type().type - << ", slot_type=" << _output_tuple_desc->slots()[i]->type().type - << ", slot_name=" << _output_tuple_desc->slots()[i]->col_name(); - return Status::InternalError("expr's type is not same with slot's"); - } - } - } - - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); - } - - _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); - _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); - // check if need validate batch - for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { - auto slot = _output_tuple_desc->slots()[i]; - switch (slot->type().type) { - // For DECIMAL32,DECIMAL64,DECIMAL128, we have done precision and scale conversion so just - // skip data validation here. 
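// Standalone restatement of the prepare()-time schema check above: the number of
// output exprs must equal the number of destination slots, and each pair must be
// type-compatible. TypeId and compatible() are hypothetical stand-ins for the
// engine's PrimitiveType and is_type_compatible().
#include <cstddef>
#include <string>
#include <vector>

enum class TypeId { INT, BIGINT, VARCHAR, DECIMALV2 };

static bool compatible(TypeId expr, TypeId slot) {
    return expr == slot; // the real check also accepts implicitly castable pairs
}

std::string check_output_schema(const std::vector<TypeId>& expr_types,
                                const std::vector<TypeId>& slot_types) {
    if (expr_types.size() != slot_types.size()) {
        return "number of exprs is not the same as the number of slots";
    }
    for (size_t i = 0; i < expr_types.size(); ++i) {
        if (!compatible(expr_types[i], slot_types[i])) {
            return "expr type does not match slot type at column " + std::to_string(i);
        }
    }
    return {}; // empty string means OK
}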
- case TYPE_DECIMALV2: - _max_decimalv2_val[i].to_max_decimal(slot->type().precision, slot->type().scale); - _min_decimalv2_val[i].to_min_decimal(slot->type().precision, slot->type().scale); - _need_validate_data = true; - break; - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_DATE: - case TYPE_DATETIME: - case TYPE_DATEV2: - case TYPE_DATETIMEV2: - case TYPE_HLL: - case TYPE_OBJECT: - case TYPE_STRING: - case TYPE_ARRAY: - _need_validate_data = true; - break; - default: - break; - } - } - - // add all counter - _input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT); - _output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT); - _filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT); - _send_data_timer = ADD_TIMER(_profile, "SendDataTime"); - _wait_mem_limit_timer = ADD_CHILD_TIMER(_profile, "WaitMemLimitTime", "SendDataTime"); - _convert_batch_timer = ADD_TIMER(_profile, "ConvertBatchTime"); - _validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime"); - _open_timer = ADD_TIMER(_profile, "OpenTime"); - _close_timer = ADD_TIMER(_profile, "CloseWaitTime"); - _non_blocking_send_timer = ADD_TIMER(_profile, "NonBlockingSendTime"); - _non_blocking_send_work_timer = - ADD_CHILD_TIMER(_profile, "NonBlockingSendWorkTime", "NonBlockingSendTime"); - _serialize_batch_timer = - ADD_CHILD_TIMER(_profile, "SerializeBatchTime", "NonBlockingSendWorkTime"); - _total_add_batch_exec_timer = ADD_TIMER(_profile, "TotalAddBatchExecTime"); - _max_add_batch_exec_timer = ADD_TIMER(_profile, "MaxAddBatchExecTime"); - _add_batch_number = ADD_COUNTER(_profile, "NumberBatchAdded", TUnit::UNIT); - _num_node_channels = ADD_COUNTER(_profile, "NumberNodeChannels", TUnit::UNIT); - _load_mem_limit = state->get_load_mem_limit(); - - // open all channels - bool use_vec = _is_vectorized && state->be_exec_version() > 0; - const auto& partitions = _partition->get_partitions(); - for (int i = 0; i < _schema->indexes().size(); ++i) { - // collect all tablets belong to this rollup - std::vector tablets; - auto index = _schema->indexes()[i]; - for (const auto& part : partitions) { - for (const auto& tablet : part->indexes[i].tablets) { - TTabletWithPartition tablet_with_partition; - tablet_with_partition.partition_id = part->id; - tablet_with_partition.tablet_id = tablet; - tablets.emplace_back(std::move(tablet_with_partition)); - } - } - if (UNLIKELY(tablets.empty())) { - LOG(WARNING) << "load job:" << state->load_job_id() << " index: " << index->index_id - << " would open 0 tablet"; - } - _channels.emplace_back(new IndexChannel(this, index->index_id, use_vec)); - RETURN_IF_ERROR(_channels.back()->init(state, tablets)); - } - - return Status::OK(); -} - -Status OlapTableSink::open(RuntimeState* state) { - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_TIMER(_open_timer); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - if (!_is_vectorized) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - } - - for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->open(); }); - } - - for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&index_channel]( - const std::shared_ptr& ch) { - auto st = ch->open_wait(); - if (!st.ok()) { - // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. - // This phase will not fail due to a single tablet. 
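// Sketch of the open()/open_wait() split used above: issue every open RPC first,
// then collect results, marking all tablets on a failed node (the -1 sentinel)
// rather than aborting outright. Channel here is a hypothetical stand-in.
#include <cstddef>
#include <functional>
#include <future>
#include <vector>

struct Channel {
    std::future<bool> pending;
    void open() { pending = std::async(std::launch::async, [] { return true; }); }
    bool open_wait() { return pending.get(); }
};

void open_all(std::vector<Channel>& channels,
              const std::function<void(size_t /*node*/)>& mark_node_failed) {
    for (auto& ch : channels) {
        ch.open(); // phase 1: all open RPCs in flight in parallel
    }
    for (size_t i = 0; i < channels.size(); ++i) {
        if (!channels[i].open_wait()) {
            mark_node_failed(i); // phase 2: record, let the quorum check decide later
        }
    }
}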
- // Therefore, if the open() phase fails, all tablets corresponding to the node need to be marked as failed. - index_channel->mark_as_failed( - ch->node_id(), ch->host(), - fmt::format("{}, open failed, err: {}", ch->channel_info(), st.to_string()), - -1); - } - }); - - RETURN_IF_ERROR(index_channel->check_intolerable_failure()); - } - int32_t send_batch_parallelism = - MIN(_send_batch_parallelism, config::max_send_batch_parallelism_per_job); - _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( - ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); - RETURN_IF_ERROR(Thread::create( - "OlapTableSink", "send_batch_process", - [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); - - return Status::OK(); -} - -Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - // update incrementally so that FE can get the progress. - // the real 'num_rows_load_total' will be set when sink being closed. - int64_t num_rows = input_batch->num_rows(); - int64_t num_bytes = input_batch->total_byte_size(); - _number_input_rows += num_rows; - state->update_num_rows_load_total(num_rows); - state->update_num_bytes_load_total(num_bytes); - DorisMetrics::instance()->load_rows->increment(num_rows); - DorisMetrics::instance()->load_bytes->increment(num_bytes); - RowBatch* batch = input_batch; - if (!_output_expr_ctxs.empty()) { - SCOPED_RAW_TIMER(&_convert_batch_ns); - _output_batch->reset(); - RETURN_IF_ERROR(_convert_batch(state, input_batch, _output_batch.get())); - batch = _output_batch.get(); - } - - int filtered_rows = 0; - if (_need_validate_data) { - SCOPED_RAW_TIMER(&_validate_data_ns); - _filter_bitmap.Reset(batch->num_rows()); - bool stop_processing = false; - RETURN_IF_ERROR( - _validate_data(state, batch, &_filter_bitmap, &filtered_rows, &stop_processing)); - _number_filtered_rows += filtered_rows; - if (stop_processing) { - // should be returned after updating "_number_filtered_rows", to make sure that load job can be cancelled - // because of "data unqualified" - return Status::EndOfFile("Encountered unqualified data, stop processing"); - } - } - - SCOPED_RAW_TIMER(&_send_data_ns); - bool stop_processing = false; - if (findTabletMode == FindTabletMode::FIND_TABLET_EVERY_BATCH) { - _partition_to_tablet_map.clear(); - } - for (int i = 0; i < batch->num_rows(); ++i) { - Tuple* tuple = batch->get_row(i)->get_tuple(0); - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const OlapTablePartition* partition = nullptr; - if (!_partition->find_partition(tuple, &partition)) { - RETURN_IF_ERROR(state->append_error_msg_to_file( - []() -> std::string { return ""; }, - [&]() -> std::string { - fmt::memory_buffer buf; - fmt::format_to(buf, "no partition for this tuple. 
tuple={}", - Tuple::to_string(tuple, *_output_tuple_desc)); - return fmt::to_string(buf); - }, - &stop_processing)); - _number_filtered_rows++; - if (stop_processing) { - return Status::EndOfFile("Encountered unqualified data, stop processing"); - } - continue; - } - uint32_t tablet_index = 0; - if (findTabletMode != FindTabletMode::FIND_TABLET_EVERY_ROW) { - if (_partition_to_tablet_map.find(partition->id) == _partition_to_tablet_map.end()) { - tablet_index = _partition->find_tablet(tuple, *partition); - _partition_to_tablet_map.emplace(partition->id, tablet_index); - } else { - tablet_index = _partition_to_tablet_map[partition->id]; - } - } else { - tablet_index = _partition->find_tablet(tuple, *partition); - } - _partition_ids.emplace(partition->id); - for (int j = 0; j < partition->indexes.size(); ++j) { - int64_t tablet_id = partition->indexes[j].tablets[tablet_index]; - _channels[j]->add_row(tuple, tablet_id); - _number_output_rows++; - } - } - - // check intolerable failure - for (const auto& index_channel : _channels) { - RETURN_IF_ERROR(index_channel->check_intolerable_failure()); - } - return Status::OK(); -} - -Status OlapTableSink::close(RuntimeState* state, Status close_status) { - if (_closed) { - /// The close method may be called twice. - /// In the open_internal() method of plan_fragment_executor, close is called once. - /// If an error occurs in this call, it will be called again in fragment_mgr. - /// So here we use a flag to prevent repeated close operations. - return _close_status; - } - Status status = close_status; - if (status.ok()) { - // only if status is ok can we call this _profile->total_time_counter(). - // if status is not ok, this sink may not be prepared, so that _profile is null - SCOPED_TIMER(_profile->total_time_counter()); - // BE id -> add_batch method counter - std::unordered_map node_add_batch_counter_map; - int64_t serialize_batch_ns = 0, mem_exceeded_block_ns = 0, queue_push_lock_ns = 0, - actual_consume_ns = 0, total_add_batch_exec_time_ns = 0, - max_add_batch_exec_time_ns = 0, total_add_batch_num = 0, num_node_channels = 0; - { - SCOPED_TIMER(_close_timer); - for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->mark_close(); }); - num_node_channels += index_channel->num_node_channels(); - } - - for (auto index_channel : _channels) { - int64_t add_batch_exec_time = 0; - index_channel->for_each_node_channel( - [&index_channel, &state, &node_add_batch_counter_map, &serialize_batch_ns, - &mem_exceeded_block_ns, &queue_push_lock_ns, &actual_consume_ns, - &total_add_batch_exec_time_ns, &add_batch_exec_time, - &total_add_batch_num](const std::shared_ptr& ch) { - auto s = ch->close_wait(state); - if (!s.ok()) { - auto err_msg = s.to_string(); - index_channel->mark_as_failed(ch->node_id(), ch->host(), err_msg, - -1); - // cancel the node channel in best effort - ch->cancel(err_msg); - LOG(WARNING) << ch->channel_info() - << ", close channel failed, err: " << err_msg; - } - ch->time_report(&node_add_batch_counter_map, &serialize_batch_ns, - &mem_exceeded_block_ns, &queue_push_lock_ns, - &actual_consume_ns, &total_add_batch_exec_time_ns, - &add_batch_exec_time, &total_add_batch_num); - }); - - if (add_batch_exec_time > max_add_batch_exec_time_ns) { - max_add_batch_exec_time_ns = add_batch_exec_time; - } - - // check if index has intolerable failure - Status index_st = index_channel->check_intolerable_failure(); - if (!index_st.ok()) { - status = index_st; - } else if (Status st = 
index_channel->check_tablet_received_rows_consistency(); - !st.ok()) { - status = st; - } - } // end for index channels - } - // TODO need to be improved - LOG(INFO) << "total mem_exceeded_block_ns=" << mem_exceeded_block_ns - << ", total queue_push_lock_ns=" << queue_push_lock_ns - << ", total actual_consume_ns=" << actual_consume_ns - << ", load id=" << print_id(_load_id); - - COUNTER_SET(_input_rows_counter, _number_input_rows); - COUNTER_SET(_output_rows_counter, _number_output_rows); - COUNTER_SET(_filtered_rows_counter, _number_filtered_rows); - COUNTER_SET(_send_data_timer, _send_data_ns); - COUNTER_SET(_wait_mem_limit_timer, mem_exceeded_block_ns); - COUNTER_SET(_convert_batch_timer, _convert_batch_ns); - COUNTER_SET(_validate_data_timer, _validate_data_ns); - COUNTER_SET(_serialize_batch_timer, serialize_batch_ns); - COUNTER_SET(_non_blocking_send_work_timer, actual_consume_ns); - COUNTER_SET(_total_add_batch_exec_timer, total_add_batch_exec_time_ns); - COUNTER_SET(_max_add_batch_exec_timer, max_add_batch_exec_time_ns); - COUNTER_SET(_add_batch_number, total_add_batch_num); - COUNTER_SET(_num_node_channels, num_node_channels); - // _number_input_rows don't contain num_rows_load_filtered and num_rows_load_unselected in scan node - int64_t num_rows_load_total = _number_input_rows + state->num_rows_load_filtered() + - state->num_rows_load_unselected(); - state->set_num_rows_load_total(num_rows_load_total); - state->update_num_rows_load_filtered(_number_filtered_rows); - - // print log of add batch time of all node, for tracing load performance easily - std::stringstream ss; - ss << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id - << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; - for (auto const& pair : node_add_batch_counter_map) { - ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) - << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" - << pair.second.close_wait_time_ms << ")(" << pair.second.add_batch_num << ")} "; - } - LOG(INFO) << ss.str(); - } else { - for (auto channel : _channels) { - channel->for_each_node_channel([&status](const std::shared_ptr& ch) { - ch->cancel(status.to_string()); - }); - } - LOG(INFO) << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id - << ", canceled all node channels due to error: " << status; - } - - // Sender join() must put after node channels mark_close/cancel. - // But there is no specific sequence required between sender join() & close_wait(). - _stop_background_threads_latch.count_down(); - if (_sender_thread) { - _sender_thread->join(); - // We have to wait all task in _send_batch_thread_pool_token finished, - // because it is difficult to handle concurrent problem if we just - // shutdown it. 
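// Sketch of the shutdown ordering in close() above: channels are marked closed or
// cancelled first, only then is the background sender released and joined, and
// in-flight thread-pool tasks are drained last. Simplified, assumption-level types.
#include <atomic>
#include <chrono>
#include <thread>

struct SenderLoop {
    std::atomic<bool> stop {false};
    std::thread sender;

    void start() {
        sender = std::thread([this] {
            while (!stop.load()) {
                // poll channels here, i.e. try_send_and_fetch_status(), then sleep
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
            }
        });
    }

    void close() {
        // 1. mark_close()/cancel() on every node channel happens before this point
        stop.store(true); // 2. release the latch equivalent
        if (sender.joinable()) {
            sender.join(); // 3. join only after channels were marked closed
        }
        // 4. wait for outstanding thread-pool tasks before destroying the sink
    }
};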
-        _send_batch_thread_pool_token->wait();
-    }
-
-    Expr::close(_output_expr_ctxs, state);
-    _output_batch.reset();
-
-    _close_status = status;
-    DataSink::close(state, close_status);
-    return status;
-}
-
-Status OlapTableSink::_convert_batch(RuntimeState* state, RowBatch* input_batch,
-                                     RowBatch* output_batch) {
-    DCHECK_GE(output_batch->capacity(), input_batch->num_rows());
-    int commit_rows = 0;
-    bool stop_processing = false;
-    for (int i = 0; i < input_batch->num_rows(); ++i) {
-        auto src_row = input_batch->get_row(i);
-        Tuple* dst_tuple =
-                (Tuple*)output_batch->tuple_data_pool()->allocate(_output_tuple_desc->byte_size());
-        bool ignore_this_row = false;
-        for (int j = 0; j < _output_expr_ctxs.size(); ++j) {
-            auto src_val = _output_expr_ctxs[j]->get_value(src_row);
-            auto slot_desc = _output_tuple_desc->slots()[j];
-            // The following logic is similar to BaseScanner::fill_dest_tuple.
-            // TODO(kks): we should unify it.
-            if (src_val == nullptr) {
-                // Only when the expr return value is null do we check the error message.
-                std::string expr_error = _output_expr_ctxs[j]->get_error_msg();
-                if (!expr_error.empty()) {
-                    RETURN_IF_ERROR(state->append_error_msg_to_file(
-                            [&]() -> std::string { return slot_desc->col_name(); },
-                            [&]() -> std::string { return expr_error; }, &stop_processing));
-                    _number_filtered_rows++;
-                    ignore_this_row = true;
-                    // The ctx is reused, so we must clear the error state and message.
-                    _output_expr_ctxs[j]->clear_error_msg();
-                    break;
-                }
-                if (!slot_desc->is_nullable()) {
-                    RETURN_IF_ERROR(state->append_error_msg_to_file(
-                            []() -> std::string { return ""; },
-                            [&]() -> std::string {
-                                fmt::memory_buffer buf;
-                                fmt::format_to(
-                                        buf, "null value for not null column, column={}, type={}",
-                                        slot_desc->col_name(), slot_desc->type().debug_string());
-                                return fmt::to_string(buf);
-                            },
-                            &stop_processing));
-                    _number_filtered_rows++;
-                    ignore_this_row = true;
-                    break;
-                }
-                dst_tuple->set_null(slot_desc->null_indicator_offset());
-                continue;
-            }
-            if (slot_desc->is_nullable()) {
-                dst_tuple->set_not_null(slot_desc->null_indicator_offset());
-            }
-            void* slot = dst_tuple->get_slot(slot_desc->tuple_offset());
-            RawValue::write(src_val, slot, slot_desc->type(), _output_batch->tuple_data_pool());
-        } // end for output expr
-
-        if (!ignore_this_row) {
-            output_batch->get_row(commit_rows)->set_tuple(0, dst_tuple);
-            commit_rows++;
-        }
-
-        if (stop_processing) {
-            return Status::EndOfFile("Encountered unqualified data, stop processing");
-        }
-    }
-    output_batch->commit_rows(commit_rows);
-    return Status::OK();
-}
-
-bool OlapTableSink::_validate_cell(const TypeDescriptor& type, const std::string& col_name,
-                                   void* slot, size_t slot_index, fmt::memory_buffer& error_msg,
-                                   RowBatch* batch) {
-    switch (type.type) {
-    case TYPE_CHAR:
-    case TYPE_VARCHAR: {
-        // Fixed-length string
-        StringValue* str_val = (StringValue*)slot;
-        if (str_val->len > type.len) {
-            fmt::format_to(error_msg, "{}", "the length of input is longer than the schema allows. ");
-            fmt::format_to(error_msg, "column_name: {}; ", col_name);
-            fmt::format_to(error_msg, "input str: [{}] ", std::string(str_val->ptr, str_val->len));
-            fmt::format_to(error_msg, "schema length: {}; ", type.len);
-            fmt::format_to(error_msg, "actual length: {}; ", str_val->len);
-            return false;
-        }
-        // pad CHAR fields with 0
-        if (type.type == TYPE_CHAR && str_val->len < type.len) {
-            auto new_ptr = (char*)batch->tuple_data_pool()->allocate(type.len);
-            memcpy(new_ptr, str_val->ptr, str_val->len);
-            memset(new_ptr + str_val->len, 0, type.len - str_val->len);
-
-            str_val->ptr = new_ptr;
-            str_val->len = type.len;
-        }
-        break;
-    }
-    case TYPE_STRING: {
-        StringValue* str_val = (StringValue*)slot;
-        if (str_val->len > config::string_type_length_soft_limit_bytes) {
-            fmt::format_to(error_msg, "{}", "the length of input is longer than the schema allows. ");
-            fmt::format_to(error_msg, "column_name: {}; ", col_name);
-            fmt::format_to(error_msg, "first 128 bytes of input str: [{}] ",
-                           std::string(str_val->ptr, 128));
-            fmt::format_to(error_msg, "schema length: {}; ",
-                           config::string_type_length_soft_limit_bytes);
-            fmt::format_to(error_msg, "actual length: {}; ", str_val->len);
-            return false;
-        }
-        break;
-    }
-    case TYPE_DECIMALV2: {
-        DecimalV2Value dec_val(reinterpret_cast<PackedInt128*>(slot)->value);
-        if (dec_val.greater_than_scale(type.scale)) {
-            int code = dec_val.round(&dec_val, type.scale, HALF_UP);
-            reinterpret_cast<PackedInt128*>(slot)->value = dec_val.value();
-            if (code != E_DEC_OK) {
-                fmt::format_to(error_msg, "round one decimal failed. value={}; ",
-                               dec_val.to_string());
-                return false;
-            }
-        }
-        if (dec_val > _max_decimalv2_val[slot_index] || dec_val < _min_decimalv2_val[slot_index]) {
-            fmt::format_to(error_msg, "decimal value is not valid for definition, column={}",
-                           col_name);
-            fmt::format_to(error_msg, ", value={}", dec_val.to_string());
-            fmt::format_to(error_msg, ", precision={}, scale={}; ", type.precision, type.scale);
-            return false;
-        }
-        break;
-    }
-    case TYPE_HLL: {
-        Slice* hll_val = (Slice*)slot;
-        if (!HyperLogLog::is_valid(*hll_val)) {
-            fmt::format_to(error_msg, "Content of HLL type column is invalid. 
column name: {}; ", - col_name); - return false; - } - break; - } - case TYPE_ARRAY: { - auto array_val = (CollectionValue*)slot; - DCHECK(type.children.size() == 1); - auto nested_type = type.children[0]; - if (nested_type.type != TYPE_ARRAY && nested_type.type != TYPE_CHAR && - nested_type.type != TYPE_VARCHAR && nested_type.type != TYPE_STRING) { - break; - } - auto iter = array_val->iterator(nested_type.type); - while (iter.has_next()) { - auto data = iter.get(); - // validate array nested element is nullable - if (data == nullptr) { - if (!type.contains_null) { - fmt::format_to(error_msg, - "null element for null nested column of ARRAY, column={}, " - "type={} ", - col_name, type.debug_string()); - return false; - } - } else { - // validate array nested element data - if (!_validate_cell(nested_type, col_name, data, slot_index, error_msg, batch)) { - fmt::format_to(error_msg, "ARRAY or elements invalid"); - return false; - } - } - iter.next(); - } - break; - } - default: - break; - } - return true; -} -Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitmap* filter_bitmap, - int* filtered_rows, bool* stop_processing) { - for (int row_no = 0; row_no < batch->num_rows(); ++row_no) { - Tuple* tuple = batch->get_row(row_no)->get_tuple(0); - bool row_valid = true; - fmt::memory_buffer error_msg; // error message - for (int i = 0; row_valid && i < _output_tuple_desc->slots().size(); ++i) { - SlotDescriptor* desc = _output_tuple_desc->slots()[i]; - if (desc->is_nullable() && tuple->is_null(desc->null_indicator_offset())) { - if (desc->type().type == TYPE_OBJECT) { - fmt::format_to(error_msg, - "null is not allowed for bitmap column, column_name: {}; ", - desc->col_name()); - row_valid = false; - } - continue; - } - void* slot = tuple->get_slot(desc->tuple_offset()); - row_valid = _validate_cell(desc->type(), desc->col_name(), slot, i, error_msg, batch); - } - - if (!row_valid) { - (*filtered_rows)++; - filter_bitmap->Set(row_no, true); - RETURN_IF_ERROR(state->append_error_msg_to_file( - []() -> std::string { return ""; }, - [&]() -> std::string { return fmt::to_string(error_msg); }, stop_processing)); - } - } - return Status::OK(); -} - -void OlapTableSink::_send_batch_process(RuntimeState* state) { - SCOPED_TIMER(_non_blocking_send_timer); - SCOPED_ATTACH_TASK(state); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); - do { - int running_channels_num = 0; - for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&running_channels_num, this, - state](const std::shared_ptr& ch) { - running_channels_num += - ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); - }); - } - - if (running_channels_num == 0) { - LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " - "sender thread exit. " - << print_id(_load_id); - return; - } - } while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::olap_table_sink_send_interval_ms))); -} - -} // namespace stream_load -} // namespace doris diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h deleted file mode 100644 index 2216c89fff..0000000000 --- a/be/src/exec/tablet_sink.h +++ /dev/null @@ -1,583 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "common/object_pool.h" -#include "common/status.h" -#include "exec/data_sink.h" -#include "exec/tablet_info.h" -#include "gen_cpp/Types_types.h" -#include "gen_cpp/internal_service.pb.h" -#include "runtime/thread_context.h" -#include "util/bitmap.h" -#include "util/countdown_latch.h" -#include "util/ref_count_closure.h" -#include "util/spinlock.h" -#include "util/thread.h" - -namespace doris { - -class Bitmap; -class MemTracker; -class RuntimeProfile; -class RowDescriptor; -class ThreadPool; -class ThreadPoolToken; -class Tuple; -class TupleDescriptor; -class ExprContext; -class TExpr; - -namespace vectorized { -class Block; -class MutableBlock; -} // namespace vectorized -namespace stream_load { - -class OlapTableSink; - -// The counter of add_batch rpc of a single node -struct AddBatchCounter { - // total execution time of a add_batch rpc - int64_t add_batch_execution_time_us = 0; - // lock waiting time in a add_batch rpc - int64_t add_batch_wait_execution_time_us = 0; - // number of add_batch call - int64_t add_batch_num = 0; - // time passed between marked close and finish close - int64_t close_wait_time_ms = 0; - - AddBatchCounter& operator+=(const AddBatchCounter& rhs) { - add_batch_execution_time_us += rhs.add_batch_execution_time_us; - add_batch_wait_execution_time_us += rhs.add_batch_wait_execution_time_us; - add_batch_num += rhs.add_batch_num; - close_wait_time_ms += rhs.close_wait_time_ms; - return *this; - } - friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) { - AddBatchCounter sum = lhs; - sum += rhs; - return sum; - } -}; - -// It's very error-prone to guarantee the handler capture vars' & this closure's destruct sequence. -// So using create() to get the closure pointer is recommended. We can delete the closure ptr before the capture vars destruction. -// Delete this point is safe, don't worry about RPC callback will run after ReusableClosure deleted. -template -class ReusableClosure final : public google::protobuf::Closure { -public: - ReusableClosure() : cid(INVALID_BTHREAD_ID) {} - ~ReusableClosure() override { - // shouldn't delete when Run() is calling or going to be called, wait for current Run() done. - join(); - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - cntl.Reset(); - } - - static ReusableClosure* create() { return new ReusableClosure(); } - - void addFailedHandler(const std::function& fn) { failed_handler = fn; } - void addSuccessHandler(const std::function& fn) { success_handler = fn; } - - void join() { - // We rely on in_flight to assure one rpc is running, - // while cid is not reliable due to memory order. - // in_flight is written before getting callid, - // so we can not use memory fence to synchronize. 
- while (_packet_in_flight) { - // cid here is complicated - if (cid != INVALID_BTHREAD_ID) { - // actually cid may be the last rpc call id. - brpc::Join(cid); - } - if (_packet_in_flight) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - } - - // plz follow this order: reset() -> set_in_flight() -> send brpc batch - void reset() { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - cntl.Reset(); - cid = cntl.call_id(); - } - - bool try_set_in_flight() { - bool value = false; - return _packet_in_flight.compare_exchange_strong(value, true); - } - - void clear_in_flight() { _packet_in_flight = false; } - - bool is_packet_in_flight() { return _packet_in_flight; } - - void end_mark() { - DCHECK(_is_last_rpc == false); - _is_last_rpc = true; - } - - void Run() override { - DCHECK(_packet_in_flight); - if (cntl.Failed()) { - LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl.ErrorCode()) - << ", error_text=" << cntl.ErrorText(); - failed_handler(_is_last_rpc); - } else { - success_handler(result, _is_last_rpc); - } - clear_in_flight(); - } - - brpc::Controller cntl; - T result; - -private: - brpc::CallId cid; - std::atomic _packet_in_flight {false}; - std::atomic _is_last_rpc {false}; - std::function failed_handler; - std::function success_handler; -}; - -class IndexChannel; -class NodeChannel { -public: - NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); - virtual ~NodeChannel() noexcept; - - // called before open, used to add tablet located in this backend - void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); } - - virtual Status init(RuntimeState* state); - - void add_slave_tablet_nodes(int64_t tablet_id, const std::vector& slave_nodes) { - _slave_tablet_nodes[tablet_id] = slave_nodes; - } - - // we use open/open_wait to parallel - void open(); - virtual Status open_wait(); - - Status add_row(Tuple* tuple, int64_t tablet_id); - - Status add_row(const BlockRow& block_row, int64_t tablet_id); - - virtual Status add_block(vectorized::Block* block, - const std::pair, - std::vector>& payload) { - LOG(FATAL) << "add block to NodeChannel not supported"; - return Status::OK(); - } - - // two ways to stop channel: - // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. - // 2. just cancel() - virtual void mark_close(); - Status close_wait(RuntimeState* state); - - void cancel(const std::string& cancel_msg); - - // return: - // 0: stopped, send finished(eos request has been sent), or any internal error; - // 1: running, haven't reach eos. - // only allow 1 rpc in flight - // plz make sure, this func should be called after open_wait(). 
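// Sketch of the single-RPC-in-flight guard used by the closure above: a
// compare-and-swap gates sending, the completion callback clears the flag, and
// join() spins until it is clear. Illustrative shape, not the removed class.
#include <atomic>
#include <chrono>
#include <thread>

struct InFlightGuard {
    std::atomic<bool> in_flight {false};

    bool try_set_in_flight() {
        bool expected = false; // succeeds only if no rpc is currently outstanding
        return in_flight.compare_exchange_strong(expected, true);
    }

    void clear_in_flight() { in_flight.store(false); } // called from Run()

    void join() {
        while (in_flight.load()) { // block until the outstanding callback has run
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }
    }
};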
- virtual int try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token); - - void try_send_batch(RuntimeState* state); - - void time_report(std::unordered_map* add_batch_counter_map, - int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, - int64_t* queue_push_lock_ns, int64_t* actual_consume_ns, - int64_t* total_add_batch_exec_time_ns, int64_t* add_batch_exec_time_ns, - int64_t* total_add_batch_num) { - (*add_batch_counter_map)[_node_id] += _add_batch_counter; - (*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms; - *serialize_batch_ns += _serialize_batch_ns; - *mem_exceeded_block_ns += _mem_exceeded_block_ns; - *queue_push_lock_ns += _queue_push_lock_ns; - *actual_consume_ns += _actual_consume_ns; - *add_batch_exec_time_ns = (_add_batch_counter.add_batch_execution_time_us * 1000); - *total_add_batch_exec_time_ns += *add_batch_exec_time_ns; - *total_add_batch_num += _add_batch_counter.add_batch_num; - } - - int64_t node_id() const { return _node_id; } - std::string host() const { return _node_info.host; } - std::string name() const { return _name; } - - Status none_of(std::initializer_list vars); - - void clear_all_batches(); - - virtual void clear_all_blocks() {} - - std::string channel_info() const { - return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host, - _node_info.brpc_port); - } - - size_t get_pending_bytes() { return _pending_batches_bytes; } - -protected: - void _cancel_with_msg(const std::string& msg); - - virtual void _close_check(); - -protected: - bool _is_vectorized = false; - OlapTableSink* _parent = nullptr; - IndexChannel* _index_channel = nullptr; - int64_t _node_id = -1; - std::string _load_info; - std::string _name; - - std::shared_ptr _node_channel_tracker; - - TupleDescriptor* _tuple_desc = nullptr; - NodeInfo _node_info; - - // this should be set in init() using config - int _rpc_timeout_ms = 60000; - int64_t _next_packet_seq = 0; - MonotonicStopWatch _timeout_watch; - - // the timestamp when this node channel be marked closed and finished closed - uint64_t _close_time_ms = 0; - - // user cancel or get some errors - std::atomic _cancelled {false}; - SpinLock _cancel_msg_lock; - std::string _cancel_msg = ""; - - // send finished means the consumer thread which send the rpc can exit - std::atomic _send_finished {false}; - - // add batches finished means the last rpc has be response, used to check whether this channel can be closed - std::atomic _add_batches_finished {false}; // reuse for vectorized - - bool _eos_is_produced {false}; // only for restricting producer behaviors - - std::unique_ptr _row_desc; - int _batch_size = 0; - - // limit _pending_batches size - std::atomic _pending_batches_bytes {0}; - size_t _max_pending_batches_bytes {(size_t)config::nodechannel_pending_queue_max_bytes}; - std::mutex _pending_batches_lock; // reuse for vectorized - std::atomic _pending_batches_num {0}; // reuse for vectorized - - std::shared_ptr _stub = nullptr; - RefCountClosure* _open_closure = nullptr; - - std::vector _all_tablets; - // map from tablet_id to node_id where slave replicas locate in - std::unordered_map> _slave_tablet_nodes; - std::vector _tablet_commit_infos; - - AddBatchCounter _add_batch_counter; - std::atomic _serialize_batch_ns {0}; - std::atomic _mem_exceeded_block_ns {0}; - std::atomic _queue_push_lock_ns {0}; - std::atomic _actual_consume_ns {0}; - - // lock to protect _is_closed. - // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel. 
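// Sketch of the backpressure implied by _pending_batches_bytes above: a producer
// only enqueues while the pending-queue byte size is under the configured cap,
// otherwise it waits for the sender to drain. Hypothetical, simplified shape.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <thread>

struct PendingQueueGate {
    std::atomic<size_t> pending_bytes {0};
    size_t max_pending_bytes; // e.g. config::nodechannel_pending_queue_max_bytes

    explicit PendingQueueGate(size_t cap) : max_pending_bytes(cap) {}

    void wait_for_room(size_t incoming) {
        while (pending_bytes.load() + incoming > max_pending_bytes) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1)); // stall producer
        }
    }
    void on_enqueue(size_t bytes) { pending_bytes.fetch_add(bytes); }
    void on_sent(size_t bytes) { pending_bytes.fetch_sub(bytes); }
};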
- // However, this rpc callback may occur after the whole task is finished (e.g. due to network latency), - // and by that time the IndexChannel may have been destructured, so we should not call the - // IndexChannel methods anymore, otherwise the BE will crash. - // Therefore, we use the _is_closed and _closed_lock to ensure that the RPC callback - // function will not call the IndexChannel method after the NodeChannel is closed. - // The IndexChannel is definitely accessible until the NodeChannel is closed. - std::mutex _closed_lock; - bool _is_closed = false; - - RuntimeState* _state; - // rows number received per tablet, tablet_id -> rows_num - std::vector> _tablets_received_rows; - -private: - std::unique_ptr _cur_batch; - PTabletWriterAddBatchRequest _cur_add_batch_request; - using AddBatchReq = std::pair, PTabletWriterAddBatchRequest>; - std::queue _pending_batches; - ReusableClosure* _add_batch_closure = nullptr; -}; - -class IndexChannel { -public: - IndexChannel(OlapTableSink* parent, int64_t index_id, bool is_vec) - : _parent(parent), _index_id(index_id), _is_vectorized(is_vec) { - _index_channel_tracker = - std::make_unique("IndexChannel:indexID=" + std::to_string(_index_id)); - } - ~IndexChannel() = default; - - Status init(RuntimeState* state, const std::vector& tablets); - - template - void add_row(const Row& tuple, int64_t tablet_id); - - void for_each_node_channel( - const std::function&)>& func) { - for (auto& it : _node_channels) { - func(it.second); - } - } - - void mark_as_failed(int64_t node_id, const std::string& host, const std::string& err, - int64_t tablet_id = -1); - Status check_intolerable_failure(); - - // set error tablet info in runtime state, so that it can be returned to FE. - void set_error_tablet_in_state(RuntimeState* state); - - size_t num_node_channels() const { return _node_channels.size(); } - - size_t get_pending_bytes() const { - size_t mem_consumption = 0; - for (auto& kv : _node_channels) { - mem_consumption += kv.second->get_pending_bytes(); - } - return mem_consumption; - } - - void set_tablets_received_rows( - const std::vector>& tablets_received_rows, int64_t node_id); - - // check whether the rows num written by different replicas is consistent - Status check_tablet_received_rows_consistency(); - -private: - friend class NodeChannel; - friend class VNodeChannel; - friend class VOlapTableSink; - - OlapTableSink* _parent; - int64_t _index_id; - bool _is_vectorized = false; - - // from backend channel to tablet_id - // ATTN: must be placed before `_node_channels` and `_channels_by_tablet`. - // Because the destruct order of objects is opposite to the creation order. - // So NodeChannel will be destructured first. - // And the destructor function of NodeChannel waits for all RPCs to finish. - // This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC. 
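// Sketch of the _is_closed/_closed_lock idiom described above: the rpc callback
// re-checks the flag under the lock, so it can never touch the IndexChannel after
// close. Minimal hypothetical shape, not the removed class.
#include <mutex>
#include <utility>

class CallbackGuard {
public:
    void mark_closed() {
        std::lock_guard<std::mutex> l(_lock);
        _closed = true;
    }

    // Invoked from the rpc callback thread.
    template <typename Fn>
    void run_if_open(Fn&& fn) {
        std::lock_guard<std::mutex> l(_lock);
        if (!_closed) {
            std::forward<Fn>(fn)(); // safe: close cannot finish while we hold the lock
        }
    }

private:
    std::mutex _lock;
    bool _closed = false;
};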
-    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
-    // BeId -> channel
-    std::unordered_map<int64_t, std::shared_ptr<NodeChannel>> _node_channels;
-    // from tablet_id to backend channel
-    std::unordered_map<int64_t, std::vector<std::shared_ptr<NodeChannel>>> _channels_by_tablet;
-
-    // lock to protect _failed_channels and _failed_channels_msgs
-    mutable SpinLock _fail_lock;
-    // key is tablet_id, value is a set of failed node ids
-    std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
-    // key is tablet_id, value is the error message
-    std::unordered_map<int64_t, std::string> _failed_channels_msgs;
-    Status _intolerable_failure_status = Status::OK();
-
-    std::unique_ptr<MemTracker> _index_channel_tracker;
-    // rows num received by DeltaWriter per tablet, tablet_id -> <node_id, rows_num>,
-    // used to verify whether the rows num received by different replicas is consistent
-    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_received_rows;
-};
-
-template <typename Row>
-void IndexChannel::add_row(const Row& tuple, int64_t tablet_id) {
-    SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get());
-    auto it = _channels_by_tablet.find(tablet_id);
-    DCHECK(it != _channels_by_tablet.end()) << "unknown tablet, tablet_id=" << tablet_id;
-    for (const auto& channel : it->second) {
-        // if this node channel has already failed, this add_row will be skipped
-        auto st = channel->add_row(tuple, tablet_id);
-        if (!st.ok()) {
-            mark_as_failed(channel->node_id(), channel->host(), st.to_string(), tablet_id);
-            // continue adding rows to the other nodes; the error is checked for every batch outside
-        }
-    }
-}
-
-// Write data to an OLAP table.
-// When OlapTableSink::open() is called, a consumer thread starts running in the background.
-// When you call OlapTableSink::send(), you are the producer who produces pending batches.
-// Join the consumer thread in close().
-class OlapTableSink : public DataSink {
-public:
-    // Construct from the thrift struct which is generated by FE.
-    OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector<TExpr>& texprs,
-                  Status* status);
-    ~OlapTableSink() override;
-
-    Status init(const TDataSink& sink) override;
-
-    Status prepare(RuntimeState* state) override;
-
-    Status open(RuntimeState* state) override;
-
-    Status send(RuntimeState* state, RowBatch* batch) override;
-
-    // close() will send RPCs too. If the RPCs fail, return an error.
-    Status close(RuntimeState* state, Status close_status) override;
-
-    // Returns the runtime profile for the sink.
-    RuntimeProfile* profile() override { return _profile; }
-
-private:
-    // convert the input batch to the output batch which will be loaded into the OLAP table.
-    // this is only used in insert statements.
-    Status _convert_batch(RuntimeState* state, RowBatch* input_batch, RowBatch* output_batch);
-
-    // make the input data valid for the OLAP table.
-    // returns the number of invalid/filtered rows;
-    // the invalid row numbers are set in the Bitmap.
-    // stop_processing is set if we want to stop the whole process now.
-    Status _validate_data(RuntimeState* state, RowBatch* batch, Bitmap* filter_bitmap,
-                          int* filtered_rows, bool* stop_processing);
-    bool _validate_cell(const TypeDescriptor& type, const std::string& col_name, void* slot,
-                        size_t slot_index, fmt::memory_buffer& error_msg, RowBatch* batch);
-
-    // the consumer func that sends pending batches in every NodeChannel.
-    // uses polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending. 
- // only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer - void _send_batch_process(RuntimeState* state); - -protected: - friend class NodeChannel; - friend class VNodeChannel; - friend class IndexChannel; - - bool _is_vectorized = false; - - std::shared_ptr _mem_tracker; - - ObjectPool* _pool; - const RowDescriptor& _input_row_desc; - - // unique load id - PUniqueId _load_id; - int64_t _txn_id = -1; - int _num_replicas = -1; - int _tuple_desc_id = -1; - - // this is tuple descriptor of destination OLAP table - TupleDescriptor* _output_tuple_desc = nullptr; - RowDescriptor* _output_row_desc = nullptr; - - bool _need_validate_data = false; - - // number of senders used to insert into OlapTable, if we only support single node insert, - // all data from select should collectted and then send to OlapTable. - // To support multiple senders, we maintain a channel for each sender. - int _sender_id = -1; - int _num_senders = -1; - bool _is_high_priority = false; - - // TODO(zc): think about cache this data - std::shared_ptr _schema; - OlapTableLocationParam* _location = nullptr; - bool _write_single_replica = false; - OlapTableLocationParam* _slave_location = nullptr; - DorisNodesInfo* _nodes_info = nullptr; - - RuntimeProfile* _profile = nullptr; - - std::set _partition_ids; - // only used for partition with random distribution - std::map _partition_to_tablet_map; - - Bitmap _filter_bitmap; - - // index_channel - std::vector> _channels; - - CountDownLatch _stop_background_threads_latch; - scoped_refptr _sender_thread; - std::unique_ptr _send_batch_thread_pool_token; - - std::vector _max_decimalv2_val; - std::vector _min_decimalv2_val; - - // Stats for this - int64_t _convert_batch_ns = 0; - int64_t _validate_data_ns = 0; - int64_t _send_data_ns = 0; - int64_t _number_input_rows = 0; - int64_t _number_output_rows = 0; - int64_t _number_filtered_rows = 0; - - RuntimeProfile::Counter* _input_rows_counter = nullptr; - RuntimeProfile::Counter* _output_rows_counter = nullptr; - RuntimeProfile::Counter* _filtered_rows_counter = nullptr; - RuntimeProfile::Counter* _send_data_timer = nullptr; - RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr; - RuntimeProfile::Counter* _convert_batch_timer = nullptr; - RuntimeProfile::Counter* _validate_data_timer = nullptr; - RuntimeProfile::Counter* _open_timer = nullptr; - RuntimeProfile::Counter* _close_timer = nullptr; - RuntimeProfile::Counter* _non_blocking_send_timer = nullptr; - RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr; - RuntimeProfile::Counter* _serialize_batch_timer = nullptr; - RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr; - RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr; - RuntimeProfile::Counter* _add_batch_number = nullptr; - RuntimeProfile::Counter* _num_node_channels = nullptr; - - // load mem limit is for remote load channel - int64_t _load_mem_limit = -1; - - // the timeout of load channels opened by this tablet sink. in second - int64_t _load_channel_timeout_s = 0; - - int32_t _send_batch_parallelism = 1; - // Save the status of close() method - Status _close_status; - - // User can change this config at runtime, avoid it being modified during query or loading process. 
- bool _transfer_large_data_by_brpc = false; - - // FIND_TABLET_EVERY_ROW is used for both hash and random distribution info, which indicates that we - // should compute tablet index for every row - // FIND_TABLET_EVERY_BATCH is only used for random distribution info, which indicates that we should - // compute tablet index for every row batch - // FIND_TABLET_EVERY_SINK is only used for random distribution info, which indicates that we should - // only compute tablet index in the corresponding partition once for the whole time in olap table sink - enum FindTabletMode { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK }; - FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW; - -private: - OlapTablePartitionParam* _partition = nullptr; - std::vector _output_expr_ctxs; - std::unique_ptr _output_batch; -}; - -} // namespace stream_load -} // namespace doris diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 8ebed0ccdb..9b2c7149ed 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -24,6 +24,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "exec/parquet_scanner.h" #include "olap/row.h" #include "olap/rowset/rowset_id_generator.h" #include "olap/rowset/rowset_meta_manager.h" @@ -32,7 +33,6 @@ #include "olap/tablet.h" #include "olap/tablet_schema.h" #include "runtime/exec_env.h" -#include "vec/exec/vparquet_scanner.h" namespace doris { using namespace ErrorCode; @@ -821,9 +821,9 @@ Status PushBrokerReader::init(const Schema* schema, const TBrokerScanRange& t_sc BaseScanner* scanner = nullptr; switch (t_scan_range.ranges[0].format_type) { case TFileFormatType::FORMAT_PARQUET: - scanner = new vectorized::VParquetScanner( - _runtime_state.get(), _runtime_profile, t_scan_range.params, t_scan_range.ranges, - t_scan_range.broker_addresses, _pre_filter_texprs, _counter.get()); + scanner = new ParquetScanner(_runtime_state.get(), _runtime_profile, t_scan_range.params, + t_scan_range.ranges, t_scan_range.broker_addresses, + _pre_filter_texprs, _counter.get()); break; default: LOG(WARNING) << "Unsupported file format type: " << t_scan_range.ranges[0].format_type; diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index 3d42ae0faf..36c75ffbef 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -59,10 +59,6 @@ set(RUNTIME_FILES disk_io_mgr.cc disk_io_mgr_reader_context.cc disk_io_mgr_scan_range.cc - buffered_block_mgr2.cc - buffered_tuple_stream2.cc - buffered_tuple_stream3.cc - export_sink.cpp load_channel_mgr.cpp load_channel.cpp tablets_channel.cpp @@ -85,9 +81,7 @@ set(RUNTIME_FILES small_file_mgr.cpp record_batch_queue.cpp result_queue_mgr.cpp - memory_scratch_sink.cpp external_scan_context_mgr.cpp - mysql_result_writer.cpp memory/system_allocator.cpp memory/chunk_allocator.cpp memory/mem_tracker_limiter.cpp @@ -96,16 +90,8 @@ set(RUNTIME_FILES fold_constant_executor.cpp cache/result_node.cpp cache/result_cache.cpp - odbc_table_sink.cpp ) -if (WITH_MYSQL) - set(RUNTIME_FILES ${RUNTIME_FILES} - mysql_table_writer.cpp - mysql_table_sink.cpp - ) -endif() - if (USE_JEMALLOC AND USE_MEM_TRACKER) set(RUNTIME_FILES ${RUNTIME_FILES} memory/jemalloc_hook.cpp diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc deleted file mode 100644 index 1754be4dc1..0000000000 --- a/be/src/runtime/buffered_block_mgr2.cc +++ /dev/null @@ -1,1216 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one 
-// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/buffered-block-mgr2.cc -// and modified by Doris - -#include "runtime/buffered_block_mgr2.h" - -#include "exec/exec_node.h" -#include "runtime/exec_env.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/runtime_state.h" -#include "runtime/tmp_file_mgr.h" -#include "util/bit_util.h" -#include "util/debug_util.h" -#include "util/disk_info.h" -#include "util/doris_metrics.h" -#include "util/pretty_printer.h" -#include "util/runtime_profile.h" -#include "util/stack_util.h" -#include "util/uid_util.h" - -using std::string; -using std::stringstream; -using std::vector; -using std::list; -using std::endl; - -using std::bind; -using std::mem_fn; -using std::lock_guard; -using std::mutex; -using std::shared_ptr; -using std::unique_lock; - -namespace doris { -using namespace ErrorCode; - -BufferedBlockMgr2::BlockMgrsMap BufferedBlockMgr2::_s_query_to_block_mgrs; -SpinLock BufferedBlockMgr2::_s_block_mgrs_lock; - -class BufferedBlockMgr2::Client { -public: - Client(BufferedBlockMgr2* mgr, int num_reserved_buffers, RuntimeState* state) - : _mgr(mgr), - _state(state), - _tracker(std::make_unique("BufferedBlockMgr2::Client")), - _num_reserved_buffers(num_reserved_buffers), - _num_tmp_reserved_buffers(0), - _num_pinned_buffers(0) {} - - // A null dtor to pass codestyle check - ~Client() {} - - // Unowned. - BufferedBlockMgr2* _mgr; - - // Unowned. - RuntimeState* _state; - - // Tracker for this client. Unowned. - // When the client gets a buffer, we update the consumption on this tracker. However, - // we don't want to transfer the buffer from the block mgr to the client (i.e. release - // from the block mgr), since the block mgr is where the block mem usage limit is - // enforced. Even when we give a buffer to a client, the buffer is still owned and - // counts against the block mgr tracker (i.e. there is a fixed pool of buffers - // regardless of if they are in the block mgr or the clients). - std::unique_ptr _tracker; - - // Number of buffers reserved by this client. - int _num_reserved_buffers; - - // Number of buffers temporarily reserved. - int _num_tmp_reserved_buffers; - - // Number of buffers pinned by this client. 
- int _num_pinned_buffers; - - void pin_buffer(BufferDescriptor* buffer) { - DCHECK(buffer != nullptr); - if (buffer->len == _mgr->max_block_size()) { - ++_num_pinned_buffers; - _tracker->consume(buffer->len); - } - } - - void unpin_buffer(BufferDescriptor* buffer) { - DCHECK(buffer != nullptr); - if (buffer->len == _mgr->max_block_size()) { - DCHECK_GT(_num_pinned_buffers, 0); - --_num_pinned_buffers; - _tracker->release(buffer->len); - } - } - - string debug_string() const { - stringstream ss; - ss << "Client " << this << endl - << " num_reserved_buffers=" << _num_reserved_buffers << endl - << " num_tmp_reserved_buffers=" << _num_tmp_reserved_buffers << endl - << " num_pinned_buffers=" << _num_pinned_buffers; - return ss.str(); - } -}; - -// BufferedBlockMgr2::Block methods. -BufferedBlockMgr2::Block::Block(BufferedBlockMgr2* block_mgr) - : _buffer_desc(nullptr), - _block_mgr(block_mgr), - _client(nullptr), - _write_range(nullptr), - _tmp_file(nullptr), - _valid_data_len(0), - _num_rows(0) {} - -Status BufferedBlockMgr2::Block::pin(bool* pinned, Block* release_block, bool unpin) { - return _block_mgr->pin_block(this, pinned, release_block, unpin); -} - -Status BufferedBlockMgr2::Block::unpin() { - return _block_mgr->unpin_block(this); -} - -void BufferedBlockMgr2::Block::del() { - _block_mgr->delete_block(this); -} - -void BufferedBlockMgr2::Block::init() { - // No locks are taken because the block is new or has previously been deleted. - _is_pinned = false; - _in_write = false; - _is_deleted = false; - _valid_data_len = 0; - _client = nullptr; - _num_rows = 0; -} - -bool BufferedBlockMgr2::Block::validate() const { - if (_is_deleted && (_is_pinned || (!_in_write && _buffer_desc != nullptr))) { - LOG(ERROR) << "Deleted block in use - " << debug_string(); - return false; - } - - if (_buffer_desc == nullptr && (_is_pinned || _in_write)) { - LOG(ERROR) << "Block without buffer in use - " << debug_string(); - return false; - } - - if (_buffer_desc == nullptr && _block_mgr->_unpinned_blocks.contains(this)) { - LOG(ERROR) << "Unpersisted block without buffer - " << debug_string(); - return false; - } - - if (_buffer_desc != nullptr && (_buffer_desc->block != this)) { - LOG(ERROR) << "Block buffer inconsistency - " << debug_string(); - return false; - } - - return true; -} - -string BufferedBlockMgr2::Block::tmp_file_path() const { - if (_tmp_file == nullptr) { - return ""; - } - return _tmp_file->path(); -} - -string BufferedBlockMgr2::Block::debug_string() const { - stringstream ss; - ss << "Block: " << this << endl - << " Buffer Desc: " << _buffer_desc << endl - << " Data Len: " << _valid_data_len << endl - << " Num Rows: " << _num_rows << endl; - if (_is_pinned) { - ss << " Buffer Len: " << buffer_len() << endl; - } - ss << " Deleted: " << _is_deleted << endl - << " Pinned: " << _is_pinned << endl - << " Write Issued: " << _in_write << endl - << " Client Local: " << _client_local; - return ss.str(); -} - -BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, - int64_t block_size) - : _max_block_size(block_size), - // Keep two writes in flight per scratch disk so the disks can stay busy. 
-          _block_write_threshold(tmp_file_mgr->num_active_tmp_devices() * 2),
-          _enable_spill(state->enable_spill()),
-          _query_id(state->query_id()),
-          _tmp_file_mgr(tmp_file_mgr),
-          _initialized(false),
-          _unfullfilled_reserved_buffers(0),
-          _total_pinned_buffers(0),
-          _non_local_outstanding_writes(0),
-          _io_mgr(state->exec_env()->disk_io_mgr()),
-          _is_cancelled(false),
-          _writes_issued(0),
-          _state(state) {}
-
-Status BufferedBlockMgr2::create(RuntimeState* state, RuntimeProfile* profile,
-                                 TmpFileMgr* tmp_file_mgr, int64_t block_size,
-                                 std::shared_ptr<BufferedBlockMgr2>* block_mgr) {
-    block_mgr->reset();
-    {
-        // we do not use the global BlockMgrsMap for now, to avoid mem-limit-exceeded errors
-        // across different fragments running on the same machine.
-        // TODO(lingbin): enable it later. note that it should be enabled together with the
-        // query mem limit in RuntimeState.
-
-        // lock_guard<SpinLock> lock(_s_block_mgrs_lock);
-        // BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(state->query_id());
-        // if (it != _s_query_to_block_mgrs.end()){
-        //     *block_mgr = it->second.lock();
-        // }
-        if (*block_mgr == nullptr) {
-            // weak_ptr::lock returns nullptr if the weak_ptr is expired. This means
-            // all shared_ptr references have gone to 0 and it is in the process of
-            // being deleted. This can happen if the last shared reference is released
-            // but before the weak ptr is removed from the map.
-            block_mgr->reset(new BufferedBlockMgr2(state, tmp_file_mgr, block_size));
-            // _s_query_to_block_mgrs[state->query_id()] = *block_mgr;
-        }
-    }
-    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile);
-    return Status::OK();
-}
-
-int64_t BufferedBlockMgr2::available_buffers(Client* client) const {
-    int64_t unused_reserved = client->_num_reserved_buffers + client->_num_tmp_reserved_buffers -
-                              client->_num_pinned_buffers;
-    return std::max<int64_t>(0, remaining_unreserved_buffers()) +
-           std::max<int64_t>(0, unused_reserved);
-}
-
-int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const {
-    int64_t num_buffers =
-            _free_io_buffers.size() + _unpinned_blocks.size() + _non_local_outstanding_writes;
-    num_buffers += thread_context()->thread_mem_tracker()->spare_capacity() / max_block_size();
-    num_buffers -= _unfullfilled_reserved_buffers;
-    return num_buffers;
-}
-
-Status BufferedBlockMgr2::register_client(int num_reserved_buffers, RuntimeState* state,
-                                          Client** client) {
-    DCHECK_GE(num_reserved_buffers, 0);
-    Client* a_client = new Client(this, num_reserved_buffers, state);
-    lock_guard<mutex> lock(_lock);
-    *client = _obj_pool.add(a_client);
-    _unfullfilled_reserved_buffers += num_reserved_buffers;
-    return Status::OK();
-}
-
-void BufferedBlockMgr2::clear_reservations(Client* client) {
-    lock_guard<mutex> lock(_lock);
-    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        _unfullfilled_reserved_buffers -=
-                client->_num_reserved_buffers - client->_num_pinned_buffers;
-    }
-    client->_num_reserved_buffers = 0;
-
-    _unfullfilled_reserved_buffers -= client->_num_tmp_reserved_buffers;
-    client->_num_tmp_reserved_buffers = 0;
-}
-
-bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buffers) {
-    lock_guard<mutex> lock(_lock);
-    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
-    DCHECK_EQ(client->_num_tmp_reserved_buffers, 0);
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        // If the client has unused reserved buffers, we use those first. 
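// Two sizing rules from the code above, isolated as plain functions (illustrative
// names, not the removed implementation): the write threshold keeps two
// unpinned-block writes in flight per active scratch device so every disk stays
// busy, and available_buffers() lets a client use any globally unreserved buffer
// plus whatever part of its own reservation it has not pinned yet, with negative
// intermediates clamped to zero.
#include <algorithm>
#include <cstdint>

int block_write_threshold(int num_active_tmp_devices) {
    return num_active_tmp_devices * 2; // two in-flight writes per scratch disk
}

int64_t available_buffers(int64_t remaining_unreserved, int64_t num_reserved,
                          int64_t num_tmp_reserved, int64_t num_pinned) {
    int64_t unused_reserved = num_reserved + num_tmp_reserved - num_pinned;
    return std::max<int64_t>(0, remaining_unreserved) +
           std::max<int64_t>(0, unused_reserved);
}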
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        // If the client has unused reserved buffers, we use those first.
-        num_buffers -= client->_num_reserved_buffers - client->_num_pinned_buffers;
-    }
-    if (num_buffers < 0) {
-        return true;
-    }
-    if (available_buffers(client) < num_buffers) {
-        return false;
-    }
-
-    client->_num_tmp_reserved_buffers = num_buffers;
-    _unfullfilled_reserved_buffers += num_buffers;
-    return true;
-}
-
-void BufferedBlockMgr2::cancel() {
-    {
-        lock_guard<mutex> lock(_lock);
-        if (_is_cancelled) {
-            return;
-        }
-        _is_cancelled = true;
-    }
-    // Cancel the underlying io mgr to unblock any waiting threads.
-    _io_mgr->cancel_context(_io_request_context);
-}
-
-bool BufferedBlockMgr2::is_cancelled() {
-    lock_guard<mutex> lock(_lock);
-    return _is_cancelled;
-}
-
-Status BufferedBlockMgr2::mem_limit_too_low_error(Client* client, int node_id) {
-    VLOG_QUERY << "Query: " << _query_id << ". Node=" << node_id << " ran out of memory: " << endl
-               << debug_internal() << endl
-               << client->debug_string();
-
-    // TODO: what to print here. We can't know the value of the entire query here.
-    stringstream error_msg;
-    error_msg << "The memory limit is set too low to initialize spilling operator (id=" << node_id
-              << "). The minimum required memory to spill this operator is "
-              << PrettyPrinter::print(client->_num_reserved_buffers * max_block_size(),
-                                      TUnit::BYTES)
-              << ".";
-    return add_exec_msg(error_msg.str());
-}
-
-Status BufferedBlockMgr2::add_exec_msg(const std::string& msg) const {
-    stringstream str;
-    str << msg << " ";
-    str << "Backend: " << BackendOptions::get_localhost() << ", ";
-    str << "fragment: " << print_id(_state->fragment_instance_id()) << " ";
-    return Status::MemoryLimitExceeded(str.str());
-}
-
-Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Block** block,
-                                        int64_t len) {
-    DCHECK_LE(len, _max_block_size) << "Cannot request block bigger than max_len";
-    DCHECK_NE(len, 0) << "Cannot request block of zero size";
-    *block = nullptr;
-    Block* new_block = nullptr;
-
-    {
-        lock_guard<mutex> lock(_lock);
-        if (_is_cancelled) {
-            return Status::Cancelled("Cancelled");
-        }
-        new_block = get_unused_block(client);
-        DCHECK(new_block->validate()) << endl << new_block->debug_string();
-        DCHECK_EQ(new_block->_client, client);
-
-        if (len > 0 && len < _max_block_size) {
-            DCHECK(unpin_block == nullptr);
-            Status st = thread_context()->thread_mem_tracker()->check_limit(len);
-            WARN_IF_ERROR(st, "get_new_block failed");
-            if (st) {
-                client->_tracker->consume(len);
-                // TODO: Have a cache of unused blocks of size 'len' (0, _max_block_size)
-                uint8_t* buffer = new uint8_t[len];
-                // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
-                new_block->_buffer_desc = new BufferDescriptor(buffer, len);
-                new_block->_buffer_desc->block = new_block;
-                new_block->_is_pinned = true;
-                client->pin_buffer(new_block->_buffer_desc);
-                ++_total_pinned_buffers;
-                *block = new_block;
-            } else {
-                new_block->_is_deleted = true;
-                return_unused_block(new_block);
-            }
-            return Status::OK();
-        }
-    }
-
-    bool in_mem = true;
-    RETURN_IF_ERROR(find_buffer_for_block(new_block, &in_mem));
-    DCHECK(!in_mem) << "A new block cannot start in mem.";
-    DCHECK(!new_block->is_pinned() || new_block->_buffer_desc != nullptr)
-            << new_block->debug_string();
-
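// Caller-side view of the branch below (a minimal sketch, not code from this
// file; the helper name is hypothetical): supplying a pinned 'unpin_block'
// means get_new_block() can always hand out a pinned block, taking the buffer
// from 'unpin_block' under memory pressure.
static Status example_switch_blocks(BufferedBlockMgr2* mgr, BufferedBlockMgr2::Client* client,
                                    BufferedBlockMgr2::Block* old_block,
                                    BufferedBlockMgr2::Block** new_block) {
    // 'old_block' must be pinned; on success it has been unpinned (or its
    // buffer transferred) and '*new_block' is pinned.
    RETURN_IF_ERROR(mgr->get_new_block(client, old_block, new_block));
    DCHECK(*new_block != nullptr && (*new_block)->is_pinned());
    return Status::OK();
}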
-    if (!new_block->is_pinned()) {
-        if (unpin_block == nullptr) {
-            // We couldn't get a new block and no unpin block was provided. Can't return
-            // a block.
-            new_block->_is_deleted = true;
-            return_unused_block(new_block);
-            new_block = nullptr;
-        } else {
-            // We need to transfer the buffer from unpin_block to new_block.
-            RETURN_IF_ERROR(transfer_buffer(new_block, unpin_block, true));
-        }
-    } else if (unpin_block != nullptr) {
-        // Got a new block without needing to transfer. Just unpin this block.
-        RETURN_IF_ERROR(unpin_block->unpin());
-    }
-
-    DCHECK(new_block == nullptr || new_block->is_pinned());
-    *block = new_block;
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) {
-    Status status = Status::OK();
-    DCHECK(dst != nullptr);
-    DCHECK(src != nullptr);
-
-    // First write out the src block.
-    DCHECK(src->_is_pinned);
-    DCHECK(!dst->_is_pinned);
-    DCHECK(dst->_buffer_desc == nullptr);
-    DCHECK_EQ(src->_buffer_desc->len, _max_block_size);
-    src->_is_pinned = false;
-
-    if (unpin) {
-        unique_lock<mutex> lock(_lock);
-        src->_client_local = true;
-        status = write_unpinned_block(src);
-        if (!status.ok()) {
-            // The transfer failed, return the buffer to src.
-            src->_is_pinned = true;
-            return status;
-        }
-        // Wait for the write to complete.
-        while (src->_in_write && !_is_cancelled) {
-            src->_write_complete_cv.wait(lock);
-        }
-        if (_is_cancelled) {
-            // We can't be sure the write succeeded, so return the buffer to src.
-            src->_is_pinned = true;
-            return Status::Cancelled("Cancelled");
-        }
-        DCHECK(!src->_in_write);
-    }
-    // Assign the buffer to the new block.
-    dst->_buffer_desc = src->_buffer_desc;
-    dst->_buffer_desc->block = dst;
-    src->_buffer_desc = nullptr;
-    dst->_is_pinned = true;
-    if (!unpin) {
-        src->_is_deleted = true;
-        return_unused_block(src);
-    }
-    return Status::OK();
-}
-
-BufferedBlockMgr2::~BufferedBlockMgr2() {
-    {
-        lock_guard<SpinLock> lock(_s_block_mgrs_lock);
-        BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(_query_id);
-        // IMPALA-2286: Another fragment may have called create() for this _query_id and
-        // saw that this BufferedBlockMgr2 is being destructed. That fragment will
-        // overwrite the map entry for _query_id, pointing it to a different
-        // BufferedBlockMgr2 object. We should let that object's destructor remove the
-        // entry. On the other hand, if the second BufferedBlockMgr2 is destructed before
-        // this thread acquires the lock, then we'll remove the entry (because we can't
-        // distinguish between the two expired pointers), and when the other
-        // ~BufferedBlockMgr2() call occurs, it won't find an entry for this _query_id.
-        if (it != _s_query_to_block_mgrs.end()) {
-            std::shared_ptr<BufferedBlockMgr2> mgr = it->second.lock();
-            if (mgr.get() == nullptr) {
-                // The BufferedBlockMgr2 object referenced by this entry is being destructed.
-                _s_query_to_block_mgrs.erase(it);
-            } else {
-                // The map references another (still valid) BufferedBlockMgr2.
-                DCHECK_NE(mgr.get(), this);
-            }
-        }
-    }
-
-    if (_io_request_context != nullptr) {
-        _io_mgr->unregister_context(_io_request_context);
-    }
-
-    // If there are any outstanding writes and we are here it means that when the
-    // write_complete() callback gets executed it is going to access invalid memory.
-    // See IMPALA-1890.
-    DCHECK_EQ(_non_local_outstanding_writes, 0) << endl << debug_internal();
-    // Delete tmp files.
-    for (auto& file : _tmp_files) {
-        file->remove();
-    }
-    _tmp_files.clear();
-
-    // Free memory resources.
-    for (BufferDescriptor* buffer : _all_io_buffers) {
-        _mem_tracker->release(buffer->len);
-        delete[] buffer->buffer;
-    }
-}
-
-int64_t BufferedBlockMgr2::bytes_allocated() const {
-    return _mem_tracker->consumption();
-}
-
-int BufferedBlockMgr2::num_pinned_buffers(Client* client) const {
-    return client->_num_pinned_buffers;
-}
-
-int BufferedBlockMgr2::num_reserved_buffers_remaining(Client* client) const {
-    return std::max(client->_num_reserved_buffers - client->_num_pinned_buffers, 0);
-}
-
-MemTracker* BufferedBlockMgr2::get_tracker(Client* client) const {
-    return client->_tracker.get();
-}
-
-// TODO: It would be good if we had a sync primitive that supports is_mine() calls, see
-// IMPALA-1884.
-Status BufferedBlockMgr2::delete_or_unpin_block(Block* block, bool unpin) {
-    if (block == nullptr) {
-        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
-    }
-    if (unpin) {
-        return block->unpin();
-    } else {
-        block->del();
-        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
-    }
-}
-
-Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_block, bool unpin) {
-    DCHECK(block != nullptr);
-    DCHECK(!block->_is_deleted);
-    *pinned = false;
-    if (block->_is_pinned) {
-        *pinned = true;
-        return delete_or_unpin_block(release_block, unpin);
-    }
-
-    bool in_mem = false;
-    RETURN_IF_ERROR(find_buffer_for_block(block, &in_mem));
-    *pinned = block->_is_pinned;
-
-    // Block was not evicted or had no data, nothing left to do.
-    if (in_mem || block->_valid_data_len == 0) {
-        return delete_or_unpin_block(release_block, unpin);
-    }
-
-    if (!block->_is_pinned) {
-        if (release_block == nullptr) {
-            return Status::OK();
-        }
-
-        if (block->_buffer_desc != nullptr) {
-            {
-                lock_guard<mutex> lock(_lock);
-                if (_free_io_buffers.contains(block->_buffer_desc)) {
-                    DCHECK(!block->_is_pinned && !block->_in_write &&
-                           !_unpinned_blocks.contains(block))
-                            << endl
-                            << block->debug_string();
-                    _free_io_buffers.remove(block->_buffer_desc);
-                } else if (_unpinned_blocks.contains(block)) {
-                    _unpinned_blocks.remove(block);
-                } else {
-                    DCHECK(block->_in_write);
-                }
-                block->_is_pinned = true;
-                *pinned = true;
-                block->_client->pin_buffer(block->_buffer_desc);
-                ++_total_pinned_buffers;
-                RETURN_IF_ERROR(write_unpinned_blocks());
-            }
-            return delete_or_unpin_block(release_block, unpin);
-        }
-
-        RETURN_IF_ERROR(transfer_buffer(block, release_block, unpin));
-        DCHECK(!release_block->_is_pinned);
-        release_block = nullptr; // Handled by transfer.
-        DCHECK(block->_is_pinned);
-        *pinned = true;
-    }
-
-    // Read the block from disk if it was not in memory.
-    DCHECK(block->_write_range != nullptr) << block->debug_string() << endl << release_block;
-    SCOPED_TIMER(_disk_read_timer);
-    // Create a ScanRange to perform the read.
-    DiskIoMgr::ScanRange* scan_range = _obj_pool.add(new DiskIoMgr::ScanRange());
-    scan_range->reset(nullptr, block->_write_range->file(), block->_write_range->len(),
-                      block->_write_range->offset(), block->_write_range->disk_id(), false, block,
-                      DiskIoMgr::ScanRange::NEVER_CACHE);
-    vector<DiskIoMgr::ScanRange*> ranges(1, scan_range);
-    RETURN_IF_ERROR(_io_mgr->add_scan_ranges(_io_request_context, ranges, true));
-
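// The scan range may come back in several IO-mgr-sized chunks: the loop below
// copies each chunk at the running offset and stops at the buffer flagged eosr
// (end of scan range), after which the DCHECK verifies that the full write
// range was reassembled into the block's buffer.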
-    // Read from the io mgr buffer into the block's assigned buffer.
-    int64_t offset = 0;
-    bool buffer_eosr = false;
-    do {
-        DiskIoMgr::BufferDescriptor* io_mgr_buffer;
-        RETURN_IF_ERROR(scan_range->get_next(&io_mgr_buffer));
-        memcpy(block->buffer() + offset, io_mgr_buffer->buffer(), io_mgr_buffer->len());
-        offset += io_mgr_buffer->len();
-        buffer_eosr = io_mgr_buffer->eosr();
-        io_mgr_buffer->return_buffer();
-    } while (!buffer_eosr);
-    DCHECK_EQ(offset, block->_write_range->len());
-
-    return delete_or_unpin_block(release_block, unpin);
-}
-
-Status BufferedBlockMgr2::unpin_block(Block* block) {
-    DCHECK(!block->_is_deleted) << "Unpin for deleted block.";
-
-    lock_guard<mutex> unpinned_lock(_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-    DCHECK(block->validate()) << endl << block->debug_string();
-    if (!block->_is_pinned) {
-        return Status::OK();
-    }
-    DCHECK_EQ(block->_buffer_desc->len, _max_block_size) << "Can only unpin io blocks.";
-    DCHECK(validate()) << endl << debug_internal();
-    // Add 'block' to the list of unpinned blocks and set _is_pinned to false.
-    // Cache its position in the list for later removal.
-    block->_is_pinned = false;
-    DCHECK(!_unpinned_blocks.contains(block)) << " Unpin for block in unpinned list";
-    if (!block->_in_write) {
-        _unpinned_blocks.enqueue(block);
-    }
-    block->_client->unpin_buffer(block->_buffer_desc);
-    if (block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
-        ++_unfullfilled_reserved_buffers;
-    }
-    --_total_pinned_buffers;
-    RETURN_IF_ERROR(write_unpinned_blocks());
-    DCHECK(validate()) << endl << debug_internal();
-    DCHECK(block->validate()) << endl << block->debug_string();
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::write_unpinned_blocks() {
-    if (!_enable_spill) {
-        return Status::OK();
-    }
-
-    // Assumes block manager lock is already taken.
-    while (_non_local_outstanding_writes + _free_io_buffers.size() < _block_write_threshold &&
-           !_unpinned_blocks.empty()) {
-        // Pop a block from the back of the list (LIFO).
-        Block* write_block = _unpinned_blocks.pop_back();
-        write_block->_client_local = false;
-        RETURN_IF_ERROR(write_unpinned_block(write_block));
-        ++_non_local_outstanding_writes;
-    }
-    DCHECK(validate()) << endl << debug_internal();
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::write_unpinned_block(Block* block) {
-    // Assumes block manager lock is already taken.
-    DCHECK(!block->_is_pinned) << block->debug_string();
-    DCHECK(!block->_in_write) << block->debug_string();
-    DCHECK_EQ(block->_buffer_desc->len, _max_block_size);
-
-    if (block->_write_range == nullptr) {
-        if (_tmp_files.empty()) {
-            RETURN_IF_ERROR(init_tmp_files());
-        }
-
-        // First time the block is being persisted - need to allocate tmp file space.
-        TmpFileMgr::File* tmp_file;
-        int64_t file_offset;
-        RETURN_IF_ERROR(allocate_scratch_space(_max_block_size, &tmp_file, &file_offset));
-        int disk_id = tmp_file->disk_id();
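// Any disk id is acceptable in the fallback below as long as writes spread
// out: the static counter round-robins such blocks across the local disks via
// the modulo that follows.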
-        if (disk_id < 0) {
-            // Assign a valid disk id to the write range if the tmp file was not assigned one.
-            static unsigned int next_disk_id = 0;
-            disk_id = ++next_disk_id;
-        }
-        disk_id %= _io_mgr->num_local_disks();
-        DiskIoMgr::WriteRange::WriteDoneCallback callback = bind(
-                mem_fn(&BufferedBlockMgr2::write_complete), this, block, std::placeholders::_1);
-        block->_write_range = _obj_pool.add(
-                new DiskIoMgr::WriteRange(tmp_file->path(), file_offset, disk_id, callback));
-        block->_tmp_file = tmp_file;
-    }
-
-    uint8_t* outbuf = block->buffer();
-
-    block->_write_range->set_data(outbuf, block->_valid_data_len);
-
-    // Issue write through DiskIoMgr.
-    RETURN_IF_ERROR(_io_mgr->add_write_range(_io_request_context, block->_write_range));
-    block->_in_write = true;
-    DCHECK(block->validate()) << endl << block->debug_string();
-    _outstanding_writes_counter->update(1);
-    _bytes_written_counter->update(block->_valid_data_len);
-    ++_writes_issued;
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::allocate_scratch_space(int64_t block_size, TmpFileMgr::File** tmp_file,
-                                                 int64_t* file_offset) {
-    // Assumes block manager lock is already taken.
-    vector<string> errs;
-    // Find the next physical file in round-robin order and create a write range for it.
-    for (int attempt = 0; attempt < _tmp_files.size(); ++attempt) {
-        *tmp_file = _tmp_files[_next_block_index].get();
-        _next_block_index = (_next_block_index + 1) % _tmp_files.size();
-        if ((*tmp_file)->is_blacklisted()) {
-            continue;
-        }
-        Status status = (*tmp_file)->allocate_space(_max_block_size, file_offset);
-        if (status.ok()) {
-            return Status::OK();
-        }
-        // Log the error and try other files if there was a problem. Problematic files will be
-        // blacklisted so we will not repeatedly log the same error.
-        LOG(WARNING) << "Error while allocating temporary file range: " << status
-                     << ". Will try another temporary file.";
-        errs.emplace_back(status.to_string());
-    }
-    Status err_status = Status::InternalError(
-            "No usable temporary files: space could not be allocated on any temporary device.");
-    for (int i = 0; i < errs.size(); ++i) {
-        err_status.append(errs[i]);
-    }
-    return err_status;
-}
-
-void BufferedBlockMgr2::write_complete(Block* block, const Status& write_status) {
-    Status status = Status::OK();
-    lock_guard<mutex> lock(_lock);
-    _outstanding_writes_counter->update(-1);
-    DCHECK(validate()) << endl << debug_internal();
-    DCHECK(_is_cancelled || block->_in_write) << "write_complete() for block not in write." << endl
-                                              << block->debug_string();
-    if (!block->_client_local) {
-        DCHECK_GT(_non_local_outstanding_writes, 0) << block->debug_string();
-        --_non_local_outstanding_writes;
-    }
-    block->_in_write = false;
-
-    // Explicitly release our temporarily allocated buffer here so that it doesn't
-    // hang around needlessly.
-
-    // return_unused_block() will clear the block, so save the client pointer.
-    // We have to be careful while touching the state because it may have been cleaned up by
-    // another thread.
-    RuntimeState* state = block->_client->_state;
-    // If the block was re-pinned when it was in the IOMgr queue, don't free it.
-    if (block->_is_pinned) {
-        // The number of outstanding writes has decreased but the number of free buffers
-        // hasn't.
-        DCHECK(!block->_client_local)
-                << "Client should be waiting. No one should have pinned this block.";
-        if (write_status.ok() && !_is_cancelled && !state->is_cancelled()) {
-            status = write_unpinned_blocks();
-        }
-    } else if (block->_client_local) {
-        DCHECK(!block->_is_deleted)
-                << "Client should be waiting. No one should have deleted this block.";
-        block->_write_complete_cv.notify_one();
-    } else {
-        DCHECK_EQ(block->_buffer_desc->len, _max_block_size)
-                << "Only io sized buffers should spill";
-        _free_io_buffers.enqueue(block->_buffer_desc);
-        // Finish the delete_block() work.
-        if (block->_is_deleted) {
-            block->_buffer_desc->block = nullptr;
-            block->_buffer_desc = nullptr;
-            return_unused_block(block);
-        }
-        // Multiple threads may be waiting for the same block in find_buffer(). Wake them
-        // all up. One thread will get this block, and the others will re-evaluate whether
-        // they should continue waiting and if another write needs to be initiated.
-        _buffer_available_cv.notify_all();
-    }
-    DCHECK(validate()) << endl << debug_internal();
-
-    if (!write_status.ok() || !status.ok() || _is_cancelled) {
-        VLOG_FILE << "Query: " << _query_id
-                  << ". Write did not complete successfully: "
-                     "write_status="
-                  << write_status << ", status=" << status << ". _is_cancelled=" << _is_cancelled;
-
-        // If the instance is already cancelled, don't confuse things with these errors.
-        if (!write_status.is<ErrorCode::CANCELLED>() && !state->is_cancelled()) {
-            if (!write_status.ok()) {
-                // Report but do not attempt to recover from write error.
-                DCHECK(block->_tmp_file != nullptr);
-                block->_tmp_file->report_io_error(write_status.to_string());
-                VLOG_QUERY << "Query: " << _query_id << " write complete callback with error.";
-                state->log_error(write_status.to_string());
-            }
-            if (!status.ok()) {
-                VLOG_QUERY << "Query: " << _query_id << " error while writing unpinned blocks.";
-                state->log_error(status.to_string());
-            }
-        }
-        // Set cancelled and wake up waiting threads if an error occurred. Note that in
-        // the case of _client_local, that thread was woken up above.
-        _is_cancelled = true;
-        _buffer_available_cv.notify_all();
-    }
-}
-
-void BufferedBlockMgr2::delete_block(Block* block) {
-    DCHECK(!block->_is_deleted);
-
-    lock_guard<mutex> lock(_lock);
-    DCHECK(block->validate()) << endl << debug_internal();
-    block->_is_deleted = true;
-
-    if (block->_is_pinned) {
-        if (block->is_max_size()) {
-            --_total_pinned_buffers;
-        }
-        block->_client->unpin_buffer(block->_buffer_desc);
-        // Only if the block is io-sized do we need to change _unfullfilled_reserved_buffers.
-        if (block->is_max_size() &&
-            block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
-            ++_unfullfilled_reserved_buffers;
-        }
-        block->_is_pinned = false;
-    } else if (_unpinned_blocks.contains(block)) {
-        // Remove block from unpinned list.
-        _unpinned_blocks.remove(block);
-    }
-
-    if (block->_in_write) {
-        DCHECK(block->_buffer_desc != nullptr && block->_buffer_desc->len == _max_block_size)
-                << "Should never be writing a small buffer";
-        // If a write is still pending, return. Cleanup will be done in write_complete().
-        DCHECK(block->validate()) << endl << block->debug_string();
-        return;
-    }
-
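// Small non-IO-sized buffers are deliberately not recycled (see the header
// notes on small buffers), so the branch below frees the buffer and its
// descriptor outright; IO-sized buffers are instead returned to the free list
// in the else branch.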
-    if (block->_buffer_desc != nullptr) {
-        if (block->_buffer_desc->len != _max_block_size) {
-            // Just delete the block for now.
-            delete[] block->_buffer_desc->buffer;
-            block->_client->_tracker->release(block->_buffer_desc->len);
-            delete block->_buffer_desc;
-            block->_buffer_desc = nullptr;
-        } else {
-            if (!_free_io_buffers.contains(block->_buffer_desc)) {
-                _free_io_buffers.enqueue(block->_buffer_desc);
-                _buffer_available_cv.notify_one();
-            }
-            block->_buffer_desc->block = nullptr;
-            block->_buffer_desc = nullptr;
-        }
-    }
-    return_unused_block(block);
-    DCHECK(block->validate()) << endl << block->debug_string();
-    DCHECK(validate()) << endl << debug_internal();
-}
-
-void BufferedBlockMgr2::return_unused_block(Block* block) {
-    DCHECK(block->_is_deleted) << block->debug_string();
-    DCHECK(!block->_is_pinned) << block->debug_string();
-    DCHECK(block->_buffer_desc == nullptr);
-    block->init();
-    _unused_blocks.enqueue(block);
-}
-
-Status BufferedBlockMgr2::find_buffer_for_block(Block* block, bool* in_mem) {
-    DCHECK(block != nullptr);
-    Client* client = block->_client;
-    DCHECK(client != nullptr);
-    DCHECK(!block->_is_pinned && !block->_is_deleted) << "Pinned or deleted block " << endl
-                                                      << block->debug_string();
-    *in_mem = false;
-
-    unique_lock<mutex> l(_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-
-    // First check if there is enough reserved memory to satisfy this request.
-    bool is_reserved_request = false;
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        is_reserved_request = true;
-    } else if (client->_num_tmp_reserved_buffers > 0) {
-        is_reserved_request = true;
-        --client->_num_tmp_reserved_buffers;
-    }
-
-    DCHECK(validate()) << endl << debug_internal();
-    if (is_reserved_request) {
-        --_unfullfilled_reserved_buffers;
-    }
-
-    if (!is_reserved_request && remaining_unreserved_buffers() < 1) {
-        // The client already has its quota and there are no unreserved blocks left.
-        // Note that even if this passes, it is still possible for the path below to
-        // see OOM because another query consumed memory from the process tracker. This
-        // only happens if the buffer has not already been allocated by the block mgr.
-        // This check should ensure that the memory cannot be consumed by another client
-        // of the block mgr.
-        return Status::OK();
-    }
-
-    if (block->_buffer_desc != nullptr) {
-        // The block is in memory. It may be in 3 states:
-        // 1. In the unpinned list. The buffer will not be in the free list.
-        // 2. _in_write == true. The buffer will not be in the free list.
-        // 3. The buffer is free, but hasn't yet been reassigned to a different block.
-        DCHECK_EQ(block->_buffer_desc->len, max_block_size()) << "Non-I/O blocks are always pinned";
-        DCHECK(_unpinned_blocks.contains(block) || block->_in_write ||
-               _free_io_buffers.contains(block->_buffer_desc));
-        if (_unpinned_blocks.contains(block)) {
-            _unpinned_blocks.remove(block);
-            DCHECK(!_free_io_buffers.contains(block->_buffer_desc));
-        } else if (block->_in_write) {
-            DCHECK(block->_in_write && !_free_io_buffers.contains(block->_buffer_desc));
-        } else {
-            _free_io_buffers.remove(block->_buffer_desc);
-        }
-        _buffered_pin_counter->update(1);
-        *in_mem = true;
-    } else {
-        BufferDescriptor* buffer_desc = nullptr;
-        RETURN_IF_ERROR(find_buffer(l, &buffer_desc));
-
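// An "optional" request is one from a client already at or over its
// reservation: it may simply come back without a buffer under an OK status.
// Only reservation-backed requests escalate to the memory-limit error below.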
-        if (buffer_desc == nullptr) {
-            // There are no free buffers or blocks we can evict. We need to fail this request.
-            // If this is an optional request, return OK; if it is required, return OOM.
-            if (!is_reserved_request) {
-                return Status::OK();
-            }
-
-            if (VLOG_QUERY_IS_ON) {
-                stringstream ss;
-                ss << "Query id=" << _query_id << " was unable to get minimum required buffers."
-                   << endl
-                   << debug_internal() << endl
-                   << client->debug_string();
-                VLOG_QUERY << ss.str();
-            }
-            return add_exec_msg(
-                    "Query did not have enough memory to get the minimum required "
-                    "buffers in the block manager.");
-        }
-
-        DCHECK(buffer_desc != nullptr);
-        DCHECK_EQ(buffer_desc->len, max_block_size()) << "Non-I/O buffer";
-        if (buffer_desc->block != nullptr) {
-            // This buffer was assigned to a block but now we are reusing it. Reset the
-            // previous block->buffer link.
-            DCHECK(buffer_desc->block->validate()) << endl << buffer_desc->block->debug_string();
-            buffer_desc->block->_buffer_desc = nullptr;
-        }
-        buffer_desc->block = block;
-        block->_buffer_desc = buffer_desc;
-    }
-    DCHECK(block->_buffer_desc != nullptr);
-    DCHECK(block->_buffer_desc->len < max_block_size() || !block->_is_pinned)
-            << "Trying to pin already pinned block. " << block->_buffer_desc->len << " "
-            << block->_is_pinned;
-    block->_is_pinned = true;
-    client->pin_buffer(block->_buffer_desc);
-    ++_total_pinned_buffers;
-
-    DCHECK(block->validate()) << endl << block->debug_string();
-    // The number of free buffers has decreased. Write unpinned blocks if the number
-    // of free buffers is below the threshold.
-    RETURN_IF_ERROR(write_unpinned_blocks());
-    DCHECK(validate()) << endl << debug_internal();
-    return Status::OK();
-}
-
-// We need to find a new buffer. We prefer getting this buffer in this order:
-// 1. Allocate a new block if the number of free blocks is less than the write
-//    threshold, until we run out of memory.
-// 2. Pick a buffer from the free list.
-// 3. Wait and evict an unpinned buffer.
-Status BufferedBlockMgr2::find_buffer(unique_lock<mutex>& lock, BufferDescriptor** buffer_desc) {
-    *buffer_desc = nullptr;
-
-    // First, try to allocate a new buffer.
-    if (_free_io_buffers.size() < _block_write_threshold &&
-        thread_context()->thread_mem_tracker()->check_limit(_max_block_size)) {
-        _mem_tracker->consume(_max_block_size);
-        uint8_t* new_buffer = new uint8_t[_max_block_size];
-        *buffer_desc = _obj_pool.add(new BufferDescriptor(new_buffer, _max_block_size));
-        (*buffer_desc)->all_buffers_it =
-                _all_io_buffers.insert(_all_io_buffers.end(), *buffer_desc);
-        return Status::OK();
-    }
-
-    // Second, try to pick a buffer from the free list.
-    if (_free_io_buffers.empty()) {
-        // There are no free buffers. If spilling is disabled or there are no unpinned
-        // blocks we can write, return. We can't get a buffer.
-        if (!_enable_spill) {
-            return add_exec_msg(
-                    "Spilling has been disabled for plans, and "
-                    "current memory usage has reached the bottleneck. "
-                    "You can avoid this behavior by increasing the mem limit via the "
-                    "session variable exec_mem_limit, or by setting enable_spilling.");
-        }
-
-        // Third, this block needs to use a buffer that was unpinned from another block.
-        // Get a free buffer from the front of the queue and assign it to the block.
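// write_complete() wakes all waiters with notify_all(), so several threads can
// race for a single freed buffer; the losers re-evaluate the condition and
// fall back into the wait loop below.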
-        do {
-            if (_unpinned_blocks.empty() && _non_local_outstanding_writes == 0) {
-                return Status::OK();
-            }
-            SCOPED_TIMER(_buffer_wait_timer);
-            // Try to evict unpinned blocks before waiting.
-            RETURN_IF_ERROR(write_unpinned_blocks());
-            DCHECK_GT(_non_local_outstanding_writes, 0) << endl << debug_internal();
-            _buffer_available_cv.wait(lock);
-            if (_is_cancelled) {
-                return Status::Cancelled("Cancelled");
-            }
-        } while (_free_io_buffers.empty());
-    }
-    *buffer_desc = _free_io_buffers.dequeue();
-    return Status::OK();
-}
-
-BufferedBlockMgr2::Block* BufferedBlockMgr2::get_unused_block(Client* client) {
-    DCHECK(client != nullptr);
-    Block* new_block = nullptr;
-    if (_unused_blocks.empty()) {
-        new_block = _obj_pool.add(new Block(this));
-        new_block->init();
-        _created_block_counter->update(1);
-    } else {
-        new_block = _unused_blocks.dequeue();
-        _recycled_blocks_counter->update(1);
-    }
-    DCHECK(new_block != nullptr);
-    new_block->_client = client;
-    return new_block;
-}
-
-bool BufferedBlockMgr2::validate() const {
-    int num_free_io_buffers = 0;
-
-    if (_total_pinned_buffers < 0) {
-        LOG(ERROR) << "_total_pinned_buffers < 0: " << _total_pinned_buffers;
-        return false;
-    }
-
-    for (BufferDescriptor* buffer : _all_io_buffers) {
-        bool is_free = _free_io_buffers.contains(buffer);
-        num_free_io_buffers += is_free;
-
-        if (*buffer->all_buffers_it != buffer) {
-            LOG(ERROR) << "All buffers list is corrupt. Buffer iterator is not valid.";
-            return false;
-        }
-
-        if (buffer->block == nullptr && !is_free) {
-            LOG(ERROR) << "Buffer with no block not in free list." << endl << debug_internal();
-            return false;
-        }
-
-        if (buffer->len != _max_block_size) {
-            LOG(ERROR) << "Non-io sized buffers should not end up on free list.";
-            return false;
-        }
-
-        if (buffer->block != nullptr) {
-            if (buffer->block->_buffer_desc != buffer) {
-                LOG(ERROR) << "buffer<->block pointers inconsistent. Buffer: " << buffer << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-
-            if (!buffer->block->validate()) {
-                LOG(ERROR) << "buffer->block inconsistent." << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-
-            if (is_free && (buffer->block->_is_pinned || buffer->block->_in_write ||
-                            _unpinned_blocks.contains(buffer->block))) {
-                LOG(ERROR) << "Block with buffer in free list and"
-                           << " _is_pinned = " << buffer->block->_is_pinned
-                           << " _in_write = " << buffer->block->_in_write
-                           << " _unpinned_blocks.contains = "
-                           << _unpinned_blocks.contains(buffer->block) << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-        }
-    }
-
-    if (_free_io_buffers.size() != num_free_io_buffers) {
-        LOG(ERROR) << "_free_buffer_list inconsistency."
-                   << " num_free_io_buffers = " << num_free_io_buffers
-                   << " _free_io_buffers.size() = " << _free_io_buffers.size() << endl
-                   << debug_internal();
-        return false;
-    }
-
-    Block* block = _unpinned_blocks.head();
-    while (block != nullptr) {
-        if (!block->validate()) {
-            LOG(ERROR) << "Block inconsistent in unpinned list." << endl << block->debug_string();
-            return false;
-        }
-
-        if (block->_in_write || _free_io_buffers.contains(block->_buffer_desc)) {
-            LOG(ERROR) << "Block in unpinned list with"
-                       << " _in_write = " << block->_in_write << " _free_io_buffers.contains = "
-                       << _free_io_buffers.contains(block->_buffer_desc) << endl
-                       << block->debug_string();
-            return false;
-        }
-        block = block->next();
-    }
-
-    // Check that we're writing blocks when the number of free buffers falls below the
-    // threshold. We don't write blocks after cancellation.
-    if (!_is_cancelled && !_unpinned_blocks.empty() && _enable_spill &&
-        (_free_io_buffers.size() + _non_local_outstanding_writes < _block_write_threshold)) {
-        // TODO: this isn't correct when write_unpinned_blocks() fails during the call to
-        // write_unpinned_block() so just log the condition but don't return false. Figure
-        // out a way to re-enable this check?
-        LOG(ERROR) << "Missed writing unpinned blocks";
-    }
-    return true;
-}
-
-string BufferedBlockMgr2::debug_string(Client* client) {
-    stringstream ss;
-    unique_lock<mutex> l(_lock);
-    ss << debug_internal();
-    if (client != nullptr) {
-        ss << endl << client->debug_string();
-    }
-    return ss.str();
-}
-
-string BufferedBlockMgr2::debug_internal() const {
-    stringstream ss;
-    ss << "Buffered block mgr" << endl
-       << " Num writes outstanding: " << _outstanding_writes_counter->value() << endl
-       << " Num free io buffers: " << _free_io_buffers.size() << endl
-       << " Num unpinned blocks: " << _unpinned_blocks.size() << endl
-       << " Num available buffers: " << remaining_unreserved_buffers() << endl
-       << " Total pinned buffers: " << _total_pinned_buffers << endl
-       << " Unfulfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl
-       << " Buffer Block Mgr Used memory: " << _mem_tracker->consumption()
-       << " Instance remaining memory: "
-       << thread_context()->thread_mem_tracker()->spare_capacity() << " (#blocks="
-       << (thread_context()->thread_mem_tracker()->spare_capacity() / _max_block_size) << ")"
-       << endl
-       << " Block write threshold: " << _block_write_threshold;
-    return ss.str();
-}
-
-void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile) {
-    unique_lock<mutex> l(_lock);
-    if (_initialized) {
-        return;
-    }
-
-    io_mgr->register_context(&_io_request_context);
-
-    _profile.reset(new RuntimeProfile("BlockMgr"));
-    parent_profile->add_child(_profile.get(), true, nullptr);
-
-    _block_size_counter = ADD_COUNTER(_profile.get(), "MaxBlockSize", TUnit::BYTES);
-    _block_size_counter->set(_max_block_size);
-    _created_block_counter = ADD_COUNTER(_profile.get(), "BlocksCreated", TUnit::UNIT);
-    _recycled_blocks_counter = ADD_COUNTER(_profile.get(), "BlocksRecycled", TUnit::UNIT);
-    _bytes_written_counter = ADD_COUNTER(_profile.get(), "BytesWritten", TUnit::BYTES);
-    _outstanding_writes_counter =
-            ADD_COUNTER(_profile.get(), "BlockWritesOutstanding", TUnit::UNIT);
-    _buffered_pin_counter = ADD_COUNTER(_profile.get(), "BufferedPins", TUnit::UNIT);
-    _disk_read_timer = ADD_TIMER(_profile.get(), "TotalReadBlockTime");
-    _buffer_wait_timer = ADD_TIMER(_profile.get(), "TotalBufferWaitTime");
-    _encryption_timer = ADD_TIMER(_profile.get(), "TotalEncryptionTime");
-    _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime");
-
-    // Create a new mem_tracker to track buffers allocated by the block manager.
-    _mem_tracker = std::make_unique<MemTracker>("BufferedBlockMgr2");
-
-    _initialized = true;
-}
-
-Status BufferedBlockMgr2::init_tmp_files() {
-    DCHECK(_tmp_files.empty());
-    DCHECK(_tmp_file_mgr != nullptr);
-
-    vector<TmpFileMgr::DeviceId> tmp_devices = _tmp_file_mgr->active_tmp_devices();
-    // Initialize the tmp files and the initial file to use.
-    _tmp_files.reserve(tmp_devices.size());
-    for (int i = 0; i < tmp_devices.size(); ++i) {
-        TmpFileMgr::File* tmp_file;
-        TmpFileMgr::DeviceId tmp_device_id = tmp_devices[i];
-        // It is possible for a device to be blacklisted after it was returned
-        // by active_tmp_devices() - handle this gracefully.
-        Status status = _tmp_file_mgr->get_file(tmp_device_id, _query_id, &tmp_file);
-        if (status.ok()) {
-            _tmp_files.emplace_back(tmp_file);
-        }
-    }
-    if (_tmp_files.empty()) {
-        return Status::InternalError(
-                "No spilling directories configured. Cannot spill. Set --scratch_dirs"
-                " or see log for previous errors that prevented use of provided directories");
-    }
-    _next_block_index = rand() % _tmp_files.size();
-    return Status::OK();
-}
-
-} // namespace doris
diff --git a/be/src/runtime/buffered_block_mgr2.h b/be/src/runtime/buffered_block_mgr2.h
deleted file mode 100644
index 9b423cf7f1..0000000000
--- a/be/src/runtime/buffered_block_mgr2.h
+++ /dev/null
@@ -1,614 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/buffered-block-mgr2.h
-// and modified by Doris
-
-#pragma once
-
-#include <unordered_map>
-
-#include "runtime/disk_io_mgr.h"
-#include "runtime/tmp_file_mgr.h"
-#include "util/uid_util.h"
-
-namespace doris {
-
-class RuntimeState;
-
-// The BufferedBlockMgr2 is used to allocate and manage blocks of data using a fixed memory
-// budget. Available memory is split into a pool of fixed-size memory buffers. When a
-// client allocates or requests a block, the block is assigned a buffer from this pool and
-// is 'pinned' in memory. Clients can also unpin a block, allowing the manager to reassign
-// its buffer to a different block.
-//
-// The BufferedBlockMgr2 typically allocates blocks in IO buffer size to get maximal IO
-// efficiency when spilling. Clients can also request smaller buffers that cannot spill
-// (note that it would be possible to spill small buffers, but we currently do not allow
-// it). This is useful to present the same block API and mem tracking for clients (one can
-// use the block mgr API to mem track non-spillable (smaller) buffers). Clients that do
-// partitioning (e.g. PHJ and PAGG) will start with these smaller buffer sizes to reduce
-// the minimum buffering requirements and grow to max sized buffers as the input grows.
-// For simplicity, these small buffers are not recycled (there's also not really a need
-// since they are allocated all at once on query startup). These buffers are not counted
-// against the reservation.
-//
-// The BufferedBlockMgr2 reserves two buffers per disk ('_block_write_threshold') for
-// itself. When the number of free buffers falls below '_block_write_threshold', unpinned
-// blocks are flushed in Last-In-First-Out order. (It is assumed that unpinned blocks are
-// re-read in FIFO order). The TmpFileMgr is used to obtain file handles to write to
-// within the tmp directories configured for Impala.
-//
-// It is expected to have one BufferedBlockMgr2 per query. All allocations that can grow
-// proportional to the input size and that might need to spill to disk should allocate
-// from the same BufferedBlockMgr2.
-//
-// A client must pin a block in memory to read/write its contents and unpin it when it is
-// no longer in active use. The BufferedBlockMgr2 guarantees that:
-// a) The memory buffer assigned to a block is not removed or released while it is pinned.
-// b) The contents of an unpinned block will be available on a subsequent call to pin.
-//
-// The Client supports the following operations:
-// get_new_block(): Returns a new pinned block.
-// Close(): Frees all memory and disk space. Called when a query is closed or cancelled.
-//     Close() is idempotent.
-//
-// A Block supports the following operations:
-// pin(): Pins a block to a buffer in memory, and reads its contents from disk if
-//     necessary. If there are no free buffers, waits for a buffer to become available.
-//     Invoked before the contents of a block are read or written. The block
-//     will be maintained in memory until unpin() is called.
-// unpin(): Invoked to indicate the block is not in active use. The block is added to a
-//     list of unpinned blocks. Unpinned blocks are only written when the number of free
-//     blocks falls below the 'block_write_threshold'.
-// del(): Invoked to deallocate a block. The buffer associated with the block is
-//     immediately released and its on-disk location (if any) reused.
-//
-// The block manager is thread-safe with the following caveat: A single block cannot be
-// used simultaneously by multiple clients in any capacity.
-// However, the block manager client is not thread-safe. That is, the block manager
-// allows multiple single-threaded block manager clients.
-//
-/// TODO: When a block is read from disk, data is copied from the IOMgr buffer to the
-/// block manager's buffer. This should be avoided in the common case where these buffers
-/// are of the same size.
-/// TODO: See if the one big lock is a bottleneck. Break it up. This object is shared by
-/// all operators within a query (across fragments), see IMPALA-1883.
-/// TODO: No reason we can't spill the smaller buffers. Add it if we need to (it's likely
-/// just removing dchecks).
-/// TODO: The requirements on this object have grown organically. Consider a major
-/// reworking.
-class BufferedBlockMgr2 {
-private:
-    struct BufferDescriptor;
-
-public:
-    // A client of the BufferedBlockMgr2. There is a single BufferedBlockMgr2 per plan
-    // fragment and all operators that need blocks from it should use a separate client.
-    // Each client has the option to reserve a number of blocks that it can claim later.
-    // The remaining memory that is not reserved by any clients is free for all and
-    // available to all clients.
-    // This is an opaque handle.
-    // TODO: move the APIs to the client so we don't need to pass the BufferedBlockMgr2 around.
-    // TODO: how can we ensure that each operator uses a separate client?
-    class Client;
-
-    // A fixed-size block of data that may be persisted to disk. The state of the block
-    // is maintained by the block manager and is described by 3 bools:
-    // _is_pinned = True if the block is pinned. The block has a non-null _buffer_desc,
-    //     _buffer_desc cannot be in the free buffer list and the block cannot be in
-    //     _unused_blocks or _unpinned_blocks. Newly allocated blocks are pinned.
-    // _in_write = True if a write has been issued but not completed for this block.
-    //     The block cannot be in the _unpinned_blocks and must have a non-null _buffer_desc
-    //     that's not in the free buffer list. It may be pinned or unpinned.
-    // _is_deleted = True if del() has been called on a block. After this, no API call
-    //     is valid on the block.
-    //
-    // pin() and unpin() can be invoked on a block any number of times before del().
-    // When a pinned block is unpinned for the first time, it is added to the
-    // _unpinned_blocks list and its buffer is removed from the free list.
-    // If it is pinned or deleted at any time while it is on the unpinned list, it is
-    // simply removed from that list. When it is dequeued from that list and enqueued
-    // for writing, _in_write is set to true. The block may be pinned, unpinned or deleted
-    // while _in_write is true. After the write has completed, the block's buffer will be
-    // returned to the free buffer list if it is no longer pinned, and the block itself
-    // will be put on the unused blocks list if del() was called.
-    //
-    // A block MUST have a non-null _buffer_desc if
-    // a) _is_pinned is true (i.e. the client is using it), or
-    // b) _in_write is true, (i.e. IO mgr is using it), or
-    // c) It is on the unpinned list (buffer has not been persisted.)
-    //
-    // In addition to the block manager API, Block exposes allocate(), return_allocation()
-    // and bytes_remaining() to allocate and free memory within a block, and buffer() and
-    // valid_data_len() to read/write the contents of a block. These are not thread-safe.
-    class Block : public InternalQueue<Block>::Node {
-    public:
-        // A null dtor to pass codestyle check
-        ~Block() {}
-
-        // Pins a block in memory--assigns a free buffer to a block and reads it from disk if
-        // necessary. If there are no free blocks and no unpinned blocks, '*pinned' is set to
-        // false and the block is not pinned. If 'release_block' is non-nullptr, if there is
-        // memory pressure, this block will be pinned using the buffer from 'release_block'.
-        // If 'unpin' is true, 'release_block' will be unpinned (regardless of whether or not
-        // the buffer was used for this block). If 'unpin' is false, 'release_block' is
-        // deleted. 'release_block' must be pinned.
-        Status pin(bool* pinned, Block* release_block = nullptr, bool unpin = true);
-
-        // Unpins a block by adding it to the list of unpinned blocks maintained by the block
-        // manager. An unpinned block must be flushed before its buffer is released or
-        // assigned to a different block. Is non-blocking.
-        Status unpin();
-
-        // Delete a block. Its buffer is released and on-disk location can be over-written.
-        // Non-blocking.
-        void del();
-
-        void add_row() { ++_num_rows; }
-        int num_rows() const { return _num_rows; }
-
-        // Allocates the specified number of bytes from this block.
-        template <typename T>
-        T* allocate(int size) {
-            DCHECK_GE(bytes_remaining(), size);
-            uint8_t* current_location = _buffer_desc->buffer + _valid_data_len;
-            _valid_data_len += size;
-            return reinterpret_cast<T*>(current_location);
-        }
-
-        // Return the number of remaining bytes that can be allocated in this block.
-        int bytes_remaining() const {
-            DCHECK(_buffer_desc != nullptr);
-            return _buffer_desc->len - _valid_data_len;
-        }
-
-        // Return size bytes from the most recent allocation.
-        void return_allocation(int size) {
-            DCHECK_GE(_valid_data_len, size);
-            _valid_data_len -= size;
-        }
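// A minimal caller-side sketch of the allocation helpers above (hypothetical
// helper, not code from this file; <cstring> is assumed for memcpy):
static bool example_append_row(BufferedBlockMgr2::Block* block, const void* row, int size) {
    if (block->bytes_remaining() < size) {
        return false; // caller should unpin this block and request a fresh one
    }
    memcpy(block->allocate<uint8_t>(size), row, size);
    block->add_row();
    return true;
}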
-        // Pointer to start of the block data in memory. Only guaranteed to be valid if the
-        // block is pinned.
-        uint8_t* buffer() const {
-            DCHECK(_buffer_desc != nullptr);
-            return _buffer_desc->buffer;
-        }
-
-        // Return the number of bytes allocated in this block.
-        int64_t valid_data_len() const { return _valid_data_len; }
-
-        // Returns the length of the underlying buffer. Only callable if the block is
-        // pinned.
-        int64_t buffer_len() const {
-            DCHECK(is_pinned());
-            return _buffer_desc->len;
-        }
-
-        // Returns true if this block is the max block size. Only callable if the block
-        // is pinned.
-        bool is_max_size() const {
-            DCHECK(is_pinned());
-            return _buffer_desc->len == _block_mgr->max_block_size();
-        }
-
-        bool is_pinned() const { return _is_pinned; }
-
-        // Path of temporary file backing the block. Intended for use in testing.
-        // Returns empty string if no backing file allocated.
-        std::string tmp_file_path() const;
-
-        // Debug helper method to print the state of a block.
-        std::string debug_string() const;
-
-    private:
-        friend class BufferedBlockMgr2;
-
-        Block(BufferedBlockMgr2* block_mgr);
-
-        // Initialize the state of a block and set the number of bytes allocated to 0.
-        void init();
-
-        // Debug helper method to validate the state of a block. _block_mgr lock must already
-        // be taken.
-        bool validate() const;
-
-        // Pointer to the buffer associated with the block. nullptr if the block is not in
-        // memory and cannot be changed while the block is pinned or being written.
-        BufferDescriptor* _buffer_desc;
-
-        // Parent block manager object. Responsible for maintaining the state of the block.
-        BufferedBlockMgr2* _block_mgr;
-
-        // The client that owns this block.
-        Client* _client;
-
-        // WriteRange object representing the on-disk location used to persist a block.
-        // Is created the first time a block is persisted, and retained until the block
-        // object is destroyed. The file location and offset in _write_range are valid
-        // throughout the lifetime of this object, but the data and length in the
-        // _write_range are only valid while the block is being written.
-        // _write_range instance is owned by the block manager.
-        DiskIoMgr::WriteRange* _write_range;
-
-        // The file this block belongs to. The lifetime is the same as the file location
-        // and offset in _write_range. The File is owned by BufferedBlockMgr2, not TmpFileMgr.
-        TmpFileMgr::File* _tmp_file;
-
-        // Length of valid (i.e. allocated) data within the block.
-        int64_t _valid_data_len;
-
-        // Number of rows in this block.
-        int _num_rows;
-
-        // Block state variables. The block's buffer can be freed only if _is_pinned and
-        // _in_write are both false.
-        // TODO: this might be better expressed as an enum.
-
-        // _is_pinned is true while the block is pinned by a client.
-        bool _is_pinned;
-
-        // _in_write is set to true when the block is enqueued for writing via DiskIoMgr,
-        // and set to false when the write is complete.
-        bool _in_write;
-
-        // True if the block is deleted by the client.
-        bool _is_deleted;
-
-        // Condition variable for when there is a specific client waiting for this block.
-        // Only used if _client_local is true.
-        // TODO: Currently we use _block_mgr->_lock for this condvar. There is no reason to
-        // use that _lock that is already overloaded, see IMPALA-1883.
-        std::condition_variable _write_complete_cv;
-
-        // If true, this block is being written out so the underlying buffer can be
-        // transferred to another block from the same client. We don't want this buffer
-        // getting picked up by another client.
-        bool _client_local;
-    }; // class Block
-
-    // Create a block manager with the specified mem_limit. If a block mgr with the
-    // same query id has already been created, that block mgr is returned.
-    // - buffer_size: maximum size of each buffer.
-    static Status create(RuntimeState* state, RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
-                         int64_t buffer_size, std::shared_ptr<BufferedBlockMgr2>* block_mgr);
-
-    ~BufferedBlockMgr2();
-
-    // Registers a client with num_reserved_buffers. The returned client is owned
-    // by the BufferedBlockMgr2 and has the same lifetime as it.
-    // We allow oversubscribing the reserved buffers. It is likely that
-    // num_reserved_buffers is very pessimistic for small queries and we don't want to
-    // fail all of them with mem limit exceeded.
-    // The min reserved buffers is often independent of data size and we still want
-    // to run small queries with very small limits.
-    // Buffers used by this client are reflected in tracker.
-    // TODO: The fact that we allow oversubscription is problematic,
-    // as the code expects the reservations to always be granted (currently not the case).
-    Status register_client(int num_reserved_buffers, RuntimeState* state, Client** client);
-
-    // Clears all reservations for this client.
-    void clear_reservations(Client* client);
-
-    // Tries to acquire a one-time reservation of num_buffers. The semantics are:
-    // - If this call fails, the next 'num_buffers' calls to pin()/get_new_block() might
-    //   not have enough memory.
-    // - If this call succeeds, the next 'num_buffers' calls to pin()/get_new_block() will
-    //   be guaranteed to get the block. Once these blocks have been pinned, the
-    //   reservation from this call has no more effect.
-    // Blocks coming from the tmp reservation also count towards the regular reservation.
-    // This is useful to pin() a number of blocks and guarantee all-or-nothing behavior.
-    bool try_acquire_tmp_reservation(Client* client, int num_buffers);
-
-    // Return a new pinned block. If there is no memory for this block, *block will be set
-    // to nullptr.
-    // If len > 0, get_new_block() will return a block with a buffer of size len. len
-    // must be less than max_block_size and this block cannot be unpinned.
-    // This function will try to allocate new memory for the block up to the limit.
-    // Otherwise it will (conceptually) write out an unpinned block and use that memory.
-    // The caller can pass a non-nullptr 'unpin_block' to transfer memory from 'unpin_block'
-    // to the new block. If 'unpin_block' is non-nullptr, the new block can never fail to
-    // get a buffer. The semantics of this are:
-    //   - If 'unpin_block' is non-nullptr, it must be pinned.
-    //   - If the call succeeds, 'unpin_block' is unpinned.
-    //   - If there is no memory pressure, block will get a newly allocated buffer.
-    //   - If there is memory pressure, block will get the buffer from 'unpin_block'.
-    Status get_new_block(Client* client, Block* unpin_block, Block** block, int64_t len = -1);
-
-    // Cancels the block mgr. All subsequent calls that return a Status fail with
-    // Status::Cancelled("Cancelled"). Idempotent.
-    void cancel();
-
-    // Returns true if the block manager was cancelled.
-    bool is_cancelled();
-
-    // Dumps block mgr state. Grabs lock. If client is not nullptr, also dumps its state.
-    std::string debug_string(Client* client = nullptr);
-
-    // The number of buffers available for the client. That is, if all other clients were
-    // stopped, the number of buffers this client could get.
-    int64_t available_buffers(Client* client) const;
-
-    // Returns a MEM_LIMIT_EXCEEDED error which includes the minimum memory required by
-    // this 'client' that acts on behalf of the node with id 'node_id'. 'node_id' is used
-    // only for error reporting.
-    Status mem_limit_too_low_error(Client* client, int node_id);
-
-    // TODO: Remove these two. Not clear what the sorter really needs.
-    // TODO: Those are dirty, dangerous reads of two lists whose all other accesses are
-    // protected by the _lock. Using those two functions is looking for trouble.
-    int available_allocated_buffers() const { return _all_io_buffers.size(); }
-    int num_free_buffers() const { return _free_io_buffers.size(); }
-
-    int num_pinned_buffers(Client* client) const;
-    int num_reserved_buffers_remaining(Client* client) const;
-    MemTracker* get_tracker(Client* client) const;
-    MemTracker* mem_tracker() const { return _mem_tracker.get(); }
-    int64_t max_block_size() const { return _max_block_size; }
-    int64_t bytes_allocated() const;
-    RuntimeProfile* profile() { return _profile.get(); }
-    int writes_issued() const { return _writes_issued; }
-
-private:
-    friend class Client;
-
-    // Descriptor for a single memory buffer in the pool.
-    struct BufferDescriptor : public InternalQueue<BufferDescriptor>::Node {
-        // Start of the buffer.
-        uint8_t* buffer;
-
-        // Length of the buffer.
-        int64_t len;
-
-        // Block that this buffer is assigned to. May be nullptr.
-        Block* block;
-
-        // Iterator into _all_io_buffers for this buffer.
-        std::list<BufferDescriptor*>::iterator all_buffers_it;
-
-        BufferDescriptor(uint8_t* buf, int64_t len) : buffer(buf), len(len), block(nullptr) {}
-    };
-
-    BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size);
-
-    // Initializes the block mgr. Idempotent and thread-safe.
-    void init(DiskIoMgr* io_mgr, RuntimeProfile* profile);
-
-    // Initializes _tmp_files. This is initialized the first time we need to write to disk.
-    // Must be called with _lock taken.
-    Status init_tmp_files();
-
-    // pin_block(), unpin_block(), delete_block() perform the actual work of Block::pin(),
-    // unpin() and del(). Must be called with the _lock taken.
-    Status pin_block(Block* block, bool* pinned, Block* src, bool unpin);
-    Status unpin_block(Block* block);
-    void delete_block(Block* block);
-
-    // If the 'block' is nullptr, checks if cancelled and returns. Otherwise, depending on
-    // 'unpin' calls either delete_block() or unpin_block(), which both first check for
-    // cancellation. It should be called without the _lock acquired.
-    Status delete_or_unpin_block(Block* block, bool unpin);
-
-    // Transfers the buffer from 'src' to 'dst'. 'src' must be pinned.
-    // If unpin == false, 'src' is simply deleted.
-    // If unpin == true, 'src' is unpinned and it may block until the write of 'src' is
-    // completed. In that case it will use the _lock for the condvar. Thus, the _lock
-    // needs to not have been taken when this function is called.
-    Status transfer_buffer(Block* dst, Block* src, bool unpin);
-
-    // Returns the total number of unreserved buffers. This is the sum of unpinned,
-    // free and buffers we can still allocate minus the total number of reserved buffers
-    // that are not pinned.
-    // Note this can be negative if the buffers are oversubscribed.
-    // Must be called with _lock taken.
-    int64_t remaining_unreserved_buffers() const;
-
-    // Finds a buffer for a block and pins it. If the block's buffer has not been evicted,
-    // it removes the block from the unpinned list and sets *in_mem = true.
-    // If the block is not in memory, it will call find_buffer(), which may block.
-    // If we can't get a buffer (e.g. no more memory, nothing in the unpinned and free
-    // lists) this function returns with the block unpinned.
-    // Uses the _lock, so the caller should not have already acquired the _lock.
-    Status find_buffer_for_block(Block* block, bool* in_mem);
-
-    // Returns a new buffer that can be used. *buffer is set to nullptr if there was no
-    // memory.
-    // Otherwise, this function gets a new buffer by:
-    //   1. Allocating a new buffer if possible
-    //   2. Using a buffer from the free list (which is populated by moving blocks from
-    //      the unpinned list by writing them out).
-    // Must be called with the _lock already taken. This function can block.
-    Status find_buffer(std::unique_lock<std::mutex>& lock, BufferDescriptor** buffer);
-
-    // Writes unpinned blocks via DiskIoMgr until one of the following is true:
-    //   1. The number of outstanding writes >= (_block_write_threshold - num free buffers)
-    //   2. There are no more unpinned blocks
-    // Must be called with the _lock already taken. Is not blocking.
-    Status write_unpinned_blocks();
-
-    // Issues the write for this block to the DiskIoMgr.
-    Status write_unpinned_block(Block* block);
-
-    // Allocate block_size bytes in a temporary file. Try multiple disks if an error occurs.
-    // Returns an error only if no temporary files are usable.
-    Status allocate_scratch_space(int64_t block_size, TmpFileMgr::File** tmp_file,
-                                  int64_t* file_offset);
-
-    // Callback used by DiskIoMgr to indicate a block write has completed. write_status
-    // is the status of the write. _is_cancelled is set to true if write_status is not
-    // Status::OK() or a re-issue of the write fails. Returns the block's buffer to the
-    // free buffers list if it is no longer pinned. Returns the block itself to the free
-    // blocks list if it has been deleted.
-    void write_complete(Block* block, const Status& write_status);
-
-    // Returns a deleted block to the list of free blocks. Assumes the block's buffer has
-    // already been returned to the free buffers list. Non-blocking.
-    // Thread-safe and does not need the _lock acquired.
-    void return_unused_block(Block* block);
-
-    // Checks _unused_blocks for an unused block object, else allocates a new one.
-    // Non-blocking and needs no _lock.
-    Block* get_unused_block(Client* client);
-
-    // Used to debug the state of the block manager. Lock must already be taken.
-    bool validate() const;
-    std::string debug_internal() const;
-
-    // Add BE hostname and fragment id for debug tuning.
-    Status add_exec_msg(const std::string& msg) const;
-
-    // Size of the largest/default block in bytes.
-    const int64_t _max_block_size;
-
-    // Unpinned blocks are written when the number of free buffers is below this threshold.
-    // Equal to twice the number of disks (two writes in flight per scratch disk).
-    const int _block_write_threshold;
-
-    // If false, spilling is disabled. The client calls will fail if there is not enough
-    // memory.
-    const bool _enable_spill;
-
-    const TUniqueId _query_id;
-
-    ObjectPool _obj_pool;
-
-    // Tracks buffers allocated by the block manager.
-    std::unique_ptr<MemTracker> _mem_tracker;
-
-    // The temporary file manager used to allocate temporary file space.
-    TmpFileMgr* _tmp_file_mgr;
-
-    // This lock protects the block and buffer lists below, except for _unused_blocks.
-    // It also protects the various counters and changes to block state. Additionally, it is
-    // used for the blocking condvars: _buffer_available_cv and block->_write_complete_cv.
-    // TODO: We should break the protection of the various structures and usages into
-    // different spinlocks and a mutex to be used in the wait()s, see IMPALA-1883.
-    std::mutex _lock;
-
-    // If true, init() has been called.
-    bool _initialized;
-
-    // The total number of reserved buffers across all clients that are not pinned.
-    int _unfullfilled_reserved_buffers;
-
-    // The total number of pinned buffers across all clients.
-    int _total_pinned_buffers;
-
-    // Number of outstanding writes (writes issued but not completed).
-    // This does not include client-local writes.
-    int _non_local_outstanding_writes;
-
-    // Signal availability of free buffers.
-    std::condition_variable _buffer_available_cv;
-
-    // List of blocks with _is_pinned = false that are not on DiskIoMgr's write queue.
-    // Blocks are added to and removed from the back of the list (i.e. in LIFO order).
-    // Blocks in this list must have _is_pinned = false, _in_write = false,
-    // _is_deleted = false.
-    InternalQueue<Block> _unpinned_blocks;
-
-    // List of blocks that have been deleted and are no longer in use.
-    // Can be reused in get_new_block(). Blocks in this list must be in the Init'ed state,
-    // i.e. _buffer_desc = nullptr, _is_pinned = false, _in_write = false,
-    // _is_deleted = false, valid_data_len = 0.
-    InternalQueue<Block> _unused_blocks;
-
-    // List of buffers that can be assigned to a block in pin() or get_new_block().
-    // These buffers either have no block associated with them or are associated with
-    // an unpinned block that has been persisted. That is, either block = nullptr or
-    // (!block->_is_pinned && !block->_in_write && !_unpinned_blocks.Contains(block)).
-    // All of these buffers are io sized.
-    InternalQueue<BufferDescriptor> _free_io_buffers;
-
-    // All allocated io-sized buffers.
-    std::list<BufferDescriptor*> _all_io_buffers;
-
-    // Temporary physical file handles (one per tmp device) to which blocks may be written.
-    // Blocks are round-robined across these files.
-    std::vector<std::unique_ptr<TmpFileMgr::File>> _tmp_files;
-
-    // Index into _tmp_files denoting the file to which the next block to be persisted will
-    // be written.
-    int _next_block_index;
-
-    // DiskIoMgr handles to read and write blocks.
-    DiskIoMgr* _io_mgr;
-    DiskIoMgr::RequestContext* _io_request_context;
-
-    // If true, a disk write failed and all API calls return
-    // Status::Cancelled("Cancelled"). Set to true if there was an error writing a block, or if
-    // write_complete() needed to reissue the write and that failed.
-    bool _is_cancelled;
-
-    // Counters and timers to track behavior.
-    std::unique_ptr<RuntimeProfile> _profile;
-
-    RuntimeProfile::Counter* _block_size_counter;
-
-    // Total number of blocks created.
-    RuntimeProfile::Counter* _created_block_counter;
-
-    // Number of deleted blocks reused.
-    RuntimeProfile::Counter* _recycled_blocks_counter;
-
-    // Number of pin() calls that did not require a disk read.
-    RuntimeProfile::Counter* _buffered_pin_counter;
-
-    // Time taken for disk reads.
-    RuntimeProfile::Counter* _disk_read_timer;
-
-    // Time spent waiting for a free buffer.
-    RuntimeProfile::Counter* _buffer_wait_timer;
-
-    // Number of bytes written to disk (includes writes still queued in the IO manager).
-    RuntimeProfile::Counter* _bytes_written_counter;
-
-    // Number of writes outstanding (issued but not completed).
-    RuntimeProfile::Counter* _outstanding_writes_counter;
-
-    // Time spent in disk spill encryption and decryption.
-    RuntimeProfile::Counter* _encryption_timer;
-
-    // Time spent in disk spill integrity generation and checking.
- RuntimeProfile::Counter* _integrity_check_timer; - - // Number of writes issued. - int _writes_issued; - - // Protects _s_query_to_block_mgrs. - static SpinLock _s_block_mgrs_lock; - - // All per-query BufferedBlockMgr2 objects that are in use. For memory management, this - // map contains only weak ptrs. BufferedBlockMgr2s that are handed out are shared ptrs. - // When all the shared ptrs are no longer referenced, the BufferedBlockMgr2 - // d'tor will be called at which point the weak ptr will be removed from the map. - typedef std::unordered_map> BlockMgrsMap; - static BlockMgrsMap _s_query_to_block_mgrs; - - // Unowned. - RuntimeState* _state; - -}; // class BufferedBlockMgr2 - -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc deleted file mode 100644 index 1528f8de66..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.cc +++ /dev/null @@ -1,805 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.cc -// and modified by Doris - -#include "runtime/buffered_tuple_stream2.h" - -#include "runtime/descriptors.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" -#include "util/pretty_printer.h" - -using std::stringstream; -using std::string; -using std::vector; -using std::list; - -using std::unique_ptr; - -namespace doris { - -// The first NUM_SMALL_BLOCKS of the tuple stream are made of blocks less than the -// IO size. These blocks never spill. -// TODO: Consider adding a 4MB in-memory buffer that would split the gap between the -// 512KB in-memory buffer and the 8MB (IO-sized) spillable buffer. 
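// To make the sizing concrete: with the defaults below, a stream's first block is
// 64KB and its second is 512KB; only from the third block onward does
// new_block_for_write() hand out IO-sized (e.g. 8MB) spillable blocks. A hedged
// sketch of the size selection, mirroring the logic in new_block_for_write() below:
//
//     int64_t block_len = _block_mgr->max_block_size(); // e.g. 8MB
//     if (_use_small_buffers && _blocks.size() < NUM_SMALL_BLOCKS) {
//         block_len = std::min(block_len, INITIAL_BLOCK_SIZES[_blocks.size()]);
//     }
//
// So an operator holding 64 such streams initially buffers 64 * (64KB + 512KB) =
// 36MB instead of 64 * 8MB = 512MB.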
-static const int64_t INITIAL_BLOCK_SIZES[] = {64 * 1024, 512 * 1024}; -static const int NUM_SMALL_BLOCKS = sizeof(INITIAL_BLOCK_SIZES) / sizeof(int64_t); - -string BufferedTupleStream2::RowIdx::debug_string() const { - stringstream ss; - ss << "RowIdx block=" << block() << " offset=" << offset() << " idx=" << idx(); - return ss.str(); -} - -BufferedTupleStream2::BufferedTupleStream2(RuntimeState* state, const RowDescriptor& row_desc, - BufferedBlockMgr2* block_mgr, - BufferedBlockMgr2::Client* client, - bool use_initial_small_buffers, bool read_write) - : _use_small_buffers(use_initial_small_buffers), - _delete_on_read(false), - _read_write(read_write), - _state(state), - _desc(row_desc), - _nullable_tuple(row_desc.is_any_tuple_nullable()), - _block_mgr(block_mgr), - _block_mgr_client(client), - _total_byte_size(0), - _read_ptr(nullptr), - _read_tuple_idx(0), - _read_bytes(0), - _rows_returned(0), - _read_block_idx(-1), - _write_block(nullptr), - _num_pinned(0), - _num_small_blocks(0), - _closed(false), - _num_rows(0), - _pinned(true), - _pin_timer(nullptr), - _unpin_timer(nullptr), - _get_new_block_timer(nullptr) { - _null_indicators_read_block = _null_indicators_write_block = -1; - _read_block = _blocks.end(); - _fixed_tuple_row_size = 0; - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i]; - const int tuple_byte_size = tuple_desc->byte_size(); - _fixed_tuple_row_size += tuple_byte_size; - if (!tuple_desc->string_slots().empty()) { - _string_slots.push_back(make_pair(i, tuple_desc->string_slots())); - } - // if (!tuple_desc->collection_slots().empty()) { - // _collection_slots.push_back(make_pair(i, tuple_desc->collection_slots())); - // } - } -} - -// Returns the number of pinned blocks in the list. -// Only called in DCHECKs to validate _num_pinned. -int num_pinned(const list& blocks) { - int num_pinned = 0; - for (list::const_iterator it = blocks.begin(); it != blocks.end(); - ++it) { - if ((*it)->is_pinned() && (*it)->is_max_size()) { - ++num_pinned; - } - } - return num_pinned; -} - -string BufferedTupleStream2::debug_string() const { - stringstream ss; - ss << "BufferedTupleStream2 num_rows=" << _num_rows << " rows_returned=" << _rows_returned - << " pinned=" << (_pinned ? "true" : "false") - << " delete_on_read=" << (_delete_on_read ? "true" : "false") - << " closed=" << (_closed ? 
"true" : "false") << " num_pinned=" << _num_pinned - << " write_block=" << _write_block << " _read_block="; - if (_read_block == _blocks.end()) { - ss << ""; - } else { - ss << *_read_block; - } - ss << " blocks=[\n"; - for (list::const_iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - ss << "{" << (*it)->debug_string() << "}"; - if (*it != _blocks.back()) { - ss << ",\n"; - } - } - ss << "]"; - return ss.str(); -} - -Status BufferedTupleStream2::init(int node_id, RuntimeProfile* profile, bool pinned) { - if (profile != nullptr) { - _pin_timer = ADD_TIMER(profile, "PinTime"); - _unpin_timer = ADD_TIMER(profile, "UnpinTime"); - _get_new_block_timer = ADD_TIMER(profile, "GetNewBlockTime"); - } - - if (_block_mgr->max_block_size() < INITIAL_BLOCK_SIZES[0]) { - _use_small_buffers = false; - } - - bool got_block = false; - RETURN_IF_ERROR(new_block_for_write(_fixed_tuple_row_size, &got_block)); - if (!got_block) { - return _block_mgr->mem_limit_too_low_error(_block_mgr_client, node_id); - } - DCHECK(_write_block != nullptr); - if (!pinned) { - RETURN_IF_ERROR(unpin_stream()); - } - return Status::OK(); -} - -Status BufferedTupleStream2::switch_to_io_buffers(bool* got_buffer) { - if (!_use_small_buffers) { - *got_buffer = (_write_block != nullptr); - return Status::OK(); - } - _use_small_buffers = false; - Status status = new_block_for_write(_block_mgr->max_block_size(), got_buffer); - // IMPALA-2330: Set the flag using small buffers back to false in case it failed to - // got a buffer. - DCHECK(status.ok() || !*got_buffer) << status.ok() << " " << *got_buffer; - _use_small_buffers = !*got_buffer; - return status; -} - -void BufferedTupleStream2::close() { - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - (*it)->del(); - } - _blocks.clear(); - _num_pinned = 0; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - _closed = true; -} - -int64_t BufferedTupleStream2::bytes_in_mem(bool ignore_current) const { - int64_t result = 0; - for (list::const_iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if (!(*it)->is_pinned()) { - continue; - } - if (!(*it)->is_max_size()) { - continue; - } - if (*it == _write_block && ignore_current) { - continue; - } - result += (*it)->buffer_len(); - } - return result; -} - -Status BufferedTupleStream2::unpin_block(BufferedBlockMgr2::Block* block) { - SCOPED_TIMER(_unpin_timer); - DCHECK(block->is_pinned()); - if (!block->is_max_size()) { - return Status::OK(); - } - RETURN_IF_ERROR(block->unpin()); - --_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - return Status::OK(); -} - -Status BufferedTupleStream2::new_block_for_write(int64_t min_size, bool* got_block) { - DCHECK(!_closed); - *got_block = false; - if (min_size > _block_mgr->max_block_size()) { - std::stringstream error_msg; - error_msg << "Cannot process row that is bigger than the IO size (row_size=" - << PrettyPrinter::print(min_size, TUnit::BYTES) - << "). To run this query, increase the IO size (--read_size option)."; - return Status::InternalError(error_msg.str()); - } - - BufferedBlockMgr2::Block* unpin_block = _write_block; - if (_write_block != nullptr) { - DCHECK(_write_block->is_pinned()); - if (_pinned || _write_block == *_read_block || !_write_block->is_max_size()) { - // In these cases, don't unpin the current write block. 
- unpin_block = nullptr; - } - } - - int64_t block_len = _block_mgr->max_block_size(); - if (_use_small_buffers) { - if (_blocks.size() < NUM_SMALL_BLOCKS) { - block_len = std::min(block_len, INITIAL_BLOCK_SIZES[_blocks.size()]); - if (block_len < min_size) { - block_len = _block_mgr->max_block_size(); - } - } - if (block_len == _block_mgr->max_block_size()) { - // Do not switch to IO-buffers automatically. Do not get a buffer. - *got_block = false; - return Status::OK(); - } - } - - BufferedBlockMgr2::Block* new_block = nullptr; - { - SCOPED_TIMER(_get_new_block_timer); - RETURN_IF_ERROR( - _block_mgr->get_new_block(_block_mgr_client, unpin_block, &new_block, block_len)); - } - *got_block = (new_block != nullptr); - - if (!*got_block) { - DCHECK(unpin_block == nullptr); - return Status::OK(); - } - - if (unpin_block != nullptr) { - DCHECK(unpin_block == _write_block); - DCHECK(!_write_block->is_pinned()); - --_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - - // Compute and allocate the block header with the null indicators - _null_indicators_write_block = compute_num_null_indicator_bytes(block_len); - new_block->allocate(_null_indicators_write_block); - _write_tuple_idx = 0; - - _blocks.push_back(new_block); - _block_start_idx.push_back(new_block->buffer()); - _write_block = new_block; - DCHECK(_write_block->is_pinned()); - DCHECK_EQ(_write_block->num_rows(), 0); - if (_write_block->is_max_size()) { - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } else { - ++_num_small_blocks; - } - _total_byte_size += block_len; - return Status::OK(); -} - -Status BufferedTupleStream2::next_block_for_read() { - DCHECK(!_closed); - DCHECK(_read_block != _blocks.end()); - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << _pinned; - - // If non-nullptr, this will be the current block if we are going to free it while - // grabbing the next block. This will stay nullptr if we don't want to free the - // current block. - BufferedBlockMgr2::Block* block_to_free = - (!_pinned || _delete_on_read) ? *_read_block : nullptr; - if (_delete_on_read) { - // TODO: this is weird. We are deleting even if it is pinned. The analytic - // eval node needs this. - DCHECK(_read_block == _blocks.begin()); - DCHECK(*_read_block != _write_block); - _blocks.pop_front(); - _read_block = _blocks.begin(); - _read_block_idx = 0; - if (block_to_free != nullptr && !block_to_free->is_max_size()) { - block_to_free->del(); - block_to_free = nullptr; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string(); - } - } else { - ++_read_block; - ++_read_block_idx; - if (block_to_free != nullptr && !block_to_free->is_max_size()) { - block_to_free = nullptr; - } - } - - _read_ptr = nullptr; - _read_tuple_idx = 0; - _read_bytes = 0; - - bool pinned = false; - if (_read_block == _blocks.end() || (*_read_block)->is_pinned()) { - // End of the blocks or already pinned, just handle block_to_free - if (block_to_free != nullptr) { - SCOPED_TIMER(_unpin_timer); - if (_delete_on_read) { - block_to_free->del(); - --_num_pinned; - } else { - RETURN_IF_ERROR(unpin_block(block_to_free)); - } - } - } else { - // Call into the block mgr to atomically unpin/delete the old block and pin the - // new block. - SCOPED_TIMER(_pin_timer); - RETURN_IF_ERROR((*_read_block)->pin(&pinned, block_to_free, !_delete_on_read)); - if (!pinned) { - DCHECK(block_to_free == nullptr) << "Should have been able to pin." 
<< std::endl - << _block_mgr->debug_string(_block_mgr_client); - } - if (block_to_free == nullptr && pinned) { - ++_num_pinned; - } - } - - if (_read_block != _blocks.end() && (*_read_block)->is_pinned()) { - _null_indicators_read_block = - compute_num_null_indicator_bytes((*_read_block)->buffer_len()); - _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block; - } - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string(); - return Status::OK(); -} - -Status BufferedTupleStream2::prepare_for_read(bool delete_on_read, bool* got_buffer) { - DCHECK(!_closed); - if (_blocks.empty()) { - return Status::OK(); - } - - if (!_read_write && _write_block != nullptr) { - DCHECK(_write_block->is_pinned()); - if (!_pinned && _write_block != _blocks.front()) { - RETURN_IF_ERROR(unpin_block(_write_block)); - } - _write_block = nullptr; - } - - // Walk the blocks and pin the first non-io sized block. - // (small buffers always being pinned, no need to pin again) - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if (!(*it)->is_pinned()) { - SCOPED_TIMER(_pin_timer); - bool current_pinned = false; - RETURN_IF_ERROR((*it)->pin(¤t_pinned)); - if (!current_pinned) { - DCHECK(got_buffer != nullptr) << "Should have reserved enough blocks"; - *got_buffer = false; - return Status::OK(); - } - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - if ((*it)->is_max_size()) { - break; - } - } - - _read_block = _blocks.begin(); - DCHECK(_read_block != _blocks.end()); - _null_indicators_read_block = compute_num_null_indicator_bytes((*_read_block)->buffer_len()); - _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block; - _read_tuple_idx = 0; - _read_bytes = 0; - _rows_returned = 0; - _read_block_idx = 0; - _delete_on_read = delete_on_read; - if (got_buffer != nullptr) { - *got_buffer = true; - } - return Status::OK(); -} - -Status BufferedTupleStream2::pin_stream(bool already_reserved, bool* pinned) { - DCHECK(!_closed); - DCHECK(pinned != nullptr); - if (!already_reserved) { - // If we can't get all the blocks, don't try at all. - if (!_block_mgr->try_acquire_tmp_reservation(_block_mgr_client, blocks_unpinned())) { - *pinned = false; - return Status::OK(); - } - } - - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if ((*it)->is_pinned()) { - continue; - } - { - SCOPED_TIMER(_pin_timer); - RETURN_IF_ERROR((*it)->pin(pinned)); - } - if (!*pinned) { - VLOG_QUERY << "Should have been reserved." << std::endl - << _block_mgr->debug_string(_block_mgr_client); - return Status::OK(); - } - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - - if (!_delete_on_read) { - // Populate _block_start_idx on pin. 
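// (_block_start_idx is what makes RowIdx-based random access O(1) once the stream
// is pinned: get_tuple_row() in buffered_tuple_stream2.inline.h resolves a row as
//
//     uint8_t* data = _block_start_idx[idx.block()] + idx.offset();
//
// so every block's base pointer has to be refreshed here after repinning, since an
// evicted block may plausibly be read back into a different buffer.)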
- DCHECK_EQ(_block_start_idx.size(), _blocks.size()); - _block_start_idx.clear(); - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - _block_start_idx.push_back((*it)->buffer()); - } - } - *pinned = true; - _pinned = true; - return Status::OK(); -} - -Status BufferedTupleStream2::unpin_stream(bool all) { - DCHECK(!_closed); - SCOPED_TIMER(_unpin_timer); - - for (BufferedBlockMgr2::Block* block : _blocks) { - if (!block->is_pinned()) { - continue; - } - if (!all && (block == _write_block || (_read_write && block == *_read_block))) { - continue; - } - RETURN_IF_ERROR(unpin_block(block)); - } - if (all) { - _read_block = _blocks.end(); - _write_block = nullptr; - } - _pinned = false; - return Status::OK(); -} - -int BufferedTupleStream2::compute_num_null_indicator_bytes(int block_size) const { - if (_nullable_tuple) { - // We assume that all rows will use their max size, so we may be underutilizing the - // space, i.e. we may have some unused space in case of rows with nullptr tuples. - const uint32_t tuples_per_row = _desc.tuple_descriptors().size(); - const uint32_t min_row_size_in_bits = 8 * _fixed_tuple_row_size + tuples_per_row; - const uint32_t block_size_in_bits = 8 * block_size; - const uint32_t max_num_rows = block_size_in_bits / min_row_size_in_bits; - return BitUtil::round_up_numi64(max_num_rows * tuples_per_row) * 8; - } else { - // If there are no nullable tuples then no need to waste space for null indicators. - return 0; - } -} - -Status BufferedTupleStream2::get_rows(unique_ptr* batch, bool* got_rows) { - RETURN_IF_ERROR(pin_stream(false, got_rows)); - if (!*got_rows) { - return Status::OK(); - } - RETURN_IF_ERROR(prepare_for_read(false)); - batch->reset(new RowBatch(_desc, num_rows())); - bool eos = false; - // Loop until get_next fills the entire batch. Each call can stop at block - // boundaries. We generally want it to stop, so that blocks can be freed - // as we read. It is safe in this case because we pin the entire stream. - while (!eos) { - RETURN_IF_ERROR(get_next(batch->get(), &eos)); - } - return Status::OK(); -} - -Status BufferedTupleStream2::get_next(RowBatch* batch, bool* eos, vector* indices) { - if (_nullable_tuple) { - return get_next_internal(batch, eos, indices); - } else { - return get_next_internal(batch, eos, indices); - } -} - -template -Status BufferedTupleStream2::get_next_internal(RowBatch* batch, bool* eos, - vector* indices) { - DCHECK(!_closed); - DCHECK(batch->row_desc().equals(_desc)); - *eos = (_rows_returned == _num_rows); - if (*eos) { - return Status::OK(); - } - DCHECK_GE(_null_indicators_read_block, 0); - - const uint64_t tuples_per_row = _desc.tuple_descriptors().size(); - DCHECK_LE(_read_tuple_idx / tuples_per_row, (*_read_block)->num_rows()); - DCHECK_EQ(_read_tuple_idx % tuples_per_row, 0); - int rows_returned_curr_block = _read_tuple_idx / tuples_per_row; - - int64_t data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block; - if (UNLIKELY(rows_returned_curr_block == (*_read_block)->num_rows())) { - // Get the next block in the stream. We need to do this at the beginning of - // the get_next() call to ensure the buffer management semantics. next_block_for_read() - // will recycle the memory for the rows returned from the *previous* call to - // get_next(). 
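// To spell the resulting contract out: rows handed back by get_next() call N stay
// valid only until call N+1, because the call below may unpin or delete the block
// that backed them. A hedged caller-side sketch (helper names are illustrative,
// not from this file):
//
//     RowBatch batch(desc, capacity); // as in get_rows() below
//     bool eos = false;
//     while (!eos) {
//         RETURN_IF_ERROR(stream->get_next(&batch, &eos));
//         consume_or_copy(&batch); // must finish before the next get_next() call
//         batch.reset();
//     }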
- RETURN_IF_ERROR(next_block_for_read()); - DCHECK(_read_block != _blocks.end()) << debug_string(); - DCHECK_GE(_null_indicators_read_block, 0); - data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block; - rows_returned_curr_block = 0; - } - - DCHECK(_read_block != _blocks.end()); - DCHECK((*_read_block)->is_pinned()) << debug_string(); - DCHECK(_read_ptr != nullptr); - - int64_t rows_left = _num_rows - _rows_returned; - int rows_to_fill = - std::min(static_cast(batch->capacity() - batch->num_rows()), rows_left); - DCHECK_GE(rows_to_fill, 1); - batch->add_rows(rows_to_fill); - uint8_t* tuple_row_mem = reinterpret_cast(batch->get_row(batch->num_rows())); - - // Produce tuple rows from the current block and the corresponding position on the - // null tuple indicator. - vector local_indices; - if (indices == nullptr) { - // A hack so that we do not need to check whether 'indices' is not null in the - // tight loop. - indices = &local_indices; - } else { - DCHECK(is_pinned()); - DCHECK(!_delete_on_read); - DCHECK_EQ(batch->num_rows(), 0); - indices->clear(); - } - indices->reserve(rows_to_fill); - - int i = 0; - uint8_t* null_word = nullptr; - uint32_t null_pos = 0; - // Start reading from position _read_tuple_idx in the block. - uint64_t last_read_ptr = 0; - // IMPALA-2256: Special case if there are no materialized slots. - bool increment_row = has_tuple_footprint(); - uint64_t last_read_row = increment_row * (_read_tuple_idx / tuples_per_row); - while (i < rows_to_fill) { - // Check if current block is done. - if (UNLIKELY(rows_returned_curr_block + i == (*_read_block)->num_rows())) { - break; - } - - // Copy the row into the output batch. - TupleRow* row = reinterpret_cast(tuple_row_mem); - last_read_ptr = reinterpret_cast(_read_ptr); - indices->push_back(RowIdx()); - DCHECK_EQ(indices->size(), i + 1); - (*indices)[i].set(_read_block_idx, _read_bytes + _null_indicators_read_block, - last_read_row); - if (HasNullableTuple) { - for (int j = 0; j < tuples_per_row; ++j) { - // Stitch together the tuples from the block and the nullptr ones. - null_word = (*_read_block)->buffer() + (_read_tuple_idx >> 3); - null_pos = _read_tuple_idx & 7; - ++_read_tuple_idx; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - // Copy tuple and advance _read_ptr. If it is a nullptr tuple, it calls set_tuple - // with Tuple* being 0x0. To do that we multiply the current _read_ptr with - // false (0x0). - row->set_tuple(j, reinterpret_cast(reinterpret_cast(_read_ptr) * - is_not_null)); - _read_ptr += _desc.tuple_descriptors()[j]->byte_size() * is_not_null; - } - const uint64_t row_read_bytes = reinterpret_cast(_read_ptr) - last_read_ptr; - DCHECK_GE(_fixed_tuple_row_size, row_read_bytes); - _read_bytes += row_read_bytes; - last_read_ptr = reinterpret_cast(_read_ptr); - } else { - // When we know that there are no nullable tuples we can safely copy them without - // checking for nullability. - for (int j = 0; j < tuples_per_row; ++j) { - row->set_tuple(j, reinterpret_cast(_read_ptr)); - _read_ptr += _desc.tuple_descriptors()[j]->byte_size(); - } - _read_bytes += _fixed_tuple_row_size; - _read_tuple_idx += tuples_per_row; - } - tuple_row_mem += sizeof(Tuple*) * tuples_per_row; - - // Update string slot ptrs. - for (int j = 0; j < _string_slots.size(); ++j) { - Tuple* tuple = row->get_tuple(_string_slots[j].first); - if (HasNullableTuple && tuple == nullptr) { - continue; - } - read_strings(_string_slots[j].second, data_len, tuple); - } - - // Update collection slot ptrs. 
We traverse the collection structure in the same order - // as it was written to the stream, allowing us to infer the data layout based on the - // length of collections and strings. - // for (int j = 0; j < _collection_slots.size(); ++j) { - // Tuple* tuple = row->get_tuple(_collection_slots[j].first); - // if (HasNullableTuple && tuple == nullptr) { - // continue; - // } - // ReadCollections(_collection_slots[j].second, data_len, tuple); - // } - last_read_row += increment_row; - ++i; - } - - batch->commit_rows(i); - _rows_returned += i; - *eos = (_rows_returned == _num_rows); - if ((!_pinned || _delete_on_read) && - rows_returned_curr_block + i == (*_read_block)->num_rows()) { - // No more data in this block. Mark this batch as needing to return so - // the caller can pass the rows up the operator tree. - batch->mark_need_to_return(); - } - DCHECK_EQ(indices->size(), i); - return Status::OK(); -} - -void BufferedTupleStream2::read_strings(const vector& string_slots, int data_len, - Tuple* tuple) { - DCHECK(tuple != nullptr); - for (int i = 0; i < string_slots.size(); ++i) { - const SlotDescriptor* slot_desc = string_slots[i]; - if (tuple->is_null(slot_desc->null_indicator_offset())) { - continue; - } - - StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - DCHECK_LE(sv->len, data_len - _read_bytes); - sv->ptr = reinterpret_cast(_read_ptr); - _read_ptr += sv->len; - _read_bytes += sv->len; - } -} - -int64_t BufferedTupleStream2::compute_row_size(TupleRow* row) const { - int64_t size = 0; - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i]; - Tuple* tuple = row->get_tuple(i); - DCHECK(_nullable_tuple || tuple_desc->byte_size() == 0 || tuple != nullptr); - if (tuple == nullptr) { - continue; - } - size += tuple->total_byte_size(*tuple_desc); - } - return size; -} - -bool BufferedTupleStream2::deep_copy(TupleRow* row) { - if (_nullable_tuple) { - return deep_copy_internal(row); - } else { - return deep_copy_internal(row); - } -} - -// TODO: this really needs codegen -// TODO: in case of duplicate tuples, this can redundantly serialize data. -template -bool BufferedTupleStream2::deep_copy_internal(TupleRow* row) { - if (UNLIKELY(_write_block == nullptr)) { - return false; - } - DCHECK_GE(_null_indicators_write_block, 0); - DCHECK(_write_block->is_pinned()) << debug_string() << std::endl - << _write_block->debug_string(); - - const uint64_t tuples_per_row = _desc.tuple_descriptors().size(); - if (UNLIKELY((_write_block->bytes_remaining() < _fixed_tuple_row_size) || - (HasNullableTuple && - (_write_tuple_idx + tuples_per_row > _null_indicators_write_block * 8)))) { - return false; - } - // Allocate the maximum possible buffer for the fixed portion of the tuple. - uint8_t* tuple_buf = _write_block->allocate(_fixed_tuple_row_size); - // Total bytes allocated in _write_block for this row. Saved so we can roll back - // if this row doesn't fit. - int bytes_allocated = _fixed_tuple_row_size; - - // Copy the not nullptr fixed len tuples. For the nullptr tuples just update the nullptr tuple - // indicator. - if (HasNullableTuple) { - DCHECK_GT(_null_indicators_write_block, 0); - uint8_t* null_word = nullptr; - uint32_t null_pos = 0; - // Calculate how much space it should return. 
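// Example with a 3-tuple row whose tuple byte sizes are {16, 24, 8} and whose middle
// tuple is nullptr: the row reserves 16 + 24 + 8 = 48 bytes above, the loop below
// copies only 16 + 8 = 24 bytes and sets the middle null bit, so to_return ends up
// as 24 and return_allocation(24) hands the unused space back (bytes_allocated
// drops from 48 to 24).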
- int to_return = 0; - for (int i = 0; i < tuples_per_row; ++i) { - null_word = _write_block->buffer() + (_write_tuple_idx >> 3); // / 8 - null_pos = _write_tuple_idx & 7; - ++_write_tuple_idx; - const int tuple_size = _desc.tuple_descriptors()[i]->byte_size(); - Tuple* t = row->get_tuple(i); - const uint8_t mask = 1 << (7 - null_pos); - if (t != nullptr) { - *null_word &= ~mask; - memcpy(tuple_buf, t, tuple_size); - tuple_buf += tuple_size; - } else { - *null_word |= mask; - to_return += tuple_size; - } - } - DCHECK_LE(_write_tuple_idx - 1, _null_indicators_write_block * 8); - _write_block->return_allocation(to_return); - bytes_allocated -= to_return; - } else { - // If we know that there are no nullable tuples no need to set the nullability flags. - DCHECK_EQ(_null_indicators_write_block, 0); - for (int i = 0; i < tuples_per_row; ++i) { - const int tuple_size = _desc.tuple_descriptors()[i]->byte_size(); - Tuple* t = row->get_tuple(i); - // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots) - // is delivered, the check below should become DCHECK(t != nullptr). - DCHECK(t != nullptr || tuple_size == 0); - memcpy(tuple_buf, t, tuple_size); - tuple_buf += tuple_size; - } - } - - // Copy string slots. Note: we do not need to convert the string ptrs to offsets - // on the write path, only on the read. The tuple data is immediately followed - // by the string data so only the len information is necessary. - for (int i = 0; i < _string_slots.size(); ++i) { - Tuple* tuple = row->get_tuple(_string_slots[i].first); - if (HasNullableTuple && tuple == nullptr) { - continue; - } - if (UNLIKELY(!copy_strings(tuple, _string_slots[i].second, &bytes_allocated))) { - _write_block->return_allocation(bytes_allocated); - return false; - } - } - - // Copy collection slots. We copy collection data in a well-defined order so we do not - // need to convert pointers to offsets on the write path. - // for (int i = 0; i < _collection_slots.size(); ++i) { - // Tuple* tuple = row->get_tuple(_collection_slots[i].first); - // if (HasNullableTuple && tuple == nullptr) continue; - // if (UNLIKELY(!copy_collections(tuple, _collection_slots[i].second, - // &bytes_allocated))) { - // _write_block->return_allocation(bytes_allocated); - // return false; - // } - // } - - _write_block->add_row(); - ++_num_rows; - return true; -} - -bool BufferedTupleStream2::copy_strings(const Tuple* tuple, - const vector& string_slots, - int* bytes_allocated) { - for (int i = 0; i < string_slots.size(); ++i) { - const SlotDescriptor* slot_desc = string_slots[i]; - if (tuple->is_null(slot_desc->null_indicator_offset())) { - continue; - } - const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - if (LIKELY(sv->len > 0)) { - if (UNLIKELY(_write_block->bytes_remaining() < sv->len)) { - return false; - } - uint8_t* buf = _write_block->allocate(sv->len); - (*bytes_allocated) += sv->len; - memcpy(buf, sv->ptr, sv->len); - } - } - return true; -} -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.h b/be/src/runtime/buffered_tuple_stream2.h deleted file mode 100644 index 7d16ad3441..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.h +++ /dev/null @@ -1,412 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.h
-// and modified by Doris
-
-#pragma once
-
-#include
-
-#include "common/status.h"
-#include "runtime/buffered_block_mgr2.h"
-
-namespace doris {
-
-class BufferedBlockMgr2;
-class RuntimeProfile;
-class RuntimeState;
-class RowBatch;
-class RowDescriptor;
-class SlotDescriptor;
-class TupleRow;
-class Tuple;
-
-// Class that provides an abstraction for a stream of tuple rows. Rows can be
-// added to the stream and returned. Rows are returned in the order they are added.
-//
-// The underlying memory management is done by the BufferedBlockMgr2.
-//
-// The tuple stream consists of a number of small (less than IO-sized) blocks followed
-// by an arbitrary number of IO-sized blocks. The smaller blocks do not spill and are
-// there to lower the minimum buffering requirements. For example, an operator that
-// needs to maintain 64 streams (1 buffer per partition) would need, by default,
-// 64 * 8MB = 512MB of buffering. A query with 5 of these operators would require
-// 2.56GB just to run, regardless of how much of that is used. This is
-// problematic for small queries. Instead we will start with a fixed number of small
-// buffers (currently 2 small buffers: one 64KB and one 512KB) and only start using
-// IO-sized buffers when those fill up. The small buffers never spill.
-// The stream will *not* automatically switch from using small buffers to IO-sized
-// buffers when all the small buffers for this stream have been used.
-//
-// The BufferedTupleStream2 is *not* thread safe from the caller's point of view. It is
-// expected that all the APIs are called from a single thread. Internally, the
-// object is thread safe with respect to the underlying block mgr.
-//
-// Buffer management:
-// The stream is either pinned or unpinned, set via pin_stream() and unpin_stream().
-// Blocks are optionally deleted as they are read, set with the delete_on_read argument
-// to prepare_for_read().
-//
-// Block layout:
-// At the header of each block, starting at position 0, there is a bitstring with null
-// indicators for all the tuples in each row in the block. Then there are the tuple rows.
-// We further optimize the codepaths when we know that no tuple is nullable, indicated
-// by '_nullable_tuple'.
-//
-// Tuple row layout:
-// Tuples are stored back to back. Each tuple starts with the fixed length portion,
-// directly followed by the var len portion. (Fixed len and var len are interleaved.)
-// If any tuple in the row is nullable, then there is a bitstring of null tuple
-// indicators at the header of the block. The order of bits in the null indicators
-// bitstring corresponds to the order of tuples in the block. The nullptr tuples are not
-// stored in the body of the block, only as set bits in the null indicators bitstring.
-//
-// The behavior of reads and writes is as follows:
-// Read:
-// 1. Delete on read (_delete_on_read): Blocks are deleted as we go through the stream.
-// The data returned by the tuple stream is valid until the next read call so the
-// caller does not need to copy if it is streaming.
-// 2. Unpinned: Blocks remain in _blocks and are unpinned after reading.
-// 3. Pinned: Blocks remain in _blocks and are left pinned after reading. If the next
-// block in the stream cannot be pinned, the read call will fail and the caller needs
-// to free memory from the underlying block mgr.
-// Write:
-// 1. Unpinned: Unpin blocks as they fill up. This means only a single (i.e. the
-// current) block needs to be in memory regardless of the input size (if read_write is
-// true, then two blocks need to be in memory).
-// 2. Pinned: Blocks are left pinned. If we run out of blocks, the write will fail and
-// the caller needs to free memory from the underlying block mgr.
-//
-// TODO: we need to be able to do read ahead in the BufferedBlockMgr2. It currently
-// only has PinAllBlocks() which is blocking. We need a non-blocking version of this or
-// some way to indicate a block will need to be pinned soon.
-// TODO: see if this can be merged with Sorter::Run. The key difference is that this
-// does not need to return rows in the order they were added, which allows it to be
-// simpler.
-// TODO: we could compact the small buffers when we need to spill, but they use very
-// little memory so this might not be very useful.
-// TODO: improvements:
-// - Think more about the layout of the var len data, possibly filling it in from the
-// end of the same block. Don't interleave fixed and var len data.
-// - It would be good to allocate the null indicators at the end of each block and grow
-// this array as new rows are inserted in the block. If we do so, then there will be
-// fewer gaps in case of many rows with nullptr tuples.
-// - We will want to multithread this. Add an AddBlock() call so the synchronization
-// happens at the block level. This is a natural extension.
-// - Instead of allocating all blocks from the block_mgr, allocate some blocks that
-// are much smaller (e.g. 16K and doubling up to the block size). This way, very
-// small streams (a common case) will use very little memory. These small blocks
-// are always in memory since spilling them frees up negligible memory.
-// - Return row batches in get_next() instead of filling one in.
-// - Should we 32-bit align the start of the tuple rows? Now it is byte-aligned.
-class BufferedTupleStream2 {
-public:
-    // Ordinal index into the stream to retrieve a row in O(1) time. This index can
-    // only be used if the stream is pinned.
-    // To read a row from a stream we need three pieces of information that we squeeze in
-    // 64 bits:
-    // - The index of the block. The block id is stored in 16 bits. We can have up to
-    // 64K blocks per tuple stream. With 8MB blocks that is 512GB per stream.
-    // - The offset of the start of the row (data) within the block. Since blocks are 8MB
-    // we use 24 bits for the offsets. (In theory we could use 23 bits.)
-    // - The idx of the row in the block. We need this for retrieving the null indicators.
-    // We use 24 bits for this index as well.
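// As a worked example of this encoding: block 3, row data starting at byte offset
// 0x1200 within the block, row 57 within the block packs (via set() below) as
//
//     data = 3 | (0x1200 << 16) | (57ULL << 40) = 0x0000390012000003
//
// and block(), offset() and idx() mask the three fields back out in O(1).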
-    struct RowIdx {
-        static const uint64_t BLOCK_MASK = 0xFFFF;
-        static const uint64_t BLOCK_SHIFT = 0;
-        static const uint64_t OFFSET_MASK = 0xFFFFFF0000;
-        static const uint64_t OFFSET_SHIFT = 16;
-        static const uint64_t IDX_MASK = 0xFFFFFF0000000000;
-        static const uint64_t IDX_SHIFT = 40;
-
-        uint64_t block() const { return (data & BLOCK_MASK); }
-
-        uint64_t offset() const { return (data & OFFSET_MASK) >> OFFSET_SHIFT; }
-
-        uint64_t idx() const { return (data & IDX_MASK) >> IDX_SHIFT; }
-
-        uint64_t set(uint64_t block, uint64_t offset, uint64_t idx) {
-            DCHECK_LE(block, BLOCK_MASK)
-                    << "Cannot have more than 2^16 = 64K blocks in a tuple stream.";
-            DCHECK_LE(offset, OFFSET_MASK >> OFFSET_SHIFT)
-                    << "Cannot have blocks larger than 2^24 = 16MB";
-            DCHECK_LE(idx, IDX_MASK >> IDX_SHIFT)
-                    << "Cannot have more than 2^24 = 16M rows in a block.";
-            data = block | (offset << OFFSET_SHIFT) | (idx << IDX_SHIFT);
-            return data;
-        }
-
-        std::string debug_string() const;
-
-        uint64_t data;
-    };
-
-    // row_desc: description of rows stored in the stream. This is the desc for rows
-    // that are added and the rows being returned.
-    // block_mgr: Underlying block mgr that owns the data blocks.
-    // use_initial_small_buffers: If true, the initial N buffers allocated for the
-    // tuple stream use smaller than IO-sized buffers.
-    // read_write: Stream allows interchanging read and write operations. Requires that
-    // at least two blocks can be pinned.
-    BufferedTupleStream2(RuntimeState* state, const RowDescriptor& row_desc,
-                         BufferedBlockMgr2* block_mgr, BufferedBlockMgr2::Client* client,
-                         bool use_initial_small_buffers, bool read_write);
-    // A no-op dtor to pass the code style check.
-    ~BufferedTupleStream2() {}
-
-    // Initializes the tuple stream object on behalf of node 'node_id'. Must be called
-    // once before any of the other APIs.
-    // If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
-    // If 'profile' is non-nullptr, counters are created.
-    // 'node_id' is only used for error reporting.
-    Status init(int node_id, RuntimeProfile* profile, bool pinned);
-
-    // Must be called for streams using small buffers to switch to IO-sized buffers.
-    // If it fails to get a buffer (i.e. the switch fails) it resets _use_small_buffers
-    // back to false.
-    // TODO: this does not seem like the best mechanism.
-    Status switch_to_io_buffers(bool* got_buffer);
-
-    // Adds a single row to the stream. Returns false and sets *status if an error
-    // occurred. BufferedTupleStream2 will do a deep copy of the memory in the row.
-    bool add_row(TupleRow* row, Status* status);
-
-    // Allocates space to store a row of size 'size' and returns a pointer to the memory
-    // when successful. Returns nullptr if there is not enough memory or an error occurred.
-    // When returning nullptr, sets *status. The returned memory is guaranteed to fit on one
-    // block.
-    uint8_t* allocate_row(int size, Status* status);
-
-    // Populates 'row' with the row at 'idx'. The stream must be pinned. The row must have
-    // been allocated with the stream's row desc.
-    void get_tuple_row(const RowIdx& idx, TupleRow* row) const;
-
-    // Prepares the stream for reading. If _read_write, this can be called at any time to
-    // begin reading. Otherwise this must be called after the last add_row() and
-    // before get_next().
-    // delete_on_read: Blocks are deleted after they are read.
-    // If got_buffer is nullptr, this function will fail (with a bad status) if no buffer
-    // is available.
If got_buffer is non-null, this function will not fail on OOM and - // *got_buffer is true if a buffer was pinned. - Status prepare_for_read(bool delete_on_read, bool* got_buffer = nullptr); - - // Pins all blocks in this stream and switches to pinned mode. - // If there is not enough memory, *pinned is set to false and the stream is unmodified. - // If already_reserved is true, the caller has already made a reservation on - // _block_mgr_client to pin the stream. - Status pin_stream(bool already_reserved, bool* pinned); - - // Unpins stream. If all is true, all blocks are unpinned, otherwise all blocks - // except the _write_block and _read_block are unpinned. - Status unpin_stream(bool all = false); - - // Get the next batch of output rows. Memory is still owned by the BufferedTupleStream2 - // and must be copied out by the caller. - // If 'indices' is non-nullptr, that is also populated for each returned row with the - // index for that row. - Status get_next(RowBatch* batch, bool* eos, std::vector* indices = nullptr); - - // Returns all the rows in the stream in batch. This pins the entire stream - // in the process. - // *got_rows is false if the stream could not be pinned. - Status get_rows(std::unique_ptr* batch, bool* got_rows); - - // Must be called once at the end to cleanup all resources. Idempotent. - void close(); - - // Number of rows in the stream. - int64_t num_rows() const { return _num_rows; } - - // Number of rows returned via get_next(). - int64_t rows_returned() const { return _rows_returned; } - - // Returns the byte size necessary to store the entire stream in memory. - int64_t byte_size() const { return _total_byte_size; } - - // Returns the byte size of the stream that is currently pinned in memory. - // If ignore_current is true, the _write_block memory is not included. - int64_t bytes_in_mem(bool ignore_current) const; - - bool is_pinned() const { return _pinned; } - int blocks_pinned() const { return _num_pinned; } - int blocks_unpinned() const { return _blocks.size() - _num_pinned - _num_small_blocks; } - bool has_read_block() const { return _read_block != _blocks.end(); } - bool has_write_block() const { return _write_block != nullptr; } - bool using_small_buffers() const { return _use_small_buffers; } - bool has_tuple_footprint() const { - return _fixed_tuple_row_size > 0 || !_string_slots.empty() || _nullable_tuple; - } - - std::string debug_string() const; - -private: - // friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test; - - // If true, this stream is still using small buffers. - bool _use_small_buffers; - - // If true, blocks are deleted after they are read. - bool _delete_on_read; - - // If true, read and write operations may be interleaved. Otherwise all calls - // to AddRow() must occur before calling prepare_for_read() and subsequent calls to - // get_next(). - const bool _read_write; - - // Runtime state instance used to check for cancellation. Not owned. - RuntimeState* const _state; - - // Description of rows stored in the stream. - const RowDescriptor& _desc; - - // Whether any tuple in the rows is nullable. - const bool _nullable_tuple; - - // Sum of the fixed length portion of all the tuples in _desc. - int _fixed_tuple_row_size; - - // Max size (in bytes) of null indicators bitstring in the current read and write - // blocks. If 0, it means that there is no need to store null indicators for this - // RowDesc. We calculate this value based on the block's size and the - // _fixed_tuple_row_size. 
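// Worked example of compute_num_null_indicator_bytes() (defined in the .cc above):
// with 2 tuples per row and _fixed_tuple_row_size = 30 bytes, a 4KB block gives
//
//     min_row_size_in_bits = 8 * 30 + 2 = 242
//     max_num_rows         = (8 * 4096) / 242 = 135
//     header bytes         = round_up_numi64(135 * 2) * 8 = 5 * 8 = 40
//
// i.e. 40 bytes of null indicators cover the worst case of 135 rows * 2 bits each.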
When not 0, this value is also an upper bound for the number - // of (rows * tuples_per_row) in this block. - uint32_t _null_indicators_read_block; - uint32_t _null_indicators_write_block; - - // Vector of all the strings slots grouped by tuple_idx. - std::vector>> _string_slots; - - // Vector of all the collection slots grouped by tuple_idx. - // std::vector>> _collection_slots; - - // Block manager and client used to allocate, pin and release blocks. Not owned. - BufferedBlockMgr2* _block_mgr; - BufferedBlockMgr2::Client* _block_mgr_client; - - // List of blocks in the stream. - std::list _blocks; - - // Total size of _blocks, including small blocks. - int64_t _total_byte_size; - - // Iterator pointing to the current block for read. Equal to list.end() until - // prepare_for_read() is called. - std::list::iterator _read_block; - - // For each block in the stream, the buffer of the start of the block. This is only - // valid when the stream is pinned, giving random access to data in the stream. - // This is not maintained for _delete_on_read. - std::vector _block_start_idx; - - // Current ptr offset in _read_block's buffer. - uint8_t* _read_ptr; - - // Current idx of the tuple read from the _read_block buffer. - uint32_t _read_tuple_idx; - - // Current idx of the tuple written at the _write_block buffer. - uint32_t _write_tuple_idx; - - // Bytes read in _read_block. - int64_t _read_bytes; - - // Number of rows returned to the caller from get_next(). - int64_t _rows_returned; - - // The block index of the current read block. - int _read_block_idx; - - // The current block for writing. nullptr if there is no available block to write to. - BufferedBlockMgr2::Block* _write_block; - - // Number of pinned blocks in _blocks, stored to avoid iterating over the list - // to compute bytes_in_mem and bytes_unpinned. - // This does not include small blocks. - int _num_pinned; - - // The total number of small blocks in _blocks; - int _num_small_blocks; - - bool _closed; // Used for debugging. - - // Number of rows stored in the stream. - int64_t _num_rows; - - // If true, this stream has been explicitly pinned by the caller. This changes the - // memory management of the stream. The blocks are not unpinned until the caller calls - // UnpinAllBlocks(). If false, only the _write_block and/or _read_block are pinned - // (both are if _read_write is true). - bool _pinned; - - // Counters added by this object to the parent runtime profile. - RuntimeProfile::Counter* _pin_timer; - RuntimeProfile::Counter* _unpin_timer; - RuntimeProfile::Counter* _get_new_block_timer; - - // Copies 'row' into _write_block. Returns false if there is not enough space in - // '_write_block'. - template - bool deep_copy_internal(TupleRow* row); - - // Helper function to copy strings from tuple into _write_block. Increments - // bytes_allocated by the number of bytes allocated from _write_block. - bool copy_strings(const Tuple* tuple, const std::vector& string_slots, - int* bytes_allocated); - - // Helper function to deep copy collections from tuple into _write_block. Increments - // bytes_allocated by the number of bytes allocated from _write_block. - // bool copy_collections(const Tuple* tuple, - // const std::vector& collection_slots, int* bytes_allocated); - - // Wrapper of the templated deep_copy_internal() function. - bool deep_copy(TupleRow* row); - - // Gets a new block from the _block_mgr, updating _write_block and _write_tuple_idx, - // and setting *got_block. 
If there are no blocks available, *got_block is set to - // false and _write_block is unchanged. - // 'min_size' is the minimum number of bytes required for this block. - Status new_block_for_write(int64_t min_size, bool* got_block); - - // Reads the next block from the _block_mgr. This blocks if necessary. - // Updates _read_block, _read_ptr, _read_tuple_idx and _read_bytes. - Status next_block_for_read(); - - // Returns the byte size of this row when encoded in a block. - int64_t compute_row_size(TupleRow* row) const; - - // Unpins block if it is an IO-sized block and updates tracking stats. - Status unpin_block(BufferedBlockMgr2::Block* block); - - // Templated get_next implementation. - template - Status get_next_internal(RowBatch* batch, bool* eos, std::vector* indices); - - // Read strings from stream by converting pointers and updating _read_ptr and - // _read_bytes. - void read_strings(const std::vector& string_slots, int data_len, Tuple* tuple); - - // Read collections from stream by converting pointers and updating _read_ptr and - // _read_bytes. - // void ReadCollections(const std::vector& collection_slots, int data_len, - // Tuple* tuple); - - // Computes the number of bytes needed for null indicators for a block of 'block_size' - int compute_num_null_indicator_bytes(int block_size) const; -}; - -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.inline.h b/be/src/runtime/buffered_tuple_stream2.inline.h deleted file mode 100644 index 99add39b83..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.inline.h +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// This file is copied from -// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.inline.h -// and modified by Doris - -#pragma once - -#include "runtime/buffered_tuple_stream2.h" -#include "runtime/descriptors.h" -#include "runtime/tuple_row.h" - -namespace doris { - -inline bool BufferedTupleStream2::add_row(TupleRow* row, Status* status) { - DCHECK(!_closed); - if (LIKELY(deep_copy(row))) { - return true; - } - bool got_block; - int64_t row_size = compute_row_size(row); - *status = new_block_for_write(row_size, &got_block); - if (!status->ok() || !got_block) { - return false; - } - return deep_copy(row); -} - -inline uint8_t* BufferedTupleStream2::allocate_row(int size, Status* status) { - DCHECK(!_closed); - if (UNLIKELY(_write_block == nullptr || _write_block->bytes_remaining() < size)) { - bool got_block; - *status = new_block_for_write(size, &got_block); - if (!status->ok() || !got_block) { - return nullptr; - } - } - DCHECK(_write_block != nullptr); - DCHECK(_write_block->is_pinned()); - DCHECK_GE(_write_block->bytes_remaining(), size); - ++_num_rows; - _write_block->add_row(); - return _write_block->allocate(size); -} - -inline void BufferedTupleStream2::get_tuple_row(const RowIdx& idx, TupleRow* row) const { - DCHECK(row != nullptr); - DCHECK(!_closed); - DCHECK(is_pinned()); - DCHECK(!_delete_on_read); - DCHECK_EQ(_blocks.size(), _block_start_idx.size()); - DCHECK_LT(idx.block(), _blocks.size()); - - uint8_t* data = _block_start_idx[idx.block()] + idx.offset(); - if (_nullable_tuple) { - // Stitch together the tuples from the block and the nullptr ones. - const int tuples_per_row = _desc.tuple_descriptors().size(); - uint32_t tuple_idx = idx.idx() * tuples_per_row; - for (int i = 0; i < tuples_per_row; ++i) { - const uint8_t* null_word = _block_start_idx[idx.block()] + (tuple_idx >> 3); - const uint32_t null_pos = tuple_idx & 7; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - row->set_tuple( - i, reinterpret_cast(reinterpret_cast(data) * is_not_null)); - data += _desc.tuple_descriptors()[i]->byte_size() * is_not_null; - ++tuple_idx; - } - } else { - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - row->set_tuple(i, reinterpret_cast(data)); - data += _desc.tuple_descriptors()[i]->byte_size(); - } - } -} - -} // namespace doris diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc deleted file mode 100644 index 2a35f5c70c..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.cc +++ /dev/null @@ -1,867 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.cc -// and modified by Doris - -#include - -#include "runtime/buffered_tuple_stream3.inline.h" -#include "runtime/descriptors.h" -#include "runtime/exec_env.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" -#include "util/debug_util.h" - -#ifdef NDEBUG -#define CHECK_CONSISTENCY_FAST() -#define CHECK_CONSISTENCY_FULL() -#else -#define CHECK_CONSISTENCY_FAST() CheckConsistencyFast() -#define CHECK_CONSISTENCY_FULL() CheckConsistencyFull() -#endif - -using namespace doris; -using namespace strings; - -using BufferHandle = BufferPool::BufferHandle; - -BufferedTupleStream3::BufferedTupleStream3(RuntimeState* state, const RowDescriptor* row_desc, - BufferPool::ClientHandle* buffer_pool_client, - int64_t default_page_len, - const std::set& ext_varlen_slots) - : state_(state), - desc_(row_desc), - node_id_(-1), - buffer_pool_(state->exec_env()->buffer_pool()), - buffer_pool_client_(buffer_pool_client), - num_pages_(0), - total_byte_size_(0), - has_read_iterator_(false), - read_page_rows_returned_(-1), - read_ptr_(nullptr), - read_end_ptr_(nullptr), - write_ptr_(nullptr), - write_end_ptr_(nullptr), - rows_returned_(0), - has_write_iterator_(false), - write_page_(nullptr), - bytes_pinned_(0), - num_rows_(0), - default_page_len_(default_page_len), - has_nullable_tuple_(row_desc->is_any_tuple_nullable()), - delete_on_read_(false), - closed_(false), - pinned_(true) { - DCHECK(BitUtil::IsPowerOf2(default_page_len)) << default_page_len; - read_page_ = pages_.end(); - for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i]; - const int tuple_byte_size = tuple_desc->byte_size(); - fixed_tuple_sizes_.push_back(tuple_byte_size); - - vector tuple_string_slots; - vector tuple_coll_slots; - for (int j = 0; j < tuple_desc->slots().size(); ++j) { - SlotDescriptor* slot = tuple_desc->slots()[j]; - if (!slot->type().is_var_len_string_type()) continue; - if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) { - if (slot->type().is_var_len_string_type()) { - tuple_string_slots.push_back(slot); - } else { - DCHECK(slot->type().is_collection_type()); - tuple_coll_slots.push_back(slot); - } - } - } - if (!tuple_string_slots.empty()) { - inlined_string_slots_.push_back(make_pair(i, tuple_string_slots)); - } - /* - if (!tuple_coll_slots.empty()) { - inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots)); - } -*/ - } -} - -BufferedTupleStream3::~BufferedTupleStream3() { - DCHECK(closed_); -} - -void BufferedTupleStream3::CheckConsistencyFull() const { - CheckConsistencyFast(); - // The below checks require iterating over all the pages in the stream. - DCHECK_EQ(bytes_pinned_, CalcBytesPinned()) << DebugString(); - DCHECK_EQ(pages_.size(), num_pages_) << DebugString(); - for (const Page& page : pages_) CheckPageConsistency(&page); -} - -void BufferedTupleStream3::CheckConsistencyFast() const { - // All the below checks should be O(1). - DCHECK(has_write_iterator() || write_page_ == nullptr); - if (write_page_ != nullptr) { - CheckPageConsistency(write_page_); - DCHECK(write_page_->is_pinned()); - DCHECK(write_page_->retrieved_buffer); - const BufferHandle* write_buffer; - Status status = write_page_->GetBuffer(&write_buffer); - DCHECK(status.ok()); // Write buffer should never have been unpinned. 
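// The three checks below pin down the write-cursor invariant: the cursor always
// stays inside the current write buffer, i.e.
//
//     write_buffer->data() <= write_ptr_ <= write_end_ptr_
//                                           == write_buffer->data() + write_page_->len()
//
// Appends roughly test write_ptr_ + size <= write_end_ptr_ and otherwise take the
// AdvanceWritePage() slow path further down.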
- DCHECK_GE(write_ptr_, write_buffer->data()); - DCHECK_EQ(write_end_ptr_, write_buffer->data() + write_page_->len()); - DCHECK_GE(write_end_ptr_, write_ptr_); - } - DCHECK(has_read_iterator() || read_page_ == pages_.end()); - if (read_page_ != pages_.end()) { - CheckPageConsistency(&*read_page_); - DCHECK(read_page_->is_pinned()); - DCHECK(read_page_->retrieved_buffer); - // Can't check read buffer without affecting behaviour, because a read may be in - // flight and this would required blocking on that write. - DCHECK_GE(read_end_ptr_, read_ptr_); - } -} - -void BufferedTupleStream3::CheckPageConsistency(const Page* page) const { - DCHECK_EQ(ExpectedPinCount(pinned_, page), page->pin_count()) << DebugString(); - // Only one large row per page. - if (page->len() > default_page_len_) DCHECK_LE(page->num_rows, 1); - // We only create pages when we have a row to append to them. - DCHECK_GT(page->num_rows, 0); -} - -string BufferedTupleStream3::DebugString() const { - std::stringstream ss; - ss << "BufferedTupleStream3 num_rows=" << num_rows_ << " rows_returned=" << rows_returned_ - << " pinned=" << pinned_ << " delete_on_read=" << delete_on_read_ << " closed=" << closed_ - << "\n" - << " bytes_pinned=" << bytes_pinned_ << " has_write_iterator=" << has_write_iterator_ - << " write_page=" << write_page_ << " has_read_iterator=" << has_read_iterator_ - << " read_page="; - if (read_page_ == pages_.end()) { - ss << ""; - } else { - ss << &*read_page_; - } - ss << "\n # pages=" << num_pages_ << " pages=[\n"; - for (const Page& page : pages_) { - ss << "{" << page.DebugString() << "}"; - if (&page != &pages_.back()) ss << ",\n"; - } - ss << "]"; - return ss.str(); -} - -string BufferedTupleStream3::Page::DebugString() const { - //return Substitute("$0 num_rows=$1", handle.DebugString(), num_rows); - return string(""); -} - -Status BufferedTupleStream3::Init(int node_id, bool pinned) { - // if (!pinned) UnpinStream(UNPIN_ALL_EXCEPT_CURRENT); - node_id_ = node_id; - return Status::OK(); -} - -Status BufferedTupleStream3::PrepareForWrite() { - // This must be the first iterator created. - DCHECK(pages_.empty()); - DCHECK(!delete_on_read_); - DCHECK(!has_write_iterator()); - DCHECK(!has_read_iterator()); - CHECK_CONSISTENCY_FULL(); - - has_write_iterator_ = true; - return Status::OK(); -} - -Status BufferedTupleStream3::PrepareForReadWrite(bool delete_on_read) { - // This must be the first iterator created. - DCHECK(pages_.empty()); - DCHECK(!delete_on_read_); - DCHECK(!has_write_iterator()); - DCHECK(!has_read_iterator()); - CHECK_CONSISTENCY_FULL(); - - has_write_iterator_ = true; - RETURN_IF_ERROR(PrepareForReadInternal(delete_on_read)); - return Status::OK(); -} - -void BufferedTupleStream3::Close(RowBatch* batch, RowBatch::FlushMode flush) { - for (Page& page : pages_) { - if (batch != nullptr && page.retrieved_buffer) { - // Subtle: We only need to attach buffers from pages that we may have returned - // references to. ExtractBuffer() cannot fail for these pages because the data - // is guaranteed to already be in -memory. 
- BufferPool::BufferHandle buffer; - Status status = buffer_pool_->ExtractBuffer(buffer_pool_client_, &page.handle, &buffer); - DCHECK(status.ok()); - batch->add_buffer(buffer_pool_client_, std::move(buffer), flush); - } else { - buffer_pool_->DestroyPage(buffer_pool_client_, &page.handle); - } - } - pages_.clear(); - num_pages_ = 0; - bytes_pinned_ = 0; - closed_ = true; -} - -int64_t BufferedTupleStream3::CalcBytesPinned() const { - int64_t result = 0; - for (const Page& page : pages_) result += page.pin_count() * page.len(); - return result; -} - -Status BufferedTupleStream3::PinPage(Page* page) { - RETURN_IF_ERROR(buffer_pool_->Pin(buffer_pool_client_, &page->handle)); - bytes_pinned_ += page->len(); - return Status::OK(); -} - -int BufferedTupleStream3::ExpectedPinCount(bool stream_pinned, const Page* page) const { - return (stream_pinned || is_read_page(page) || is_write_page(page)) ? 1 : 0; -} - -Status BufferedTupleStream3::PinPageIfNeeded(Page* page, bool stream_pinned) { - int new_pin_count = ExpectedPinCount(stream_pinned, page); - if (new_pin_count != page->pin_count()) { - DCHECK_EQ(new_pin_count, page->pin_count() + 1); - RETURN_IF_ERROR(PinPage(page)); - } - return Status::OK(); -} - -void BufferedTupleStream3::UnpinPageIfNeeded(Page* page, bool stream_pinned) { - int new_pin_count = ExpectedPinCount(stream_pinned, page); - if (new_pin_count != page->pin_count()) { - DCHECK_EQ(new_pin_count, page->pin_count() - 1); - buffer_pool_->Unpin(buffer_pool_client_, &page->handle); - bytes_pinned_ -= page->len(); - if (page->pin_count() == 0) page->retrieved_buffer = false; - } -} - -Status BufferedTupleStream3::NewWritePage(int64_t page_len) noexcept { - DCHECK(!closed_); - DCHECK(write_page_ == nullptr); - - Page new_page; - const BufferHandle* write_buffer; - RETURN_IF_ERROR(buffer_pool_->CreatePage(buffer_pool_client_, page_len, &new_page.handle, - &write_buffer)); - bytes_pinned_ += page_len; - total_byte_size_ += page_len; - - pages_.push_back(std::move(new_page)); - ++num_pages_; - write_page_ = &pages_.back(); - DCHECK_EQ(write_page_->num_rows, 0); - write_ptr_ = write_buffer->data(); - write_end_ptr_ = write_ptr_ + page_len; - return Status::OK(); -} - -void BufferedTupleStream3::CalcPageLenForRow(int64_t row_size, int64_t* page_len) { - *page_len = std::max(default_page_len_, BitUtil::RoundUpToPowerOfTwo(row_size)); -} - -Status BufferedTupleStream3::AdvanceWritePage(int64_t row_size) noexcept { - DCHECK(has_write_iterator()); - CHECK_CONSISTENCY_FAST(); - - int64_t page_len; - - CalcPageLenForRow(row_size, &page_len); - ResetWritePage(); - //RETURN_IF_ERROR(NewWritePage(page_len)); - Status status = NewWritePage(page_len); - if (UNLIKELY(!status.ok())) { - return status; - } - return Status::OK(); -} - -void BufferedTupleStream3::ResetWritePage() { - if (write_page_ == nullptr) return; - // Unpin the write page if we're reading in unpinned mode. - Page* prev_write_page = write_page_; - write_page_ = nullptr; - write_ptr_ = nullptr; - write_end_ptr_ = nullptr; - - // May need to decrement pin count now that it's not the write page, depending on - // the stream's mode. 
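The pin accounting in ExpectedPinCount()/PinPageIfNeeded()/UnpinPageIfNeeded() above maintains a single invariant, sketched here for reference (standalone illustration; the real code inspects iterator state via is_read_page()/is_write_page()):

// A page is expected to hold exactly one pin iff the stream is pinned or the
// page is the current read or write page; otherwise it holds zero pins.
inline int expected_pin_count(bool stream_pinned, bool is_read_page, bool is_write_page) {
    return (stream_pinned || is_read_page || is_write_page) ? 1 : 0;
}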
- UnpinPageIfNeeded(prev_write_page, pinned_); -} - -void BufferedTupleStream3::InvalidateWriteIterator() { - if (!has_write_iterator()) return; - ResetWritePage(); - has_write_iterator_ = false; -} - -Status BufferedTupleStream3::NextReadPage() { - DCHECK(has_read_iterator()); - DCHECK(!closed_); - CHECK_CONSISTENCY_FAST(); - - if (read_page_ == pages_.end()) { - // No rows read yet - start reading at first page. If the stream is unpinned, we can - // use the reservation saved in PrepareForReadWrite() to pin the first page. - read_page_ = pages_.begin(); - } else if (delete_on_read_) { - DCHECK(read_page_ == pages_.begin()) << read_page_->DebugString() << " " << DebugString(); - DCHECK_NE(&*read_page_, write_page_); - bytes_pinned_ -= pages_.front().len(); - buffer_pool_->DestroyPage(buffer_pool_client_, &pages_.front().handle); - pages_.pop_front(); - --num_pages_; - read_page_ = pages_.begin(); - } else { - // Unpin pages after reading them if needed. - Page* prev_read_page = &*read_page_; - ++read_page_; - UnpinPageIfNeeded(prev_read_page, pinned_); - } - - if (read_page_ == pages_.end()) { - CHECK_CONSISTENCY_FULL(); - return Status::OK(); - } - - // Ensure the next page is pinned for reading. By this point we should have enough - // reservation to pin the page. If the stream is pinned, the page is already pinned. - // If the stream is unpinned, we freed up enough memory for a default-sized page by - // deleting or unpinning the previous page and ensured that, if the page was larger, - // that the reservation is available with the above check. - RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_)); - - // This waits for the pin to complete if the page was unpinned earlier. - const BufferHandle* read_buffer; - RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer)); - - read_page_rows_returned_ = 0; - read_ptr_ = read_buffer->data(); - read_end_ptr_ = read_ptr_ + read_buffer->len(); - - CHECK_CONSISTENCY_FAST(); - return Status::OK(); -} - -void BufferedTupleStream3::InvalidateReadIterator() { - if (read_page_ != pages_.end()) { - // Unpin the write page if we're reading in unpinned mode. - Page* prev_read_page = &*read_page_; - read_page_ = pages_.end(); - read_ptr_ = nullptr; - read_end_ptr_ = nullptr; - - // May need to decrement pin count after destroying read iterator. - UnpinPageIfNeeded(prev_read_page, pinned_); - } - has_read_iterator_ = false; - // It is safe to re-read a delete-on-read stream if no rows were read and no pages - // were therefore deleted. - if (rows_returned_ == 0) delete_on_read_ = false; -} - -Status BufferedTupleStream3::PrepareForRead(bool delete_on_read) { - CHECK_CONSISTENCY_FULL(); - InvalidateWriteIterator(); - InvalidateReadIterator(); - return PrepareForReadInternal(delete_on_read); -} - -Status BufferedTupleStream3::PrepareForReadInternal(bool delete_on_read) { - DCHECK(!closed_); - DCHECK(!delete_on_read_); - DCHECK(!has_read_iterator()); - - has_read_iterator_ = true; - if (pages_.empty()) { - // No rows to return, or a the first read/write page has not yet been allocated. - read_page_ = pages_.end(); - read_ptr_ = nullptr; - read_end_ptr_ = nullptr; - } else { - // Eagerly pin the first page in the stream. - read_page_ = pages_.begin(); - // Check if we need to increment the pin count of the read page. - RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_)); - DCHECK(read_page_->is_pinned()); - - // This waits for the pin to complete if the page was unpinned earlier. 
-        const BufferHandle* read_buffer;
-        RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
-        read_ptr_ = read_buffer->data();
-        read_end_ptr_ = read_ptr_ + read_buffer->len();
-    }
-    read_page_rows_returned_ = 0;
-    rows_returned_ = 0;
-    delete_on_read_ = delete_on_read;
-    CHECK_CONSISTENCY_FULL();
-    return Status::OK();
-}
-
-Status BufferedTupleStream3::PinStream(bool* pinned) {
-    DCHECK(!closed_);
-    CHECK_CONSISTENCY_FULL();
-    if (pinned_) {
-        *pinned = true;
-        return Status::OK();
-    }
-    *pinned = false;
-
-    // At this point success is guaranteed - go through to pin the pages we need to pin.
-    // If the page data was evicted from memory, the read I/O can happen in parallel
-    // because we defer calling GetBuffer() until NextReadPage().
-    for (Page& page : pages_) RETURN_IF_ERROR(PinPageIfNeeded(&page, true));
-
-    pinned_ = true;
-    *pinned = true;
-    CHECK_CONSISTENCY_FULL();
-    return Status::OK();
-}
-/*
-void BufferedTupleStream3::UnpinStream(UnpinMode mode) {
-    CHECK_CONSISTENCY_FULL();
-    DCHECK(!closed_);
-    if (mode == UNPIN_ALL) {
-        // Invalidate the iterators so they don't keep pages pinned.
-        InvalidateWriteIterator();
-        InvalidateReadIterator();
-    }
-
-    if (pinned_) {
-        CHECK_CONSISTENCY_FULL();
-        // If the stream was pinned, there may be some remaining pinned pages that should
-        // be unpinned at this point.
-        for (Page& page : pages_) UnpinPageIfNeeded(&page, false);
-
-        pinned_ = false;
-    }
-    CHECK_CONSISTENCY_FULL();
-}
-*/
-Status BufferedTupleStream3::GetRows(std::unique_ptr<RowBatch>* batch, bool* got_rows) {
-    if (num_rows() > numeric_limits<int>::max()) {
-        // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
-        return Status::InternalError(
-                "Trying to read {} rows into in-memory batch failed. Limit "
-                "is {}",
-                num_rows(), numeric_limits<int>::max());
-    }
-    RETURN_IF_ERROR(PinStream(got_rows));
-    if (!*got_rows) return Status::OK();
-    RETURN_IF_ERROR(PrepareForRead(false));
-
-    // TODO chenhao
-    // capacity in RowBatch use int, but _num_rows is int64_t
-    // it may be precision loss
-    batch->reset(new RowBatch(*desc_, num_rows()));
-    bool eos = false;
-    // Loop until GetNext fills the entire batch. Each call can stop at page
-    // boundaries. We generally want it to stop, so that pages can be freed
-    // as we read. It is safe in this case because we pin the entire stream.
-    while (!eos) {
-        RETURN_IF_ERROR(GetNext(batch->get(), &eos));
-    }
-    return Status::OK();
-}
-
-Status BufferedTupleStream3::GetNext(RowBatch* batch, bool* eos) {
-    return GetNextInternal<false>(batch, eos, nullptr);
-}
-
-Status BufferedTupleStream3::GetNext(RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
-    return GetNextInternal<true>(batch, eos, flat_rows);
-}
-
-template <bool FILL_FLAT_ROWS>
-Status BufferedTupleStream3::GetNextInternal(RowBatch* batch, bool* eos,
-                                             vector<FlatRowPtr>* flat_rows) {
-    if (has_nullable_tuple_) {
-        return GetNextInternal<FILL_FLAT_ROWS, true>(batch, eos, flat_rows);
-    } else {
-        return GetNextInternal<FILL_FLAT_ROWS, false>(batch, eos, flat_rows);
-    }
-}
-
-template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
-Status BufferedTupleStream3::GetNextInternal(RowBatch* batch, bool* eos,
-                                             vector<FlatRowPtr>* flat_rows) {
-    DCHECK(!closed_);
-    DCHECK(batch->row_desc().equals(*desc_));
-    DCHECK(is_pinned() || !FILL_FLAT_ROWS) << "FlatRowPtrs are only valid for pinned streams";
-    *eos = (rows_returned_ == num_rows_);
-    if (*eos) return Status::OK();
-
-    if (UNLIKELY(read_page_ == pages_.end() || read_page_rows_returned_ == read_page_->num_rows)) {
-        // Get the next page in the stream (or the first page if read_page_ was not yet
-        // initialized.)
We need to do this at the beginning of the GetNext() call to ensure - // the buffer management semantics. NextReadPage() may unpin or delete the buffer - // backing the rows returned from the *previous* call to GetNext(). - RETURN_IF_ERROR(NextReadPage()); - } - - DCHECK(has_read_iterator()); - DCHECK(read_page_ != pages_.end()); - DCHECK(read_page_->is_pinned()) << DebugString(); - DCHECK_GE(read_page_rows_returned_, 0); - - int rows_left_in_page = read_page_->num_rows - read_page_rows_returned_; - int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_page); - DCHECK_GE(rows_to_fill, 1); - uint8_t* tuple_row_mem = reinterpret_cast(batch->get_row(batch->num_rows())); - - // Produce tuple rows from the current page and the corresponding position on the - // null tuple indicator. - if (FILL_FLAT_ROWS) { - DCHECK(flat_rows != nullptr); - DCHECK(!delete_on_read_); - DCHECK_EQ(batch->num_rows(), 0); - flat_rows->clear(); - flat_rows->reserve(rows_to_fill); - } - - const uint64_t tuples_per_row = desc_->tuple_descriptors().size(); - // Start reading from the current position in 'read_page_'. - for (int i = 0; i < rows_to_fill; ++i) { - if (FILL_FLAT_ROWS) { - flat_rows->push_back(read_ptr_); - DCHECK_EQ(flat_rows->size(), i + 1); - } - // Copy the row into the output batch. - TupleRow* output_row = reinterpret_cast(tuple_row_mem); - tuple_row_mem += sizeof(Tuple*) * tuples_per_row; - UnflattenTupleRow(&read_ptr_, output_row); - - // Update string slot ptrs, skipping external strings. - for (int j = 0; j < inlined_string_slots_.size(); ++j) { - Tuple* tuple = output_row->get_tuple(inlined_string_slots_[j].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - FixUpStringsForRead(inlined_string_slots_[j].second, tuple); - } - /* - // Update collection slot ptrs, skipping external collections. We traverse the - // collection structure in the same order as it was written to the stream, allowing - // us to infer the data layout based on the length of collections and strings. - for (int j = 0; j < inlined_coll_slots_.size(); ++j) { - Tuple* tuple = output_row->get_tuple(inlined_coll_slots_[j].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple); - } -*/ - } - - batch->commit_rows(rows_to_fill); - rows_returned_ += rows_to_fill; - read_page_rows_returned_ += rows_to_fill; - *eos = (rows_returned_ == num_rows_); - if (read_page_rows_returned_ == read_page_->num_rows && (!pinned_ || delete_on_read_)) { - // No more data in this page. The batch must be immediately returned up the operator - // tree and deep copied so that NextReadPage() can reuse the read page's buffer. - // TODO: IMPALA-4179 - instead attach the buffer and flush the resources. 
- batch->mark_needs_deep_copy(); - } - if (FILL_FLAT_ROWS) DCHECK_EQ(flat_rows->size(), rows_to_fill); - DCHECK_LE(read_ptr_, read_end_ptr_); - return Status::OK(); -} - -void BufferedTupleStream3::FixUpStringsForRead(const vector& string_slots, - Tuple* tuple) { - DCHECK(tuple != nullptr); - for (const SlotDescriptor* slot_desc : string_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - - StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - DCHECK_LE(read_ptr_ + sv->len, read_end_ptr_); - sv->ptr = reinterpret_cast(read_ptr_); - read_ptr_ += sv->len; - } -} -/* -void BufferedTupleStream3::FixUpCollectionsForRead( - const vector& collection_slots, Tuple* tuple) { - DCHECK(tuple != nullptr); - for (const SlotDescriptor* slot_desc : collection_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - - CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset()); - const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor(); - int coll_byte_size = cv->num_tuples * item_desc.byte_size(); - DCHECK_LE(read_ptr_ + coll_byte_size, read_end_ptr_); - cv->ptr = reinterpret_cast(read_ptr_); - read_ptr_ += coll_byte_size; - - if (!item_desc.has_varlen_slots()) continue; - uint8_t* coll_data = cv->ptr; - for (int i = 0; i < cv->num_tuples; ++i) { - Tuple* item = reinterpret_cast(coll_data); - FixUpStringsForRead(item_desc.string_slots(), item); - FixUpCollectionsForRead(item_desc.collection_slots(), item); - coll_data += item_desc.byte_size(); - } - } -} -*/ -int64_t BufferedTupleStream3::ComputeRowSize(TupleRow* row) const noexcept { - int64_t size = 0; - if (has_nullable_tuple_) { - size += NullIndicatorBytesPerRow(); - for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) { - if (row->get_tuple(i) != nullptr) size += fixed_tuple_sizes_[i]; - } - } else { - for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) { - size += fixed_tuple_sizes_[i]; - } - } - for (int i = 0; i < inlined_string_slots_.size(); ++i) { - Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first); - if (tuple == nullptr) continue; - const vector& slots = inlined_string_slots_[i].second; - for (auto it = slots.begin(); it != slots.end(); ++it) { - if (tuple->is_null((*it)->null_indicator_offset())) continue; - size += tuple->get_string_slot((*it)->tuple_offset())->len; - } - } - - /* - for (int i = 0; i < inlined_coll_slots_.size(); ++i) { - Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first); - if (tuple == nullptr) continue; - const vector& slots = inlined_coll_slots_[i].second; - for (auto it = slots.begin(); it != slots.end(); ++it) { - if (tuple->is_null((*it)->null_indicator_offset())) continue; - CollectionValue* cv = tuple->get_collection_slot((*it)->tuple_offset()); - const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor(); - size += cv->num_tuples * item_desc.byte_size(); - - if (!item_desc.has_varlen_slots()) continue; - for (int j = 0; j < cv->num_tuples; ++j) { - Tuple* item = reinterpret_cast(&cv->ptr[j * item_desc.byte_size()]); - size += item->varlen_byte_size(item_desc); - } - } - } -*/ - return size; -} - -bool BufferedTupleStream3::AddRowSlow(TupleRow* row, Status* status) noexcept { - // Use AddRowCustom*() to do the work of advancing the page. 
- int64_t row_size = ComputeRowSize(row); - uint8_t* data = AddRowCustomBeginSlow(row_size, status); - if (data == nullptr) return false; - bool success = DeepCopy(row, &data, data + row_size); - DCHECK(success); - DCHECK_EQ(data, write_ptr_); - AddRowCustomEnd(row_size); - return true; -} - -uint8_t* BufferedTupleStream3::AddRowCustomBeginSlow(int64_t size, Status* status) noexcept { - *status = AdvanceWritePage(size); - if (!status->ok()) { - return nullptr; - } - // We have a large-enough page so now success is guaranteed. - uint8_t* result = AddRowCustomBegin(size, status); - DCHECK(result != nullptr); - return result; -} - -void BufferedTupleStream3::AddLargeRowCustomEnd(int64_t size) noexcept { - DCHECK_GT(size, default_page_len_); - // Immediately unpin the large write page so that we're not using up extra reservation - // and so we don't append another row to the page. - ResetWritePage(); - // The stream should be in a consistent state once the row is added. - CHECK_CONSISTENCY_FAST(); -} - -bool BufferedTupleStream3::AddRow(TupleRow* row, Status* status) noexcept { - DCHECK(!closed_); - DCHECK(has_write_iterator()); - if (UNLIKELY(write_page_ == nullptr || !DeepCopy(row, &write_ptr_, write_end_ptr_))) { - return AddRowSlow(row, status); - } - ++num_rows_; - ++write_page_->num_rows; - return true; -} - -bool BufferedTupleStream3::DeepCopy(TupleRow* row, uint8_t** data, - const uint8_t* data_end) noexcept { - return has_nullable_tuple_ ? DeepCopyInternal(row, data, data_end) - : DeepCopyInternal(row, data, data_end); -} - -// TODO: consider codegening this. -// TODO: in case of duplicate tuples, this can redundantly serialize data. -template -bool BufferedTupleStream3::DeepCopyInternal(TupleRow* row, uint8_t** data, - const uint8_t* data_end) noexcept { - uint8_t* pos = *data; - const uint64_t tuples_per_row = desc_->tuple_descriptors().size(); - // Copy the not nullptr fixed len tuples. For the nullptr tuples just update the nullptr tuple - // indicator. - if (HAS_NULLABLE_TUPLE) { - int null_indicator_bytes = NullIndicatorBytesPerRow(); - if (UNLIKELY(pos + null_indicator_bytes > data_end)) return false; - uint8_t* null_indicators = pos; - pos += NullIndicatorBytesPerRow(); - memset(null_indicators, 0, null_indicator_bytes); - for (int i = 0; i < tuples_per_row; ++i) { - uint8_t* null_word = null_indicators + (i >> 3); - const uint32_t null_pos = i & 7; - const int tuple_size = fixed_tuple_sizes_[i]; - Tuple* t = row->get_tuple(i); - const uint8_t mask = 1 << (7 - null_pos); - if (t != nullptr) { - if (UNLIKELY(pos + tuple_size > data_end)) return false; - memcpy(pos, t, tuple_size); - pos += tuple_size; - } else { - *null_word |= mask; - } - } - } else { - // If we know that there are no nullable tuples no need to set the nullability flags. - for (int i = 0; i < tuples_per_row; ++i) { - const int tuple_size = fixed_tuple_sizes_[i]; - if (UNLIKELY(pos + tuple_size > data_end)) return false; - Tuple* t = row->get_tuple(i); - // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots) - // is delivered, the check below should become DCHECK(t != nullptr). - DCHECK(t != nullptr || tuple_size == 0); - memcpy(pos, t, tuple_size); - pos += tuple_size; - } - } - - // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets - // on the write path, only on the read. The tuple data is immediately followed - // by the string data so only the len information is necessary. 
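As the comment above notes, only the length field matters at write time: the string bytes land immediately after the fixed-length tuple data, and ptr fields are fixed up lazily on read. A minimal sketch of that append step (an illustrative helper mirroring CopyStrings(), not part of the original file):

#include <cstdint>
#include <cstring>

// Appends 'len' string bytes at *data, bounds-checked against the page end.
inline bool copy_string_bytes(const uint8_t* src, int len, uint8_t** data,
                              const uint8_t* data_end) {
    if (*data + len > data_end) return false; // row does not fit in this page
    std::memcpy(*data, src, len);
    *data += len;
    return true;
}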
- for (int i = 0; i < inlined_string_slots_.size(); ++i) { - const Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second, &pos, data_end))) - return false; - } - /* - // Copy inlined collection slots. We copy collection data in a well-defined order so - // we do not need to convert pointers to offsets on the write path. - for (int i = 0; i < inlined_coll_slots_.size(); ++i) { - const Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second, &pos, data_end))) - return false; - } -*/ - *data = pos; - return true; -} - -bool BufferedTupleStream3::CopyStrings(const Tuple* tuple, - const vector& string_slots, uint8_t** data, - const uint8_t* data_end) { - for (const SlotDescriptor* slot_desc : string_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - if (LIKELY(sv->len > 0)) { - if (UNLIKELY(*data + sv->len > data_end)) return false; - - memcpy(*data, sv->ptr, sv->len); - *data += sv->len; - } - } - return true; -} -/* -bool BufferedTupleStream3::CopyCollections(const Tuple* tuple, - const vector& collection_slots, uint8_t** data, const uint8_t* data_end) { - for (const SlotDescriptor* slot_desc : collection_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - const CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset()); - const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor(); - if (LIKELY(cv->num_tuples > 0)) { - int coll_byte_size = cv->num_tuples * item_desc.byte_size(); - if (UNLIKELY(*data + coll_byte_size > data_end)) return false; - uint8_t* coll_data = *data; - memcpy(coll_data, cv->ptr, coll_byte_size); - *data += coll_byte_size; - - if (!item_desc.has_varlen_slots()) continue; - // Copy variable length data when present in collection items. - for (int i = 0; i < cv->num_tuples; ++i) { - const Tuple* item = reinterpret_cast(coll_data); - if (UNLIKELY(!CopyStrings(item, item_desc.string_slots(), data, data_end))) { - return false; - } - if (UNLIKELY( - !CopyCollections(item, item_desc.collection_slots(), data, data_end))) { - return false; - } - coll_data += item_desc.byte_size(); - } - } - } - return true; -} -*/ -void BufferedTupleStream3::GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const { - DCHECK(row != nullptr); - DCHECK(!closed_); - DCHECK(is_pinned()); - DCHECK(!delete_on_read_); - uint8_t* data = flat_row; - return has_nullable_tuple_ ? UnflattenTupleRow(&data, row) - : UnflattenTupleRow(&data, row); -} - -template -void BufferedTupleStream3::UnflattenTupleRow(uint8_t** data, TupleRow* row) const { - const int tuples_per_row = desc_->tuple_descriptors().size(); - uint8_t* ptr = *data; - if (has_nullable_tuple_) { - // Stitch together the tuples from the page and the nullptr ones. 
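The stitching below reads the same MSB-first bitstring that DeepCopyInternal() writes: bit i of the row's null-indicator bytes is 1 when tuple i is null. The test, expressed as a standalone helper (illustrative name, not from the original):

#include <cstdint>

inline bool tuple_is_null(const uint8_t* null_indicators, int i) {
    return (null_indicators[i >> 3] & (1 << (7 - (i & 7)))) != 0;
}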
- const uint8_t* null_indicators = ptr; - ptr += NullIndicatorBytesPerRow(); - for (int i = 0; i < tuples_per_row; ++i) { - const uint8_t* null_word = null_indicators + (i >> 3); - const uint32_t null_pos = i & 7; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - row->set_tuple(i, - reinterpret_cast(reinterpret_cast(ptr) * is_not_null)); - ptr += fixed_tuple_sizes_[i] * is_not_null; - } - } else { - for (int i = 0; i < tuples_per_row; ++i) { - row->set_tuple(i, reinterpret_cast(ptr)); - ptr += fixed_tuple_sizes_[i]; - } - } - *data = ptr; -} diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h deleted file mode 100644 index a225b5d892..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.h +++ /dev/null @@ -1,647 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.h -// and modified by Doris - -#pragma once - -#include -#include -#include - -#include "common/global_types.h" -#include "common/status.h" -#include "gutil/macros.h" -#include "runtime/bufferpool/buffer_pool.h" -#include "runtime/row_batch.h" - -namespace doris { - -class RuntimeState; -class RowDescriptor; -class SlotDescriptor; -class Tuple; -class TupleRow; - -/// Class that provides an abstraction for a stream of tuple rows backed by BufferPool -/// Pages. Rows can be added to the stream and read back. Rows are returned in the order -/// they are added. -/// -/// The BufferedTupleStream3 is *not* thread safe from the caller's point of view. -/// Different threads should not concurrently call methods of the same BufferedTupleStream3 -/// object. -/// -/// Reading and writing the stream: -/// The stream supports two modes of reading/writing, depending on whether -/// PrepareForWrite() is called to initialize a write iterator only or -/// PrepareForReadWrite() is called to initialize both read and write iterators to enable -/// interleaved reads and writes. -/// -/// To use write-only mode, PrepareForWrite() is called once and AddRow()/AddRowCustom*() -/// are called repeatedly to initialize then advance a write iterator through the stream. -/// Once the stream is fully written, it can be read back by calling PrepareForRead() -/// then GetNext() repeatedly to advance a read iterator through the stream, or by -/// calling GetRows() to get all of the rows at once. -/// -/// To use read/write mode, PrepareForReadWrite() is called once to initialize the read -/// and write iterators. AddRow()/AddRowCustom*() then advance a write iterator through -/// the stream, and GetNext() advances a trailing read iterator through the stream. 
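Putting the two modes together, a hedged usage sketch of the write-then-read path described above, using only the public API declared later in this header. The wrapper name, the 'rows' source, and the prior construction of the stream, batch, and buffer-pool client are all assumptions for illustration; the AddRow() retry path is deliberately elided.

// Sketch only, under the assumptions stated above.
Status write_then_read(BufferedTupleStream3* stream, const std::vector<TupleRow*>& rows,
                       RowBatch* batch, int node_id) {
    RETURN_IF_ERROR(stream->Init(node_id, /*pinned=*/true));
    RETURN_IF_ERROR(stream->PrepareForWrite());
    Status status;
    for (TupleRow* row : rows) {
        // A false return with OK status means "retry after freeing memory";
        // that path is elided here.
        if (!stream->AddRow(row, &status)) RETURN_IF_ERROR(status);
    }
    RETURN_IF_ERROR(stream->PrepareForRead(/*delete_on_read=*/true));
    bool eos = false;
    while (!eos) RETURN_IF_ERROR(stream->GetNext(batch, &eos));
    stream->Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
    return Status::OK();
}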
-/// -/// Buffer management: -/// The tuple stream is backed by a sequence of BufferPool Pages. The tuple stream uses -/// the client's reservation to pin pages in memory. It will automatically try to -/// increase the client's reservation whenever it needs to do so to make progress. -/// -/// Normally pages are all of the same default page length, but larger pages up to the -/// max page length are used if needed to store rows that are too large for a -/// default-length page. -/// -/// The stream has both pinned and unpinned modes. In the pinned mode all pages are -/// pinned for reading. The pinned mode avoids I/O by keeping all pages pinned in memory -/// and allows clients to save pointers to rows in the stream and randomly access them. -/// E.g. hash tables can be backed by a BufferedTupleStream3. In the unpinned mode, only -/// pages currently being read and written are pinned and other pages are unpinned and -/// therefore do not use the client's reservation and can be spilled to disk. The stream -/// always holds onto a default page's worth of reservation for the read and write -/// iterators (i.e. two page's worth if the stream is in read/write mode), even if that -/// many pages are not currently pinned. This means that UnpinStream() always succeeds, -/// and moving to the next default-length write page or read page on an unpinned stream -/// does not require additional reservation. This is implemented by saving reservations -/// in SubReservations. -/// -/// To read or write a row larger than the default page size to/from an unpinned stream, -/// the client must have max_page_len - default_page_len unused reservation. Writing a -/// large row to an unpinned stream only uses the reservation for the duration of the -/// AddRow()/AddRowCustom*() call. Reading a large row from an unpinned stream uses the -/// reservation until the next call to GetNext(). E.g. to partition a single unpinned -/// stream into n unpinned streams, the reservation needed is (n - 1) * -/// default_page_len + 2 * max_page_len: one large read buffer and one large write -/// buffer is needed to keep the row being processed in-memory, but only default-sized -/// buffers are needed for the other streams being written. -/// -/// The tuple stream also supports a 'delete_on_read' mode, enabled by passing a flag -/// to PrepareForRead() which deletes the stream's pages as it does a final read -/// pass over the stream. -/// -/// TODO: IMPALA-4179: the buffer management can be simplified once we can attach -/// buffers to RowBatches. -/// -/// Page layout: -/// Rows are stored back to back starting at the first byte of each page's buffer, with -/// no interleaving of data from different rows. There is no padding or alignment -/// between rows. Rows larger than the default page length are stored on their own -/// page. -/// -/// Tuple row layout: -/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a -/// bitstring at the start of each row with null indicators for all tuples in each row -/// (including non-nullable tuples). The bitstring occupies ceil(num_tuples_per_row / 8) -/// bytes. A 1 indicates the tuple is null. -/// -/// The fixed length parts of the row's tuples are stored first, followed by var len data -/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can -/// point to var len data outside the stream. When reading the stream, the length of each -/// row's var len data in the stream must be computed to find the next row's start. 
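To make the reservation bound quoted above concrete, a worked example with illustrative sizes (not Doris defaults): partitioning one unpinned stream into n = 16 unpinned streams with default_page_len = 64 KB and max_page_len = 2048 KB needs (16 - 1) * 64 KB + 2 * 2048 KB = 960 KB + 4096 KB = 5056 KB.

#include <cstdint>

// The bound quoted above, as a helper (illustrative name):
inline int64_t min_partition_reservation(int64_t n, int64_t default_page_len,
                                         int64_t max_page_len) {
    return (n - 1) * default_page_len + 2 * max_page_len;
}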
-///
-/// The tuple stream supports reading from the stream into RowBatches without copying
-/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
-/// pages' buffers. The fixed length parts follow Impala's internal tuple format, so for
-/// the tuple to be valid, we only need to update pointers to point to the var len data
-/// in the stream. These pointers need to be updated by the stream because a spilled
-/// page's data may be relocated to a different buffer. The pointers are updated lazily
-/// upon reading the stream via GetNext() or GetRows().
-///
-/// Example layout for a row with two non-nullable tuples ((1, "hello"), (2, "world"))
-/// with all var len data stored in the stream:
-/// <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
-/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
-///
-/// Example layout for a row with the second tuple nullable ((1, "hello"), nullptr)
-/// with all var len data stored in the stream:
-/// <- null tuple bitstring -> <---- tuple 1 -----> <- var len -> <- next row ...
-/// +-------------------------+--------+-----------+------------+
-/// |                         | IntVal | StringVal |            | ...
-/// +-------------------------+--------+-----------+------------+
-/// | 0000 0010               | val: 1 | len: 5    | hello      | ...
-/// |                         |        | ptr: 0x.. |            | ...
-/// +-------------------------+--------+-----------+------------+
-/// <---------1b------------> <--4b--> <---12b---> <----5b---->
-///
-/// Example layout for a row with a single non-nullable tuple (("hello", "world")) with
-/// the second string slot stored externally to the stream:
-/// <------ tuple 1 ------> <- var len -> <- next row ...
-/// +-----------+-----------+-------------+
-/// | StringVal | StringVal |             | ...
-/// +-----------+-----------+-------------+
-/// | len: 5    | len: 5    | hello       | ...
-/// | ptr: 0x.. | ptr: 0x.. |             | ...
-/// +-----------+-----------+-------------+
-/// <---12b---> <---12b---> <-----5b---->
-///
-/// The behavior of reads and writes is as follows:
-/// Read:
-///   1. Unpinned: Only a single read page is pinned at a time. This means that only
-///      enough reservation to pin a single page is needed to read the stream, regardless
-///      of the stream's size. Each page is deleted or unpinned (if delete on read is true
-///      or false respectively) before advancing to the next page.
-///   2. Pinned: All pages in the stream are pinned so do not need to be pinned or
-///      unpinned when reading from the stream. If delete on read is true, pages are
-///      deleted after being read. If the stream was previously unpinned, the page's data
-///      may not yet be in memory - reading from the stream can block on I/O or fail with
-///      an I/O error.
-/// Write:
-///   1. Unpinned: Unpin pages as they fill up. This means that only enough reservation
-///      to pin a single write page is required to write to the stream, regardless of the
-///      stream's size.
-///   2. Pinned: Pages are left pinned.
If the next page in the stream cannot be pinned -/// because the client's reservation is insufficient (and could not be increased by -/// the stream), the read call will fail and the client can either unpin the stream -/// or free up other memory before retrying. -/// -/// Memory lifetime of rows read from stream: -/// If the stream is pinned and delete on read is false, it is valid to access any tuples -/// returned via GetNext() or GetRows() until the stream is unpinned. If the stream is -/// unpinned or delete on read is true, then the batch returned from GetNext() may have -/// the needs_deep_copy flag set, which means that any tuple memory returned so far from -/// the stream may be freed on the next call to GetNext(). -/// TODO: IMPALA-4179, instead of needs_deep_copy, attach the pages' buffers to the batch. -/// -/// Manual construction of rows with AddRowCustomBegin()/AddRowCustomEnd(): -/// The BufferedTupleStream3 supports allocation of uninitialized rows with -/// AddRowCustom*(). AddRowCustomBegin() is called instead of AddRow() if the client wants -/// to manually construct a row. The caller of AddRowCustomBegin() is responsible for -/// writing the row with exactly the layout described above then calling -/// AddRowCustomEnd() when done. -/// -/// If a caller constructs a tuple in this way, the caller can set the pointers and they -/// will not be modified until the stream is read via GetNext() or GetRows(). -/// TODO: IMPALA-5007: try to remove AddRowCustom*() by unifying with AddRow(). -/// -/// TODO: we need to be able to do read ahead for pages. We need some way to indicate a -/// page will need to be pinned soon. -class BufferedTupleStream3 { -public: - /// A pointer to the start of a flattened TupleRow in the stream. - typedef uint8_t* FlatRowPtr; - - /// row_desc: description of rows stored in the stream. This is the desc for rows - /// that are added and the rows being returned. - /// page_len: the size of pages to use in the stream - /// ext_varlen_slots: set of varlen slots with data stored externally to the stream - BufferedTupleStream3(RuntimeState* state, const RowDescriptor* row_desc, - BufferPool::ClientHandle* buffer_pool_client, int64_t default_page_len, - const std::set& ext_varlen_slots = std::set()); - - virtual ~BufferedTupleStream3(); - - /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called - /// once before any of the other APIs. - /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned. - /// 'node_id' is only used for error reporting. - Status Init(int node_id, bool pinned) WARN_UNUSED_RESULT; - - /// Prepares the stream for writing by saving enough reservation for a default-size - /// write page. Tries to increase reservation if there is not enough unused reservation - /// for a page. Called after Init() and before the first AddRow() or - /// AddRowCustomBegin() call. - /// 'got_reservation': set to true if there was enough reservation to initialize the - /// first write page and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForWrite() WARN_UNUSED_RESULT; - - /// Prepares the stream for interleaved reads and writes by saving enough reservation - /// for default-sized read and write pages. Called after Init() and before the first - /// AddRow() or AddRowCustomBegin() call. - /// 'delete_on_read': Pages are deleted after they are read. 
- /// 'got_reservation': set to true if there was enough reservation to initialize the - /// read and write pages and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForReadWrite(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Prepares the stream for reading, invalidating the write iterator (if there is one). - /// Therefore must be called after the last AddRow() or AddRowCustomEnd() and before - /// GetNext(). PrepareForRead() can be called multiple times to do multiple read passes - /// over the stream, unless rows were read from the stream after PrepareForRead() or - /// PrepareForReadWrite() was called with delete_on_read = true. - /// 'delete_on_read': Pages are deleted after they are read. - /// 'got_reservation': set to true if there was enough reservation to initialize the - /// first read page and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForRead(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Adds a single row to the stream. There are three possible outcomes: - /// a) The append succeeds. True is returned. - /// b) The append fails because the unused reservation was not sufficient to add - /// a new page to the stream large enough to fit 'row' and the stream could not - /// increase the reservation to get enough unused reservation. Returns false and - /// sets 'status' to OK. The append can be retried after freeing up memory or - /// unpinning the stream. - /// c) The append fails with a runtime error. Returns false and sets 'status' to an - /// error. - /// d) The append fails because the row is too large to fit in a page of a stream. - /// Returns false and sets 'status' to an error. - /// - /// Unpinned streams can only encounter case b) when appending a row larger than - /// the default page size and the reservation could not be increased sufficiently. - /// Otherwise enough memory is automatically freed up by unpinning the current write - /// page. - /// - /// BufferedTupleStream3 will do a deep copy of the memory in the row. After AddRow() - /// returns an error, it should not be called again. - bool AddRow(TupleRow* row, Status* status) noexcept WARN_UNUSED_RESULT; - - /// Allocates space to store a row of 'size' bytes (including fixed and variable length - /// data). If successful, returns a pointer to the allocated row. The caller then must - /// writes valid data to the row and call AddRowCustomEnd(). - /// - /// If unsuccessful, returns nullptr. The failure modes are the same as described in the - /// AddRow() comment. - ALWAYS_INLINE uint8_t* AddRowCustomBegin(int64_t size, Status* status); - - /// Called after AddRowCustomBegin() when done writing the row. Only should be called - /// if AddRowCustomBegin() succeeded. See the AddRowCustomBegin() comment for - /// explanation. - /// 'size': the size passed into AddRowCustomBegin(). - void AddRowCustomEnd(int64_t size); - - /// Unflattens 'flat_row' into a regular TupleRow 'row'. Only valid to call if the - /// stream is pinned. The row must have been allocated with the stream's row desc. - /// The returned 'row' is backed by memory from the stream so is only valid as long - /// as the stream is pinned. - void GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const; - - /// Pins all pages in this stream and switches to pinned mode. Has no effect if the - /// stream is already pinned. 
- /// If the current unused reservation is not sufficient to pin the stream in memory, - /// this will try to increase the reservation. If that fails, 'pinned' is set to false - /// and the stream is left unpinned. Otherwise 'pinned' is set to true. - Status PinStream(bool* pinned) WARN_UNUSED_RESULT; - - /// Modes for UnpinStream(). - enum UnpinMode { - /// All pages in the stream are unpinned and the read/write positions in the stream - /// are reset. No more rows can be written to the stream after this. The stream can - /// be re-read from the beginning by calling PrepareForRead(). - UNPIN_ALL, - /// All pages are unpinned aside from the current read and write pages (if any), - /// which is left in the same state. The unpinned stream can continue being read - /// or written from the current read or write positions. - UNPIN_ALL_EXCEPT_CURRENT, - }; - - /// Unpins stream with the given 'mode' as described above. - void UnpinStream(UnpinMode mode); - - /// Get the next batch of output rows, which are backed by the stream's memory. - /// If the stream is unpinned or 'delete_on_read' is true, the 'needs_deep_copy' - /// flag may be set on 'batch' to signal that memory will be freed on the next - /// call to GetNext() and that the caller should copy out any data it needs from - /// rows in 'batch' or in previous batches returned from GetNext(). - /// - /// If the stream is pinned and 'delete_on_read' is false, the memory backing the - /// rows will remain valid until the stream is unpinned, destroyed, etc. - /// TODO: IMPALA-4179: update when we simplify the memory transfer model. - Status GetNext(RowBatch* batch, bool* eos) WARN_UNUSED_RESULT; - - /// Same as above, but populate 'flat_rows' with a pointer to the flat version of - /// each returned row in the pinned stream. The pointers in 'flat_rows' are only - /// valid as long as the stream remains pinned. - Status GetNext(RowBatch* batch, bool* eos, - std::vector* flat_rows) WARN_UNUSED_RESULT; - - /// Returns all the rows in the stream in batch. This pins the entire stream in the - /// process. If the current unused reservation is not sufficient to pin the stream in - /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set - /// to false. - Status GetRows(std::unique_ptr* batch, bool* got_rows) WARN_UNUSED_RESULT; - - /// Must be called once at the end to cleanup all resources. If 'batch' is non-nullptr, - /// attaches buffers from pinned pages that rows returned from GetNext() may reference. - /// Otherwise deletes all pages. Does nothing if the stream was already closed. The - /// 'flush' mode is forwarded to RowBatch::AddBuffer() when attaching buffers. - void Close(RowBatch* batch, RowBatch::FlushMode flush); - - /// Number of rows in the stream. - int64_t num_rows() const { return num_rows_; } - - /// Number of rows returned via GetNext(). - int64_t rows_returned() const { return rows_returned_; } - - /// Returns the byte size necessary to store the entire stream in memory. - int64_t byte_size() const { return total_byte_size_; } - - /// Returns the number of bytes currently pinned in memory by the stream. - /// If ignore_current is true, the write_page_ memory is not included. 
- int64_t BytesPinned(bool ignore_current) const { - if (ignore_current && write_page_ != nullptr && write_page_->is_pinned()) { - return bytes_pinned_ - write_page_->len(); - } - return bytes_pinned_; - } - - bool is_closed() const { return closed_; } - bool is_pinned() const { return pinned_; } - bool has_read_iterator() const { return has_read_iterator_; } - bool has_write_iterator() const { return has_write_iterator_; } - - std::string DebugString() const; - -private: - DISALLOW_COPY_AND_ASSIGN(BufferedTupleStream3); - friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test; - friend class ArrayTupleStreamTest_TestComputeRowSize_Test; - friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test; - friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test; - - /// Wrapper around BufferPool::PageHandle that tracks additional info about the page. - struct Page { - Page() : num_rows(0), retrieved_buffer(true) {} - - int len() const { return handle.len(); } - bool is_pinned() const { return handle.is_pinned(); } - int pin_count() const { return handle.pin_count(); } - Status GetBuffer(const BufferPool::BufferHandle** buffer) { - RETURN_IF_ERROR(handle.GetBuffer(buffer)); - retrieved_buffer = true; - return Status::OK(); - } - std::string DebugString() const; - - BufferPool::PageHandle handle; - - /// Number of rows written to the page. - int num_rows; - - /// Whether we called GetBuffer() on the page since it was last pinned. This means - /// that GetBuffer() and ExtractBuffer() cannot fail and that GetNext() may have - /// returned rows referencing the page's buffer. - bool retrieved_buffer; - }; - - /// Runtime state instance used to check for cancellation. Not owned. - RuntimeState* const state_; - - /// Description of rows stored in the stream. - const RowDescriptor* desc_; - - /// Plan node ID, used for error reporting. - int node_id_; - - /// The size of the fixed length portion for each tuple in the row. - std::vector fixed_tuple_sizes_; - - /// Vectors of all the strings slots that have their varlen data stored in stream - /// grouped by tuple_idx. - std::vector>> inlined_string_slots_; - - /// Vectors of all the collection slots that have their varlen data stored in the - /// stream, grouped by tuple_idx. - // std::vector>> inlined_coll_slots_; - - /// Buffer pool and client used to allocate, pin and release pages. Not owned. - BufferPool* buffer_pool_; - BufferPool::ClientHandle* buffer_pool_client_; - - /// List of pages in the stream. - /// Empty iff one of two cases applies: - /// * before the first row has been added with AddRow() or AddRowCustom(). - /// * after the stream has been destructively read in 'delete_on_read' mode - std::list pages_; - // IMPALA-5629: avoid O(n) list.size() call by explicitly tracking the number of pages. - // TODO: remove when we switch to GCC5+, where list.size() is O(1). See GCC bug #49561. - int64_t num_pages_; - - /// Total size of pages_, including any pages already deleted in 'delete_on_read' - /// mode. - int64_t total_byte_size_; - - /// True if there is currently an active read iterator for the stream. - bool has_read_iterator_; - - /// The current page being read. When no read iterator is active, equal to list.end(). - /// When a read iterator is active, either points to the current read page, or equals - /// list.end() if no rows have yet been read. GetNext() does not advance this past - /// the end of the stream, so upon eos 'read_page_' points to the last page and - /// rows_returned_ == num_rows_. 
Always pinned, unless a Pin() call failed and an error - /// status was returned. - std::list::iterator read_page_; - - /// Number of rows returned from the current read_page_. - uint32_t read_page_rows_returned_; - - /// Pointer into read_page_ to the byte after the last row read. - uint8_t* read_ptr_; - - /// Pointer to one byte past the end of read_page_. Used to detect overruns. - const uint8_t* read_end_ptr_; - - /// Pointer into write_page_ to the byte after the last row written. - uint8_t* write_ptr_; - - /// Pointer to one byte past the end of write_page_. Cached to speed up computation - const uint8_t* write_end_ptr_; - - /// Number of rows returned to the caller from GetNext() since the last - /// PrepareForRead() call. - int64_t rows_returned_; - - /// True if there is currently an active write iterator into the stream. - bool has_write_iterator_; - - /// The current page for writing. nullptr if there is no write iterator or no current - /// write page. Always pinned. Size is 'default_page_len_', except temporarily while - /// appending a larger row between AddRowCustomBegin() and AddRowCustomEnd(). - Page* write_page_; - - /// Total bytes of pinned pages in pages_, stored to avoid iterating over the list - /// to compute it. - int64_t bytes_pinned_; - - /// Number of rows stored in the stream. Includes rows that were already deleted during - /// a destructive 'delete_on_read' pass over the stream. - int64_t num_rows_; - - /// The default length in bytes of pages used to store the stream's rows. All rows that - /// fit in a default-sized page are stored in default-sized page. - const int64_t default_page_len_; - - /// Whether any tuple in the rows is nullable. - const bool has_nullable_tuple_; - - /// If true, pages are deleted after they are read during this read pass. Once rows - /// have been read from a stream with 'delete_on_read_' true, this is always true. - bool delete_on_read_; - - bool closed_; // Used for debugging. - - /// If true, this stream has been explicitly pinned by the caller and all pages are - /// kept pinned until the caller calls UnpinStream(). - bool pinned_; - - bool is_read_page(const Page* page) const { - return read_page_ != pages_.end() && &*read_page_ == page; - } - - bool is_write_page(const Page* page) const { return write_page_ == page; } - - /// Return true if the read and write page are the same. - bool has_read_write_page() const { return write_page_ != nullptr && is_read_page(write_page_); } - - /// The slow path for AddRow() that is called if there is not sufficient space in - /// the current page. - bool AddRowSlow(TupleRow* row, Status* status) noexcept; - - /// The slow path for AddRowCustomBegin() that is called if there is not sufficient space in - /// the current page. - uint8_t* AddRowCustomBeginSlow(int64_t size, Status* status) noexcept; - - /// The slow path for AddRowCustomEnd() that is called for large pages. - void AddLargeRowCustomEnd(int64_t size) noexcept; - - /// Copies 'row' into the buffer starting at *data and ending at the byte before - /// 'data_end'. On success, returns true and updates *data to point after the last - /// byte written. Returns false if there is not enough space in the buffer provided. - bool DeepCopy(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept; - - /// Templated implementation of DeepCopy(). - template - bool DeepCopyInternal(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept; - - /// Helper function to copy strings in string_slots from tuple into *data. 
- /// Updates *data to the end of the string data added. Returns false if the data - /// does not fit in the buffer [*data, data_end). - static bool CopyStrings(const Tuple* tuple, const std::vector& string_slots, - uint8_t** data, const uint8_t* data_end); - - /// Helper function to deep copy collections in collection_slots from tuple into - /// the buffer [*data, data_end). Updates *data to the end of the collection data - /// added. Returns false if the data does not fit in the buffer. - //static bool CopyCollections(const Tuple* tuple, - // const std::vector& collection_slots, uint8_t** data, - // const uint8_t* data_end); - - /// Gets a new page of 'page_len' bytes from buffer_pool_, updating write_page_, - /// write_ptr_ and write_end_ptr_. The caller must ensure there is 'page_len' unused - /// reservation. The caller must reset the write page (if there is one) before calling. - Status NewWritePage(int64_t page_len) noexcept WARN_UNUSED_RESULT; - - /// Determines what page size is needed to fit a row of 'row_size' bytes. - /// Returns an error if the row cannot fit in a page. - void CalcPageLenForRow(int64_t row_size, int64_t* page_len); - - /// Wrapper around NewWritePage() that allocates a new write page that fits a row of - /// 'row_size' bytes. Increases reservation if needed to allocate the next page. - /// Returns OK and sets 'got_reservation' to true if the write page was successfully - /// allocated. Returns an error if the row cannot fit in a page. Returns OK and sets - /// 'got_reservation' to false if the reservation could not be increased and no other - /// error was encountered. - Status AdvanceWritePage(int64_t row_size) noexcept WARN_UNUSED_RESULT; - - /// Reset the write page, if there is one, and unpin pages accordingly. If there - /// is an active write iterator, the next row will be appended to a new page. - void ResetWritePage(); - - /// Invalidate the write iterator and release any resources associated with it. After - /// calling this, no more rows can be appended to the stream. - void InvalidateWriteIterator(); - - /// Same as PrepareForRead(), except the iterators are not invalidated and - /// the caller is assumed to have checked there is sufficient unused reservation. - Status PrepareForReadInternal(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Pins the next read page. This blocks reading from disk if necessary to bring the - /// page's data into memory. Updates read_page_, read_ptr_, and - /// read_page_rows_returned_. - Status NextReadPage() WARN_UNUSED_RESULT; - - /// Invalidate the read iterator, and release any resources associated with the active - /// iterator. - void InvalidateReadIterator(); - - /// Returns the total additional bytes that this row will consume in write_page_ if - /// appended to the page. This includes the row's null indicators, the fixed length - /// part of the row and the data for inlined_string_slots_ and inlined_coll_slots_. - int64_t ComputeRowSize(TupleRow* row) const noexcept; - - /// Pins page and updates tracking stats. - Status PinPage(Page* page) WARN_UNUSED_RESULT; - - /// Increment the page's pin count if this page needs a higher pin count given the - /// current read and write iterator positions and whether the stream will be pinned - /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to - /// be incremented multiple times. The caller is responsible for ensuring sufficient - /// reservation is available. 
- Status PinPageIfNeeded(Page* page, bool stream_pinned) WARN_UNUSED_RESULT; - - /// Decrement the page's pin count if this page needs a lower pin count given the - /// current read and write iterator positions and whether the stream will be pinned - /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to - /// be decremented multiple times. - void UnpinPageIfNeeded(Page* page, bool stream_pinned); - - /// Return the expected pin count for 'page' in the current stream based on the current - /// read and write pages and whether the stream is pinned. - int ExpectedPinCount(bool stream_pinned, const Page* page) const; - - /// Templated GetNext implementations. - template - Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows); - template - Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows); - - /// Helper function to convert a flattened TupleRow stored starting at '*data' into - /// 'row'. *data is updated to point to the first byte past the end of the row. - template - void UnflattenTupleRow(uint8_t** data, TupleRow* row) const; - - /// Helper function for GetNextInternal(). For each string slot in string_slots, - /// update StringValue's ptr field to point to the corresponding string data stored - /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the - /// StringValue's length field. - void FixUpStringsForRead(const std::vector& string_slots, Tuple* tuple); - - /// Helper function for GetNextInternal(). For each collection slot in collection_slots, - /// recursively update any pointers in the CollectionValue to point to the corresponding - /// var len data stored inline in the stream, advancing read_ptr_ as data is read. - /// Assumes that the collection was serialized to the stream in DeepCopy()'s format. - //void FixUpCollectionsForRead( - // const std::vector& collection_slots, Tuple* tuple); - - /// Returns the number of null indicator bytes per row. Only valid if this stream has - /// nullable tuples. - int NullIndicatorBytesPerRow() const; - - /// Returns the total bytes pinned. Only called in DCHECKs to validate bytes_pinned_. - int64_t CalcBytesPinned() const; - - /// DCHECKs if the stream is internally inconsistent. The stream should always be in - /// a consistent state after returning success from a public API call. The Fast version - /// has constant runtime and does not check all of 'pages_'. The Full version includes - /// O(n) checks that require iterating over the whole 'pages_' list (e.g. checking that - /// each page is in a valid state). - void CheckConsistencyFast() const; - void CheckConsistencyFull() const; - void CheckPageConsistency(const Page* page) const; -}; -} // namespace doris diff --git a/be/src/runtime/buffered_tuple_stream3.inline.h b/be/src/runtime/buffered_tuple_stream3.inline.h deleted file mode 100644 index 7670e764a0..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.inline.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.inline.h -// and modified by Doris - -#pragma once - -#include "runtime/buffered_tuple_stream3.h" -#include "runtime/descriptors.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" - -namespace doris { - -inline int BufferedTupleStream3::NullIndicatorBytesPerRow() const { - DCHECK(has_nullable_tuple_); - return BitUtil::RoundUpNumBytes(fixed_tuple_sizes_.size()); -} - -inline uint8_t* BufferedTupleStream3::AddRowCustomBegin(int64_t size, Status* status) { - DCHECK(!closed_); - DCHECK(has_write_iterator()); - if (UNLIKELY(write_page_ == nullptr || write_ptr_ + size > write_end_ptr_)) { - return AddRowCustomBeginSlow(size, status); - } - DCHECK(write_page_ != nullptr); - DCHECK(write_page_->is_pinned()); - DCHECK_LE(write_ptr_ + size, write_end_ptr_); - ++num_rows_; - ++write_page_->num_rows; - - uint8_t* data = write_ptr_; - write_ptr_ += size; - return data; -} - -inline void BufferedTupleStream3::AddRowCustomEnd(int64_t size) { - if (UNLIKELY(size > default_page_len_)) AddLargeRowCustomEnd(size); -} -} // namespace doris diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp deleted file mode 100644 index f709c182ec..0000000000 --- a/be/src/runtime/export_sink.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
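The inline AddRowCustomBegin() fast path above only performs a bounds check, bumps write_ptr_, and updates the row counters; all page allocation is deferred to the *Slow variants. A minimal self-contained sketch of that fast-path/slow-path split, with simplified names and types (not the Doris/Impala API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified append buffer illustrating the fast-path/slow-path split used by
// AddRowCustomBegin(): the common case is a bounds check plus a pointer bump,
// and only a full (or missing) write page falls through to the slow path that
// starts a new page.
class AppendBuffer {
public:
    explicit AppendBuffer(size_t page_len) : page_len_(page_len) {}

    // Returns a pointer to 'size' writable bytes, or nullptr if the row can
    // never fit (the large-row handling is elided in this sketch).
    uint8_t* add_row_begin(size_t size) {
        if (write_ptr_ != nullptr && write_ptr_ + size <= write_end_ptr_) {
            uint8_t* data = write_ptr_; // fast path: pointer bump only
            write_ptr_ += size;
            ++num_rows_;
            return data;
        }
        return add_row_begin_slow(size);
    }

private:
    uint8_t* add_row_begin_slow(size_t size) {
        if (size > page_len_) return nullptr; // large-row path elided
        pages_.emplace_back(page_len_);       // start a fresh default-size page
        write_ptr_ = pages_.back().data();
        write_end_ptr_ = write_ptr_ + page_len_;
        return add_row_begin(size); // retry; guaranteed to hit the fast path
    }

    const size_t page_len_;
    std::vector<std::vector<uint8_t>> pages_;
    uint8_t* write_ptr_ = nullptr;
    uint8_t* write_end_ptr_ = nullptr;
    size_t num_rows_ = 0;
};

Keeping the fast path this small is the point of the split: it stays cheap enough to inline, so the common append costs one comparison and one pointer increment.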
- -#include "runtime/export_sink.h" - -#include - -#include - -#include "exprs/expr.h" -#include "exprs/expr_context.h" -#include "gutil/strings/numbers.h" -#include "io/file_factory.h" -#include "runtime/large_int_value.h" -#include "runtime/raw_value.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple_row.h" -#include "util/mysql_global.h" -#include "util/runtime_profile.h" -#include "util/types.h" -#include "util/uid_util.h" - -namespace doris { - -ExportSink::ExportSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), - _row_desc(row_desc), - _t_output_expr(t_exprs), - _bytes_written_counter(nullptr), - _rows_written_counter(nullptr), - _write_timer(nullptr), - _header_sent(false) { - _name = "ExportSink"; -} - -ExportSink::~ExportSink() {} - -Status ExportSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - _t_export_sink = t_sink.export_sink; - - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - return Status::OK(); -} - -Status ExportSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - - _state = state; - - std::stringstream title; - title << "ExportSink (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - SCOPED_TIMER(_profile->total_time_counter()); - - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - - // TODO(lingbin): add some Counter - _bytes_written_counter = ADD_COUNTER(profile(), "BytesExported", TUnit::BYTES); - _rows_written_counter = ADD_COUNTER(profile(), "RowsExported", TUnit::UNIT); - _write_timer = ADD_TIMER(profile(), "WriteTime"); - - return Status::OK(); -} - -Status ExportSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // open broker - RETURN_IF_ERROR(open_file_writer()); - return Status::OK(); -} - -Status ExportSink::write_csv_header() { - if (!_header_sent && _t_export_sink.header.size() > 0) { - size_t written_len = 0; - RETURN_IF_ERROR( - _file_writer->write(reinterpret_cast(_t_export_sink.header.c_str()), - _t_export_sink.header.size(), &written_len)); - _header_sent = true; - } - return Status::OK(); -} - -Status ExportSink::send(RuntimeState* state, RowBatch* batch) { - VLOG_ROW << "debug: export_sink send batch: " << batch->to_string(); - SCOPED_TIMER(_profile->total_time_counter()); - int num_rows = batch->num_rows(); - // we send at most 1024 rows at a time - int batch_send_rows = num_rows > 1024 ? 1024 : num_rows; - RETURN_IF_ERROR(write_csv_header()); - std::stringstream ss; - for (int i = 0; i < num_rows;) { - ss.str(""); - for (int j = 0; j < batch_send_rows && i < num_rows; ++j, ++i) { - RETURN_IF_ERROR(gen_row_buffer(batch->get_row(i), &ss)); - } - - VLOG_ROW << "debug: export_sink send row: " << ss.str(); - const std::string& buf = ss.str(); - size_t written_len = 0; - - SCOPED_TIMER(_write_timer); - // TODO(lingbin): for broker writer, we should not send rpc each row. 
- RETURN_IF_ERROR(_file_writer->write(reinterpret_cast(buf.c_str()), - buf.size(), &written_len)); - COUNTER_UPDATE(_bytes_written_counter, buf.size()); - } - COUNTER_UPDATE(_rows_written_counter, num_rows); - return Status::OK(); -} - -Status ExportSink::gen_row_buffer(TupleRow* row, std::stringstream* ss) { - int num_columns = _output_expr_ctxs.size(); - // const TupleDescriptor& desc = row_desc().TupleDescriptor; - for (int i = 0; i < num_columns; ++i) { - void* item = _output_expr_ctxs[i]->get_value(row); - if (item == nullptr) { - (*ss) << "\\N"; - } else { - switch (_output_expr_ctxs[i]->root()->type().type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - (*ss) << (int)*static_cast(item); - break; - case TYPE_SMALLINT: - (*ss) << *static_cast(item); - break; - case TYPE_INT: - (*ss) << *static_cast(item); - break; - case TYPE_BIGINT: - (*ss) << *static_cast(item); - break; - case TYPE_LARGEINT: - (*ss) << reinterpret_cast(item)->value; - break; - case TYPE_FLOAT: { - char buffer[MAX_FLOAT_STR_LENGTH + 2]; - float float_value = *static_cast(item); - buffer[0] = '\0'; - int length = FloatToBuffer(float_value, MAX_FLOAT_STR_LENGTH, buffer); - DCHECK(length >= 0) << "gcvt float failed, float value=" << float_value; - (*ss) << buffer; - break; - } - case TYPE_DOUBLE: { - // To prevent loss of precision on float and double types, - // they are converted to strings before output. - // For example: For a double value 27361919854.929001, - // the direct output of using std::stringstream is 2.73619e+10, - // and after conversion to a string, it outputs 27361919854.929001 - char buffer[MAX_DOUBLE_STR_LENGTH + 2]; - double double_value = *static_cast(item); - buffer[0] = '\0'; - int length = DoubleToBuffer(double_value, MAX_DOUBLE_STR_LENGTH, buffer); - DCHECK(length >= 0) << "gcvt double failed, double value=" << double_value; - (*ss) << buffer; - break; - } - case TYPE_DATE: - case TYPE_DATETIME: { - char buf[64]; - const DateTimeValue* time_val = (const DateTimeValue*)(item); - time_val->to_string(buf); - (*ss) << buf; - break; - } - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - } else { - (*ss) << "\\N"; - } - } else { - (*ss) << std::string(string_val->ptr, string_val->len); - } - break; - } - - case TYPE_DECIMALV2: { - const DecimalV2Value decimal_val( - reinterpret_cast(item)->value); - std::string decimal_str; - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - decimal_str = decimal_val.to_string(output_scale); - (*ss) << decimal_str; - break; - } - case TYPE_ARRAY: { - auto col_type = _output_expr_ctxs[i]->root()->type(); - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - RawValue::print_value(item, col_type, output_scale, ss); - break; - } - default: { - std::stringstream err_ss; - err_ss << "can't export this type. 
type = " << _output_expr_ctxs[i]->root()->type(); - return Status::InternalError(err_ss.str()); - } - } - } - - if (i < num_columns - 1) { - (*ss) << _t_export_sink.column_separator; - } - } - (*ss) << _t_export_sink.line_delimiter; - - return Status::OK(); -} - -Status ExportSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - if (_file_writer != nullptr) { - _file_writer->close(); - _file_writer = nullptr; - } - return DataSink::close(state, exec_status); -} - -Status ExportSink::open_file_writer() { - if (_file_writer != nullptr) { - return Status::OK(); - } - - std::string file_name = gen_file_name(); - // TODO(lingbin): gen file path - RETURN_IF_ERROR(FileFactory::create_file_writer( - _t_export_sink.file_type, _state->exec_env(), _t_export_sink.broker_addresses, - _t_export_sink.properties, _t_export_sink.export_path + "/" + file_name, 0, - _file_writer)); - _state->add_export_output_file(_t_export_sink.export_path + "/" + file_name); - - return _file_writer->open(); -} - -// TODO(lingbin): add some other info to file name, like partition -std::string ExportSink::gen_file_name() { - const TUniqueId& id = _state->fragment_instance_id(); - - struct timeval tv; - gettimeofday(&tv, nullptr); - - std::stringstream file_name; - file_name << "export-data-" << print_id(id) << "-" << (tv.tv_sec * 1000 + tv.tv_usec / 1000); - return file_name.str(); -} - -} // namespace doris diff --git a/be/src/runtime/export_sink.h b/be/src/runtime/export_sink.h deleted file mode 100644 index 107b6f4203..0000000000 --- a/be/src/runtime/export_sink.h +++ /dev/null @@ -1,83 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "util/runtime_profile.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class RuntimeState; -class RuntimeProfile; -class ExprContext; -class FileWriter; -class TupleRow; - -// This class is a sinker, which put export data to external storage by broker. -class ExportSink : public DataSink { -public: - ExportSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& t_exprs); - - virtual ~ExportSink(); - - virtual Status init(const TDataSink& thrift_sink) override; - - virtual Status prepare(RuntimeState* state) override; - - virtual Status open(RuntimeState* state) override; - - virtual Status send(RuntimeState* state, RowBatch* batch) override; - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). 
- virtual Status close(RuntimeState* state, Status exec_status) override; - - virtual RuntimeProfile* profile() override { return _profile; } - -private: - Status open_file_writer(); - Status gen_row_buffer(TupleRow* row, std::stringstream* ss); - std::string gen_file_name(); - Status write_csv_header(); - - RuntimeState* _state; - - // owned by RuntimeState - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - - TExportSink _t_export_sink; - std::unique_ptr _file_writer; - - RuntimeProfile* _profile; - - RuntimeProfile::Counter* _bytes_written_counter; - RuntimeProfile::Counter* _rows_written_counter; - RuntimeProfile::Counter* _write_timer; - bool _header_sent; -}; - -} // end namespace doris diff --git a/be/src/runtime/memory_scratch_sink.cpp b/be/src/runtime/memory_scratch_sink.cpp deleted file mode 100644 index 32bec47259..0000000000 --- a/be/src/runtime/memory_scratch_sink.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/memory_scratch_sink.h" - -#include -#include - -#include - -#include "exprs/expr.h" -#include "gen_cpp/Types_types.h" -#include "runtime/exec_env.h" -#include "runtime/primitive_type.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple_row.h" -#include "util/arrow/row_batch.h" - -namespace doris { - -MemoryScratchSink::MemoryScratchSink(const RowDescriptor& row_desc, - const std::vector& t_output_expr, - const TMemoryScratchSink& sink) - : _row_desc(row_desc), _t_output_expr(t_output_expr) { - _name = "MemoryScratchSink"; -} - -MemoryScratchSink::~MemoryScratchSink() {} - -Status MemoryScratchSink::prepare_exprs(RuntimeState* state) { - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(state->obj_pool(), _t_output_expr, &_output_expr_ctxs)); - // Prepare the exprs to run. 
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - // generate the arrow schema - RETURN_IF_ERROR(convert_to_arrow_schema(_row_desc, &_arrow_schema)); - return Status::OK(); -} - -Status MemoryScratchSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // prepare output_expr - RETURN_IF_ERROR(prepare_exprs(state)); - // create queue - TUniqueId fragment_instance_id = state->fragment_instance_id(); - state->exec_env()->result_queue_mgr()->create_queue(fragment_instance_id, &_queue); - std::stringstream title; - title << "MemoryScratchSink (frag_id=" << fragment_instance_id << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - - return Status::OK(); -} - -Status MemoryScratchSink::send(RuntimeState* state, RowBatch* batch) { - if (nullptr == batch || 0 == batch->num_rows()) { - return Status::OK(); - } - std::shared_ptr result; - RETURN_IF_ERROR( - convert_to_arrow_batch(*batch, _arrow_schema, arrow::default_memory_pool(), &result)); - _queue->blocking_put(result); - return Status::OK(); -} - -Status MemoryScratchSink::open(RuntimeState* state) { - return Expr::open(_output_expr_ctxs, state); -} - -Status MemoryScratchSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - // put sentinel - if (_queue != nullptr) { - _queue->blocking_put(nullptr); - } - Expr::close(_output_expr_ctxs, state); - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/memory_scratch_sink.h b/be/src/runtime/memory_scratch_sink.h deleted file mode 100644 index f8e5fceca1..0000000000 --- a/be/src/runtime/memory_scratch_sink.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
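MemoryScratchSink::send() above converts each RowBatch into an arrow::RecordBatch and blocks until a consumer takes it, and close() pushes a nullptr sentinel so readers can tell the stream has ended. A generic sketch of that sentinel-terminated blocking queue (a toy unbounded queue, not Doris's BlockingQueue or ResultQueueMgr; a real queue would also bound its capacity):

#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>

// Sentinel-terminated blocking queue: the producer block-puts items and
// finally pushes a nullptr, mirroring blocking_put(nullptr) in
// MemoryScratchSink::close().
template <typename T>
class SentinelQueue {
public:
    void blocking_put(std::shared_ptr<T> item) {
        {
            std::lock_guard<std::mutex> l(mu_);
            q_.push(std::move(item));
        }
        cv_.notify_one();
    }

    // Blocks until an item is available; returns nullptr once the producer
    // has pushed the sentinel.
    std::shared_ptr<T> blocking_get() {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return !q_.empty(); });
        std::shared_ptr<T> item = std::move(q_.front());
        q_.pop();
        return item;
    }

private:
    std::mutex mu_;
    std::condition_variable cv_;
    std::queue<std::shared_ptr<T>> q_;
};

A consumer simply loops on blocking_get() until it receives nullptr, which is why close() must always enqueue the sentinel even on error paths.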
- -#pragma once - -#include "common/status.h" -#include "exec/data_sink.h" -#include "gen_cpp/DorisExternalService_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/result_queue_mgr.h" -#include "util/blocking_queue.hpp" - -namespace arrow { - -class MemoryPool; -class RecordBatch; -class Schema; - -} // namespace arrow - -namespace doris { - -class ObjectPool; -class RowBatch; -class ObjectPool; -class RuntimeState; -class RuntimeProfile; -class BufferControlBlock; -class ExprContext; -class ResultWriter; -class TupleRow; - -// used to push data to blocking queue -class MemoryScratchSink : public DataSink { -public: - MemoryScratchSink(const RowDescriptor& row_desc, const std::vector& select_exprs, - const TMemoryScratchSink& sink); - - virtual ~MemoryScratchSink(); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend queue mgr - // Blocks until all rows in batch are pushed to the queue - virtual Status send(RuntimeState* state, RowBatch* batch); - - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - Status prepare_exprs(RuntimeState* state); - - // Owned by the RuntimeState. - const RowDescriptor& _row_desc; - std::shared_ptr _arrow_schema; - - BlockQueueSharedPtr _queue; - - RuntimeProfile* _profile; // Allocated from _pool - - // Owned by the RuntimeState. - const std::vector& _t_output_expr; - std::vector _output_expr_ctxs; -}; -} // namespace doris diff --git a/be/src/runtime/mysql_result_writer.cpp b/be/src/runtime/mysql_result_writer.cpp deleted file mode 100644 index 823f05d2d4..0000000000 --- a/be/src/runtime/mysql_result_writer.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
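The MysqlResultWriter whose implementation follows converts one row at a time: every push into the row buffer returns an int error code, and the first nonzero code aborts the row. A toy illustration of that accumulate-and-abort shape (ToyRowBuffer is hypothetical; the real class is MysqlRowBuffer):

#include <cstdint>
#include <string>

// Toy buffer showing the error-code accumulation in _add_one_row(): every
// push returns 0 on success, and the first nonzero return aborts the row.
struct ToyRowBuffer {
    std::string out;
    int push_null() { out += "\\N"; return 0; }
    int push_int(int32_t v) { out += std::to_string(v); return 0; }
};

// One column value, where nullptr stands for SQL NULL.
int add_value(ToyRowBuffer* buf, const int32_t* v) {
    return v == nullptr ? buf->push_null() : buf->push_int(*v);
}

int add_one_row(ToyRowBuffer* buf, const int32_t* const* cols, int n) {
    int ret = 0;
    for (int i = 0; ret == 0 && i < n; ++i) ret = add_value(buf, cols[i]);
    return ret; // nonzero maps to "pack mysql buffer failed."
}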
- -#include "runtime/mysql_result_writer.h" - -#include "exprs/expr_context.h" -#include "gen_cpp/PaloInternalService_types.h" -#include "runtime/buffer_control_block.h" -#include "runtime/primitive_type.h" -#include "runtime/result_buffer_mgr.h" -#include "runtime/row_batch.h" -#include "runtime/tuple_row.h" -#include "util/mysql_row_buffer.h" -#include "util/types.h" -#include "vec/columns/column_vector.h" -#include "vec/core/block.h" - -namespace doris { - -MysqlResultWriter::MysqlResultWriter(BufferControlBlock* sinker, - const std::vector& output_expr_ctxs, - RuntimeProfile* parent_profile, bool output_object_data) - : ResultWriter(output_object_data), - _sinker(sinker), - _output_expr_ctxs(output_expr_ctxs), - _row_buffer(nullptr), - _parent_profile(parent_profile) {} - -MysqlResultWriter::~MysqlResultWriter() { - delete _row_buffer; -} - -Status MysqlResultWriter::init(RuntimeState* state) { - _init_profile(); - if (nullptr == _sinker) { - return Status::InternalError("sinker is nullptr pointer."); - } - - _row_buffer = new (std::nothrow) MysqlRowBuffer(); - if (nullptr == _row_buffer) { - return Status::InternalError("no memory to alloc."); - } - - return Status::OK(); -} - -void MysqlResultWriter::_init_profile() { - _append_row_batch_timer = ADD_TIMER(_parent_profile, "AppendBatchTime"); - _convert_tuple_timer = ADD_CHILD_TIMER(_parent_profile, "TupleConvertTime", "AppendBatchTime"); - _result_send_timer = ADD_CHILD_TIMER(_parent_profile, "ResultSendTime", "AppendBatchTime"); - _sent_rows_counter = ADD_COUNTER(_parent_profile, "NumSentRows", TUnit::UNIT); -} - -int MysqlResultWriter::_add_row_value(int index, const TypeDescriptor& type, void* item) { - int buf_ret = 0; - if (item == nullptr) { - return _row_buffer->push_null(); - } - - switch (type.type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - buf_ret = _row_buffer->push_tinyint(*static_cast(item)); - break; - - case TYPE_SMALLINT: - buf_ret = _row_buffer->push_smallint(*static_cast(item)); - break; - - case TYPE_INT: - buf_ret = _row_buffer->push_int(*static_cast(item)); - break; - - case TYPE_BIGINT: - buf_ret = _row_buffer->push_bigint(*static_cast(item)); - break; - - case TYPE_LARGEINT: { - buf_ret = _row_buffer->push_largeint(reinterpret_cast(item)->value); - break; - } - - case TYPE_FLOAT: - buf_ret = _row_buffer->push_float(*static_cast(item)); - break; - - case TYPE_DOUBLE: - buf_ret = _row_buffer->push_double(*static_cast(item)); - break; - - case TYPE_TIME: { - buf_ret = _row_buffer->push_time(*static_cast(item)); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: { - buf_ret = _row_buffer->push_datetime(*static_cast(item)); - break; - } - - case TYPE_HLL: - case TYPE_OBJECT: - case TYPE_QUANTILE_STATE: { - if (_output_object_data) { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - buf_ret = _row_buffer->push_null(); - } else { - buf_ret = _row_buffer->push_string(string_val->ptr, string_val->len); - } - } else { - buf_ret = _row_buffer->push_null(); - } - - break; - } - - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - // 0x01 is a magic num, not useful actually, just for present "" - char* tmp_val = reinterpret_cast(0x01); - buf_ret = _row_buffer->push_string(tmp_val, string_val->len); - } else { - buf_ret = _row_buffer->push_null(); - } - } else { - buf_ret = _row_buffer->push_string(string_val->ptr, 
string_val->len); - } - - break; - } - - case TYPE_DECIMALV2: { - DecimalV2Value decimal_val(reinterpret_cast(item)->value); - // TODO: Support decimal output_scale once FE can ensure the - // accuracy of output_scale - // int output_scale = _output_expr_ctxs[index]->root()->output_scale(); - buf_ret = _row_buffer->push_decimal(decimal_val, type.scale); - break; - } - - case TYPE_ARRAY: { - auto child_type = type.children[0]; - auto array_value = (const CollectionValue*)(item); - - ArrayIterator iter = array_value->iterator(child_type.type); - - _row_buffer->open_dynamic_mode(); - - buf_ret = _row_buffer->push_string("[", 1); - - int begin = 0; - while (iter.has_next() && !buf_ret) { - if (begin != 0) { - buf_ret = _row_buffer->push_string(", ", 2); - } - if (!iter.get()) { - buf_ret = _row_buffer->push_string("NULL", 4); - } else { - if (child_type.is_string_type()) { - buf_ret = _row_buffer->push_string("'", 1); - buf_ret = _add_row_value(index, child_type, iter.get()); - buf_ret = _row_buffer->push_string("'", 1); - } else if (child_type.is_date_type()) { - DateTimeVal data; - iter.get(&data); - auto datetime_value = DateTimeValue::from_datetime_val(data); - buf_ret = _add_row_value(index, child_type, &datetime_value); - } else if (child_type.is_decimal_v2_type()) { - DecimalV2Val data; - iter.get(&data); - auto decimal_value = DecimalV2Value::from_decimal_val(data); - buf_ret = _add_row_value(index, child_type, &decimal_value); - } else { - buf_ret = _add_row_value(index, child_type, iter.get()); - } - } - - iter.next(); - begin++; - } - - if (!buf_ret) { - buf_ret = _row_buffer->push_string("]", 1); - } - - _row_buffer->close_dynamic_mode(); - break; - } - - default: - LOG(WARNING) << "can't convert this type to mysql type. type = " - << _output_expr_ctxs[index]->root()->type(); - buf_ret = -1; - break; - } - - return buf_ret; -} - -Status MysqlResultWriter::_add_one_row(TupleRow* row) { - _row_buffer->reset(); - int num_columns = _output_expr_ctxs.size(); - int buf_ret = 0; - - for (int i = 0; 0 == buf_ret && i < num_columns; ++i) { - void* item = _output_expr_ctxs[i]->get_value(row); - - buf_ret = _add_row_value(i, _output_expr_ctxs[i]->root()->type(), item); - } - - if (0 != buf_ret) { - return Status::InternalError("pack mysql buffer failed."); - } - - return Status::OK(); -} - -Status MysqlResultWriter::append_row_batch(const RowBatch* batch) { - SCOPED_TIMER(_append_row_batch_timer); - if (nullptr == batch || 0 == batch->num_rows()) { - return Status::OK(); - } - - Status status; - // convert one batch - std::unique_ptr result = std::make_unique(); - int num_rows = batch->num_rows(); - result->result_batch.rows.resize(num_rows); - - { - SCOPED_TIMER(_convert_tuple_timer); - for (int i = 0; status.ok() && i < num_rows; ++i) { - TupleRow* row = batch->get_row(i); - status = _add_one_row(row); - - if (status.ok()) { - result->result_batch.rows[i].assign(_row_buffer->buf(), _row_buffer->length()); - } else { - LOG(WARNING) << "convert row to mysql result failed."; - break; - } - } - } - - if (status.ok()) { - SCOPED_TIMER(_result_send_timer); - // push this batch to back - RETURN_NOT_OK_STATUS_WITH_WARN(_sinker->add_batch(result), - "append result batch to sink failed."); - _written_rows += num_rows; - } - return Status::OK(); -} - -Status MysqlResultWriter::close() { - COUNTER_SET(_sent_rows_counter, _written_rows); - return Status::OK(); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_result_writer.h b/be/src/runtime/mysql_result_writer.h deleted file mode
100644 index 14f4ce7c99..0000000000 --- a/be/src/runtime/mysql_result_writer.h +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "primitive_type.h" -#include "runtime/result_writer.h" -#include "runtime/runtime_state.h" -#include "vec/data_types/data_type.h" - -namespace doris { - -class TupleRow; -class RowBatch; -class ExprContext; -class MysqlRowBuffer; -class BufferControlBlock; -class RuntimeProfile; - -namespace vectorized { -class VExprContext; -} - -// convert the row batch to mysql protocol row -class MysqlResultWriter final : public ResultWriter { -public: - MysqlResultWriter(BufferControlBlock* sinker, const std::vector& output_expr_ctxs, - RuntimeProfile* parent_profile, bool output_object_data); - - virtual ~MysqlResultWriter(); - - virtual Status init(RuntimeState* state) override; - // convert one row batch to mysql result and - // append this batch to the result sink - virtual Status append_row_batch(const RowBatch* batch) override; - - virtual Status close() override; - -private: - void _init_profile(); - // convert one tuple row - Status _add_one_row(TupleRow* row); - int _add_row_value(int index, const TypeDescriptor& type, void* item); - -private: - BufferControlBlock* _sinker; - const std::vector& _output_expr_ctxs; - - std::vector _result_column_ids; - - MysqlRowBuffer* _row_buffer; - std::vector _vec_buffers; - - RuntimeProfile* _parent_profile; // parent profile from result sink. not owned - // total time cost on append batch operation - RuntimeProfile::Counter* _append_row_batch_timer = nullptr; - // tuple convert timer, child timer of _append_row_batch_timer - RuntimeProfile::Counter* _convert_tuple_timer = nullptr; - // file write timer, child timer of _append_row_batch_timer - RuntimeProfile::Counter* _result_send_timer = nullptr; - // number of sent rows - RuntimeProfile::Counter* _sent_rows_counter = nullptr; -}; - -} // namespace doris diff --git a/be/src/runtime/mysql_table_sink.cpp b/be/src/runtime/mysql_table_sink.cpp deleted file mode 100644 index c3357eacc5..0000000000 --- a/be/src/runtime/mysql_table_sink.cpp +++ /dev/null @@ -1,86 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/mysql_table_sink.h" - -#include - -#include "exprs/expr.h" -#include "runtime/runtime_state.h" -#include "util/debug_util.h" -#include "util/runtime_profile.h" - -namespace doris { - -MysqlTableSink::MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { - _name = "MysqlTableSink"; -} - -MysqlTableSink::~MysqlTableSink() {} - -Status MysqlTableSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - const TMysqlTableSink& t_mysql_sink = t_sink.mysql_table_sink; - - _conn_info.host = t_mysql_sink.host; - _conn_info.port = t_mysql_sink.port; - _conn_info.user = t_mysql_sink.user; - _conn_info.passwd = t_mysql_sink.passwd; - _conn_info.db = t_mysql_sink.db; - _mysql_tbl = t_mysql_sink.table; - _conn_info.charset = t_mysql_sink.charset; - - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - return Status::OK(); -} - -Status MysqlTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - std::stringstream title; - title << "MysqlTableSink (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - return Status::OK(); -} - -Status MysqlTableSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // create writer - _writer = state->obj_pool()->add(new MysqlTableWriter(_output_expr_ctxs)); - RETURN_IF_ERROR(_writer->open(_conn_info, _mysql_tbl)); - return Status::OK(); -} - -Status MysqlTableSink::send(RuntimeState* state, RowBatch* batch) { - return _writer->append(batch); -} - -Status MysqlTableSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_table_sink.h b/be/src/runtime/mysql_table_sink.h deleted file mode 100644 index 08ae566a21..0000000000 --- a/be/src/runtime/mysql_table_sink.h +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "runtime/mysql_table_writer.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class TMysqlTableSink; -class RuntimeState; -class RuntimeProfile; -class ExprContext; - -// This class is a sinker, which put input data to mysql table -class MysqlTableSink : public DataSink { -public: - MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs); - - virtual ~MysqlTableSink(); - - virtual Status init(const TDataSink& thrift_sink); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend stream mgr - // Blocks until all rows in batch are placed in the buffer - virtual Status send(RuntimeState* state, RowBatch* batch); - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - // owned by RuntimeState - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - MysqlConnInfo _conn_info; - std::string _mysql_tbl; - MysqlTableWriter* _writer; - - RuntimeProfile* _profile; -}; - -} // namespace doris diff --git a/be/src/runtime/mysql_table_writer.cpp b/be/src/runtime/mysql_table_writer.cpp deleted file mode 100644 index bd1f746db8..0000000000 --- a/be/src/runtime/mysql_table_writer.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
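The MysqlTableWriter deleted next renders INSERT statements by hand and escapes string values with mysql_real_escape_string(), whose destination buffer must hold at least 2 * length + 1 bytes (the MySQL C API worst case: every byte escaped, plus the terminating NUL). A sketch of that escaping step wrapped in an RAII-friendly helper (escape_string is a hypothetical name; the header path varies by platform):

#include <mysql/mysql.h>  // location varies by platform; sometimes <mysql.h>
#include <string>

// Hypothetical helper wrapping the escaping pattern in
// MysqlTableWriter::insert_row(), returning a std::string instead of a
// manually new[]/delete[]-managed char buffer.
std::string escape_string(MYSQL* conn, const char* from, unsigned long length) {
    std::string out;
    out.resize(2 * length + 1);  // worst-case size required by the C API
    unsigned long written = mysql_real_escape_string(conn, &out[0], from, length);
    out.resize(written);  // shrink to the actual escaped length
    return out;
}

Using a std::string for the scratch buffer avoids the leak risk of the raw new[]/delete[] pair in the deleted code if anything between allocation and release were ever to throw or return early.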
- -#include "runtime/mysql_table_writer.h" - -#include - -#include - -#include "exprs/expr_context.h" -#include "runtime/row_batch.h" -#include "runtime/tuple_row.h" -#include "util/types.h" - -namespace doris { - -std::string MysqlConnInfo::debug_string() const { - std::stringstream ss; - - ss << "(host=" << host << ",port=" << port << ",user=" << user << ",db=" << db - << ",passwd=" << passwd << ",charset=" << charset << ")"; - return ss.str(); -} - -MysqlTableWriter::MysqlTableWriter(const std::vector& output_expr_ctxs) - : _output_expr_ctxs(output_expr_ctxs) {} - -MysqlTableWriter::~MysqlTableWriter() { - if (_mysql_conn) { - mysql_close(_mysql_conn); - } -} - -Status MysqlTableWriter::open(const MysqlConnInfo& conn_info, const std::string& tbl) { - _mysql_conn = mysql_init(nullptr); - if (_mysql_conn == nullptr) { - return Status::InternalError("Call mysql_init failed."); - } - - MYSQL* res = mysql_real_connect(_mysql_conn, conn_info.host.c_str(), conn_info.user.c_str(), - conn_info.passwd.c_str(), conn_info.db.c_str(), conn_info.port, - nullptr, // unix socket - 0); // flags - if (res == nullptr) { - std::stringstream ss; - ss << "mysql_real_connect failed because " << mysql_error(_mysql_conn); - return Status::InternalError(ss.str()); - } - - // set character - if (mysql_set_character_set(_mysql_conn, conn_info.charset.c_str())) { - std::stringstream ss; - ss << "mysql_set_character_set failed because " << mysql_error(_mysql_conn); - return Status::InternalError(ss.str()); - } - - _mysql_tbl = tbl; - - return Status::OK(); -} - -Status MysqlTableWriter::insert_row(TupleRow* row) { - std::stringstream ss; - - // Construct Insert statement of mysql - ss << "INSERT INTO `" << _mysql_tbl << "` VALUES ("; - int num_columns = _output_expr_ctxs.size(); - for (int i = 0; i < num_columns; ++i) { - if (i != 0) { - ss << ", "; - } - void* item = _output_expr_ctxs[i]->get_value(row); - if (item == nullptr) { - ss << "NULL"; - continue; - } - switch (_output_expr_ctxs[i]->root()->type().type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - ss << (int)*static_cast(item); - break; - case TYPE_SMALLINT: - ss << *static_cast(item); - break; - case TYPE_INT: - ss << *static_cast(item); - break; - case TYPE_BIGINT: - ss << *static_cast(item); - break; - case TYPE_FLOAT: - ss << *static_cast(item); - break; - case TYPE_DOUBLE: - ss << *static_cast(item); - break; - case TYPE_DATE: - case TYPE_DATETIME: { - char buf[64]; - const DateTimeValue* time_val = (const DateTimeValue*)(item); - time_val->to_string(buf); - ss << "\'" << buf << "\'"; - break; - } - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - ss << "\'\'"; - } else { - ss << "NULL"; - } - } else { - char* buf = new char[2 * string_val->len + 1]; - mysql_real_escape_string(_mysql_conn, buf, string_val->ptr, string_val->len); - ss << "\'" << buf << "\'"; - delete[] buf; - } - break; - } - - case TYPE_DECIMALV2: { - const DecimalV2Value decimal_val(reinterpret_cast(item)->value); - std::string decimal_str; - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - decimal_str = decimal_val.to_string(output_scale); - ss << decimal_str; - break; - } - - default: { - return Status::InternalError("can't convert this type to mysql type. 
type = {}", - _output_expr_ctxs[i]->root()->type().type); - } - } - } - ss << ")"; - - // Insert this to MySQL server - std::string insert_stmt = ss.str(); - LOG(INFO) << insert_stmt; - if (mysql_real_query(_mysql_conn, insert_stmt.c_str(), insert_stmt.length())) { - std::stringstream err_ss; - err_ss << "Insert to mysql server(" << mysql_get_host_info(_mysql_conn) - << ") failed, because: " << mysql_error(_mysql_conn); - return Status::InternalError(err_ss.str()); - } - - return Status::OK(); -} - -Status MysqlTableWriter::append(RowBatch* batch) { - if (batch == nullptr || batch->num_rows() == 0) { - return Status::OK(); - } - - int num_rows = batch->num_rows(); - for (int i = 0; i < num_rows; ++i) { - RETURN_IF_ERROR(insert_row(batch->get_row(i))); - } - - return Status::OK(); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_table_writer.h b/be/src/runtime/mysql_table_writer.h deleted file mode 100644 index ae2080acfe..0000000000 --- a/be/src/runtime/mysql_table_writer.h +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include -#include - -#include "common/status.h" - -namespace doris { - -struct MysqlConnInfo { - std::string host; - std::string user; - std::string passwd; - std::string db; - int port; - std::string charset; - - std::string debug_string() const; -}; - -class RowBatch; -class TupleRow; -class ExprContext; - -class MysqlTableWriter { -public: - MysqlTableWriter(const std::vector& output_exprs); - ~MysqlTableWriter(); - - // connect to mysql server - Status open(const MysqlConnInfo& conn_info, const std::string& tbl); - - Status begin_trans() { return Status::OK(); } - - Status append(RowBatch* batch); - - Status abort_tarns() { return Status::OK(); } - - Status finish_tarns() { return Status::OK(); } - -private: - Status insert_row(TupleRow* row); - - const std::vector& _output_expr_ctxs; - std::string _mysql_tbl; - MYSQL* _mysql_conn; -}; - -} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.cpp b/be/src/runtime/odbc_table_sink.cpp deleted file mode 100644 index a7c58d22d4..0000000000 --- a/be/src/runtime/odbc_table_sink.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/odbc_table_sink.h" - -#include - -#include "exprs/expr.h" -#include "runtime/runtime_state.h" -#include "util/debug_util.h" -#include "util/runtime_profile.h" - -namespace doris { - -OdbcTableSink::OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { - _name = "ODBC_TABLE_SINK"; -} - -OdbcTableSink::~OdbcTableSink() = default; - -Status OdbcTableSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - - const TOdbcTableSink& t_odbc_sink = t_sink.odbc_table_sink; - - _odbc_param.connect_string = t_odbc_sink.connect_string; - _odbc_param.output_expr_ctxs = _output_expr_ctxs; - _odbc_tbl = t_odbc_sink.table; - _use_transaction = t_odbc_sink.use_transaction; - - return Status::OK(); -} - -Status OdbcTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - std::stringstream title; - title << _name << " (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - return Status::OK(); -} - -Status OdbcTableSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // create writer - _writer.reset(new ODBCConnector(_odbc_param)); - RETURN_IF_ERROR(_writer->open(state)); - if (_use_transaction) { - RETURN_IF_ERROR(_writer->begin_trans()); - } - RETURN_IF_ERROR(_writer->init_to_write(_profile)); - return Status::OK(); -} - -Status OdbcTableSink::send(RuntimeState* state, RowBatch* batch) { - if (batch == nullptr || batch->num_rows() == 0) { - return Status::OK(); - } - uint32_t start_send_row = 0; - uint32_t num_row_sent = 0; - while (start_send_row < batch->num_rows()) { - auto status = - _writer->append(_odbc_tbl, batch, _output_expr_ctxs, start_send_row, &num_row_sent); - if (UNLIKELY(!status.ok())) { - return status; - } - start_send_row += num_row_sent; - num_row_sent = 0; - } - return Status::OK(); -} - -Status OdbcTableSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - if (exec_status.ok() && _use_transaction) { - RETURN_IF_ERROR(_writer->finish_trans()); - } - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.h b/be/src/runtime/odbc_table_sink.h deleted file mode 100644 index ecffd3ef27..0000000000 --- a/be/src/runtime/odbc_table_sink.h +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership.
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "exec/odbc_connector.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class TOdbcTableSink; -class RuntimeState; -class RuntimeProfile; -class ExprContext; - -//This class is a sinker, which put input data to odbc table -class OdbcTableSink : public DataSink { -public: - OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs); - - virtual ~OdbcTableSink(); - - virtual Status init(const TDataSink& thrift_sink); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend stream mgr - // Blocks until all rows in batch are placed in the buffer - virtual Status send(RuntimeState* state, RowBatch* batch); - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - ODBCConnectorParam _odbc_param; - std::string _odbc_tbl; - std::unique_ptr _writer; - // whether use transaction - bool _use_transaction; - - RuntimeProfile* _profile; -}; - -} // namespace doris diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 5f3b2c47ee..d7ae0556e7 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -121,8 +121,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _is_report_success = request.query_options.is_report_success; } - RETURN_IF_ERROR(_runtime_state->create_block_mgr()); - // set up desc tbl DescriptorTbl* desc_tbl = nullptr; if (fragments_ctx != nullptr) { @@ -149,7 +147,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, if (_runtime_state->enable_vectorized_exec()) { static_cast(exch_node)->set_num_senders(num_senders); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } } @@ -213,8 +211,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _rows_produced_counter = ADD_COUNTER(profile(), "RowsProduced", TUnit::UNIT); _fragment_cpu_timer = ADD_TIMER(profile(), "FragmentCpuTime"); - _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size())); - // _row_batch->tuple_data_pool()->set_limits(*_runtime_state->mem_trackers()); VLOG_NOTICE << "plan_root=\n" << _plan->debug_string(); _prepared = true; @@ -247,7 +243,7 @@ Status PlanFragmentExecutor::open() { if (_runtime_state->enable_vectorized_exec()) { status = open_vectorized_internal(); } else { - status = open_internal(); + 
RETURN_ERROR_IF_NON_VEC; } if (!status.ok() && !status.is() && _runtime_state->log_has_space()) { @@ -358,90 +354,6 @@ Status PlanFragmentExecutor::get_vectorized_internal(::doris::vectorized::Block* return Status::OK(); } -Status PlanFragmentExecutor::open_internal() { - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - SCOPED_TIMER(profile()->total_time_counter()); - RETURN_IF_ERROR(_plan->open(_runtime_state.get())); - } - - if (_sink == nullptr) { - return Status::OK(); - } - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - RETURN_IF_ERROR(_sink->open(runtime_state())); - } - - // If there is a sink, do all the work of driving it here, so that - // when this returns the query has actually finished - RowBatch* batch = nullptr; - while (true) { - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - RETURN_IF_ERROR(get_next_internal(&batch)); - } - - if (batch == nullptr) { - break; - } - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "open_internal: #rows=" << batch->num_rows() - << " desc=" << row_desc().debug_string(); - - for (int i = 0; i < batch->num_rows(); ++i) { - TupleRow* row = batch->get_row(i); - VLOG_ROW << row->to_string(row_desc()); - } - } - - SCOPED_TIMER(profile()->total_time_counter()); - SCOPED_CPU_TIMER(_fragment_cpu_timer); - // Collect this plan and sub plan statistics, and send to parent plan. - if (_collect_query_statistics_with_every_batch) { - _collect_query_statistics(); - } - const Status& st = _sink->send(runtime_state(), batch); - if (st.is()) { - break; - } - RETURN_IF_ERROR(st); - } - - // Close the sink *before* stopping the report thread. Close may - // need to add some important information to the last report that - // gets sent. (e.g. table sinks record the files they have written - // to in this method) - // The coordinator report channel waits until all backends are - // either in error or have returned a status report with done = - // true, so tearing down any data stream state (a separate - // channel) in Close is safe. - - // TODO: If this returns an error, the d'tor will call Close again. We should - // audit the sinks to check that this is ok, or change that behaviour. - { - SCOPED_TIMER(profile()->total_time_counter()); - _collect_query_statistics(); - Status status; - { - std::lock_guard l(_status_lock); - status = _status; - } - status = _sink->close(runtime_state(), status); - RETURN_IF_ERROR(status); - } - - // Setting to nullptr ensures that the d'tor won't double-close the sink. 
- _sink.reset(nullptr); - _done = true; - - stop_report_thread(); - send_report(true); - - return Status::OK(); -} - void PlanFragmentExecutor::_collect_query_statistics() { _query_statistics->clear(); _plan->collect_query_statistics(_query_statistics.get()); @@ -556,46 +468,6 @@ void PlanFragmentExecutor::stop_report_thread() { _report_thread.join(); } -Status PlanFragmentExecutor::get_next(RowBatch** batch) { - VLOG_FILE << "GetNext(): instance_id=" << _runtime_state->fragment_instance_id(); - Status status = get_next_internal(batch); - update_status(status); - - if (_done) { - LOG_INFO("PlanFragmentExecutor::get_next finished") - .tag("query_id", _query_id) - .tag("instance_id", _runtime_state->fragment_instance_id()); - // Query is done, return the thread token - stop_report_thread(); - send_report(true); - } - - return status; -} - -Status PlanFragmentExecutor::get_next_internal(RowBatch** batch) { - if (_done) { - *batch = nullptr; - return Status::OK(); - } - - while (!_done) { - _row_batch->reset(); - SCOPED_TIMER(profile()->total_time_counter()); - RETURN_IF_ERROR(_plan->get_next(_runtime_state.get(), _row_batch.get(), &_done)); - - if (_row_batch->num_rows() > 0) { - COUNTER_UPDATE(_rows_produced_counter, _row_batch->num_rows()); - *batch = _row_batch.get(); - break; - } - - *batch = nullptr; - } - - return Status::OK(); -} - void PlanFragmentExecutor::update_status(const Status& new_status) { if (new_status.ok()) { return; @@ -656,8 +528,6 @@ void PlanFragmentExecutor::close() { return; } - _row_batch.reset(nullptr); - // Prepare may not have been called, which sets _runtime_state if (_runtime_state != nullptr) { // _runtime_state init failed diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h index 7abf9fdc89..013c56471f 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -108,13 +108,6 @@ public: // time when open() returns, and the status-reporting thread will have been stopped. Status open(); - // Return results through 'batch'. Sets '*batch' to nullptr if no more results. - // '*batch' is owned by PlanFragmentExecutor and must not be deleted. - // When *batch == nullptr, get_next() should not be called anymore. Also, report_status_cb - // will have been called for the final time and the status-reporting thread - // will have been stopped. - Status get_next(RowBatch** batch); - // Closes the underlying plan fragment and frees up all resources allocated // in open()/get_next(). void close(); @@ -187,7 +180,6 @@ private: // returned via get_next's row batch // Created in prepare (if required), owned by this object. std::unique_ptr _sink; - std::unique_ptr _row_batch; // Number of rows returned by this fragment RuntimeProfile::Counter* _rows_produced_counter; @@ -230,11 +222,9 @@ private: // error condition, all rows will have been sent to the sink, the sink will // have been closed, a final report will have been sent and the report thread will // have been stopped. _sink will be set to nullptr after successful execution. - Status open_internal(); Status open_vectorized_internal(); // Executes get_next() logic and returns resulting status. - Status get_next_internal(RowBatch** batch); Status get_vectorized_internal(::doris::vectorized::Block* block, bool* eos); // Stops report thread, if one is running. Blocks until report thread terminates. 
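The open_internal()/get_next_internal() pair removed above implemented a pull-push driver: pull a batch from the plan root, push it into the sink, and close the sink before the final report so sink-side results (for example, files written by table sinks) can be included in that report. A compressed sketch of the control flow, with toy stand-ins for ExecNode and DataSink (the real code also collects query statistics and handles end-of-sink between these steps):

#include <cstdint>

// Toy stand-ins; Status and Batch are simplified placeholders.
struct Status { bool ok = true; static Status OK() { return {}; } };
struct Batch { int rows = 0; };

struct Plan {
    int batches = 3; // pretend the plan produces three non-empty batches
    Status get_next(Batch* b, bool* eos) {
        if (batches-- > 0) { b->rows = 1024; *eos = false; }
        else { b->rows = 0; *eos = true; }
        return Status::OK();
    }
};

struct Sink {
    Status send(const Batch&) { return Status::OK(); }
    Status close(Status final_status) { return final_status; }
};

// Pull batches from the plan root and push them into the sink. The sink is
// closed before the caller sends the final status report, matching the
// ordering constraint the deleted comment explains.
Status drive(Plan& plan, Sink& sink) {
    bool eos = false;
    while (!eos) {
        Batch batch;
        Status st = plan.get_next(&batch, &eos);
        if (!st.ok) return sink.close(st);
        if (batch.rows > 0) st = sink.send(batch);
        if (!st.ok) return sink.close(st);
    }
    return sink.close(Status::OK());
}

The vectorized open path retains this shape, just over vectorized Blocks instead of RowBatches.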
diff --git a/be/src/runtime/result_writer.h b/be/src/runtime/result_writer.h index 7d669e1b4f..a77956c0c4 100644 --- a/be/src/runtime/result_writer.h +++ b/be/src/runtime/result_writer.h @@ -39,12 +39,6 @@ public: ~ResultWriter() {}; virtual Status init(RuntimeState* state) = 0; - // convert and write one row batch - virtual Status append_row_batch(const RowBatch* batch) = 0; - - // virtual Status append_block(const vectorized::Block& block) { - // return Status::InternalError("Not support append vec block now."); - // } virtual Status close() = 0; diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index 52860bdd09..0c2a6826a8 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -26,7 +26,6 @@ #include "common/utils.h" #include "gen_cpp/Data_types.h" #include "gen_cpp/data.pb.h" -#include "runtime/buffered_tuple_stream2.inline.h" #include "runtime/collection_value.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -200,10 +199,6 @@ void RowBatch::clear() { ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(buffer_info.client, &buffer_info.buffer); } - close_tuple_streams(); - for (int i = 0; i < _blocks.size(); ++i) { - _blocks[i]->del(); - } DCHECK(_tuple_ptrs != nullptr); free(_tuple_ptrs); _tuple_ptrs = nullptr; @@ -348,18 +343,6 @@ Status RowBatch::resize_and_allocate_tuple_buffer(RuntimeState* state, int64_t* return Status::OK(); } -void RowBatch::add_tuple_stream(BufferedTupleStream2* stream) { - DCHECK(stream != nullptr); - _tuple_streams.push_back(stream); - _auxiliary_mem_usage += stream->byte_size(); -} - -void RowBatch::add_block(BufferedBlockMgr2::Block* block) { - DCHECK(block != nullptr); - _blocks.push_back(block); - _auxiliary_mem_usage += block->buffer_len(); -} - void RowBatch::reset() { _num_rows = 0; _capacity = _tuple_ptrs_size / (_num_tuples_per_row * sizeof(Tuple*)); @@ -378,25 +361,12 @@ void RowBatch::reset() { } _buffers.clear(); - close_tuple_streams(); - for (int i = 0; i < _blocks.size(); ++i) { - _blocks[i]->del(); - } - _blocks.clear(); _auxiliary_mem_usage = 0; _need_to_return = false; _flush = FlushMode::NO_FLUSH_RESOURCES; _needs_deep_copy = false; } -void RowBatch::close_tuple_streams() { - for (int i = 0; i < _tuple_streams.size(); ++i) { - _tuple_streams[i]->close(); - delete _tuple_streams[i]; - } - _tuple_streams.clear(); -} - void RowBatch::transfer_resource_ownership(RowBatch* dest) { dest->_auxiliary_mem_usage += _tuple_data_pool.total_allocated_bytes(); dest->_tuple_data_pool.acquire_data(&_tuple_data_pool, false); @@ -414,21 +384,6 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) { } _buffers.clear(); - for (int i = 0; i < _tuple_streams.size(); ++i) { - dest->_tuple_streams.push_back(_tuple_streams[i]); - dest->_auxiliary_mem_usage += _tuple_streams[i]->byte_size(); - } - // Resource release should be done by dest RowBatch. if we don't clear the corresponding resources. 
- // This Rowbatch calls the reset() method, dest Rowbatch will also call the reset() method again, - // which will cause the core problem of double delete - _tuple_streams.clear(); - - for (int i = 0; i < _blocks.size(); ++i) { - dest->_blocks.push_back(_blocks[i]); - dest->_auxiliary_mem_usage += _blocks[i]->buffer_len(); - } - _blocks.clear(); - dest->_need_to_return |= _need_to_return; if (_needs_deep_copy) { @@ -517,9 +472,6 @@ void RowBatch::acquire_state(RowBatch* src) { src->_io_buffers.clear(); src->_auxiliary_mem_usage = 0; - DCHECK(src->_tuple_streams.empty()); - DCHECK(src->_blocks.empty()); - _has_in_flight_row = src->_has_in_flight_row; _num_rows = src->_num_rows; _capacity = src->_capacity; diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h index e73ee320c1..9cb1f310e5 100644 --- a/be/src/runtime/row_batch.h +++ b/be/src/runtime/row_batch.h @@ -24,7 +24,6 @@ #include #include "common/logging.h" -#include "runtime/buffered_block_mgr2.h" // for BufferedBlockMgr2::Block #include "runtime/bufferpool/buffer_pool.h" #include "runtime/descriptors.h" #include "runtime/disk_io_mgr.h" @@ -142,7 +141,7 @@ public: // enough memory. bool at_capacity() const { return _num_rows == _capacity || _auxiliary_mem_usage >= AT_CAPACITY_MEM_USAGE || - num_tuple_streams() > 0 || _need_to_return; + _need_to_return; } // Returns true if the row batch has filled all the rows or has accumulated @@ -238,7 +237,6 @@ public: MemPool* tuple_data_pool() { return &_tuple_data_pool; } ObjectPool* agg_object_pool() { return &_agg_object_pool; } int num_io_buffers() const { return _io_buffers.size(); } - int num_tuple_streams() const { return _tuple_streams.size(); } // increase # of uncommitted rows void increase_uncommitted_rows(); @@ -263,10 +261,6 @@ public: void add_buffer(BufferPool::ClientHandle* client, BufferPool::BufferHandle&& buffer, FlushMode flush); - // Adds a block to this row batch. The block must be pinned. The blocks must be - // deleted when freeing resources. - void add_block(BufferedBlockMgr2::Block* block); - // Called to indicate this row batch must be returned up the operator tree. // This is used to control memory management for streaming rows. // TODO: consider using this mechanism instead of add_io_buffer/add_tuple_stream. This is @@ -393,9 +387,6 @@ public: std::string to_string(); private: - // Close owned tuple streams and delete if needed. - void close_tuple_streams(); - // All members need to be handled in RowBatch::swap() bool _has_in_flight_row; // if true, last row hasn't been committed yet @@ -460,12 +451,6 @@ private: }; /// Pages attached to this row batch. See AddBuffer() for ownership semantics. std::vector _buffers; - // Tuple streams currently owned by this row batch. - std::vector _tuple_streams; - - // Blocks attached to this row batch. The underlying memory and block manager client - // are owned by the BufferedBlockMgr2. - std::vector _blocks; // String to write compressed tuple data to in serialize(). 
// This is a string so we can swap() with the string in the PRowBatch we're serializing diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 9688bbf441..044a619e03 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -30,7 +30,6 @@ #include "common/object_pool.h" #include "common/status.h" #include "exec/exec_node.h" -#include "runtime/buffered_block_mgr2.h" #include "runtime/exec_env.h" #include "runtime/load_path_mgr.h" #include "runtime/memory/mem_tracker.h" @@ -222,14 +221,6 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { return Status::OK(); } -Status RuntimeState::create_block_mgr() { - DCHECK(_block_mgr2.get() == nullptr); - RETURN_IF_ERROR(BufferedBlockMgr2::create(this, runtime_profile(), _exec_env->tmp_file_mgr(), - _exec_env->disk_io_mgr()->max_read_buffer_size(), - &_block_mgr2)); - return Status::OK(); -} - bool RuntimeState::error_log_is_empty() { std::lock_guard l(_error_log_lock); return (_error_log.size() > 0); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index f1180155d9..dedef5340d 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -84,9 +84,6 @@ public: // for ut and non-query. Status init_mem_trackers(const TUniqueId& query_id = TUniqueId()); - // Gets/Creates the query wide block mgr. - Status create_block_mgr(); - Status create_load_dir(); const TQueryOptions& query_options() const { return _query_options; } diff --git a/be/src/runtime/sorter.h b/be/src/runtime/sorter.h deleted file mode 100644 index 2b1a4d4fe4..0000000000 --- a/be/src/runtime/sorter.h +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/sorter.h -// and modified by Doris - -#pragma once - -#include "common/status.h" - -namespace doris { - -class RowBatch; -class RuntimeState; -// Interface to sort rows -// 1. create one sorter -// 2. add data need be sorted through 'add_batch' -// 3. call 'input_done' when all data were added. -// 4. call 'get_next' fetch data which is sorted. -class Sorter { -public: - virtual ~Sorter() {} - - virtual Status prepare(RuntimeState* state) { return Status::OK(); } - - // Add data to be sorted. 
- virtual Status add_batch(RowBatch* batch) { return Status::OK(); } - - // call when all data be added - virtual Status input_done() = 0; - - // fetch data already sorted, - // client must insure that call this function AFTER call input_done - virtual Status get_next(RowBatch* batch, bool* eos) = 0; - - virtual Status close(RuntimeState* state) { return Status::OK(); } -}; - -} // namespace doris diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index fc2a7ed4dc..497f60b828 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -448,10 +448,6 @@ Status HashJoinNode::close(RuntimeState* state) { return VJoinNodeBase::close(state); } -Status HashJoinNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented HashJoin Node::get_next scalar"); -} - bool HashJoinNode::need_more_input_data() { return (_probe_block.rows() == 0 || _probe_index == _probe_block.rows()) && !_probe_eos && !_short_circuit_for_null_in_probe_side; diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index b4b49d7b61..76fa064903 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -199,7 +199,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, Block* block, bool* eos) override; Status close(RuntimeState* state) override; void add_hash_buckets_info(const std::string& info); diff --git a/be/src/vec/exec/join/vnested_loop_join_node.h b/be/src/vec/exec/join/vnested_loop_join_node.h index da2cd73915..23eba8fb96 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.h +++ b/be/src/vec/exec/join/vnested_loop_join_node.h @@ -57,10 +57,6 @@ public: Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VNestedLoopJoinNode::get_next scalar"); - } - void debug_string(int indentation_level, std::stringstream* out) const override; const RowDescriptor& intermediate_row_desc() const override { diff --git a/be/src/vec/exec/scan/vscan_node.h b/be/src/vec/exec/scan/vscan_node.h index c2de829808..3a91d91b91 100644 --- a/be/src/vec/exec/scan/vscan_node.h +++ b/be/src/vec/exec/scan/vscan_node.h @@ -65,10 +65,6 @@ public: virtual void set_scan_ranges(const std::vector& scan_ranges) {} - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not implement"); - } - // Get next block. // If eos is true, no more data will be read and block should be empty. 
Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/vec/exec/vaggregation_node.cpp b/be/src/vec/exec/vaggregation_node.cpp index ad035e21e8..6884d7b17d 100644 --- a/be/src/vec/exec/vaggregation_node.cpp +++ b/be/src/vec/exec/vaggregation_node.cpp @@ -505,10 +505,6 @@ Status AggregationNode::open(RuntimeState* state) { return Status::OK(); } -Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented Aggregation Node::get_next scalar"); -} - Status AggregationNode::do_pre_agg(vectorized::Block* input_block, vectorized::Block* output_block) { RETURN_IF_ERROR(_executor.pre_agg(input_block, output_block)); diff --git a/be/src/vec/exec/vaggregation_node.h b/be/src/vec/exec/vaggregation_node.h index ac34ca84f6..bb40c7f509 100644 --- a/be/src/vec/exec/vaggregation_node.h +++ b/be/src/vec/exec/vaggregation_node.h @@ -777,7 +777,6 @@ public: virtual Status prepare(RuntimeState* state) override; virtual Status open(RuntimeState* state) override; virtual Status alloc_resource(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; virtual Status close(RuntimeState* state) override; virtual void release_resource(RuntimeState* state) override; diff --git a/be/src/vec/exec/vanalytic_eval_node.cpp b/be/src/vec/exec/vanalytic_eval_node.cpp index bfac9e5d00..f748886f31 100644 --- a/be/src/vec/exec/vanalytic_eval_node.cpp +++ b/be/src/vec/exec/vanalytic_eval_node.cpp @@ -305,10 +305,6 @@ bool VAnalyticEvalNode::can_read() { return true; } -Status VAnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VAnalyticEvalNode::get_next."); -} - Status VAnalyticEvalNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VAnalyticEvalNode::get_next"); diff --git a/be/src/vec/exec/vanalytic_eval_node.h b/be/src/vec/exec/vanalytic_eval_node.h index 9957221193..54becbba88 100644 --- a/be/src/vec/exec/vanalytic_eval_node.h +++ b/be/src/vec/exec/vanalytic_eval_node.h @@ -44,7 +44,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; Status close(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; diff --git a/be/src/vec/exec/varrow_scanner.h b/be/src/vec/exec/varrow_scanner.h index e67300332d..5779dbb372 100644 --- a/be/src/vec/exec/varrow_scanner.h +++ b/be/src/vec/exec/varrow_scanner.h @@ -50,11 +50,6 @@ public: // Open this scanner, will initialize information need to virtual Status open() override; - virtual Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get next"); - } - virtual Status get_next(Block* block, bool* eof) override; // Update file predicate filter profile diff --git a/be/src/vec/exec/vassert_num_rows_node.h b/be/src/vec/exec/vassert_num_rows_node.h index 0f6ffcb9de..6b0432b6db 100644 --- a/be/src/vec/exec/vassert_num_rows_node.h +++ 
b/be/src/vec/exec/vassert_num_rows_node.h @@ -27,10 +27,6 @@ class VAssertNumRowsNode : public ExecNode { public: VAssertNumRowsNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VAnalyticEvalNode::get_next."); - } - Status open(RuntimeState* state) override; Status get_next(RuntimeState* state, Block* block, bool* eos) override; Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vbroker_scan_node.h b/be/src/vec/exec/vbroker_scan_node.h index 452415014f..9c5e436b19 100644 --- a/be/src/vec/exec/vbroker_scan_node.h +++ b/be/src/vec/exec/vbroker_scan_node.h @@ -45,11 +45,6 @@ public: // Start broker scan using ParquetScanner or BrokerScanner. Status open(RuntimeState* state) override; - // Fill the next row batch by calling next() on the scanner, - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VBrokerScanNode::get_next."); - } - Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Close the scanner, and report errors. diff --git a/be/src/vec/exec/vbroker_scanner.h b/be/src/vec/exec/vbroker_scanner.h index 2e26eb58b0..3283530462 100644 --- a/be/src/vec/exec/vbroker_scanner.h +++ b/be/src/vec/exec/vbroker_scanner.h @@ -47,11 +47,6 @@ public: Status open() override; - virtual Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get next"); - } - Status get_next(Block* block, bool* eof) override; void close() override; diff --git a/be/src/vec/exec/vdata_gen_scan_node.cpp b/be/src/vec/exec/vdata_gen_scan_node.cpp index 26ae698795..ae3d7d0986 100644 --- a/be/src/vec/exec/vdata_gen_scan_node.cpp +++ b/be/src/vec/exec/vdata_gen_scan_node.cpp @@ -86,11 +86,6 @@ Status VDataGenFunctionScanNode::open(RuntimeState* state) { return Status::OK(); } -Status VDataGenFunctionScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - LOG(FATAL) << "VDataGenFunctionScanNode only support vectorized execution"; - return Status::OK(); -} - Status VDataGenFunctionScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { if (state == nullptr || block == nullptr || eos == nullptr) { diff --git a/be/src/vec/exec/vdata_gen_scan_node.h b/be/src/vec/exec/vdata_gen_scan_node.h index 73470e8d49..18ca2c040f 100644 --- a/be/src/vec/exec/vdata_gen_scan_node.h +++ b/be/src/vec/exec/vdata_gen_scan_node.h @@ -45,10 +45,6 @@ public: // Start MySQL scan using _mysql_scanner. Status open(RuntimeState* state) override; - // Fill the next row batch by calling next() on the _mysql_scanner, - // converting text data in MySQL cells to binary data. - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; - Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Close the _mysql_scanner, and report errors. 
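
Every hunk in this stretch is the same mechanical edit: the per-node scalar get_next() stubs that only returned a NotSupported status are deleted, leaving the Block overload as the node's sole data path, while the legacy entry point now fails once in the shared base class (see RETURN_ERROR_IF_NON_VEC earlier in this patch). A minimal sketch of the resulting class shape, with stand-in types rather than the real ExecNode hierarchy:

    #include <string>

    struct Status {
        bool ok = true;
        std::string msg;
        static Status OK() { return {}; }
        static Status NotSupported(std::string m) { return {false, std::move(m)}; }
    };

    struct RowBatch;  // legacy scalar batch (being removed)
    struct Block;     // vectorized batch

    class ExecNode {
    public:
        virtual ~ExecNode() = default;
        // Legacy path: one shared failure instead of a stub in every subclass.
        virtual Status get_next(RowBatch** /*batch*/, bool* /*eos*/) {
            return Status::NotSupported("non-vectorized engine is no longer supported");
        }
        // Vectorized path: the only overload subclasses still implement.
        virtual Status get_next(Block* block, bool* eos) = 0;
    };

    class EmptySetNodeSketch final : public ExecNode {
    public:
        using ExecNode::get_next;  // keep the base RowBatch overload visible
        Status get_next(Block* /*block*/, bool* eos) override {
            *eos = true;  // produces no rows, reports end-of-stream immediately
            return Status::OK();
        }
    };
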
diff --git a/be/src/vec/exec/vempty_set_node.h b/be/src/vec/exec/vempty_set_node.h index 900f0c6016..80b1d2775f 100644 --- a/be/src/vec/exec/vempty_set_node.h +++ b/be/src/vec/exec/vempty_set_node.h @@ -26,9 +26,6 @@ namespace vectorized { class VEmptySetNode : public ExecNode { public: VEmptySetNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented get RowBatch in vecorized execution."); - } virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; }; } // namespace vectorized diff --git a/be/src/vec/exec/vexchange_node.cpp b/be/src/vec/exec/vexchange_node.cpp index 8e34b1e444..7196c8a5c1 100644 --- a/be/src/vec/exec/vexchange_node.cpp +++ b/be/src/vec/exec/vexchange_node.cpp @@ -88,9 +88,6 @@ Status VExchangeNode::open(RuntimeState* state) { return Status::OK(); } -Status VExchangeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VExchange Node::get_next scalar"); -} Status VExchangeNode::get_next(RuntimeState* state, Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VExchangeNode::get_next"); diff --git a/be/src/vec/exec/vexchange_node.h b/be/src/vec/exec/vexchange_node.h index 1d767c8cb2..68b778aade 100644 --- a/be/src/vec/exec/vexchange_node.h +++ b/be/src/vec/exec/vexchange_node.h @@ -39,7 +39,6 @@ public: Status prepare(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, Block* row_batch, bool* eos) override; void release_resource(RuntimeState* state) override; Status collect_query_statistics(QueryStatistics* statistics) override; diff --git a/be/src/vec/exec/vjson_scanner.h b/be/src/vec/exec/vjson_scanner.h index 710d540d14..3f32648f29 100644 --- a/be/src/vec/exec/vjson_scanner.h +++ b/be/src/vec/exec/vjson_scanner.h @@ -59,10 +59,6 @@ public: // Open this scanner, will initialize information needed Status open() override; - Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get tuple"); - } Status get_next(vectorized::Block* output_block, bool* eof) override; void close() override; diff --git a/be/src/vec/exec/vmysql_scan_node.h b/be/src/vec/exec/vmysql_scan_node.h index 2fd8240956..5bea0fb388 100644 --- a/be/src/vec/exec/vmysql_scan_node.h +++ b/be/src/vec/exec/vmysql_scan_node.h @@ -43,9 +43,6 @@ public: // Start MySQL scan using mysql_scanner. Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VMysqlScanNode Node::get_next scalar"); - } // Fill the next block by calling next() on the mysql_scanner, // converting text data in MySQL cells to binary data. 
Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/vec/exec/vschema_scan_node.h b/be/src/vec/exec/vschema_scan_node.h index acb725baa4..57117b23af 100644 --- a/be/src/vec/exec/vschema_scan_node.h +++ b/be/src/vec/exec/vschema_scan_node.h @@ -37,9 +37,6 @@ public: VSchemaScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); ~VSchemaScanNode(); Status prepare(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VSchemaScanNode Node::get_next scalar"); - } virtual Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Prepare conjuncts, create Schema columns to slots mapping diff --git a/be/src/vec/exec/vselect_node.cpp b/be/src/vec/exec/vselect_node.cpp index 469d72ad73..b8f10d57be 100644 --- a/be/src/vec/exec/vselect_node.cpp +++ b/be/src/vec/exec/vselect_node.cpp @@ -38,10 +38,6 @@ Status VSelectNode::open(RuntimeState* state) { return Status::OK(); } -Status VSelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VSelectNode::get_next."); -} - Status VSelectNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VSelectNode::get_next"); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/vec/exec/vselect_node.h b/be/src/vec/exec/vselect_node.h index afff5cb734..984e4578fb 100644 --- a/be/src/vec/exec/vselect_node.h +++ b/be/src/vec/exec/vselect_node.h @@ -27,7 +27,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; Status close(RuntimeState* state) override; Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h index 8b8d10e9e3..1e339e3a80 100644 --- a/be/src/vec/exec/vset_operation_node.h +++ b/be/src/vec/exec/vset_operation_node.h @@ -39,9 +39,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* /*state*/, RowBatch* /*row_batch*/, bool* /*eos*/) override { - return Status::NotSupported("Not implemented get RowBatch in vectorized execution."); - } Status get_next(RuntimeState* state, Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vsort_node.cpp b/be/src/vec/exec/vsort_node.cpp index 2596c0eb03..241c99c287 100644 --- a/be/src/vec/exec/vsort_node.cpp +++ b/be/src/vec/exec/vsort_node.cpp @@ -135,11 +135,6 @@ Status VSortNode::open(RuntimeState* state) { return Status::OK(); } -Status VSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - *eos = true; - return Status::NotSupported("Not Implemented VSortNode::get_next scalar"); -} - Status VSortNode::pull(doris::RuntimeState* state, vectorized::Block* output_block, bool* eos) { RETURN_IF_ERROR(_sorter->get_next(state, output_block, eos)); reached_limit(output_block, eos); diff --git a/be/src/vec/exec/vsort_node.h b/be/src/vec/exec/vsort_node.h 
index 0a29d08c54..ff7b692096 100644 --- a/be/src/vec/exec/vsort_node.h +++ b/be/src/vec/exec/vsort_node.h @@ -37,30 +37,28 @@ public: ~VSortNode() override = default; - virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; + Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; - virtual Status prepare(RuntimeState* state) override; + Status prepare(RuntimeState* state) override; - virtual Status alloc_resource(RuntimeState* state) override; + Status alloc_resource(RuntimeState* state) override; - virtual Status open(RuntimeState* state) override; + Status open(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; + Status get_next(RuntimeState* state, Block* block, bool* eos) override; - virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; + Status reset(RuntimeState* state) override; - virtual Status reset(RuntimeState* state) override; + Status close(RuntimeState* state) override; - virtual Status close(RuntimeState* state) override; + void release_resource(RuntimeState* state) override; - virtual void release_resource(RuntimeState* state) override; + Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; - virtual Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; - - virtual Status sink(RuntimeState* state, vectorized::Block* input_block, bool eos) override; + Status sink(RuntimeState* state, vectorized::Block* input_block, bool eos) override; protected: - virtual void debug_string(int indentation_level, std::stringstream* out) const override; + void debug_string(int indentation_level, std::stringstream* out) const override; private: // Number of rows to skip. 
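
Besides dropping the RowBatch overload, this vsort_node.h hunk strips the redundant virtual keyword from declarations already marked override: override by itself implies virtual dispatch and still turns any signature mismatch into a compile error. A small illustration:

    struct Base {
        virtual ~Base() = default;
        virtual void run(int x) = 0;
    };

    struct Derived final : Base {
        void run(int x) override;      // preferred: 'override' alone is enough
        // void run(long x) override;  // would fail to compile: overrides nothing
    };

    void Derived::run(int) {}
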
diff --git a/be/src/vec/exec/vunion_node.h b/be/src/vec/exec/vunion_node.h index ccc683452b..fa517df2ff 100644 --- a/be/src/vec/exec/vunion_node.h +++ b/be/src/vec/exec/vunion_node.h @@ -29,9 +29,6 @@ public: Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented get RowBatch in vecorized execution."); - } Status close(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; diff --git a/be/src/vec/runtime/vfile_result_writer.h b/be/src/vec/runtime/vfile_result_writer.h index 37f37ae159..0c305ef4dd 100644 --- a/be/src/vec/runtime/vfile_result_writer.h +++ b/be/src/vec/runtime/vfile_result_writer.h @@ -37,9 +37,6 @@ public: virtual ~VFileResultWriter() = default; Status append_block(Block& block) override; - Status append_row_batch(const RowBatch* batch) override { - return Status::NotSupported("append_row_batch is not supported in VFileResultWriter!"); - }; Status init(RuntimeState* state) override; Status close() override; diff --git a/be/src/vec/runtime/vsorted_run_merger.cpp b/be/src/vec/runtime/vsorted_run_merger.cpp index e2654e6b60..418b5f990d 100644 --- a/be/src/vec/runtime/vsorted_run_merger.cpp +++ b/be/src/vec/runtime/vsorted_run_merger.cpp @@ -21,7 +21,6 @@ #include "runtime/descriptors.h" #include "runtime/row_batch.h" -#include "runtime/sorter.h" #include "util/debug_util.h" #include "util/defer_op.h" #include "util/runtime_profile.h" diff --git a/be/src/vec/sink/vdata_stream_sender.cpp b/be/src/vec/sink/vdata_stream_sender.cpp index 30ce7be6ac..3b32023b6a 100644 --- a/be/src/vec/sink/vdata_stream_sender.cpp +++ b/be/src/vec/sink/vdata_stream_sender.cpp @@ -480,10 +480,6 @@ Status VDataStreamSender::open(RuntimeState* state) { return Status::OK(); } -Status VDataStreamSender::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented VOlapScanNode Node::get_next scalar"); -} - Status VDataStreamSender::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VDataStreamSender::send") SCOPED_TIMER(_profile->total_time_counter()); diff --git a/be/src/vec/sink/vdata_stream_sender.h b/be/src/vec/sink/vdata_stream_sender.h index 7cc13162ef..69cd1ecc9b 100644 --- a/be/src/vec/sink/vdata_stream_sender.h +++ b/be/src/vec/sink/vdata_stream_sender.h @@ -74,7 +74,6 @@ public: Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status send(RuntimeState* state, RowBatch* batch) override; Status send(RuntimeState* state, Block* block, bool eos = false) override; Status close(RuntimeState* state, Status exec_status) override; diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index e5f2c258e3..1beacdfaa8 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -413,10 +413,6 @@ int VMysqlResultWriter::_add_one_cell(const ColumnPtr& column_ptr, size_t row_id } } -Status VMysqlResultWriter::append_row_batch(const RowBatch* batch) { - return Status::RuntimeError("Not Implemented MysqlResultWriter::append_row_batch scalar"); -} - Status VMysqlResultWriter::append_block(Block& input_block) { SCOPED_TIMER(_append_row_batch_timer); Status status = Status::OK(); diff --git a/be/src/vec/sink/vmysql_result_writer.h 
b/be/src/vec/sink/vmysql_result_writer.h index e17d41b7fa..e566a30213 100644 --- a/be/src/vec/sink/vmysql_result_writer.h +++ b/be/src/vec/sink/vmysql_result_writer.h @@ -39,8 +39,6 @@ public: virtual Status init(RuntimeState* state) override; - virtual Status append_row_batch(const RowBatch* batch) override; - virtual Status append_block(Block& block) override; virtual bool can_sink() override; diff --git a/be/src/vec/sink/vmysql_table_writer.cpp b/be/src/vec/sink/vmysql_table_writer.cpp index f302513c7a..cbba836377 100644 --- a/be/src/vec/sink/vmysql_table_writer.cpp +++ b/be/src/vec/sink/vmysql_table_writer.cpp @@ -33,6 +33,14 @@ namespace doris { namespace vectorized { +std::string MysqlConnInfo::debug_string() const { + std::stringstream ss; + + ss << "(host=" << host << ",port=" << port << ",user=" << user << ",db=" << db + << ",passwd=" << passwd << ",charset=" << charset << ")"; + return ss.str(); +} + VMysqlTableWriter::VMysqlTableWriter(const std::vector& output_expr_ctxs) : _vec_output_expr_ctxs(output_expr_ctxs) {} diff --git a/be/src/vec/sink/vmysql_table_writer.h b/be/src/vec/sink/vmysql_table_writer.h index 6379896e63..5c494262c2 100644 --- a/be/src/vec/sink/vmysql_table_writer.h +++ b/be/src/vec/sink/vmysql_table_writer.h @@ -24,11 +24,21 @@ #include #include "common/status.h" -#include "runtime/mysql_table_writer.h" namespace doris { namespace vectorized { +struct MysqlConnInfo { + std::string host; + std::string user; + std::string passwd; + std::string db; + int port; + std::string charset; + + std::string debug_string() const; +}; + class VExprContext; class Block; class VMysqlTableWriter { diff --git a/be/src/vec/sink/vresult_file_sink.cpp b/be/src/vec/sink/vresult_file_sink.cpp index b63ebf160f..a25683e6da 100644 --- a/be/src/vec/sink/vresult_file_sink.cpp +++ b/be/src/vec/sink/vresult_file_sink.cpp @@ -137,10 +137,6 @@ Status VResultFileSink::open(RuntimeState* state) { return VExpr::open(_output_vexpr_ctxs, state); } -Status VResultFileSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented VResultFileSink Node::get_next scalar"); -} - Status VResultFileSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VResultFileSink::send"); RETURN_IF_ERROR(_writer->append_block(*block)); diff --git a/be/src/vec/sink/vresult_file_sink.h b/be/src/vec/sink/vresult_file_sink.h index 33d454f0bc..ba63cad517 100644 --- a/be/src/vec/sink/vresult_file_sink.h +++ b/be/src/vec/sink/vresult_file_sink.h @@ -40,7 +40,6 @@ public: Status open(RuntimeState* state) override; // send data in 'batch' to this backend stream mgr // Blocks until all rows in batch are placed in the buffer - Status send(RuntimeState* state, RowBatch* batch) override; Status send(RuntimeState* state, Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). 
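
With runtime/mysql_table_writer.h deleted, the MysqlConnInfo struct and its debug_string() helper now live with the vectorized writer, as the two hunks above show. A stand-alone, compilable rendering of that struct for reference (the field values in main are invented; note that debug_string() prints the password too):

    #include <iostream>
    #include <sstream>
    #include <string>

    struct MysqlConnInfo {
        std::string host;
        std::string user;
        std::string passwd;
        std::string db;
        int port = 0;
        std::string charset;

        std::string debug_string() const {
            std::stringstream ss;
            ss << "(host=" << host << ",port=" << port << ",user=" << user
               << ",db=" << db << ",passwd=" << passwd << ",charset=" << charset << ")";
            return ss.str();
        }
    };

    int main() {
        MysqlConnInfo info {"127.0.0.1", "root", "***", "demo_db", 3306, "utf8"};
        std::cout << info.debug_string() << "\n";
        // prints: (host=127.0.0.1,port=3306,user=root,db=demo_db,passwd=***,charset=utf8)
    }
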
diff --git a/be/src/vec/sink/vresult_sink.cpp b/be/src/vec/sink/vresult_sink.cpp index 2521636c6a..467fb6c82c 100644 --- a/be/src/vec/sink/vresult_sink.cpp +++ b/be/src/vec/sink/vresult_sink.cpp @@ -83,10 +83,6 @@ Status VResultSink::open(RuntimeState* state) { return VExpr::open(_output_vexpr_ctxs, state); } -Status VResultSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented Result Sink::send scalar"); -} - Status VResultSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VResultSink::send"); // The memory consumption in the process of sending the results is not check query memory limit. diff --git a/be/src/vec/sink/vresult_sink.h b/be/src/vec/sink/vresult_sink.h index 4b63c48f95..63441e3179 100644 --- a/be/src/vec/sink/vresult_sink.h +++ b/be/src/vec/sink/vresult_sink.h @@ -115,8 +115,6 @@ public: virtual Status prepare(RuntimeState* state) override; virtual Status open(RuntimeState* state) override; - // not implement - virtual Status send(RuntimeState* state, RowBatch* batch) override; virtual Status send(RuntimeState* state, Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). diff --git a/be/src/vec/sink/vtable_sink.cpp b/be/src/vec/sink/vtable_sink.cpp index 4bf4d64147..a09fb2cb5f 100644 --- a/be/src/vec/sink/vtable_sink.cpp +++ b/be/src/vec/sink/vtable_sink.cpp @@ -54,11 +54,6 @@ Status VTableSink::open(RuntimeState* state) { return Status::OK(); } -Status VTableSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported( - "Not Implemented VTableSink::send(RuntimeState* state, RowBatch* batch)"); -} - Status VTableSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VTableSink::send"); return Status::OK(); diff --git a/be/src/vec/sink/vtable_sink.h b/be/src/vec/sink/vtable_sink.h index 62d2e6fde6..339df25f5c 100644 --- a/be/src/vec/sink/vtable_sink.h +++ b/be/src/vec/sink/vtable_sink.h @@ -38,8 +38,6 @@ public: Status open(RuntimeState* state) override; - Status send(RuntimeState* state, RowBatch* batch) override; - Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). 
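
The vresult_sink and vtable_sink hunks repeat the interface narrowing applied to every sink in this patch: the RowBatch send() disappears, end-of-stream rides on the final Block-based send() (eos defaults to false), and a single close() flushes whatever is buffered, after which further send() calls are illegal. A minimal sketch of the remaining contract, with stand-in types rather than the actual Doris DataSink API:

    #include <vector>

    struct Status { bool ok = true; static Status OK() { return {}; } };
    struct Block { std::vector<int> rows; };
    struct RuntimeState;  // opaque here

    class SinkSketch {
    public:
        virtual ~SinkSketch() = default;
        virtual Status open(RuntimeState* state) = 0;
        // The only send() left; the last call carries eos = true.
        virtual Status send(RuntimeState* state, Block* block, bool eos = false) = 0;
        // Flushes buffered data; send() must not be called afterwards.
        virtual Status close(RuntimeState* state, Status exec_status) = 0;
    };
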
diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp
index ae799e46d9..13aa7661ea 100644
--- a/be/src/vec/sink/vtablet_sink.cpp
+++ b/be/src/vec/sink/vtablet_sink.cpp
@@ -17,12 +17,29 @@
 #include "vec/sink/vtablet_sink.h"
 
+#include
+#include
+#include
+#include
+
+#include "exec/tablet_info.h"
+#include "exprs/expr.h"
+#include "exprs/expr_context.h"
+#include "olap/hll.h"
+#include "runtime/exec_env.h"
+#include "runtime/row_batch.h"
+#include "runtime/runtime_state.h"
 #include "runtime/thread_context.h"
+#include "runtime/tuple_row.h"
+#include "service/backend_options.h"
 #include "util/brpc_client_cache.h"
 #include "util/debug/sanitizer_scopes.h"
+#include "util/defer_op.h"
 #include "util/doris_metrics.h"
 #include "util/proto_util.h"
+#include "util/threadpool.h"
 #include "util/time.h"
+#include "util/uid_util.h"
 #include "vec/columns/column_array.h"
 #include "vec/core/block.h"
 #include "vec/exprs/vexpr.h"
@@ -31,9 +48,131 @@ namespace doris {
 namespace stream_load {
 
-VNodeChannel::VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id)
-        : NodeChannel(parent, index_channel, node_id) {
-    _is_vectorized = true;
+Status IndexChannel::init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets) {
+    SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get());
+    for (auto& tablet : tablets) {
+        auto location = _parent->_location->find_tablet(tablet.tablet_id);
+        if (location == nullptr) {
+            LOG(WARNING) << "unknown tablet, tablet_id=" << tablet.tablet_id;
+            return Status::InternalError("unknown tablet");
+        }
+        std::vector<std::shared_ptr<VNodeChannel>> channels;
+        for (auto& node_id : location->node_ids) {
+            std::shared_ptr<VNodeChannel> channel;
+            auto it = _node_channels.find(node_id);
+            if (it == _node_channels.end()) {
+                // NodeChannel is not added to the _parent->_pool.
+                // Because the deconstruction of NodeChannel may take a long time to wait rpc finish.
+                // but the ObjectPool will hold a spin lock to delete objects.
+                channel = std::make_shared<VNodeChannel>(_parent, this, node_id);
+                _node_channels.emplace(node_id, channel);
+            } else {
+                channel = it->second;
+            }
+            channel->add_tablet(tablet);
+            if (_parent->_write_single_replica) {
+                auto slave_location = _parent->_slave_location->find_tablet(tablet.tablet_id);
+                if (slave_location != nullptr) {
+                    channel->add_slave_tablet_nodes(tablet.tablet_id, slave_location->node_ids);
+                }
+            }
+            channels.push_back(channel);
+            _tablets_by_channel[node_id].insert(tablet.tablet_id);
+        }
+        _channels_by_tablet.emplace(tablet.tablet_id, std::move(channels));
+    }
+    for (auto& it : _node_channels) {
+        RETURN_IF_ERROR(it.second->init(state));
+    }
+    return Status::OK();
+}
+
+void IndexChannel::mark_as_failed(int64_t node_id, const std::string& host, const std::string& err,
+                                  int64_t tablet_id) {
+    VLOG_PROGRESS << "mark node_id:" << node_id << " tablet_id: " << tablet_id
+                  << " as failed, err: " << err;
+    const auto& it = _tablets_by_channel.find(node_id);
+    if (it == _tablets_by_channel.end()) {
+        return;
+    }
+
+    {
+        std::lock_guard l(_fail_lock);
+        if (tablet_id == -1) {
+            for (const auto the_tablet_id : it->second) {
+                _failed_channels[the_tablet_id].insert(node_id);
+                _failed_channels_msgs.emplace(the_tablet_id, err + ", host: " + host);
+                if (_failed_channels[the_tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) {
+                    _intolerable_failure_status =
+                            Status::InternalError(_failed_channels_msgs[the_tablet_id]);
+                }
+            }
+        } else {
+            _failed_channels[tablet_id].insert(node_id);
+            _failed_channels_msgs.emplace(tablet_id, err + ", host: " + host);
+            if (_failed_channels[tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) {
+                _intolerable_failure_status =
+                        Status::InternalError(_failed_channels_msgs[tablet_id]);
+            }
+        }
+    }
+}
+
+Status IndexChannel::check_intolerable_failure() {
+    std::lock_guard l(_fail_lock);
+    return _intolerable_failure_status;
+}
+
+void IndexChannel::set_error_tablet_in_state(RuntimeState* state) {
+    std::vector<TErrorTabletInfo>& error_tablet_infos = state->error_tablet_infos();
+
+    std::lock_guard l(_fail_lock);
+    for (const auto& it : _failed_channels_msgs) {
+        TErrorTabletInfo error_info;
+        error_info.__set_tabletId(it.first);
+        error_info.__set_msg(it.second);
+        error_tablet_infos.emplace_back(error_info);
+    }
+}
+
+void IndexChannel::set_tablets_received_rows(
+        const std::vector<std::pair<int64_t, int64_t>>& tablets_received_rows, int64_t node_id) {
+    for (const auto& [tablet_id, rows_num] : tablets_received_rows) {
+        _tablets_received_rows[tablet_id].emplace_back(node_id, rows_num);
+    }
+}
+
+Status IndexChannel::check_tablet_received_rows_consistency() {
+    for (auto& tablet : _tablets_received_rows) {
+        for (size_t i = 0; i < tablet.second.size(); i++) {
+            VLOG_NOTICE << "check_tablet_received_rows_consistency, load_id: " << _parent->_load_id
+                        << ", txn_id: " << std::to_string(_parent->_txn_id)
+                        << ", tablet_id: " << tablet.first
+                        << ", node_id: " << tablet.second[i].first
+                        << ", rows_num: " << tablet.second[i].second;
+            if (i == 0) {
+                continue;
+            }
+            if (tablet.second[i].second != tablet.second[0].second) {
+                LOG(WARNING) << "rows num doesn't match, load_id: " << _parent->_load_id
+                             << ", txn_id: " << std::to_string(_parent->_txn_id)
+                             << ", tablet_id: " << tablet.first
+                             << ", node_id: " << tablet.second[i].first
+                             << ", rows_num: " << tablet.second[i].second
+                             << ", node_id: " << tablet.second[0].first
+                             << ", rows_num: " << tablet.second[0].second;
+                return Status::InternalError("rows num written by multi replicas doesn't match");
+            }
+        }
+    }
+    return Status::OK();
+}
+
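
The quorum arithmetic in mark_as_failed() above is the load's fault-tolerance rule: a tablet becomes an intolerable failure as soon as the replicas that failed on it reach a majority, i.e. failed_count >= (num_replicas + 1) / 2. A toy model of just that accounting (illustrative names, no locking):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>

    struct FailureTracker {
        int num_replicas;
        std::map<int64_t, std::set<int64_t>> failed_nodes_by_tablet;

        // Records a replica failure; returns true once the tablet has lost
        // a majority of its replicas and the load can no longer succeed.
        bool mark_as_failed(int64_t tablet_id, int64_t node_id) {
            auto& failed = failed_nodes_by_tablet[tablet_id];
            failed.insert(node_id);
            return failed.size() >= static_cast<size_t>((num_replicas + 1) / 2);
        }
    };

    int main() {
        FailureTracker t {3, {}};                        // 3 replicas: majority is 2
        std::cout << t.mark_as_failed(1001, 10) << "\n"; // 0: one failure is tolerated
        std::cout << t.mark_as_failed(1001, 11) << "\n"; // 1: second replica fails, intolerable
    }
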
+VNodeChannel::VNodeChannel(VOlapTableSink* parent, IndexChannel* index_channel, int64_t node_id) + : _parent(parent), _index_channel(index_channel), _node_id(node_id) { + _node_channel_tracker = std::make_shared(fmt::format( + "NodeChannel:indexID={}:threadId={}", std::to_string(_index_channel->_index_id), + thread_context()->get_thread_id())); } VNodeChannel::~VNodeChannel() { @@ -55,7 +194,34 @@ void VNodeChannel::clear_all_blocks() { // no need to set _cancel_msg because the error will be // returned directly via "TabletSink::prepare()" method. Status VNodeChannel::init(RuntimeState* state) { - RETURN_IF_ERROR(NodeChannel::init(state)); + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + _tuple_desc = _parent->_output_tuple_desc; + _state = state; + auto node = _parent->_nodes_info->find_node(_node_id); + if (node == nullptr) { + _cancelled = true; + return Status::InternalError("unknown node id, id={}", _node_id); + } + + _node_info = *node; + + _load_info = "load_id=" + print_id(_parent->_load_id) + + ", txn_id=" + std::to_string(_parent->_txn_id); + + _row_desc.reset(new RowDescriptor(_tuple_desc, false)); + _batch_size = state->batch_size(); + + _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, + _node_info.brpc_port); + if (_stub == nullptr) { + LOG(WARNING) << "Get rpc stub failed, host=" << _node_info.host + << ", port=" << _node_info.brpc_port << ", " << channel_info(); + _cancelled = true; + return Status::InternalError("get rpc stub failed"); + } + + _rpc_timeout_ms = state->query_options().query_timeout * 1000; + _timeout_watch.start(); _cur_mutable_block.reset(new vectorized::MutableBlock({_tuple_desc})); @@ -77,9 +243,67 @@ Status VNodeChannel::init(RuntimeState* state) { return Status::OK(); } +void VNodeChannel::open() { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + PTabletWriterOpenRequest request; + request.set_allocated_id(&_parent->_load_id); + request.set_index_id(_index_channel->_index_id); + request.set_txn_id(_parent->_txn_id); + request.set_allocated_schema(_parent->_schema->to_protobuf()); + for (auto& tablet : _all_tablets) { + auto ptablet = request.add_tablets(); + ptablet->set_partition_id(tablet.partition_id); + ptablet->set_tablet_id(tablet.tablet_id); + } + request.set_num_senders(_parent->_num_senders); + request.set_need_gen_rollup(false); // Useless but it is a required field in pb + request.set_load_mem_limit(_parent->_load_mem_limit); + request.set_load_channel_timeout_s(_parent->_load_channel_timeout_s); + request.set_is_high_priority(_parent->_is_high_priority); + request.set_sender_ip(BackendOptions::get_localhost()); + request.set_is_vectorized(true); + + _open_closure = new RefCountClosure(); + _open_closure->ref(); + + // This ref is for RPC's reference + _open_closure->ref(); + _open_closure->cntl.set_timeout_ms(config::tablet_writer_open_rpc_timeout_sec * 1000); + if (config::tablet_writer_ignore_eovercrowded) { + _open_closure->cntl.ignore_eovercrowded(); + } + _stub->tablet_writer_open(&_open_closure->cntl, &request, &_open_closure->result, + _open_closure); + request.release_id(); + request.release_schema(); +} + Status VNodeChannel::open_wait() { - Status status = NodeChannel::open_wait(); + _open_closure->join(); + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + if (_open_closure->cntl.Failed()) { + if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( + _stub, _node_info.host, _node_info.brpc_port)) { + 
ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( + _open_closure->cntl.remote_side()); + } + std::stringstream ss; + ss << "failed to open tablet writer, error=" << berror(_open_closure->cntl.ErrorCode()) + << ", error_text=" << _open_closure->cntl.ErrorText(); + _cancelled = true; + LOG(WARNING) << ss.str() << " " << channel_info(); + return Status::InternalError("failed to open tablet writer, error={}, error_text={}", + berror(_open_closure->cntl.ErrorCode()), + _open_closure->cntl.ErrorText()); + } + Status status(_open_closure->result.status()); + if (_open_closure->unref()) { + delete _open_closure; + } + _open_closure = nullptr; + if (!status.ok()) { + _cancelled = true; return status; } @@ -184,7 +408,7 @@ Status VNodeChannel::add_block(vectorized::Block* block, auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); + std::lock_guard l(_cancel_msg_lock); return Status::InternalError("add row failed. {}", _cancel_msg); } else { return std::move(st.prepend("already stopped, can't add row. cancelled/eos: ")); @@ -257,6 +481,33 @@ int VNodeChannel::try_send_and_fetch_status(RuntimeState* state, return _send_finished ? 0 : 1; } +void VNodeChannel::_cancel_with_msg(const std::string& msg) { + LOG(WARNING) << "cancel node channel " << channel_info() << ", error message: " << msg; + { + std::lock_guard l(_cancel_msg_lock); + if (_cancel_msg == "") { + _cancel_msg = msg; + } + } + _cancelled = true; +} + +Status VNodeChannel::none_of(std::initializer_list vars) { + bool none = std::none_of(vars.begin(), vars.end(), [](bool var) { return var; }); + Status st = Status::OK(); + if (!none) { + std::string vars_str; + std::for_each(vars.begin(), vars.end(), + [&vars_str](bool var) -> void { vars_str += (var ? "1/" : "0/"); }); + if (!vars_str.empty()) { + vars_str.pop_back(); // 0/1/0/ -> 0/1/0 + } + st = Status::InternalError(vars_str); + } + + return st; +} + void VNodeChannel::try_send_block(RuntimeState* state) { SCOPED_ATTACH_TASK(state); SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker); @@ -384,6 +635,84 @@ void VNodeChannel::try_send_block(RuntimeState* state) { _next_packet_seq++; } +void VNodeChannel::cancel(const std::string& cancel_msg) { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + // set _is_closed to true finally + Defer set_closed {[&]() { + std::lock_guard l(_closed_lock); + _is_closed = true; + }}; + // we don't need to wait last rpc finished, cause closure's release/reset will join. + // But do we need brpc::StartCancel(call_id)? 
+ _cancel_with_msg(cancel_msg); + + PTabletWriterCancelRequest request; + request.set_allocated_id(&_parent->_load_id); + request.set_index_id(_index_channel->_index_id); + request.set_sender_id(_parent->_sender_id); + + auto closure = new RefCountClosure(); + + closure->ref(); + int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; + if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { + remain_ms = config::min_load_rpc_timeout_ms; + } + closure->cntl.set_timeout_ms(remain_ms); + if (config::tablet_writer_ignore_eovercrowded) { + closure->cntl.ignore_eovercrowded(); + } + _stub->tablet_writer_cancel(&closure->cntl, &request, &closure->result, closure); + request.release_id(); +} + +Status VNodeChannel::close_wait(RuntimeState* state) { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + // set _is_closed to true finally + Defer set_closed {[&]() { + std::lock_guard l(_closed_lock); + _is_closed = true; + }}; + + auto st = none_of({_cancelled, !_eos_is_produced}); + if (!st.ok()) { + if (_cancelled) { + std::lock_guard l(_cancel_msg_lock); + return Status::InternalError("wait close failed. {}", _cancel_msg); + } else { + return std::move( + st.prepend("already stopped, skip waiting for close. cancelled/!eos: ")); + } + } + + // waiting for finished, it may take a long time, so we couldn't set a timeout + while (!_add_batches_finished && !_cancelled) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + _close_time_ms = UnixMillis() - _close_time_ms; + + if (_add_batches_finished) { + _close_check(); + state->tablet_commit_infos().insert(state->tablet_commit_infos().end(), + std::make_move_iterator(_tablet_commit_infos.begin()), + std::make_move_iterator(_tablet_commit_infos.end())); + + _index_channel->set_error_tablet_in_state(state); + _index_channel->set_tablets_received_rows(_tablets_received_rows, _node_id); + return Status::OK(); + } + + std::stringstream ss; + ss << "close wait failed coz rpc error"; + { + std::lock_guard l(_cancel_msg_lock); + if (_cancel_msg != "") { + ss << ". " << _cancel_msg; + } + } + return Status::InternalError(ss.str()); +} + void VNodeChannel::_close_check() { std::lock_guard lg(_pending_batches_lock); CHECK(_pending_blocks.empty()) << name(); @@ -413,11 +742,14 @@ void VNodeChannel::mark_close() { VOlapTableSink::VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& texprs, Status* status) - : OlapTableSink(pool, row_desc, texprs, status) { - _is_vectorized = true; + : _pool(pool), + _input_row_desc(row_desc), + _filter_bitmap(1024), + _stop_background_threads_latch(1) { // From the thrift expressions create the real exprs. - vectorized::VExpr::create_expr_trees(pool, texprs, &_output_vexpr_ctxs); + *status = vectorized::VExpr::create_expr_trees(pool, texprs, &_output_vexpr_ctxs); _name = "VOlapTableSink"; + _transfer_large_data_by_brpc = config::transfer_large_data_by_brpc; } VOlapTableSink::~VOlapTableSink() { @@ -426,18 +758,150 @@ VOlapTableSink::~VOlapTableSink() { // But their destructions are after OlapTableSink's. 
for (const auto& index_channel : _channels) { index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->clear_all_blocks(); }); + [](const std::shared_ptr& ch) { ch->clear_all_blocks(); }); } } -Status VOlapTableSink::init(const TDataSink& sink) { - RETURN_IF_ERROR(OlapTableSink::init(sink)); - _vpartition = _pool->add(new VOlapTablePartitionParam(_schema, sink.olap_table_sink.partition)); +Status VOlapTableSink::init(const TDataSink& t_sink) { + DCHECK(t_sink.__isset.olap_table_sink); + auto& table_sink = t_sink.olap_table_sink; + _load_id.set_hi(table_sink.load_id.hi); + _load_id.set_lo(table_sink.load_id.lo); + _txn_id = table_sink.txn_id; + _num_replicas = table_sink.num_replicas; + _tuple_desc_id = table_sink.tuple_id; + _schema.reset(new OlapTableSchemaParam()); + RETURN_IF_ERROR(_schema->init(table_sink.schema)); + _partition = _pool->add(new OlapTablePartitionParam(_schema, table_sink.partition)); + RETURN_IF_ERROR(_partition->init()); + _location = _pool->add(new OlapTableLocationParam(table_sink.location)); + _nodes_info = _pool->add(new DorisNodesInfo(table_sink.nodes_info)); + if (table_sink.__isset.write_single_replica && table_sink.write_single_replica) { + _write_single_replica = true; + _slave_location = _pool->add(new OlapTableLocationParam(table_sink.slave_location)); + if (!config::enable_single_replica_load) { + return Status::InternalError("single replica load is disabled on BE."); + } + } + + if (table_sink.__isset.load_channel_timeout_s) { + _load_channel_timeout_s = table_sink.load_channel_timeout_s; + } else { + _load_channel_timeout_s = config::streaming_load_rpc_max_alive_time_sec; + } + if (table_sink.__isset.send_batch_parallelism && table_sink.send_batch_parallelism > 1) { + _send_batch_parallelism = table_sink.send_batch_parallelism; + } + // if distributed column list is empty, we can ensure that tablet is with random distribution info + // and if load_to_single_tablet is set and set to true, we should find only one tablet in one partition + // for the whole olap table sink + if (table_sink.partition.distributed_columns.empty()) { + if (table_sink.__isset.load_to_single_tablet && table_sink.load_to_single_tablet) { + findTabletMode = FindTabletMode::FIND_TABLET_EVERY_SINK; + } else { + findTabletMode = FindTabletMode::FIND_TABLET_EVERY_BATCH; + } + } + _vpartition = _pool->add( + new doris::VOlapTablePartitionParam(_schema, t_sink.olap_table_sink.partition)); return _vpartition->init(); } Status VOlapTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(OlapTableSink::prepare(state)); + RETURN_IF_ERROR(DataSink::prepare(state)); + + _sender_id = state->per_fragment_instance_idx(); + _num_senders = state->num_per_fragment_instances(); + _is_high_priority = (state->query_options().query_timeout <= + config::load_task_high_priority_threshold_second); + + // profile must add to state's object pool + _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); + _mem_tracker = + std::make_shared("OlapTableSink:" + std::to_string(state->load_job_id())); + SCOPED_TIMER(_profile->total_time_counter()); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); + + // get table's tuple descriptor + _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); + if (_output_tuple_desc == nullptr) { + LOG(WARNING) << "unknown destination tuple descriptor, id=" << _tuple_desc_id; + return Status::InternalError("unknown destination tuple descriptor"); + } + + _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, 
false)); + + _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); + _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); + // check if need validate batch + for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { + auto slot = _output_tuple_desc->slots()[i]; + switch (slot->type().type) { + // For DECIMAL32,DECIMAL64,DECIMAL128, we have done precision and scale conversion so just + // skip data validation here. + case TYPE_DECIMALV2: + _max_decimalv2_val[i].to_max_decimal(slot->type().precision, slot->type().scale); + _min_decimalv2_val[i].to_min_decimal(slot->type().precision, slot->type().scale); + _need_validate_data = true; + break; + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_DATE: + case TYPE_DATETIME: + case TYPE_DATEV2: + case TYPE_DATETIMEV2: + case TYPE_HLL: + case TYPE_OBJECT: + case TYPE_STRING: + case TYPE_ARRAY: + _need_validate_data = true; + break; + default: + break; + } + } + + // add all counter + _input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT); + _output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT); + _filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT); + _send_data_timer = ADD_TIMER(_profile, "SendDataTime"); + _wait_mem_limit_timer = ADD_CHILD_TIMER(_profile, "WaitMemLimitTime", "SendDataTime"); + _validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime"); + _open_timer = ADD_TIMER(_profile, "OpenTime"); + _close_timer = ADD_TIMER(_profile, "CloseWaitTime"); + _non_blocking_send_timer = ADD_TIMER(_profile, "NonBlockingSendTime"); + _non_blocking_send_work_timer = + ADD_CHILD_TIMER(_profile, "NonBlockingSendWorkTime", "NonBlockingSendTime"); + _serialize_batch_timer = + ADD_CHILD_TIMER(_profile, "SerializeBatchTime", "NonBlockingSendWorkTime"); + _total_add_batch_exec_timer = ADD_TIMER(_profile, "TotalAddBatchExecTime"); + _max_add_batch_exec_timer = ADD_TIMER(_profile, "MaxAddBatchExecTime"); + _add_batch_number = ADD_COUNTER(_profile, "NumberBatchAdded", TUnit::UNIT); + _num_node_channels = ADD_COUNTER(_profile, "NumberNodeChannels", TUnit::UNIT); + _load_mem_limit = state->get_load_mem_limit(); + + // open all channels + const auto& partitions = _partition->get_partitions(); + for (int i = 0; i < _schema->indexes().size(); ++i) { + // collect all tablets belong to this rollup + std::vector tablets; + auto index = _schema->indexes()[i]; + for (const auto& part : partitions) { + for (const auto& tablet : part->indexes[i].tablets) { + TTabletWithPartition tablet_with_partition; + tablet_with_partition.partition_id = part->id; + tablet_with_partition.tablet_id = tablet; + tablets.emplace_back(std::move(tablet_with_partition)); + } + } + if (UNLIKELY(tablets.empty())) { + LOG(WARNING) << "load job:" << state->load_job_id() << " index: " << index->index_id + << " would open 0 tablet"; + } + _channels.emplace_back(new IndexChannel(this, index->index_id)); + RETURN_IF_ERROR(_channels.back()->init(state, tablets)); + } // Prepare the exprs to run. RETURN_IF_ERROR(vectorized::VExpr::prepare(_output_vexpr_ctxs, state, _input_row_desc)); return Status::OK(); @@ -447,7 +911,65 @@ Status VOlapTableSink::open(RuntimeState* state) { START_AND_SCOPE_SPAN(state->get_tracer(), span, "VOlapTableSink::open"); // Prepare the exprs to run. 
RETURN_IF_ERROR(vectorized::VExpr::open(_output_vexpr_ctxs, state)); - return OlapTableSink::open(state); + SCOPED_TIMER(_profile->total_time_counter()); + SCOPED_TIMER(_open_timer); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); + + for (auto index_channel : _channels) { + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->open(); }); + } + + for (auto index_channel : _channels) { + index_channel->for_each_node_channel([&index_channel]( + const std::shared_ptr& ch) { + auto st = ch->open_wait(); + if (!st.ok()) { + // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. + // This phase will not fail due to a single tablet. + // Therefore, if the open() phase fails, all tablets corresponding to the node need to be marked as failed. + index_channel->mark_as_failed( + ch->node_id(), ch->host(), + fmt::format("{}, open failed, err: {}", ch->channel_info(), st.to_string()), + -1); + } + }); + + RETURN_IF_ERROR(index_channel->check_intolerable_failure()); + } + int32_t send_batch_parallelism = + MIN(_send_batch_parallelism, config::max_send_batch_parallelism_per_job); + _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( + ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); + RETURN_IF_ERROR(Thread::create( + "OlapTableSink", "send_batch_process", + [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); + + return Status::OK(); +} + +void VOlapTableSink::_send_batch_process(RuntimeState* state) { + SCOPED_TIMER(_non_blocking_send_timer); + SCOPED_ATTACH_TASK(state); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); + do { + int running_channels_num = 0; + for (auto index_channel : _channels) { + index_channel->for_each_node_channel([&running_channels_num, this, + state](const std::shared_ptr& ch) { + running_channels_num += + ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); + }); + } + + if (running_channels_num == 0) { + LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " + "sender thread exit. 
" + << print_id(_load_id); + return; + } + } while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::olap_table_sink_send_interval_ms))); } size_t VOlapTableSink::get_pending_bytes() const { @@ -552,81 +1074,51 @@ Status VOlapTableSink::send(RuntimeState* state, vectorized::Block* input_block, _partition_to_tablet_map.clear(); } - bool use_vec = _is_vectorized && state->be_exec_version() > 0; - if (use_vec) { - std::vector, std::vector>>> - channel_to_payload; - channel_to_payload.resize(_channels.size()); - for (int i = 0; i < num_rows; ++i) { - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const VOlapTablePartition* partition = nullptr; - uint32_t tablet_index = 0; - bool is_continue = false; - RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, - is_continue)); - if (is_continue) { - continue; - } - for (int j = 0; j < partition->indexes.size(); ++j) { - auto tid = partition->indexes[j].tablets[tablet_index]; - auto it = _channels[j]->_channels_by_tablet.find(tid); - DCHECK(it != _channels[j]->_channels_by_tablet.end()) - << "unknown tablet, tablet_id=" << tablet_index; - for (const auto& channel : it->second) { - if (channel_to_payload[j].count(channel.get()) < 1) { - channel_to_payload[j].insert( - {channel.get(), - std::pair, - std::vector> { - std::unique_ptr( - new vectorized::IColumn::Selector()), - std::vector()}}); - } - channel_to_payload[j][channel.get()].first->push_back(i); - channel_to_payload[j][channel.get()].second.push_back(tid); + std::vector, std::vector>>> + channel_to_payload; + channel_to_payload.resize(_channels.size()); + for (int i = 0; i < num_rows; ++i) { + if (filtered_rows > 0 && _filter_bitmap.Get(i)) { + continue; + } + const VOlapTablePartition* partition = nullptr; + uint32_t tablet_index = 0; + bool is_continue = false; + RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, + is_continue)); + if (is_continue) { + continue; + } + for (int j = 0; j < partition->indexes.size(); ++j) { + auto tid = partition->indexes[j].tablets[tablet_index]; + auto it = _channels[j]->_channels_by_tablet.find(tid); + DCHECK(it != _channels[j]->_channels_by_tablet.end()) + << "unknown tablet, tablet_id=" << tablet_index; + for (const auto& channel : it->second) { + if (channel_to_payload[j].count(channel.get()) < 1) { + channel_to_payload[j].insert( + {channel.get(), + std::pair, + std::vector> { + std::unique_ptr( + new vectorized::IColumn::Selector()), + std::vector()}}); } - _number_output_rows++; + channel_to_payload[j][channel.get()].first->push_back(i); + channel_to_payload[j][channel.get()].second.push_back(tid); } + _number_output_rows++; } - for (size_t i = 0; i < _channels.size(); i++) { - for (const auto& entry : channel_to_payload[i]) { - // if this node channel is already failed, this add_row will be skipped - auto st = entry.first->add_block(&block, entry.second); - if (!st.ok()) { - _channels[i]->mark_as_failed(entry.first->node_id(), entry.first->host(), - st.to_string()); - } - } - } - } else { - size_t MAX_PENDING_BYTES = _load_mem_limit / 3; - while (get_pending_bytes() > MAX_PENDING_BYTES && !state->is_cancelled()) { - std::this_thread::sleep_for(std::chrono::microseconds(100)); - } - - for (int i = 0; i < num_rows; ++i) { - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const VOlapTablePartition* partition = nullptr; - uint32_t tablet_index = 0; - BlockRow block_row; - block_row = {&block, i}; - bool is_continue 
= false; - RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, - is_continue)); - if (is_continue) { - continue; - } - - for (int j = 0; j < partition->indexes.size(); ++j) { - int64_t tablet_id = partition->indexes[j].tablets[tablet_index]; - _channels[j]->add_row(block_row, tablet_id); - _number_output_rows++; + } + for (size_t i = 0; i < _channels.size(); i++) { + for (const auto& entry : channel_to_payload[i]) { + // if this node channel is already failed, this add_row will be skipped + auto st = entry.first->add_block(&block, entry.second); + if (!st.ok()) { + _channels[i]->mark_as_failed(entry.first->node_id(), entry.first->host(), + st.to_string()); } } } @@ -642,7 +1134,124 @@ Status VOlapTableSink::close(RuntimeState* state, Status exec_status) { if (_closed) return _close_status; START_AND_SCOPE_SPAN(state->get_tracer(), span, "VOlapTableSink::close"); vectorized::VExpr::close(_output_vexpr_ctxs, state); - return OlapTableSink::close(state, exec_status); + Status status = exec_status; + if (status.ok()) { + // only if status is ok can we call this _profile->total_time_counter(). + // if status is not ok, this sink may not be prepared, so that _profile is null + SCOPED_TIMER(_profile->total_time_counter()); + // BE id -> add_batch method counter + std::unordered_map node_add_batch_counter_map; + int64_t serialize_batch_ns = 0, mem_exceeded_block_ns = 0, queue_push_lock_ns = 0, + actual_consume_ns = 0, total_add_batch_exec_time_ns = 0, + max_add_batch_exec_time_ns = 0, total_add_batch_num = 0, num_node_channels = 0; + { + SCOPED_TIMER(_close_timer); + for (auto index_channel : _channels) { + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->mark_close(); }); + num_node_channels += index_channel->num_node_channels(); + } + + for (auto index_channel : _channels) { + int64_t add_batch_exec_time = 0; + index_channel->for_each_node_channel( + [&index_channel, &state, &node_add_batch_counter_map, &serialize_batch_ns, + &mem_exceeded_block_ns, &queue_push_lock_ns, &actual_consume_ns, + &total_add_batch_exec_time_ns, &add_batch_exec_time, + &total_add_batch_num](const std::shared_ptr& ch) { + auto s = ch->close_wait(state); + if (!s.ok()) { + auto err_msg = s.to_string(); + index_channel->mark_as_failed(ch->node_id(), ch->host(), err_msg, + -1); + // cancel the node channel in best effort + ch->cancel(err_msg); + LOG(WARNING) << ch->channel_info() + << ", close channel failed, err: " << err_msg; + } + ch->time_report(&node_add_batch_counter_map, &serialize_batch_ns, + &mem_exceeded_block_ns, &queue_push_lock_ns, + &actual_consume_ns, &total_add_batch_exec_time_ns, + &add_batch_exec_time, &total_add_batch_num); + }); + + if (add_batch_exec_time > max_add_batch_exec_time_ns) { + max_add_batch_exec_time_ns = add_batch_exec_time; + } + + // check if index has intolerable failure + Status index_st = index_channel->check_intolerable_failure(); + if (!index_st.ok()) { + status = index_st; + } else if (Status st = index_channel->check_tablet_received_rows_consistency(); + !st.ok()) { + status = st; + } + } // end for index channels + } + // TODO need to be improved + LOG(INFO) << "total mem_exceeded_block_ns=" << mem_exceeded_block_ns + << ", total queue_push_lock_ns=" << queue_push_lock_ns + << ", total actual_consume_ns=" << actual_consume_ns + << ", load id=" << print_id(_load_id); + + COUNTER_SET(_input_rows_counter, _number_input_rows); + COUNTER_SET(_output_rows_counter, _number_output_rows); + COUNTER_SET(_filtered_rows_counter, 
_number_filtered_rows); + COUNTER_SET(_send_data_timer, _send_data_ns); + COUNTER_SET(_wait_mem_limit_timer, mem_exceeded_block_ns); + COUNTER_SET(_validate_data_timer, _validate_data_ns); + COUNTER_SET(_serialize_batch_timer, serialize_batch_ns); + COUNTER_SET(_non_blocking_send_work_timer, actual_consume_ns); + COUNTER_SET(_total_add_batch_exec_timer, total_add_batch_exec_time_ns); + COUNTER_SET(_max_add_batch_exec_timer, max_add_batch_exec_time_ns); + COUNTER_SET(_add_batch_number, total_add_batch_num); + COUNTER_SET(_num_node_channels, num_node_channels); + // _number_input_rows don't contain num_rows_load_filtered and num_rows_load_unselected in scan node + int64_t num_rows_load_total = _number_input_rows + state->num_rows_load_filtered() + + state->num_rows_load_unselected(); + state->set_num_rows_load_total(num_rows_load_total); + state->update_num_rows_load_filtered(_number_filtered_rows); + + // print log of add batch time of all node, for tracing load performance easily + std::stringstream ss; + ss << "finished to close olap table sink. load_id=" << print_id(_load_id) + << ", txn_id=" << _txn_id + << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; + for (auto const& pair : node_add_batch_counter_map) { + ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) + << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" + << pair.second.close_wait_time_ms << ")(" << pair.second.add_batch_num << ")} "; + } + LOG(INFO) << ss.str(); + } else { + for (auto channel : _channels) { + channel->for_each_node_channel([&status](const std::shared_ptr& ch) { + ch->cancel(status.to_string()); + }); + } + LOG(INFO) << "finished to close olap table sink. load_id=" << print_id(_load_id) + << ", txn_id=" << _txn_id + << ", canceled all node channels due to error: " << status; + } + + // Sender join() must put after node channels mark_close/cancel. + // But there is no specific sequence required between sender join() & close_wait(). + _stop_background_threads_latch.count_down(); + if (_sender_thread) { + _sender_thread->join(); + // We have to wait all task in _send_batch_thread_pool_token finished, + // because it is difficult to handle concurrent problem if we just + // shutdown it. + _send_batch_thread_pool_token->wait(); + } + + Expr::close(_output_expr_ctxs, state); + _output_batch.reset(); + + _close_status = status; + DataSink::close(state, exec_status); + return status; } Status VOlapTableSink::_validate_column(RuntimeState* state, const TypeDescriptor& type, diff --git a/be/src/vec/sink/vtablet_sink.h b/be/src/vec/sink/vtablet_sink.h index 8f7697f409..097bb1e6b7 100644 --- a/be/src/vec/sink/vtablet_sink.h +++ b/be/src/vec/sink/vtablet_sink.h @@ -16,10 +16,31 @@ // under the License. 
 #pragma once
 
+#include
-#include "exec/tablet_sink.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/object_pool.h"
+#include "common/status.h"
+#include "exec/data_sink.h"
+#include "exec/tablet_info.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/internal_service.pb.h"
 #include "runtime/row_batch.h"
+#include "runtime/thread_context.h"
+#include "util/bitmap.h"
+#include "util/countdown_latch.h"
+#include "util/ref_count_closure.h"
+#include "util/spinlock.h"
+#include "util/thread.h"
 #include "vec/columns/column.h"
+#include "vec/core/block.h"
 
 namespace doris {
 
@@ -29,36 +50,265 @@ class VExprContext;
 
 namespace stream_load {
 
-class VNodeChannel : public NodeChannel {
+// The counters of the add_batch rpc of a single node
+struct AddBatchCounter {
+    // total execution time of an add_batch rpc
+    int64_t add_batch_execution_time_us = 0;
+    // lock waiting time in an add_batch rpc
+    int64_t add_batch_wait_execution_time_us = 0;
+    // number of add_batch calls
+    int64_t add_batch_num = 0;
+    // time passed between close being marked and close finishing
+    int64_t close_wait_time_ms = 0;
+
+    AddBatchCounter& operator+=(const AddBatchCounter& rhs) {
+        add_batch_execution_time_us += rhs.add_batch_execution_time_us;
+        add_batch_wait_execution_time_us += rhs.add_batch_wait_execution_time_us;
+        add_batch_num += rhs.add_batch_num;
+        close_wait_time_ms += rhs.close_wait_time_ms;
+        return *this;
+    }
+    friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) {
+        AddBatchCounter sum = lhs;
+        sum += rhs;
+        return sum;
+    }
+};
+
+// It's very error-prone to guarantee the destruction order of the handlers' captured vars
+// and of this closure. So using create() to get the closure pointer is recommended; we can
+// then delete the closure pointer before the captured vars are destroyed.
+// Deleting this pointer is safe: the RPC callback will not run after the ReusableClosure
+// has been deleted. (A standalone sketch of this lifecycle is appended at the end of this patch.)
+template <typename T>
+class ReusableClosure final : public google::protobuf::Closure {
 public:
-    VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id);
+    ReusableClosure() : cid(INVALID_BTHREAD_ID) {}
+    ~ReusableClosure() override {
+        // don't delete while Run() is executing or about to be called; wait for the current Run() to finish.
+        join();
+        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker());
+        cntl.Reset();
+    }
 
-    ~VNodeChannel() override;
+    static ReusableClosure* create() { return new ReusableClosure(); }
 
-    Status init(RuntimeState* state) override;
+    void addFailedHandler(const std::function<void(bool)>& fn) { failed_handler = fn; }
+    void addSuccessHandler(const std::function<void(const T&, bool)>& fn) { success_handler = fn; }
 
-    Status open_wait() override;
+    void join() {
+        // We rely on _packet_in_flight to assure that one rpc is running,
+        // while cid is not reliable due to memory ordering:
+        // _packet_in_flight is written before the call id is fetched,
+        // so we can not use a memory fence to synchronize.
+        while (_packet_in_flight) {
+            // cid here is complicated
+            if (cid != INVALID_BTHREAD_ID) {
+                // actually cid may be the last rpc call id.
+ brpc::Join(cid); + } + if (_packet_in_flight) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } + } + + // plz follow this order: reset() -> set_in_flight() -> send brpc batch + void reset() { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); + cntl.Reset(); + cid = cntl.call_id(); + } + + bool try_set_in_flight() { + bool value = false; + return _packet_in_flight.compare_exchange_strong(value, true); + } + + void clear_in_flight() { _packet_in_flight = false; } + + bool is_packet_in_flight() { return _packet_in_flight; } + + void end_mark() { + DCHECK(_is_last_rpc == false); + _is_last_rpc = true; + } + + void Run() override { + DCHECK(_packet_in_flight); + if (cntl.Failed()) { + LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl.ErrorCode()) + << ", error_text=" << cntl.ErrorText(); + failed_handler(_is_last_rpc); + } else { + success_handler(result, _is_last_rpc); + } + clear_in_flight(); + } + + brpc::Controller cntl; + T result; + +private: + brpc::CallId cid; + std::atomic _packet_in_flight {false}; + std::atomic _is_last_rpc {false}; + std::function failed_handler; + std::function success_handler; +}; + +class IndexChannel; +class VOlapTableSink; + +class VNodeChannel { +public: + VNodeChannel(VOlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); + + ~VNodeChannel(); + + // called before open, used to add tablet located in this backend + void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); } + + void add_slave_tablet_nodes(int64_t tablet_id, const std::vector& slave_nodes) { + _slave_tablet_nodes[tablet_id] = slave_nodes; + } + + void open(); + + Status init(RuntimeState* state); + + Status open_wait(); Status add_block(vectorized::Block* block, const std::pair, - std::vector>& payload) override; + std::vector>& payload); int try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token) override; + std::unique_ptr& thread_pool_token); void try_send_block(RuntimeState* state); - void clear_all_blocks() override; + void clear_all_blocks(); // two ways to stop channel: // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. // 2. just cancel() - void mark_close() override; + void mark_close(); + + // two ways to stop channel: + // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. + // 2. 
just cancel() + Status close_wait(RuntimeState* state); + + void cancel(const std::string& cancel_msg); + + void time_report(std::unordered_map* add_batch_counter_map, + int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, + int64_t* queue_push_lock_ns, int64_t* actual_consume_ns, + int64_t* total_add_batch_exec_time_ns, int64_t* add_batch_exec_time_ns, + int64_t* total_add_batch_num) const { + (*add_batch_counter_map)[_node_id] += _add_batch_counter; + (*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms; + *serialize_batch_ns += _serialize_batch_ns; + *mem_exceeded_block_ns += _mem_exceeded_block_ns; + *queue_push_lock_ns += _queue_push_lock_ns; + *actual_consume_ns += _actual_consume_ns; + *add_batch_exec_time_ns = (_add_batch_counter.add_batch_execution_time_us * 1000); + *total_add_batch_exec_time_ns += *add_batch_exec_time_ns; + *total_add_batch_num += _add_batch_counter.add_batch_num; + } + + int64_t node_id() const { return _node_id; } + std::string host() const { return _node_info.host; } + std::string name() const { return _name; } + + Status none_of(std::initializer_list vars); + + std::string channel_info() const { + return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host, + _node_info.brpc_port); + } + + size_t get_pending_bytes() { return _pending_batches_bytes; } protected: - void _close_check() override; + void _close_check(); + void _cancel_with_msg(const std::string& msg); + + VOlapTableSink* _parent = nullptr; + IndexChannel* _index_channel = nullptr; + int64_t _node_id = -1; + std::string _load_info; + std::string _name; + + std::shared_ptr _node_channel_tracker; + + TupleDescriptor* _tuple_desc = nullptr; + NodeInfo _node_info; + + // this should be set in init() using config + int _rpc_timeout_ms = 60000; + int64_t _next_packet_seq = 0; + MonotonicStopWatch _timeout_watch; + + // the timestamp when this node channel be marked closed and finished closed + uint64_t _close_time_ms = 0; + + // user cancel or get some errors + std::atomic _cancelled {false}; + doris::SpinLock _cancel_msg_lock; + std::string _cancel_msg; + + // send finished means the consumer thread which send the rpc can exit + std::atomic _send_finished {false}; + + // add batches finished means the last rpc has be response, used to check whether this channel can be closed + std::atomic _add_batches_finished {false}; // reuse for vectorized + + bool _eos_is_produced {false}; // only for restricting producer behaviors + + std::unique_ptr _row_desc; + int _batch_size = 0; + + // limit _pending_batches size + std::atomic _pending_batches_bytes {0}; + size_t _max_pending_batches_bytes {(size_t)config::nodechannel_pending_queue_max_bytes}; + std::mutex _pending_batches_lock; // reuse for vectorized + std::atomic _pending_batches_num {0}; // reuse for vectorized + + std::shared_ptr _stub = nullptr; + RefCountClosure* _open_closure = nullptr; + + std::vector _all_tablets; + // map from tablet_id to node_id where slave replicas locate in + std::unordered_map> _slave_tablet_nodes; + std::vector _tablet_commit_infos; + + AddBatchCounter _add_batch_counter; + std::atomic _serialize_batch_ns {0}; + std::atomic _mem_exceeded_block_ns {0}; + std::atomic _queue_push_lock_ns {0}; + std::atomic _actual_consume_ns {0}; + + // lock to protect _is_closed. + // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel. + // However, this rpc callback may occur after the whole task is finished (e.g. 
due to network latency),
+    // and by that time the IndexChannel may have been destructed, so we should not call the
+    // IndexChannel methods anymore, otherwise the BE will crash.
+    // Therefore, we use _is_closed and _closed_lock to ensure that the RPC callback
+    // function will not call any IndexChannel method after the NodeChannel is closed.
+    // The IndexChannel is definitely accessible until the NodeChannel is closed.
+    std::mutex _closed_lock;
+    bool _is_closed = false;
+
+    RuntimeState* _state;
+    // rows number received per tablet, tablet_id -> rows_num
+    std::vector<std::pair<int64_t, int64_t>> _tablets_received_rows;
+
+    std::unique_ptr<RowBatch> _cur_batch;
+    PTabletWriterAddBatchRequest _cur_add_batch_request;
+    using AddBatchReq = std::pair<std::unique_ptr<RowBatch>, PTabletWriterAddBatchRequest>;
+    std::queue<AddBatchReq> _pending_batches;
+    ReusableClosure<PTabletWriterAddBatchResult>* _add_batch_closure = nullptr;
 
-private:
     std::unique_ptr<vectorized::MutableBlock> _cur_mutable_block;
     PTabletWriterAddBlockRequest _cur_add_block_request;
@@ -68,13 +318,84 @@ private:
     ReusableClosure<PTabletWriterAddBlockResult>* _add_block_closure = nullptr;
 };
 
-class OlapTableSink;
+class IndexChannel {
+public:
+    IndexChannel(VOlapTableSink* parent, int64_t index_id) : _parent(parent), _index_id(index_id) {
+        _index_channel_tracker =
+                std::make_unique<MemTracker>("IndexChannel:indexID=" + std::to_string(_index_id));
+    }
+    ~IndexChannel() = default;
+
+    Status init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets);
+
+    void for_each_node_channel(
+            const std::function<void(const std::shared_ptr<VNodeChannel>&)>& func) {
+        for (auto& it : _node_channels) {
+            func(it.second);
+        }
+    }
+
+    void mark_as_failed(int64_t node_id, const std::string& host, const std::string& err,
+                        int64_t tablet_id = -1);
+    Status check_intolerable_failure();
+
+    // set error tablet info in runtime state, so that it can be returned to FE.
+    void set_error_tablet_in_state(RuntimeState* state);
+
+    size_t num_node_channels() const { return _node_channels.size(); }
+
+    size_t get_pending_bytes() const {
+        size_t mem_consumption = 0;
+        for (auto& kv : _node_channels) {
+            mem_consumption += kv.second->get_pending_bytes();
+        }
+        return mem_consumption;
+    }
+
+    void set_tablets_received_rows(
+            const std::vector<std::pair<int64_t, int64_t>>& tablets_received_rows,
+            int64_t node_id);
+
+    // check whether the rows num written by different replicas is consistent
+    // (a standalone sketch of this check is appended at the end of this patch)
+    Status check_tablet_received_rows_consistency();
+
+private:
+    friend class VNodeChannel;
+    friend class VOlapTableSink;
+
+    VOlapTableSink* _parent;
+    int64_t _index_id;
+
+    // from backend channel to tablet_id
+    // ATTN: must be placed before `_node_channels` and `_channels_by_tablet`,
+    // because the destruction order of objects is opposite to the creation order.
+    // So the NodeChannels will be destructed first.
+    // And the destructor of a NodeChannel waits for all RPCs to finish.
+    // This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC.
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
+    // BeId -> channel
+    std::unordered_map<int64_t, std::shared_ptr<VNodeChannel>> _node_channels;
+    // from tablet_id to backend channel
+    std::unordered_map<int64_t, std::vector<std::shared_ptr<VNodeChannel>>> _channels_by_tablet;
+
+    // lock to protect _failed_channels and _failed_channels_msgs
+    mutable doris::SpinLock _fail_lock;
+    // key is tablet_id, value is a set of failed node ids
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
+    // key is tablet_id, value is error message
+    std::unordered_map<int64_t, std::string> _failed_channels_msgs;
+    Status _intolerable_failure_status = Status::OK();
+
+    std::unique_ptr<MemTracker> _index_channel_tracker;
+    // rows num received by DeltaWriter per tablet, tablet_id -> (node_id, rows_num),
+    // used to verify whether the rows num received by different replicas is consistent
+    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_received_rows;
+};
 
 // Write block data to Olap Table.
 // When OlapTableSink::open() is called, there will be a consumer thread running in the background.
 // When you call VOlapTableSink::send(), you will be the producer who produces pending batches.
 // Join the consumer thread in close().
-class VOlapTableSink : public OlapTableSink {
+class VOlapTableSink final : public DataSink {
 public:
     // Construct from thrift struct which is generated by FE.
     VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc,
@@ -89,14 +410,24 @@ public:
     Status open(RuntimeState* state) override;
 
     Status close(RuntimeState* state, Status close_status) override;
-    using OlapTableSink::send;
     Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) override;
 
     size_t get_pending_bytes() const;
 
     const RowDescriptor& row_desc() { return _input_row_desc; }
 
+    // Returns the runtime profile for the sink.
+    RuntimeProfile* profile() override { return _profile; }
+
+    // the consumer func of sending pending batches in every NodeChannel.
+    // use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending.
+    // only focus on pending batches and channel status; the internal errors of NodeChannels
+    // will be handled by the producer.
+    void _send_batch_process(RuntimeState* state);
+
 private:
+    friend class VNodeChannel;
+    friend class IndexChannel;
+
     // make input data valid for OLAP table
     // return number of invalid/filtered rows.
     // invalid row number is set in Bitmap
@@ -116,6 +447,104 @@ private:
                        const VOlapTablePartition** partition, uint32_t& tablet_index,
                        bool& stop_processing, bool& is_continue);
 
+    std::shared_ptr<MemTracker> _mem_tracker;
+
+    ObjectPool* _pool;
+    const RowDescriptor& _input_row_desc;
+
+    // unique load id
+    PUniqueId _load_id;
+    int64_t _txn_id = -1;
+    int _num_replicas = -1;
+    int _tuple_desc_id = -1;
+
+    // this is the tuple descriptor of the destination OLAP table
+    TupleDescriptor* _output_tuple_desc = nullptr;
+    RowDescriptor* _output_row_desc = nullptr;
+
+    bool _need_validate_data = false;
+
+    // Number of senders used to insert into the OlapTable. If we only supported single-node
+    // insert, all data from the select would be collected and then sent to the OlapTable.
+    // To support multiple senders, we maintain a channel for each sender.
+ int _sender_id = -1; + int _num_senders = -1; + bool _is_high_priority = false; + + // TODO(zc): think about cache this data + std::shared_ptr _schema; + OlapTableLocationParam* _location = nullptr; + bool _write_single_replica = false; + OlapTableLocationParam* _slave_location = nullptr; + DorisNodesInfo* _nodes_info = nullptr; + + RuntimeProfile* _profile = nullptr; + + std::set _partition_ids; + // only used for partition with random distribution + std::map _partition_to_tablet_map; + + Bitmap _filter_bitmap; + + // index_channel + std::vector> _channels; + + CountDownLatch _stop_background_threads_latch; + scoped_refptr _sender_thread; + std::unique_ptr _send_batch_thread_pool_token; + + std::vector _max_decimalv2_val; + std::vector _min_decimalv2_val; + + // Stats for this + int64_t _validate_data_ns = 0; + int64_t _send_data_ns = 0; + int64_t _number_input_rows = 0; + int64_t _number_output_rows = 0; + int64_t _number_filtered_rows = 0; + + RuntimeProfile::Counter* _input_rows_counter = nullptr; + RuntimeProfile::Counter* _output_rows_counter = nullptr; + RuntimeProfile::Counter* _filtered_rows_counter = nullptr; + RuntimeProfile::Counter* _send_data_timer = nullptr; + RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr; + RuntimeProfile::Counter* _validate_data_timer = nullptr; + RuntimeProfile::Counter* _open_timer = nullptr; + RuntimeProfile::Counter* _close_timer = nullptr; + RuntimeProfile::Counter* _non_blocking_send_timer = nullptr; + RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr; + RuntimeProfile::Counter* _serialize_batch_timer = nullptr; + RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr; + RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr; + RuntimeProfile::Counter* _add_batch_number = nullptr; + RuntimeProfile::Counter* _num_node_channels = nullptr; + + // load mem limit is for remote load channel + int64_t _load_mem_limit = -1; + + // the timeout of load channels opened by this tablet sink. in second + int64_t _load_channel_timeout_s = 0; + + int32_t _send_batch_parallelism = 1; + // Save the status of close() method + Status _close_status; + + // User can change this config at runtime, avoid it being modified during query or loading process. 
+ bool _transfer_large_data_by_brpc = false; + + // FIND_TABLET_EVERY_ROW is used for both hash and random distribution info, which indicates that we + // should compute tablet index for every row + // FIND_TABLET_EVERY_BATCH is only used for random distribution info, which indicates that we should + // compute tablet index for every row batch + // FIND_TABLET_EVERY_SINK is only used for random distribution info, which indicates that we should + // only compute tablet index in the corresponding partition once for the whole time in olap table sink + enum FindTabletMode { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK }; + FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW; + + OlapTablePartitionParam* _partition = nullptr; + std::vector _output_expr_ctxs; + std::unique_ptr _output_batch; + VOlapTablePartitionParam* _vpartition = nullptr; std::vector _output_vexpr_ctxs; }; diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index eb710a5eb5..3a93ebd9c7 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -150,7 +150,6 @@ set(OLAP_TEST_FILES ) set(RUNTIME_TEST_FILES - # runtime/buffered_tuple_stream_test.cpp # runtime/buffer_control_block_test.cpp # runtime/result_buffer_mgr_test.cpp # runtime/result_sink_test.cpp @@ -163,8 +162,6 @@ set(RUNTIME_TEST_FILES # runtime/tmp_file_mgr_test.cpp # runtime/disk_io_mgr_test.cpp # runtime/thread_resource_mgr_test.cpp - # runtime/buffered_block_mgr2_test.cpp - # runtime/buffered_tuple_stream2_test.cpp # runtime/export_task_mgr_test.cpp runtime/mem_pool_test.cpp runtime/string_buffer_test.cpp diff --git a/be/test/runtime/buffered_block_mgr2_test.cpp b/be/test/runtime/buffered_block_mgr2_test.cpp deleted file mode 100644 index 09994c0c42..0000000000 --- a/be/test/runtime/buffered_block_mgr2_test.cpp +++ /dev/null @@ -1,1246 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/buffered_block_mgr2.h" - -#include -#include - -#include -#include -#include - -#include "runtime/disk_io_mgr.h" -#include "runtime/exec_env.h" -#include "runtime/runtime_state.h" -#include "runtime/test_env.h" -#include "runtime/tmp_file_mgr.h" -#include "util/cpu_info.h" -#include "util/disk_info.h" -#include "util/filesystem_util.h" -#include "util/monotime.h" -#include "util/thread_group.h" - -using std::filesystem::directory_iterator; -using std::filesystem::remove; -using std::unique_ptr; -using std::unordered_map; -using std::thread; - -using std::string; -using std::stringstream; -using std::vector; - -// Note: This is the default scratch dir created by doris. -// config::query_scratch_dirs + TmpFileMgr::_s_tmp_sub_dir_name. 
-const static string SCRATCH_DIR = "/tmp/doris-scratch"; - -// This suffix is appended to a tmp dir -const static string SCRATCH_SUFFIX = "/doris-scratch"; - -// Number of milliseconds to wait to ensure write completes -const static int WRITE_WAIT_MILLIS = 500; - -// How often to check for write completion -const static int WRITE_CHECK_INTERVAL_MILLIS = 10; - -namespace doris { - -class BufferedBlockMgrTest : public ::testing::Test { -protected: - const static int _block_size = 1024; - - virtual void SetUp() { _test_env.reset(new TestEnv()); } - - virtual void TearDown() { - TearDownMgrs(); - _test_env.reset(); - - // Tests modify permissions, so make sure we can delete if they didn't clean up. - for (int i = 0; i < _created_tmp_dirs.size(); ++i) { - chmod((_created_tmp_dirs[i] + SCRATCH_SUFFIX).c_str(), S_IRWXU); - } - FileSystemUtil::remove_paths(_created_tmp_dirs); - _created_tmp_dirs.clear(); - } - - // Reinitialize _test_env to have multiple temporary directories. - std::vector InitMultipleTmpDirs(int num_dirs) { - std::vector tmp_dirs; - for (int i = 0; i < num_dirs; ++i) { - std::stringstream dir_str; - dir_str << "/tmp/buffered-block-mgr-test." << i; - const string& dir = dir_str.str(); - // Fix permissions in case old directories were left from previous runs of test. - chmod((dir + SCRATCH_SUFFIX).c_str(), S_IRWXU); - EXPECT_TRUE(FileSystemUtil::create_directory(dir).ok()); - tmp_dirs.push_back(dir); - _created_tmp_dirs.push_back(dir); - } - _test_env->init_tmp_file_mgr(tmp_dirs, false); - EXPECT_EQ(num_dirs, _test_env->tmp_file_mgr()->num_active_tmp_devices()); - return tmp_dirs; - } - - static void ValidateBlock(BufferedBlockMgr2::Block* block, int32_t data) { - EXPECT_TRUE(block->valid_data_len() == sizeof(int32_t)); - EXPECT_TRUE(*reinterpret_cast(block->buffer()) == data); - } - - static int32_t* MakeRandomSizeData(BufferedBlockMgr2::Block* block) { - // Format is int32_t size, followed by size bytes of data - int32_t size = (rand() % 252) + 4; // So blocks have 4-256 bytes of data - uint8_t* data = block->allocate(size); - *(reinterpret_cast(data)) = size; - int i = 0; - for (i = 4; i < size - 5; ++i) { - data[i] = i; - } - for (; i < size; ++i) { // End marker of at least 5 0xff's - data[i] = 0xff; - } - return reinterpret_cast(data); // Really returns a pointer to size - } - - static void ValidateRandomSizeData(BufferedBlockMgr2::Block* block, int32_t size) { - int32_t bsize = *(reinterpret_cast(block->buffer())); - uint8_t* data = reinterpret_cast(block->buffer()); - int i = 0; - EXPECT_EQ(block->valid_data_len(), size); - EXPECT_EQ(size, bsize); - for (i = 4; i < size - 5; ++i) { - EXPECT_EQ(data[i], i); - } - for (; i < size; ++i) { - EXPECT_EQ(data[i], 0xff); - } - } - - /// Helper to create a simple block manager. 
- BufferedBlockMgr2* CreateMgr(int64_t query_id, int max_buffers, int block_size, - RuntimeState** query_state = nullptr) { - RuntimeState* state = nullptr; - EXPECT_TRUE(_test_env->create_query_state(query_id, max_buffers, block_size, &state).ok()); - if (query_state != nullptr) { - *query_state = state; - } - return state->block_mgr2(); - } - - BufferedBlockMgr2* CreateMgrAndClient(int64_t query_id, int max_buffers, int block_size, - int reserved_blocks, BufferedBlockMgr2::Client** client) { - RuntimeState* state = nullptr; - BufferedBlockMgr2* mgr = CreateMgr(query_id, max_buffers, block_size, &state); - EXPECT_TRUE(mgr->register_client(reserved_blocks, state, client).ok()); - EXPECT_TRUE(client != nullptr); - return mgr; - } - - void CreateMgrsAndClients(int64_t start_query_id, int num_mgrs, int buffers_per_mgr, - int block_size, int reserved_blocks_per_client, - std::vector* mgrs, - std::vector* clients) { - for (int i = 0; i < num_mgrs; ++i) { - BufferedBlockMgr2::Client* client; - BufferedBlockMgr2* mgr = - CreateMgrAndClient(start_query_id + i, buffers_per_mgr, _block_size, - reserved_blocks_per_client, &client); - mgrs->push_back(mgr); - clients->push_back(client); - } - } - - // Destroy all created query states and associated block managers. - void TearDownMgrs() { - // Freeing all block managers should clean up all consumed memory. - _test_env->tear_down_query_states(); - } - - void AllocateBlocks(BufferedBlockMgr2* block_mgr, BufferedBlockMgr2::Client* client, - int num_blocks, std::vector* blocks) { - int32_t* data = nullptr; - Status status; - BufferedBlockMgr2::Block* new_block; - for (int i = 0; i < num_blocks; ++i) { - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(new_block != nullptr); - data = new_block->allocate(sizeof(int32_t)); - *data = blocks->size(); - blocks->push_back(new_block); - } - } - - // Pin all blocks, expecting they are pinned successfully. - void PinBlocks(const std::vector& blocks) { - for (int i = 0; i < blocks.size(); ++i) { - bool pinned = false; - EXPECT_TRUE(blocks[i]->pin(&pinned).ok()); - EXPECT_TRUE(pinned); - } - } - - // Pin all blocks, expecting no errors from unpin() calls. - void UnpinBlocks(const std::vector& blocks) { - for (int i = 0; i < blocks.size(); ++i) { - EXPECT_TRUE(blocks[i]->unpin().ok()); - } - } - - static void WaitForWrites(BufferedBlockMgr2* block_mgr) { - std::vector block_mgrs; - block_mgrs.push_back(block_mgr); - WaitForWrites(block_mgrs); - } - - // Wait for writes issued through block managers to complete. - static void WaitForWrites(const std::vector& block_mgrs) { - int max_attempts = WRITE_WAIT_MILLIS / WRITE_CHECK_INTERVAL_MILLIS; - for (int i = 0; i < max_attempts; ++i) { - SleepFor(MonoDelta::FromMilliseconds(WRITE_CHECK_INTERVAL_MILLIS)); - if (AllWritesComplete(block_mgrs)) { - return; - } - } - EXPECT_TRUE(false) << "Writes did not complete after " << WRITE_WAIT_MILLIS << "ms"; - } - - static bool AllWritesComplete(const std::vector& block_mgrs) { - for (int i = 0; i < block_mgrs.size(); ++i) { - RuntimeProfile::Counter* writes_outstanding = - block_mgrs[i]->profile()->get_counter("BlockWritesOutstanding"); - if (writes_outstanding->value() != 0) { - return false; - } - } - return true; - } - - // Delete the temporary file backing a block - all subsequent writes to the file - // should fail. Expects backing file has already been allocated. 
- static void DeleteBackingFile(BufferedBlockMgr2::Block* block) { - const string& path = block->tmp_file_path(); - EXPECT_GT(path.size(), 0); - EXPECT_TRUE(remove(path)); - LOG(INFO) << "Injected fault by deleting file " << path; - } - - // Check that the file backing the block has dir as a prefix of its path. - static bool BlockInDir(BufferedBlockMgr2::Block* block, const string& dir) { - return block->tmp_file_path().find(dir) == 0; - } - - // Find a block in the list that is backed by a file with the given directory as prefix - // of its path. - static BufferedBlockMgr2::Block* FindBlockForDir( - const std::vector& blocks, const string& dir) { - for (int i = 0; i < blocks.size(); ++i) { - if (BlockInDir(blocks[i], dir)) { - return blocks[i]; - } - } - return nullptr; - } - - void TestGetNewBlockImpl(int block_size) { - Status status; - int max_num_blocks = 5; - BufferedBlockMgr2* block_mgr = nullptr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), 0); - - // Allocate blocks until max_num_blocks, they should all succeed and memory - // usage should go up. - BufferedBlockMgr2::Block* new_block; - BufferedBlockMgr2::Block* first_block = nullptr; - for (int i = 0; i < max_num_blocks; ++i) { - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), (i + 1) * block_size); - if (first_block == nullptr) { - first_block = new_block; - } - } - - // Trying to allocate a new one should fail. - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block == nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - - // We can allocate a new block by transferring an already allocated one. - uint8_t* old_buffer = first_block->buffer(); - status = block_mgr->get_new_block(client, first_block, &new_block); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(old_buffer == new_block->buffer()); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - EXPECT_TRUE(!first_block->is_pinned()); - - // Trying to allocate a new one should still fail. - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block == nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - - EXPECT_EQ(block_mgr->writes_issued(), 1); - TearDownMgrs(); - } - - void TestEvictionImpl(int block_size) { - Status status; - DCHECK_GT(block_size, 0); - int max_num_buffers = 5; - BufferedBlockMgr2* block_mgr = nullptr; - BufferedBlockMgr2::Client* client = nullptr; - block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, - block_mgr->get_tracker(client), &client); - - // Check counters. 
- RuntimeProfile* profile = block_mgr->profile(); - RuntimeProfile::Counter* buffered_pin = profile->get_counter("BufferedPins"); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_buffers * block_size); - for (BufferedBlockMgr2::Block* block : blocks) { - block->unpin(); - } - - // Re-pinning all blocks - for (int i = 0; i < blocks.size(); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - int buffered_pins_expected = blocks.size(); - EXPECT_EQ(buffered_pin->value(), buffered_pins_expected); - - // Unpin all blocks - for (BufferedBlockMgr2::Block* block : blocks) { - block->unpin(); - } - // Get two new blocks. - AllocateBlocks(block_mgr, client, 2, &blocks); - // At least two writes must be issued. The first (num_blocks - 2) must be in memory. - EXPECT_GE(block_mgr->writes_issued(), 2); - for (int i = 0; i < (max_num_buffers - 2); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - EXPECT_GE(buffered_pin->value(), buffered_pins_expected); - - // can not pin any more - for (int i = (max_num_buffers - 2); i < max_num_buffers; ++i) { - bool pinned = true; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - } - - // the last 2 block has already been pinned - for (int i = max_num_buffers; i < blocks.size(); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - - TearDownMgrs(); - } - - // Test that randomly issues GetFreeBlock(), pin(), unpin(), del() and Close() - // calls. All calls made are legal - error conditions are not expected until the first - // call to Close(). This is called 2 times with encryption+integrity on/off. - // When executed in single-threaded mode 'tid' should be SINGLE_THREADED_TID. - static const int SINGLE_THREADED_TID = -1; - void TestRandomInternalImpl(RuntimeState* state, BufferedBlockMgr2* block_mgr, int num_buffers, - int tid) { - DCHECK(block_mgr != nullptr); - const int num_iterations = 100000; - const int iters_before_close = num_iterations - 5000; - bool close_called = false; - unordered_map pinned_block_map; - std::vector> pinned_blocks; - unordered_map unpinned_block_map; - std::vector> unpinned_blocks; - - typedef enum { Pin, New, Unpin, Delete, Close } ApiFunction; - ApiFunction api_function; - - BufferedBlockMgr2::Client* client; - Status status = block_mgr->register_client(0, state, &client); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(client != nullptr); - - pinned_blocks.reserve(num_buffers); - BufferedBlockMgr2::Block* new_block; - for (int i = 0; i < num_iterations; ++i) { - if ((i % 20000) == 0) { - LOG(ERROR) << " Iteration " << i << std::endl; - } - if (i > iters_before_close && (rand() % 5 == 0)) { - api_function = Close; - } else if (pinned_blocks.size() == 0 && unpinned_blocks.size() == 0) { - api_function = New; - } else if (pinned_blocks.size() == 0) { - // Pin or New. Can't unpin or delete. - api_function = static_cast(rand() % 2); - } else if (pinned_blocks.size() >= num_buffers) { - // Unpin or delete. Can't pin or get new. - api_function = static_cast(2 + (rand() % 2)); - } else if (unpinned_blocks.size() == 0) { - // Can't pin. Unpin, new or delete. 
- api_function = static_cast(1 + (rand() % 3)); - } else { - // Any api function. - api_function = static_cast(rand() % 4); - } - - std::pair block_data; - int rand_pick = 0; - int32_t* data = nullptr; - bool pinned = false; - switch (api_function) { - case New: - status = block_mgr->get_new_block(client, nullptr, &new_block); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(new_block == nullptr); - EXPECT_TRUE(status.is_cancelled()); - continue; - } - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(new_block != nullptr); - data = MakeRandomSizeData(new_block); - block_data = std::make_pair(new_block, *data); - - pinned_blocks.push_back(block_data); - pinned_block_map.insert(std::make_pair(block_data.first, pinned_blocks.size() - 1)); - break; - case Pin: - rand_pick = rand() % unpinned_blocks.size(); - block_data = unpinned_blocks[rand_pick]; - status = block_data.first->pin(&pinned); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(status.is_cancelled()); - // In single-threaded runs the block should not have been pinned. - // In multi-threaded runs pin() may return the block pinned but the status to - // be cancelled. In this case we could move the block from unpinned_blocks - // to pinned_blocks. We do not do that because after is_cancelled() no actual - // block operations should take place. - // reason: when block_mgr is cancelled in one thread, the same block_mgr - // is waiting for scan-range to be ready. - if (tid == SINGLE_THREADED_TID) { - EXPECT_FALSE(pinned); - } - continue; - } - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateRandomSizeData(block_data.first, block_data.second); - unpinned_blocks[rand_pick] = unpinned_blocks.back(); - unpinned_blocks.pop_back(); - unpinned_block_map[unpinned_blocks[rand_pick].first] = rand_pick; - - pinned_blocks.push_back(block_data); - pinned_block_map.insert(std::make_pair(block_data.first, pinned_blocks.size() - 1)); - break; - case Unpin: - rand_pick = rand() % pinned_blocks.size(); - block_data = pinned_blocks[rand_pick]; - status = block_data.first->unpin(); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(status.is_cancelled()); - continue; - } - EXPECT_TRUE(status.ok()); - pinned_blocks[rand_pick] = pinned_blocks.back(); - pinned_blocks.pop_back(); - pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick; - - unpinned_blocks.push_back(block_data); - unpinned_block_map.insert( - std::make_pair(block_data.first, unpinned_blocks.size() - 1)); - break; - case Delete: - rand_pick = rand() % pinned_blocks.size(); - block_data = pinned_blocks[rand_pick]; - block_data.first->del(); - pinned_blocks[rand_pick] = pinned_blocks.back(); - pinned_blocks.pop_back(); - pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick; - break; - case Close: - block_mgr->cancel(); - close_called = true; - break; - } // end switch (apiFunction) - } // end for () - } - - // Single-threaded execution of the TestRandomInternalImpl. - void TestRandomInternalSingle(int block_size) { - DCHECK_GT(block_size, 0); - DCHECK(_test_env.get() != nullptr); - const int max_num_buffers = 100; - RuntimeState* state = nullptr; - BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &state); - TestRandomInternalImpl(state, block_mgr, max_num_buffers, SINGLE_THREADED_TID); - TearDownMgrs(); - } - - // Multi-threaded execution of the TestRandomInternalImpl. 
- void TestRandomInternalMulti(int num_threads, int block_size) { - DCHECK_GT(num_threads, 0); - DCHECK_GT(block_size, 0); - DCHECK(_test_env.get() != nullptr); - const int max_num_buffers = 100; - RuntimeState* state = nullptr; - BufferedBlockMgr2* block_mgr = - CreateMgr(0, num_threads * max_num_buffers, block_size, &state); - - ThreadGroup workers; - for (int i = 0; i < num_threads; ++i) { - thread* t = new thread(std::bind(&BufferedBlockMgrTest::TestRandomInternalImpl, this, - state, block_mgr, max_num_buffers, i)); - workers.add_thread(t); - } - workers.join_all(); - TearDownMgrs(); - } - - // Repeatedly call BufferedBlockMgr2::Create() and BufferedBlockMgr2::~BufferedBlockMgr2(). - void CreateDestroyThread(int index, RuntimeState* state) { - const int num_buffers = 10; - const int iters = 100; - for (int i = 0; i < iters; ++i) { - LOG(WARNING) << "CreateDestroyThread thread " << index << " begin " << i << std::endl; - std::shared_ptr mgr; - Status status = BufferedBlockMgr2::create(state, -1, state->runtime_profile(), - _test_env->tmp_file_mgr(), - _block_size * num_buffers, _block_size, &mgr); - LOG(WARNING) << "CreateDestroyThread thread " << index << " end " << i << std::endl; - } - } - - // IMPALA-2286: Test for races between BufferedBlockMgr2::Create() and - // BufferedBlockMgr2::~BufferedBlockMgr2(). - void CreateDestroyMulti() { - const int num_threads = 4; - ThreadGroup workers; - // Create a shared RuntimeState with no BufferedBlockMgr2. - RuntimeState* shared_state = new RuntimeState(TUniqueId(), TQueryOptions(), TQueryGlobals(), - _test_env->exec_env()); - for (int i = 0; i < num_threads; ++i) { - thread* t = new thread( - std::bind(&BufferedBlockMgrTest::CreateDestroyThread, this, i, shared_state)); - workers.add_thread(t); - } - workers.join_all(); - } - - std::unique_ptr _test_env; - std::vector _created_tmp_dirs; -}; - -TEST_F(BufferedBlockMgrTest, get_new_block) { - TestGetNewBlockImpl(1024); - TestGetNewBlockImpl(8 * 1024); - TestGetNewBlockImpl(8 * 1024 * 1024); - LOG(WARNING) << "finish test get_new_block." << std::endl; -} - -TEST_F(BufferedBlockMgrTest, GetNewBlockSmallBlocks) { - const int block_size = 1024; - int max_num_blocks = 3; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - EXPECT_EQ(0, block_mgr->mem_tracker()->consumption()); - - std::vector blocks; - - // Allocate a small block. 
- BufferedBlockMgr2::Block* new_block = nullptr; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, 128).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), 0); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), 0); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), 128); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), 128); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Allocate a normal block - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), 128 + block_mgr->max_block_size()); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), block_mgr->max_block_size()); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Allocate another small block. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, 512).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), - 128 + 512 + block_mgr->max_block_size()); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), 512); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Should be able to unpin and pin the middle block - EXPECT_TRUE(blocks[1]->unpin().ok()); - - bool pinned; - EXPECT_TRUE(blocks[1]->pin(&pinned).ok()); - EXPECT_TRUE(pinned); - - for (int i = 0; i < blocks.size(); ++i) { - blocks[i]->del(); - } - - TearDownMgrs(); -} - -// Test that pinning more blocks than the max available buffers. -TEST_F(BufferedBlockMgrTest, Pin) { - Status status; - int max_num_blocks = 5; - const int block_size = 1024; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_blocks, &blocks); - - // Unpin them all. - for (int i = 0; i < blocks.size(); ++i) { - status = blocks[i]->unpin(); - EXPECT_TRUE(status.ok()); - } - - // Allocate more, this should work since we just unpinned some blocks. - AllocateBlocks(block_mgr, client, max_num_blocks, &blocks); - - // Try to pin a unpinned block, this should not be possible. - bool pinned; - status = blocks[0]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - - // Unpin all blocks. - for (int i = 0; i < blocks.size(); ++i) { - status = blocks[i]->unpin(); - EXPECT_TRUE(status.ok()); - } - - // Should be able to pin max_num_blocks blocks. - for (int i = 0; i < max_num_blocks; ++i) { - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - } - - // Can't pin any more though. - status = blocks[max_num_blocks]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - - TearDownMgrs(); -} - -// Test the eviction policy of the block mgr. No writes issued until more than -// the max available buffers are allocated. Writes must be issued in LIFO order. 
-TEST_F(BufferedBlockMgrTest, Eviction) { - TestEvictionImpl(1024); - TestEvictionImpl(8 * 1024 * 1024); -} - -// Test deletion and reuse of blocks. -TEST_F(BufferedBlockMgrTest, Deletion) { - int max_num_buffers = 5; - const int block_size = 1024; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client); - - // Check counters. - RuntimeProfile* profile = block_mgr->profile(); - RuntimeProfile::Counter* recycled_cnt = profile->get_counter("BlocksRecycled"); - RuntimeProfile::Counter* created_cnt = profile->get_counter("BlocksCreated"); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - EXPECT_TRUE(created_cnt->value() == max_num_buffers); - - for (BufferedBlockMgr2::Block* block : blocks) { - block->del(); - } - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - EXPECT_TRUE(created_cnt->value() == max_num_buffers); - EXPECT_TRUE(recycled_cnt->value() == max_num_buffers); - - TearDownMgrs(); -} - -// Delete blocks of various sizes and statuses to exercise the different code paths. -// This relies on internal validation in block manager to detect many errors. -TEST_F(BufferedBlockMgrTest, DeleteSingleBlocks) { - int max_num_buffers = 16; - BufferedBlockMgr2::Client* client; - BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client); - - // Pinned I/O block. - BufferedBlockMgr2::Block* new_block; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->del(); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - // Pinned non-I/O block. - int small_block_size = 128; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, small_block_size).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(small_block_size, block_mgr->get_tracker(client)->consumption()); - new_block->del(); - EXPECT_EQ(0, block_mgr->get_tracker(client)->consumption()); - - // Unpinned I/O block - delete after written to disk. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->unpin(); - EXPECT_FALSE(new_block->is_pinned()); - WaitForWrites(block_mgr); - new_block->del(); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - // Unpinned I/O block - delete before written to disk. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->unpin(); - EXPECT_FALSE(new_block->is_pinned()); - new_block->del(); - WaitForWrites(block_mgr); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - TearDownMgrs(); -} - -// Test that all APIs return cancelled after close. 
-TEST_F(BufferedBlockMgrTest, Close) {
-    int max_num_buffers = 5;
-    const int block_size = 1024;
-    BufferedBlockMgr2* block_mgr;
-    BufferedBlockMgr2::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-    block_mgr->cancel();
-
-    BufferedBlockMgr2::Block* new_block;
-    Status status = block_mgr->get_new_block(client, nullptr, &new_block);
-    EXPECT_TRUE(status.is_cancelled());
-    EXPECT_TRUE(new_block == nullptr);
-    status = blocks[0]->unpin();
-    EXPECT_TRUE(status.is_cancelled());
-    bool pinned;
-    status = blocks[0]->pin(&pinned);
-    EXPECT_TRUE(status.is_cancelled());
-    blocks[1]->del();
-
-    TearDownMgrs();
-}
-
-// Clear the scratch directory. Returns the number of files deleted.
-static int clear_scratch_dir() {
-    int num_files = 0;
-    directory_iterator dir_it(SCRATCH_DIR);
-    for (; dir_it != directory_iterator(); ++dir_it) {
-        ++num_files;
-        remove_all(dir_it->path());
-    }
-    return num_files;
-}
-
-// Test that the block manager behaves correctly after a write error. Delete the scratch
-// directory before an operation that would cause a write and test that subsequent API
-// calls return 'CANCELLED' correctly.
-TEST_F(BufferedBlockMgrTest, WriteError) {
-    Status status;
-    int max_num_buffers = 2;
-    const int block_size = 1024;
-    BufferedBlockMgr2* block_mgr;
-    BufferedBlockMgr2::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    // Unpin two blocks here, to ensure that backing storage is allocated in a tmp file.
-    for (int i = 0; i < 2; ++i) {
-        status = blocks[i]->unpin();
-        EXPECT_TRUE(status.ok());
-    }
-    WaitForWrites(block_mgr);
-    // Repin the blocks.
-    for (int i = 0; i < 2; ++i) {
-        bool pinned;
-        status = blocks[i]->pin(&pinned);
-        EXPECT_TRUE(status.ok());
-        EXPECT_TRUE(pinned);
-    }
-    // Remove the backing storage so that future writes will fail.
-    int num_files = clear_scratch_dir();
-    EXPECT_TRUE(num_files > 0);
-    for (int i = 0; i < 2; ++i) {
-        status = blocks[i]->unpin();
-        EXPECT_TRUE(status.ok());
-    }
-    WaitForWrites(block_mgr);
-    // Subsequent calls should fail.
-    for (int i = 0; i < 2; ++i) {
-        blocks[i]->del();
-    }
-    BufferedBlockMgr2::Block* new_block;
-    status = block_mgr->get_new_block(client, nullptr, &new_block);
-    EXPECT_TRUE(status.is_cancelled());
-    EXPECT_TRUE(new_block == nullptr);
-
-    TearDownMgrs();
-}
-
-// Test block manager error handling when temporary file space cannot be allocated to
-// back an unpinned buffer.
-TEST_F(BufferedBlockMgrTest, TmpFileAllocateError) {
-    Status status;
-    int max_num_buffers = 2;
-    BufferedBlockMgr2::Client* client;
-    BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    // Unpin a block, forcing a write.
-    status = blocks[0]->unpin();
-    EXPECT_TRUE(status.ok());
-    WaitForWrites(block_mgr);
-    // Remove temporary files - subsequent operations will fail.
-    int num_files = clear_scratch_dir();
-    EXPECT_TRUE(num_files > 0);
-    // The current implementation will fail here because it tries to expand the tmp file
-    // immediately. This behavior is not contractual, but we want to know if it changes
-    // accidentally.
-    status = blocks[1]->unpin();
-    EXPECT_FALSE(status.ok());
-
-    TearDownMgrs();
-}
-
-// Test that the block manager is able to blacklist a temporary device correctly after a
-// write error. We should not allocate more blocks on that device, but existing blocks
-// on the device will remain in use.
-// Disabled because blacklisting was disabled as a workaround for IMPALA-2305.
-TEST_F(BufferedBlockMgrTest, DISABLED_WriteErrorBlacklist) {
-    // TEST_F(BufferedBlockMgrTest, WriteErrorBlacklist) {
-    // Set up two buffered block managers with two temporary dirs.
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    // Simulate two concurrent queries.
-    const int NUM_BLOCK_MGRS = 2;
-    const int MAX_NUM_BLOCKS = 4;
-    int blocks_per_mgr = MAX_NUM_BLOCKS / NUM_BLOCK_MGRS;
-    std::vector<BufferedBlockMgr2*> block_mgrs;
-    std::vector<BufferedBlockMgr2::Client*> clients;
-    CreateMgrsAndClients(0, NUM_BLOCK_MGRS, blocks_per_mgr, _block_size, 0, &block_mgrs, &clients);
-
-    // Allocate files for all 2x2 combinations by unpinning blocks.
-    std::vector<std::vector<BufferedBlockMgr2::Block*>> blocks;
-    std::vector<BufferedBlockMgr2::Block*> all_blocks;
-    for (int i = 0; i < NUM_BLOCK_MGRS; ++i) {
-        std::vector<BufferedBlockMgr2::Block*> mgr_blocks;
-        AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-        UnpinBlocks(mgr_blocks);
-        for (int j = 0; j < blocks_per_mgr; ++j) {
-            LOG(INFO) << "Manager " << i << " Block " << j << " backed by file "
-                      << mgr_blocks[j]->tmp_file_path();
-        }
-        blocks.push_back(mgr_blocks);
-        all_blocks.insert(all_blocks.end(), mgr_blocks.begin(), mgr_blocks.end());
-    }
-    WaitForWrites(block_mgrs);
-    int error_mgr = 0;
-    int no_error_mgr = 1;
-    const string& error_dir = tmp_dirs[0];
-    const string& good_dir = tmp_dirs[1];
-    // Delete one file from the first scratch dir for the first block manager.
-    BufferedBlockMgr2::Block* error_block = FindBlockForDir(blocks[error_mgr], error_dir);
-    EXPECT_TRUE(error_block != nullptr) << "Expected a tmp file in dir " << error_dir;
-    PinBlocks(all_blocks);
-    DeleteBackingFile(error_block);
-    UnpinBlocks(all_blocks); // Should succeed since tmp file space was already allocated.
-    WaitForWrites(block_mgrs);
-    EXPECT_TRUE(block_mgrs[error_mgr]->is_cancelled());
-    EXPECT_FALSE(block_mgrs[no_error_mgr]->is_cancelled());
-    // The temporary device with the error should no longer be active.
-    std::vector<TmpFileMgr::DeviceId> active_tmp_devices =
-            _test_env->tmp_file_mgr()->active_tmp_devices();
-    EXPECT_EQ(tmp_dirs.size() - 1, active_tmp_devices.size());
-    for (int i = 0; i < active_tmp_devices.size(); ++i) {
-        const string& device_path =
-                _test_env->tmp_file_mgr()->get_tmp_dir_path(active_tmp_devices[i]);
-        EXPECT_EQ(string::npos, error_dir.find(device_path));
-    }
-    // The second block manager should continue using allocated scratch space, since it
-    // didn't encounter a write error itself. In future this could change, but for now it
-    // is the intended behaviour.
-    PinBlocks(blocks[no_error_mgr]);
-    UnpinBlocks(blocks[no_error_mgr]);
-    EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], good_dir) != nullptr);
-    EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], error_dir) != nullptr);
-    // The second block manager should avoid using the bad directory for new blocks.
-    std::vector<BufferedBlockMgr2::Block*> no_error_new_blocks;
-    AllocateBlocks(block_mgrs[no_error_mgr], clients[no_error_mgr], blocks_per_mgr,
-                   &no_error_new_blocks);
-    UnpinBlocks(no_error_new_blocks);
-    for (int i = 0; i < no_error_new_blocks.size(); ++i) {
-        LOG(INFO) << "Newly created block backed by file "
-                  << no_error_new_blocks[i]->tmp_file_path();
-        EXPECT_TRUE(BlockInDir(no_error_new_blocks[i], good_dir));
-    }
-    // A new block manager should only use the good dir for backing storage.
-    BufferedBlockMgr2::Client* new_client;
-    BufferedBlockMgr2* new_block_mgr =
-            CreateMgrAndClient(9999, blocks_per_mgr, _block_size, 0, &new_client);
-    std::vector<BufferedBlockMgr2::Block*> new_mgr_blocks;
-    AllocateBlocks(new_block_mgr, new_client, blocks_per_mgr, &new_mgr_blocks);
-    UnpinBlocks(new_mgr_blocks);
-    for (int i = 0; i < blocks_per_mgr; ++i) {
-        LOG(INFO) << "New manager Block " << i << " backed by file "
-                  << new_mgr_blocks[i]->tmp_file_path();
-        EXPECT_TRUE(BlockInDir(new_mgr_blocks[i], good_dir));
-    }
-}
-
-// Check that an allocation error resulting from the removal of a directory results in
-// blocks being allocated in other directories.
-TEST_F(BufferedBlockMgrTest, AllocationErrorHandling) {
-    // Set up two buffered block managers with two temporary dirs.
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    // Simulate two concurrent queries.
-    int num_block_mgrs = 2;
-    int max_num_blocks = 4;
-    int blocks_per_mgr = max_num_blocks / num_block_mgrs;
-    // std::vector<RuntimeState*> runtime_states;
-    std::vector<BufferedBlockMgr2*> block_mgrs;
-    std::vector<BufferedBlockMgr2::Client*> clients;
-    CreateMgrsAndClients(0, num_block_mgrs, blocks_per_mgr, _block_size, 0, &block_mgrs, &clients);
-
-    // Allocate files for all 2x2 combinations by unpinning blocks.
-    std::vector<std::vector<BufferedBlockMgr2::Block*>> blocks;
-    for (int i = 0; i < num_block_mgrs; ++i) {
-        std::vector<BufferedBlockMgr2::Block*> mgr_blocks;
-        LOG(INFO) << "Iter " << i;
-        AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-        blocks.push_back(mgr_blocks);
-    }
-    const string& bad_dir = tmp_dirs[0];
-    const string& bad_scratch_subdir = bad_dir + SCRATCH_SUFFIX;
-    // const string& good_dir = tmp_dirs[1];
-    // const string& good_scratch_subdir = good_dir + SCRATCH_SUFFIX;
-    chmod(bad_scratch_subdir.c_str(), 0);
-    // The block mgr should attempt to allocate space in the bad dir for one block, which
-    // will cause an error when it tries to create/expand the file. It should recover and
-    // just use the good dir.
-    UnpinBlocks(blocks[0]);
-    // Directories remain on the active list even when they experience errors.
-    EXPECT_EQ(2, _test_env->tmp_file_mgr()->num_active_tmp_devices());
-    // Blocks should not be written to the bad dir even if it remains non-writable.
-    UnpinBlocks(blocks[1]);
-    // All writes should succeed.
-    WaitForWrites(block_mgrs);
-    for (int i = 0; i < blocks.size(); ++i) {
-        for (int j = 0; j < blocks[i].size(); ++j) {
-            blocks[i][j]->del();
-        }
-    }
-}
-
-// Test that the block manager fails cleanly when all directories are inaccessible at
-// runtime.
-TEST_F(BufferedBlockMgrTest, NoDirsAllocationError) {
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    int max_num_buffers = 2;
-    BufferedBlockMgr2::Client* client;
-    BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client);
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    for (int i = 0; i < tmp_dirs.size(); ++i) {
-        const string& tmp_scratch_subdir = tmp_dirs[i] + SCRATCH_SUFFIX;
-        chmod(tmp_scratch_subdir.c_str(), 0);
-    }
-    for (int i = 0; i < blocks.size(); ++i) {
-        EXPECT_FALSE(blocks[i]->unpin().ok());
-    }
-}
-
-// Create two clients with different numbers of reserved buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClients) {
-    Status status;
-    int client1_buffers = 3;
-    int client2_buffers = 5;
-    int max_num_buffers = client1_buffers + client2_buffers;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    // Reserve client 1's and 2's buffers. They should succeed.
-    bool reserved = block_mgr->try_acquire_tmp_reservation(client1, 1);
-    EXPECT_TRUE(reserved);
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_TRUE(reserved);
-
-    std::vector<BufferedBlockMgr2::Block*> client1_blocks;
-    // Allocate all of client1's reserved blocks; they should all succeed.
-    AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-    // Try allocating one more; that should fail.
-    BufferedBlockMgr2::Block* block;
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Trying to reserve should also fail.
-    reserved = block_mgr->try_acquire_tmp_reservation(client1, 1);
-    EXPECT_FALSE(reserved);
-
-    // Allocate all of client2's reserved blocks; these should succeed.
-    std::vector<BufferedBlockMgr2::Block*> client2_blocks;
-    AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-    // Try allocating one more from client 2; that should fail.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Unpin one block from client 1.
-    status = client1_blocks[0]->unpin();
-    EXPECT_TRUE(status.ok());
-
-    // Client 2 should still not be able to allocate.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Client 2 should still not be able to reserve.
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_FALSE(reserved);
-
-    // Client 1 should be able to, though.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Unpin two of client 1's blocks (client 1 should have 3 unpinned blocks now).
-    status = client1_blocks[1]->unpin();
-    EXPECT_TRUE(status.ok());
-    status = client1_blocks[2]->unpin();
-    EXPECT_TRUE(status.ok());
-
-    // Clear client 1's reservation.
-    block_mgr->clear_reservations(client1);
-
-    // Client 2 should be able to reserve 1 buffer now (there are 2 left).
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_TRUE(reserved);
-
-    // Client 1 can only pin one block.
-    bool pinned;
-    status = client1_blocks[0]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-    // Can't get this one.
-    status = client1_blocks[1]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_FALSE(pinned);
-
-    // Client 2 can pick up the one reserved buffer.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-    // But not a second one.
-    BufferedBlockMgr2::Block* block2;
-    status = block_mgr->get_new_block(client2, nullptr, &block2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block2 == nullptr);
-
-    // Unpin the block client 2 got from the reservation. Since this is a tmp
-    // reservation, client 1 can pick it up again (it is no longer reserved).
-    status = block->unpin();
-    EXPECT_TRUE(status.ok());
-    status = client1_blocks[1]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-
-    TearDownMgrs();
-}
-
-// Create two clients with different numbers of reserved buffers plus some extra buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClientsExtraBuffers) {
-    Status status;
-    int client1_buffers = 1;
-    int client2_buffers = 1;
-    int max_num_buffers = client1_buffers + client2_buffers + 2;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    BufferedBlockMgr2::Block* block = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    std::vector<BufferedBlockMgr2::Block*> client1_blocks;
-    // Allocate all of client1's reserved blocks; they should all succeed.
-    AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-    // Allocate all of client2's reserved blocks; these should succeed.
-    std::vector<BufferedBlockMgr2::Block*> client2_blocks;
-    AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-    // We have two spare buffers now. Each client should be able to allocate one.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Now we are completely full; no one should be able to allocate a new block.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    TearDownMgrs();
-}
-
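MultipleClients and MultipleClientsExtraBuffers above encode the reservation arithmetic being removed: buffers registered for a client are guaranteed to it, temporary reservations can be acquired on top, and anything beyond that is best-effort. A compressed sketch of that accounting, using only the pre-deletion calls exercised above (`block_mgr` and `runtime_state` are assumed fixture state):

    BufferedBlockMgr2::Client* client = nullptr;
    // Registering with N reserved buffers guarantees the client N pinned blocks.
    EXPECT_TRUE(block_mgr->register_client(3, runtime_state, &client).ok());
    // try_acquire_tmp_reservation() grabs an extra buffer if one is free; it
    // simply returns false (no error Status) when the manager is fully committed.
    bool reserved = block_mgr->try_acquire_tmp_reservation(client, 1);
    // Past the reservation, get_new_block() returns OK but a null block;
    // clear_reservations() releases the guarantee so other clients can use it.
    BufferedBlockMgr2::Block* block = nullptr;
    EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &block).ok());
    if (block == nullptr) block_mgr->clear_reservations(client);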
-// Create two clients causing oversubscription.
-TEST_F(BufferedBlockMgrTest, ClientOversubscription) {
-    Status status;
-    int client1_buffers = 1;
-    int client2_buffers = 2;
-    int max_num_buffers = 2;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    BufferedBlockMgr2::Block* block = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    // Client 1 allocates its first block; this should work.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Client 2 allocates its first block; this should work.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // At this point we've used both buffers. Client 1 reserved only one, so subsequent
-    // calls should fail with no error (but return no block).
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Allocate with client 2. Since client 2 reserved 2 buffers, this should fail
-    // with MEM_LIMIT_EXCEEDED.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.is_mem_limit_exceeded());
-
-    TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_plain) {
-    TestRandomInternalSingle(1024);
-    TestRandomInternalSingle(8 * 1024);
-    TestRandomInternalSingle(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_plain) {
-    TestRandomInternalMulti(2, 1024);
-    TestRandomInternalMulti(2, 8 * 1024);
-    TestRandomInternalMulti(2, 8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_plain) {
-    TestRandomInternalMulti(4, 1024);
-    TestRandomInternalMulti(4, 8 * 1024);
-    TestRandomInternalMulti(4, 8 * 1024 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-TEST_F(BufferedBlockMgrTest, DISABLED_Multi8Random_plain) {
-    TestRandomInternalMulti(8, 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, CreateDestroyMulti) {
-    CreateDestroyMulti();
-}
-
-} // end namespace doris
diff --git a/be/test/runtime/buffered_tuple_stream2_test.cpp b/be/test/runtime/buffered_tuple_stream2_test.cpp
deleted file mode 100644
index 3bf3a7ed7a..0000000000
--- a/be/test/runtime/buffered_tuple_stream2_test.cpp
+++ /dev/null
@@ -1,821 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include
-#include
-#include <limits> // for std::numeric_limits<int>::max()
-#include
-
-#include "gen_cpp/Types_types.h"
-#include "runtime/buffered_tuple_stream2.inline.h"
-#include "runtime/row_batch.h"
-#include "runtime/string_value.hpp"
-#include "runtime/test_env.h"
-#include "runtime/tmp_file_mgr.h"
-#include "runtime/types.h"
-#include "testutil/desc_tbl_builder.h"
-#include "util/cpu_info.h"
-#include "util/debug_util.h"
-#include "util/disk_info.h"
-
-using std::vector;
-
-using std::unique_ptr;
-
-static const int BATCH_SIZE = 250;
-static const uint32_t PRIME = 479001599;
-
-namespace doris {
-
-static const StringValue STRINGS[] = {
-        StringValue("ABC"),
-        StringValue("HELLO"),
-        StringValue("123456789"),
-        StringValue("FOOBAR"),
-        StringValue("ONE"),
-        StringValue("THREE"),
-        StringValue("abcdefghijklmno"),
-        StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-        StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-};
-
-static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
-
-class SimpleTupleStreamTest : public testing::Test {
-public:
-    SimpleTupleStreamTest() {}
-    // A null dtor to pass codestyle check
-    ~SimpleTupleStreamTest() {}
-
-protected:
-    virtual void SetUp() {
-        _test_env.reset(new TestEnv());
-        create_descriptors();
-        _mem_pool.reset(new MemPool());
-    }
-
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples(1, false);
-        std::vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        // string_builder.declare_tuple() << TYPE_STRING;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-
-    virtual void TearDown() {
-        _runtime_state = nullptr;
-        _client = nullptr;
-        _pool.clear();
-        _mem_pool->free_all();
-        _test_env.reset();
-    }
-
-    // Set up a block manager with the provided settings and a client with no
-    // reservation, tracked by _tracker.
-    void InitBlockMgr(int64_t limit, int block_size) {
-        Status status = _test_env->create_query_state(0, limit, block_size, &_runtime_state);
-        EXPECT_TRUE(status.ok());
-        status = _runtime_state->block_mgr2()->register_client(0, _runtime_state, &_client);
-        EXPECT_TRUE(status.ok());
-    }
-
-    // Generate the ith element of a sequence of int values.
-    int GenIntValue(int i) {
-        // Multiply by a large prime to get varied bit patterns.
-        return i * PRIME;
-    }
-
-    // Generate the ith element of a sequence of bool values.
-    bool GenBoolValue(int i) {
-        // Use a middle bit of the int value.
-        return ((GenIntValue(i) >> 8) & 0x1) != 0;
-    }
-
-    virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
-        RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows));
-        int tuple_size = _int_desc->tuple_descriptors()[0]->byte_size();
-        uint8_t* tuple_mem = reinterpret_cast<uint8_t*>(
-                batch->tuple_data_pool()->allocate(tuple_size * num_rows));
-        memset(tuple_mem, 0, tuple_size * num_rows);
-
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        for (int i = 0; i < num_rows; ++i) {
-            int idx = batch->add_row();
-            TupleRow* row = batch->get_row(idx);
-            Tuple* int_tuple = reinterpret_cast<Tuple*>(tuple_mem + i * tuple_size);
-            // *reinterpret_cast<int*>(int_tuple + 1) = GenIntValue(i + offset);
-            *reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(int_tuple) + 1) =
-                    GenIntValue(i + offset);
-            for (int j = 0; j < int_tuples; ++j) {
-                int idx = (i + offset) * int_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    row->set_tuple(j, int_tuple);
-                } else {
-                    row->set_tuple(j, nullptr);
-                }
-            }
-            batch->commit_last_row();
-        }
-        return batch;
-    }
-
-    virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
-        int tuple_size = sizeof(StringValue) + 1;
-        RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows));
-        uint8_t* tuple_mem = batch->tuple_data_pool()->allocate(tuple_size * num_rows);
-        memset(tuple_mem, 0, tuple_size * num_rows);
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        for (int i = 0; i < num_rows; ++i) {
-            TupleRow* row = batch->get_row(batch->add_row());
-            *reinterpret_cast<StringValue*>(tuple_mem + 1) = STRINGS[(i + offset) % NUM_STRINGS];
-            for (int j = 0; j < string_tuples; ++j) {
-                int idx = (i + offset) * string_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    row->set_tuple(j, reinterpret_cast<Tuple*>(tuple_mem));
-                } else {
-                    row->set_tuple(j, nullptr);
-                }
-            }
-            batch->commit_last_row();
-            tuple_mem += tuple_size;
-        }
-        return batch;
-    }
-
-    void AppendRowTuples(TupleRow* row, std::vector<int>* results) {
-        DCHECK(row != nullptr);
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        for (int i = 0; i < int_tuples; ++i) {
-            AppendValue(row->get_tuple(i), results);
-        }
-    }
-
-    void AppendRowTuples(TupleRow* row, std::vector<StringValue>* results) {
-        DCHECK(row != nullptr);
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        for (int i = 0; i < string_tuples; ++i) {
-            AppendValue(row->get_tuple(i), results);
-        }
-    }
-
-    void AppendValue(Tuple* t, std::vector<int>* results) {
-        if (t == nullptr) {
-            // The tests indicate nullability using the max int value.
-            results->push_back(std::numeric_limits<int>::max());
-        } else {
-            results->push_back(*reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(t) + 1));
-        }
-    }
-
-    void AppendValue(Tuple* t, std::vector<StringValue>* results) {
-        if (t == nullptr) {
-            results->push_back(StringValue());
-        } else {
-            uint8_t* mem = reinterpret_cast<uint8_t*>(t);
-            StringValue sv = *reinterpret_cast<StringValue*>(mem + 1);
-            uint8_t* copy = _mem_pool->allocate(sv.len);
-            memcpy(copy, sv.ptr, sv.len);
-            sv.ptr = reinterpret_cast<char*>(copy);
-            results->push_back(sv);
-        }
-    }
-
-    template <typename T>
-    void ReadValues(BufferedTupleStream2* stream, RowDescriptor* desc, std::vector<T>* results,
-                    int num_batches = -1) {
-        bool eos = false;
-        RowBatch batch(*desc, BATCH_SIZE);
-        int batches_read = 0;
-        do {
-            batch.reset();
-            Status status = stream->get_next(&batch, &eos);
-            EXPECT_TRUE(status.ok());
-            ++batches_read;
-            for (int i = 0; i < batch.num_rows(); ++i) {
-                AppendRowTuples(batch.get_row(i), results);
-            }
-        } while (!eos && (num_batches < 0 || batches_read <= num_batches));
-    }
-
-    virtual void VerifyResults(const std::vector<int>& results, int exp_rows, bool gen_null) {
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        EXPECT_EQ(results.size(), exp_rows * int_tuples);
-        for (int i = 0; i < exp_rows; ++i) {
-            for (int j = 0; j < int_tuples; ++j) {
-                int idx = i * int_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    EXPECT_EQ(results[idx], GenIntValue(i))
-                            << " results[" << idx << "]: " << results[idx]
-                            << " != " << GenIntValue(i) << " gen_null=" << gen_null;
-                } else {
-                    EXPECT_TRUE(results[idx] == std::numeric_limits<int>::max())
-                            << "i: " << i << " j: " << j << " results[" << idx
-                            << "]: " << results[idx] << " != " << std::numeric_limits<int>::max();
-                }
-            }
-        }
-    }
-
-    virtual void VerifyResults(const std::vector<StringValue>& results, int exp_rows,
-                               bool gen_null) {
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        EXPECT_EQ(results.size(), exp_rows * string_tuples);
-        for (int i = 0; i < exp_rows; ++i) {
-            for (int j = 0; j < string_tuples; ++j) {
-                int idx = i * string_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    EXPECT_TRUE(results[idx] == STRINGS[i % NUM_STRINGS])
-                            << "results[" << idx << "] " << results[idx]
-                            << " != " << STRINGS[i % NUM_STRINGS] << " i=" << i
-                            << " gen_null=" << gen_null;
-                } else {
-                    EXPECT_TRUE(results[idx] == StringValue())
-                            << "results[" << idx << "] " << results[idx] << " not nullptr";
-                }
-            }
-        }
-    }
-
-    // Test adding num_batches of ints to the stream and reading them back.
-    template <typename T>
-    void TestValues(int num_batches, RowDescriptor* desc, bool gen_null) {
-        BufferedTupleStream2 stream(_runtime_state, *desc, _runtime_state->block_mgr2(), _client,
-                                    true, false);
-        Status status = stream.init(-1, nullptr, true);
-        EXPECT_TRUE(status.ok()) << status;
-        status = stream.unpin_stream();
-        EXPECT_TRUE(status.ok());
-
-        // Add rows to the stream.
-        int offset = 0;
-        for (int i = 0; i < num_batches; ++i) {
-            RowBatch* batch = nullptr;
-            if (sizeof(T) == sizeof(int)) {
-                batch = CreateIntBatch(offset, BATCH_SIZE, gen_null);
-            } else if (sizeof(T) == sizeof(StringValue)) {
-                batch = CreateStringBatch(offset, BATCH_SIZE, gen_null);
-            } else {
-                DCHECK(false);
-            }
-            for (int j = 0; j < batch->num_rows(); ++j) {
-                bool b = stream.add_row(batch->get_row(j), &status);
-                EXPECT_TRUE(status.ok());
-                if (!b) {
-                    EXPECT_TRUE(stream.using_small_buffers());
-                    bool got_buffer;
-                    status = stream.switch_to_io_buffers(&got_buffer);
-                    EXPECT_TRUE(status.ok());
-                    EXPECT_TRUE(got_buffer);
-                    b = stream.add_row(batch->get_row(j), &status);
-                    EXPECT_TRUE(status.ok());
-                }
-                EXPECT_TRUE(b);
-            }
-            offset += batch->num_rows();
-            // Reset the batch to make sure the stream handles the memory correctly.
-            batch->reset();
-        }
-
-        status = stream.prepare_for_read(false);
-        EXPECT_TRUE(status.ok());
-
-        // Read all the rows back.
-        std::vector<T> results;
-        ReadValues(&stream, desc, &results);
-
-        // Verify the result.
-        VerifyResults(results, BATCH_SIZE * num_batches, gen_null);
-
-        stream.close();
-    }
-
-    void TestIntValuesInterleaved(int num_batches, int num_batches_before_read) {
-        for (int small_buffers = 0; small_buffers < 2; ++small_buffers) {
-            BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(),
-                                        _client, small_buffers == 0, // initial small buffers
-                                        true);                       // read_write
-            Status status = stream.init(-1, nullptr, true);
-            EXPECT_TRUE(status.ok());
-            status = stream.prepare_for_read(true);
-            EXPECT_TRUE(status.ok());
-            status = stream.unpin_stream();
-            EXPECT_TRUE(status.ok());
-
-            std::vector<int> results;
-
-            for (int i = 0; i < num_batches; ++i) {
-                RowBatch* batch = CreateIntBatch(i * BATCH_SIZE, BATCH_SIZE, false);
-                for (int j = 0; j < batch->num_rows(); ++j) {
-                    bool b = stream.add_row(batch->get_row(j), &status);
-                    EXPECT_TRUE(b);
-                    EXPECT_TRUE(status.ok());
-                }
-                // Reset the batch to make sure the stream handles the memory correctly.
-                batch->reset();
-                if (i % num_batches_before_read == 0) {
-                    ReadValues(&stream, _int_desc, &results,
-                               (rand() % num_batches_before_read) + 1);
-                }
-            }
-            ReadValues(&stream, _int_desc, &results);
-
-            VerifyResults(results, BATCH_SIZE * num_batches, false);
-
-            stream.close();
-        }
-    }
-
-    std::unique_ptr<TestEnv> _test_env;
-    RuntimeState* _runtime_state;
-    BufferedBlockMgr2::Client* _client;
-
-    ObjectPool _pool;
-    RowDescriptor* _int_desc;
-    RowDescriptor* _string_desc;
-    std::unique_ptr<MemPool> _mem_pool;
-};
-
-// Tests with a NULLable tuple per row.
-class SimpleNullStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples(1, true);
-        std::vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-}; // SimpleNullStreamTest
-
-// Tests with multiple non-NULLable tuples per row.
-class MultiTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples;
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(false);
-
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        tuple_ids.push_back(static_cast<TTupleId>(2));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-
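The TestValues helper above captures the stream's core write/read protocol. Stripped of the test assertions, the pattern is roughly as follows; this is a sketch using only the pre-deletion API calls exercised above, with `state`, `row_desc`, `client`, `row`, and `batch` assumed to come from the surrounding context:

    BufferedTupleStream2 stream(state, row_desc, state->block_mgr2(), client,
                                true /* use_initial_small_buffers */, false /* read_write */);
    Status status = stream.init(-1, nullptr, true);
    // add_row() returns false (with an OK Status) while the stream is still on
    // small buffers; switch to full I/O-sized buffers and retry once.
    if (!stream.add_row(row, &status) && status.ok()) {
        bool got_buffer = false;
        status = stream.switch_to_io_buffers(&got_buffer);
        if (status.ok() && got_buffer) stream.add_row(row, &status);
    }
    // Reading starts with prepare_for_read(); delete_on_read == false allows
    // the stream to be re-read from the beginning later.
    status = stream.prepare_for_read(false /* delete_on_read */);
    bool eos = false;
    while (status.ok() && !eos) {
        batch.reset();
        status = stream.get_next(&batch, &eos);
        // ... consume the batch ...
    }
    stream.close();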
-// Tests with multiple NULLable tuples per row.
-class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples;
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(true);
-        nullable_tuples.push_back(true);
-
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        tuple_ids.push_back(static_cast<TTupleId>(2));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-
-#if 0
-// Tests with collection types.
-class ArrayTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    RowDescriptor* _array_desc;
-
-    virtual void create_descriptors() {
-        // tuples: (array<string>, array<array<int>>) (array<int>)
-        std::vector<bool> nullable_tuples(2, true);
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        TypeDescriptor string_array_type;
-        string_array_type.type = TYPE_ARRAY;
-        string_array_type.children.push_back(TYPE_VARCHAR);
-
-        TypeDescriptor int_array_type;
-        int_array_type.type = TYPE_ARRAY;
-        int_array_type.children.push_back(TYPE_VARCHAR);
-
-        TypeDescriptor nested_array_type;
-        nested_array_type.type = TYPE_ARRAY;
-        nested_array_type.children.push_back(int_array_type);
-
-        DescriptorTblBuilder builder(&_pool);
-        builder.declare_tuple() << string_array_type << nested_array_type;
-        builder.declare_tuple() << int_array_type;
-        _array_desc = _pool.add(new RowDescriptor(
-                *builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-#endif
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleTupleStreamTest, Basic) {
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// #if 0
-// Test with only 1 buffer.
-TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-}
-
-// Test with a few buffers.
-TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPin) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(3 * buffer_size, buffer_size);
-
-    BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(), _client,
-                                true, false);
-    Status status = stream.init(-1, nullptr, true);
-    EXPECT_TRUE(status.ok());
-
-    int offset = 0;
-    bool full = false;
-    while (!full) {
-        RowBatch* batch = CreateIntBatch(offset, BATCH_SIZE, false);
-        int j = 0;
-        for (; j < batch->num_rows(); ++j) {
-            full = !stream.add_row(batch->get_row(j), &status);
-            EXPECT_TRUE(status.ok());
-            if (full) {
-                break;
-            }
-        }
-        offset += j;
-    }
-
-    status = stream.unpin_stream();
-    EXPECT_TRUE(status.ok());
-
-    bool pinned = false;
-    status = stream.pin_stream(false, &pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-
-    std::vector<int> results;
-
-    // Read and verify the result a few times. We should be able to reread the stream
-    // if we don't use delete-on-read mode.
-    int read_iters = 3;
-    for (int i = 0; i < read_iters; ++i) {
-        bool delete_on_read = i == read_iters - 1;
-        status = stream.prepare_for_read(delete_on_read);
-        EXPECT_TRUE(status.ok());
-        results.clear();
-        ReadValues(&stream, _int_desc, &results);
-        VerifyResults(results, offset, false);
-    }
-
-    // After delete_on_read, all blocks aside from the last should be deleted.
-    // Note: this should really be 0, but the BufferedTupleStream2 returns eos before
-    // deleting the last block, rather than after, so the last block isn't deleted
-    // until the stream is closed.
-    DCHECK_EQ(stream.bytes_in_mem(false), buffer_size);
-
-    stream.close();
-
-    DCHECK_EQ(stream.bytes_in_mem(false), 0);
-}
-
-TEST_F(SimpleTupleStreamTest, SmallBuffers) {
-    int buffer_size = 8 * 1024 * 1024;
-    InitBlockMgr(2 * buffer_size, buffer_size);
-
-    BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(), _client,
-                                true, false);
-    Status status = stream.init(-1, nullptr, false);
-    EXPECT_TRUE(status.ok());
-
-    // The initial buffer should be small.
-    EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-
-    RowBatch* batch = CreateIntBatch(0, 1024, false);
-    for (int i = 0; i < batch->num_rows(); ++i) {
-        bool ret = stream.add_row(batch->get_row(i), &status);
-        EXPECT_TRUE(ret);
-        EXPECT_TRUE(status.ok());
-    }
-    EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-    EXPECT_LT(stream.byte_size(), buffer_size);
-    EXPECT_TRUE(stream.using_small_buffers());
-
-    // 40 MB of ints.
-    batch = CreateIntBatch(0, 10 * 1024 * 1024, false);
-    for (int i = 0; i < batch->num_rows(); ++i) {
-        bool ret = stream.add_row(batch->get_row(i), &status);
-        EXPECT_TRUE(status.ok());
-        if (!ret) {
-            EXPECT_TRUE(stream.using_small_buffers());
-            bool got_buffer;
-            status = stream.switch_to_io_buffers(&got_buffer);
-            EXPECT_TRUE(status.ok());
-            EXPECT_TRUE(got_buffer);
-            ret = stream.add_row(batch->get_row(i), &status);
-            EXPECT_TRUE(status.ok());
-        }
-        EXPECT_TRUE(ret);
-    }
-    EXPECT_EQ(stream.bytes_in_mem(false), buffer_size);
-
-    // TODO: Test for IMPALA-2330. In case switch_to_io_buffers() fails to get a buffer,
-    // using_small_buffers() should still return true.
-    stream.close();
-}
-
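UnpinPin and SmallBuffers above are the whole-stream counterparts of the per-block tests earlier in this patch: unpin_stream() lets the stream spill, pin_stream() brings it fully back into memory, and prepare_for_read() decides whether blocks are recycled as they are consumed. In outline (same pre-deletion API; `stream` as constructed in the tests above):

    // Let the stream spill its blocks to scratch files.
    EXPECT_TRUE(stream.unpin_stream().ok());
    // Bring the whole stream back into memory; as with Block::pin(), failure
    // to get buffers is reported through *pinned, not through the Status.
    bool pinned = false;
    EXPECT_TRUE(stream.pin_stream(false, &pinned).ok());
    // With delete_on_read == false the stream can be re-read from the start;
    // with true, each block is freed as soon as it has been returned.
    EXPECT_TRUE(stream.prepare_for_read(false /* delete_on_read */).ok());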
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleNullStreamTest, Basic) {
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-    TestValues<int>(100, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-    TestValues<StringValue>(100, _string_desc, true);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// Test the tuple stream with only 1 buffer and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-}
-
-// Test with a few buffers and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// Test with rows with multiple nullable tuples.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-}
-
-// Test with a few buffers.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-    TestValues<int>(100, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-    TestValues<StringValue>(100, _string_desc, true);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-// #endif
-
-#if 0
-// Test that deep copy works with arrays by copying into a BufferedTupleStream2,
-// freeing the original rows, then reading back the rows and verifying the contents.
-TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
-    Status status;
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    const int NUM_ROWS = 4000;
-    BufferedTupleStream2 stream(_runtime_state, *_array_desc, _runtime_state->block_mgr2(),
-                                _client, false, false);
-    const std::vector<TupleDescriptor*>& tuple_descs = _array_desc->tuple_descriptors();
-    // Write out a predictable pattern of data by iterating over arrays of constants.
-    int strings_index = 0; // we take the mod of this as an index into STRINGS.
-    int array_lens[] = {0, 1, 5, 10, 1000, 2, 49, 20};
-    int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
-    int array_len_index = 0;
-    for (int i = 0; i < NUM_ROWS; ++i) {
-        int expected_row_size = tuple_descs[0]->byte_size() + tuple_descs[1]->byte_size();
-        // gscoped_ptr<TupleRow> row(reinterpret_cast<TupleRow*>(
-        //         malloc(tuple_descs.size() * sizeof(Tuple*))));
-        // gscoped_ptr<Tuple> tuple0(reinterpret_cast<Tuple*>(
-        //         malloc(tuple_descs[0]->byte_size())));
-        // gscoped_ptr<Tuple> tuple1(reinterpret_cast<Tuple*>(
-        //         malloc(tuple_descs[1]->byte_size())));
-        std::unique_ptr<TupleRow> row(reinterpret_cast<TupleRow*>(
-                malloc(tuple_descs.size() * sizeof(Tuple*))));
-        std::unique_ptr<Tuple> tuple0(reinterpret_cast<Tuple*>(
-                malloc(tuple_descs[0]->byte_size())));
-        std::unique_ptr<Tuple> tuple1(reinterpret_cast<Tuple*>(
-                malloc(tuple_descs[1]->byte_size())));
-        memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-        memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-        row->set_tuple(0, tuple0.get());
-        row->set_tuple(1, tuple1.get());
-
-        // Only the array is non-null.
-        tuple0->set_null(tuple_descs[0]->slots()[1]->null_indicator_offset());
-        tuple1->set_null(tuple_descs[1]->slots()[0]->null_indicator_offset());
-        const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-        const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-
-        int array_len = array_lens[array_len_index++ % num_array_lens];
-        CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-        cv->ptr = nullptr;
-        cv->num_tuples = 0;
-        CollectionValueBuilder builder(cv, *item_desc, _mem_pool.get(), array_len);
-        Tuple* array_data;
-        builder.GetFreeMemory(&array_data);
-        expected_row_size += item_desc->byte_size() * array_len;
-
-        // Fill the array with pointers to our constant strings.
-        for (int j = 0; j < array_len; ++j) {
-            const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
-            array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-            RawValue::Write(string, array_data, item_desc->slots()[0], _mem_pool.get());
-            array_data += item_desc->byte_size();
-            expected_row_size += string->len;
-        }
-        builder.CommitTuples(array_len);
-
-        // Check that the internal row size computation gives the correct result.
-        EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-        bool b = stream.add_row(row.get(), &status);
-        EXPECT_TRUE(b);
-        EXPECT_TRUE(status.ok());
-        _mem_pool->FreeAll(); // Free data as soon as possible to smoke out issues.
-    }
-
-    // Read back and verify the data.
-    stream.prepare_for_read(false);
-    strings_index = 0;
-    array_len_index = 0;
-    bool eos = false;
-    int rows_read = 0;
-    RowBatch batch(*_array_desc, BATCH_SIZE);
-    do {
-        batch.reset();
-        EXPECT_TRUE(stream.get_next(&batch, &eos).ok());
-        for (int i = 0; i < batch.num_rows(); ++i) {
-            TupleRow* row = batch.GetRow(i);
-            Tuple* tuple0 = row->get_tuple(0);
-            Tuple* tuple1 = row->get_tuple(1);
-            EXPECT_TRUE(tuple0 != nullptr);
-            EXPECT_TRUE(tuple1 != nullptr);
-            const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-            EXPECT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
-            EXPECT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
-            EXPECT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
-
-            const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-            int expected_array_len = array_lens[array_len_index++ % num_array_lens];
-            CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-            EXPECT_EQ(expected_array_len, cv->num_tuples);
-            for (int j = 0; j < cv->num_tuples; ++j) {
-                Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
-                const SlotDescriptor* string_desc = item_desc->slots()[0];
-                EXPECT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
-                const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
-                const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
-                EXPECT_EQ(*expected, *actual);
-            }
-        }
-        rows_read += batch.num_rows();
-    } while (!eos);
-    EXPECT_EQ(NUM_ROWS, rows_read);
-}
-#endif
-
-// TODO: more tests.
-// - The stream can operate in many modes
-
-} // namespace doris
diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc
index fb8dcc68fe..36c040ca5d 100644
--- a/be/test/runtime/test_env.cc
+++ b/be/test/runtime/test_env.cc
@@ -76,34 +76,6 @@ RuntimeState* TestEnv::create_runtime_state(int64_t query_id) {
     return new RuntimeState(plan_params.params, TQueryOptions(), TQueryGlobals(), _exec_env);
 }
 
-Status TestEnv::create_query_state(int64_t query_id, int max_buffers, int block_size,
-                                   RuntimeState** runtime_state) {
-    *runtime_state = create_runtime_state(query_id);
-    if (*runtime_state == nullptr) {
-        return Status::InternalError("Unexpected error creating RuntimeState");
-    }
-
-    std::shared_ptr<BufferedBlockMgr2> mgr;
-    RETURN_IF_ERROR(BufferedBlockMgr2::create(*runtime_state, (*runtime_state)->runtime_profile(),
-                                              _tmp_file_mgr.get(), block_size, &mgr));
-    (*runtime_state)->set_block_mgr2(mgr);
-    // (*runtime_state)->_block_mgr = mgr;
-
-    _query_states.push_back(std::shared_ptr<RuntimeState>(*runtime_state));
-    return Status::OK();
-}
-
-Status TestEnv::create_query_states(int64_t start_query_id, int num_mgrs, int buffers_per_mgr,
-                                    int block_size, std::vector<RuntimeState*>* runtime_states) {
-    for (int i = 0; i < num_mgrs; ++i) {
-        RuntimeState* runtime_state = nullptr;
-        RETURN_IF_ERROR(create_query_state(start_query_id + i, buffers_per_mgr, block_size,
-                                           &runtime_state));
-        runtime_states->push_back(runtime_state);
-    }
-    return Status::OK();
-}
-
 void TestEnv::tear_down_query_states() {
     _query_states.clear();
 }
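The deleted create_query_state() helper was the glue between the two test files above and the block manager: it created a TestEnv-owned RuntimeState and attached a fresh BufferedBlockMgr2 to it. Its usage, as exercised by InitBlockMgr() in the deleted tuple-stream test, was roughly this (a sketch of the pre-deletion API, not current code):

    TestEnv test_env;
    RuntimeState* state = nullptr;
    // query_id 0, no memory limit, 8 MB blocks; the RuntimeState stays owned
    // by the TestEnv and is torn down via tear_down_query_states().
    EXPECT_TRUE(test_env.create_query_state(0, -1, 8 * 1024 * 1024, &state).ok());
    BufferedBlockMgr2::Client* client = nullptr;
    EXPECT_TRUE(state->block_mgr2()->register_client(0, state, &client).ok());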
"runtime/runtime_state.h" +#include "runtime/tmp_file_mgr.h" namespace doris { @@ -42,16 +42,6 @@ public: // If don't need to open, paths can be empty. void init_storage_engine(bool need_open, const std::vector& paths = {}); - // Create a RuntimeState for a query with a new block manager. The RuntimeState is - // owned by the TestEnv. - Status create_query_state(int64_t query_id, int max_buffers, int block_size, - RuntimeState** runtime_state); - - // Create multiple separate RuntimeStates with associated block managers, e.g. as if - // multiple queries were executing. The RuntimeStates are owned by TestEnv. - Status create_query_states(int64_t start_query_id, int num_mgrs, int buffers_per_mgr, - int block_size, std::vector* runtime_states); - // Destroy all RuntimeStates and block managers created by this TestEnv. void tear_down_query_states();