From b085ff49f0e36309d0bb15b25f1a27370e38854f Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 23 Dec 2022 14:10:47 +0800 Subject: [PATCH] [refactor](non-vec) delete non-vec data sink (#15283) * [refactor](non-vec) delete non-vec data sink Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- be/src/common/status.h | 3 + be/src/exec/CMakeLists.txt | 2 +- be/src/exec/base_scanner.h | 4 +- be/src/exec/data_sink.cpp | 22 +- be/src/exec/data_sink.h | 4 - be/src/exec/exec_node.cpp | 50 +- be/src/exec/exec_node.h | 1 - be/src/exec/parquet_scanner.cpp | 140 ++ be/src/exec/parquet_scanner.h | 84 + be/src/exec/tablet_sink.cpp | 1497 ----------------- be/src/exec/tablet_sink.h | 583 ------- be/src/olap/push_handler.cpp | 8 +- be/src/runtime/CMakeLists.txt | 14 - be/src/runtime/buffered_block_mgr2.cc | 1216 ------------- be/src/runtime/buffered_block_mgr2.h | 614 ------- be/src/runtime/buffered_tuple_stream2.cc | 805 --------- be/src/runtime/buffered_tuple_stream2.h | 412 ----- .../runtime/buffered_tuple_stream2.inline.h | 90 - be/src/runtime/buffered_tuple_stream3.cc | 867 ---------- be/src/runtime/buffered_tuple_stream3.h | 647 ------- .../runtime/buffered_tuple_stream3.inline.h | 55 - be/src/runtime/export_sink.cpp | 276 --- be/src/runtime/export_sink.h | 83 - be/src/runtime/memory_scratch_sink.cpp | 97 -- be/src/runtime/memory_scratch_sink.h | 82 - be/src/runtime/mysql_result_writer.cpp | 282 ---- be/src/runtime/mysql_result_writer.h | 79 - be/src/runtime/mysql_table_sink.cpp | 86 - be/src/runtime/mysql_table_sink.h | 73 - be/src/runtime/mysql_table_writer.cpp | 182 -- be/src/runtime/mysql_table_writer.h | 68 - be/src/runtime/odbc_table_sink.cpp | 105 -- be/src/runtime/odbc_table_sink.h | 74 - be/src/runtime/plan_fragment_executor.cpp | 134 +- be/src/runtime/plan_fragment_executor.h | 10 - be/src/runtime/result_writer.h | 6 - be/src/runtime/row_batch.cpp | 48 - be/src/runtime/row_batch.h | 17 +- be/src/runtime/runtime_state.cpp | 9 - be/src/runtime/runtime_state.h | 3 - be/src/runtime/sorter.h | 53 - be/src/vec/exec/join/vhash_join_node.cpp | 4 - be/src/vec/exec/join/vhash_join_node.h | 1 - be/src/vec/exec/join/vnested_loop_join_node.h | 4 - be/src/vec/exec/scan/vscan_node.h | 4 - be/src/vec/exec/vaggregation_node.cpp | 4 - be/src/vec/exec/vaggregation_node.h | 1 - be/src/vec/exec/vanalytic_eval_node.cpp | 4 - be/src/vec/exec/vanalytic_eval_node.h | 1 - be/src/vec/exec/varrow_scanner.h | 5 - be/src/vec/exec/vassert_num_rows_node.h | 4 - be/src/vec/exec/vbroker_scan_node.h | 5 - be/src/vec/exec/vbroker_scanner.h | 5 - be/src/vec/exec/vdata_gen_scan_node.cpp | 5 - be/src/vec/exec/vdata_gen_scan_node.h | 4 - be/src/vec/exec/vempty_set_node.h | 3 - be/src/vec/exec/vexchange_node.cpp | 3 - be/src/vec/exec/vexchange_node.h | 1 - be/src/vec/exec/vjson_scanner.h | 4 - be/src/vec/exec/vmysql_scan_node.h | 3 - be/src/vec/exec/vschema_scan_node.h | 3 - be/src/vec/exec/vselect_node.cpp | 4 - be/src/vec/exec/vselect_node.h | 1 - be/src/vec/exec/vset_operation_node.h | 3 - be/src/vec/exec/vsort_node.cpp | 5 - be/src/vec/exec/vsort_node.h | 24 +- be/src/vec/exec/vunion_node.h | 3 - be/src/vec/runtime/vfile_result_writer.h | 3 - be/src/vec/runtime/vsorted_run_merger.cpp | 1 - be/src/vec/sink/vdata_stream_sender.cpp | 4 - be/src/vec/sink/vdata_stream_sender.h | 1 - be/src/vec/sink/vmysql_result_writer.cpp | 4 - be/src/vec/sink/vmysql_result_writer.h | 2 - be/src/vec/sink/vmysql_table_writer.cpp | 8 + be/src/vec/sink/vmysql_table_writer.h | 12 +- 
be/src/vec/sink/vresult_file_sink.cpp | 4 - be/src/vec/sink/vresult_file_sink.h | 1 - be/src/vec/sink/vresult_sink.cpp | 4 - be/src/vec/sink/vresult_sink.h | 2 - be/src/vec/sink/vtable_sink.cpp | 5 - be/src/vec/sink/vtable_sink.h | 2 - be/src/vec/sink/vtablet_sink.cpp | 785 ++++++++- be/src/vec/sink/vtablet_sink.h | 459 ++++- be/test/CMakeLists.txt | 3 - be/test/runtime/buffered_block_mgr2_test.cpp | 1246 -------------- .../runtime/buffered_tuple_stream2_test.cpp | 821 --------- be/test/runtime/test_env.cc | 28 - be/test/runtime/test_env.h | 12 +- 88 files changed, 1441 insertions(+), 10959 deletions(-) create mode 100644 be/src/exec/parquet_scanner.cpp create mode 100644 be/src/exec/parquet_scanner.h delete mode 100644 be/src/exec/tablet_sink.cpp delete mode 100644 be/src/exec/tablet_sink.h delete mode 100644 be/src/runtime/buffered_block_mgr2.cc delete mode 100644 be/src/runtime/buffered_block_mgr2.h delete mode 100644 be/src/runtime/buffered_tuple_stream2.cc delete mode 100644 be/src/runtime/buffered_tuple_stream2.h delete mode 100644 be/src/runtime/buffered_tuple_stream2.inline.h delete mode 100644 be/src/runtime/buffered_tuple_stream3.cc delete mode 100644 be/src/runtime/buffered_tuple_stream3.h delete mode 100644 be/src/runtime/buffered_tuple_stream3.inline.h delete mode 100644 be/src/runtime/export_sink.cpp delete mode 100644 be/src/runtime/export_sink.h delete mode 100644 be/src/runtime/memory_scratch_sink.cpp delete mode 100644 be/src/runtime/memory_scratch_sink.h delete mode 100644 be/src/runtime/mysql_result_writer.cpp delete mode 100644 be/src/runtime/mysql_result_writer.h delete mode 100644 be/src/runtime/mysql_table_sink.cpp delete mode 100644 be/src/runtime/mysql_table_sink.h delete mode 100644 be/src/runtime/mysql_table_writer.cpp delete mode 100644 be/src/runtime/mysql_table_writer.h delete mode 100644 be/src/runtime/odbc_table_sink.cpp delete mode 100644 be/src/runtime/odbc_table_sink.h delete mode 100644 be/src/runtime/sorter.h delete mode 100644 be/test/runtime/buffered_block_mgr2_test.cpp delete mode 100644 be/test/runtime/buffered_tuple_stream2_test.cpp diff --git a/be/src/common/status.h b/be/src/common/status.h index 0970447a45..cd514305b3 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -488,6 +488,9 @@ inline std::string Status::to_string() const { } \ } while (false) +#define RETURN_ERROR_IF_NON_VEC \ + return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + // End _get_next_span after last call to get_next method #define RETURN_IF_ERROR_AND_CHECK_SPAN(stmt, get_next_span, done) \ do { \ diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt index 3684b005d5..04580ba622 100644 --- a/be/src/exec/CMakeLists.txt +++ b/be/src/exec/CMakeLists.txt @@ -34,7 +34,6 @@ set(EXEC_FILES text_converter.cpp olap_common.cpp tablet_info.cpp - tablet_sink.cpp plain_binary_line_reader.cpp plain_text_line_reader.cpp es/es_predicate.cpp @@ -64,6 +63,7 @@ set(EXEC_FILES odbc_connector.cpp table_connector.cpp schema_scanner.cpp + parquet_scanner.cpp ) if (WITH_MYSQL) set(EXEC_FILES diff --git a/be/src/exec/base_scanner.h b/be/src/exec/base_scanner.h index c6bcde2f67..f00bfe3641 100644 --- a/be/src/exec/base_scanner.h +++ b/be/src/exec/base_scanner.h @@ -72,7 +72,9 @@ public: virtual Status open(); // Get next tuple - virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) = 0; + virtual Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) { + return 
Status::NotSupported("Not Implemented get block"); + } // Get next block virtual Status get_next(vectorized::Block* block, bool* eof) { diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index ecb9329f6b..6c9068744d 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -25,7 +25,6 @@ #include #include "gen_cpp/PaloInternalService_types.h" -#include "runtime/memory_scratch_sink.h" #include "runtime/runtime_state.h" #include "vec/sink/vdata_stream_sender.h" #include "vec/sink/vjdbc_table_sink.h" @@ -59,7 +58,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink state, pool, params.sender_id, row_desc, thrift_sink.stream_sink, params.destinations, 16 * 1024, send_query_statistics_with_every_batch); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } // RETURN_IF_ERROR(sender->prepare(state->obj_pool(), thrift_sink.stream_sink)); sink->reset(tmp_sink); @@ -75,7 +74,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink tmp_sink = new doris::vectorized::VResultSink(row_desc, output_exprs, thrift_sink.result_sink, 4096); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } sink->reset(tmp_sink); break; @@ -103,19 +102,14 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink send_query_statistics_with_every_batch, output_exprs); } } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } sink->reset(tmp_sink); break; } case TDataSinkType::MEMORY_SCRATCH_SINK: { - if (!thrift_sink.__isset.memory_scratch_sink) { - return Status::InternalError("Missing data buffer sink."); - } - - tmp_sink = new MemoryScratchSink(row_desc, output_exprs, thrift_sink.memory_scratch_sink); - sink->reset(tmp_sink); + RETURN_ERROR_IF_NON_VEC; break; } case TDataSinkType::MYSQL_TABLE_SINK: { @@ -128,7 +122,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink new doris::vectorized::VMysqlTableSink(pool, row_desc, output_exprs); sink->reset(vmysql_tbl_sink); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } break; #else @@ -143,7 +137,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink if (state->enable_vectorized_exec()) { sink->reset(new vectorized::VOdbcTableSink(pool, row_desc, output_exprs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } break; } @@ -167,7 +161,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink } case TDataSinkType::EXPORT_SINK: { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; break; } case TDataSinkType::OLAP_TABLE_SINK: { @@ -176,7 +170,7 @@ Status DataSink::create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink if (state->enable_vectorized_exec()) { sink->reset(new stream_load::VOlapTableSink(pool, row_desc, output_exprs, &status)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } RETURN_IF_ERROR(status); break; diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index 9f21bcf4a1..299e1c5376 100644 --- 
a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -57,10 +57,6 @@ public: // Setup. Call before send() or close(). virtual Status open(RuntimeState* state) = 0; - // Send a row batch into this sink. - // eos should be true when the last batch is passed to send() - virtual Status send(RuntimeState* state, RowBatch* batch) = 0; - // Send a Block into this sink. virtual Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) { return Status::NotSupported("Not support send block"); diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 7d35db7ad9..616bf9f5e0 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -427,7 +427,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VMysqlScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); #else @@ -438,7 +438,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewOdbcScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -460,7 +460,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewEsScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -468,7 +468,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VSchemaScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -476,7 +476,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewOlapScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -484,7 +484,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::AggregationNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -499,7 +499,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN } *node = pool->add(new vectorized::HashJoinNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -507,7 +507,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VNestedLoopJoinNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -515,7 
+515,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VEmptySetNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -523,7 +523,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new doris::vectorized::VExchangeNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -531,7 +531,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new doris::vectorized::VSelectNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -539,7 +539,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VSortNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -547,18 +547,18 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VAnalyticEvalNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); case TPlanNodeType::MERGE_NODE: - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; case TPlanNodeType::UNION_NODE: if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VUnionNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -566,7 +566,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VIntersectNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -574,7 +574,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VExceptNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -582,7 +582,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VBrokerScanNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -590,7 +590,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::NewFileScanNode(pool, tnode, descs)); } else { - return 
Status::InternalError("Not support file scan node in non-vec engine"); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -598,7 +598,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VRepeatNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -606,7 +606,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VAssertNumRowsNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -614,7 +614,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN if (state->enable_vectorized_exec()) { *node = pool->add(new vectorized::VTableFunctionNode(pool, tnode, descs)); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } return Status::OK(); @@ -623,7 +623,7 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN *node = pool->add(new vectorized::VDataGenFunctionScanNode(pool, tnode, descs)); return Status::OK(); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } default: @@ -781,10 +781,6 @@ Status ExecNode::QueryMaintenance(RuntimeState* state, const std::string& msg) { return state->check_query_state(msg); } -Status ExecNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented get batch"); -} - Status ExecNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { return Status::NotSupported("Not Implemented get block"); } diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index 1b089b4788..f5af72ac61 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -109,7 +109,6 @@ public: // row_batch's tuple_data_pool. // Caller must not be holding any io buffers. This will cause deadlock. // TODO: AggregationNode and HashJoinNode cannot be "re-opened" yet. - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos); virtual Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos); // new interface to compatible new optimizers in FE Status get_next_after_projects( diff --git a/be/src/exec/parquet_scanner.cpp b/be/src/exec/parquet_scanner.cpp new file mode 100644 index 0000000000..074f7d35a7 --- /dev/null +++ b/be/src/exec/parquet_scanner.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exec/parquet_scanner.h" + +#include "exec/arrow/parquet_reader.h" +#include "io/file_factory.h" +#include "runtime/descriptors.h" +#include "runtime/exec_env.h" +#include "runtime/stream_load/stream_load_pipe.h" + +namespace doris { +using namespace ErrorCode; + +ParquetScanner::ParquetScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, + const std::vector<TBrokerRangeDesc>& ranges, + const std::vector<TNetworkAddress>& broker_addresses, + const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter) + : BaseScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter), + // _splittable(params.splittable), + _cur_file_reader(nullptr), + _cur_file_eof(false) {} + +ParquetScanner::~ParquetScanner() { + close(); +} + +Status ParquetScanner::open() { + return BaseScanner::open(); +} + +Status ParquetScanner::get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) { + SCOPED_TIMER(_read_timer); + // Get one row + while (!_scanner_eof) { + if (_cur_file_reader == nullptr || _cur_file_eof) { + RETURN_IF_ERROR(open_next_reader()); + // If there is no more reader, _scanner_eof is set and the loop exits + if (_scanner_eof) { + continue; + } + _cur_file_eof = false; + } + RETURN_IF_ERROR(_cur_file_reader->read(_src_tuple, tuple_pool, &_cur_file_eof)); + // range of current file + const TBrokerRangeDesc& range = _ranges.at(_next_range - 1); + if (range.__isset.num_of_columns_from_file) { + fill_slots_of_columns_from_path(range.num_of_columns_from_file, + range.columns_from_path); + } + + COUNTER_UPDATE(_rows_read_counter, 1); + SCOPED_TIMER(_materialize_timer); + RETURN_IF_ERROR(fill_dest_tuple(tuple, tuple_pool, fill_tuple)); + break; // break always + } + + *eof = _scanner_eof; + return Status::OK(); +} + +Status ParquetScanner::open_next_reader() { + // open_file_reader + if (_cur_file_reader != nullptr) { + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } + } + + while (true) { + if (_next_range >= _ranges.size()) { + _scanner_eof = true; + return Status::OK(); + } + const TBrokerRangeDesc& range = _ranges[_next_range++]; + std::unique_ptr<FileReader> file_reader; + RETURN_IF_ERROR(FileFactory::create_file_reader( + range.file_type, _state->exec_env(), _profile, _broker_addresses, + _params.properties, range, range.start_offset, file_reader)); + RETURN_IF_ERROR(file_reader->open()); + + if (file_reader->size() == 0) { + file_reader->close(); + continue; + } + int32_t num_of_columns_from_file = _src_slot_descs.size(); + if (range.__isset.num_of_columns_from_file) { + num_of_columns_from_file = range.num_of_columns_from_file; + } + _cur_file_reader = new ParquetReaderWrap(_state, _src_slot_descs, file_reader.release(), + num_of_columns_from_file, 0, 0); + auto tuple_desc = _state->desc_tbl().get_tuple_descriptor(_tupleId); + Status status = + _cur_file_reader->init_reader(tuple_desc, _conjunct_ctxs, _state->timezone()); + if (status.is<END_OF_FILE>()) { + continue; + } else { + if (!status.ok()) { + return Status::InternalError("file: {}, error:{}", range.path, status.to_string()); + } else { + RETURN_IF_ERROR(_cur_file_reader->init_parquet_type()); + return status; + } + } + } +} + +void ParquetScanner::close() { + BaseScanner::close(); + if (_cur_file_reader != nullptr) { + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } + } +} + +} // namespace doris
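// A minimal sketch of the multi-file read loop implemented by
// ParquetScanner::get_next()/open_next_reader() above: keep one current
// reader, and when it reports end-of-file, advance to the next range until
// every range is exhausted. All types here are hypothetical stand-ins for
// illustration, not Doris classes.
#include <cstddef>
#include <vector>

struct ToyReader {
    int rows_left = 2; // toy data: each "file" yields two rows
    bool read_row(bool* eof) {
        *eof = (rows_left <= 0);
        if (!*eof) --rows_left;
        return !*eof;
    }
};

bool get_next_row(std::vector<ToyReader>& files, std::size_t& next_range,
                  ToyReader*& cur, bool& cur_eof, bool& scanner_eof) {
    while (!scanner_eof) {
        if (cur == nullptr || cur_eof) {
            if (next_range >= files.size()) {
                scanner_eof = true; // no more ranges: overall EOF
                continue;
            }
            cur = &files[next_range++]; // mirrors open_next_reader()
            cur_eof = false;
        }
        if (cur->read_row(&cur_eof)) {
            return true; // produced one row, like the "break always" above
        }
        // cur_eof is now true; loop around and open the next reader
    }
    return false; // overall EOF
}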
diff --git a/be/src/exec/parquet_scanner.h b/be/src/exec/parquet_scanner.h new file mode 100644 index 0000000000..3c0ca48eae --- /dev/null +++ b/be/src/exec/parquet_scanner.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "exec/base_scanner.h" +#include "gen_cpp/PlanNodes_types.h" +#include "gen_cpp/Types_types.h" +#include "runtime/mem_pool.h" +#include "util/runtime_profile.h" +#include "util/slice.h" + +namespace doris { + +class Tuple; +class SlotDescriptor; +struct Slice; +class ParquetReaderWrap; +class RuntimeState; +class ExprContext; +class TupleDescriptor; +class TupleRow; +class RowDescriptor; +class RuntimeProfile; +class StreamLoadPipe; + +// Parquet scanner converts the data read from Parquet files into Doris tuples. +class ParquetScanner : public BaseScanner { +public: + ParquetScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, + const std::vector<TBrokerRangeDesc>& ranges, + const std::vector<TNetworkAddress>& broker_addresses, + const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter); + + ~ParquetScanner() override; + + // Open this scanner; initializes the information needed to read + Status open() override; + + // Get next tuple + Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool* fill_tuple) override; + + Status get_next(vectorized::Block* block, bool* eof) override { + return Status::NotSupported("Not Implemented get block"); + } + + // Close this scanner + void close() override; + +protected: + // Open the reader for the next range + Status open_next_reader(); + + // Reader + ParquetReaderWrap* _cur_file_reader; + bool _cur_file_eof; // whether the current file has been fully read + + // used to hold current StreamLoadPipe + std::shared_ptr<StreamLoadPipe> _stream_load_pipe; +}; + +} // namespace doris diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp deleted file mode 100644 index 106623fa93..0000000000 --- a/be/src/exec/tablet_sink.cpp +++ /dev/null @@ -1,1497 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "exec/tablet_sink.h" - -#include - -#include -#include -#include - -#include "exprs/expr.h" -#include "exprs/expr_context.h" -#include "olap/hll.h" -#include "runtime/exec_env.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/thread_context.h" -#include "runtime/tuple_row.h" -#include "service/backend_options.h" -#include "util/brpc_client_cache.h" -#include "util/debug/sanitizer_scopes.h" -#include "util/defer_op.h" -#include "util/proto_util.h" -#include "util/threadpool.h" -#include "util/time.h" -#include "util/uid_util.h" -#include "vec/sink/vtablet_sink.h" - -namespace doris { -namespace stream_load { - -NodeChannel::NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id) - : _parent(parent), _index_channel(index_channel), _node_id(node_id) { - _node_channel_tracker = std::make_shared(fmt::format( - "NodeChannel:indexID={}:threadId={}", std::to_string(_index_channel->_index_id), - thread_context()->get_thread_id())); -} - -NodeChannel::~NodeChannel() noexcept { - if (_open_closure != nullptr) { - if (_open_closure->unref()) { - delete _open_closure; - } - _open_closure = nullptr; - } - if (_add_batch_closure != nullptr) { - // it's safe to delete, but may take some time to wait until brpc joined - delete _add_batch_closure; - _add_batch_closure = nullptr; - } - if (!_is_vectorized) { - _cur_add_batch_request.release_id(); - } -} - -// if "_cancelled" is set to true, -// no need to set _cancel_msg because the error will be -// returned directly via "TabletSink::prepare()" method. 
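// A minimal sketch (types hypothetical) of the ref-counting protocol behind
// _open_closure in the deleted code above: the NodeChannel and the in-flight
// brpc call each hold one reference, and whichever side drops the last
// reference deletes the closure, so destruction is safe regardless of which
// of the two finishes first.
#include <atomic>

template <typename Result>
class RefCountedClosureSketch {
public:
    void ref() { _refs.fetch_add(1, std::memory_order_relaxed); }
    // Returns true when the caller released the last reference and must
    // delete the object, mirroring `if (_open_closure->unref()) delete ...`.
    bool unref() { return _refs.fetch_sub(1, std::memory_order_acq_rel) == 1; }

    Result result; // filled in by the RPC framework before done->Run()

private:
    std::atomic<int> _refs {0};
};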
-Status NodeChannel::init(RuntimeState* state) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - _tuple_desc = _parent->_output_tuple_desc; - _state = state; - auto node = _parent->_nodes_info->find_node(_node_id); - if (node == nullptr) { - _cancelled = true; - return Status::InternalError("unknown node id, id={}", _node_id); - } - - _node_info = *node; - - _load_info = "load_id=" + print_id(_parent->_load_id) + - ", txn_id=" + std::to_string(_parent->_txn_id); - - _row_desc.reset(new RowDescriptor(_tuple_desc, false)); - _batch_size = state->batch_size(); - - _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, - _node_info.brpc_port); - if (_stub == nullptr) { - LOG(WARNING) << "Get rpc stub failed, host=" << _node_info.host - << ", port=" << _node_info.brpc_port << ", " << channel_info(); - _cancelled = true; - return Status::InternalError("get rpc stub failed"); - } - - if (!_is_vectorized) { - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - - // Initialize _cur_add_batch_request - _cur_add_batch_request.set_allocated_id(&_parent->_load_id); - _cur_add_batch_request.set_index_id(_index_channel->_index_id); - _cur_add_batch_request.set_sender_id(_parent->_sender_id); - _cur_add_batch_request.set_backend_id(_node_id); - _cur_add_batch_request.set_eos(false); - - _name = fmt::format("NodeChannel[{}-{}]", _index_channel->_index_id, _node_id); - } - - _rpc_timeout_ms = state->query_options().query_timeout * 1000; - _timeout_watch.start(); - - return Status::OK(); -} - -void NodeChannel::open() { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - PTabletWriterOpenRequest request; - request.set_allocated_id(&_parent->_load_id); - request.set_index_id(_index_channel->_index_id); - request.set_txn_id(_parent->_txn_id); - request.set_allocated_schema(_parent->_schema->to_protobuf()); - for (auto& tablet : _all_tablets) { - auto ptablet = request.add_tablets(); - ptablet->set_partition_id(tablet.partition_id); - ptablet->set_tablet_id(tablet.tablet_id); - } - request.set_num_senders(_parent->_num_senders); - request.set_need_gen_rollup(false); // Useless but it is a required field in pb - request.set_load_mem_limit(_parent->_load_mem_limit); - request.set_load_channel_timeout_s(_parent->_load_channel_timeout_s); - request.set_is_high_priority(_parent->_is_high_priority); - request.set_sender_ip(BackendOptions::get_localhost()); - request.set_is_vectorized(_is_vectorized); - - _open_closure = new RefCountClosure(); - _open_closure->ref(); - - // This ref is for RPC's reference - _open_closure->ref(); - _open_closure->cntl.set_timeout_ms(config::tablet_writer_open_rpc_timeout_sec * 1000); - if (config::tablet_writer_ignore_eovercrowded) { - _open_closure->cntl.ignore_eovercrowded(); - } - _stub->tablet_writer_open(&_open_closure->cntl, &request, &_open_closure->result, - _open_closure); - request.release_id(); - request.release_schema(); -} - -void NodeChannel::_cancel_with_msg(const std::string& msg) { - LOG(WARNING) << "cancel node channel " << channel_info() << ", error message: " << msg; - { - std::lock_guard l(_cancel_msg_lock); - if (_cancel_msg == "") { - _cancel_msg = msg; - } - } - _cancelled = true; -} - -Status NodeChannel::open_wait() { - _open_closure->join(); - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - if (_open_closure->cntl.Failed()) { - if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( - _stub, _node_info.host, _node_info.brpc_port)) { - 
ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( - _open_closure->cntl.remote_side()); - } - std::stringstream ss; - ss << "failed to open tablet writer, error=" << berror(_open_closure->cntl.ErrorCode()) - << ", error_text=" << _open_closure->cntl.ErrorText(); - _cancelled = true; - LOG(WARNING) << ss.str() << " " << channel_info(); - return Status::InternalError("failed to open tablet writer, error={}, error_text={}", - berror(_open_closure->cntl.ErrorCode()), - _open_closure->cntl.ErrorText()); - } - Status status(_open_closure->result.status()); - if (_open_closure->unref()) { - delete _open_closure; - } - _open_closure = nullptr; - - if (!status.ok()) { - _cancelled = true; - return status; - } - - if (!_is_vectorized) { - // add batch closure - _add_batch_closure = ReusableClosure::create(); - _add_batch_closure->addFailedHandler([this](bool is_last_rpc) { - SCOPED_ATTACH_TASK(_state); - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call `mark_as_failed`, - // and notice that _index_channel may already be destroyed. - return; - } - // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), - fmt::format("rpc failed, error coed:{}, error text:{}", - _add_batch_closure->cntl.ErrorCode(), - _add_batch_closure->cntl.ErrorText()), - -1); - Status st = _index_channel->check_intolerable_failure(); - if (!st.ok()) { - _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.to_string())); - } else if (is_last_rpc) { - // if this is last rpc, will must set _add_batches_finished. otherwise, node channel's close_wait - // will be blocked. - _add_batches_finished = true; - VLOG_PROGRESS << "node channel " << channel_info() << "add_batches_finished"; - } - }); - - _add_batch_closure->addSuccessHandler([this](const PTabletWriterAddBatchResult& result, - bool is_last_rpc) { - SCOPED_ATTACH_TASK(_state); - std::lock_guard l(this->_closed_lock); - if (this->_is_closed) { - // if the node channel is closed, no need to call the following logic, - // and notice that _index_channel may already be destroyed. 
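// The guard pattern at work in these callbacks, as a minimal standalone
// sketch (types hypothetical): an RPC completion may race with
// close_wait()/cancel(), so the callback re-checks an _is_closed flag under
// the same mutex that the closing thread sets it under, and returns early
// instead of touching state (such as _index_channel) that may already be gone.
#include <mutex>

class ClosedGuardSketch {
public:
    void on_rpc_done() {
        std::lock_guard<std::mutex> l(_closed_lock);
        if (_is_closed) {
            return; // owner already tore the channel down; do nothing
        }
        // ... safe to update per-channel bookkeeping here ...
    }
    void close() {
        std::lock_guard<std::mutex> l(_closed_lock);
        _is_closed = true;
    }

private:
    std::mutex _closed_lock;
    bool _is_closed = false;
};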
- return; - } - Status status(result.status()); - if (status.ok()) { - // if has error tablet, handle them first - for (auto& error : result.tablet_errors()) { - _index_channel->mark_as_failed(this->node_id(), this->host(), - "tablet error: " + error.msg(), - error.tablet_id()); - } - - Status st = _index_channel->check_intolerable_failure(); - if (!st.ok()) { - _cancel_with_msg(st.to_string()); - } else if (is_last_rpc) { - for (auto& tablet : result.tablet_vec()) { - TTabletCommitInfo commit_info; - commit_info.tabletId = tablet.tablet_id(); - commit_info.backendId = _node_id; - _tablet_commit_infos.emplace_back(std::move(commit_info)); - if (tablet.has_received_rows()) { - _tablets_received_rows.emplace_back(tablet.tablet_id(), - tablet.received_rows()); - } - VLOG_CRITICAL - << "master replica commit info: tabletId=" << tablet.tablet_id() - << ", backendId=" << _node_id - << ", master node id: " << this->node_id() - << ", host: " << this->host() << ", txn_id=" << _parent->_txn_id; - } - - if (_parent->_write_single_replica) { - for (auto& tablet_slave_node_ids : result.success_slave_tablet_node_ids()) { - for (auto slave_node_id : - tablet_slave_node_ids.second.slave_node_ids()) { - TTabletCommitInfo commit_info; - commit_info.tabletId = tablet_slave_node_ids.first; - commit_info.backendId = slave_node_id; - _tablet_commit_infos.emplace_back(std::move(commit_info)); - VLOG_CRITICAL << "slave replica commit info: tabletId=" - << tablet_slave_node_ids.first - << ", backendId=" << slave_node_id - << ", master node id: " << this->node_id() - << ", host: " << this->host() - << ", txn_id=" << _parent->_txn_id; - } - } - } - _add_batches_finished = true; - VLOG_PROGRESS << "node channel " << channel_info() - << "add_batches_finished and handled " - << result.tablet_errors().size() << " tablets errors"; - } - } else { - _cancel_with_msg( - fmt::format("{}, add batch req success but status isn't ok, err: {}", - channel_info(), status.to_string())); - } - - if (result.has_execution_time_us()) { - _add_batch_counter.add_batch_execution_time_us += result.execution_time_us(); - _add_batch_counter.add_batch_wait_execution_time_us += - result.wait_execution_time_us(); - _add_batch_counter.add_batch_num++; - } - }); - } - return status; -} - -Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("add row failed. {}", _cancel_msg); - } else { - return std::move(st.prepend("already stopped, can't add row. cancelled/eos: ")); - } - } - - // We use OlapTableSink mem_tracker which has the same ancestor of _plan node, - // so in the ideal case, mem limit is a matter for _plan node. - // But there is still some unfinished things, we do mem limit here temporarily. - // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. - // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). 
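// The throttling loop that follows is a simple producer-side backpressure
// scheme; a standalone sketch (names and values illustrative, not the sink's
// real defaults): block the row-appending thread while the RPC sender still
// has too many serialized bytes queued, rather than queueing without bound.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

void wait_for_queue_room(const std::atomic<bool>& cancelled,
                         const std::atomic<int>& pending_batches,
                         const std::atomic<int64_t>& pending_bytes,
                         int64_t max_pending_bytes) {
    while (!cancelled && pending_batches > 0 && pending_bytes > max_pending_bytes) {
        // Sleep briefly instead of spinning; the sender thread drains the queue.
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
}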
- while (!_cancelled && _pending_batches_num > 0 && - _pending_batches_bytes > _max_pending_batches_bytes) { - SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - auto row_no = _cur_batch->add_row(); - if (row_no == RowBatch::INVALID_ROW_INDEX) { - { - SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - //To simplify the add_row logic, postpone adding batch into req until the time of sending req - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - VLOG_DEBUG << "OlapTableSink:" << _parent << " NodeChannel:" << this - << " pending_batches_bytes:" << _pending_batches_bytes - << " jobid:" << std::to_string(_state->load_job_id()) - << " tabletid:" << tablet_id << " loadinfo:" << _load_info; - } - - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - _cur_add_batch_request.clear_tablet_ids(); - - row_no = _cur_batch->add_row(); - } - DCHECK_NE(row_no, RowBatch::INVALID_ROW_INDEX); - auto tuple = input_tuple->deep_copy(*_tuple_desc, _cur_batch->tuple_data_pool()); - - _cur_batch->get_row(row_no)->set_tuple(0, tuple); - _cur_batch->commit_last_row(); - _cur_add_batch_request.add_tablet_ids(tablet_id); - return Status::OK(); -} - -// Used for vectorized engine. -// TODO(cmy): deprecated, need refactor -Status NodeChannel::add_row(const BlockRow& block_row, int64_t tablet_id) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // If add_row() when _eos_is_produced==true, there must be sth wrong, we can only mark this channel as failed. - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("add row failed. " + _cancel_msg); - } else { - return std::move(st.prepend("already stopped, can't add row. 
cancelled/eos: ")); - } - } - - while (!_cancelled && _pending_batches_num > 0 && - _pending_batches_bytes > _max_pending_batches_bytes) { - SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - constexpr size_t BATCH_SIZE_FOR_SEND = 2 * 1024 * 1024; //2M - auto row_no = _cur_batch->add_row(); - if (row_no == RowBatch::INVALID_ROW_INDEX || - _cur_batch->tuple_data_pool()->total_allocated_bytes() > BATCH_SIZE_FOR_SEND) { - { - SCOPED_ATOMIC_TIMER(&_queue_push_lock_ns); - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - //To simplify the add_row logic, postpone adding batch into req until the time of sending req - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - } - - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size)); - _cur_add_batch_request.clear_tablet_ids(); - - row_no = _cur_batch->add_row(); - } - DCHECK_NE(row_no, RowBatch::INVALID_ROW_INDEX); - - _cur_batch->get_row(row_no)->set_tuple( - 0, block_row.first->deep_copy_tuple(*_tuple_desc, _cur_batch->tuple_data_pool(), - block_row.second, 0, true)); - _cur_batch->commit_last_row(); - _cur_add_batch_request.add_tablet_ids(tablet_id); - return Status::OK(); -} - -void NodeChannel::mark_close() { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - auto st = none_of({_cancelled, _eos_is_produced}); - if (!st.ok()) { - return; - } - - _cur_add_batch_request.set_eos(true); - { - debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; - std::lock_guard l(_pending_batches_lock); - _pending_batches_bytes += _cur_batch->tuple_data_pool()->total_reserved_bytes(); - _pending_batches.emplace(std::move(_cur_batch), _cur_add_batch_request); - _pending_batches_num++; - DCHECK(_pending_batches.back().second.eos()); - _close_time_ms = UnixMillis(); - LOG(INFO) << channel_info() - << " mark closed, left pending batch size: " << _pending_batches.size() - << " left pending batch size: " << _pending_batches_bytes; - } - - _eos_is_produced = true; - return; -} - -void NodeChannel::_close_check() { - std::lock_guard lg(_pending_batches_lock); - CHECK(_pending_batches.empty()) << name(); - CHECK(_cur_batch == nullptr) << name(); -} -Status NodeChannel::close_wait(RuntimeState* state) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // set _is_closed to true finally - Defer set_closed {[&]() { - std::lock_guard l(_closed_lock); - _is_closed = true; - }}; - - auto st = none_of({_cancelled, !_eos_is_produced}); - if (!st.ok()) { - if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); - return Status::InternalError("wait close failed. {}", _cancel_msg); - } else { - return std::move( - st.prepend("already stopped, skip waiting for close. 
cancelled/!eos: ")); - } - } - - // waiting for finished, it may take a long time, so we couldn't set a timeout - while (!_add_batches_finished && !_cancelled) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - _close_time_ms = UnixMillis() - _close_time_ms; - - if (_add_batches_finished) { - _close_check(); - state->tablet_commit_infos().insert(state->tablet_commit_infos().end(), - std::make_move_iterator(_tablet_commit_infos.begin()), - std::make_move_iterator(_tablet_commit_infos.end())); - - _index_channel->set_error_tablet_in_state(state); - _index_channel->set_tablets_received_rows(_tablets_received_rows, _node_id); - return Status::OK(); - } - - std::stringstream ss; - ss << "close wait failed coz rpc error"; - { - std::lock_guard l(_cancel_msg_lock); - if (_cancel_msg != "") { - ss << ". " << _cancel_msg; - } - } - return Status::InternalError(ss.str()); -} - -void NodeChannel::cancel(const std::string& cancel_msg) { - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); - // set _is_closed to true finally - Defer set_closed {[&]() { - std::lock_guard l(_closed_lock); - _is_closed = true; - }}; - // we don't need to wait last rpc finished, cause closure's release/reset will join. - // But do we need brpc::StartCancel(call_id)? - _cancel_with_msg(cancel_msg); - - PTabletWriterCancelRequest request; - request.set_allocated_id(&_parent->_load_id); - request.set_index_id(_index_channel->_index_id); - request.set_sender_id(_parent->_sender_id); - - auto closure = new RefCountClosure(); - - closure->ref(); - int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; - if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { - remain_ms = config::min_load_rpc_timeout_ms; - } - closure->cntl.set_timeout_ms(remain_ms); - if (config::tablet_writer_ignore_eovercrowded) { - closure->cntl.ignore_eovercrowded(); - } - _stub->tablet_writer_cancel(&closure->cntl, &request, &closure->result, closure); - request.release_id(); -} - -int NodeChannel::try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token) { - auto st = none_of({_cancelled, _send_finished}); - if (!st.ok()) { - return 0; - } - - if (!_add_batch_closure->try_set_in_flight()) { - return _send_finished ? 0 : 1; - } - - // We are sure that try_send_batch is not running - if (_pending_batches_num > 0) { - auto s = thread_pool_token->submit_func( - std::bind(&NodeChannel::try_send_batch, this, state)); - if (!s.ok()) { - _cancel_with_msg("submit send_batch task to send_batch_thread_pool failed"); - // clear in flight - _add_batch_closure->clear_in_flight(); - } - // in_flight is cleared in closure::Run - } else { - // clear in flight - _add_batch_closure->clear_in_flight(); - } - return _send_finished ? 
0 : 1; -} - -void NodeChannel::try_send_batch(RuntimeState* state) { - SCOPED_ATOMIC_TIMER(&_actual_consume_ns); - SCOPED_ATTACH_TASK(state); - SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker); - AddBatchReq send_batch; - { - debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; - std::lock_guard l(_pending_batches_lock); - DCHECK(!_pending_batches.empty()); - send_batch = std::move(_pending_batches.front()); - _pending_batches.pop(); - _pending_batches_num--; - _pending_batches_bytes -= send_batch.first->tuple_data_pool()->total_reserved_bytes(); - } - - auto row_batch = std::move(send_batch.first); - auto request = std::move(send_batch.second); // doesn't need to be saved in heap - - // tablet_ids has already set when add row - request.set_packet_seq(_next_packet_seq); - if (row_batch->num_rows() > 0) { - SCOPED_ATOMIC_TIMER(&_serialize_batch_ns); - size_t uncompressed_bytes = 0, compressed_bytes = 0; - Status st = row_batch->serialize(request.mutable_row_batch(), &uncompressed_bytes, - &compressed_bytes, _parent->_transfer_large_data_by_brpc); - if (!st.ok()) { - cancel(fmt::format("{}, err: {}", channel_info(), st.to_string())); - _add_batch_closure->clear_in_flight(); - return; - } - if (compressed_bytes >= double(config::brpc_max_body_size) * 0.95f) { - LOG(WARNING) << "send batch too large, this rpc may failed. send size: " - << compressed_bytes << ", threshold: " << config::brpc_max_body_size - << ", " << channel_info(); - } - } - - int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; - if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { - if (remain_ms <= 0 && !request.eos()) { - cancel(fmt::format("{}, err: timeout", channel_info())); - _add_batch_closure->clear_in_flight(); - return; - } else { - remain_ms = config::min_load_rpc_timeout_ms; - } - } - - // After calling reset(), make sure that the rpc will be called finally. - // Otherwise, when calling _add_batch_closure->join(), it will be blocked forever. - // and _add_batch_closure->join() will be called in ~NodeChannel(). 
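// A minimal sketch (types hypothetical) of the single-flight gate used by
// try_send_and_fetch_status() above and re-armed by the reset() call below:
// each NodeChannel allows at most one add-batch RPC in flight, won by a
// compare-and-swap, and cleared again when the closure's callback runs.
#include <atomic>

class SingleFlightGateSketch {
public:
    // Returns true if the caller won the right to issue the next RPC.
    bool try_set_in_flight() {
        bool expected = false;
        return _in_flight.compare_exchange_strong(expected, true);
    }
    // Called from the RPC callback (or on submit failure) to allow the next send.
    void clear_in_flight() { _in_flight.store(false); }

private:
    std::atomic<bool> _in_flight {false};
};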
- _add_batch_closure->reset(); - _add_batch_closure->cntl.set_timeout_ms(remain_ms); - if (config::tablet_writer_ignore_eovercrowded) { - _add_batch_closure->cntl.ignore_eovercrowded(); - } - - if (request.eos()) { - for (auto pid : _parent->_partition_ids) { - request.add_partition_ids(pid); - } - - request.set_write_single_replica(false); - if (_parent->_write_single_replica) { - request.set_write_single_replica(true); - for (std::unordered_map>::iterator iter = - _slave_tablet_nodes.begin(); - iter != _slave_tablet_nodes.end(); iter++) { - PSlaveTabletNodes slave_tablet_nodes; - for (auto node_id : iter->second) { - auto node = _parent->_nodes_info->find_node(node_id); - if (node == nullptr) { - return; - } - PNodeInfo* pnode = slave_tablet_nodes.add_slave_nodes(); - pnode->set_id(node->id); - pnode->set_option(node->option); - pnode->set_host(node->host); - pnode->set_async_internal_port(config::single_replica_load_brpc_port); - } - request.mutable_slave_tablet_nodes()->insert({iter->first, slave_tablet_nodes}); - } - } - - // eos request must be the last request - _add_batch_closure->end_mark(); - _send_finished = true; - CHECK(_pending_batches_num == 0) << _pending_batches_num; - } - - if (_parent->_transfer_large_data_by_brpc && request.has_row_batch() && - request.row_batch().has_tuple_data() && request.ByteSizeLong() > MIN_HTTP_BRPC_SIZE) { - Status st = request_embed_attachment_contain_tuple< - PTabletWriterAddBatchRequest, ReusableClosure>( - &request, _add_batch_closure); - if (!st.ok()) { - cancel(fmt::format("{}, err: {}", channel_info(), st.to_string())); - _add_batch_closure->clear_in_flight(); - return; - } - std::string brpc_url = fmt::format("http://{}:{}", _node_info.host, _node_info.brpc_port); - std::shared_ptr _brpc_http_stub = - _state->exec_env()->brpc_internal_client_cache()->get_new_client_no_cache(brpc_url, - "http"); - _add_batch_closure->cntl.http_request().uri() = - brpc_url + "/PInternalServiceImpl/tablet_writer_add_batch_by_http"; - _add_batch_closure->cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - _add_batch_closure->cntl.http_request().set_content_type("application/json"); - { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - _brpc_http_stub->tablet_writer_add_batch_by_http(&_add_batch_closure->cntl, NULL, - &_add_batch_closure->result, - _add_batch_closure); - } - } else { - _add_batch_closure->cntl.http_request().Clear(); - { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - _stub->tablet_writer_add_batch(&_add_batch_closure->cntl, &request, - &_add_batch_closure->result, _add_batch_closure); - } - } - _next_packet_seq++; -} - -Status NodeChannel::none_of(std::initializer_list vars) { - bool none = std::none_of(vars.begin(), vars.end(), [](bool var) { return var; }); - Status st = Status::OK(); - if (!none) { - std::string vars_str; - std::for_each(vars.begin(), vars.end(), - [&vars_str](bool var) -> void { vars_str += (var ? 
"1/" : "0/"); }); - if (!vars_str.empty()) { - vars_str.pop_back(); // 0/1/0/ -> 0/1/0 - } - st = Status::InternalError(vars_str); - } - - return st; -} - -void NodeChannel::clear_all_batches() { - std::lock_guard lg(_pending_batches_lock); - std::queue empty; - std::swap(_pending_batches, empty); - _cur_batch.reset(); -} - -Status IndexChannel::init(RuntimeState* state, const std::vector& tablets) { - SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get()); - for (auto& tablet : tablets) { - auto location = _parent->_location->find_tablet(tablet.tablet_id); - if (location == nullptr) { - LOG(WARNING) << "unknown tablet, tablet_id=" << tablet.tablet_id; - return Status::InternalError("unknown tablet"); - } - std::vector> channels; - for (auto& node_id : location->node_ids) { - std::shared_ptr channel; - auto it = _node_channels.find(node_id); - if (it == _node_channels.end()) { - // NodeChannel is not added to the _parent->_pool. - // Because the deconstruction of NodeChannel may take a long time to wait rpc finish. - // but the ObjectPool will hold a spin lock to delete objects. - if (!_is_vectorized) { - channel = std::make_shared(_parent, this, node_id); - } else { - channel = std::make_shared(_parent, this, node_id); - } - _node_channels.emplace(node_id, channel); - } else { - channel = it->second; - } - channel->add_tablet(tablet); - if (_parent->_write_single_replica) { - auto slave_location = _parent->_slave_location->find_tablet(tablet.tablet_id); - if (slave_location != nullptr) { - channel->add_slave_tablet_nodes(tablet.tablet_id, slave_location->node_ids); - } - } - channels.push_back(channel); - _tablets_by_channel[node_id].insert(tablet.tablet_id); - } - _channels_by_tablet.emplace(tablet.tablet_id, std::move(channels)); - } - for (auto& it : _node_channels) { - RETURN_IF_ERROR(it.second->init(state)); - } - return Status::OK(); -} - -void IndexChannel::mark_as_failed(int64_t node_id, const std::string& host, const std::string& err, - int64_t tablet_id) { - VLOG_PROGRESS << "mark node_id:" << node_id << " tablet_id: " << tablet_id - << " as failed, err: " << err; - const auto& it = _tablets_by_channel.find(node_id); - if (it == _tablets_by_channel.end()) { - return; - } - - { - std::lock_guard l(_fail_lock); - if (tablet_id == -1) { - for (const auto the_tablet_id : it->second) { - _failed_channels[the_tablet_id].insert(node_id); - _failed_channels_msgs.emplace(the_tablet_id, err + ", host: " + host); - if (_failed_channels[the_tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) { - _intolerable_failure_status = - Status::InternalError(_failed_channels_msgs[the_tablet_id]); - } - } - } else { - _failed_channels[tablet_id].insert(node_id); - _failed_channels_msgs.emplace(tablet_id, err + ", host: " + host); - if (_failed_channels[tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) { - _intolerable_failure_status = - Status::InternalError(_failed_channels_msgs[tablet_id]); - } - } - } -} - -Status IndexChannel::check_intolerable_failure() { - std::lock_guard l(_fail_lock); - return _intolerable_failure_status; -} - -void IndexChannel::set_error_tablet_in_state(RuntimeState* state) { - std::vector& error_tablet_infos = state->error_tablet_infos(); - - std::lock_guard l(_fail_lock); - for (const auto& it : _failed_channels_msgs) { - TErrorTabletInfo error_info; - error_info.__set_tabletId(it.first); - error_info.__set_msg(it.second); - error_tablet_infos.emplace_back(error_info); - } -} - -void IndexChannel::set_tablets_received_rows( - const std::vector>& 
tablets_received_rows, int64_t node_id) { - for (const auto& [tablet_id, rows_num] : tablets_received_rows) { - _tablets_received_rows[tablet_id].emplace_back(node_id, rows_num); - } -} - -Status IndexChannel::check_tablet_received_rows_consistency() { - for (auto& tablet : _tablets_received_rows) { - for (size_t i = 0; i < tablet.second.size(); i++) { - VLOG_NOTICE << "check_tablet_received_rows_consistency, load_id: " << _parent->_load_id - << ", txn_id: " << std::to_string(_parent->_txn_id) - << ", tablet_id: " << tablet.first - << ", node_id: " << tablet.second[i].first - << ", rows_num: " << tablet.second[i].second; - if (i == 0) { - continue; - } - if (tablet.second[i].second != tablet.second[0].second) { - LOG(WARNING) << "rows num doest't match, load_id: " << _parent->_load_id - << ", txn_id: " << std::to_string(_parent->_txn_id) - << ", tablt_id: " << tablet.first - << ", node_id: " << tablet.second[i].first - << ", rows_num: " << tablet.second[i].second - << ", node_id: " << tablet.second[0].first - << ", rows_num: " << tablet.second[0].second; - return Status::InternalError("rows num written by multi replicas doest't match"); - } - } - } - return Status::OK(); -} - -OlapTableSink::OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& texprs, Status* status) - : _pool(pool), - _input_row_desc(row_desc), - _filter_bitmap(1024), - _stop_background_threads_latch(1) { - if (!_is_vectorized) { - if (!texprs.empty()) { - *status = Expr::create_expr_trees(_pool, texprs, &_output_expr_ctxs); - } - _name = "OlapTableSink"; - } else { - *status = Status::OK(); - } - _transfer_large_data_by_brpc = config::transfer_large_data_by_brpc; -} - -OlapTableSink::~OlapTableSink() { - // We clear NodeChannels' batches here, cuz NodeChannels' batches destruction will use - // OlapTableSink::_mem_tracker and its parents. - // But their destructions are after OlapTableSink's. 
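// The destructor body that follows works around a destruction-order hazard;
// a minimal sketch (types hypothetical) of the same idea: queued batches
// report freed bytes to a tracker they do not own, so the owner must drain
// them explicitly while that tracker is still alive instead of relying on
// implicit member destruction order.
#include <deque>
#include <memory>

struct TrackerSketch {
    long consumed = 0;
    void release(long bytes) { consumed -= bytes; }
};

struct BatchSketch {
    TrackerSketch* tracker; // not owned; must stay valid while the batch lives
    long bytes;
    ~BatchSketch() { tracker->release(bytes); }
};

struct SinkSketch {
    TrackerSketch tracker;
    std::shared_ptr<std::deque<BatchSketch>> channel_batches =
            std::make_shared<std::deque<BatchSketch>>();
    ~SinkSketch() {
        // Channels (the other owners of channel_batches) may outlive this
        // sink, but every queued batch points at `tracker`, so drain the
        // queue here while the tracker is still alive.
        channel_batches->clear();
    }
};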
- for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->clear_all_batches(); }); - } -} - -Status OlapTableSink::init(const TDataSink& t_sink) { - DCHECK(t_sink.__isset.olap_table_sink); - auto& table_sink = t_sink.olap_table_sink; - _load_id.set_hi(table_sink.load_id.hi); - _load_id.set_lo(table_sink.load_id.lo); - _txn_id = table_sink.txn_id; - _num_replicas = table_sink.num_replicas; - _tuple_desc_id = table_sink.tuple_id; - _schema.reset(new OlapTableSchemaParam()); - RETURN_IF_ERROR(_schema->init(table_sink.schema)); - _partition = _pool->add(new OlapTablePartitionParam(_schema, table_sink.partition)); - RETURN_IF_ERROR(_partition->init()); - _location = _pool->add(new OlapTableLocationParam(table_sink.location)); - _nodes_info = _pool->add(new DorisNodesInfo(table_sink.nodes_info)); - if (table_sink.__isset.write_single_replica && table_sink.write_single_replica) { - _write_single_replica = true; - _slave_location = _pool->add(new OlapTableLocationParam(table_sink.slave_location)); - if (!config::enable_single_replica_load) { - return Status::InternalError("single replica load is disabled on BE."); - } - } - - if (table_sink.__isset.load_channel_timeout_s) { - _load_channel_timeout_s = table_sink.load_channel_timeout_s; - } else { - _load_channel_timeout_s = config::streaming_load_rpc_max_alive_time_sec; - } - if (table_sink.__isset.send_batch_parallelism && table_sink.send_batch_parallelism > 1) { - _send_batch_parallelism = table_sink.send_batch_parallelism; - } - // if distributed column list is empty, we can ensure that tablet is with random distribution info - // and if load_to_single_tablet is set and set to true, we should find only one tablet in one partition - // for the whole olap table sink - if (table_sink.partition.distributed_columns.empty()) { - if (table_sink.__isset.load_to_single_tablet && table_sink.load_to_single_tablet) { - findTabletMode = FindTabletMode::FIND_TABLET_EVERY_SINK; - } else { - findTabletMode = FindTabletMode::FIND_TABLET_EVERY_BATCH; - } - } - return Status::OK(); -} - -Status OlapTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - - _sender_id = state->per_fragment_instance_idx(); - _num_senders = state->num_per_fragment_instances(); - _is_high_priority = (state->query_options().query_timeout <= - config::load_task_high_priority_threshold_second); - - // profile must add to state's object pool - _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); - _mem_tracker = - std::make_shared("OlapTableSink:" + std::to_string(state->load_job_id())); - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - - if (!_is_vectorized) { - // Prepare the exprs to run. 
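// Sketch of the routing policy chosen in init() above. With hash distribution the
// tablet index is computed per row; with random distribution it is cached per
// batch, or once per sink when load_to_single_tablet is set. The enum mirrors the
// removed code, but this function is an illustration, not the original logic.
enum class FindTabletMode { EVERY_ROW, EVERY_BATCH, EVERY_SINK };

FindTabletMode pick_find_tablet_mode(bool distributed_columns_empty,
                                     bool load_to_single_tablet) {
    if (!distributed_columns_empty) {
        return FindTabletMode::EVERY_ROW; // hash distribution: route each row
    }
    return load_to_single_tablet ? FindTabletMode::EVERY_SINK   // one tablet per sink
                                 : FindTabletMode::EVERY_BATCH; // one tablet per batch
}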
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _input_row_desc)); - } - - // get table's tuple descriptor - _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); - if (_output_tuple_desc == nullptr) { - LOG(WARNING) << "unknown destination tuple descriptor, id=" << _tuple_desc_id; - return Status::InternalError("unknown destination tuple descriptor"); - } - - _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - - if (!_is_vectorized) { - if (!_output_expr_ctxs.empty()) { - if (_output_expr_ctxs.size() != _output_tuple_desc->slots().size()) { - LOG(WARNING) << "number of exprs is not same with slots, num_exprs=" - << _output_expr_ctxs.size() - << ", num_slots=" << _output_tuple_desc->slots().size(); - return Status::InternalError("number of exprs is not same with slots"); - } - for (int i = 0; i < _output_expr_ctxs.size(); ++i) { - if (!is_type_compatible(_output_expr_ctxs[i]->root()->type().type, - _output_tuple_desc->slots()[i]->type().type)) { - LOG(WARNING) << "type of exprs is not match slot's, expr_type=" - << _output_expr_ctxs[i]->root()->type().type - << ", slot_type=" << _output_tuple_desc->slots()[i]->type().type - << ", slot_name=" << _output_tuple_desc->slots()[i]->col_name(); - return Status::InternalError("expr's type is not same with slot's"); - } - } - } - - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size())); - } - - _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); - _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); - // check if need validate batch - for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { - auto slot = _output_tuple_desc->slots()[i]; - switch (slot->type().type) { - // For DECIMAL32,DECIMAL64,DECIMAL128, we have done precision and scale conversion so just - // skip data validation here. 
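// Standalone restatement of the prepare()-time schema check above: the number of
// output exprs must equal the number of destination slots, and each pair must be
// type-compatible. TypeId and compatible() are hypothetical stand-ins for the
// engine's PrimitiveType and is_type_compatible().
#include <cstddef>
#include <string>
#include <vector>

enum class TypeId { INT, BIGINT, VARCHAR, DECIMALV2 };

static bool compatible(TypeId expr, TypeId slot) {
    return expr == slot; // the real check also accepts implicitly castable pairs
}

std::string check_output_schema(const std::vector<TypeId>& expr_types,
                                const std::vector<TypeId>& slot_types) {
    if (expr_types.size() != slot_types.size()) {
        return "number of exprs is not the same as the number of slots";
    }
    for (size_t i = 0; i < expr_types.size(); ++i) {
        if (!compatible(expr_types[i], slot_types[i])) {
            return "expr type does not match slot type at column " + std::to_string(i);
        }
    }
    return {}; // empty string means OK
}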
- case TYPE_DECIMALV2: - _max_decimalv2_val[i].to_max_decimal(slot->type().precision, slot->type().scale); - _min_decimalv2_val[i].to_min_decimal(slot->type().precision, slot->type().scale); - _need_validate_data = true; - break; - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_DATE: - case TYPE_DATETIME: - case TYPE_DATEV2: - case TYPE_DATETIMEV2: - case TYPE_HLL: - case TYPE_OBJECT: - case TYPE_STRING: - case TYPE_ARRAY: - _need_validate_data = true; - break; - default: - break; - } - } - - // add all counter - _input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT); - _output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT); - _filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT); - _send_data_timer = ADD_TIMER(_profile, "SendDataTime"); - _wait_mem_limit_timer = ADD_CHILD_TIMER(_profile, "WaitMemLimitTime", "SendDataTime"); - _convert_batch_timer = ADD_TIMER(_profile, "ConvertBatchTime"); - _validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime"); - _open_timer = ADD_TIMER(_profile, "OpenTime"); - _close_timer = ADD_TIMER(_profile, "CloseWaitTime"); - _non_blocking_send_timer = ADD_TIMER(_profile, "NonBlockingSendTime"); - _non_blocking_send_work_timer = - ADD_CHILD_TIMER(_profile, "NonBlockingSendWorkTime", "NonBlockingSendTime"); - _serialize_batch_timer = - ADD_CHILD_TIMER(_profile, "SerializeBatchTime", "NonBlockingSendWorkTime"); - _total_add_batch_exec_timer = ADD_TIMER(_profile, "TotalAddBatchExecTime"); - _max_add_batch_exec_timer = ADD_TIMER(_profile, "MaxAddBatchExecTime"); - _add_batch_number = ADD_COUNTER(_profile, "NumberBatchAdded", TUnit::UNIT); - _num_node_channels = ADD_COUNTER(_profile, "NumberNodeChannels", TUnit::UNIT); - _load_mem_limit = state->get_load_mem_limit(); - - // open all channels - bool use_vec = _is_vectorized && state->be_exec_version() > 0; - const auto& partitions = _partition->get_partitions(); - for (int i = 0; i < _schema->indexes().size(); ++i) { - // collect all tablets belong to this rollup - std::vector tablets; - auto index = _schema->indexes()[i]; - for (const auto& part : partitions) { - for (const auto& tablet : part->indexes[i].tablets) { - TTabletWithPartition tablet_with_partition; - tablet_with_partition.partition_id = part->id; - tablet_with_partition.tablet_id = tablet; - tablets.emplace_back(std::move(tablet_with_partition)); - } - } - if (UNLIKELY(tablets.empty())) { - LOG(WARNING) << "load job:" << state->load_job_id() << " index: " << index->index_id - << " would open 0 tablet"; - } - _channels.emplace_back(new IndexChannel(this, index->index_id, use_vec)); - RETURN_IF_ERROR(_channels.back()->init(state, tablets)); - } - - return Status::OK(); -} - -Status OlapTableSink::open(RuntimeState* state) { - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_TIMER(_open_timer); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - if (!_is_vectorized) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - } - - for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->open(); }); - } - - for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&index_channel]( - const std::shared_ptr& ch) { - auto st = ch->open_wait(); - if (!st.ok()) { - // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. - // This phase will not fail due to a single tablet. 
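// Sketch of the open()/open_wait() split used above: issue every open RPC first,
// then collect results, marking all tablets on a failed node (the -1 sentinel)
// rather than aborting outright. Channel here is a hypothetical stand-in.
#include <cstddef>
#include <functional>
#include <future>
#include <vector>

struct Channel {
    std::future<bool> pending;
    void open() { pending = std::async(std::launch::async, [] { return true; }); }
    bool open_wait() { return pending.get(); }
};

void open_all(std::vector<Channel>& channels,
              const std::function<void(size_t /*node*/)>& mark_node_failed) {
    for (auto& ch : channels) {
        ch.open(); // phase 1: all open RPCs in flight in parallel
    }
    for (size_t i = 0; i < channels.size(); ++i) {
        if (!channels[i].open_wait()) {
            mark_node_failed(i); // phase 2: record, let the quorum check decide later
        }
    }
}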
- // Therefore, if the open() phase fails, all tablets corresponding to the node need to be marked as failed. - index_channel->mark_as_failed( - ch->node_id(), ch->host(), - fmt::format("{}, open failed, err: {}", ch->channel_info(), st.to_string()), - -1); - } - }); - - RETURN_IF_ERROR(index_channel->check_intolerable_failure()); - } - int32_t send_batch_parallelism = - MIN(_send_batch_parallelism, config::max_send_batch_parallelism_per_job); - _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( - ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); - RETURN_IF_ERROR(Thread::create( - "OlapTableSink", "send_batch_process", - [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); - - return Status::OK(); -} - -Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { - SCOPED_TIMER(_profile->total_time_counter()); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); - // update incrementally so that FE can get the progress. - // the real 'num_rows_load_total' will be set when sink being closed. - int64_t num_rows = input_batch->num_rows(); - int64_t num_bytes = input_batch->total_byte_size(); - _number_input_rows += num_rows; - state->update_num_rows_load_total(num_rows); - state->update_num_bytes_load_total(num_bytes); - DorisMetrics::instance()->load_rows->increment(num_rows); - DorisMetrics::instance()->load_bytes->increment(num_bytes); - RowBatch* batch = input_batch; - if (!_output_expr_ctxs.empty()) { - SCOPED_RAW_TIMER(&_convert_batch_ns); - _output_batch->reset(); - RETURN_IF_ERROR(_convert_batch(state, input_batch, _output_batch.get())); - batch = _output_batch.get(); - } - - int filtered_rows = 0; - if (_need_validate_data) { - SCOPED_RAW_TIMER(&_validate_data_ns); - _filter_bitmap.Reset(batch->num_rows()); - bool stop_processing = false; - RETURN_IF_ERROR( - _validate_data(state, batch, &_filter_bitmap, &filtered_rows, &stop_processing)); - _number_filtered_rows += filtered_rows; - if (stop_processing) { - // should be returned after updating "_number_filtered_rows", to make sure that load job can be cancelled - // because of "data unqualified" - return Status::EndOfFile("Encountered unqualified data, stop processing"); - } - } - - SCOPED_RAW_TIMER(&_send_data_ns); - bool stop_processing = false; - if (findTabletMode == FindTabletMode::FIND_TABLET_EVERY_BATCH) { - _partition_to_tablet_map.clear(); - } - for (int i = 0; i < batch->num_rows(); ++i) { - Tuple* tuple = batch->get_row(i)->get_tuple(0); - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const OlapTablePartition* partition = nullptr; - if (!_partition->find_partition(tuple, &partition)) { - RETURN_IF_ERROR(state->append_error_msg_to_file( - []() -> std::string { return ""; }, - [&]() -> std::string { - fmt::memory_buffer buf; - fmt::format_to(buf, "no partition for this tuple. 
tuple={}", - Tuple::to_string(tuple, *_output_tuple_desc)); - return fmt::to_string(buf); - }, - &stop_processing)); - _number_filtered_rows++; - if (stop_processing) { - return Status::EndOfFile("Encountered unqualified data, stop processing"); - } - continue; - } - uint32_t tablet_index = 0; - if (findTabletMode != FindTabletMode::FIND_TABLET_EVERY_ROW) { - if (_partition_to_tablet_map.find(partition->id) == _partition_to_tablet_map.end()) { - tablet_index = _partition->find_tablet(tuple, *partition); - _partition_to_tablet_map.emplace(partition->id, tablet_index); - } else { - tablet_index = _partition_to_tablet_map[partition->id]; - } - } else { - tablet_index = _partition->find_tablet(tuple, *partition); - } - _partition_ids.emplace(partition->id); - for (int j = 0; j < partition->indexes.size(); ++j) { - int64_t tablet_id = partition->indexes[j].tablets[tablet_index]; - _channels[j]->add_row(tuple, tablet_id); - _number_output_rows++; - } - } - - // check intolerable failure - for (const auto& index_channel : _channels) { - RETURN_IF_ERROR(index_channel->check_intolerable_failure()); - } - return Status::OK(); -} - -Status OlapTableSink::close(RuntimeState* state, Status close_status) { - if (_closed) { - /// The close method may be called twice. - /// In the open_internal() method of plan_fragment_executor, close is called once. - /// If an error occurs in this call, it will be called again in fragment_mgr. - /// So here we use a flag to prevent repeated close operations. - return _close_status; - } - Status status = close_status; - if (status.ok()) { - // only if status is ok can we call this _profile->total_time_counter(). - // if status is not ok, this sink may not be prepared, so that _profile is null - SCOPED_TIMER(_profile->total_time_counter()); - // BE id -> add_batch method counter - std::unordered_map node_add_batch_counter_map; - int64_t serialize_batch_ns = 0, mem_exceeded_block_ns = 0, queue_push_lock_ns = 0, - actual_consume_ns = 0, total_add_batch_exec_time_ns = 0, - max_add_batch_exec_time_ns = 0, total_add_batch_num = 0, num_node_channels = 0; - { - SCOPED_TIMER(_close_timer); - for (auto index_channel : _channels) { - index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->mark_close(); }); - num_node_channels += index_channel->num_node_channels(); - } - - for (auto index_channel : _channels) { - int64_t add_batch_exec_time = 0; - index_channel->for_each_node_channel( - [&index_channel, &state, &node_add_batch_counter_map, &serialize_batch_ns, - &mem_exceeded_block_ns, &queue_push_lock_ns, &actual_consume_ns, - &total_add_batch_exec_time_ns, &add_batch_exec_time, - &total_add_batch_num](const std::shared_ptr& ch) { - auto s = ch->close_wait(state); - if (!s.ok()) { - auto err_msg = s.to_string(); - index_channel->mark_as_failed(ch->node_id(), ch->host(), err_msg, - -1); - // cancel the node channel in best effort - ch->cancel(err_msg); - LOG(WARNING) << ch->channel_info() - << ", close channel failed, err: " << err_msg; - } - ch->time_report(&node_add_batch_counter_map, &serialize_batch_ns, - &mem_exceeded_block_ns, &queue_push_lock_ns, - &actual_consume_ns, &total_add_batch_exec_time_ns, - &add_batch_exec_time, &total_add_batch_num); - }); - - if (add_batch_exec_time > max_add_batch_exec_time_ns) { - max_add_batch_exec_time_ns = add_batch_exec_time; - } - - // check if index has intolerable failure - Status index_st = index_channel->check_intolerable_failure(); - if (!index_st.ok()) { - status = index_st; - } else if (Status st = 
index_channel->check_tablet_received_rows_consistency(); - !st.ok()) { - status = st; - } - } // end for index channels - } - // TODO need to be improved - LOG(INFO) << "total mem_exceeded_block_ns=" << mem_exceeded_block_ns - << ", total queue_push_lock_ns=" << queue_push_lock_ns - << ", total actual_consume_ns=" << actual_consume_ns - << ", load id=" << print_id(_load_id); - - COUNTER_SET(_input_rows_counter, _number_input_rows); - COUNTER_SET(_output_rows_counter, _number_output_rows); - COUNTER_SET(_filtered_rows_counter, _number_filtered_rows); - COUNTER_SET(_send_data_timer, _send_data_ns); - COUNTER_SET(_wait_mem_limit_timer, mem_exceeded_block_ns); - COUNTER_SET(_convert_batch_timer, _convert_batch_ns); - COUNTER_SET(_validate_data_timer, _validate_data_ns); - COUNTER_SET(_serialize_batch_timer, serialize_batch_ns); - COUNTER_SET(_non_blocking_send_work_timer, actual_consume_ns); - COUNTER_SET(_total_add_batch_exec_timer, total_add_batch_exec_time_ns); - COUNTER_SET(_max_add_batch_exec_timer, max_add_batch_exec_time_ns); - COUNTER_SET(_add_batch_number, total_add_batch_num); - COUNTER_SET(_num_node_channels, num_node_channels); - // _number_input_rows don't contain num_rows_load_filtered and num_rows_load_unselected in scan node - int64_t num_rows_load_total = _number_input_rows + state->num_rows_load_filtered() + - state->num_rows_load_unselected(); - state->set_num_rows_load_total(num_rows_load_total); - state->update_num_rows_load_filtered(_number_filtered_rows); - - // print log of add batch time of all node, for tracing load performance easily - std::stringstream ss; - ss << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id - << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; - for (auto const& pair : node_add_batch_counter_map) { - ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) - << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" - << pair.second.close_wait_time_ms << ")(" << pair.second.add_batch_num << ")} "; - } - LOG(INFO) << ss.str(); - } else { - for (auto channel : _channels) { - channel->for_each_node_channel([&status](const std::shared_ptr& ch) { - ch->cancel(status.to_string()); - }); - } - LOG(INFO) << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id - << ", canceled all node channels due to error: " << status; - } - - // Sender join() must put after node channels mark_close/cancel. - // But there is no specific sequence required between sender join() & close_wait(). - _stop_background_threads_latch.count_down(); - if (_sender_thread) { - _sender_thread->join(); - // We have to wait all task in _send_batch_thread_pool_token finished, - // because it is difficult to handle concurrent problem if we just - // shutdown it. 
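// Sketch of the shutdown ordering in close() above: channels are marked closed or
// cancelled first, only then is the background sender released and joined, and
// in-flight thread-pool tasks are drained last. Simplified, assumption-level types.
#include <atomic>
#include <chrono>
#include <thread>

struct SenderLoop {
    std::atomic<bool> stop {false};
    std::thread sender;

    void start() {
        sender = std::thread([this] {
            while (!stop.load()) {
                // poll channels here, i.e. try_send_and_fetch_status(), then sleep
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
            }
        });
    }

    void close() {
        // 1. mark_close()/cancel() on every node channel happens before this point
        stop.store(true); // 2. release the latch equivalent
        if (sender.joinable()) {
            sender.join(); // 3. join only after channels were marked closed
        }
        // 4. wait for outstanding thread-pool tasks before destroying the sink
    }
};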
-        _send_batch_thread_pool_token->wait();
-    }
-
-    Expr::close(_output_expr_ctxs, state);
-    _output_batch.reset();
-
-    _close_status = status;
-    DataSink::close(state, close_status);
-    return status;
-}
-
-Status OlapTableSink::_convert_batch(RuntimeState* state, RowBatch* input_batch,
-                                     RowBatch* output_batch) {
-    DCHECK_GE(output_batch->capacity(), input_batch->num_rows());
-    int commit_rows = 0;
-    bool stop_processing = false;
-    for (int i = 0; i < input_batch->num_rows(); ++i) {
-        auto src_row = input_batch->get_row(i);
-        Tuple* dst_tuple =
-                (Tuple*)output_batch->tuple_data_pool()->allocate(_output_tuple_desc->byte_size());
-        bool ignore_this_row = false;
-        for (int j = 0; j < _output_expr_ctxs.size(); ++j) {
-            auto src_val = _output_expr_ctxs[j]->get_value(src_row);
-            auto slot_desc = _output_tuple_desc->slots()[j];
-            // The following logic is similar to BaseScanner::fill_dest_tuple.
-            // TODO(kks): we should unify it.
-            if (src_val == nullptr) {
-                // Only when the expr return value is null do we check the error message.
-                std::string expr_error = _output_expr_ctxs[j]->get_error_msg();
-                if (!expr_error.empty()) {
-                    RETURN_IF_ERROR(state->append_error_msg_to_file(
-                            [&]() -> std::string { return slot_desc->col_name(); },
-                            [&]() -> std::string { return expr_error; }, &stop_processing));
-                    _number_filtered_rows++;
-                    ignore_this_row = true;
-                    // The ctx is reused, so we must clear the error state and message.
-                    _output_expr_ctxs[j]->clear_error_msg();
-                    break;
-                }
-                if (!slot_desc->is_nullable()) {
-                    RETURN_IF_ERROR(state->append_error_msg_to_file(
-                            []() -> std::string { return ""; },
-                            [&]() -> std::string {
-                                fmt::memory_buffer buf;
-                                fmt::format_to(
-                                        buf, "null value for not null column, column={}, type={}",
-                                        slot_desc->col_name(), slot_desc->type().debug_string());
-                                return fmt::to_string(buf);
-                            },
-                            &stop_processing));
-                    _number_filtered_rows++;
-                    ignore_this_row = true;
-                    break;
-                }
-                dst_tuple->set_null(slot_desc->null_indicator_offset());
-                continue;
-            }
-            if (slot_desc->is_nullable()) {
-                dst_tuple->set_not_null(slot_desc->null_indicator_offset());
-            }
-            void* slot = dst_tuple->get_slot(slot_desc->tuple_offset());
-            RawValue::write(src_val, slot, slot_desc->type(), _output_batch->tuple_data_pool());
-        } // end for output expr
-
-        if (!ignore_this_row) {
-            output_batch->get_row(commit_rows)->set_tuple(0, dst_tuple);
-            commit_rows++;
-        }
-
-        if (stop_processing) {
-            return Status::EndOfFile("Encountered unqualified data, stop processing");
-        }
-    }
-    output_batch->commit_rows(commit_rows);
-    return Status::OK();
-}
-
-bool OlapTableSink::_validate_cell(const TypeDescriptor& type, const std::string& col_name,
-                                   void* slot, size_t slot_index, fmt::memory_buffer& error_msg,
-                                   RowBatch* batch) {
-    switch (type.type) {
-    case TYPE_CHAR:
-    case TYPE_VARCHAR: {
-        // Fixed-length string
-        StringValue* str_val = (StringValue*)slot;
-        if (str_val->len > type.len) {
-            fmt::format_to(error_msg, "{}", "the length of input is longer than the schema allows. ");
-            fmt::format_to(error_msg, "column_name: {}; ", col_name);
-            fmt::format_to(error_msg, "input str: [{}] ", std::string(str_val->ptr, str_val->len));
-            fmt::format_to(error_msg, "schema length: {}; ", type.len);
-            fmt::format_to(error_msg, "actual length: {}; ", str_val->len);
-            return false;
-        }
-        // pad CHAR fields with 0
-        if (type.type == TYPE_CHAR && str_val->len < type.len) {
-            auto new_ptr = (char*)batch->tuple_data_pool()->allocate(type.len);
-            memcpy(new_ptr, str_val->ptr, str_val->len);
-            memset(new_ptr + str_val->len, 0, type.len - str_val->len);
-
-            str_val->ptr = new_ptr;
-            str_val->len = type.len;
-        }
-        break;
-    }
-    case TYPE_STRING: {
-        StringValue* str_val = (StringValue*)slot;
-        if (str_val->len > config::string_type_length_soft_limit_bytes) {
-            fmt::format_to(error_msg, "{}", "the length of input is longer than the schema allows. ");
-            fmt::format_to(error_msg, "column_name: {}; ", col_name);
-            fmt::format_to(error_msg, "first 128 bytes of input str: [{}] ",
-                           std::string(str_val->ptr, 128));
-            fmt::format_to(error_msg, "schema length: {}; ",
-                           config::string_type_length_soft_limit_bytes);
-            fmt::format_to(error_msg, "actual length: {}; ", str_val->len);
-            return false;
-        }
-        break;
-    }
-    case TYPE_DECIMALV2: {
-        DecimalV2Value dec_val(reinterpret_cast<PackedInt128*>(slot)->value);
-        if (dec_val.greater_than_scale(type.scale)) {
-            int code = dec_val.round(&dec_val, type.scale, HALF_UP);
-            reinterpret_cast<PackedInt128*>(slot)->value = dec_val.value();
-            if (code != E_DEC_OK) {
-                fmt::format_to(error_msg, "round one decimal failed. value={}; ",
-                               dec_val.to_string());
-                return false;
-            }
-        }
-        if (dec_val > _max_decimalv2_val[slot_index] || dec_val < _min_decimalv2_val[slot_index]) {
-            fmt::format_to(error_msg, "decimal value is not valid for definition, column={}",
-                           col_name);
-            fmt::format_to(error_msg, ", value={}", dec_val.to_string());
-            fmt::format_to(error_msg, ", precision={}, scale={}; ", type.precision, type.scale);
-            return false;
-        }
-        break;
-    }
-    case TYPE_HLL: {
-        Slice* hll_val = (Slice*)slot;
-        if (!HyperLogLog::is_valid(*hll_val)) {
-            fmt::format_to(error_msg, "Content of HLL type column is invalid. 
column name: {}; ", - col_name); - return false; - } - break; - } - case TYPE_ARRAY: { - auto array_val = (CollectionValue*)slot; - DCHECK(type.children.size() == 1); - auto nested_type = type.children[0]; - if (nested_type.type != TYPE_ARRAY && nested_type.type != TYPE_CHAR && - nested_type.type != TYPE_VARCHAR && nested_type.type != TYPE_STRING) { - break; - } - auto iter = array_val->iterator(nested_type.type); - while (iter.has_next()) { - auto data = iter.get(); - // validate array nested element is nullable - if (data == nullptr) { - if (!type.contains_null) { - fmt::format_to(error_msg, - "null element for null nested column of ARRAY, column={}, " - "type={} ", - col_name, type.debug_string()); - return false; - } - } else { - // validate array nested element data - if (!_validate_cell(nested_type, col_name, data, slot_index, error_msg, batch)) { - fmt::format_to(error_msg, "ARRAY or elements invalid"); - return false; - } - } - iter.next(); - } - break; - } - default: - break; - } - return true; -} -Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitmap* filter_bitmap, - int* filtered_rows, bool* stop_processing) { - for (int row_no = 0; row_no < batch->num_rows(); ++row_no) { - Tuple* tuple = batch->get_row(row_no)->get_tuple(0); - bool row_valid = true; - fmt::memory_buffer error_msg; // error message - for (int i = 0; row_valid && i < _output_tuple_desc->slots().size(); ++i) { - SlotDescriptor* desc = _output_tuple_desc->slots()[i]; - if (desc->is_nullable() && tuple->is_null(desc->null_indicator_offset())) { - if (desc->type().type == TYPE_OBJECT) { - fmt::format_to(error_msg, - "null is not allowed for bitmap column, column_name: {}; ", - desc->col_name()); - row_valid = false; - } - continue; - } - void* slot = tuple->get_slot(desc->tuple_offset()); - row_valid = _validate_cell(desc->type(), desc->col_name(), slot, i, error_msg, batch); - } - - if (!row_valid) { - (*filtered_rows)++; - filter_bitmap->Set(row_no, true); - RETURN_IF_ERROR(state->append_error_msg_to_file( - []() -> std::string { return ""; }, - [&]() -> std::string { return fmt::to_string(error_msg); }, stop_processing)); - } - } - return Status::OK(); -} - -void OlapTableSink::_send_batch_process(RuntimeState* state) { - SCOPED_TIMER(_non_blocking_send_timer); - SCOPED_ATTACH_TASK(state); - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); - do { - int running_channels_num = 0; - for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&running_channels_num, this, - state](const std::shared_ptr& ch) { - running_channels_num += - ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); - }); - } - - if (running_channels_num == 0) { - LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " - "sender thread exit. " - << print_id(_load_id); - return; - } - } while (!_stop_background_threads_latch.wait_for( - std::chrono::milliseconds(config::olap_table_sink_send_interval_ms))); -} - -} // namespace stream_load -} // namespace doris diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h deleted file mode 100644 index 2216c89fff..0000000000 --- a/be/src/exec/tablet_sink.h +++ /dev/null @@ -1,583 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "common/object_pool.h" -#include "common/status.h" -#include "exec/data_sink.h" -#include "exec/tablet_info.h" -#include "gen_cpp/Types_types.h" -#include "gen_cpp/internal_service.pb.h" -#include "runtime/thread_context.h" -#include "util/bitmap.h" -#include "util/countdown_latch.h" -#include "util/ref_count_closure.h" -#include "util/spinlock.h" -#include "util/thread.h" - -namespace doris { - -class Bitmap; -class MemTracker; -class RuntimeProfile; -class RowDescriptor; -class ThreadPool; -class ThreadPoolToken; -class Tuple; -class TupleDescriptor; -class ExprContext; -class TExpr; - -namespace vectorized { -class Block; -class MutableBlock; -} // namespace vectorized -namespace stream_load { - -class OlapTableSink; - -// The counter of add_batch rpc of a single node -struct AddBatchCounter { - // total execution time of a add_batch rpc - int64_t add_batch_execution_time_us = 0; - // lock waiting time in a add_batch rpc - int64_t add_batch_wait_execution_time_us = 0; - // number of add_batch call - int64_t add_batch_num = 0; - // time passed between marked close and finish close - int64_t close_wait_time_ms = 0; - - AddBatchCounter& operator+=(const AddBatchCounter& rhs) { - add_batch_execution_time_us += rhs.add_batch_execution_time_us; - add_batch_wait_execution_time_us += rhs.add_batch_wait_execution_time_us; - add_batch_num += rhs.add_batch_num; - close_wait_time_ms += rhs.close_wait_time_ms; - return *this; - } - friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) { - AddBatchCounter sum = lhs; - sum += rhs; - return sum; - } -}; - -// It's very error-prone to guarantee the handler capture vars' & this closure's destruct sequence. -// So using create() to get the closure pointer is recommended. We can delete the closure ptr before the capture vars destruction. -// Delete this point is safe, don't worry about RPC callback will run after ReusableClosure deleted. -template -class ReusableClosure final : public google::protobuf::Closure { -public: - ReusableClosure() : cid(INVALID_BTHREAD_ID) {} - ~ReusableClosure() override { - // shouldn't delete when Run() is calling or going to be called, wait for current Run() done. - join(); - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - cntl.Reset(); - } - - static ReusableClosure* create() { return new ReusableClosure(); } - - void addFailedHandler(const std::function& fn) { failed_handler = fn; } - void addSuccessHandler(const std::function& fn) { success_handler = fn; } - - void join() { - // We rely on in_flight to assure one rpc is running, - // while cid is not reliable due to memory order. - // in_flight is written before getting callid, - // so we can not use memory fence to synchronize. 
- while (_packet_in_flight) { - // cid here is complicated - if (cid != INVALID_BTHREAD_ID) { - // actually cid may be the last rpc call id. - brpc::Join(cid); - } - if (_packet_in_flight) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - } - - // plz follow this order: reset() -> set_in_flight() -> send brpc batch - void reset() { - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); - cntl.Reset(); - cid = cntl.call_id(); - } - - bool try_set_in_flight() { - bool value = false; - return _packet_in_flight.compare_exchange_strong(value, true); - } - - void clear_in_flight() { _packet_in_flight = false; } - - bool is_packet_in_flight() { return _packet_in_flight; } - - void end_mark() { - DCHECK(_is_last_rpc == false); - _is_last_rpc = true; - } - - void Run() override { - DCHECK(_packet_in_flight); - if (cntl.Failed()) { - LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl.ErrorCode()) - << ", error_text=" << cntl.ErrorText(); - failed_handler(_is_last_rpc); - } else { - success_handler(result, _is_last_rpc); - } - clear_in_flight(); - } - - brpc::Controller cntl; - T result; - -private: - brpc::CallId cid; - std::atomic _packet_in_flight {false}; - std::atomic _is_last_rpc {false}; - std::function failed_handler; - std::function success_handler; -}; - -class IndexChannel; -class NodeChannel { -public: - NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); - virtual ~NodeChannel() noexcept; - - // called before open, used to add tablet located in this backend - void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); } - - virtual Status init(RuntimeState* state); - - void add_slave_tablet_nodes(int64_t tablet_id, const std::vector& slave_nodes) { - _slave_tablet_nodes[tablet_id] = slave_nodes; - } - - // we use open/open_wait to parallel - void open(); - virtual Status open_wait(); - - Status add_row(Tuple* tuple, int64_t tablet_id); - - Status add_row(const BlockRow& block_row, int64_t tablet_id); - - virtual Status add_block(vectorized::Block* block, - const std::pair, - std::vector>& payload) { - LOG(FATAL) << "add block to NodeChannel not supported"; - return Status::OK(); - } - - // two ways to stop channel: - // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. - // 2. just cancel() - virtual void mark_close(); - Status close_wait(RuntimeState* state); - - void cancel(const std::string& cancel_msg); - - // return: - // 0: stopped, send finished(eos request has been sent), or any internal error; - // 1: running, haven't reach eos. - // only allow 1 rpc in flight - // plz make sure, this func should be called after open_wait(). 
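// Sketch of the single-RPC-in-flight guard used by the closure above: a
// compare-and-swap gates sending, the completion callback clears the flag, and
// join() spins until it is clear. Illustrative shape, not the removed class.
#include <atomic>
#include <chrono>
#include <thread>

struct InFlightGuard {
    std::atomic<bool> in_flight {false};

    bool try_set_in_flight() {
        bool expected = false; // succeeds only if no rpc is currently outstanding
        return in_flight.compare_exchange_strong(expected, true);
    }

    void clear_in_flight() { in_flight.store(false); } // called from Run()

    void join() {
        while (in_flight.load()) { // block until the outstanding callback has run
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }
    }
};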
- virtual int try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token); - - void try_send_batch(RuntimeState* state); - - void time_report(std::unordered_map* add_batch_counter_map, - int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, - int64_t* queue_push_lock_ns, int64_t* actual_consume_ns, - int64_t* total_add_batch_exec_time_ns, int64_t* add_batch_exec_time_ns, - int64_t* total_add_batch_num) { - (*add_batch_counter_map)[_node_id] += _add_batch_counter; - (*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms; - *serialize_batch_ns += _serialize_batch_ns; - *mem_exceeded_block_ns += _mem_exceeded_block_ns; - *queue_push_lock_ns += _queue_push_lock_ns; - *actual_consume_ns += _actual_consume_ns; - *add_batch_exec_time_ns = (_add_batch_counter.add_batch_execution_time_us * 1000); - *total_add_batch_exec_time_ns += *add_batch_exec_time_ns; - *total_add_batch_num += _add_batch_counter.add_batch_num; - } - - int64_t node_id() const { return _node_id; } - std::string host() const { return _node_info.host; } - std::string name() const { return _name; } - - Status none_of(std::initializer_list vars); - - void clear_all_batches(); - - virtual void clear_all_blocks() {} - - std::string channel_info() const { - return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host, - _node_info.brpc_port); - } - - size_t get_pending_bytes() { return _pending_batches_bytes; } - -protected: - void _cancel_with_msg(const std::string& msg); - - virtual void _close_check(); - -protected: - bool _is_vectorized = false; - OlapTableSink* _parent = nullptr; - IndexChannel* _index_channel = nullptr; - int64_t _node_id = -1; - std::string _load_info; - std::string _name; - - std::shared_ptr _node_channel_tracker; - - TupleDescriptor* _tuple_desc = nullptr; - NodeInfo _node_info; - - // this should be set in init() using config - int _rpc_timeout_ms = 60000; - int64_t _next_packet_seq = 0; - MonotonicStopWatch _timeout_watch; - - // the timestamp when this node channel be marked closed and finished closed - uint64_t _close_time_ms = 0; - - // user cancel or get some errors - std::atomic _cancelled {false}; - SpinLock _cancel_msg_lock; - std::string _cancel_msg = ""; - - // send finished means the consumer thread which send the rpc can exit - std::atomic _send_finished {false}; - - // add batches finished means the last rpc has be response, used to check whether this channel can be closed - std::atomic _add_batches_finished {false}; // reuse for vectorized - - bool _eos_is_produced {false}; // only for restricting producer behaviors - - std::unique_ptr _row_desc; - int _batch_size = 0; - - // limit _pending_batches size - std::atomic _pending_batches_bytes {0}; - size_t _max_pending_batches_bytes {(size_t)config::nodechannel_pending_queue_max_bytes}; - std::mutex _pending_batches_lock; // reuse for vectorized - std::atomic _pending_batches_num {0}; // reuse for vectorized - - std::shared_ptr _stub = nullptr; - RefCountClosure* _open_closure = nullptr; - - std::vector _all_tablets; - // map from tablet_id to node_id where slave replicas locate in - std::unordered_map> _slave_tablet_nodes; - std::vector _tablet_commit_infos; - - AddBatchCounter _add_batch_counter; - std::atomic _serialize_batch_ns {0}; - std::atomic _mem_exceeded_block_ns {0}; - std::atomic _queue_push_lock_ns {0}; - std::atomic _actual_consume_ns {0}; - - // lock to protect _is_closed. - // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel. 
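// Sketch of the backpressure implied by _pending_batches_bytes above: a producer
// only enqueues while the pending-queue byte size is under the configured cap,
// otherwise it waits for the sender to drain. Hypothetical, simplified shape.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <thread>

struct PendingQueueGate {
    std::atomic<size_t> pending_bytes {0};
    size_t max_pending_bytes; // e.g. config::nodechannel_pending_queue_max_bytes

    explicit PendingQueueGate(size_t cap) : max_pending_bytes(cap) {}

    void wait_for_room(size_t incoming) {
        while (pending_bytes.load() + incoming > max_pending_bytes) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1)); // stall producer
        }
    }
    void on_enqueue(size_t bytes) { pending_bytes.fetch_add(bytes); }
    void on_sent(size_t bytes) { pending_bytes.fetch_sub(bytes); }
};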
- // However, this rpc callback may occur after the whole task is finished (e.g. due to network latency), - // and by that time the IndexChannel may have been destructured, so we should not call the - // IndexChannel methods anymore, otherwise the BE will crash. - // Therefore, we use the _is_closed and _closed_lock to ensure that the RPC callback - // function will not call the IndexChannel method after the NodeChannel is closed. - // The IndexChannel is definitely accessible until the NodeChannel is closed. - std::mutex _closed_lock; - bool _is_closed = false; - - RuntimeState* _state; - // rows number received per tablet, tablet_id -> rows_num - std::vector> _tablets_received_rows; - -private: - std::unique_ptr _cur_batch; - PTabletWriterAddBatchRequest _cur_add_batch_request; - using AddBatchReq = std::pair, PTabletWriterAddBatchRequest>; - std::queue _pending_batches; - ReusableClosure* _add_batch_closure = nullptr; -}; - -class IndexChannel { -public: - IndexChannel(OlapTableSink* parent, int64_t index_id, bool is_vec) - : _parent(parent), _index_id(index_id), _is_vectorized(is_vec) { - _index_channel_tracker = - std::make_unique("IndexChannel:indexID=" + std::to_string(_index_id)); - } - ~IndexChannel() = default; - - Status init(RuntimeState* state, const std::vector& tablets); - - template - void add_row(const Row& tuple, int64_t tablet_id); - - void for_each_node_channel( - const std::function&)>& func) { - for (auto& it : _node_channels) { - func(it.second); - } - } - - void mark_as_failed(int64_t node_id, const std::string& host, const std::string& err, - int64_t tablet_id = -1); - Status check_intolerable_failure(); - - // set error tablet info in runtime state, so that it can be returned to FE. - void set_error_tablet_in_state(RuntimeState* state); - - size_t num_node_channels() const { return _node_channels.size(); } - - size_t get_pending_bytes() const { - size_t mem_consumption = 0; - for (auto& kv : _node_channels) { - mem_consumption += kv.second->get_pending_bytes(); - } - return mem_consumption; - } - - void set_tablets_received_rows( - const std::vector>& tablets_received_rows, int64_t node_id); - - // check whether the rows num written by different replicas is consistent - Status check_tablet_received_rows_consistency(); - -private: - friend class NodeChannel; - friend class VNodeChannel; - friend class VOlapTableSink; - - OlapTableSink* _parent; - int64_t _index_id; - bool _is_vectorized = false; - - // from backend channel to tablet_id - // ATTN: must be placed before `_node_channels` and `_channels_by_tablet`. - // Because the destruct order of objects is opposite to the creation order. - // So NodeChannel will be destructured first. - // And the destructor function of NodeChannel waits for all RPCs to finish. - // This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC. 
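// Sketch of the _is_closed/_closed_lock idiom described above: the rpc callback
// re-checks the flag under the lock, so it can never touch the IndexChannel after
// close. Minimal hypothetical shape, not the removed class.
#include <mutex>
#include <utility>

class CallbackGuard {
public:
    void mark_closed() {
        std::lock_guard<std::mutex> l(_lock);
        _closed = true;
    }

    // Invoked from the rpc callback thread.
    template <typename Fn>
    void run_if_open(Fn&& fn) {
        std::lock_guard<std::mutex> l(_lock);
        if (!_closed) {
            std::forward<Fn>(fn)(); // safe: close cannot finish while we hold the lock
        }
    }

private:
    std::mutex _lock;
    bool _closed = false;
};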
-    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
-    // BeId -> channel
-    std::unordered_map<int64_t, std::shared_ptr<NodeChannel>> _node_channels;
-    // from tablet_id to backend channel
-    std::unordered_map<int64_t, std::vector<std::shared_ptr<NodeChannel>>> _channels_by_tablet;
-
-    // lock to protect _failed_channels and _failed_channels_msgs
-    mutable SpinLock _fail_lock;
-    // key is tablet_id, value is a set of failed node ids
-    std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
-    // key is tablet_id, value is the error message
-    std::unordered_map<int64_t, std::string> _failed_channels_msgs;
-    Status _intolerable_failure_status = Status::OK();
-
-    std::unique_ptr<MemTracker> _index_channel_tracker;
-    // rows num received by DeltaWriter per tablet, tablet_id -> <node_id, rows_num>,
-    // used to verify whether the rows num received by different replicas is consistent
-    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_received_rows;
-};
-
-template <typename Row>
-void IndexChannel::add_row(const Row& tuple, int64_t tablet_id) {
-    SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get());
-    auto it = _channels_by_tablet.find(tablet_id);
-    DCHECK(it != _channels_by_tablet.end()) << "unknown tablet, tablet_id=" << tablet_id;
-    for (const auto& channel : it->second) {
-        // if this node channel has already failed, this add_row will be skipped
-        auto st = channel->add_row(tuple, tablet_id);
-        if (!st.ok()) {
-            mark_as_failed(channel->node_id(), channel->host(), st.to_string(), tablet_id);
-            // continue adding rows to the other nodes; the error is checked for every batch outside
-        }
-    }
-}
-
-// Write data to an OLAP table.
-// When OlapTableSink::open() is called, a consumer thread starts running in the background.
-// When you call OlapTableSink::send(), you are the producer who produces pending batches.
-// Join the consumer thread in close().
-class OlapTableSink : public DataSink {
-public:
-    // Construct from the thrift struct which is generated by FE.
-    OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector<TExpr>& texprs,
-                  Status* status);
-    ~OlapTableSink() override;
-
-    Status init(const TDataSink& sink) override;
-
-    Status prepare(RuntimeState* state) override;
-
-    Status open(RuntimeState* state) override;
-
-    Status send(RuntimeState* state, RowBatch* batch) override;
-
-    // close() will send RPCs too. If the RPCs fail, return an error.
-    Status close(RuntimeState* state, Status close_status) override;
-
-    // Returns the runtime profile for the sink.
-    RuntimeProfile* profile() override { return _profile; }
-
-private:
-    // convert the input batch to the output batch which will be loaded into the OLAP table.
-    // this is only used in insert statements.
-    Status _convert_batch(RuntimeState* state, RowBatch* input_batch, RowBatch* output_batch);
-
-    // make the input data valid for the OLAP table.
-    // returns the number of invalid/filtered rows;
-    // the invalid row numbers are set in the Bitmap.
-    // stop_processing is set if we want to stop the whole process now.
-    Status _validate_data(RuntimeState* state, RowBatch* batch, Bitmap* filter_bitmap,
-                          int* filtered_rows, bool* stop_processing);
-    bool _validate_cell(const TypeDescriptor& type, const std::string& col_name, void* slot,
-                        size_t slot_index, fmt::memory_buffer& error_msg, RowBatch* batch);
-
-    // the consumer func that sends pending batches in every NodeChannel.
-    // uses polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending. 
- // only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer - void _send_batch_process(RuntimeState* state); - -protected: - friend class NodeChannel; - friend class VNodeChannel; - friend class IndexChannel; - - bool _is_vectorized = false; - - std::shared_ptr _mem_tracker; - - ObjectPool* _pool; - const RowDescriptor& _input_row_desc; - - // unique load id - PUniqueId _load_id; - int64_t _txn_id = -1; - int _num_replicas = -1; - int _tuple_desc_id = -1; - - // this is tuple descriptor of destination OLAP table - TupleDescriptor* _output_tuple_desc = nullptr; - RowDescriptor* _output_row_desc = nullptr; - - bool _need_validate_data = false; - - // number of senders used to insert into OlapTable, if we only support single node insert, - // all data from select should collectted and then send to OlapTable. - // To support multiple senders, we maintain a channel for each sender. - int _sender_id = -1; - int _num_senders = -1; - bool _is_high_priority = false; - - // TODO(zc): think about cache this data - std::shared_ptr _schema; - OlapTableLocationParam* _location = nullptr; - bool _write_single_replica = false; - OlapTableLocationParam* _slave_location = nullptr; - DorisNodesInfo* _nodes_info = nullptr; - - RuntimeProfile* _profile = nullptr; - - std::set _partition_ids; - // only used for partition with random distribution - std::map _partition_to_tablet_map; - - Bitmap _filter_bitmap; - - // index_channel - std::vector> _channels; - - CountDownLatch _stop_background_threads_latch; - scoped_refptr _sender_thread; - std::unique_ptr _send_batch_thread_pool_token; - - std::vector _max_decimalv2_val; - std::vector _min_decimalv2_val; - - // Stats for this - int64_t _convert_batch_ns = 0; - int64_t _validate_data_ns = 0; - int64_t _send_data_ns = 0; - int64_t _number_input_rows = 0; - int64_t _number_output_rows = 0; - int64_t _number_filtered_rows = 0; - - RuntimeProfile::Counter* _input_rows_counter = nullptr; - RuntimeProfile::Counter* _output_rows_counter = nullptr; - RuntimeProfile::Counter* _filtered_rows_counter = nullptr; - RuntimeProfile::Counter* _send_data_timer = nullptr; - RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr; - RuntimeProfile::Counter* _convert_batch_timer = nullptr; - RuntimeProfile::Counter* _validate_data_timer = nullptr; - RuntimeProfile::Counter* _open_timer = nullptr; - RuntimeProfile::Counter* _close_timer = nullptr; - RuntimeProfile::Counter* _non_blocking_send_timer = nullptr; - RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr; - RuntimeProfile::Counter* _serialize_batch_timer = nullptr; - RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr; - RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr; - RuntimeProfile::Counter* _add_batch_number = nullptr; - RuntimeProfile::Counter* _num_node_channels = nullptr; - - // load mem limit is for remote load channel - int64_t _load_mem_limit = -1; - - // the timeout of load channels opened by this tablet sink. in second - int64_t _load_channel_timeout_s = 0; - - int32_t _send_batch_parallelism = 1; - // Save the status of close() method - Status _close_status; - - // User can change this config at runtime, avoid it being modified during query or loading process. 
- bool _transfer_large_data_by_brpc = false; - - // FIND_TABLET_EVERY_ROW is used for both hash and random distribution info, which indicates that we - // should compute tablet index for every row - // FIND_TABLET_EVERY_BATCH is only used for random distribution info, which indicates that we should - // compute tablet index for every row batch - // FIND_TABLET_EVERY_SINK is only used for random distribution info, which indicates that we should - // only compute tablet index in the corresponding partition once for the whole time in olap table sink - enum FindTabletMode { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK }; - FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW; - -private: - OlapTablePartitionParam* _partition = nullptr; - std::vector _output_expr_ctxs; - std::unique_ptr _output_batch; -}; - -} // namespace stream_load -} // namespace doris diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 8ebed0ccdb..9b2c7149ed 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -24,6 +24,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "exec/parquet_scanner.h" #include "olap/row.h" #include "olap/rowset/rowset_id_generator.h" #include "olap/rowset/rowset_meta_manager.h" @@ -32,7 +33,6 @@ #include "olap/tablet.h" #include "olap/tablet_schema.h" #include "runtime/exec_env.h" -#include "vec/exec/vparquet_scanner.h" namespace doris { using namespace ErrorCode; @@ -821,9 +821,9 @@ Status PushBrokerReader::init(const Schema* schema, const TBrokerScanRange& t_sc BaseScanner* scanner = nullptr; switch (t_scan_range.ranges[0].format_type) { case TFileFormatType::FORMAT_PARQUET: - scanner = new vectorized::VParquetScanner( - _runtime_state.get(), _runtime_profile, t_scan_range.params, t_scan_range.ranges, - t_scan_range.broker_addresses, _pre_filter_texprs, _counter.get()); + scanner = new ParquetScanner(_runtime_state.get(), _runtime_profile, t_scan_range.params, + t_scan_range.ranges, t_scan_range.broker_addresses, + _pre_filter_texprs, _counter.get()); break; default: LOG(WARNING) << "Unsupported file format type: " << t_scan_range.ranges[0].format_type; diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index 3d42ae0faf..36c75ffbef 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -59,10 +59,6 @@ set(RUNTIME_FILES disk_io_mgr.cc disk_io_mgr_reader_context.cc disk_io_mgr_scan_range.cc - buffered_block_mgr2.cc - buffered_tuple_stream2.cc - buffered_tuple_stream3.cc - export_sink.cpp load_channel_mgr.cpp load_channel.cpp tablets_channel.cpp @@ -85,9 +81,7 @@ set(RUNTIME_FILES small_file_mgr.cpp record_batch_queue.cpp result_queue_mgr.cpp - memory_scratch_sink.cpp external_scan_context_mgr.cpp - mysql_result_writer.cpp memory/system_allocator.cpp memory/chunk_allocator.cpp memory/mem_tracker_limiter.cpp @@ -96,16 +90,8 @@ set(RUNTIME_FILES fold_constant_executor.cpp cache/result_node.cpp cache/result_cache.cpp - odbc_table_sink.cpp ) -if (WITH_MYSQL) - set(RUNTIME_FILES ${RUNTIME_FILES} - mysql_table_writer.cpp - mysql_table_sink.cpp - ) -endif() - if (USE_JEMALLOC AND USE_MEM_TRACKER) set(RUNTIME_FILES ${RUNTIME_FILES} memory/jemalloc_hook.cpp diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc deleted file mode 100644 index 1754be4dc1..0000000000 --- a/be/src/runtime/buffered_block_mgr2.cc +++ /dev/null @@ -1,1216 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one 
-// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/buffered-block-mgr2.cc -// and modified by Doris - -#include "runtime/buffered_block_mgr2.h" - -#include "exec/exec_node.h" -#include "runtime/exec_env.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/runtime_state.h" -#include "runtime/tmp_file_mgr.h" -#include "util/bit_util.h" -#include "util/debug_util.h" -#include "util/disk_info.h" -#include "util/doris_metrics.h" -#include "util/pretty_printer.h" -#include "util/runtime_profile.h" -#include "util/stack_util.h" -#include "util/uid_util.h" - -using std::string; -using std::stringstream; -using std::vector; -using std::list; -using std::endl; - -using std::bind; -using std::mem_fn; -using std::lock_guard; -using std::mutex; -using std::shared_ptr; -using std::unique_lock; - -namespace doris { -using namespace ErrorCode; - -BufferedBlockMgr2::BlockMgrsMap BufferedBlockMgr2::_s_query_to_block_mgrs; -SpinLock BufferedBlockMgr2::_s_block_mgrs_lock; - -class BufferedBlockMgr2::Client { -public: - Client(BufferedBlockMgr2* mgr, int num_reserved_buffers, RuntimeState* state) - : _mgr(mgr), - _state(state), - _tracker(std::make_unique("BufferedBlockMgr2::Client")), - _num_reserved_buffers(num_reserved_buffers), - _num_tmp_reserved_buffers(0), - _num_pinned_buffers(0) {} - - // A null dtor to pass codestyle check - ~Client() {} - - // Unowned. - BufferedBlockMgr2* _mgr; - - // Unowned. - RuntimeState* _state; - - // Tracker for this client. Unowned. - // When the client gets a buffer, we update the consumption on this tracker. However, - // we don't want to transfer the buffer from the block mgr to the client (i.e. release - // from the block mgr), since the block mgr is where the block mem usage limit is - // enforced. Even when we give a buffer to a client, the buffer is still owned and - // counts against the block mgr tracker (i.e. there is a fixed pool of buffers - // regardless of if they are in the block mgr or the clients). - std::unique_ptr _tracker; - - // Number of buffers reserved by this client. - int _num_reserved_buffers; - - // Number of buffers temporarily reserved. - int _num_tmp_reserved_buffers; - - // Number of buffers pinned by this client. 
- int _num_pinned_buffers; - - void pin_buffer(BufferDescriptor* buffer) { - DCHECK(buffer != nullptr); - if (buffer->len == _mgr->max_block_size()) { - ++_num_pinned_buffers; - _tracker->consume(buffer->len); - } - } - - void unpin_buffer(BufferDescriptor* buffer) { - DCHECK(buffer != nullptr); - if (buffer->len == _mgr->max_block_size()) { - DCHECK_GT(_num_pinned_buffers, 0); - --_num_pinned_buffers; - _tracker->release(buffer->len); - } - } - - string debug_string() const { - stringstream ss; - ss << "Client " << this << endl - << " num_reserved_buffers=" << _num_reserved_buffers << endl - << " num_tmp_reserved_buffers=" << _num_tmp_reserved_buffers << endl - << " num_pinned_buffers=" << _num_pinned_buffers; - return ss.str(); - } -}; - -// BufferedBlockMgr2::Block methods. -BufferedBlockMgr2::Block::Block(BufferedBlockMgr2* block_mgr) - : _buffer_desc(nullptr), - _block_mgr(block_mgr), - _client(nullptr), - _write_range(nullptr), - _tmp_file(nullptr), - _valid_data_len(0), - _num_rows(0) {} - -Status BufferedBlockMgr2::Block::pin(bool* pinned, Block* release_block, bool unpin) { - return _block_mgr->pin_block(this, pinned, release_block, unpin); -} - -Status BufferedBlockMgr2::Block::unpin() { - return _block_mgr->unpin_block(this); -} - -void BufferedBlockMgr2::Block::del() { - _block_mgr->delete_block(this); -} - -void BufferedBlockMgr2::Block::init() { - // No locks are taken because the block is new or has previously been deleted. - _is_pinned = false; - _in_write = false; - _is_deleted = false; - _valid_data_len = 0; - _client = nullptr; - _num_rows = 0; -} - -bool BufferedBlockMgr2::Block::validate() const { - if (_is_deleted && (_is_pinned || (!_in_write && _buffer_desc != nullptr))) { - LOG(ERROR) << "Deleted block in use - " << debug_string(); - return false; - } - - if (_buffer_desc == nullptr && (_is_pinned || _in_write)) { - LOG(ERROR) << "Block without buffer in use - " << debug_string(); - return false; - } - - if (_buffer_desc == nullptr && _block_mgr->_unpinned_blocks.contains(this)) { - LOG(ERROR) << "Unpersisted block without buffer - " << debug_string(); - return false; - } - - if (_buffer_desc != nullptr && (_buffer_desc->block != this)) { - LOG(ERROR) << "Block buffer inconsistency - " << debug_string(); - return false; - } - - return true; -} - -string BufferedBlockMgr2::Block::tmp_file_path() const { - if (_tmp_file == nullptr) { - return ""; - } - return _tmp_file->path(); -} - -string BufferedBlockMgr2::Block::debug_string() const { - stringstream ss; - ss << "Block: " << this << endl - << " Buffer Desc: " << _buffer_desc << endl - << " Data Len: " << _valid_data_len << endl - << " Num Rows: " << _num_rows << endl; - if (_is_pinned) { - ss << " Buffer Len: " << buffer_len() << endl; - } - ss << " Deleted: " << _is_deleted << endl - << " Pinned: " << _is_pinned << endl - << " Write Issued: " << _in_write << endl - << " Client Local: " << _client_local; - return ss.str(); -} - -BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, - int64_t block_size) - : _max_block_size(block_size), - // Keep two writes in flight per scratch disk so the disks can stay busy. 
-          _block_write_threshold(tmp_file_mgr->num_active_tmp_devices() * 2),
-          _enable_spill(state->enable_spill()),
-          _query_id(state->query_id()),
-          _tmp_file_mgr(tmp_file_mgr),
-          _initialized(false),
-          _unfullfilled_reserved_buffers(0),
-          _total_pinned_buffers(0),
-          _non_local_outstanding_writes(0),
-          _io_mgr(state->exec_env()->disk_io_mgr()),
-          _is_cancelled(false),
-          _writes_issued(0),
-          _state(state) {}
-
-Status BufferedBlockMgr2::create(RuntimeState* state, RuntimeProfile* profile,
-                                 TmpFileMgr* tmp_file_mgr, int64_t block_size,
-                                 std::shared_ptr<BufferedBlockMgr2>* block_mgr) {
-    block_mgr->reset();
-    {
-        // we do not use the global BlockMgrsMap for now, to avoid mem-limit-exceeded errors
-        // across different fragments running on the same machine.
-        // TODO(lingbin): enable it later. note that it should be enabled together with the
-        // query mem limit in RuntimeState.
-
-        // lock_guard<SpinLock> lock(_s_block_mgrs_lock);
-        // BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(state->query_id());
-        // if (it != _s_query_to_block_mgrs.end()){
-        //     *block_mgr = it->second.lock();
-        // }
-        if (*block_mgr == nullptr) {
-            // weak_ptr::lock returns nullptr if the weak_ptr is expired. This means
-            // all shared_ptr references have gone to 0 and it is in the process of
-            // being deleted. This can happen if the last shared reference is released
-            // but before the weak ptr is removed from the map.
-            block_mgr->reset(new BufferedBlockMgr2(state, tmp_file_mgr, block_size));
-            // _s_query_to_block_mgrs[state->query_id()] = *block_mgr;
-        }
-    }
-    (*block_mgr)->init(state->exec_env()->disk_io_mgr(), profile);
-    return Status::OK();
-}
-
-int64_t BufferedBlockMgr2::available_buffers(Client* client) const {
-    int64_t unused_reserved = client->_num_reserved_buffers + client->_num_tmp_reserved_buffers -
-                              client->_num_pinned_buffers;
-    return std::max<int64_t>(0, remaining_unreserved_buffers()) +
-           std::max<int64_t>(0, unused_reserved);
-}
-
-int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const {
-    int64_t num_buffers =
-            _free_io_buffers.size() + _unpinned_blocks.size() + _non_local_outstanding_writes;
-    num_buffers += thread_context()->thread_mem_tracker()->spare_capacity() / max_block_size();
-    num_buffers -= _unfullfilled_reserved_buffers;
-    return num_buffers;
-}
-
-Status BufferedBlockMgr2::register_client(int num_reserved_buffers, RuntimeState* state,
-                                          Client** client) {
-    DCHECK_GE(num_reserved_buffers, 0);
-    Client* a_client = new Client(this, num_reserved_buffers, state);
-    lock_guard<mutex> lock(_lock);
-    *client = _obj_pool.add(a_client);
-    _unfullfilled_reserved_buffers += num_reserved_buffers;
-    return Status::OK();
-}
-
-void BufferedBlockMgr2::clear_reservations(Client* client) {
-    lock_guard<mutex> lock(_lock);
-    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        _unfullfilled_reserved_buffers -=
-                client->_num_reserved_buffers - client->_num_pinned_buffers;
-    }
-    client->_num_reserved_buffers = 0;
-
-    _unfullfilled_reserved_buffers -= client->_num_tmp_reserved_buffers;
-    client->_num_tmp_reserved_buffers = 0;
-}
-
-bool BufferedBlockMgr2::try_acquire_tmp_reservation(Client* client, int num_buffers) {
-    lock_guard<mutex> lock(_lock);
-    // TODO: Can the modifications to the client's mem variables be made w/o the lock?
-    DCHECK_EQ(client->_num_tmp_reserved_buffers, 0);
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        // If the client has unused reserved buffers, we use those first. 
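// Two sizing rules from the code above, isolated as plain functions (illustrative
// names, not the removed implementation): the write threshold keeps two
// unpinned-block writes in flight per active scratch device so every disk stays
// busy, and available_buffers() lets a client use any globally unreserved buffer
// plus whatever part of its own reservation it has not pinned yet, with negative
// intermediates clamped to zero.
#include <algorithm>
#include <cstdint>

int block_write_threshold(int num_active_tmp_devices) {
    return num_active_tmp_devices * 2; // two in-flight writes per scratch disk
}

int64_t available_buffers(int64_t remaining_unreserved, int64_t num_reserved,
                          int64_t num_tmp_reserved, int64_t num_pinned) {
    int64_t unused_reserved = num_reserved + num_tmp_reserved - num_pinned;
    return std::max<int64_t>(0, remaining_unreserved) +
           std::max<int64_t>(0, unused_reserved);
}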
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        // If the client has unused reserved buffers, we use those first.
-        num_buffers -= client->_num_reserved_buffers - client->_num_pinned_buffers;
-    }
-    if (num_buffers < 0) {
-        return true;
-    }
-    if (available_buffers(client) < num_buffers) {
-        return false;
-    }
-
-    client->_num_tmp_reserved_buffers = num_buffers;
-    _unfullfilled_reserved_buffers += num_buffers;
-    return true;
-}
-
-void BufferedBlockMgr2::cancel() {
-    {
-        lock_guard<mutex> lock(_lock);
-        if (_is_cancelled) {
-            return;
-        }
-        _is_cancelled = true;
-    }
-    // Cancel the underlying io mgr to unblock any waiting threads.
-    _io_mgr->cancel_context(_io_request_context);
-}
-
-bool BufferedBlockMgr2::is_cancelled() {
-    lock_guard<mutex> lock(_lock);
-    return _is_cancelled;
-}
-
-Status BufferedBlockMgr2::mem_limit_too_low_error(Client* client, int node_id) {
-    VLOG_QUERY << "Query: " << _query_id << ". Node=" << node_id << " ran out of memory: " << endl
-               << debug_internal() << endl
-               << client->debug_string();
-
-    // TODO: what to print here. We can't know the value of the entire query here.
-    stringstream error_msg;
-    error_msg << "The memory limit is set too low to initialize spilling operator (id=" << node_id
-              << "). The minimum required memory to spill this operator is "
-              << PrettyPrinter::print(client->_num_reserved_buffers * max_block_size(),
-                                      TUnit::BYTES)
-              << ".";
-    return add_exec_msg(error_msg.str());
-}
-
-Status BufferedBlockMgr2::add_exec_msg(const std::string& msg) const {
-    stringstream str;
-    str << msg << " ";
-    str << "Backend: " << BackendOptions::get_localhost() << ", ";
-    str << "fragment: " << print_id(_state->fragment_instance_id()) << " ";
-    return Status::MemoryLimitExceeded(str.str());
-}
-
-Status BufferedBlockMgr2::get_new_block(Client* client, Block* unpin_block, Block** block,
-                                        int64_t len) {
-    DCHECK_LE(len, _max_block_size) << "Cannot request block bigger than max_len";
-    DCHECK_NE(len, 0) << "Cannot request block of zero size";
-    *block = nullptr;
-    Block* new_block = nullptr;
-
-    {
-        lock_guard<mutex> lock(_lock);
-        if (_is_cancelled) {
-            return Status::Cancelled("Cancelled");
-        }
-        new_block = get_unused_block(client);
-        DCHECK(new_block->validate()) << endl << new_block->debug_string();
-        DCHECK_EQ(new_block->_client, client);
-
-        if (len > 0 && len < _max_block_size) {
-            DCHECK(unpin_block == nullptr);
-            Status st = thread_context()->thread_mem_tracker()->check_limit(len);
-            WARN_IF_ERROR(st, "get_new_block failed");
-            if (st) {
-                client->_tracker->consume(len);
-                // TODO: Have a cache of unused blocks of size 'len' (0, _max_block_size)
-                uint8_t* buffer = new uint8_t[len];
-                // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
-                new_block->_buffer_desc = new BufferDescriptor(buffer, len);
-                new_block->_buffer_desc->block = new_block;
-                new_block->_is_pinned = true;
-                client->pin_buffer(new_block->_buffer_desc);
-                ++_total_pinned_buffers;
-                *block = new_block;
-            } else {
-                new_block->_is_deleted = true;
-                return_unused_block(new_block);
-            }
-            return Status::OK();
-        }
-    }
-
-    bool in_mem = true;
-    RETURN_IF_ERROR(find_buffer_for_block(new_block, &in_mem));
-    DCHECK(!in_mem) << "A new block cannot start in mem.";
-    DCHECK(!new_block->is_pinned() || new_block->_buffer_desc != nullptr)
-            << new_block->debug_string();
-
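// Caller-side view of the branch below (a minimal sketch, not code from this
// file; the helper name is hypothetical): supplying a pinned 'unpin_block'
// means get_new_block() can always hand out a pinned block, taking the buffer
// from 'unpin_block' under memory pressure.
static Status example_switch_blocks(BufferedBlockMgr2* mgr, BufferedBlockMgr2::Client* client,
                                    BufferedBlockMgr2::Block* old_block,
                                    BufferedBlockMgr2::Block** new_block) {
    // 'old_block' must be pinned; on success it has been unpinned (or its
    // buffer transferred) and '*new_block' is pinned.
    RETURN_IF_ERROR(mgr->get_new_block(client, old_block, new_block));
    DCHECK(*new_block != nullptr && (*new_block)->is_pinned());
    return Status::OK();
}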
-    if (!new_block->is_pinned()) {
-        if (unpin_block == nullptr) {
-            // We couldn't get a new block and no unpin block was provided. Can't return
-            // a block.
-            new_block->_is_deleted = true;
-            return_unused_block(new_block);
-            new_block = nullptr;
-        } else {
-            // We need to transfer the buffer from unpin_block to new_block.
-            RETURN_IF_ERROR(transfer_buffer(new_block, unpin_block, true));
-        }
-    } else if (unpin_block != nullptr) {
-        // Got a new block without needing to transfer. Just unpin this block.
-        RETURN_IF_ERROR(unpin_block->unpin());
-    }
-
-    DCHECK(new_block == nullptr || new_block->is_pinned());
-    *block = new_block;
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::transfer_buffer(Block* dst, Block* src, bool unpin) {
-    Status status = Status::OK();
-    DCHECK(dst != nullptr);
-    DCHECK(src != nullptr);
-
-    // First write out the src block.
-    DCHECK(src->_is_pinned);
-    DCHECK(!dst->_is_pinned);
-    DCHECK(dst->_buffer_desc == nullptr);
-    DCHECK_EQ(src->_buffer_desc->len, _max_block_size);
-    src->_is_pinned = false;
-
-    if (unpin) {
-        unique_lock<mutex> lock(_lock);
-        src->_client_local = true;
-        status = write_unpinned_block(src);
-        if (!status.ok()) {
-            // The transfer failed, return the buffer to src.
-            src->_is_pinned = true;
-            return status;
-        }
-        // Wait for the write to complete.
-        while (src->_in_write && !_is_cancelled) {
-            src->_write_complete_cv.wait(lock);
-        }
-        if (_is_cancelled) {
-            // We can't be sure the write succeeded, so return the buffer to src.
-            src->_is_pinned = true;
-            return Status::Cancelled("Cancelled");
-        }
-        DCHECK(!src->_in_write);
-    }
-    // Assign the buffer to the new block.
-    dst->_buffer_desc = src->_buffer_desc;
-    dst->_buffer_desc->block = dst;
-    src->_buffer_desc = nullptr;
-    dst->_is_pinned = true;
-    if (!unpin) {
-        src->_is_deleted = true;
-        return_unused_block(src);
-    }
-    return Status::OK();
-}
-
-BufferedBlockMgr2::~BufferedBlockMgr2() {
-    {
-        lock_guard<SpinLock> lock(_s_block_mgrs_lock);
-        BlockMgrsMap::iterator it = _s_query_to_block_mgrs.find(_query_id);
-        // IMPALA-2286: Another fragment may have called create() for this _query_id and
-        // saw that this BufferedBlockMgr2 is being destructed. That fragment will
-        // overwrite the map entry for _query_id, pointing it to a different
-        // BufferedBlockMgr2 object. We should let that object's destructor remove the
-        // entry. On the other hand, if the second BufferedBlockMgr2 is destructed before
-        // this thread acquires the lock, then we'll remove the entry (because we can't
-        // distinguish between the two expired pointers), and when the other
-        // ~BufferedBlockMgr2() call occurs, it won't find an entry for this _query_id.
-        if (it != _s_query_to_block_mgrs.end()) {
-            std::shared_ptr<BufferedBlockMgr2> mgr = it->second.lock();
-            if (mgr.get() == nullptr) {
-                // The BufferedBlockMgr2 object referenced by this entry is being destructed.
-                _s_query_to_block_mgrs.erase(it);
-            } else {
-                // The map references another (still valid) BufferedBlockMgr2.
-                DCHECK_NE(mgr.get(), this);
-            }
-        }
-    }
-
-    if (_io_request_context != nullptr) {
-        _io_mgr->unregister_context(_io_request_context);
-    }
-
-    // If there are any outstanding writes and we are here it means that when the
-    // write_complete() callback gets executed it is going to access invalid memory.
-    // See IMPALA-1890.
-    DCHECK_EQ(_non_local_outstanding_writes, 0) << endl << debug_internal();
-    // Delete tmp files.
-    for (auto& file : _tmp_files) {
-        file->remove();
-    }
-    _tmp_files.clear();
-
-    // Free memory resources.
-    for (BufferDescriptor* buffer : _all_io_buffers) {
-        _mem_tracker->release(buffer->len);
-        delete[] buffer->buffer;
-    }
-}
-
-int64_t BufferedBlockMgr2::bytes_allocated() const {
-    return _mem_tracker->consumption();
-}
-
-int BufferedBlockMgr2::num_pinned_buffers(Client* client) const {
-    return client->_num_pinned_buffers;
-}
-
-int BufferedBlockMgr2::num_reserved_buffers_remaining(Client* client) const {
-    return std::max(client->_num_reserved_buffers - client->_num_pinned_buffers, 0);
-}
-
-MemTracker* BufferedBlockMgr2::get_tracker(Client* client) const {
-    return client->_tracker.get();
-}
-
-// TODO: It would be good if we had a sync primitive that supports is_mine() calls, see
-// IMPALA-1884.
-Status BufferedBlockMgr2::delete_or_unpin_block(Block* block, bool unpin) {
-    if (block == nullptr) {
-        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
-    }
-    if (unpin) {
-        return block->unpin();
-    } else {
-        block->del();
-        return is_cancelled() ? Status::Cancelled("Cancelled") : Status::OK();
-    }
-}
-
-Status BufferedBlockMgr2::pin_block(Block* block, bool* pinned, Block* release_block, bool unpin) {
-    DCHECK(block != nullptr);
-    DCHECK(!block->_is_deleted);
-    *pinned = false;
-    if (block->_is_pinned) {
-        *pinned = true;
-        return delete_or_unpin_block(release_block, unpin);
-    }
-
-    bool in_mem = false;
-    RETURN_IF_ERROR(find_buffer_for_block(block, &in_mem));
-    *pinned = block->_is_pinned;
-
-    // Block was not evicted or had no data, nothing left to do.
-    if (in_mem || block->_valid_data_len == 0) {
-        return delete_or_unpin_block(release_block, unpin);
-    }
-
-    if (!block->_is_pinned) {
-        if (release_block == nullptr) {
-            return Status::OK();
-        }
-
-        if (block->_buffer_desc != nullptr) {
-            {
-                lock_guard<mutex> lock(_lock);
-                if (_free_io_buffers.contains(block->_buffer_desc)) {
-                    DCHECK(!block->_is_pinned && !block->_in_write &&
-                           !_unpinned_blocks.contains(block))
-                            << endl
-                            << block->debug_string();
-                    _free_io_buffers.remove(block->_buffer_desc);
-                } else if (_unpinned_blocks.contains(block)) {
-                    _unpinned_blocks.remove(block);
-                } else {
-                    DCHECK(block->_in_write);
-                }
-                block->_is_pinned = true;
-                *pinned = true;
-                block->_client->pin_buffer(block->_buffer_desc);
-                ++_total_pinned_buffers;
-                RETURN_IF_ERROR(write_unpinned_blocks());
-            }
-            return delete_or_unpin_block(release_block, unpin);
-        }
-
-        RETURN_IF_ERROR(transfer_buffer(block, release_block, unpin));
-        DCHECK(!release_block->_is_pinned);
-        release_block = nullptr; // Handled by transfer.
-        DCHECK(block->_is_pinned);
-        *pinned = true;
-    }
-
-    // Read the block from disk if it was not in memory.
-    DCHECK(block->_write_range != nullptr) << block->debug_string() << endl << release_block;
-    SCOPED_TIMER(_disk_read_timer);
-    // Create a ScanRange to perform the read.
-    DiskIoMgr::ScanRange* scan_range = _obj_pool.add(new DiskIoMgr::ScanRange());
-    scan_range->reset(nullptr, block->_write_range->file(), block->_write_range->len(),
-                      block->_write_range->offset(), block->_write_range->disk_id(), false, block,
-                      DiskIoMgr::ScanRange::NEVER_CACHE);
-    vector<DiskIoMgr::ScanRange*> ranges(1, scan_range);
-    RETURN_IF_ERROR(_io_mgr->add_scan_ranges(_io_request_context, ranges, true));
-
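// The scan range may come back in several IO-mgr-sized chunks: the loop below
// copies each chunk at the running offset and stops at the buffer flagged eosr
// (end of scan range), after which the DCHECK verifies that the full write
// range was reassembled into the block's buffer.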
-    // Read from the io mgr buffer into the block's assigned buffer.
-    int64_t offset = 0;
-    bool buffer_eosr = false;
-    do {
-        DiskIoMgr::BufferDescriptor* io_mgr_buffer;
-        RETURN_IF_ERROR(scan_range->get_next(&io_mgr_buffer));
-        memcpy(block->buffer() + offset, io_mgr_buffer->buffer(), io_mgr_buffer->len());
-        offset += io_mgr_buffer->len();
-        buffer_eosr = io_mgr_buffer->eosr();
-        io_mgr_buffer->return_buffer();
-    } while (!buffer_eosr);
-    DCHECK_EQ(offset, block->_write_range->len());
-
-    return delete_or_unpin_block(release_block, unpin);
-}
-
-Status BufferedBlockMgr2::unpin_block(Block* block) {
-    DCHECK(!block->_is_deleted) << "Unpin for deleted block.";
-
-    lock_guard<mutex> unpinned_lock(_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-    DCHECK(block->validate()) << endl << block->debug_string();
-    if (!block->_is_pinned) {
-        return Status::OK();
-    }
-    DCHECK_EQ(block->_buffer_desc->len, _max_block_size) << "Can only unpin io blocks.";
-    DCHECK(validate()) << endl << debug_internal();
-    // Add 'block' to the list of unpinned blocks and set _is_pinned to false.
-    // Cache its position in the list for later removal.
-    block->_is_pinned = false;
-    DCHECK(!_unpinned_blocks.contains(block)) << " Unpin for block in unpinned list";
-    if (!block->_in_write) {
-        _unpinned_blocks.enqueue(block);
-    }
-    block->_client->unpin_buffer(block->_buffer_desc);
-    if (block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
-        ++_unfullfilled_reserved_buffers;
-    }
-    --_total_pinned_buffers;
-    RETURN_IF_ERROR(write_unpinned_blocks());
-    DCHECK(validate()) << endl << debug_internal();
-    DCHECK(block->validate()) << endl << block->debug_string();
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::write_unpinned_blocks() {
-    if (!_enable_spill) {
-        return Status::OK();
-    }
-
-    // Assumes block manager lock is already taken.
-    while (_non_local_outstanding_writes + _free_io_buffers.size() < _block_write_threshold &&
-           !_unpinned_blocks.empty()) {
-        // Pop a block from the back of the list (LIFO).
-        Block* write_block = _unpinned_blocks.pop_back();
-        write_block->_client_local = false;
-        RETURN_IF_ERROR(write_unpinned_block(write_block));
-        ++_non_local_outstanding_writes;
-    }
-    DCHECK(validate()) << endl << debug_internal();
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::write_unpinned_block(Block* block) {
-    // Assumes block manager lock is already taken.
-    DCHECK(!block->_is_pinned) << block->debug_string();
-    DCHECK(!block->_in_write) << block->debug_string();
-    DCHECK_EQ(block->_buffer_desc->len, _max_block_size);
-
-    if (block->_write_range == nullptr) {
-        if (_tmp_files.empty()) {
-            RETURN_IF_ERROR(init_tmp_files());
-        }
-
-        // First time the block is being persisted - need to allocate tmp file space.
-        TmpFileMgr::File* tmp_file;
-        int64_t file_offset;
-        RETURN_IF_ERROR(allocate_scratch_space(_max_block_size, &tmp_file, &file_offset));
-        int disk_id = tmp_file->disk_id();
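// Any disk id is acceptable in the fallback below as long as writes spread
// out: the static counter round-robins such blocks across the local disks via
// the modulo that follows.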
-        if (disk_id < 0) {
-            // Assign a valid disk id to the write range if the tmp file was not assigned one.
-            static unsigned int next_disk_id = 0;
-            disk_id = ++next_disk_id;
-        }
-        disk_id %= _io_mgr->num_local_disks();
-        DiskIoMgr::WriteRange::WriteDoneCallback callback = bind(
-                mem_fn(&BufferedBlockMgr2::write_complete), this, block, std::placeholders::_1);
-        block->_write_range = _obj_pool.add(
-                new DiskIoMgr::WriteRange(tmp_file->path(), file_offset, disk_id, callback));
-        block->_tmp_file = tmp_file;
-    }
-
-    uint8_t* outbuf = block->buffer();
-
-    block->_write_range->set_data(outbuf, block->_valid_data_len);
-
-    // Issue write through DiskIoMgr.
-    RETURN_IF_ERROR(_io_mgr->add_write_range(_io_request_context, block->_write_range));
-    block->_in_write = true;
-    DCHECK(block->validate()) << endl << block->debug_string();
-    _outstanding_writes_counter->update(1);
-    _bytes_written_counter->update(block->_valid_data_len);
-    ++_writes_issued;
-    return Status::OK();
-}
-
-Status BufferedBlockMgr2::allocate_scratch_space(int64_t block_size, TmpFileMgr::File** tmp_file,
-                                                 int64_t* file_offset) {
-    // Assumes block manager lock is already taken.
-    vector<string> errs;
-    // Find the next physical file in round-robin order and create a write range for it.
-    for (int attempt = 0; attempt < _tmp_files.size(); ++attempt) {
-        *tmp_file = _tmp_files[_next_block_index].get();
-        _next_block_index = (_next_block_index + 1) % _tmp_files.size();
-        if ((*tmp_file)->is_blacklisted()) {
-            continue;
-        }
-        Status status = (*tmp_file)->allocate_space(_max_block_size, file_offset);
-        if (status.ok()) {
-            return Status::OK();
-        }
-        // Log the error and try other files if there was a problem. Problematic files will be
-        // blacklisted so we will not repeatedly log the same error.
-        LOG(WARNING) << "Error while allocating temporary file range: " << status
-                     << ". Will try another temporary file.";
-        errs.emplace_back(status.to_string());
-    }
-    Status err_status = Status::InternalError(
-            "No usable temporary files: space could not be allocated on any temporary device.");
-    for (int i = 0; i < errs.size(); ++i) {
-        err_status.append(errs[i]);
-    }
-    return err_status;
-}
-
-void BufferedBlockMgr2::write_complete(Block* block, const Status& write_status) {
-    Status status = Status::OK();
-    lock_guard<mutex> lock(_lock);
-    _outstanding_writes_counter->update(-1);
-    DCHECK(validate()) << endl << debug_internal();
-    DCHECK(_is_cancelled || block->_in_write) << "write_complete() for block not in write." << endl
-                                              << block->debug_string();
-    if (!block->_client_local) {
-        DCHECK_GT(_non_local_outstanding_writes, 0) << block->debug_string();
-        --_non_local_outstanding_writes;
-    }
-    block->_in_write = false;
-
-    // Explicitly release our temporarily allocated buffer here so that it doesn't
-    // hang around needlessly.
-
-    // return_unused_block() will clear the block, so save the client pointer.
-    // We have to be careful while touching the state because it may have been cleaned up by
-    // another thread.
-    RuntimeState* state = block->_client->_state;
-    // If the block was re-pinned when it was in the IOMgr queue, don't free it.
-    if (block->_is_pinned) {
-        // The number of outstanding writes has decreased but the number of free buffers
-        // hasn't.
-        DCHECK(!block->_client_local)
-                << "Client should be waiting. No one should have pinned this block.";
-        if (write_status.ok() && !_is_cancelled && !state->is_cancelled()) {
-            status = write_unpinned_blocks();
-        }
-    } else if (block->_client_local) {
-        DCHECK(!block->_is_deleted)
-                << "Client should be waiting. No one should have deleted this block.";
-        block->_write_complete_cv.notify_one();
-    } else {
-        DCHECK_EQ(block->_buffer_desc->len, _max_block_size)
-                << "Only io sized buffers should spill";
-        _free_io_buffers.enqueue(block->_buffer_desc);
-        // Finish the delete_block() work.
-        if (block->_is_deleted) {
-            block->_buffer_desc->block = nullptr;
-            block->_buffer_desc = nullptr;
-            return_unused_block(block);
-        }
-        // Multiple threads may be waiting for the same block in find_buffer(). Wake them
-        // all up. One thread will get this block, and the others will re-evaluate whether
-        // they should continue waiting and if another write needs to be initiated.
-        _buffer_available_cv.notify_all();
-    }
-    DCHECK(validate()) << endl << debug_internal();
-
-    if (!write_status.ok() || !status.ok() || _is_cancelled) {
-        VLOG_FILE << "Query: " << _query_id
-                  << ". Write did not complete successfully: "
-                     "write_status="
-                  << write_status << ", status=" << status << ". _is_cancelled=" << _is_cancelled;
-
-        // If the instance is already cancelled, don't confuse things with these errors.
-        if (!write_status.is<ErrorCode::CANCELLED>() && !state->is_cancelled()) {
-            if (!write_status.ok()) {
-                // Report but do not attempt to recover from write error.
-                DCHECK(block->_tmp_file != nullptr);
-                block->_tmp_file->report_io_error(write_status.to_string());
-                VLOG_QUERY << "Query: " << _query_id << " write complete callback with error.";
-                state->log_error(write_status.to_string());
-            }
-            if (!status.ok()) {
-                VLOG_QUERY << "Query: " << _query_id << " error while writing unpinned blocks.";
-                state->log_error(status.to_string());
-            }
-        }
-        // Set cancelled and wake up waiting threads if an error occurred. Note that in
-        // the case of _client_local, that thread was woken up above.
-        _is_cancelled = true;
-        _buffer_available_cv.notify_all();
-    }
-}
-
-void BufferedBlockMgr2::delete_block(Block* block) {
-    DCHECK(!block->_is_deleted);
-
-    lock_guard<mutex> lock(_lock);
-    DCHECK(block->validate()) << endl << debug_internal();
-    block->_is_deleted = true;
-
-    if (block->_is_pinned) {
-        if (block->is_max_size()) {
-            --_total_pinned_buffers;
-        }
-        block->_client->unpin_buffer(block->_buffer_desc);
-        // Only if the block is io-sized do we need to change _unfullfilled_reserved_buffers.
-        if (block->is_max_size() &&
-            block->_client->_num_pinned_buffers < block->_client->_num_reserved_buffers) {
-            ++_unfullfilled_reserved_buffers;
-        }
-        block->_is_pinned = false;
-    } else if (_unpinned_blocks.contains(block)) {
-        // Remove block from unpinned list.
-        _unpinned_blocks.remove(block);
-    }
-
-    if (block->_in_write) {
-        DCHECK(block->_buffer_desc != nullptr && block->_buffer_desc->len == _max_block_size)
-                << "Should never be writing a small buffer";
-        // If a write is still pending, return. Cleanup will be done in write_complete().
-        DCHECK(block->validate()) << endl << block->debug_string();
-        return;
-    }
-
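// Small non-IO-sized buffers are deliberately not recycled (see the header
// notes on small buffers), so the branch below frees the buffer and its
// descriptor outright; IO-sized buffers are instead returned to the free list
// in the else branch.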
-    if (block->_buffer_desc != nullptr) {
-        if (block->_buffer_desc->len != _max_block_size) {
-            // Just delete the block for now.
-            delete[] block->_buffer_desc->buffer;
-            block->_client->_tracker->release(block->_buffer_desc->len);
-            delete block->_buffer_desc;
-            block->_buffer_desc = nullptr;
-        } else {
-            if (!_free_io_buffers.contains(block->_buffer_desc)) {
-                _free_io_buffers.enqueue(block->_buffer_desc);
-                _buffer_available_cv.notify_one();
-            }
-            block->_buffer_desc->block = nullptr;
-            block->_buffer_desc = nullptr;
-        }
-    }
-    return_unused_block(block);
-    DCHECK(block->validate()) << endl << block->debug_string();
-    DCHECK(validate()) << endl << debug_internal();
-}
-
-void BufferedBlockMgr2::return_unused_block(Block* block) {
-    DCHECK(block->_is_deleted) << block->debug_string();
-    DCHECK(!block->_is_pinned) << block->debug_string();
-    DCHECK(block->_buffer_desc == nullptr);
-    block->init();
-    _unused_blocks.enqueue(block);
-}
-
-Status BufferedBlockMgr2::find_buffer_for_block(Block* block, bool* in_mem) {
-    DCHECK(block != nullptr);
-    Client* client = block->_client;
-    DCHECK(client != nullptr);
-    DCHECK(!block->_is_pinned && !block->_is_deleted) << "Pinned or deleted block " << endl
-                                                      << block->debug_string();
-    *in_mem = false;
-
-    unique_lock<mutex> l(_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-
-    // First check if there is enough reserved memory to satisfy this request.
-    bool is_reserved_request = false;
-    if (client->_num_pinned_buffers < client->_num_reserved_buffers) {
-        is_reserved_request = true;
-    } else if (client->_num_tmp_reserved_buffers > 0) {
-        is_reserved_request = true;
-        --client->_num_tmp_reserved_buffers;
-    }
-
-    DCHECK(validate()) << endl << debug_internal();
-    if (is_reserved_request) {
-        --_unfullfilled_reserved_buffers;
-    }
-
-    if (!is_reserved_request && remaining_unreserved_buffers() < 1) {
-        // The client already has its quota and there are no unreserved blocks left.
-        // Note that even if this passes, it is still possible for the path below to
-        // see OOM because another query consumed memory from the process tracker. This
-        // only happens if the buffer has not already been allocated by the block mgr.
-        // This check should ensure that the memory cannot be consumed by another client
-        // of the block mgr.
-        return Status::OK();
-    }
-
-    if (block->_buffer_desc != nullptr) {
-        // The block is in memory. It may be in 3 states:
-        // 1. In the unpinned list. The buffer will not be in the free list.
-        // 2. _in_write == true. The buffer will not be in the free list.
-        // 3. The buffer is free, but hasn't yet been reassigned to a different block.
-        DCHECK_EQ(block->_buffer_desc->len, max_block_size()) << "Non-I/O blocks are always pinned";
-        DCHECK(_unpinned_blocks.contains(block) || block->_in_write ||
-               _free_io_buffers.contains(block->_buffer_desc));
-        if (_unpinned_blocks.contains(block)) {
-            _unpinned_blocks.remove(block);
-            DCHECK(!_free_io_buffers.contains(block->_buffer_desc));
-        } else if (block->_in_write) {
-            DCHECK(block->_in_write && !_free_io_buffers.contains(block->_buffer_desc));
-        } else {
-            _free_io_buffers.remove(block->_buffer_desc);
-        }
-        _buffered_pin_counter->update(1);
-        *in_mem = true;
-    } else {
-        BufferDescriptor* buffer_desc = nullptr;
-        RETURN_IF_ERROR(find_buffer(l, &buffer_desc));
-
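// An "optional" request is one from a client already at or over its
// reservation: it may simply come back without a buffer under an OK status.
// Only reservation-backed requests escalate to the memory-limit error below.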
-        if (buffer_desc == nullptr) {
-            // There are no free buffers or blocks we can evict. We need to fail this request.
-            // If this is an optional request, return OK; if it is required, return OOM.
-            if (!is_reserved_request) {
-                return Status::OK();
-            }
-
-            if (VLOG_QUERY_IS_ON) {
-                stringstream ss;
-                ss << "Query id=" << _query_id << " was unable to get minimum required buffers."
-                   << endl
-                   << debug_internal() << endl
-                   << client->debug_string();
-                VLOG_QUERY << ss.str();
-            }
-            return add_exec_msg(
-                    "Query did not have enough memory to get the minimum required "
-                    "buffers in the block manager.");
-        }
-
-        DCHECK(buffer_desc != nullptr);
-        DCHECK_EQ(buffer_desc->len, max_block_size()) << "Non-I/O buffer";
-        if (buffer_desc->block != nullptr) {
-            // This buffer was assigned to a block but now we are reusing it. Reset the
-            // previous block->buffer link.
-            DCHECK(buffer_desc->block->validate()) << endl << buffer_desc->block->debug_string();
-            buffer_desc->block->_buffer_desc = nullptr;
-        }
-        buffer_desc->block = block;
-        block->_buffer_desc = buffer_desc;
-    }
-    DCHECK(block->_buffer_desc != nullptr);
-    DCHECK(block->_buffer_desc->len < max_block_size() || !block->_is_pinned)
-            << "Trying to pin already pinned block. " << block->_buffer_desc->len << " "
-            << block->_is_pinned;
-    block->_is_pinned = true;
-    client->pin_buffer(block->_buffer_desc);
-    ++_total_pinned_buffers;
-
-    DCHECK(block->validate()) << endl << block->debug_string();
-    // The number of free buffers has decreased. Write unpinned blocks if the number
-    // of free buffers is below the threshold.
-    RETURN_IF_ERROR(write_unpinned_blocks());
-    DCHECK(validate()) << endl << debug_internal();
-    return Status::OK();
-}
-
-// We need to find a new buffer. We prefer getting this buffer in this order:
-// 1. Allocate a new block if the number of free blocks is less than the write
-//    threshold, until we run out of memory.
-// 2. Pick a buffer from the free list.
-// 3. Wait and evict an unpinned buffer.
-Status BufferedBlockMgr2::find_buffer(unique_lock<mutex>& lock, BufferDescriptor** buffer_desc) {
-    *buffer_desc = nullptr;
-
-    // First, try to allocate a new buffer.
-    if (_free_io_buffers.size() < _block_write_threshold &&
-        thread_context()->thread_mem_tracker()->check_limit(_max_block_size)) {
-        _mem_tracker->consume(_max_block_size);
-        uint8_t* new_buffer = new uint8_t[_max_block_size];
-        *buffer_desc = _obj_pool.add(new BufferDescriptor(new_buffer, _max_block_size));
-        (*buffer_desc)->all_buffers_it =
-                _all_io_buffers.insert(_all_io_buffers.end(), *buffer_desc);
-        return Status::OK();
-    }
-
-    // Second, try to pick a buffer from the free list.
-    if (_free_io_buffers.empty()) {
-        // There are no free buffers. If spilling is disabled or there are no unpinned
-        // blocks we can write, return. We can't get a buffer.
-        if (!_enable_spill) {
-            return add_exec_msg(
-                    "Spilling has been disabled for plans, and "
-                    "current memory usage has reached the bottleneck. "
-                    "You can avoid this behavior by increasing the mem limit via the "
-                    "session variable exec_mem_limit, or by setting enable_spilling.");
-        }
-
-        // Third, this block needs to use a buffer that was unpinned from another block.
-        // Get a free buffer from the front of the queue and assign it to the block.
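// write_complete() wakes all waiters with notify_all(), so several threads can
// race for a single freed buffer; the losers re-evaluate the condition and
// fall back into the wait loop below.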
-        do {
-            if (_unpinned_blocks.empty() && _non_local_outstanding_writes == 0) {
-                return Status::OK();
-            }
-            SCOPED_TIMER(_buffer_wait_timer);
-            // Try to evict unpinned blocks before waiting.
-            RETURN_IF_ERROR(write_unpinned_blocks());
-            DCHECK_GT(_non_local_outstanding_writes, 0) << endl << debug_internal();
-            _buffer_available_cv.wait(lock);
-            if (_is_cancelled) {
-                return Status::Cancelled("Cancelled");
-            }
-        } while (_free_io_buffers.empty());
-    }
-    *buffer_desc = _free_io_buffers.dequeue();
-    return Status::OK();
-}
-
-BufferedBlockMgr2::Block* BufferedBlockMgr2::get_unused_block(Client* client) {
-    DCHECK(client != nullptr);
-    Block* new_block = nullptr;
-    if (_unused_blocks.empty()) {
-        new_block = _obj_pool.add(new Block(this));
-        new_block->init();
-        _created_block_counter->update(1);
-    } else {
-        new_block = _unused_blocks.dequeue();
-        _recycled_blocks_counter->update(1);
-    }
-    DCHECK(new_block != nullptr);
-    new_block->_client = client;
-    return new_block;
-}
-
-bool BufferedBlockMgr2::validate() const {
-    int num_free_io_buffers = 0;
-
-    if (_total_pinned_buffers < 0) {
-        LOG(ERROR) << "_total_pinned_buffers < 0: " << _total_pinned_buffers;
-        return false;
-    }
-
-    for (BufferDescriptor* buffer : _all_io_buffers) {
-        bool is_free = _free_io_buffers.contains(buffer);
-        num_free_io_buffers += is_free;
-
-        if (*buffer->all_buffers_it != buffer) {
-            LOG(ERROR) << "All buffers list is corrupt. Buffer iterator is not valid.";
-            return false;
-        }
-
-        if (buffer->block == nullptr && !is_free) {
-            LOG(ERROR) << "Buffer with no block not in free list." << endl << debug_internal();
-            return false;
-        }
-
-        if (buffer->len != _max_block_size) {
-            LOG(ERROR) << "Non-io sized buffers should not end up on free list.";
-            return false;
-        }
-
-        if (buffer->block != nullptr) {
-            if (buffer->block->_buffer_desc != buffer) {
-                LOG(ERROR) << "buffer<->block pointers inconsistent. Buffer: " << buffer << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-
-            if (!buffer->block->validate()) {
-                LOG(ERROR) << "buffer->block inconsistent." << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-
-            if (is_free && (buffer->block->_is_pinned || buffer->block->_in_write ||
-                            _unpinned_blocks.contains(buffer->block))) {
-                LOG(ERROR) << "Block with buffer in free list and"
-                           << " _is_pinned = " << buffer->block->_is_pinned
-                           << " _in_write = " << buffer->block->_in_write
-                           << " _unpinned_blocks.contains = "
-                           << _unpinned_blocks.contains(buffer->block) << endl
-                           << buffer->block->debug_string();
-                return false;
-            }
-        }
-    }
-
-    if (_free_io_buffers.size() != num_free_io_buffers) {
-        LOG(ERROR) << "_free_buffer_list inconsistency."
-                   << " num_free_io_buffers = " << num_free_io_buffers
-                   << " _free_io_buffers.size() = " << _free_io_buffers.size() << endl
-                   << debug_internal();
-        return false;
-    }
-
-    Block* block = _unpinned_blocks.head();
-    while (block != nullptr) {
-        if (!block->validate()) {
-            LOG(ERROR) << "Block inconsistent in unpinned list." << endl << block->debug_string();
-            return false;
-        }
-
-        if (block->_in_write || _free_io_buffers.contains(block->_buffer_desc)) {
-            LOG(ERROR) << "Block in unpinned list with"
-                       << " _in_write = " << block->_in_write << " _free_io_buffers.contains = "
-                       << _free_io_buffers.contains(block->_buffer_desc) << endl
-                       << block->debug_string();
-            return false;
-        }
-        block = block->next();
-    }
-
-    // Check that we're writing blocks when the number of free buffers falls below the
-    // threshold. We don't write blocks after cancellation.
-    if (!_is_cancelled && !_unpinned_blocks.empty() && _enable_spill &&
-        (_free_io_buffers.size() + _non_local_outstanding_writes < _block_write_threshold)) {
-        // TODO: this isn't correct when write_unpinned_blocks() fails during the call to
-        // write_unpinned_block() so just log the condition but don't return false. Figure
-        // out a way to re-enable this check?
-        LOG(ERROR) << "Missed writing unpinned blocks";
-    }
-    return true;
-}
-
-string BufferedBlockMgr2::debug_string(Client* client) {
-    stringstream ss;
-    unique_lock<mutex> l(_lock);
-    ss << debug_internal();
-    if (client != nullptr) {
-        ss << endl << client->debug_string();
-    }
-    return ss.str();
-}
-
-string BufferedBlockMgr2::debug_internal() const {
-    stringstream ss;
-    ss << "Buffered block mgr" << endl
-       << " Num writes outstanding: " << _outstanding_writes_counter->value() << endl
-       << " Num free io buffers: " << _free_io_buffers.size() << endl
-       << " Num unpinned blocks: " << _unpinned_blocks.size() << endl
-       << " Num available buffers: " << remaining_unreserved_buffers() << endl
-       << " Total pinned buffers: " << _total_pinned_buffers << endl
-       << " Unfulfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl
-       << " Buffer Block Mgr Used memory: " << _mem_tracker->consumption()
-       << " Instance remaining memory: "
-       << thread_context()->thread_mem_tracker()->spare_capacity() << " (#blocks="
-       << (thread_context()->thread_mem_tracker()->spare_capacity() / _max_block_size) << ")"
-       << endl
-       << " Block write threshold: " << _block_write_threshold;
-    return ss.str();
-}
-
-void BufferedBlockMgr2::init(DiskIoMgr* io_mgr, RuntimeProfile* parent_profile) {
-    unique_lock<mutex> l(_lock);
-    if (_initialized) {
-        return;
-    }
-
-    io_mgr->register_context(&_io_request_context);
-
-    _profile.reset(new RuntimeProfile("BlockMgr"));
-    parent_profile->add_child(_profile.get(), true, nullptr);
-
-    _block_size_counter = ADD_COUNTER(_profile.get(), "MaxBlockSize", TUnit::BYTES);
-    _block_size_counter->set(_max_block_size);
-    _created_block_counter = ADD_COUNTER(_profile.get(), "BlocksCreated", TUnit::UNIT);
-    _recycled_blocks_counter = ADD_COUNTER(_profile.get(), "BlocksRecycled", TUnit::UNIT);
-    _bytes_written_counter = ADD_COUNTER(_profile.get(), "BytesWritten", TUnit::BYTES);
-    _outstanding_writes_counter =
-            ADD_COUNTER(_profile.get(), "BlockWritesOutstanding", TUnit::UNIT);
-    _buffered_pin_counter = ADD_COUNTER(_profile.get(), "BufferedPins", TUnit::UNIT);
-    _disk_read_timer = ADD_TIMER(_profile.get(), "TotalReadBlockTime");
-    _buffer_wait_timer = ADD_TIMER(_profile.get(), "TotalBufferWaitTime");
-    _encryption_timer = ADD_TIMER(_profile.get(), "TotalEncryptionTime");
-    _integrity_check_timer = ADD_TIMER(_profile.get(), "TotalIntegrityCheckTime");
-
-    // Create a new mem_tracker to track buffers allocated by the block manager.
-    _mem_tracker = std::make_unique<MemTracker>("BufferedBlockMgr2");
-
-    _initialized = true;
-}
-
-Status BufferedBlockMgr2::init_tmp_files() {
-    DCHECK(_tmp_files.empty());
-    DCHECK(_tmp_file_mgr != nullptr);
-
-    vector<TmpFileMgr::DeviceId> tmp_devices = _tmp_file_mgr->active_tmp_devices();
-    // Initialize the tmp files and the initial file to use.
-    _tmp_files.reserve(tmp_devices.size());
-    for (int i = 0; i < tmp_devices.size(); ++i) {
-        TmpFileMgr::File* tmp_file;
-        TmpFileMgr::DeviceId tmp_device_id = tmp_devices[i];
-        // It is possible for a device to be blacklisted after it was returned
-        // by active_tmp_devices() - handle this gracefully.
-        Status status = _tmp_file_mgr->get_file(tmp_device_id, _query_id, &tmp_file);
-        if (status.ok()) {
-            _tmp_files.emplace_back(tmp_file);
-        }
-    }
-    if (_tmp_files.empty()) {
-        return Status::InternalError(
-                "No spilling directories configured. Cannot spill. Set --scratch_dirs"
-                " or see log for previous errors that prevented use of provided directories");
-    }
-    _next_block_index = rand() % _tmp_files.size();
-    return Status::OK();
-}
-
-} // namespace doris
diff --git a/be/src/runtime/buffered_block_mgr2.h b/be/src/runtime/buffered_block_mgr2.h
deleted file mode 100644
index 9b423cf7f1..0000000000
--- a/be/src/runtime/buffered_block_mgr2.h
+++ /dev/null
@@ -1,614 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/buffered-block-mgr2.h
-// and modified by Doris
-
-#pragma once
-
-#include <unordered_map>
-
-#include "runtime/disk_io_mgr.h"
-#include "runtime/tmp_file_mgr.h"
-#include "util/uid_util.h"
-
-namespace doris {
-
-class RuntimeState;
-
-// The BufferedBlockMgr2 is used to allocate and manage blocks of data using a fixed memory
-// budget. Available memory is split into a pool of fixed-size memory buffers. When a
-// client allocates or requests a block, the block is assigned a buffer from this pool and
-// is 'pinned' in memory. Clients can also unpin a block, allowing the manager to reassign
-// its buffer to a different block.
-//
-// The BufferedBlockMgr2 typically allocates blocks in IO buffer size to get maximal IO
-// efficiency when spilling. Clients can also request smaller buffers that cannot spill
-// (note that it would be possible to spill small buffers, but we currently do not allow
-// it). This is useful to present the same block API and mem tracking for clients (one can
-// use the block mgr API to mem track non-spillable (smaller) buffers). Clients that do
-// partitioning (e.g. PHJ and PAGG) will start with these smaller buffer sizes to reduce
-// the minimum buffering requirements and grow to max sized buffers as the input grows.
-// For simplicity, these small buffers are not recycled (there's also not really a need
-// since they are allocated all at once on query startup). These buffers are not counted
-// against the reservation.
-//
-// The BufferedBlockMgr2 reserves two buffers per disk ('_block_write_threshold') for
-// itself. When the number of free buffers falls below '_block_write_threshold', unpinned
-// blocks are flushed in Last-In-First-Out order. (It is assumed that unpinned blocks are
-// re-read in FIFO order). The TmpFileMgr is used to obtain file handles to write to
-// within the tmp directories configured for Impala.
-//
-// It is expected to have one BufferedBlockMgr2 per query. All allocations that can grow
-// proportional to the input size and that might need to spill to disk should allocate
-// from the same BufferedBlockMgr2.
-//
-// A client must pin a block in memory to read/write its contents and unpin it when it is
-// no longer in active use. The BufferedBlockMgr2 guarantees that:
-// a) The memory buffer assigned to a block is not removed or released while it is pinned.
-// b) The contents of an unpinned block will be available on a subsequent call to pin.
-//
-// The Client supports the following operations:
-// get_new_block(): Returns a new pinned block.
-// Close(): Frees all memory and disk space. Called when a query is closed or cancelled.
-//     Close() is idempotent.
-//
-// A Block supports the following operations:
-// pin(): Pins a block to a buffer in memory, and reads its contents from disk if
-//     necessary. If there are no free buffers, waits for a buffer to become available.
-//     Invoked before the contents of a block are read or written. The block
-//     will be maintained in memory until unpin() is called.
-// unpin(): Invoked to indicate the block is not in active use. The block is added to a
-//     list of unpinned blocks. Unpinned blocks are only written when the number of free
-//     blocks falls below the 'block_write_threshold'.
-// del(): Invoked to deallocate a block. The buffer associated with the block is
-//     immediately released and its on-disk location (if any) reused.
-//
-// The block manager is thread-safe with the following caveat: A single block cannot be
-// used simultaneously by multiple clients in any capacity.
-// However, the block manager client is not thread-safe. That is, the block manager
-// allows multiple single-threaded block manager clients.
-//
-/// TODO: When a block is read from disk, data is copied from the IOMgr buffer to the
-/// block manager's buffer. This should be avoided in the common case where these buffers
-/// are of the same size.
-/// TODO: See if the one big lock is a bottleneck. Break it up. This object is shared by
-/// all operators within a query (across fragments), see IMPALA-1883.
-/// TODO: No reason we can't spill the smaller buffers. Add it if we need to (it's likely
-/// just removing dchecks).
-/// TODO: The requirements on this object have grown organically. Consider a major
-/// reworking.
-class BufferedBlockMgr2 {
-private:
-    struct BufferDescriptor;
-
-public:
-    // A client of the BufferedBlockMgr2. There is a single BufferedBlockMgr2 per plan
-    // fragment and all operators that need blocks from it should use a separate client.
-    // Each client has the option to reserve a number of blocks that it can claim later.
-    // The remaining memory that is not reserved by any clients is free for all and
-    // available to all clients.
-    // This is an opaque handle.
-    // TODO: move the APIs to the client so we don't need to pass the BufferedBlockMgr2 around.
-    // TODO: how can we ensure that each operator uses a separate client?
-    class Client;
-
-    // A fixed-size block of data that may be persisted to disk. The state of the block
-    // is maintained by the block manager and is described by 3 bools:
-    // _is_pinned = True if the block is pinned. The block has a non-null _buffer_desc,
-    //     _buffer_desc cannot be in the free buffer list and the block cannot be in
-    //     _unused_blocks or _unpinned_blocks. Newly allocated blocks are pinned.
-    // _in_write = True if a write has been issued but not completed for this block.
-    //     The block cannot be in the _unpinned_blocks and must have a non-null _buffer_desc
-    //     that's not in the free buffer list. It may be pinned or unpinned.
-    // _is_deleted = True if del() has been called on a block. After this, no API call
-    //     is valid on the block.
-    //
-    // pin() and unpin() can be invoked on a block any number of times before del().
-    // When a pinned block is unpinned for the first time, it is added to the
-    // _unpinned_blocks list and its buffer is removed from the free list.
-    // If it is pinned or deleted at any time while it is on the unpinned list, it is
-    // simply removed from that list. When it is dequeued from that list and enqueued
-    // for writing, _in_write is set to true. The block may be pinned, unpinned or deleted
-    // while _in_write is true. After the write has completed, the block's buffer will be
-    // returned to the free buffer list if it is no longer pinned, and the block itself
-    // will be put on the unused blocks list if del() was called.
-    //
-    // A block MUST have a non-null _buffer_desc if
-    // a) _is_pinned is true (i.e. the client is using it), or
-    // b) _in_write is true, (i.e. IO mgr is using it), or
-    // c) It is on the unpinned list (buffer has not been persisted.)
-    //
-    // In addition to the block manager API, Block exposes allocate(), return_allocation()
-    // and bytes_remaining() to allocate and free memory within a block, and buffer() and
-    // valid_data_len() to read/write the contents of a block. These are not thread-safe.
-    class Block : public InternalQueue<Block>::Node {
-    public:
-        // A null dtor to pass codestyle check
-        ~Block() {}
-
-        // Pins a block in memory--assigns a free buffer to a block and reads it from disk if
-        // necessary. If there are no free blocks and no unpinned blocks, '*pinned' is set to
-        // false and the block is not pinned. If 'release_block' is non-nullptr, if there is
-        // memory pressure, this block will be pinned using the buffer from 'release_block'.
-        // If 'unpin' is true, 'release_block' will be unpinned (regardless of whether or not
-        // the buffer was used for this block). If 'unpin' is false, 'release_block' is
-        // deleted. 'release_block' must be pinned.
-        Status pin(bool* pinned, Block* release_block = nullptr, bool unpin = true);
-
-        // Unpins a block by adding it to the list of unpinned blocks maintained by the block
-        // manager. An unpinned block must be flushed before its buffer is released or
-        // assigned to a different block. Is non-blocking.
-        Status unpin();
-
-        // Delete a block. Its buffer is released and on-disk location can be over-written.
-        // Non-blocking.
-        void del();
-
-        void add_row() { ++_num_rows; }
-        int num_rows() const { return _num_rows; }
-
-        // Allocates the specified number of bytes from this block.
-        template <typename T>
-        T* allocate(int size) {
-            DCHECK_GE(bytes_remaining(), size);
-            uint8_t* current_location = _buffer_desc->buffer + _valid_data_len;
-            _valid_data_len += size;
-            return reinterpret_cast<T*>(current_location);
-        }
-
-        // Return the number of remaining bytes that can be allocated in this block.
-        int bytes_remaining() const {
-            DCHECK(_buffer_desc != nullptr);
-            return _buffer_desc->len - _valid_data_len;
-        }
-
-        // Return size bytes from the most recent allocation.
-        void return_allocation(int size) {
-            DCHECK_GE(_valid_data_len, size);
-            _valid_data_len -= size;
-        }
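// A minimal caller-side sketch of the allocation helpers above (hypothetical
// helper, not code from this file; <cstring> is assumed for memcpy):
static bool example_append_row(BufferedBlockMgr2::Block* block, const void* row, int size) {
    if (block->bytes_remaining() < size) {
        return false; // caller should unpin this block and request a fresh one
    }
    memcpy(block->allocate<uint8_t>(size), row, size);
    block->add_row();
    return true;
}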
-        // Pointer to start of the block data in memory. Only guaranteed to be valid if the
-        // block is pinned.
-        uint8_t* buffer() const {
-            DCHECK(_buffer_desc != nullptr);
-            return _buffer_desc->buffer;
-        }
-
-        // Return the number of bytes allocated in this block.
-        int64_t valid_data_len() const { return _valid_data_len; }
-
-        // Returns the length of the underlying buffer. Only callable if the block is
-        // pinned.
-        int64_t buffer_len() const {
-            DCHECK(is_pinned());
-            return _buffer_desc->len;
-        }
-
-        // Returns true if this block is the max block size. Only callable if the block
-        // is pinned.
-        bool is_max_size() const {
-            DCHECK(is_pinned());
-            return _buffer_desc->len == _block_mgr->max_block_size();
-        }
-
-        bool is_pinned() const { return _is_pinned; }
-
-        // Path of temporary file backing the block. Intended for use in testing.
-        // Returns empty string if no backing file allocated.
-        std::string tmp_file_path() const;
-
-        // Debug helper method to print the state of a block.
-        std::string debug_string() const;
-
-    private:
-        friend class BufferedBlockMgr2;
-
-        Block(BufferedBlockMgr2* block_mgr);
-
-        // Initialize the state of a block and set the number of bytes allocated to 0.
-        void init();
-
-        // Debug helper method to validate the state of a block. _block_mgr lock must already
-        // be taken.
-        bool validate() const;
-
-        // Pointer to the buffer associated with the block. nullptr if the block is not in
-        // memory and cannot be changed while the block is pinned or being written.
-        BufferDescriptor* _buffer_desc;
-
-        // Parent block manager object. Responsible for maintaining the state of the block.
-        BufferedBlockMgr2* _block_mgr;
-
-        // The client that owns this block.
-        Client* _client;
-
-        // WriteRange object representing the on-disk location used to persist a block.
-        // Is created the first time a block is persisted, and retained until the block
-        // object is destroyed. The file location and offset in _write_range are valid
-        // throughout the lifetime of this object, but the data and length in the
-        // _write_range are only valid while the block is being written.
-        // _write_range instance is owned by the block manager.
-        DiskIoMgr::WriteRange* _write_range;
-
-        // The file this block belongs to. The lifetime is the same as the file location
-        // and offset in _write_range. The File is owned by BufferedBlockMgr2, not TmpFileMgr.
-        TmpFileMgr::File* _tmp_file;
-
-        // Length of valid (i.e. allocated) data within the block.
-        int64_t _valid_data_len;
-
-        // Number of rows in this block.
-        int _num_rows;
-
-        // Block state variables. The block's buffer can be freed only if _is_pinned and
-        // _in_write are both false.
-        // TODO: this might be better expressed as an enum.
-
-        // _is_pinned is true while the block is pinned by a client.
-        bool _is_pinned;
-
-        // _in_write is set to true when the block is enqueued for writing via DiskIoMgr,
-        // and set to false when the write is complete.
-        bool _in_write;
-
-        // True if the block is deleted by the client.
-        bool _is_deleted;
-
-        // Condition variable for when there is a specific client waiting for this block.
-        // Only used if _client_local is true.
-        // TODO: Currently we use _block_mgr->_lock for this condvar. There is no reason to
-        // use that _lock that is already overloaded, see IMPALA-1883.
-        std::condition_variable _write_complete_cv;
-
-        // If true, this block is being written out so the underlying buffer can be
-        // transferred to another block from the same client. We don't want this buffer
-        // getting picked up by another client.
-        bool _client_local;
-    }; // class Block
-
-    // Create a block manager with the specified mem_limit. If a block mgr with the
-    // same query id has already been created, that block mgr is returned.
-    // - buffer_size: maximum size of each buffer.
-    static Status create(RuntimeState* state, RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
-                         int64_t buffer_size, std::shared_ptr<BufferedBlockMgr2>* block_mgr);
-
-    ~BufferedBlockMgr2();
-
-    // Registers a client with num_reserved_buffers. The returned client is owned
-    // by the BufferedBlockMgr2 and has the same lifetime as it.
-    // We allow oversubscribing the reserved buffers. It is likely that
-    // num_reserved_buffers is very pessimistic for small queries and we don't want to
-    // fail all of them with mem limit exceeded.
-    // The min reserved buffers is often independent of data size and we still want
-    // to run small queries with very small limits.
-    // Buffers used by this client are reflected in tracker.
-    // TODO: The fact that we allow oversubscription is problematic,
-    // as the code expects the reservations to always be granted (currently not the case).
-    Status register_client(int num_reserved_buffers, RuntimeState* state, Client** client);
-
-    // Clears all reservations for this client.
-    void clear_reservations(Client* client);
-
-    // Tries to acquire a one-time reservation of num_buffers. The semantics are:
-    // - If this call fails, the next 'num_buffers' calls to pin()/get_new_block() might
-    //   not have enough memory.
-    // - If this call succeeds, the next 'num_buffers' calls to pin()/get_new_block() will
-    //   be guaranteed to get the block. Once these blocks have been pinned, the
-    //   reservation from this call has no more effect.
-    // Blocks coming from the tmp reservation also count towards the regular reservation.
-    // This is useful to pin() a number of blocks and guarantee all-or-nothing behavior.
-    bool try_acquire_tmp_reservation(Client* client, int num_buffers);
-
-    // Return a new pinned block. If there is no memory for this block, *block will be set
-    // to nullptr.
-    // If len > 0, get_new_block() will return a block with a buffer of size len. len
-    // must be less than max_block_size and this block cannot be unpinned.
-    // This function will try to allocate new memory for the block up to the limit.
-    // Otherwise it will (conceptually) write out an unpinned block and use that memory.
-    // The caller can pass a non-nullptr 'unpin_block' to transfer memory from 'unpin_block'
-    // to the new block. If 'unpin_block' is non-nullptr, the new block can never fail to
-    // get a buffer. The semantics of this are:
-    //   - If 'unpin_block' is non-nullptr, it must be pinned.
-    //   - If the call succeeds, 'unpin_block' is unpinned.
-    //   - If there is no memory pressure, block will get a newly allocated buffer.
-    //   - If there is memory pressure, block will get the buffer from 'unpin_block'.
-    Status get_new_block(Client* client, Block* unpin_block, Block** block, int64_t len = -1);
-
-    // Cancels the block mgr. All subsequent calls that return a Status fail with
-    // Status::Cancelled("Cancelled"). Idempotent.
-    void cancel();
-
-    // Returns true if the block manager was cancelled.
-    bool is_cancelled();
-
-    // Dumps block mgr state. Grabs lock. If client is not nullptr, also dumps its state.
-    std::string debug_string(Client* client = nullptr);
-
-    // The number of buffers available for the client. That is, if all other clients were
-    // stopped, the number of buffers this client could get.
-    int64_t available_buffers(Client* client) const;
-
-    // Returns a MEM_LIMIT_EXCEEDED error which includes the minimum memory required by
-    // this 'client' that acts on behalf of the node with id 'node_id'. 'node_id' is used
-    // only for error reporting.
-    Status mem_limit_too_low_error(Client* client, int node_id);
-
-    // TODO: Remove these two. Not clear what the sorter really needs.
-    // TODO: Those are dirty, dangerous reads of two lists whose all other accesses are
-    // protected by the _lock. Using those two functions is looking for trouble.
-    int available_allocated_buffers() const { return _all_io_buffers.size(); }
-    int num_free_buffers() const { return _free_io_buffers.size(); }
-
-    int num_pinned_buffers(Client* client) const;
-    int num_reserved_buffers_remaining(Client* client) const;
-    MemTracker* get_tracker(Client* client) const;
-    MemTracker* mem_tracker() const { return _mem_tracker.get(); }
-    int64_t max_block_size() const { return _max_block_size; }
-    int64_t bytes_allocated() const;
-    RuntimeProfile* profile() { return _profile.get(); }
-    int writes_issued() const { return _writes_issued; }
-
-private:
-    friend class Client;
-
-    // Descriptor for a single memory buffer in the pool.
-    struct BufferDescriptor : public InternalQueue<BufferDescriptor>::Node {
-        // Start of the buffer.
-        uint8_t* buffer;
-
-        // Length of the buffer.
-        int64_t len;
-
-        // Block that this buffer is assigned to. May be nullptr.
-        Block* block;
-
-        // Iterator into _all_io_buffers for this buffer.
-        std::list<BufferDescriptor*>::iterator all_buffers_it;
-
-        BufferDescriptor(uint8_t* buf, int64_t len) : buffer(buf), len(len), block(nullptr) {}
-    };
-
-    BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size);
-
-    // Initializes the block mgr. Idempotent and thread-safe.
-    void init(DiskIoMgr* io_mgr, RuntimeProfile* profile);
-
-    // Initializes _tmp_files. This is initialized the first time we need to write to disk.
-    // Must be called with _lock taken.
-    Status init_tmp_files();
-
-    // pin_block(), unpin_block(), delete_block() perform the actual work of Block::pin(),
-    // unpin() and del(). Must be called with the _lock taken.
-    Status pin_block(Block* block, bool* pinned, Block* src, bool unpin);
-    Status unpin_block(Block* block);
-    void delete_block(Block* block);
-
-    // If the 'block' is nullptr, checks if cancelled and returns. Otherwise, depending on
-    // 'unpin' calls either delete_block() or unpin_block(), which both first check for
-    // cancellation. It should be called without the _lock acquired.
-    Status delete_or_unpin_block(Block* block, bool unpin);
-
-    // Transfers the buffer from 'src' to 'dst'. 'src' must be pinned.
-    // If unpin == false, 'src' is simply deleted.
-    // If unpin == true, 'src' is unpinned and it may block until the write of 'src' is
-    // completed. In that case it will use the _lock for the condvar. Thus, the _lock
-    // needs to not have been taken when this function is called.
-    Status transfer_buffer(Block* dst, Block* src, bool unpin);
-
-    // Returns the total number of unreserved buffers. This is the sum of unpinned,
-    // free and buffers we can still allocate minus the total number of reserved buffers
-    // that are not pinned.
-    // Note this can be negative if the buffers are oversubscribed.
-    // Must be called with _lock taken.
-    int64_t remaining_unreserved_buffers() const;
-
-    // Finds a buffer for a block and pins it. If the block's buffer has not been evicted,
-    // it removes the block from the unpinned list and sets *in_mem = true.
-    // If the block is not in memory, it will call find_buffer(), which may block.
-    // If we can't get a buffer (e.g. no more memory, nothing in the unpinned and free
-    // lists) this function returns with the block unpinned.
-    // Uses the _lock, so the caller should not have already acquired the _lock.
-    Status find_buffer_for_block(Block* block, bool* in_mem);
-
-    // Returns a new buffer that can be used. *buffer is set to nullptr if there was no
-    // memory.
-    // Otherwise, this function gets a new buffer by:
-    //   1. Allocating a new buffer if possible
-    //   2. Using a buffer from the free list (which is populated by moving blocks from
-    //      the unpinned list by writing them out).
-    // Must be called with the _lock already taken. This function can block.
-    Status find_buffer(std::unique_lock<std::mutex>& lock, BufferDescriptor** buffer);
-
-    // Writes unpinned blocks via DiskIoMgr until one of the following is true:
-    //   1. The number of outstanding writes >= (_block_write_threshold - num free buffers)
-    //   2. There are no more unpinned blocks
-    // Must be called with the _lock already taken. Is not blocking.
-    Status write_unpinned_blocks();
-
-    // Issues the write for this block to the DiskIoMgr.
-    Status write_unpinned_block(Block* block);
-
-    // Allocate block_size bytes in a temporary file. Try multiple disks if an error occurs.
-    // Returns an error only if no temporary files are usable.
-    Status allocate_scratch_space(int64_t block_size, TmpFileMgr::File** tmp_file,
-                                  int64_t* file_offset);
-
-    // Callback used by DiskIoMgr to indicate a block write has completed. write_status
-    // is the status of the write. _is_cancelled is set to true if write_status is not
-    // Status::OK() or a re-issue of the write fails. Returns the block's buffer to the
-    // free buffers list if it is no longer pinned. Returns the block itself to the free
-    // blocks list if it has been deleted.
-    void write_complete(Block* block, const Status& write_status);
-
-    // Returns a deleted block to the list of free blocks. Assumes the block's buffer has
-    // already been returned to the free buffers list. Non-blocking.
-    // Thread-safe and does not need the _lock acquired.
-    void return_unused_block(Block* block);
-
-    // Checks _unused_blocks for an unused block object, else allocates a new one.
-    // Non-blocking and needs no _lock.
-    Block* get_unused_block(Client* client);
-
-    // Used to debug the state of the block manager. Lock must already be taken.
-    bool validate() const;
-    std::string debug_internal() const;
-
-    // Add BE hostname and fragment id for debug tuning.
-    Status add_exec_msg(const std::string& msg) const;
-
-    // Size of the largest/default block in bytes.
-    const int64_t _max_block_size;
-
-    // Unpinned blocks are written when the number of free buffers is below this threshold.
-    // Equal to twice the number of disks (two writes in flight per scratch disk).
-    const int _block_write_threshold;
-
-    // If false, spilling is disabled. The client calls will fail if there is not enough
-    // memory.
-    const bool _enable_spill;
-
-    const TUniqueId _query_id;
-
-    ObjectPool _obj_pool;
-
-    // Tracks buffers allocated by the block manager.
-    std::unique_ptr<MemTracker> _mem_tracker;
-
-    // The temporary file manager used to allocate temporary file space.
-    TmpFileMgr* _tmp_file_mgr;
-
-    // This lock protects the block and buffer lists below, except for _unused_blocks.
-    // It also protects the various counters and changes to block state. Additionally, it is
-    // used for the blocking condvars: _buffer_available_cv and block->_write_complete_cv.
-    // TODO: We should break the protection of the various structures and usages into
-    // different spinlocks and a mutex to be used in the wait()s, see IMPALA-1883.
-    std::mutex _lock;
-
-    // If true, init() has been called.
-    bool _initialized;
-
-    // The total number of reserved buffers across all clients that are not pinned.
-    int _unfullfilled_reserved_buffers;
-
-    // The total number of pinned buffers across all clients.
-    int _total_pinned_buffers;
-
-    // Number of outstanding writes (writes issued but not completed).
-    // This does not include client-local writes.
-    int _non_local_outstanding_writes;
-
-    // Signal availability of free buffers.
-    std::condition_variable _buffer_available_cv;
-
-    // List of blocks with _is_pinned = false that are not on DiskIoMgr's write queue.
-    // Blocks are added to and removed from the back of the list (i.e. in LIFO order).
-    // Blocks in this list must have _is_pinned = false, _in_write = false,
-    // _is_deleted = false.
-    InternalQueue<Block> _unpinned_blocks;
-
-    // List of blocks that have been deleted and are no longer in use.
-    // Can be reused in get_new_block(). Blocks in this list must be in the Init'ed state,
-    // i.e. _buffer_desc = nullptr, _is_pinned = false, _in_write = false,
-    // _is_deleted = false, valid_data_len = 0.
-    InternalQueue<Block> _unused_blocks;
-
-    // List of buffers that can be assigned to a block in pin() or get_new_block().
-    // These buffers either have no block associated with them or are associated with
-    // an unpinned block that has been persisted. That is, either block = nullptr or
-    // (!block->_is_pinned && !block->_in_write && !_unpinned_blocks.Contains(block)).
-    // All of these buffers are io sized.
-    InternalQueue<BufferDescriptor> _free_io_buffers;
-
-    // All allocated io-sized buffers.
-    std::list<BufferDescriptor*> _all_io_buffers;
-
-    // Temporary physical file handles (one per tmp device) to which blocks may be written.
-    // Blocks are round-robined across these files.
-    std::vector<std::unique_ptr<TmpFileMgr::File>> _tmp_files;
-
-    // Index into _tmp_files denoting the file to which the next block to be persisted will
-    // be written.
-    int _next_block_index;
-
-    // DiskIoMgr handles to read and write blocks.
-    DiskIoMgr* _io_mgr;
-    DiskIoMgr::RequestContext* _io_request_context;
-
-    // If true, a disk write failed and all API calls return
-    // Status::Cancelled("Cancelled"). Set to true if there was an error writing a block, or if
-    // write_complete() needed to reissue the write and that failed.
-    bool _is_cancelled;
-
-    // Counters and timers to track behavior.
-    std::unique_ptr<RuntimeProfile> _profile;
-
-    RuntimeProfile::Counter* _block_size_counter;
-
-    // Total number of blocks created.
-    RuntimeProfile::Counter* _created_block_counter;
-
-    // Number of deleted blocks reused.
-    RuntimeProfile::Counter* _recycled_blocks_counter;
-
-    // Number of pin() calls that did not require a disk read.
-    RuntimeProfile::Counter* _buffered_pin_counter;
-
-    // Time taken for disk reads.
-    RuntimeProfile::Counter* _disk_read_timer;
-
-    // Time spent waiting for a free buffer.
-    RuntimeProfile::Counter* _buffer_wait_timer;
-
-    // Number of bytes written to disk (includes writes still queued in the IO manager).
-    RuntimeProfile::Counter* _bytes_written_counter;
-
-    // Number of writes outstanding (issued but not completed).
-    RuntimeProfile::Counter* _outstanding_writes_counter;
-
-    // Time spent in disk spill encryption and decryption.
-    RuntimeProfile::Counter* _encryption_timer;
-
-    // Time spent in disk spill integrity generation and checking.
- RuntimeProfile::Counter* _integrity_check_timer; - - // Number of writes issued. - int _writes_issued; - - // Protects _s_query_to_block_mgrs. - static SpinLock _s_block_mgrs_lock; - - // All per-query BufferedBlockMgr2 objects that are in use. For memory management, this - // map contains only weak ptrs. BufferedBlockMgr2s that are handed out are shared ptrs. - // When all the shared ptrs are no longer referenced, the BufferedBlockMgr2 - // d'tor will be called at which point the weak ptr will be removed from the map. - typedef std::unordered_map> BlockMgrsMap; - static BlockMgrsMap _s_query_to_block_mgrs; - - // Unowned. - RuntimeState* _state; - -}; // class BufferedBlockMgr2 - -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc deleted file mode 100644 index 1528f8de66..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.cc +++ /dev/null @@ -1,805 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.cc -// and modified by Doris - -#include "runtime/buffered_tuple_stream2.h" - -#include "runtime/descriptors.h" -#include "runtime/row_batch.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" -#include "util/pretty_printer.h" - -using std::stringstream; -using std::string; -using std::vector; -using std::list; - -using std::unique_ptr; - -namespace doris { - -// The first NUM_SMALL_BLOCKS of the tuple stream are made of blocks less than the -// IO size. These blocks never spill. -// TODO: Consider adding a 4MB in-memory buffer that would split the gap between the -// 512KB in-memory buffer and the 8MB (IO-sized) spillable buffer. 
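// To make the sizing concrete: with the defaults below, a stream's first block is
// 64KB and its second is 512KB; only from the third block onward does
// new_block_for_write() hand out IO-sized (e.g. 8MB) spillable blocks. A hedged
// sketch of the size selection, mirroring the logic in new_block_for_write() below:
//
//     int64_t block_len = _block_mgr->max_block_size(); // e.g. 8MB
//     if (_use_small_buffers && _blocks.size() < NUM_SMALL_BLOCKS) {
//         block_len = std::min(block_len, INITIAL_BLOCK_SIZES[_blocks.size()]);
//     }
//
// So an operator holding 64 such streams initially buffers 64 * (64KB + 512KB) =
// 36MB instead of 64 * 8MB = 512MB.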
-static const int64_t INITIAL_BLOCK_SIZES[] = {64 * 1024, 512 * 1024}; -static const int NUM_SMALL_BLOCKS = sizeof(INITIAL_BLOCK_SIZES) / sizeof(int64_t); - -string BufferedTupleStream2::RowIdx::debug_string() const { - stringstream ss; - ss << "RowIdx block=" << block() << " offset=" << offset() << " idx=" << idx(); - return ss.str(); -} - -BufferedTupleStream2::BufferedTupleStream2(RuntimeState* state, const RowDescriptor& row_desc, - BufferedBlockMgr2* block_mgr, - BufferedBlockMgr2::Client* client, - bool use_initial_small_buffers, bool read_write) - : _use_small_buffers(use_initial_small_buffers), - _delete_on_read(false), - _read_write(read_write), - _state(state), - _desc(row_desc), - _nullable_tuple(row_desc.is_any_tuple_nullable()), - _block_mgr(block_mgr), - _block_mgr_client(client), - _total_byte_size(0), - _read_ptr(nullptr), - _read_tuple_idx(0), - _read_bytes(0), - _rows_returned(0), - _read_block_idx(-1), - _write_block(nullptr), - _num_pinned(0), - _num_small_blocks(0), - _closed(false), - _num_rows(0), - _pinned(true), - _pin_timer(nullptr), - _unpin_timer(nullptr), - _get_new_block_timer(nullptr) { - _null_indicators_read_block = _null_indicators_write_block = -1; - _read_block = _blocks.end(); - _fixed_tuple_row_size = 0; - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i]; - const int tuple_byte_size = tuple_desc->byte_size(); - _fixed_tuple_row_size += tuple_byte_size; - if (!tuple_desc->string_slots().empty()) { - _string_slots.push_back(make_pair(i, tuple_desc->string_slots())); - } - // if (!tuple_desc->collection_slots().empty()) { - // _collection_slots.push_back(make_pair(i, tuple_desc->collection_slots())); - // } - } -} - -// Returns the number of pinned blocks in the list. -// Only called in DCHECKs to validate _num_pinned. -int num_pinned(const list& blocks) { - int num_pinned = 0; - for (list::const_iterator it = blocks.begin(); it != blocks.end(); - ++it) { - if ((*it)->is_pinned() && (*it)->is_max_size()) { - ++num_pinned; - } - } - return num_pinned; -} - -string BufferedTupleStream2::debug_string() const { - stringstream ss; - ss << "BufferedTupleStream2 num_rows=" << _num_rows << " rows_returned=" << _rows_returned - << " pinned=" << (_pinned ? "true" : "false") - << " delete_on_read=" << (_delete_on_read ? "true" : "false") - << " closed=" << (_closed ? 
"true" : "false") << " num_pinned=" << _num_pinned - << " write_block=" << _write_block << " _read_block="; - if (_read_block == _blocks.end()) { - ss << ""; - } else { - ss << *_read_block; - } - ss << " blocks=[\n"; - for (list::const_iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - ss << "{" << (*it)->debug_string() << "}"; - if (*it != _blocks.back()) { - ss << ",\n"; - } - } - ss << "]"; - return ss.str(); -} - -Status BufferedTupleStream2::init(int node_id, RuntimeProfile* profile, bool pinned) { - if (profile != nullptr) { - _pin_timer = ADD_TIMER(profile, "PinTime"); - _unpin_timer = ADD_TIMER(profile, "UnpinTime"); - _get_new_block_timer = ADD_TIMER(profile, "GetNewBlockTime"); - } - - if (_block_mgr->max_block_size() < INITIAL_BLOCK_SIZES[0]) { - _use_small_buffers = false; - } - - bool got_block = false; - RETURN_IF_ERROR(new_block_for_write(_fixed_tuple_row_size, &got_block)); - if (!got_block) { - return _block_mgr->mem_limit_too_low_error(_block_mgr_client, node_id); - } - DCHECK(_write_block != nullptr); - if (!pinned) { - RETURN_IF_ERROR(unpin_stream()); - } - return Status::OK(); -} - -Status BufferedTupleStream2::switch_to_io_buffers(bool* got_buffer) { - if (!_use_small_buffers) { - *got_buffer = (_write_block != nullptr); - return Status::OK(); - } - _use_small_buffers = false; - Status status = new_block_for_write(_block_mgr->max_block_size(), got_buffer); - // IMPALA-2330: Set the flag using small buffers back to false in case it failed to - // got a buffer. - DCHECK(status.ok() || !*got_buffer) << status.ok() << " " << *got_buffer; - _use_small_buffers = !*got_buffer; - return status; -} - -void BufferedTupleStream2::close() { - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - (*it)->del(); - } - _blocks.clear(); - _num_pinned = 0; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - _closed = true; -} - -int64_t BufferedTupleStream2::bytes_in_mem(bool ignore_current) const { - int64_t result = 0; - for (list::const_iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if (!(*it)->is_pinned()) { - continue; - } - if (!(*it)->is_max_size()) { - continue; - } - if (*it == _write_block && ignore_current) { - continue; - } - result += (*it)->buffer_len(); - } - return result; -} - -Status BufferedTupleStream2::unpin_block(BufferedBlockMgr2::Block* block) { - SCOPED_TIMER(_unpin_timer); - DCHECK(block->is_pinned()); - if (!block->is_max_size()) { - return Status::OK(); - } - RETURN_IF_ERROR(block->unpin()); - --_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - return Status::OK(); -} - -Status BufferedTupleStream2::new_block_for_write(int64_t min_size, bool* got_block) { - DCHECK(!_closed); - *got_block = false; - if (min_size > _block_mgr->max_block_size()) { - std::stringstream error_msg; - error_msg << "Cannot process row that is bigger than the IO size (row_size=" - << PrettyPrinter::print(min_size, TUnit::BYTES) - << "). To run this query, increase the IO size (--read_size option)."; - return Status::InternalError(error_msg.str()); - } - - BufferedBlockMgr2::Block* unpin_block = _write_block; - if (_write_block != nullptr) { - DCHECK(_write_block->is_pinned()); - if (_pinned || _write_block == *_read_block || !_write_block->is_max_size()) { - // In these cases, don't unpin the current write block. 
- unpin_block = nullptr; - } - } - - int64_t block_len = _block_mgr->max_block_size(); - if (_use_small_buffers) { - if (_blocks.size() < NUM_SMALL_BLOCKS) { - block_len = std::min(block_len, INITIAL_BLOCK_SIZES[_blocks.size()]); - if (block_len < min_size) { - block_len = _block_mgr->max_block_size(); - } - } - if (block_len == _block_mgr->max_block_size()) { - // Do not switch to IO-buffers automatically. Do not get a buffer. - *got_block = false; - return Status::OK(); - } - } - - BufferedBlockMgr2::Block* new_block = nullptr; - { - SCOPED_TIMER(_get_new_block_timer); - RETURN_IF_ERROR( - _block_mgr->get_new_block(_block_mgr_client, unpin_block, &new_block, block_len)); - } - *got_block = (new_block != nullptr); - - if (!*got_block) { - DCHECK(unpin_block == nullptr); - return Status::OK(); - } - - if (unpin_block != nullptr) { - DCHECK(unpin_block == _write_block); - DCHECK(!_write_block->is_pinned()); - --_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - - // Compute and allocate the block header with the null indicators - _null_indicators_write_block = compute_num_null_indicator_bytes(block_len); - new_block->allocate(_null_indicators_write_block); - _write_tuple_idx = 0; - - _blocks.push_back(new_block); - _block_start_idx.push_back(new_block->buffer()); - _write_block = new_block; - DCHECK(_write_block->is_pinned()); - DCHECK_EQ(_write_block->num_rows(), 0); - if (_write_block->is_max_size()) { - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } else { - ++_num_small_blocks; - } - _total_byte_size += block_len; - return Status::OK(); -} - -Status BufferedTupleStream2::next_block_for_read() { - DCHECK(!_closed); - DCHECK(_read_block != _blocks.end()); - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << _pinned; - - // If non-nullptr, this will be the current block if we are going to free it while - // grabbing the next block. This will stay nullptr if we don't want to free the - // current block. - BufferedBlockMgr2::Block* block_to_free = - (!_pinned || _delete_on_read) ? *_read_block : nullptr; - if (_delete_on_read) { - // TODO: this is weird. We are deleting even if it is pinned. The analytic - // eval node needs this. - DCHECK(_read_block == _blocks.begin()); - DCHECK(*_read_block != _write_block); - _blocks.pop_front(); - _read_block = _blocks.begin(); - _read_block_idx = 0; - if (block_to_free != nullptr && !block_to_free->is_max_size()) { - block_to_free->del(); - block_to_free = nullptr; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string(); - } - } else { - ++_read_block; - ++_read_block_idx; - if (block_to_free != nullptr && !block_to_free->is_max_size()) { - block_to_free = nullptr; - } - } - - _read_ptr = nullptr; - _read_tuple_idx = 0; - _read_bytes = 0; - - bool pinned = false; - if (_read_block == _blocks.end() || (*_read_block)->is_pinned()) { - // End of the blocks or already pinned, just handle block_to_free - if (block_to_free != nullptr) { - SCOPED_TIMER(_unpin_timer); - if (_delete_on_read) { - block_to_free->del(); - --_num_pinned; - } else { - RETURN_IF_ERROR(unpin_block(block_to_free)); - } - } - } else { - // Call into the block mgr to atomically unpin/delete the old block and pin the - // new block. - SCOPED_TIMER(_pin_timer); - RETURN_IF_ERROR((*_read_block)->pin(&pinned, block_to_free, !_delete_on_read)); - if (!pinned) { - DCHECK(block_to_free == nullptr) << "Should have been able to pin." 
<< std::endl - << _block_mgr->debug_string(_block_mgr_client); - } - if (block_to_free == nullptr && pinned) { - ++_num_pinned; - } - } - - if (_read_block != _blocks.end() && (*_read_block)->is_pinned()) { - _null_indicators_read_block = - compute_num_null_indicator_bytes((*_read_block)->buffer_len()); - _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block; - } - DCHECK_EQ(_num_pinned, num_pinned(_blocks)) << debug_string(); - return Status::OK(); -} - -Status BufferedTupleStream2::prepare_for_read(bool delete_on_read, bool* got_buffer) { - DCHECK(!_closed); - if (_blocks.empty()) { - return Status::OK(); - } - - if (!_read_write && _write_block != nullptr) { - DCHECK(_write_block->is_pinned()); - if (!_pinned && _write_block != _blocks.front()) { - RETURN_IF_ERROR(unpin_block(_write_block)); - } - _write_block = nullptr; - } - - // Walk the blocks and pin the first non-io sized block. - // (small buffers always being pinned, no need to pin again) - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if (!(*it)->is_pinned()) { - SCOPED_TIMER(_pin_timer); - bool current_pinned = false; - RETURN_IF_ERROR((*it)->pin(¤t_pinned)); - if (!current_pinned) { - DCHECK(got_buffer != nullptr) << "Should have reserved enough blocks"; - *got_buffer = false; - return Status::OK(); - } - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - if ((*it)->is_max_size()) { - break; - } - } - - _read_block = _blocks.begin(); - DCHECK(_read_block != _blocks.end()); - _null_indicators_read_block = compute_num_null_indicator_bytes((*_read_block)->buffer_len()); - _read_ptr = (*_read_block)->buffer() + _null_indicators_read_block; - _read_tuple_idx = 0; - _read_bytes = 0; - _rows_returned = 0; - _read_block_idx = 0; - _delete_on_read = delete_on_read; - if (got_buffer != nullptr) { - *got_buffer = true; - } - return Status::OK(); -} - -Status BufferedTupleStream2::pin_stream(bool already_reserved, bool* pinned) { - DCHECK(!_closed); - DCHECK(pinned != nullptr); - if (!already_reserved) { - // If we can't get all the blocks, don't try at all. - if (!_block_mgr->try_acquire_tmp_reservation(_block_mgr_client, blocks_unpinned())) { - *pinned = false; - return Status::OK(); - } - } - - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - if ((*it)->is_pinned()) { - continue; - } - { - SCOPED_TIMER(_pin_timer); - RETURN_IF_ERROR((*it)->pin(pinned)); - } - if (!*pinned) { - VLOG_QUERY << "Should have been reserved." << std::endl - << _block_mgr->debug_string(_block_mgr_client); - return Status::OK(); - } - ++_num_pinned; - DCHECK_EQ(_num_pinned, num_pinned(_blocks)); - } - - if (!_delete_on_read) { - // Populate _block_start_idx on pin. 
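// (_block_start_idx is what makes RowIdx-based random access O(1) once the stream
// is pinned: get_tuple_row() in buffered_tuple_stream2.inline.h resolves a row as
//
//     uint8_t* data = _block_start_idx[idx.block()] + idx.offset();
//
// so every block's base pointer has to be refreshed here after repinning, since an
// evicted block may plausibly be read back into a different buffer.)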
- DCHECK_EQ(_block_start_idx.size(), _blocks.size()); - _block_start_idx.clear(); - for (list::iterator it = _blocks.begin(); it != _blocks.end(); - ++it) { - _block_start_idx.push_back((*it)->buffer()); - } - } - *pinned = true; - _pinned = true; - return Status::OK(); -} - -Status BufferedTupleStream2::unpin_stream(bool all) { - DCHECK(!_closed); - SCOPED_TIMER(_unpin_timer); - - for (BufferedBlockMgr2::Block* block : _blocks) { - if (!block->is_pinned()) { - continue; - } - if (!all && (block == _write_block || (_read_write && block == *_read_block))) { - continue; - } - RETURN_IF_ERROR(unpin_block(block)); - } - if (all) { - _read_block = _blocks.end(); - _write_block = nullptr; - } - _pinned = false; - return Status::OK(); -} - -int BufferedTupleStream2::compute_num_null_indicator_bytes(int block_size) const { - if (_nullable_tuple) { - // We assume that all rows will use their max size, so we may be underutilizing the - // space, i.e. we may have some unused space in case of rows with nullptr tuples. - const uint32_t tuples_per_row = _desc.tuple_descriptors().size(); - const uint32_t min_row_size_in_bits = 8 * _fixed_tuple_row_size + tuples_per_row; - const uint32_t block_size_in_bits = 8 * block_size; - const uint32_t max_num_rows = block_size_in_bits / min_row_size_in_bits; - return BitUtil::round_up_numi64(max_num_rows * tuples_per_row) * 8; - } else { - // If there are no nullable tuples then no need to waste space for null indicators. - return 0; - } -} - -Status BufferedTupleStream2::get_rows(unique_ptr* batch, bool* got_rows) { - RETURN_IF_ERROR(pin_stream(false, got_rows)); - if (!*got_rows) { - return Status::OK(); - } - RETURN_IF_ERROR(prepare_for_read(false)); - batch->reset(new RowBatch(_desc, num_rows())); - bool eos = false; - // Loop until get_next fills the entire batch. Each call can stop at block - // boundaries. We generally want it to stop, so that blocks can be freed - // as we read. It is safe in this case because we pin the entire stream. - while (!eos) { - RETURN_IF_ERROR(get_next(batch->get(), &eos)); - } - return Status::OK(); -} - -Status BufferedTupleStream2::get_next(RowBatch* batch, bool* eos, vector* indices) { - if (_nullable_tuple) { - return get_next_internal(batch, eos, indices); - } else { - return get_next_internal(batch, eos, indices); - } -} - -template -Status BufferedTupleStream2::get_next_internal(RowBatch* batch, bool* eos, - vector* indices) { - DCHECK(!_closed); - DCHECK(batch->row_desc().equals(_desc)); - *eos = (_rows_returned == _num_rows); - if (*eos) { - return Status::OK(); - } - DCHECK_GE(_null_indicators_read_block, 0); - - const uint64_t tuples_per_row = _desc.tuple_descriptors().size(); - DCHECK_LE(_read_tuple_idx / tuples_per_row, (*_read_block)->num_rows()); - DCHECK_EQ(_read_tuple_idx % tuples_per_row, 0); - int rows_returned_curr_block = _read_tuple_idx / tuples_per_row; - - int64_t data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block; - if (UNLIKELY(rows_returned_curr_block == (*_read_block)->num_rows())) { - // Get the next block in the stream. We need to do this at the beginning of - // the get_next() call to ensure the buffer management semantics. next_block_for_read() - // will recycle the memory for the rows returned from the *previous* call to - // get_next(). 
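// To spell the resulting contract out: rows handed back by get_next() call N stay
// valid only until call N+1, because the call below may unpin or delete the block
// that backed them. A hedged caller-side sketch (helper names are illustrative,
// not from this file):
//
//     RowBatch batch(desc, capacity); // as in get_rows() below
//     bool eos = false;
//     while (!eos) {
//         RETURN_IF_ERROR(stream->get_next(&batch, &eos));
//         consume_or_copy(&batch); // must finish before the next get_next() call
//         batch.reset();
//     }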
- RETURN_IF_ERROR(next_block_for_read()); - DCHECK(_read_block != _blocks.end()) << debug_string(); - DCHECK_GE(_null_indicators_read_block, 0); - data_len = (*_read_block)->valid_data_len() - _null_indicators_read_block; - rows_returned_curr_block = 0; - } - - DCHECK(_read_block != _blocks.end()); - DCHECK((*_read_block)->is_pinned()) << debug_string(); - DCHECK(_read_ptr != nullptr); - - int64_t rows_left = _num_rows - _rows_returned; - int rows_to_fill = - std::min(static_cast(batch->capacity() - batch->num_rows()), rows_left); - DCHECK_GE(rows_to_fill, 1); - batch->add_rows(rows_to_fill); - uint8_t* tuple_row_mem = reinterpret_cast(batch->get_row(batch->num_rows())); - - // Produce tuple rows from the current block and the corresponding position on the - // null tuple indicator. - vector local_indices; - if (indices == nullptr) { - // A hack so that we do not need to check whether 'indices' is not null in the - // tight loop. - indices = &local_indices; - } else { - DCHECK(is_pinned()); - DCHECK(!_delete_on_read); - DCHECK_EQ(batch->num_rows(), 0); - indices->clear(); - } - indices->reserve(rows_to_fill); - - int i = 0; - uint8_t* null_word = nullptr; - uint32_t null_pos = 0; - // Start reading from position _read_tuple_idx in the block. - uint64_t last_read_ptr = 0; - // IMPALA-2256: Special case if there are no materialized slots. - bool increment_row = has_tuple_footprint(); - uint64_t last_read_row = increment_row * (_read_tuple_idx / tuples_per_row); - while (i < rows_to_fill) { - // Check if current block is done. - if (UNLIKELY(rows_returned_curr_block + i == (*_read_block)->num_rows())) { - break; - } - - // Copy the row into the output batch. - TupleRow* row = reinterpret_cast(tuple_row_mem); - last_read_ptr = reinterpret_cast(_read_ptr); - indices->push_back(RowIdx()); - DCHECK_EQ(indices->size(), i + 1); - (*indices)[i].set(_read_block_idx, _read_bytes + _null_indicators_read_block, - last_read_row); - if (HasNullableTuple) { - for (int j = 0; j < tuples_per_row; ++j) { - // Stitch together the tuples from the block and the nullptr ones. - null_word = (*_read_block)->buffer() + (_read_tuple_idx >> 3); - null_pos = _read_tuple_idx & 7; - ++_read_tuple_idx; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - // Copy tuple and advance _read_ptr. If it is a nullptr tuple, it calls set_tuple - // with Tuple* being 0x0. To do that we multiply the current _read_ptr with - // false (0x0). - row->set_tuple(j, reinterpret_cast(reinterpret_cast(_read_ptr) * - is_not_null)); - _read_ptr += _desc.tuple_descriptors()[j]->byte_size() * is_not_null; - } - const uint64_t row_read_bytes = reinterpret_cast(_read_ptr) - last_read_ptr; - DCHECK_GE(_fixed_tuple_row_size, row_read_bytes); - _read_bytes += row_read_bytes; - last_read_ptr = reinterpret_cast(_read_ptr); - } else { - // When we know that there are no nullable tuples we can safely copy them without - // checking for nullability. - for (int j = 0; j < tuples_per_row; ++j) { - row->set_tuple(j, reinterpret_cast(_read_ptr)); - _read_ptr += _desc.tuple_descriptors()[j]->byte_size(); - } - _read_bytes += _fixed_tuple_row_size; - _read_tuple_idx += tuples_per_row; - } - tuple_row_mem += sizeof(Tuple*) * tuples_per_row; - - // Update string slot ptrs. - for (int j = 0; j < _string_slots.size(); ++j) { - Tuple* tuple = row->get_tuple(_string_slots[j].first); - if (HasNullableTuple && tuple == nullptr) { - continue; - } - read_strings(_string_slots[j].second, data_len, tuple); - } - - // Update collection slot ptrs. 
We traverse the collection structure in the same order - // as it was written to the stream, allowing us to infer the data layout based on the - // length of collections and strings. - // for (int j = 0; j < _collection_slots.size(); ++j) { - // Tuple* tuple = row->get_tuple(_collection_slots[j].first); - // if (HasNullableTuple && tuple == nullptr) { - // continue; - // } - // ReadCollections(_collection_slots[j].second, data_len, tuple); - // } - last_read_row += increment_row; - ++i; - } - - batch->commit_rows(i); - _rows_returned += i; - *eos = (_rows_returned == _num_rows); - if ((!_pinned || _delete_on_read) && - rows_returned_curr_block + i == (*_read_block)->num_rows()) { - // No more data in this block. Mark this batch as needing to return so - // the caller can pass the rows up the operator tree. - batch->mark_need_to_return(); - } - DCHECK_EQ(indices->size(), i); - return Status::OK(); -} - -void BufferedTupleStream2::read_strings(const vector& string_slots, int data_len, - Tuple* tuple) { - DCHECK(tuple != nullptr); - for (int i = 0; i < string_slots.size(); ++i) { - const SlotDescriptor* slot_desc = string_slots[i]; - if (tuple->is_null(slot_desc->null_indicator_offset())) { - continue; - } - - StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - DCHECK_LE(sv->len, data_len - _read_bytes); - sv->ptr = reinterpret_cast(_read_ptr); - _read_ptr += sv->len; - _read_bytes += sv->len; - } -} - -int64_t BufferedTupleStream2::compute_row_size(TupleRow* row) const { - int64_t size = 0; - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = _desc.tuple_descriptors()[i]; - Tuple* tuple = row->get_tuple(i); - DCHECK(_nullable_tuple || tuple_desc->byte_size() == 0 || tuple != nullptr); - if (tuple == nullptr) { - continue; - } - size += tuple->total_byte_size(*tuple_desc); - } - return size; -} - -bool BufferedTupleStream2::deep_copy(TupleRow* row) { - if (_nullable_tuple) { - return deep_copy_internal(row); - } else { - return deep_copy_internal(row); - } -} - -// TODO: this really needs codegen -// TODO: in case of duplicate tuples, this can redundantly serialize data. -template -bool BufferedTupleStream2::deep_copy_internal(TupleRow* row) { - if (UNLIKELY(_write_block == nullptr)) { - return false; - } - DCHECK_GE(_null_indicators_write_block, 0); - DCHECK(_write_block->is_pinned()) << debug_string() << std::endl - << _write_block->debug_string(); - - const uint64_t tuples_per_row = _desc.tuple_descriptors().size(); - if (UNLIKELY((_write_block->bytes_remaining() < _fixed_tuple_row_size) || - (HasNullableTuple && - (_write_tuple_idx + tuples_per_row > _null_indicators_write_block * 8)))) { - return false; - } - // Allocate the maximum possible buffer for the fixed portion of the tuple. - uint8_t* tuple_buf = _write_block->allocate(_fixed_tuple_row_size); - // Total bytes allocated in _write_block for this row. Saved so we can roll back - // if this row doesn't fit. - int bytes_allocated = _fixed_tuple_row_size; - - // Copy the not nullptr fixed len tuples. For the nullptr tuples just update the nullptr tuple - // indicator. - if (HasNullableTuple) { - DCHECK_GT(_null_indicators_write_block, 0); - uint8_t* null_word = nullptr; - uint32_t null_pos = 0; - // Calculate how much space it should return. 
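// Example with a 3-tuple row whose tuple byte sizes are {16, 24, 8} and whose middle
// tuple is nullptr: the row reserves 16 + 24 + 8 = 48 bytes above, the loop below
// copies only 16 + 8 = 24 bytes and sets the middle null bit, so to_return ends up
// as 24 and return_allocation(24) hands the unused space back (bytes_allocated
// drops from 48 to 24).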
- int to_return = 0; - for (int i = 0; i < tuples_per_row; ++i) { - null_word = _write_block->buffer() + (_write_tuple_idx >> 3); // / 8 - null_pos = _write_tuple_idx & 7; - ++_write_tuple_idx; - const int tuple_size = _desc.tuple_descriptors()[i]->byte_size(); - Tuple* t = row->get_tuple(i); - const uint8_t mask = 1 << (7 - null_pos); - if (t != nullptr) { - *null_word &= ~mask; - memcpy(tuple_buf, t, tuple_size); - tuple_buf += tuple_size; - } else { - *null_word |= mask; - to_return += tuple_size; - } - } - DCHECK_LE(_write_tuple_idx - 1, _null_indicators_write_block * 8); - _write_block->return_allocation(to_return); - bytes_allocated -= to_return; - } else { - // If we know that there are no nullable tuples no need to set the nullability flags. - DCHECK_EQ(_null_indicators_write_block, 0); - for (int i = 0; i < tuples_per_row; ++i) { - const int tuple_size = _desc.tuple_descriptors()[i]->byte_size(); - Tuple* t = row->get_tuple(i); - // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots) - // is delivered, the check below should become DCHECK(t != nullptr). - DCHECK(t != nullptr || tuple_size == 0); - memcpy(tuple_buf, t, tuple_size); - tuple_buf += tuple_size; - } - } - - // Copy string slots. Note: we do not need to convert the string ptrs to offsets - // on the write path, only on the read. The tuple data is immediately followed - // by the string data so only the len information is necessary. - for (int i = 0; i < _string_slots.size(); ++i) { - Tuple* tuple = row->get_tuple(_string_slots[i].first); - if (HasNullableTuple && tuple == nullptr) { - continue; - } - if (UNLIKELY(!copy_strings(tuple, _string_slots[i].second, &bytes_allocated))) { - _write_block->return_allocation(bytes_allocated); - return false; - } - } - - // Copy collection slots. We copy collection data in a well-defined order so we do not - // need to convert pointers to offsets on the write path. - // for (int i = 0; i < _collection_slots.size(); ++i) { - // Tuple* tuple = row->get_tuple(_collection_slots[i].first); - // if (HasNullableTuple && tuple == nullptr) continue; - // if (UNLIKELY(!copy_collections(tuple, _collection_slots[i].second, - // &bytes_allocated))) { - // _write_block->return_allocation(bytes_allocated); - // return false; - // } - // } - - _write_block->add_row(); - ++_num_rows; - return true; -} - -bool BufferedTupleStream2::copy_strings(const Tuple* tuple, - const vector& string_slots, - int* bytes_allocated) { - for (int i = 0; i < string_slots.size(); ++i) { - const SlotDescriptor* slot_desc = string_slots[i]; - if (tuple->is_null(slot_desc->null_indicator_offset())) { - continue; - } - const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - if (LIKELY(sv->len > 0)) { - if (UNLIKELY(_write_block->bytes_remaining() < sv->len)) { - return false; - } - uint8_t* buf = _write_block->allocate(sv->len); - (*bytes_allocated) += sv->len; - memcpy(buf, sv->ptr, sv->len); - } - } - return true; -} -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.h b/be/src/runtime/buffered_tuple_stream2.h deleted file mode 100644 index 7d16ad3441..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.h +++ /dev/null @@ -1,412 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.h
-// and modified by Doris
-
-#pragma once
-
-#include
-
-#include "common/status.h"
-#include "runtime/buffered_block_mgr2.h"
-
-namespace doris {
-
-class BufferedBlockMgr2;
-class RuntimeProfile;
-class RuntimeState;
-class RowBatch;
-class RowDescriptor;
-class SlotDescriptor;
-class TupleRow;
-class Tuple;
-
-// Class that provides an abstraction for a stream of tuple rows. Rows can be
-// added to the stream and returned. Rows are returned in the order they are added.
-//
-// The underlying memory management is done by the BufferedBlockMgr2.
-//
-// The tuple stream consists of a number of small (less than IO-sized) blocks followed
-// by an arbitrary number of IO-sized blocks. The smaller blocks do not spill and are
-// there to lower the minimum buffering requirements. For example, an operator that
-// needs to maintain 64 streams (1 buffer per partition) would need, by default,
-// 64 * 8MB = 512MB of buffering. A query with 5 of these operators would require
-// 2.56GB just to run, regardless of how much of that is used. This is
-// problematic for small queries. Instead we will start with a fixed number of small
-// buffers (currently 2 small buffers: one 64KB and one 512KB) and only start using
-// IO-sized buffers when those fill up. The small buffers never spill.
-// The stream will *not* automatically switch from using small buffers to IO-sized
-// buffers when all the small buffers for this stream have been used.
-//
-// The BufferedTupleStream2 is *not* thread safe from the caller's point of view. It is
-// expected that all the APIs are called from a single thread. Internally, the
-// object is thread safe with respect to the underlying block mgr.
-//
-// Buffer management:
-// The stream is either pinned or unpinned, set via pin_stream() and unpin_stream().
-// Blocks are optionally deleted as they are read, set with the delete_on_read argument
-// to prepare_for_read().
-//
-// Block layout:
-// At the header of each block, starting at position 0, there is a bitstring with null
-// indicators for all the tuples in each row in the block. Then there are the tuple rows.
-// We further optimize the codepaths when we know that no tuple is nullable, indicated
-// by '_nullable_tuple'.
-//
-// Tuple row layout:
-// Tuples are stored back to back. Each tuple starts with the fixed length portion,
-// directly followed by the var len portion. (Fixed len and var len are interleaved.)
-// If any tuple in the row is nullable, then there is a bitstring of null tuple
-// indicators at the header of the block. The order of bits in the null indicators
-// bitstring corresponds to the order of tuples in the block. The nullptr tuples are not
-// stored in the body of the block, only as set bits in the null indicators bitstring.
-//
-// The behavior of reads and writes is as follows:
-// Read:
-// 1. Delete on read (_delete_on_read): Blocks are deleted as we go through the stream.
-// The data returned by the tuple stream is valid until the next read call so the
-// caller does not need to copy if it is streaming.
-// 2. Unpinned: Blocks remain in _blocks and are unpinned after reading.
-// 3. Pinned: Blocks remain in _blocks and are left pinned after reading. If the next
-// block in the stream cannot be pinned, the read call will fail and the caller needs
-// to free memory from the underlying block mgr.
-// Write:
-// 1. Unpinned: Unpin blocks as they fill up. This means only a single (i.e. the
-// current) block needs to be in memory regardless of the input size (if read_write is
-// true, then two blocks need to be in memory).
-// 2. Pinned: Blocks are left pinned. If we run out of blocks, the write will fail and
-// the caller needs to free memory from the underlying block mgr.
-//
-// TODO: we need to be able to do read ahead in the BufferedBlockMgr2. It currently
-// only has PinAllBlocks() which is blocking. We need a non-blocking version of this or
-// some way to indicate a block will need to be pinned soon.
-// TODO: see if this can be merged with Sorter::Run. The key difference is that this
-// does not need to return rows in the order they were added, which allows it to be
-// simpler.
-// TODO: we could compact the small buffers when we need to spill, but they use very
-// little memory so this might not be very useful.
-// TODO: improvements:
-// - Think more about the layout of the var len data, possibly filling it in from the
-// end of the same block. Don't interleave fixed and var len data.
-// - It would be good to allocate the null indicators at the end of each block and grow
-// this array as new rows are inserted in the block. If we do so, then there will be
-// fewer gaps in case of many rows with nullptr tuples.
-// - We will want to multithread this. Add an AddBlock() call so the synchronization
-// happens at the block level. This is a natural extension.
-// - Instead of allocating all blocks from the block_mgr, allocate some blocks that
-// are much smaller (e.g. 16K and doubling up to the block size). This way, very
-// small streams (a common case) will use very little memory. These small blocks
-// are always in memory since spilling them frees up negligible memory.
-// - Return row batches in get_next() instead of filling one in.
-// - Should we 32-bit align the start of the tuple rows? Now it is byte-aligned.
-class BufferedTupleStream2 {
-public:
-    // Ordinal index into the stream to retrieve a row in O(1) time. This index can
-    // only be used if the stream is pinned.
-    // To read a row from a stream we need three pieces of information that we squeeze in
-    // 64 bits:
-    // - The index of the block. The block id is stored in 16 bits. We can have up to
-    // 64K blocks per tuple stream. With 8MB blocks that is 512GB per stream.
-    // - The offset of the start of the row (data) within the block. Since blocks are 8MB
-    // we use 24 bits for the offsets. (In theory we could use 23 bits.)
-    // - The idx of the row in the block. We need this for retrieving the null indicators.
-    // We use 24 bits for this index as well.
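// As a worked example of this encoding: block 3, row data starting at byte offset
// 0x1200 within the block, row 57 within the block packs (via set() below) as
//
//     data = 3 | (0x1200 << 16) | (57ULL << 40) = 0x0000390012000003
//
// and block(), offset() and idx() mask the three fields back out in O(1).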
-    struct RowIdx {
-        static const uint64_t BLOCK_MASK = 0xFFFF;
-        static const uint64_t BLOCK_SHIFT = 0;
-        static const uint64_t OFFSET_MASK = 0xFFFFFF0000;
-        static const uint64_t OFFSET_SHIFT = 16;
-        static const uint64_t IDX_MASK = 0xFFFFFF0000000000;
-        static const uint64_t IDX_SHIFT = 40;
-
-        uint64_t block() const { return (data & BLOCK_MASK); }
-
-        uint64_t offset() const { return (data & OFFSET_MASK) >> OFFSET_SHIFT; }
-
-        uint64_t idx() const { return (data & IDX_MASK) >> IDX_SHIFT; }
-
-        uint64_t set(uint64_t block, uint64_t offset, uint64_t idx) {
-            DCHECK_LE(block, BLOCK_MASK)
-                    << "Cannot have more than 2^16 = 64K blocks in a tuple stream.";
-            DCHECK_LE(offset, OFFSET_MASK >> OFFSET_SHIFT)
-                    << "Cannot have blocks larger than 2^24 = 16MB";
-            DCHECK_LE(idx, IDX_MASK >> IDX_SHIFT)
-                    << "Cannot have more than 2^24 = 16M rows in a block.";
-            data = block | (offset << OFFSET_SHIFT) | (idx << IDX_SHIFT);
-            return data;
-        }
-
-        std::string debug_string() const;
-
-        uint64_t data;
-    };
-
-    // row_desc: description of rows stored in the stream. This is the desc for rows
-    // that are added and the rows being returned.
-    // block_mgr: Underlying block mgr that owns the data blocks.
-    // use_initial_small_buffers: If true, the initial N buffers allocated for the
-    // tuple stream use smaller than IO-sized buffers.
-    // read_write: Stream allows interchanging read and write operations. Requires that
-    // at least two blocks can be pinned.
-    BufferedTupleStream2(RuntimeState* state, const RowDescriptor& row_desc,
-                         BufferedBlockMgr2* block_mgr, BufferedBlockMgr2::Client* client,
-                         bool use_initial_small_buffers, bool read_write);
-    // A no-op dtor to pass the code style check.
-    ~BufferedTupleStream2() {}
-
-    // Initializes the tuple stream object on behalf of node 'node_id'. Must be called
-    // once before any of the other APIs.
-    // If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
-    // If 'profile' is non-nullptr, counters are created.
-    // 'node_id' is only used for error reporting.
-    Status init(int node_id, RuntimeProfile* profile, bool pinned);
-
-    // Must be called for streams using small buffers to switch to IO-sized buffers.
-    // If it fails to get a buffer (i.e. the switch fails) it resets _use_small_buffers
-    // back to false.
-    // TODO: this does not seem like the best mechanism.
-    Status switch_to_io_buffers(bool* got_buffer);
-
-    // Adds a single row to the stream. Returns false and sets *status if an error
-    // occurred. BufferedTupleStream2 will do a deep copy of the memory in the row.
-    bool add_row(TupleRow* row, Status* status);
-
-    // Allocates space to store a row of size 'size' and returns a pointer to the memory
-    // when successful. Returns nullptr if there is not enough memory or an error occurred.
-    // When returning nullptr, sets *status. The returned memory is guaranteed to fit on one
-    // block.
-    uint8_t* allocate_row(int size, Status* status);
-
-    // Populates 'row' with the row at 'idx'. The stream must be pinned. The row must have
-    // been allocated with the stream's row desc.
-    void get_tuple_row(const RowIdx& idx, TupleRow* row) const;
-
-    // Prepares the stream for reading. If _read_write, this can be called at any time to
-    // begin reading. Otherwise this must be called after the last add_row() and
-    // before get_next().
-    // delete_on_read: Blocks are deleted after they are read.
-    // If got_buffer is nullptr, this function will fail (with a bad status) if no buffer
-    // is available.
If got_buffer is non-null, this function will not fail on OOM and - // *got_buffer is true if a buffer was pinned. - Status prepare_for_read(bool delete_on_read, bool* got_buffer = nullptr); - - // Pins all blocks in this stream and switches to pinned mode. - // If there is not enough memory, *pinned is set to false and the stream is unmodified. - // If already_reserved is true, the caller has already made a reservation on - // _block_mgr_client to pin the stream. - Status pin_stream(bool already_reserved, bool* pinned); - - // Unpins stream. If all is true, all blocks are unpinned, otherwise all blocks - // except the _write_block and _read_block are unpinned. - Status unpin_stream(bool all = false); - - // Get the next batch of output rows. Memory is still owned by the BufferedTupleStream2 - // and must be copied out by the caller. - // If 'indices' is non-nullptr, that is also populated for each returned row with the - // index for that row. - Status get_next(RowBatch* batch, bool* eos, std::vector* indices = nullptr); - - // Returns all the rows in the stream in batch. This pins the entire stream - // in the process. - // *got_rows is false if the stream could not be pinned. - Status get_rows(std::unique_ptr* batch, bool* got_rows); - - // Must be called once at the end to cleanup all resources. Idempotent. - void close(); - - // Number of rows in the stream. - int64_t num_rows() const { return _num_rows; } - - // Number of rows returned via get_next(). - int64_t rows_returned() const { return _rows_returned; } - - // Returns the byte size necessary to store the entire stream in memory. - int64_t byte_size() const { return _total_byte_size; } - - // Returns the byte size of the stream that is currently pinned in memory. - // If ignore_current is true, the _write_block memory is not included. - int64_t bytes_in_mem(bool ignore_current) const; - - bool is_pinned() const { return _pinned; } - int blocks_pinned() const { return _num_pinned; } - int blocks_unpinned() const { return _blocks.size() - _num_pinned - _num_small_blocks; } - bool has_read_block() const { return _read_block != _blocks.end(); } - bool has_write_block() const { return _write_block != nullptr; } - bool using_small_buffers() const { return _use_small_buffers; } - bool has_tuple_footprint() const { - return _fixed_tuple_row_size > 0 || !_string_slots.empty() || _nullable_tuple; - } - - std::string debug_string() const; - -private: - // friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test; - - // If true, this stream is still using small buffers. - bool _use_small_buffers; - - // If true, blocks are deleted after they are read. - bool _delete_on_read; - - // If true, read and write operations may be interleaved. Otherwise all calls - // to AddRow() must occur before calling prepare_for_read() and subsequent calls to - // get_next(). - const bool _read_write; - - // Runtime state instance used to check for cancellation. Not owned. - RuntimeState* const _state; - - // Description of rows stored in the stream. - const RowDescriptor& _desc; - - // Whether any tuple in the rows is nullable. - const bool _nullable_tuple; - - // Sum of the fixed length portion of all the tuples in _desc. - int _fixed_tuple_row_size; - - // Max size (in bytes) of null indicators bitstring in the current read and write - // blocks. If 0, it means that there is no need to store null indicators for this - // RowDesc. We calculate this value based on the block's size and the - // _fixed_tuple_row_size. 
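// Worked example of compute_num_null_indicator_bytes() (defined in the .cc above):
// with 2 tuples per row and _fixed_tuple_row_size = 30 bytes, a 4KB block gives
//
//     min_row_size_in_bits = 8 * 30 + 2 = 242
//     max_num_rows         = (8 * 4096) / 242 = 135
//     header bytes         = round_up_numi64(135 * 2) * 8 = 5 * 8 = 40
//
// i.e. 40 bytes of null indicators cover the worst case of 135 rows * 2 bits each.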
When not 0, this value is also an upper bound for the number - // of (rows * tuples_per_row) in this block. - uint32_t _null_indicators_read_block; - uint32_t _null_indicators_write_block; - - // Vector of all the strings slots grouped by tuple_idx. - std::vector>> _string_slots; - - // Vector of all the collection slots grouped by tuple_idx. - // std::vector>> _collection_slots; - - // Block manager and client used to allocate, pin and release blocks. Not owned. - BufferedBlockMgr2* _block_mgr; - BufferedBlockMgr2::Client* _block_mgr_client; - - // List of blocks in the stream. - std::list _blocks; - - // Total size of _blocks, including small blocks. - int64_t _total_byte_size; - - // Iterator pointing to the current block for read. Equal to list.end() until - // prepare_for_read() is called. - std::list::iterator _read_block; - - // For each block in the stream, the buffer of the start of the block. This is only - // valid when the stream is pinned, giving random access to data in the stream. - // This is not maintained for _delete_on_read. - std::vector _block_start_idx; - - // Current ptr offset in _read_block's buffer. - uint8_t* _read_ptr; - - // Current idx of the tuple read from the _read_block buffer. - uint32_t _read_tuple_idx; - - // Current idx of the tuple written at the _write_block buffer. - uint32_t _write_tuple_idx; - - // Bytes read in _read_block. - int64_t _read_bytes; - - // Number of rows returned to the caller from get_next(). - int64_t _rows_returned; - - // The block index of the current read block. - int _read_block_idx; - - // The current block for writing. nullptr if there is no available block to write to. - BufferedBlockMgr2::Block* _write_block; - - // Number of pinned blocks in _blocks, stored to avoid iterating over the list - // to compute bytes_in_mem and bytes_unpinned. - // This does not include small blocks. - int _num_pinned; - - // The total number of small blocks in _blocks; - int _num_small_blocks; - - bool _closed; // Used for debugging. - - // Number of rows stored in the stream. - int64_t _num_rows; - - // If true, this stream has been explicitly pinned by the caller. This changes the - // memory management of the stream. The blocks are not unpinned until the caller calls - // UnpinAllBlocks(). If false, only the _write_block and/or _read_block are pinned - // (both are if _read_write is true). - bool _pinned; - - // Counters added by this object to the parent runtime profile. - RuntimeProfile::Counter* _pin_timer; - RuntimeProfile::Counter* _unpin_timer; - RuntimeProfile::Counter* _get_new_block_timer; - - // Copies 'row' into _write_block. Returns false if there is not enough space in - // '_write_block'. - template - bool deep_copy_internal(TupleRow* row); - - // Helper function to copy strings from tuple into _write_block. Increments - // bytes_allocated by the number of bytes allocated from _write_block. - bool copy_strings(const Tuple* tuple, const std::vector& string_slots, - int* bytes_allocated); - - // Helper function to deep copy collections from tuple into _write_block. Increments - // bytes_allocated by the number of bytes allocated from _write_block. - // bool copy_collections(const Tuple* tuple, - // const std::vector& collection_slots, int* bytes_allocated); - - // Wrapper of the templated deep_copy_internal() function. - bool deep_copy(TupleRow* row); - - // Gets a new block from the _block_mgr, updating _write_block and _write_tuple_idx, - // and setting *got_block. 
If there are no blocks available, *got_block is set to - // false and _write_block is unchanged. - // 'min_size' is the minimum number of bytes required for this block. - Status new_block_for_write(int64_t min_size, bool* got_block); - - // Reads the next block from the _block_mgr. This blocks if necessary. - // Updates _read_block, _read_ptr, _read_tuple_idx and _read_bytes. - Status next_block_for_read(); - - // Returns the byte size of this row when encoded in a block. - int64_t compute_row_size(TupleRow* row) const; - - // Unpins block if it is an IO-sized block and updates tracking stats. - Status unpin_block(BufferedBlockMgr2::Block* block); - - // Templated get_next implementation. - template - Status get_next_internal(RowBatch* batch, bool* eos, std::vector* indices); - - // Read strings from stream by converting pointers and updating _read_ptr and - // _read_bytes. - void read_strings(const std::vector& string_slots, int data_len, Tuple* tuple); - - // Read collections from stream by converting pointers and updating _read_ptr and - // _read_bytes. - // void ReadCollections(const std::vector& collection_slots, int data_len, - // Tuple* tuple); - - // Computes the number of bytes needed for null indicators for a block of 'block_size' - int compute_num_null_indicator_bytes(int block_size) const; -}; - -} // end namespace doris diff --git a/be/src/runtime/buffered_tuple_stream2.inline.h b/be/src/runtime/buffered_tuple_stream2.inline.h deleted file mode 100644 index 99add39b83..0000000000 --- a/be/src/runtime/buffered_tuple_stream2.inline.h +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// This file is copied from -// https://github.com/apache/impala/blob/branch-2.10.0/be/src/runtime/buffered-tuple-stream.inline.h -// and modified by Doris - -#pragma once - -#include "runtime/buffered_tuple_stream2.h" -#include "runtime/descriptors.h" -#include "runtime/tuple_row.h" - -namespace doris { - -inline bool BufferedTupleStream2::add_row(TupleRow* row, Status* status) { - DCHECK(!_closed); - if (LIKELY(deep_copy(row))) { - return true; - } - bool got_block; - int64_t row_size = compute_row_size(row); - *status = new_block_for_write(row_size, &got_block); - if (!status->ok() || !got_block) { - return false; - } - return deep_copy(row); -} - -inline uint8_t* BufferedTupleStream2::allocate_row(int size, Status* status) { - DCHECK(!_closed); - if (UNLIKELY(_write_block == nullptr || _write_block->bytes_remaining() < size)) { - bool got_block; - *status = new_block_for_write(size, &got_block); - if (!status->ok() || !got_block) { - return nullptr; - } - } - DCHECK(_write_block != nullptr); - DCHECK(_write_block->is_pinned()); - DCHECK_GE(_write_block->bytes_remaining(), size); - ++_num_rows; - _write_block->add_row(); - return _write_block->allocate(size); -} - -inline void BufferedTupleStream2::get_tuple_row(const RowIdx& idx, TupleRow* row) const { - DCHECK(row != nullptr); - DCHECK(!_closed); - DCHECK(is_pinned()); - DCHECK(!_delete_on_read); - DCHECK_EQ(_blocks.size(), _block_start_idx.size()); - DCHECK_LT(idx.block(), _blocks.size()); - - uint8_t* data = _block_start_idx[idx.block()] + idx.offset(); - if (_nullable_tuple) { - // Stitch together the tuples from the block and the nullptr ones. - const int tuples_per_row = _desc.tuple_descriptors().size(); - uint32_t tuple_idx = idx.idx() * tuples_per_row; - for (int i = 0; i < tuples_per_row; ++i) { - const uint8_t* null_word = _block_start_idx[idx.block()] + (tuple_idx >> 3); - const uint32_t null_pos = tuple_idx & 7; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - row->set_tuple( - i, reinterpret_cast(reinterpret_cast(data) * is_not_null)); - data += _desc.tuple_descriptors()[i]->byte_size() * is_not_null; - ++tuple_idx; - } - } else { - for (int i = 0; i < _desc.tuple_descriptors().size(); ++i) { - row->set_tuple(i, reinterpret_cast(data)); - data += _desc.tuple_descriptors()[i]->byte_size(); - } - } -} - -} // namespace doris diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc deleted file mode 100644 index 2a35f5c70c..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.cc +++ /dev/null @@ -1,867 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.cc -// and modified by Doris - -#include - -#include "runtime/buffered_tuple_stream3.inline.h" -#include "runtime/descriptors.h" -#include "runtime/exec_env.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/string_value.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" -#include "util/debug_util.h" - -#ifdef NDEBUG -#define CHECK_CONSISTENCY_FAST() -#define CHECK_CONSISTENCY_FULL() -#else -#define CHECK_CONSISTENCY_FAST() CheckConsistencyFast() -#define CHECK_CONSISTENCY_FULL() CheckConsistencyFull() -#endif - -using namespace doris; -using namespace strings; - -using BufferHandle = BufferPool::BufferHandle; - -BufferedTupleStream3::BufferedTupleStream3(RuntimeState* state, const RowDescriptor* row_desc, - BufferPool::ClientHandle* buffer_pool_client, - int64_t default_page_len, - const std::set& ext_varlen_slots) - : state_(state), - desc_(row_desc), - node_id_(-1), - buffer_pool_(state->exec_env()->buffer_pool()), - buffer_pool_client_(buffer_pool_client), - num_pages_(0), - total_byte_size_(0), - has_read_iterator_(false), - read_page_rows_returned_(-1), - read_ptr_(nullptr), - read_end_ptr_(nullptr), - write_ptr_(nullptr), - write_end_ptr_(nullptr), - rows_returned_(0), - has_write_iterator_(false), - write_page_(nullptr), - bytes_pinned_(0), - num_rows_(0), - default_page_len_(default_page_len), - has_nullable_tuple_(row_desc->is_any_tuple_nullable()), - delete_on_read_(false), - closed_(false), - pinned_(true) { - DCHECK(BitUtil::IsPowerOf2(default_page_len)) << default_page_len; - read_page_ = pages_.end(); - for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) { - const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i]; - const int tuple_byte_size = tuple_desc->byte_size(); - fixed_tuple_sizes_.push_back(tuple_byte_size); - - vector tuple_string_slots; - vector tuple_coll_slots; - for (int j = 0; j < tuple_desc->slots().size(); ++j) { - SlotDescriptor* slot = tuple_desc->slots()[j]; - if (!slot->type().is_var_len_string_type()) continue; - if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) { - if (slot->type().is_var_len_string_type()) { - tuple_string_slots.push_back(slot); - } else { - DCHECK(slot->type().is_collection_type()); - tuple_coll_slots.push_back(slot); - } - } - } - if (!tuple_string_slots.empty()) { - inlined_string_slots_.push_back(make_pair(i, tuple_string_slots)); - } - /* - if (!tuple_coll_slots.empty()) { - inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots)); - } -*/ - } -} - -BufferedTupleStream3::~BufferedTupleStream3() { - DCHECK(closed_); -} - -void BufferedTupleStream3::CheckConsistencyFull() const { - CheckConsistencyFast(); - // The below checks require iterating over all the pages in the stream. - DCHECK_EQ(bytes_pinned_, CalcBytesPinned()) << DebugString(); - DCHECK_EQ(pages_.size(), num_pages_) << DebugString(); - for (const Page& page : pages_) CheckPageConsistency(&page); -} - -void BufferedTupleStream3::CheckConsistencyFast() const { - // All the below checks should be O(1). - DCHECK(has_write_iterator() || write_page_ == nullptr); - if (write_page_ != nullptr) { - CheckPageConsistency(write_page_); - DCHECK(write_page_->is_pinned()); - DCHECK(write_page_->retrieved_buffer); - const BufferHandle* write_buffer; - Status status = write_page_->GetBuffer(&write_buffer); - DCHECK(status.ok()); // Write buffer should never have been unpinned. 
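// The three checks below pin down the write-cursor invariant: the cursor always
// stays inside the current write buffer, i.e.
//
//     write_buffer->data() <= write_ptr_ <= write_end_ptr_
//                                           == write_buffer->data() + write_page_->len()
//
// Appends roughly test write_ptr_ + size <= write_end_ptr_ and otherwise take the
// AdvanceWritePage() slow path further down.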
- DCHECK_GE(write_ptr_, write_buffer->data()); - DCHECK_EQ(write_end_ptr_, write_buffer->data() + write_page_->len()); - DCHECK_GE(write_end_ptr_, write_ptr_); - } - DCHECK(has_read_iterator() || read_page_ == pages_.end()); - if (read_page_ != pages_.end()) { - CheckPageConsistency(&*read_page_); - DCHECK(read_page_->is_pinned()); - DCHECK(read_page_->retrieved_buffer); - // Can't check read buffer without affecting behaviour, because a read may be in - // flight and this would required blocking on that write. - DCHECK_GE(read_end_ptr_, read_ptr_); - } -} - -void BufferedTupleStream3::CheckPageConsistency(const Page* page) const { - DCHECK_EQ(ExpectedPinCount(pinned_, page), page->pin_count()) << DebugString(); - // Only one large row per page. - if (page->len() > default_page_len_) DCHECK_LE(page->num_rows, 1); - // We only create pages when we have a row to append to them. - DCHECK_GT(page->num_rows, 0); -} - -string BufferedTupleStream3::DebugString() const { - std::stringstream ss; - ss << "BufferedTupleStream3 num_rows=" << num_rows_ << " rows_returned=" << rows_returned_ - << " pinned=" << pinned_ << " delete_on_read=" << delete_on_read_ << " closed=" << closed_ - << "\n" - << " bytes_pinned=" << bytes_pinned_ << " has_write_iterator=" << has_write_iterator_ - << " write_page=" << write_page_ << " has_read_iterator=" << has_read_iterator_ - << " read_page="; - if (read_page_ == pages_.end()) { - ss << ""; - } else { - ss << &*read_page_; - } - ss << "\n # pages=" << num_pages_ << " pages=[\n"; - for (const Page& page : pages_) { - ss << "{" << page.DebugString() << "}"; - if (&page != &pages_.back()) ss << ",\n"; - } - ss << "]"; - return ss.str(); -} - -string BufferedTupleStream3::Page::DebugString() const { - //return Substitute("$0 num_rows=$1", handle.DebugString(), num_rows); - return string(""); -} - -Status BufferedTupleStream3::Init(int node_id, bool pinned) { - // if (!pinned) UnpinStream(UNPIN_ALL_EXCEPT_CURRENT); - node_id_ = node_id; - return Status::OK(); -} - -Status BufferedTupleStream3::PrepareForWrite() { - // This must be the first iterator created. - DCHECK(pages_.empty()); - DCHECK(!delete_on_read_); - DCHECK(!has_write_iterator()); - DCHECK(!has_read_iterator()); - CHECK_CONSISTENCY_FULL(); - - has_write_iterator_ = true; - return Status::OK(); -} - -Status BufferedTupleStream3::PrepareForReadWrite(bool delete_on_read) { - // This must be the first iterator created. - DCHECK(pages_.empty()); - DCHECK(!delete_on_read_); - DCHECK(!has_write_iterator()); - DCHECK(!has_read_iterator()); - CHECK_CONSISTENCY_FULL(); - - has_write_iterator_ = true; - RETURN_IF_ERROR(PrepareForReadInternal(delete_on_read)); - return Status::OK(); -} - -void BufferedTupleStream3::Close(RowBatch* batch, RowBatch::FlushMode flush) { - for (Page& page : pages_) { - if (batch != nullptr && page.retrieved_buffer) { - // Subtle: We only need to attach buffers from pages that we may have returned - // references to. ExtractBuffer() cannot fail for these pages because the data - // is guaranteed to already be in -memory. 
- BufferPool::BufferHandle buffer; - Status status = buffer_pool_->ExtractBuffer(buffer_pool_client_, &page.handle, &buffer); - DCHECK(status.ok()); - batch->add_buffer(buffer_pool_client_, std::move(buffer), flush); - } else { - buffer_pool_->DestroyPage(buffer_pool_client_, &page.handle); - } - } - pages_.clear(); - num_pages_ = 0; - bytes_pinned_ = 0; - closed_ = true; -} - -int64_t BufferedTupleStream3::CalcBytesPinned() const { - int64_t result = 0; - for (const Page& page : pages_) result += page.pin_count() * page.len(); - return result; -} - -Status BufferedTupleStream3::PinPage(Page* page) { - RETURN_IF_ERROR(buffer_pool_->Pin(buffer_pool_client_, &page->handle)); - bytes_pinned_ += page->len(); - return Status::OK(); -} - -int BufferedTupleStream3::ExpectedPinCount(bool stream_pinned, const Page* page) const { - return (stream_pinned || is_read_page(page) || is_write_page(page)) ? 1 : 0; -} - -Status BufferedTupleStream3::PinPageIfNeeded(Page* page, bool stream_pinned) { - int new_pin_count = ExpectedPinCount(stream_pinned, page); - if (new_pin_count != page->pin_count()) { - DCHECK_EQ(new_pin_count, page->pin_count() + 1); - RETURN_IF_ERROR(PinPage(page)); - } - return Status::OK(); -} - -void BufferedTupleStream3::UnpinPageIfNeeded(Page* page, bool stream_pinned) { - int new_pin_count = ExpectedPinCount(stream_pinned, page); - if (new_pin_count != page->pin_count()) { - DCHECK_EQ(new_pin_count, page->pin_count() - 1); - buffer_pool_->Unpin(buffer_pool_client_, &page->handle); - bytes_pinned_ -= page->len(); - if (page->pin_count() == 0) page->retrieved_buffer = false; - } -} - -Status BufferedTupleStream3::NewWritePage(int64_t page_len) noexcept { - DCHECK(!closed_); - DCHECK(write_page_ == nullptr); - - Page new_page; - const BufferHandle* write_buffer; - RETURN_IF_ERROR(buffer_pool_->CreatePage(buffer_pool_client_, page_len, &new_page.handle, - &write_buffer)); - bytes_pinned_ += page_len; - total_byte_size_ += page_len; - - pages_.push_back(std::move(new_page)); - ++num_pages_; - write_page_ = &pages_.back(); - DCHECK_EQ(write_page_->num_rows, 0); - write_ptr_ = write_buffer->data(); - write_end_ptr_ = write_ptr_ + page_len; - return Status::OK(); -} - -void BufferedTupleStream3::CalcPageLenForRow(int64_t row_size, int64_t* page_len) { - *page_len = std::max(default_page_len_, BitUtil::RoundUpToPowerOfTwo(row_size)); -} - -Status BufferedTupleStream3::AdvanceWritePage(int64_t row_size) noexcept { - DCHECK(has_write_iterator()); - CHECK_CONSISTENCY_FAST(); - - int64_t page_len; - - CalcPageLenForRow(row_size, &page_len); - ResetWritePage(); - //RETURN_IF_ERROR(NewWritePage(page_len)); - Status status = NewWritePage(page_len); - if (UNLIKELY(!status.ok())) { - return status; - } - return Status::OK(); -} - -void BufferedTupleStream3::ResetWritePage() { - if (write_page_ == nullptr) return; - // Unpin the write page if we're reading in unpinned mode. - Page* prev_write_page = write_page_; - write_page_ = nullptr; - write_ptr_ = nullptr; - write_end_ptr_ = nullptr; - - // May need to decrement pin count now that it's not the write page, depending on - // the stream's mode. 
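The pin accounting in ExpectedPinCount()/PinPageIfNeeded()/UnpinPageIfNeeded() above maintains a single invariant, sketched here for reference (standalone illustration; the real code inspects iterator state via is_read_page()/is_write_page()):

// A page is expected to hold exactly one pin iff the stream is pinned or the
// page is the current read or write page; otherwise it holds zero pins.
inline int expected_pin_count(bool stream_pinned, bool is_read_page, bool is_write_page) {
    return (stream_pinned || is_read_page || is_write_page) ? 1 : 0;
}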
- UnpinPageIfNeeded(prev_write_page, pinned_); -} - -void BufferedTupleStream3::InvalidateWriteIterator() { - if (!has_write_iterator()) return; - ResetWritePage(); - has_write_iterator_ = false; -} - -Status BufferedTupleStream3::NextReadPage() { - DCHECK(has_read_iterator()); - DCHECK(!closed_); - CHECK_CONSISTENCY_FAST(); - - if (read_page_ == pages_.end()) { - // No rows read yet - start reading at first page. If the stream is unpinned, we can - // use the reservation saved in PrepareForReadWrite() to pin the first page. - read_page_ = pages_.begin(); - } else if (delete_on_read_) { - DCHECK(read_page_ == pages_.begin()) << read_page_->DebugString() << " " << DebugString(); - DCHECK_NE(&*read_page_, write_page_); - bytes_pinned_ -= pages_.front().len(); - buffer_pool_->DestroyPage(buffer_pool_client_, &pages_.front().handle); - pages_.pop_front(); - --num_pages_; - read_page_ = pages_.begin(); - } else { - // Unpin pages after reading them if needed. - Page* prev_read_page = &*read_page_; - ++read_page_; - UnpinPageIfNeeded(prev_read_page, pinned_); - } - - if (read_page_ == pages_.end()) { - CHECK_CONSISTENCY_FULL(); - return Status::OK(); - } - - // Ensure the next page is pinned for reading. By this point we should have enough - // reservation to pin the page. If the stream is pinned, the page is already pinned. - // If the stream is unpinned, we freed up enough memory for a default-sized page by - // deleting or unpinning the previous page and ensured that, if the page was larger, - // that the reservation is available with the above check. - RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_)); - - // This waits for the pin to complete if the page was unpinned earlier. - const BufferHandle* read_buffer; - RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer)); - - read_page_rows_returned_ = 0; - read_ptr_ = read_buffer->data(); - read_end_ptr_ = read_ptr_ + read_buffer->len(); - - CHECK_CONSISTENCY_FAST(); - return Status::OK(); -} - -void BufferedTupleStream3::InvalidateReadIterator() { - if (read_page_ != pages_.end()) { - // Unpin the write page if we're reading in unpinned mode. - Page* prev_read_page = &*read_page_; - read_page_ = pages_.end(); - read_ptr_ = nullptr; - read_end_ptr_ = nullptr; - - // May need to decrement pin count after destroying read iterator. - UnpinPageIfNeeded(prev_read_page, pinned_); - } - has_read_iterator_ = false; - // It is safe to re-read a delete-on-read stream if no rows were read and no pages - // were therefore deleted. - if (rows_returned_ == 0) delete_on_read_ = false; -} - -Status BufferedTupleStream3::PrepareForRead(bool delete_on_read) { - CHECK_CONSISTENCY_FULL(); - InvalidateWriteIterator(); - InvalidateReadIterator(); - return PrepareForReadInternal(delete_on_read); -} - -Status BufferedTupleStream3::PrepareForReadInternal(bool delete_on_read) { - DCHECK(!closed_); - DCHECK(!delete_on_read_); - DCHECK(!has_read_iterator()); - - has_read_iterator_ = true; - if (pages_.empty()) { - // No rows to return, or a the first read/write page has not yet been allocated. - read_page_ = pages_.end(); - read_ptr_ = nullptr; - read_end_ptr_ = nullptr; - } else { - // Eagerly pin the first page in the stream. - read_page_ = pages_.begin(); - // Check if we need to increment the pin count of the read page. - RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_)); - DCHECK(read_page_->is_pinned()); - - // This waits for the pin to complete if the page was unpinned earlier. 
-        const BufferHandle* read_buffer;
-        RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
-        read_ptr_ = read_buffer->data();
-        read_end_ptr_ = read_ptr_ + read_buffer->len();
-    }
-    read_page_rows_returned_ = 0;
-    rows_returned_ = 0;
-    delete_on_read_ = delete_on_read;
-    CHECK_CONSISTENCY_FULL();
-    return Status::OK();
-}
-
-Status BufferedTupleStream3::PinStream(bool* pinned) {
-    DCHECK(!closed_);
-    CHECK_CONSISTENCY_FULL();
-    if (pinned_) {
-        *pinned = true;
-        return Status::OK();
-    }
-    *pinned = false;
-
-    // At this point success is guaranteed - go through to pin the pages we need to pin.
-    // If the page data was evicted from memory, the read I/O can happen in parallel
-    // because we defer calling GetBuffer() until NextReadPage().
-    for (Page& page : pages_) RETURN_IF_ERROR(PinPageIfNeeded(&page, true));
-
-    pinned_ = true;
-    *pinned = true;
-    CHECK_CONSISTENCY_FULL();
-    return Status::OK();
-}
-/*
-void BufferedTupleStream3::UnpinStream(UnpinMode mode) {
-    CHECK_CONSISTENCY_FULL();
-    DCHECK(!closed_);
-    if (mode == UNPIN_ALL) {
-        // Invalidate the iterators so they don't keep pages pinned.
-        InvalidateWriteIterator();
-        InvalidateReadIterator();
-    }
-
-    if (pinned_) {
-        CHECK_CONSISTENCY_FULL();
-        // If the stream was pinned, there may be some remaining pinned pages that should
-        // be unpinned at this point.
-        for (Page& page : pages_) UnpinPageIfNeeded(&page, false);
-
-        pinned_ = false;
-    }
-    CHECK_CONSISTENCY_FULL();
-}
-*/
-Status BufferedTupleStream3::GetRows(std::unique_ptr<RowBatch>* batch, bool* got_rows) {
-    if (num_rows() > numeric_limits<int>::max()) {
-        // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
-        return Status::InternalError(
-                "Trying to read {} rows into in-memory batch failed. Limit "
-                "is {}",
-                num_rows(), numeric_limits<int>::max());
-    }
-    RETURN_IF_ERROR(PinStream(got_rows));
-    if (!*got_rows) return Status::OK();
-    RETURN_IF_ERROR(PrepareForRead(false));
-
-    // TODO chenhao
-    // capacity in RowBatch use int, but _num_rows is int64_t
-    // it may be precision loss
-    batch->reset(new RowBatch(*desc_, num_rows()));
-    bool eos = false;
-    // Loop until GetNext fills the entire batch. Each call can stop at page
-    // boundaries. We generally want it to stop, so that pages can be freed
-    // as we read. It is safe in this case because we pin the entire stream.
-    while (!eos) {
-        RETURN_IF_ERROR(GetNext(batch->get(), &eos));
-    }
-    return Status::OK();
-}
-
-Status BufferedTupleStream3::GetNext(RowBatch* batch, bool* eos) {
-    return GetNextInternal<false>(batch, eos, nullptr);
-}
-
-Status BufferedTupleStream3::GetNext(RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
-    return GetNextInternal<true>(batch, eos, flat_rows);
-}
-
-template <bool FILL_FLAT_ROWS>
-Status BufferedTupleStream3::GetNextInternal(RowBatch* batch, bool* eos,
-                                             vector<FlatRowPtr>* flat_rows) {
-    if (has_nullable_tuple_) {
-        return GetNextInternal<FILL_FLAT_ROWS, true>(batch, eos, flat_rows);
-    } else {
-        return GetNextInternal<FILL_FLAT_ROWS, false>(batch, eos, flat_rows);
-    }
-}
-
-template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
-Status BufferedTupleStream3::GetNextInternal(RowBatch* batch, bool* eos,
-                                             vector<FlatRowPtr>* flat_rows) {
-    DCHECK(!closed_);
-    DCHECK(batch->row_desc().equals(*desc_));
-    DCHECK(is_pinned() || !FILL_FLAT_ROWS) << "FlatRowPtrs are only valid for pinned streams";
-    *eos = (rows_returned_ == num_rows_);
-    if (*eos) return Status::OK();
-
-    if (UNLIKELY(read_page_ == pages_.end() || read_page_rows_returned_ == read_page_->num_rows)) {
-        // Get the next page in the stream (or the first page if read_page_ was not yet
-        // initialized.)
We need to do this at the beginning of the GetNext() call to ensure - // the buffer management semantics. NextReadPage() may unpin or delete the buffer - // backing the rows returned from the *previous* call to GetNext(). - RETURN_IF_ERROR(NextReadPage()); - } - - DCHECK(has_read_iterator()); - DCHECK(read_page_ != pages_.end()); - DCHECK(read_page_->is_pinned()) << DebugString(); - DCHECK_GE(read_page_rows_returned_, 0); - - int rows_left_in_page = read_page_->num_rows - read_page_rows_returned_; - int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_page); - DCHECK_GE(rows_to_fill, 1); - uint8_t* tuple_row_mem = reinterpret_cast(batch->get_row(batch->num_rows())); - - // Produce tuple rows from the current page and the corresponding position on the - // null tuple indicator. - if (FILL_FLAT_ROWS) { - DCHECK(flat_rows != nullptr); - DCHECK(!delete_on_read_); - DCHECK_EQ(batch->num_rows(), 0); - flat_rows->clear(); - flat_rows->reserve(rows_to_fill); - } - - const uint64_t tuples_per_row = desc_->tuple_descriptors().size(); - // Start reading from the current position in 'read_page_'. - for (int i = 0; i < rows_to_fill; ++i) { - if (FILL_FLAT_ROWS) { - flat_rows->push_back(read_ptr_); - DCHECK_EQ(flat_rows->size(), i + 1); - } - // Copy the row into the output batch. - TupleRow* output_row = reinterpret_cast(tuple_row_mem); - tuple_row_mem += sizeof(Tuple*) * tuples_per_row; - UnflattenTupleRow(&read_ptr_, output_row); - - // Update string slot ptrs, skipping external strings. - for (int j = 0; j < inlined_string_slots_.size(); ++j) { - Tuple* tuple = output_row->get_tuple(inlined_string_slots_[j].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - FixUpStringsForRead(inlined_string_slots_[j].second, tuple); - } - /* - // Update collection slot ptrs, skipping external collections. We traverse the - // collection structure in the same order as it was written to the stream, allowing - // us to infer the data layout based on the length of collections and strings. - for (int j = 0; j < inlined_coll_slots_.size(); ++j) { - Tuple* tuple = output_row->get_tuple(inlined_coll_slots_[j].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple); - } -*/ - } - - batch->commit_rows(rows_to_fill); - rows_returned_ += rows_to_fill; - read_page_rows_returned_ += rows_to_fill; - *eos = (rows_returned_ == num_rows_); - if (read_page_rows_returned_ == read_page_->num_rows && (!pinned_ || delete_on_read_)) { - // No more data in this page. The batch must be immediately returned up the operator - // tree and deep copied so that NextReadPage() can reuse the read page's buffer. - // TODO: IMPALA-4179 - instead attach the buffer and flush the resources. 
- batch->mark_needs_deep_copy(); - } - if (FILL_FLAT_ROWS) DCHECK_EQ(flat_rows->size(), rows_to_fill); - DCHECK_LE(read_ptr_, read_end_ptr_); - return Status::OK(); -} - -void BufferedTupleStream3::FixUpStringsForRead(const vector& string_slots, - Tuple* tuple) { - DCHECK(tuple != nullptr); - for (const SlotDescriptor* slot_desc : string_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - - StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - DCHECK_LE(read_ptr_ + sv->len, read_end_ptr_); - sv->ptr = reinterpret_cast(read_ptr_); - read_ptr_ += sv->len; - } -} -/* -void BufferedTupleStream3::FixUpCollectionsForRead( - const vector& collection_slots, Tuple* tuple) { - DCHECK(tuple != nullptr); - for (const SlotDescriptor* slot_desc : collection_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - - CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset()); - const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor(); - int coll_byte_size = cv->num_tuples * item_desc.byte_size(); - DCHECK_LE(read_ptr_ + coll_byte_size, read_end_ptr_); - cv->ptr = reinterpret_cast(read_ptr_); - read_ptr_ += coll_byte_size; - - if (!item_desc.has_varlen_slots()) continue; - uint8_t* coll_data = cv->ptr; - for (int i = 0; i < cv->num_tuples; ++i) { - Tuple* item = reinterpret_cast(coll_data); - FixUpStringsForRead(item_desc.string_slots(), item); - FixUpCollectionsForRead(item_desc.collection_slots(), item); - coll_data += item_desc.byte_size(); - } - } -} -*/ -int64_t BufferedTupleStream3::ComputeRowSize(TupleRow* row) const noexcept { - int64_t size = 0; - if (has_nullable_tuple_) { - size += NullIndicatorBytesPerRow(); - for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) { - if (row->get_tuple(i) != nullptr) size += fixed_tuple_sizes_[i]; - } - } else { - for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) { - size += fixed_tuple_sizes_[i]; - } - } - for (int i = 0; i < inlined_string_slots_.size(); ++i) { - Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first); - if (tuple == nullptr) continue; - const vector& slots = inlined_string_slots_[i].second; - for (auto it = slots.begin(); it != slots.end(); ++it) { - if (tuple->is_null((*it)->null_indicator_offset())) continue; - size += tuple->get_string_slot((*it)->tuple_offset())->len; - } - } - - /* - for (int i = 0; i < inlined_coll_slots_.size(); ++i) { - Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first); - if (tuple == nullptr) continue; - const vector& slots = inlined_coll_slots_[i].second; - for (auto it = slots.begin(); it != slots.end(); ++it) { - if (tuple->is_null((*it)->null_indicator_offset())) continue; - CollectionValue* cv = tuple->get_collection_slot((*it)->tuple_offset()); - const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor(); - size += cv->num_tuples * item_desc.byte_size(); - - if (!item_desc.has_varlen_slots()) continue; - for (int j = 0; j < cv->num_tuples; ++j) { - Tuple* item = reinterpret_cast(&cv->ptr[j * item_desc.byte_size()]); - size += item->varlen_byte_size(item_desc); - } - } - } -*/ - return size; -} - -bool BufferedTupleStream3::AddRowSlow(TupleRow* row, Status* status) noexcept { - // Use AddRowCustom*() to do the work of advancing the page. 
- int64_t row_size = ComputeRowSize(row); - uint8_t* data = AddRowCustomBeginSlow(row_size, status); - if (data == nullptr) return false; - bool success = DeepCopy(row, &data, data + row_size); - DCHECK(success); - DCHECK_EQ(data, write_ptr_); - AddRowCustomEnd(row_size); - return true; -} - -uint8_t* BufferedTupleStream3::AddRowCustomBeginSlow(int64_t size, Status* status) noexcept { - *status = AdvanceWritePage(size); - if (!status->ok()) { - return nullptr; - } - // We have a large-enough page so now success is guaranteed. - uint8_t* result = AddRowCustomBegin(size, status); - DCHECK(result != nullptr); - return result; -} - -void BufferedTupleStream3::AddLargeRowCustomEnd(int64_t size) noexcept { - DCHECK_GT(size, default_page_len_); - // Immediately unpin the large write page so that we're not using up extra reservation - // and so we don't append another row to the page. - ResetWritePage(); - // The stream should be in a consistent state once the row is added. - CHECK_CONSISTENCY_FAST(); -} - -bool BufferedTupleStream3::AddRow(TupleRow* row, Status* status) noexcept { - DCHECK(!closed_); - DCHECK(has_write_iterator()); - if (UNLIKELY(write_page_ == nullptr || !DeepCopy(row, &write_ptr_, write_end_ptr_))) { - return AddRowSlow(row, status); - } - ++num_rows_; - ++write_page_->num_rows; - return true; -} - -bool BufferedTupleStream3::DeepCopy(TupleRow* row, uint8_t** data, - const uint8_t* data_end) noexcept { - return has_nullable_tuple_ ? DeepCopyInternal(row, data, data_end) - : DeepCopyInternal(row, data, data_end); -} - -// TODO: consider codegening this. -// TODO: in case of duplicate tuples, this can redundantly serialize data. -template -bool BufferedTupleStream3::DeepCopyInternal(TupleRow* row, uint8_t** data, - const uint8_t* data_end) noexcept { - uint8_t* pos = *data; - const uint64_t tuples_per_row = desc_->tuple_descriptors().size(); - // Copy the not nullptr fixed len tuples. For the nullptr tuples just update the nullptr tuple - // indicator. - if (HAS_NULLABLE_TUPLE) { - int null_indicator_bytes = NullIndicatorBytesPerRow(); - if (UNLIKELY(pos + null_indicator_bytes > data_end)) return false; - uint8_t* null_indicators = pos; - pos += NullIndicatorBytesPerRow(); - memset(null_indicators, 0, null_indicator_bytes); - for (int i = 0; i < tuples_per_row; ++i) { - uint8_t* null_word = null_indicators + (i >> 3); - const uint32_t null_pos = i & 7; - const int tuple_size = fixed_tuple_sizes_[i]; - Tuple* t = row->get_tuple(i); - const uint8_t mask = 1 << (7 - null_pos); - if (t != nullptr) { - if (UNLIKELY(pos + tuple_size > data_end)) return false; - memcpy(pos, t, tuple_size); - pos += tuple_size; - } else { - *null_word |= mask; - } - } - } else { - // If we know that there are no nullable tuples no need to set the nullability flags. - for (int i = 0; i < tuples_per_row; ++i) { - const int tuple_size = fixed_tuple_sizes_[i]; - if (UNLIKELY(pos + tuple_size > data_end)) return false; - Tuple* t = row->get_tuple(i); - // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots) - // is delivered, the check below should become DCHECK(t != nullptr). - DCHECK(t != nullptr || tuple_size == 0); - memcpy(pos, t, tuple_size); - pos += tuple_size; - } - } - - // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets - // on the write path, only on the read. The tuple data is immediately followed - // by the string data so only the len information is necessary. 
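As the comment above notes, only the length field matters at write time: the string bytes land immediately after the fixed-length tuple data, and ptr fields are fixed up lazily on read. A minimal sketch of that append step (an illustrative helper mirroring CopyStrings(), not part of the original file):

#include <cstdint>
#include <cstring>

// Appends 'len' string bytes at *data, bounds-checked against the page end.
inline bool copy_string_bytes(const uint8_t* src, int len, uint8_t** data,
                              const uint8_t* data_end) {
    if (*data + len > data_end) return false; // row does not fit in this page
    std::memcpy(*data, src, len);
    *data += len;
    return true;
}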
- for (int i = 0; i < inlined_string_slots_.size(); ++i) { - const Tuple* tuple = row->get_tuple(inlined_string_slots_[i].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second, &pos, data_end))) - return false; - } - /* - // Copy inlined collection slots. We copy collection data in a well-defined order so - // we do not need to convert pointers to offsets on the write path. - for (int i = 0; i < inlined_coll_slots_.size(); ++i) { - const Tuple* tuple = row->get_tuple(inlined_coll_slots_[i].first); - if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue; - if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second, &pos, data_end))) - return false; - } -*/ - *data = pos; - return true; -} - -bool BufferedTupleStream3::CopyStrings(const Tuple* tuple, - const vector& string_slots, uint8_t** data, - const uint8_t* data_end) { - for (const SlotDescriptor* slot_desc : string_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - const StringValue* sv = tuple->get_string_slot(slot_desc->tuple_offset()); - if (LIKELY(sv->len > 0)) { - if (UNLIKELY(*data + sv->len > data_end)) return false; - - memcpy(*data, sv->ptr, sv->len); - *data += sv->len; - } - } - return true; -} -/* -bool BufferedTupleStream3::CopyCollections(const Tuple* tuple, - const vector& collection_slots, uint8_t** data, const uint8_t* data_end) { - for (const SlotDescriptor* slot_desc : collection_slots) { - if (tuple->is_null(slot_desc->null_indicator_offset())) continue; - const CollectionValue* cv = tuple->get_collection_slot(slot_desc->tuple_offset()); - const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor(); - if (LIKELY(cv->num_tuples > 0)) { - int coll_byte_size = cv->num_tuples * item_desc.byte_size(); - if (UNLIKELY(*data + coll_byte_size > data_end)) return false; - uint8_t* coll_data = *data; - memcpy(coll_data, cv->ptr, coll_byte_size); - *data += coll_byte_size; - - if (!item_desc.has_varlen_slots()) continue; - // Copy variable length data when present in collection items. - for (int i = 0; i < cv->num_tuples; ++i) { - const Tuple* item = reinterpret_cast(coll_data); - if (UNLIKELY(!CopyStrings(item, item_desc.string_slots(), data, data_end))) { - return false; - } - if (UNLIKELY( - !CopyCollections(item, item_desc.collection_slots(), data, data_end))) { - return false; - } - coll_data += item_desc.byte_size(); - } - } - } - return true; -} -*/ -void BufferedTupleStream3::GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const { - DCHECK(row != nullptr); - DCHECK(!closed_); - DCHECK(is_pinned()); - DCHECK(!delete_on_read_); - uint8_t* data = flat_row; - return has_nullable_tuple_ ? UnflattenTupleRow(&data, row) - : UnflattenTupleRow(&data, row); -} - -template -void BufferedTupleStream3::UnflattenTupleRow(uint8_t** data, TupleRow* row) const { - const int tuples_per_row = desc_->tuple_descriptors().size(); - uint8_t* ptr = *data; - if (has_nullable_tuple_) { - // Stitch together the tuples from the page and the nullptr ones. 
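The stitching below reads the same MSB-first bitstring that DeepCopyInternal() writes: bit i of the row's null-indicator bytes is 1 when tuple i is null. The test, expressed as a standalone helper (illustrative name, not from the original):

#include <cstdint>

inline bool tuple_is_null(const uint8_t* null_indicators, int i) {
    return (null_indicators[i >> 3] & (1 << (7 - (i & 7)))) != 0;
}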
- const uint8_t* null_indicators = ptr; - ptr += NullIndicatorBytesPerRow(); - for (int i = 0; i < tuples_per_row; ++i) { - const uint8_t* null_word = null_indicators + (i >> 3); - const uint32_t null_pos = i & 7; - const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0); - row->set_tuple(i, - reinterpret_cast(reinterpret_cast(ptr) * is_not_null)); - ptr += fixed_tuple_sizes_[i] * is_not_null; - } - } else { - for (int i = 0; i < tuples_per_row; ++i) { - row->set_tuple(i, reinterpret_cast(ptr)); - ptr += fixed_tuple_sizes_[i]; - } - } - *data = ptr; -} diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h deleted file mode 100644 index a225b5d892..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.h +++ /dev/null @@ -1,647 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.h -// and modified by Doris - -#pragma once - -#include -#include -#include - -#include "common/global_types.h" -#include "common/status.h" -#include "gutil/macros.h" -#include "runtime/bufferpool/buffer_pool.h" -#include "runtime/row_batch.h" - -namespace doris { - -class RuntimeState; -class RowDescriptor; -class SlotDescriptor; -class Tuple; -class TupleRow; - -/// Class that provides an abstraction for a stream of tuple rows backed by BufferPool -/// Pages. Rows can be added to the stream and read back. Rows are returned in the order -/// they are added. -/// -/// The BufferedTupleStream3 is *not* thread safe from the caller's point of view. -/// Different threads should not concurrently call methods of the same BufferedTupleStream3 -/// object. -/// -/// Reading and writing the stream: -/// The stream supports two modes of reading/writing, depending on whether -/// PrepareForWrite() is called to initialize a write iterator only or -/// PrepareForReadWrite() is called to initialize both read and write iterators to enable -/// interleaved reads and writes. -/// -/// To use write-only mode, PrepareForWrite() is called once and AddRow()/AddRowCustom*() -/// are called repeatedly to initialize then advance a write iterator through the stream. -/// Once the stream is fully written, it can be read back by calling PrepareForRead() -/// then GetNext() repeatedly to advance a read iterator through the stream, or by -/// calling GetRows() to get all of the rows at once. -/// -/// To use read/write mode, PrepareForReadWrite() is called once to initialize the read -/// and write iterators. AddRow()/AddRowCustom*() then advance a write iterator through -/// the stream, and GetNext() advances a trailing read iterator through the stream. 
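Putting the two modes together, a hedged usage sketch of the write-then-read path described above, using only the public API declared later in this header. The wrapper name, the 'rows' source, and the prior construction of the stream, batch, and buffer-pool client are all assumptions for illustration; the AddRow() retry path is deliberately elided.

// Sketch only, under the assumptions stated above.
Status write_then_read(BufferedTupleStream3* stream, const std::vector<TupleRow*>& rows,
                       RowBatch* batch, int node_id) {
    RETURN_IF_ERROR(stream->Init(node_id, /*pinned=*/true));
    RETURN_IF_ERROR(stream->PrepareForWrite());
    Status status;
    for (TupleRow* row : rows) {
        // A false return with OK status means "retry after freeing memory";
        // that path is elided here.
        if (!stream->AddRow(row, &status)) RETURN_IF_ERROR(status);
    }
    RETURN_IF_ERROR(stream->PrepareForRead(/*delete_on_read=*/true));
    bool eos = false;
    while (!eos) RETURN_IF_ERROR(stream->GetNext(batch, &eos));
    stream->Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
    return Status::OK();
}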
-/// -/// Buffer management: -/// The tuple stream is backed by a sequence of BufferPool Pages. The tuple stream uses -/// the client's reservation to pin pages in memory. It will automatically try to -/// increase the client's reservation whenever it needs to do so to make progress. -/// -/// Normally pages are all of the same default page length, but larger pages up to the -/// max page length are used if needed to store rows that are too large for a -/// default-length page. -/// -/// The stream has both pinned and unpinned modes. In the pinned mode all pages are -/// pinned for reading. The pinned mode avoids I/O by keeping all pages pinned in memory -/// and allows clients to save pointers to rows in the stream and randomly access them. -/// E.g. hash tables can be backed by a BufferedTupleStream3. In the unpinned mode, only -/// pages currently being read and written are pinned and other pages are unpinned and -/// therefore do not use the client's reservation and can be spilled to disk. The stream -/// always holds onto a default page's worth of reservation for the read and write -/// iterators (i.e. two page's worth if the stream is in read/write mode), even if that -/// many pages are not currently pinned. This means that UnpinStream() always succeeds, -/// and moving to the next default-length write page or read page on an unpinned stream -/// does not require additional reservation. This is implemented by saving reservations -/// in SubReservations. -/// -/// To read or write a row larger than the default page size to/from an unpinned stream, -/// the client must have max_page_len - default_page_len unused reservation. Writing a -/// large row to an unpinned stream only uses the reservation for the duration of the -/// AddRow()/AddRowCustom*() call. Reading a large row from an unpinned stream uses the -/// reservation until the next call to GetNext(). E.g. to partition a single unpinned -/// stream into n unpinned streams, the reservation needed is (n - 1) * -/// default_page_len + 2 * max_page_len: one large read buffer and one large write -/// buffer is needed to keep the row being processed in-memory, but only default-sized -/// buffers are needed for the other streams being written. -/// -/// The tuple stream also supports a 'delete_on_read' mode, enabled by passing a flag -/// to PrepareForRead() which deletes the stream's pages as it does a final read -/// pass over the stream. -/// -/// TODO: IMPALA-4179: the buffer management can be simplified once we can attach -/// buffers to RowBatches. -/// -/// Page layout: -/// Rows are stored back to back starting at the first byte of each page's buffer, with -/// no interleaving of data from different rows. There is no padding or alignment -/// between rows. Rows larger than the default page length are stored on their own -/// page. -/// -/// Tuple row layout: -/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a -/// bitstring at the start of each row with null indicators for all tuples in each row -/// (including non-nullable tuples). The bitstring occupies ceil(num_tuples_per_row / 8) -/// bytes. A 1 indicates the tuple is null. -/// -/// The fixed length parts of the row's tuples are stored first, followed by var len data -/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can -/// point to var len data outside the stream. When reading the stream, the length of each -/// row's var len data in the stream must be computed to find the next row's start. 
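To make the reservation bound quoted above concrete, a worked example with illustrative sizes (not Doris defaults): partitioning one unpinned stream into n = 16 unpinned streams with default_page_len = 64 KB and max_page_len = 2048 KB needs (16 - 1) * 64 KB + 2 * 2048 KB = 960 KB + 4096 KB = 5056 KB.

#include <cstdint>

// The bound quoted above, as a helper (illustrative name):
inline int64_t min_partition_reservation(int64_t n, int64_t default_page_len,
                                         int64_t max_page_len) {
    return (n - 1) * default_page_len + 2 * max_page_len;
}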
-///
-/// The tuple stream supports reading from the stream into RowBatches without copying
-/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
-/// pages' buffers. The fixed length parts follow Impala's internal tuple format, so for
-/// the tuple to be valid, we only need to update pointers to point to the var len data
-/// in the stream. These pointers need to be updated by the stream because a spilled
-/// page's data may be relocated to a different buffer. The pointers are updated lazily
-/// upon reading the stream via GetNext() or GetRows().
-///
-/// Example layout for a row with two non-nullable tuples ((1, "hello"), (2, "world"))
-/// with all var len data stored in the stream:
-/// <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
-/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
-///
-/// Example layout for a row with the second tuple nullable ((1, "hello"), nullptr)
-/// with all var len data stored in the stream:
-/// <- null tuple bitstring -> <---- tuple 1 -----> <- var len -> <- next row ...
-/// +-------------------------+--------+-----------+------------+
-/// |                         | IntVal | StringVal |            | ...
-/// +-------------------------+--------+-----------+------------+
-/// | 0000 0010               | val: 1 | len: 5    | hello      | ...
-/// |                         |        | ptr: 0x.. |            | ...
-/// +-------------------------+--------+-----------+------------+
-/// <---------1b------------> <--4b--> <---12b---> <----5b---->
-///
-/// Example layout for a row with a single non-nullable tuple (("hello", "world")) with
-/// the second string slot stored externally to the stream:
-/// <------ tuple 1 ------> <- var len -> <- next row ...
-/// +-----------+-----------+-------------+
-/// | StringVal | StringVal |             | ...
-/// +-----------+-----------+-------------+
-/// | len: 5    | len: 5    | hello       | ...
-/// | ptr: 0x.. | ptr: 0x.. |             | ...
-/// +-----------+-----------+-------------+
-/// <---12b---> <---12b---> <-----5b---->
-///
-/// The behavior of reads and writes is as follows:
-/// Read:
-///   1. Unpinned: Only a single read page is pinned at a time. This means that only
-///      enough reservation to pin a single page is needed to read the stream, regardless
-///      of the stream's size. Each page is deleted or unpinned (if delete on read is true
-///      or false respectively) before advancing to the next page.
-///   2. Pinned: All pages in the stream are pinned so do not need to be pinned or
-///      unpinned when reading from the stream. If delete on read is true, pages are
-///      deleted after being read. If the stream was previously unpinned, the page's data
-///      may not yet be in memory - reading from the stream can block on I/O or fail with
-///      an I/O error.
-/// Write:
-///   1. Unpinned: Unpin pages as they fill up. This means that only enough reservation
-///      to pin a single write page is required to write to the stream, regardless of the
-///      stream's size.
-///   2. Pinned: Pages are left pinned.
If the next page in the stream cannot be pinned -/// because the client's reservation is insufficient (and could not be increased by -/// the stream), the read call will fail and the client can either unpin the stream -/// or free up other memory before retrying. -/// -/// Memory lifetime of rows read from stream: -/// If the stream is pinned and delete on read is false, it is valid to access any tuples -/// returned via GetNext() or GetRows() until the stream is unpinned. If the stream is -/// unpinned or delete on read is true, then the batch returned from GetNext() may have -/// the needs_deep_copy flag set, which means that any tuple memory returned so far from -/// the stream may be freed on the next call to GetNext(). -/// TODO: IMPALA-4179, instead of needs_deep_copy, attach the pages' buffers to the batch. -/// -/// Manual construction of rows with AddRowCustomBegin()/AddRowCustomEnd(): -/// The BufferedTupleStream3 supports allocation of uninitialized rows with -/// AddRowCustom*(). AddRowCustomBegin() is called instead of AddRow() if the client wants -/// to manually construct a row. The caller of AddRowCustomBegin() is responsible for -/// writing the row with exactly the layout described above then calling -/// AddRowCustomEnd() when done. -/// -/// If a caller constructs a tuple in this way, the caller can set the pointers and they -/// will not be modified until the stream is read via GetNext() or GetRows(). -/// TODO: IMPALA-5007: try to remove AddRowCustom*() by unifying with AddRow(). -/// -/// TODO: we need to be able to do read ahead for pages. We need some way to indicate a -/// page will need to be pinned soon. -class BufferedTupleStream3 { -public: - /// A pointer to the start of a flattened TupleRow in the stream. - typedef uint8_t* FlatRowPtr; - - /// row_desc: description of rows stored in the stream. This is the desc for rows - /// that are added and the rows being returned. - /// page_len: the size of pages to use in the stream - /// ext_varlen_slots: set of varlen slots with data stored externally to the stream - BufferedTupleStream3(RuntimeState* state, const RowDescriptor* row_desc, - BufferPool::ClientHandle* buffer_pool_client, int64_t default_page_len, - const std::set& ext_varlen_slots = std::set()); - - virtual ~BufferedTupleStream3(); - - /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called - /// once before any of the other APIs. - /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned. - /// 'node_id' is only used for error reporting. - Status Init(int node_id, bool pinned) WARN_UNUSED_RESULT; - - /// Prepares the stream for writing by saving enough reservation for a default-size - /// write page. Tries to increase reservation if there is not enough unused reservation - /// for a page. Called after Init() and before the first AddRow() or - /// AddRowCustomBegin() call. - /// 'got_reservation': set to true if there was enough reservation to initialize the - /// first write page and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForWrite() WARN_UNUSED_RESULT; - - /// Prepares the stream for interleaved reads and writes by saving enough reservation - /// for default-sized read and write pages. Called after Init() and before the first - /// AddRow() or AddRowCustomBegin() call. - /// 'delete_on_read': Pages are deleted after they are read. 
- /// 'got_reservation': set to true if there was enough reservation to initialize the - /// read and write pages and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForReadWrite(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Prepares the stream for reading, invalidating the write iterator (if there is one). - /// Therefore must be called after the last AddRow() or AddRowCustomEnd() and before - /// GetNext(). PrepareForRead() can be called multiple times to do multiple read passes - /// over the stream, unless rows were read from the stream after PrepareForRead() or - /// PrepareForReadWrite() was called with delete_on_read = true. - /// 'delete_on_read': Pages are deleted after they are read. - /// 'got_reservation': set to true if there was enough reservation to initialize the - /// first read page and false if there was not enough reservation and no other - /// error was encountered. Undefined if an error status is returned. - Status PrepareForRead(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Adds a single row to the stream. There are three possible outcomes: - /// a) The append succeeds. True is returned. - /// b) The append fails because the unused reservation was not sufficient to add - /// a new page to the stream large enough to fit 'row' and the stream could not - /// increase the reservation to get enough unused reservation. Returns false and - /// sets 'status' to OK. The append can be retried after freeing up memory or - /// unpinning the stream. - /// c) The append fails with a runtime error. Returns false and sets 'status' to an - /// error. - /// d) The append fails because the row is too large to fit in a page of a stream. - /// Returns false and sets 'status' to an error. - /// - /// Unpinned streams can only encounter case b) when appending a row larger than - /// the default page size and the reservation could not be increased sufficiently. - /// Otherwise enough memory is automatically freed up by unpinning the current write - /// page. - /// - /// BufferedTupleStream3 will do a deep copy of the memory in the row. After AddRow() - /// returns an error, it should not be called again. - bool AddRow(TupleRow* row, Status* status) noexcept WARN_UNUSED_RESULT; - - /// Allocates space to store a row of 'size' bytes (including fixed and variable length - /// data). If successful, returns a pointer to the allocated row. The caller then must - /// writes valid data to the row and call AddRowCustomEnd(). - /// - /// If unsuccessful, returns nullptr. The failure modes are the same as described in the - /// AddRow() comment. - ALWAYS_INLINE uint8_t* AddRowCustomBegin(int64_t size, Status* status); - - /// Called after AddRowCustomBegin() when done writing the row. Only should be called - /// if AddRowCustomBegin() succeeded. See the AddRowCustomBegin() comment for - /// explanation. - /// 'size': the size passed into AddRowCustomBegin(). - void AddRowCustomEnd(int64_t size); - - /// Unflattens 'flat_row' into a regular TupleRow 'row'. Only valid to call if the - /// stream is pinned. The row must have been allocated with the stream's row desc. - /// The returned 'row' is backed by memory from the stream so is only valid as long - /// as the stream is pinned. - void GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const; - - /// Pins all pages in this stream and switches to pinned mode. Has no effect if the - /// stream is already pinned. 
- /// If the current unused reservation is not sufficient to pin the stream in memory, - /// this will try to increase the reservation. If that fails, 'pinned' is set to false - /// and the stream is left unpinned. Otherwise 'pinned' is set to true. - Status PinStream(bool* pinned) WARN_UNUSED_RESULT; - - /// Modes for UnpinStream(). - enum UnpinMode { - /// All pages in the stream are unpinned and the read/write positions in the stream - /// are reset. No more rows can be written to the stream after this. The stream can - /// be re-read from the beginning by calling PrepareForRead(). - UNPIN_ALL, - /// All pages are unpinned aside from the current read and write pages (if any), - /// which is left in the same state. The unpinned stream can continue being read - /// or written from the current read or write positions. - UNPIN_ALL_EXCEPT_CURRENT, - }; - - /// Unpins stream with the given 'mode' as described above. - void UnpinStream(UnpinMode mode); - - /// Get the next batch of output rows, which are backed by the stream's memory. - /// If the stream is unpinned or 'delete_on_read' is true, the 'needs_deep_copy' - /// flag may be set on 'batch' to signal that memory will be freed on the next - /// call to GetNext() and that the caller should copy out any data it needs from - /// rows in 'batch' or in previous batches returned from GetNext(). - /// - /// If the stream is pinned and 'delete_on_read' is false, the memory backing the - /// rows will remain valid until the stream is unpinned, destroyed, etc. - /// TODO: IMPALA-4179: update when we simplify the memory transfer model. - Status GetNext(RowBatch* batch, bool* eos) WARN_UNUSED_RESULT; - - /// Same as above, but populate 'flat_rows' with a pointer to the flat version of - /// each returned row in the pinned stream. The pointers in 'flat_rows' are only - /// valid as long as the stream remains pinned. - Status GetNext(RowBatch* batch, bool* eos, - std::vector* flat_rows) WARN_UNUSED_RESULT; - - /// Returns all the rows in the stream in batch. This pins the entire stream in the - /// process. If the current unused reservation is not sufficient to pin the stream in - /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set - /// to false. - Status GetRows(std::unique_ptr* batch, bool* got_rows) WARN_UNUSED_RESULT; - - /// Must be called once at the end to cleanup all resources. If 'batch' is non-nullptr, - /// attaches buffers from pinned pages that rows returned from GetNext() may reference. - /// Otherwise deletes all pages. Does nothing if the stream was already closed. The - /// 'flush' mode is forwarded to RowBatch::AddBuffer() when attaching buffers. - void Close(RowBatch* batch, RowBatch::FlushMode flush); - - /// Number of rows in the stream. - int64_t num_rows() const { return num_rows_; } - - /// Number of rows returned via GetNext(). - int64_t rows_returned() const { return rows_returned_; } - - /// Returns the byte size necessary to store the entire stream in memory. - int64_t byte_size() const { return total_byte_size_; } - - /// Returns the number of bytes currently pinned in memory by the stream. - /// If ignore_current is true, the write_page_ memory is not included. 
- int64_t BytesPinned(bool ignore_current) const { - if (ignore_current && write_page_ != nullptr && write_page_->is_pinned()) { - return bytes_pinned_ - write_page_->len(); - } - return bytes_pinned_; - } - - bool is_closed() const { return closed_; } - bool is_pinned() const { return pinned_; } - bool has_read_iterator() const { return has_read_iterator_; } - bool has_write_iterator() const { return has_write_iterator_; } - - std::string DebugString() const; - -private: - DISALLOW_COPY_AND_ASSIGN(BufferedTupleStream3); - friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test; - friend class ArrayTupleStreamTest_TestComputeRowSize_Test; - friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test; - friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test; - - /// Wrapper around BufferPool::PageHandle that tracks additional info about the page. - struct Page { - Page() : num_rows(0), retrieved_buffer(true) {} - - int len() const { return handle.len(); } - bool is_pinned() const { return handle.is_pinned(); } - int pin_count() const { return handle.pin_count(); } - Status GetBuffer(const BufferPool::BufferHandle** buffer) { - RETURN_IF_ERROR(handle.GetBuffer(buffer)); - retrieved_buffer = true; - return Status::OK(); - } - std::string DebugString() const; - - BufferPool::PageHandle handle; - - /// Number of rows written to the page. - int num_rows; - - /// Whether we called GetBuffer() on the page since it was last pinned. This means - /// that GetBuffer() and ExtractBuffer() cannot fail and that GetNext() may have - /// returned rows referencing the page's buffer. - bool retrieved_buffer; - }; - - /// Runtime state instance used to check for cancellation. Not owned. - RuntimeState* const state_; - - /// Description of rows stored in the stream. - const RowDescriptor* desc_; - - /// Plan node ID, used for error reporting. - int node_id_; - - /// The size of the fixed length portion for each tuple in the row. - std::vector fixed_tuple_sizes_; - - /// Vectors of all the strings slots that have their varlen data stored in stream - /// grouped by tuple_idx. - std::vector>> inlined_string_slots_; - - /// Vectors of all the collection slots that have their varlen data stored in the - /// stream, grouped by tuple_idx. - // std::vector>> inlined_coll_slots_; - - /// Buffer pool and client used to allocate, pin and release pages. Not owned. - BufferPool* buffer_pool_; - BufferPool::ClientHandle* buffer_pool_client_; - - /// List of pages in the stream. - /// Empty iff one of two cases applies: - /// * before the first row has been added with AddRow() or AddRowCustom(). - /// * after the stream has been destructively read in 'delete_on_read' mode - std::list pages_; - // IMPALA-5629: avoid O(n) list.size() call by explicitly tracking the number of pages. - // TODO: remove when we switch to GCC5+, where list.size() is O(1). See GCC bug #49561. - int64_t num_pages_; - - /// Total size of pages_, including any pages already deleted in 'delete_on_read' - /// mode. - int64_t total_byte_size_; - - /// True if there is currently an active read iterator for the stream. - bool has_read_iterator_; - - /// The current page being read. When no read iterator is active, equal to list.end(). - /// When a read iterator is active, either points to the current read page, or equals - /// list.end() if no rows have yet been read. GetNext() does not advance this past - /// the end of the stream, so upon eos 'read_page_' points to the last page and - /// rows_returned_ == num_rows_. 
Always pinned, unless a Pin() call failed and an error - /// status was returned. - std::list::iterator read_page_; - - /// Number of rows returned from the current read_page_. - uint32_t read_page_rows_returned_; - - /// Pointer into read_page_ to the byte after the last row read. - uint8_t* read_ptr_; - - /// Pointer to one byte past the end of read_page_. Used to detect overruns. - const uint8_t* read_end_ptr_; - - /// Pointer into write_page_ to the byte after the last row written. - uint8_t* write_ptr_; - - /// Pointer to one byte past the end of write_page_. Cached to speed up computation - const uint8_t* write_end_ptr_; - - /// Number of rows returned to the caller from GetNext() since the last - /// PrepareForRead() call. - int64_t rows_returned_; - - /// True if there is currently an active write iterator into the stream. - bool has_write_iterator_; - - /// The current page for writing. nullptr if there is no write iterator or no current - /// write page. Always pinned. Size is 'default_page_len_', except temporarily while - /// appending a larger row between AddRowCustomBegin() and AddRowCustomEnd(). - Page* write_page_; - - /// Total bytes of pinned pages in pages_, stored to avoid iterating over the list - /// to compute it. - int64_t bytes_pinned_; - - /// Number of rows stored in the stream. Includes rows that were already deleted during - /// a destructive 'delete_on_read' pass over the stream. - int64_t num_rows_; - - /// The default length in bytes of pages used to store the stream's rows. All rows that - /// fit in a default-sized page are stored in default-sized page. - const int64_t default_page_len_; - - /// Whether any tuple in the rows is nullable. - const bool has_nullable_tuple_; - - /// If true, pages are deleted after they are read during this read pass. Once rows - /// have been read from a stream with 'delete_on_read_' true, this is always true. - bool delete_on_read_; - - bool closed_; // Used for debugging. - - /// If true, this stream has been explicitly pinned by the caller and all pages are - /// kept pinned until the caller calls UnpinStream(). - bool pinned_; - - bool is_read_page(const Page* page) const { - return read_page_ != pages_.end() && &*read_page_ == page; - } - - bool is_write_page(const Page* page) const { return write_page_ == page; } - - /// Return true if the read and write page are the same. - bool has_read_write_page() const { return write_page_ != nullptr && is_read_page(write_page_); } - - /// The slow path for AddRow() that is called if there is not sufficient space in - /// the current page. - bool AddRowSlow(TupleRow* row, Status* status) noexcept; - - /// The slow path for AddRowCustomBegin() that is called if there is not sufficient space in - /// the current page. - uint8_t* AddRowCustomBeginSlow(int64_t size, Status* status) noexcept; - - /// The slow path for AddRowCustomEnd() that is called for large pages. - void AddLargeRowCustomEnd(int64_t size) noexcept; - - /// Copies 'row' into the buffer starting at *data and ending at the byte before - /// 'data_end'. On success, returns true and updates *data to point after the last - /// byte written. Returns false if there is not enough space in the buffer provided. - bool DeepCopy(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept; - - /// Templated implementation of DeepCopy(). - template - bool DeepCopyInternal(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept; - - /// Helper function to copy strings in string_slots from tuple into *data. 
- /// Updates *data to the end of the string data added. Returns false if the data - /// does not fit in the buffer [*data, data_end). - static bool CopyStrings(const Tuple* tuple, const std::vector& string_slots, - uint8_t** data, const uint8_t* data_end); - - /// Helper function to deep copy collections in collection_slots from tuple into - /// the buffer [*data, data_end). Updates *data to the end of the collection data - /// added. Returns false if the data does not fit in the buffer. - //static bool CopyCollections(const Tuple* tuple, - // const std::vector& collection_slots, uint8_t** data, - // const uint8_t* data_end); - - /// Gets a new page of 'page_len' bytes from buffer_pool_, updating write_page_, - /// write_ptr_ and write_end_ptr_. The caller must ensure there is 'page_len' unused - /// reservation. The caller must reset the write page (if there is one) before calling. - Status NewWritePage(int64_t page_len) noexcept WARN_UNUSED_RESULT; - - /// Determines what page size is needed to fit a row of 'row_size' bytes. - /// Returns an error if the row cannot fit in a page. - void CalcPageLenForRow(int64_t row_size, int64_t* page_len); - - /// Wrapper around NewWritePage() that allocates a new write page that fits a row of - /// 'row_size' bytes. Increases reservation if needed to allocate the next page. - /// Returns OK and sets 'got_reservation' to true if the write page was successfully - /// allocated. Returns an error if the row cannot fit in a page. Returns OK and sets - /// 'got_reservation' to false if the reservation could not be increased and no other - /// error was encountered. - Status AdvanceWritePage(int64_t row_size) noexcept WARN_UNUSED_RESULT; - - /// Reset the write page, if there is one, and unpin pages accordingly. If there - /// is an active write iterator, the next row will be appended to a new page. - void ResetWritePage(); - - /// Invalidate the write iterator and release any resources associated with it. After - /// calling this, no more rows can be appended to the stream. - void InvalidateWriteIterator(); - - /// Same as PrepareForRead(), except the iterators are not invalidated and - /// the caller is assumed to have checked there is sufficient unused reservation. - Status PrepareForReadInternal(bool delete_on_read) WARN_UNUSED_RESULT; - - /// Pins the next read page. This blocks reading from disk if necessary to bring the - /// page's data into memory. Updates read_page_, read_ptr_, and - /// read_page_rows_returned_. - Status NextReadPage() WARN_UNUSED_RESULT; - - /// Invalidate the read iterator, and release any resources associated with the active - /// iterator. - void InvalidateReadIterator(); - - /// Returns the total additional bytes that this row will consume in write_page_ if - /// appended to the page. This includes the row's null indicators, the fixed length - /// part of the row and the data for inlined_string_slots_ and inlined_coll_slots_. - int64_t ComputeRowSize(TupleRow* row) const noexcept; - - /// Pins page and updates tracking stats. - Status PinPage(Page* page) WARN_UNUSED_RESULT; - - /// Increment the page's pin count if this page needs a higher pin count given the - /// current read and write iterator positions and whether the stream will be pinned - /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to - /// be incremented multiple times. The caller is responsible for ensuring sufficient - /// reservation is available. 
- Status PinPageIfNeeded(Page* page, bool stream_pinned) WARN_UNUSED_RESULT; - - /// Decrement the page's pin count if this page needs a lower pin count given the - /// current read and write iterator positions and whether the stream will be pinned - /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to - /// be decremented multiple times. - void UnpinPageIfNeeded(Page* page, bool stream_pinned); - - /// Return the expected pin count for 'page' in the current stream based on the current - /// read and write pages and whether the stream is pinned. - int ExpectedPinCount(bool stream_pinned, const Page* page) const; - - /// Templated GetNext implementations. - template - Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows); - template - Status GetNextInternal(RowBatch* batch, bool* eos, std::vector* flat_rows); - - /// Helper function to convert a flattened TupleRow stored starting at '*data' into - /// 'row'. *data is updated to point to the first byte past the end of the row. - template - void UnflattenTupleRow(uint8_t** data, TupleRow* row) const; - - /// Helper function for GetNextInternal(). For each string slot in string_slots, - /// update StringValue's ptr field to point to the corresponding string data stored - /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the - /// StringValue's length field. - void FixUpStringsForRead(const std::vector& string_slots, Tuple* tuple); - - /// Helper function for GetNextInternal(). For each collection slot in collection_slots, - /// recursively update any pointers in the CollectionValue to point to the corresponding - /// var len data stored inline in the stream, advancing read_ptr_ as data is read. - /// Assumes that the collection was serialized to the stream in DeepCopy()'s format. - //void FixUpCollectionsForRead( - // const std::vector& collection_slots, Tuple* tuple); - - /// Returns the number of null indicator bytes per row. Only valid if this stream has - /// nullable tuples. - int NullIndicatorBytesPerRow() const; - - /// Returns the total bytes pinned. Only called in DCHECKs to validate bytes_pinned_. - int64_t CalcBytesPinned() const; - - /// DCHECKs if the stream is internally inconsistent. The stream should always be in - /// a consistent state after returning success from a public API call. The Fast version - /// has constant runtime and does not check all of 'pages_'. The Full version includes - /// O(n) checks that require iterating over the whole 'pages_' list (e.g. checking that - /// each page is in a valid state). - void CheckConsistencyFast() const; - void CheckConsistencyFull() const; - void CheckPageConsistency(const Page* page) const; -}; -} // namespace doris diff --git a/be/src/runtime/buffered_tuple_stream3.inline.h b/be/src/runtime/buffered_tuple_stream3.inline.h deleted file mode 100644 index 7670e764a0..0000000000 --- a/be/src/runtime/buffered_tuple_stream3.inline.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-3.0.0/be/src/runtime/buffered-tuple-stream.inline.h -// and modified by Doris - -#pragma once - -#include "runtime/buffered_tuple_stream3.h" -#include "runtime/descriptors.h" -#include "runtime/tuple_row.h" -#include "util/bit_util.h" - -namespace doris { - -inline int BufferedTupleStream3::NullIndicatorBytesPerRow() const { - DCHECK(has_nullable_tuple_); - return BitUtil::RoundUpNumBytes(fixed_tuple_sizes_.size()); -} - -inline uint8_t* BufferedTupleStream3::AddRowCustomBegin(int64_t size, Status* status) { - DCHECK(!closed_); - DCHECK(has_write_iterator()); - if (UNLIKELY(write_page_ == nullptr || write_ptr_ + size > write_end_ptr_)) { - return AddRowCustomBeginSlow(size, status); - } - DCHECK(write_page_ != nullptr); - DCHECK(write_page_->is_pinned()); - DCHECK_LE(write_ptr_ + size, write_end_ptr_); - ++num_rows_; - ++write_page_->num_rows; - - uint8_t* data = write_ptr_; - write_ptr_ += size; - return data; -} - -inline void BufferedTupleStream3::AddRowCustomEnd(int64_t size) { - if (UNLIKELY(size > default_page_len_)) AddLargeRowCustomEnd(size); -} -} // namespace doris diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp deleted file mode 100644 index f709c182ec..0000000000 --- a/be/src/runtime/export_sink.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
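The inline AddRowCustomBegin() fast path above only performs a bounds check, bumps write_ptr_, and updates the row counters; all page allocation is deferred to the *Slow variants. A minimal self-contained sketch of that fast-path/slow-path split, with simplified names and types (not the Doris/Impala API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified append buffer illustrating the fast-path/slow-path split used by
// AddRowCustomBegin(): the common case is a bounds check plus a pointer bump,
// and only a full (or missing) write page falls through to the slow path that
// starts a new page.
class AppendBuffer {
public:
    explicit AppendBuffer(size_t page_len) : page_len_(page_len) {}

    // Returns a pointer to 'size' writable bytes, or nullptr if the row can
    // never fit (the large-row handling is elided in this sketch).
    uint8_t* add_row_begin(size_t size) {
        if (write_ptr_ != nullptr && write_ptr_ + size <= write_end_ptr_) {
            uint8_t* data = write_ptr_; // fast path: pointer bump only
            write_ptr_ += size;
            ++num_rows_;
            return data;
        }
        return add_row_begin_slow(size);
    }

private:
    uint8_t* add_row_begin_slow(size_t size) {
        if (size > page_len_) return nullptr; // large-row path elided
        pages_.emplace_back(page_len_);       // start a fresh default-size page
        write_ptr_ = pages_.back().data();
        write_end_ptr_ = write_ptr_ + page_len_;
        return add_row_begin(size); // retry; guaranteed to hit the fast path
    }

    const size_t page_len_;
    std::vector<std::vector<uint8_t>> pages_;
    uint8_t* write_ptr_ = nullptr;
    uint8_t* write_end_ptr_ = nullptr;
    size_t num_rows_ = 0;
};

Keeping the fast path this small is the point of the split: it stays cheap enough to inline, so the common append costs one comparison and one pointer increment.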
- -#include "runtime/export_sink.h" - -#include - -#include - -#include "exprs/expr.h" -#include "exprs/expr_context.h" -#include "gutil/strings/numbers.h" -#include "io/file_factory.h" -#include "runtime/large_int_value.h" -#include "runtime/raw_value.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple_row.h" -#include "util/mysql_global.h" -#include "util/runtime_profile.h" -#include "util/types.h" -#include "util/uid_util.h" - -namespace doris { - -ExportSink::ExportSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), - _row_desc(row_desc), - _t_output_expr(t_exprs), - _bytes_written_counter(nullptr), - _rows_written_counter(nullptr), - _write_timer(nullptr), - _header_sent(false) { - _name = "ExportSink"; -} - -ExportSink::~ExportSink() {} - -Status ExportSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - _t_export_sink = t_sink.export_sink; - - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - return Status::OK(); -} - -Status ExportSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - - _state = state; - - std::stringstream title; - title << "ExportSink (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - SCOPED_TIMER(_profile->total_time_counter()); - - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - - // TODO(lingbin): add some Counter - _bytes_written_counter = ADD_COUNTER(profile(), "BytesExported", TUnit::BYTES); - _rows_written_counter = ADD_COUNTER(profile(), "RowsExported", TUnit::UNIT); - _write_timer = ADD_TIMER(profile(), "WriteTime"); - - return Status::OK(); -} - -Status ExportSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // open broker - RETURN_IF_ERROR(open_file_writer()); - return Status::OK(); -} - -Status ExportSink::write_csv_header() { - if (!_header_sent && _t_export_sink.header.size() > 0) { - size_t written_len = 0; - RETURN_IF_ERROR( - _file_writer->write(reinterpret_cast(_t_export_sink.header.c_str()), - _t_export_sink.header.size(), &written_len)); - _header_sent = true; - } - return Status::OK(); -} - -Status ExportSink::send(RuntimeState* state, RowBatch* batch) { - VLOG_ROW << "debug: export_sink send batch: " << batch->to_string(); - SCOPED_TIMER(_profile->total_time_counter()); - int num_rows = batch->num_rows(); - // we send at most 1024 rows at a time - int batch_send_rows = num_rows > 1024 ? 1024 : num_rows; - RETURN_IF_ERROR(write_csv_header()); - std::stringstream ss; - for (int i = 0; i < num_rows;) { - ss.str(""); - for (int j = 0; j < batch_send_rows && i < num_rows; ++j, ++i) { - RETURN_IF_ERROR(gen_row_buffer(batch->get_row(i), &ss)); - } - - VLOG_ROW << "debug: export_sink send row: " << ss.str(); - const std::string& buf = ss.str(); - size_t written_len = 0; - - SCOPED_TIMER(_write_timer); - // TODO(lingbin): for broker writer, we should not send rpc each row. 
- RETURN_IF_ERROR(_file_writer->write(reinterpret_cast(buf.c_str()), - buf.size(), &written_len)); - COUNTER_UPDATE(_bytes_written_counter, buf.size()); - } - COUNTER_UPDATE(_rows_written_counter, num_rows); - return Status::OK(); -} - -Status ExportSink::gen_row_buffer(TupleRow* row, std::stringstream* ss) { - int num_columns = _output_expr_ctxs.size(); - // const TupleDescriptor& desc = row_desc().TupleDescriptor; - for (int i = 0; i < num_columns; ++i) { - void* item = _output_expr_ctxs[i]->get_value(row); - if (item == nullptr) { - (*ss) << "\\N"; - } else { - switch (_output_expr_ctxs[i]->root()->type().type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - (*ss) << (int)*static_cast(item); - break; - case TYPE_SMALLINT: - (*ss) << *static_cast(item); - break; - case TYPE_INT: - (*ss) << *static_cast(item); - break; - case TYPE_BIGINT: - (*ss) << *static_cast(item); - break; - case TYPE_LARGEINT: - (*ss) << reinterpret_cast(item)->value; - break; - case TYPE_FLOAT: { - char buffer[MAX_FLOAT_STR_LENGTH + 2]; - float float_value = *static_cast(item); - buffer[0] = '\0'; - int length = FloatToBuffer(float_value, MAX_FLOAT_STR_LENGTH, buffer); - DCHECK(length >= 0) << "gcvt float failed, float value=" << float_value; - (*ss) << buffer; - break; - } - case TYPE_DOUBLE: { - // To prevent loss of precision on float and double types, - // they are converted to strings before output. - // For example: For a double value 27361919854.929001, - // the direct output of using std::stringstream is 2.73619e+10, - // and after conversion to a string, it outputs 27361919854.929001 - char buffer[MAX_DOUBLE_STR_LENGTH + 2]; - double double_value = *static_cast(item); - buffer[0] = '\0'; - int length = DoubleToBuffer(double_value, MAX_DOUBLE_STR_LENGTH, buffer); - DCHECK(length >= 0) << "gcvt double failed, double value=" << double_value; - (*ss) << buffer; - break; - } - case TYPE_DATE: - case TYPE_DATETIME: { - char buf[64]; - const DateTimeValue* time_val = (const DateTimeValue*)(item); - time_val->to_string(buf); - (*ss) << buf; - break; - } - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - } else { - (*ss) << "\\N"; - } - } else { - (*ss) << std::string(string_val->ptr, string_val->len); - } - break; - } - - case TYPE_DECIMALV2: { - const DecimalV2Value decimal_val( - reinterpret_cast(item)->value); - std::string decimal_str; - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - decimal_str = decimal_val.to_string(output_scale); - (*ss) << decimal_str; - break; - } - case TYPE_ARRAY: { - auto col_type = _output_expr_ctxs[i]->root()->type(); - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - RawValue::print_value(item, col_type, output_scale, ss); - break; - } - default: { - std::stringstream err_ss; - err_ss << "can't export this type. 
type = " << _output_expr_ctxs[i]->root()->type(); - return Status::InternalError(err_ss.str()); - } - } - } - - if (i < num_columns - 1) { - (*ss) << _t_export_sink.column_separator; - } - } - (*ss) << _t_export_sink.line_delimiter; - - return Status::OK(); -} - -Status ExportSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - if (_file_writer != nullptr) { - _file_writer->close(); - _file_writer = nullptr; - } - return DataSink::close(state, exec_status); -} - -Status ExportSink::open_file_writer() { - if (_file_writer != nullptr) { - return Status::OK(); - } - - std::string file_name = gen_file_name(); - // TODO(lingbin): gen file path - RETURN_IF_ERROR(FileFactory::create_file_writer( - _t_export_sink.file_type, _state->exec_env(), _t_export_sink.broker_addresses, - _t_export_sink.properties, _t_export_sink.export_path + "/" + file_name, 0, - _file_writer)); - _state->add_export_output_file(_t_export_sink.export_path + "/" + file_name); - - return _file_writer->open(); -} - -// TODO(lingbin): add some other info to file name, like partition -std::string ExportSink::gen_file_name() { - const TUniqueId& id = _state->fragment_instance_id(); - - struct timeval tv; - gettimeofday(&tv, nullptr); - - std::stringstream file_name; - file_name << "export-data-" << print_id(id) << "-" << (tv.tv_sec * 1000 + tv.tv_usec / 1000); - return file_name.str(); -} - -} // namespace doris diff --git a/be/src/runtime/export_sink.h b/be/src/runtime/export_sink.h deleted file mode 100644 index 107b6f4203..0000000000 --- a/be/src/runtime/export_sink.h +++ /dev/null @@ -1,83 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "util/runtime_profile.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class RuntimeState; -class RuntimeProfile; -class ExprContext; -class FileWriter; -class TupleRow; - -// This class is a sinker, which put export data to external storage by broker. -class ExportSink : public DataSink { -public: - ExportSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& t_exprs); - - virtual ~ExportSink(); - - virtual Status init(const TDataSink& thrift_sink) override; - - virtual Status prepare(RuntimeState* state) override; - - virtual Status open(RuntimeState* state) override; - - virtual Status send(RuntimeState* state, RowBatch* batch) override; - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). 
- virtual Status close(RuntimeState* state, Status exec_status) override; - - virtual RuntimeProfile* profile() override { return _profile; } - -private: - Status open_file_writer(); - Status gen_row_buffer(TupleRow* row, std::stringstream* ss); - std::string gen_file_name(); - Status write_csv_header(); - - RuntimeState* _state; - - // owned by RuntimeState - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - - TExportSink _t_export_sink; - std::unique_ptr _file_writer; - - RuntimeProfile* _profile; - - RuntimeProfile::Counter* _bytes_written_counter; - RuntimeProfile::Counter* _rows_written_counter; - RuntimeProfile::Counter* _write_timer; - bool _header_sent; -}; - -} // end namespace doris diff --git a/be/src/runtime/memory_scratch_sink.cpp b/be/src/runtime/memory_scratch_sink.cpp deleted file mode 100644 index 32bec47259..0000000000 --- a/be/src/runtime/memory_scratch_sink.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/memory_scratch_sink.h" - -#include -#include - -#include - -#include "exprs/expr.h" -#include "gen_cpp/Types_types.h" -#include "runtime/exec_env.h" -#include "runtime/primitive_type.h" -#include "runtime/row_batch.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple_row.h" -#include "util/arrow/row_batch.h" - -namespace doris { - -MemoryScratchSink::MemoryScratchSink(const RowDescriptor& row_desc, - const std::vector& t_output_expr, - const TMemoryScratchSink& sink) - : _row_desc(row_desc), _t_output_expr(t_output_expr) { - _name = "MemoryScratchSink"; -} - -MemoryScratchSink::~MemoryScratchSink() {} - -Status MemoryScratchSink::prepare_exprs(RuntimeState* state) { - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(state->obj_pool(), _t_output_expr, &_output_expr_ctxs)); - // Prepare the exprs to run. 
- RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - // generate the arrow schema - RETURN_IF_ERROR(convert_to_arrow_schema(_row_desc, &_arrow_schema)); - return Status::OK(); -} - -Status MemoryScratchSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // prepare output_expr - RETURN_IF_ERROR(prepare_exprs(state)); - // create queue - TUniqueId fragment_instance_id = state->fragment_instance_id(); - state->exec_env()->result_queue_mgr()->create_queue(fragment_instance_id, &_queue); - std::stringstream title; - title << "MemoryScratchSink (frag_id=" << fragment_instance_id << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - - return Status::OK(); -} - -Status MemoryScratchSink::send(RuntimeState* state, RowBatch* batch) { - if (nullptr == batch || 0 == batch->num_rows()) { - return Status::OK(); - } - std::shared_ptr result; - RETURN_IF_ERROR( - convert_to_arrow_batch(*batch, _arrow_schema, arrow::default_memory_pool(), &result)); - _queue->blocking_put(result); - return Status::OK(); -} - -Status MemoryScratchSink::open(RuntimeState* state) { - return Expr::open(_output_expr_ctxs, state); -} - -Status MemoryScratchSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - // put sentinel - if (_queue != nullptr) { - _queue->blocking_put(nullptr); - } - Expr::close(_output_expr_ctxs, state); - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/memory_scratch_sink.h b/be/src/runtime/memory_scratch_sink.h deleted file mode 100644 index f8e5fceca1..0000000000 --- a/be/src/runtime/memory_scratch_sink.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
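MemoryScratchSink::send() above converts each RowBatch into an arrow::RecordBatch and blocks until a consumer takes it, and close() pushes a nullptr sentinel so readers can tell the stream has ended. A generic sketch of that sentinel-terminated blocking queue (a toy unbounded queue, not Doris's BlockingQueue or ResultQueueMgr; a real queue would also bound its capacity):

#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>

// Sentinel-terminated blocking queue: the producer block-puts items and
// finally pushes a nullptr, mirroring blocking_put(nullptr) in
// MemoryScratchSink::close().
template <typename T>
class SentinelQueue {
public:
    void blocking_put(std::shared_ptr<T> item) {
        {
            std::lock_guard<std::mutex> l(mu_);
            q_.push(std::move(item));
        }
        cv_.notify_one();
    }

    // Blocks until an item is available; returns nullptr once the producer
    // has pushed the sentinel.
    std::shared_ptr<T> blocking_get() {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return !q_.empty(); });
        std::shared_ptr<T> item = std::move(q_.front());
        q_.pop();
        return item;
    }

private:
    std::mutex mu_;
    std::condition_variable cv_;
    std::queue<std::shared_ptr<T>> q_;
};

A consumer simply loops on blocking_get() until it receives nullptr, which is why close() must always enqueue the sentinel even on error paths.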
- -#pragma once - -#include "common/status.h" -#include "exec/data_sink.h" -#include "gen_cpp/DorisExternalService_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/result_queue_mgr.h" -#include "util/blocking_queue.hpp" - -namespace arrow { - -class MemoryPool; -class RecordBatch; -class Schema; - -} // namespace arrow - -namespace doris { - -class ObjectPool; -class RowBatch; -class ObjectPool; -class RuntimeState; -class RuntimeProfile; -class BufferControlBlock; -class ExprContext; -class ResultWriter; -class TupleRow; - -// used to push data to blocking queue -class MemoryScratchSink : public DataSink { -public: - MemoryScratchSink(const RowDescriptor& row_desc, const std::vector& select_exprs, - const TMemoryScratchSink& sink); - - virtual ~MemoryScratchSink(); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend queue mgr - // Blocks until all rows in batch are pushed to the queue - virtual Status send(RuntimeState* state, RowBatch* batch); - - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - Status prepare_exprs(RuntimeState* state); - - // Owned by the RuntimeState. - const RowDescriptor& _row_desc; - std::shared_ptr _arrow_schema; - - BlockQueueSharedPtr _queue; - - RuntimeProfile* _profile; // Allocated from _pool - - // Owned by the RuntimeState. - const std::vector& _t_output_expr; - std::vector _output_expr_ctxs; -}; -} // namespace doris diff --git a/be/src/runtime/mysql_result_writer.cpp b/be/src/runtime/mysql_result_writer.cpp deleted file mode 100644 index 823f05d2d4..0000000000 --- a/be/src/runtime/mysql_result_writer.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
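The MysqlResultWriter whose implementation follows converts one row at a time: every push into the row buffer returns an int error code, and the first nonzero code aborts the row. A toy illustration of that accumulate-and-abort shape (ToyRowBuffer is hypothetical; the real class is MysqlRowBuffer):

#include <cstdint>
#include <string>

// Toy buffer showing the error-code accumulation in _add_one_row(): every
// push returns 0 on success, and the first nonzero return aborts the row.
struct ToyRowBuffer {
    std::string out;
    int push_null() { out += "\\N"; return 0; }
    int push_int(int32_t v) { out += std::to_string(v); return 0; }
};

// One column value, where nullptr stands for SQL NULL.
int add_value(ToyRowBuffer* buf, const int32_t* v) {
    return v == nullptr ? buf->push_null() : buf->push_int(*v);
}

int add_one_row(ToyRowBuffer* buf, const int32_t* const* cols, int n) {
    int ret = 0;
    for (int i = 0; ret == 0 && i < n; ++i) ret = add_value(buf, cols[i]);
    return ret; // nonzero maps to "pack mysql buffer failed."
}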
- -#include "runtime/mysql_result_writer.h" - -#include "exprs/expr_context.h" -#include "gen_cpp/PaloInternalService_types.h" -#include "runtime/buffer_control_block.h" -#include "runtime/primitive_type.h" -#include "runtime/result_buffer_mgr.h" -#include "runtime/row_batch.h" -#include "runtime/tuple_row.h" -#include "util/mysql_row_buffer.h" -#include "util/types.h" -#include "vec/columns/column_vector.h" -#include "vec/core/block.h" - -namespace doris { - -MysqlResultWriter::MysqlResultWriter(BufferControlBlock* sinker, - const std::vector& output_expr_ctxs, - RuntimeProfile* parent_profile, bool output_object_data) - : ResultWriter(output_object_data), - _sinker(sinker), - _output_expr_ctxs(output_expr_ctxs), - _row_buffer(nullptr), - _parent_profile(parent_profile) {} - -MysqlResultWriter::~MysqlResultWriter() { - delete _row_buffer; -} - -Status MysqlResultWriter::init(RuntimeState* state) { - _init_profile(); - if (nullptr == _sinker) { - return Status::InternalError("sinker is nullptr pointer."); - } - - _row_buffer = new (std::nothrow) MysqlRowBuffer(); - if (nullptr == _row_buffer) { - return Status::InternalError("no memory to alloc."); - } - - return Status::OK(); -} - -void MysqlResultWriter::_init_profile() { - _append_row_batch_timer = ADD_TIMER(_parent_profile, "AppendBatchTime"); - _convert_tuple_timer = ADD_CHILD_TIMER(_parent_profile, "TupleConvertTime", "AppendBatchTime"); - _result_send_timer = ADD_CHILD_TIMER(_parent_profile, "ResultSendTime", "AppendBatchTime"); - _sent_rows_counter = ADD_COUNTER(_parent_profile, "NumSentRows", TUnit::UNIT); -} - -int MysqlResultWriter::_add_row_value(int index, const TypeDescriptor& type, void* item) { - int buf_ret = 0; - if (item == nullptr) { - return _row_buffer->push_null(); - } - - switch (type.type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - buf_ret = _row_buffer->push_tinyint(*static_cast(item)); - break; - - case TYPE_SMALLINT: - buf_ret = _row_buffer->push_smallint(*static_cast(item)); - break; - - case TYPE_INT: - buf_ret = _row_buffer->push_int(*static_cast(item)); - break; - - case TYPE_BIGINT: - buf_ret = _row_buffer->push_bigint(*static_cast(item)); - break; - - case TYPE_LARGEINT: { - buf_ret = _row_buffer->push_largeint(reinterpret_cast(item)->value); - break; - } - - case TYPE_FLOAT: - buf_ret = _row_buffer->push_float(*static_cast(item)); - break; - - case TYPE_DOUBLE: - buf_ret = _row_buffer->push_double(*static_cast(item)); - break; - - case TYPE_TIME: { - buf_ret = _row_buffer->push_time(*static_cast(item)); - break; - } - - case TYPE_DATE: - case TYPE_DATETIME: { - buf_ret = _row_buffer->push_datetime(*static_cast(item)); - break; - } - - case TYPE_HLL: - case TYPE_OBJECT: - case TYPE_QUANTILE_STATE: { - if (_output_object_data) { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - buf_ret = _row_buffer->push_null(); - } else { - buf_ret = _row_buffer->push_string(string_val->ptr, string_val->len); - } - } else { - buf_ret = _row_buffer->push_null(); - } - - break; - } - - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - // 0x01 is a magic num, not useful actually, just for present "" - char* tmp_val = reinterpret_cast(0x01); - buf_ret = _row_buffer->push_string(tmp_val, string_val->len); - } else { - buf_ret = _row_buffer->push_null(); - } - } else { - buf_ret = _row_buffer->push_string(string_val->ptr, 
string_val->len); - } - - break; - } - - case TYPE_DECIMALV2: { - DecimalV2Value decimal_val(reinterpret_cast(item)->value); - // TODO: Support decimal output_scale once FE can ensure the - // accuracy of output_scale - // int output_scale = _output_expr_ctxs[index]->root()->output_scale(); - buf_ret = _row_buffer->push_decimal(decimal_val, type.scale); - break; - } - - case TYPE_ARRAY: { - auto child_type = type.children[0]; - auto array_value = (const CollectionValue*)(item); - - ArrayIterator iter = array_value->iterator(child_type.type); - - _row_buffer->open_dynamic_mode(); - - buf_ret = _row_buffer->push_string("[", 1); - - int begin = 0; - while (iter.has_next() && !buf_ret) { - if (begin != 0) { - buf_ret = _row_buffer->push_string(", ", 2); - } - if (!iter.get()) { - buf_ret = _row_buffer->push_string("NULL", 4); - } else { - if (child_type.is_string_type()) { - buf_ret = _row_buffer->push_string("'", 1); - buf_ret = _add_row_value(index, child_type, iter.get()); - buf_ret = _row_buffer->push_string("'", 1); - } else if (child_type.is_date_type()) { - DateTimeVal data; - iter.get(&data); - auto datetime_value = DateTimeValue::from_datetime_val(data); - buf_ret = _add_row_value(index, child_type, &datetime_value); - } else if (child_type.is_decimal_v2_type()) { - DecimalV2Val data; - iter.get(&data); - auto decimal_value = DecimalV2Value::from_decimal_val(data); - buf_ret = _add_row_value(index, child_type, &decimal_value); - } else { - buf_ret = _add_row_value(index, child_type, iter.get()); - } - } - - iter.next(); - begin++; - } - - if (!buf_ret) { - buf_ret = _row_buffer->push_string("]", 1); - } - - _row_buffer->close_dynamic_mode(); - break; - } - - default: - LOG(WARNING) << "can't convert this type to mysql type. type = " - << _output_expr_ctxs[index]->root()->type(); - buf_ret = -1; - break; - } - - return buf_ret; -} - -Status MysqlResultWriter::_add_one_row(TupleRow* row) { - _row_buffer->reset(); - int num_columns = _output_expr_ctxs.size(); - int buf_ret = 0; - - for (int i = 0; 0 == buf_ret && i < num_columns; ++i) { - void* item = _output_expr_ctxs[i]->get_value(row); - - buf_ret = _add_row_value(i, _output_expr_ctxs[i]->root()->type(), item); - } - - if (0 != buf_ret) { - return Status::InternalError("pack mysql buffer failed."); - } - - return Status::OK(); -} - -Status MysqlResultWriter::append_row_batch(const RowBatch* batch) { - SCOPED_TIMER(_append_row_batch_timer); - if (nullptr == batch || 0 == batch->num_rows()) { - return Status::OK(); - } - - Status status; - // convert one batch - std::unique_ptr result = std::make_unique(); - int num_rows = batch->num_rows(); - result->result_batch.rows.resize(num_rows); - - { - SCOPED_TIMER(_convert_tuple_timer); - for (int i = 0; status.ok() && i < num_rows; ++i) { - TupleRow* row = batch->get_row(i); - status = _add_one_row(row); - - if (status.ok()) { - result->result_batch.rows[i].assign(_row_buffer->buf(), _row_buffer->length()); - } else { - LOG(WARNING) << "convert row to mysql result failed."; - break; - } - } - } - - if (status.ok()) { - SCOPED_TIMER(_result_send_timer); - // push this batch to back - RETURN_NOT_OK_STATUS_WITH_WARN(_sinker->add_batch(result), - "append result batch to sink failed."); - _written_rows += num_rows; - } - return Status::OK(); -} - -Status MysqlResultWriter::close() { - COUNTER_SET(_sent_rows_counter, _written_rows); - return Status::OK(); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_result_writer.h b/be/src/runtime/mysql_result_writer.h deleted file mode
100644 index 14f4ce7c99..0000000000 --- a/be/src/runtime/mysql_result_writer.h +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "primitive_type.h" -#include "runtime/result_writer.h" -#include "runtime/runtime_state.h" -#include "vec/data_types/data_type.h" - -namespace doris { - -class TupleRow; -class RowBatch; -class ExprContext; -class MysqlRowBuffer; -class BufferControlBlock; -class RuntimeProfile; - -namespace vectorized { -class VExprContext; -} - -// convert the row batch to mysql protocol row -class MysqlResultWriter final : public ResultWriter { -public: - MysqlResultWriter(BufferControlBlock* sinker, const std::vector& output_expr_ctxs, - RuntimeProfile* parent_profile, bool output_object_data); - - virtual ~MysqlResultWriter(); - - virtual Status init(RuntimeState* state) override; - // convert one row batch to mysql result and - // append this batch to the result sink - virtual Status append_row_batch(const RowBatch* batch) override; - - virtual Status close() override; - -private: - void _init_profile(); - // convert one tuple row - Status _add_one_row(TupleRow* row); - int _add_row_value(int index, const TypeDescriptor& type, void* item); - -private: - BufferControlBlock* _sinker; - const std::vector& _output_expr_ctxs; - - std::vector _result_column_ids; - - MysqlRowBuffer* _row_buffer; - std::vector _vec_buffers; - - RuntimeProfile* _parent_profile; // parent profile from result sink. not owned - // total time cost on append batch operation - RuntimeProfile::Counter* _append_row_batch_timer = nullptr; - // tuple convert timer, child timer of _append_row_batch_timer - RuntimeProfile::Counter* _convert_tuple_timer = nullptr; - // file write timer, child timer of _append_row_batch_timer - RuntimeProfile::Counter* _result_send_timer = nullptr; - // number of sent rows - RuntimeProfile::Counter* _sent_rows_counter = nullptr; -}; - -} // namespace doris diff --git a/be/src/runtime/mysql_table_sink.cpp b/be/src/runtime/mysql_table_sink.cpp deleted file mode 100644 index c3357eacc5..0000000000 --- a/be/src/runtime/mysql_table_sink.cpp +++ /dev/null @@ -1,86 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/mysql_table_sink.h" - -#include - -#include "exprs/expr.h" -#include "runtime/runtime_state.h" -#include "util/debug_util.h" -#include "util/runtime_profile.h" - -namespace doris { - -MysqlTableSink::MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { - _name = "MysqlTableSink"; -} - -MysqlTableSink::~MysqlTableSink() {} - -Status MysqlTableSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - const TMysqlTableSink& t_mysql_sink = t_sink.mysql_table_sink; - - _conn_info.host = t_mysql_sink.host; - _conn_info.port = t_mysql_sink.port; - _conn_info.user = t_mysql_sink.user; - _conn_info.passwd = t_mysql_sink.passwd; - _conn_info.db = t_mysql_sink.db; - _mysql_tbl = t_mysql_sink.table; - _conn_info.charset = t_mysql_sink.charset; - - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - return Status::OK(); -} - -Status MysqlTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - std::stringstream title; - title << "MysqlTableSink (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - return Status::OK(); -} - -Status MysqlTableSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // create writer - _writer = state->obj_pool()->add(new MysqlTableWriter(_output_expr_ctxs)); - RETURN_IF_ERROR(_writer->open(_conn_info, _mysql_tbl)); - return Status::OK(); -} - -Status MysqlTableSink::send(RuntimeState* state, RowBatch* batch) { - return _writer->append(batch); -} - -Status MysqlTableSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_table_sink.h b/be/src/runtime/mysql_table_sink.h deleted file mode 100644 index 08ae566a21..0000000000 --- a/be/src/runtime/mysql_table_sink.h +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "runtime/mysql_table_writer.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class TMysqlTableSink; -class RuntimeState; -class RuntimeProfile; -class ExprContext; - -// This class is a sinker, which put input data to mysql table -class MysqlTableSink : public DataSink { -public: - MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs); - - virtual ~MysqlTableSink(); - - virtual Status init(const TDataSink& thrift_sink); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend stream mgr - // Blocks until all rows in batch are placed in the buffer - virtual Status send(RuntimeState* state, RowBatch* batch); - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - // owned by RuntimeState - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - MysqlConnInfo _conn_info; - std::string _mysql_tbl; - MysqlTableWriter* _writer; - - RuntimeProfile* _profile; -}; - -} // namespace doris diff --git a/be/src/runtime/mysql_table_writer.cpp b/be/src/runtime/mysql_table_writer.cpp deleted file mode 100644 index bd1f746db8..0000000000 --- a/be/src/runtime/mysql_table_writer.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
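The MysqlTableWriter deleted next renders INSERT statements by hand and escapes string values with mysql_real_escape_string(), whose destination buffer must hold at least 2 * length + 1 bytes (the MySQL C API worst case: every byte escaped, plus the terminating NUL). A sketch of that escaping step wrapped in an RAII-friendly helper (escape_string is a hypothetical name; the header path varies by platform):

#include <mysql/mysql.h>  // location varies by platform; sometimes <mysql.h>
#include <string>

// Hypothetical helper wrapping the escaping pattern in
// MysqlTableWriter::insert_row(), returning a std::string instead of a
// manually new[]/delete[]-managed char buffer.
std::string escape_string(MYSQL* conn, const char* from, unsigned long length) {
    std::string out;
    out.resize(2 * length + 1);  // worst-case size required by the C API
    unsigned long written = mysql_real_escape_string(conn, &out[0], from, length);
    out.resize(written);  // shrink to the actual escaped length
    return out;
}

Using a std::string for the scratch buffer avoids the leak risk of the raw new[]/delete[] pair in the deleted code if anything between allocation and release were ever to throw or return early.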
- -#include "runtime/mysql_table_writer.h" - -#include - -#include - -#include "exprs/expr_context.h" -#include "runtime/row_batch.h" -#include "runtime/tuple_row.h" -#include "util/types.h" - -namespace doris { - -std::string MysqlConnInfo::debug_string() const { - std::stringstream ss; - - ss << "(host=" << host << ",port=" << port << ",user=" << user << ",db=" << db - << ",passwd=" << passwd << ",charset=" << charset << ")"; - return ss.str(); -} - -MysqlTableWriter::MysqlTableWriter(const std::vector& output_expr_ctxs) - : _output_expr_ctxs(output_expr_ctxs) {} - -MysqlTableWriter::~MysqlTableWriter() { - if (_mysql_conn) { - mysql_close(_mysql_conn); - } -} - -Status MysqlTableWriter::open(const MysqlConnInfo& conn_info, const std::string& tbl) { - _mysql_conn = mysql_init(nullptr); - if (_mysql_conn == nullptr) { - return Status::InternalError("Call mysql_init failed."); - } - - MYSQL* res = mysql_real_connect(_mysql_conn, conn_info.host.c_str(), conn_info.user.c_str(), - conn_info.passwd.c_str(), conn_info.db.c_str(), conn_info.port, - nullptr, // unix socket - 0); // flags - if (res == nullptr) { - std::stringstream ss; - ss << "mysql_real_connect failed because " << mysql_error(_mysql_conn); - return Status::InternalError(ss.str()); - } - - // set character - if (mysql_set_character_set(_mysql_conn, conn_info.charset.c_str())) { - std::stringstream ss; - ss << "mysql_set_character_set failed because " << mysql_error(_mysql_conn); - return Status::InternalError(ss.str()); - } - - _mysql_tbl = tbl; - - return Status::OK(); -} - -Status MysqlTableWriter::insert_row(TupleRow* row) { - std::stringstream ss; - - // Construct Insert statement of mysql - ss << "INSERT INTO `" << _mysql_tbl << "` VALUES ("; - int num_columns = _output_expr_ctxs.size(); - for (int i = 0; i < num_columns; ++i) { - if (i != 0) { - ss << ", "; - } - void* item = _output_expr_ctxs[i]->get_value(row); - if (item == nullptr) { - ss << "NULL"; - continue; - } - switch (_output_expr_ctxs[i]->root()->type().type) { - case TYPE_BOOLEAN: - case TYPE_TINYINT: - ss << (int)*static_cast(item); - break; - case TYPE_SMALLINT: - ss << *static_cast(item); - break; - case TYPE_INT: - ss << *static_cast(item); - break; - case TYPE_BIGINT: - ss << *static_cast(item); - break; - case TYPE_FLOAT: - ss << *static_cast(item); - break; - case TYPE_DOUBLE: - ss << *static_cast(item); - break; - case TYPE_DATE: - case TYPE_DATETIME: { - char buf[64]; - const DateTimeValue* time_val = (const DateTimeValue*)(item); - time_val->to_string(buf); - ss << "\'" << buf << "\'"; - break; - } - case TYPE_VARCHAR: - case TYPE_CHAR: - case TYPE_STRING: { - const StringValue* string_val = (const StringValue*)(item); - - if (string_val->ptr == nullptr) { - if (string_val->len == 0) { - ss << "\'\'"; - } else { - ss << "NULL"; - } - } else { - char* buf = new char[2 * string_val->len + 1]; - mysql_real_escape_string(_mysql_conn, buf, string_val->ptr, string_val->len); - ss << "\'" << buf << "\'"; - delete[] buf; - } - break; - } - - case TYPE_DECIMALV2: { - const DecimalV2Value decimal_val(reinterpret_cast(item)->value); - std::string decimal_str; - int output_scale = _output_expr_ctxs[i]->root()->output_scale(); - decimal_str = decimal_val.to_string(output_scale); - ss << decimal_str; - break; - } - - default: { - return Status::InternalError("can't convert this type to mysql type. 
type = {}", - _output_expr_ctxs[i]->root()->type().type); - } - } - } - ss << ")"; - - // Insert this to MySQL server - std::string insert_stmt = ss.str(); - LOG(INFO) << insert_stmt; - if (mysql_real_query(_mysql_conn, insert_stmt.c_str(), insert_stmt.length())) { - std::stringstream err_ss; - err_ss << "Insert to mysql server(" << mysql_get_host_info(_mysql_conn) - << ") failed, because: " << mysql_error(_mysql_conn); - return Status::InternalError(err_ss.str()); - } - - return Status::OK(); -} - -Status MysqlTableWriter::append(RowBatch* batch) { - if (batch == nullptr || batch->num_rows() == 0) { - return Status::OK(); - } - - int num_rows = batch->num_rows(); - for (int i = 0; i < num_rows; ++i) { - RETURN_IF_ERROR(insert_row(batch->get_row(i))); - } - - return Status::OK(); -} - -} // namespace doris diff --git a/be/src/runtime/mysql_table_writer.h b/be/src/runtime/mysql_table_writer.h deleted file mode 100644 index ae2080acfe..0000000000 --- a/be/src/runtime/mysql_table_writer.h +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include -#include - -#include "common/status.h" - -namespace doris { - -struct MysqlConnInfo { - std::string host; - std::string user; - std::string passwd; - std::string db; - int port; - std::string charset; - - std::string debug_string() const; -}; - -class RowBatch; -class TupleRow; -class ExprContext; - -class MysqlTableWriter { -public: - MysqlTableWriter(const std::vector& output_exprs); - ~MysqlTableWriter(); - - // connect to mysql server - Status open(const MysqlConnInfo& conn_info, const std::string& tbl); - - Status begin_trans() { return Status::OK(); } - - Status append(RowBatch* batch); - - Status abort_tarns() { return Status::OK(); } - - Status finish_tarns() { return Status::OK(); } - -private: - Status insert_row(TupleRow* row); - - const std::vector& _output_expr_ctxs; - std::string _mysql_tbl; - MYSQL* _mysql_conn; -}; - -} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.cpp b/be/src/runtime/odbc_table_sink.cpp deleted file mode 100644 index a7c58d22d4..0000000000 --- a/be/src/runtime/odbc_table_sink.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/odbc_table_sink.h" - -#include - -#include "exprs/expr.h" -#include "runtime/runtime_state.h" -#include "util/debug_util.h" -#include "util/runtime_profile.h" - -namespace doris { - -OdbcTableSink::OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs) - : _pool(pool), _row_desc(row_desc), _t_output_expr(t_exprs) { - _name = "ODBC_TABLE_SINK"; -} - -OdbcTableSink::~OdbcTableSink() = default; - -Status OdbcTableSink::init(const TDataSink& t_sink) { - RETURN_IF_ERROR(DataSink::init(t_sink)); - // From the thrift expressions create the real exprs. - RETURN_IF_ERROR(Expr::create_expr_trees(_pool, _t_output_expr, &_output_expr_ctxs)); - - const TOdbcTableSink& t_odbc_sink = t_sink.odbc_table_sink; - - _odbc_param.connect_string = t_odbc_sink.connect_string; - _odbc_param.output_expr_ctxs = _output_expr_ctxs; - _odbc_tbl = t_odbc_sink.table; - _use_transaction = t_odbc_sink.use_transaction; - - return Status::OK(); -} - -Status OdbcTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(DataSink::prepare(state)); - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc)); - std::stringstream title; - title << _name << " (frag_id=" << state->fragment_instance_id() << ")"; - // create profile - _profile = state->obj_pool()->add(new RuntimeProfile(title.str())); - return Status::OK(); -} - -Status OdbcTableSink::open(RuntimeState* state) { - // Prepare the exprs to run. - RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); - // create writer - _writer.reset(new ODBCConnector(_odbc_param)); - RETURN_IF_ERROR(_writer->open(state)); - if (_use_transaction) { - RETURN_IF_ERROR(_writer->begin_trans()); - } - RETURN_IF_ERROR(_writer->init_to_write(_profile)); - return Status::OK(); -} - -Status OdbcTableSink::send(RuntimeState* state, RowBatch* batch) { - if (batch == nullptr || batch->num_rows() == 0) { - return Status::OK(); - } - uint32_t start_send_row = 0; - uint32_t num_row_sent = 0; - while (start_send_row < batch->num_rows()) { - auto status = - _writer->append(_odbc_tbl, batch, _output_expr_ctxs, start_send_row, &num_row_sent); - if (UNLIKELY(!status.ok())) { - return status; - } - start_send_row += num_row_sent; - num_row_sent = 0; - } - return Status::OK(); -} - -Status OdbcTableSink::close(RuntimeState* state, Status exec_status) { - if (_closed) { - return Status::OK(); - } - Expr::close(_output_expr_ctxs, state); - if (exec_status.ok() && _use_transaction) { - RETURN_IF_ERROR(_writer->finish_trans()); - } - return DataSink::close(state, exec_status); -} - -} // namespace doris diff --git a/be/src/runtime/odbc_table_sink.h b/be/src/runtime/odbc_table_sink.h deleted file mode 100644 index ecffd3ef27..0000000000 --- a/be/src/runtime/odbc_table_sink.h +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership.
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/data_sink.h" -#include "exec/odbc_connector.h" - -namespace doris { - -class RowDescriptor; -class TExpr; -class TOdbcTableSink; -class RuntimeState; -class RuntimeProfile; -class ExprContext; - -//This class is a sinker, which put input data to odbc table -class OdbcTableSink : public DataSink { -public: - OdbcTableSink(ObjectPool* pool, const RowDescriptor& row_desc, - const std::vector& t_exprs); - - virtual ~OdbcTableSink(); - - virtual Status init(const TDataSink& thrift_sink); - - virtual Status prepare(RuntimeState* state); - - virtual Status open(RuntimeState* state); - - // send data in 'batch' to this backend stream mgr - // Blocks until all rows in batch are placed in the buffer - virtual Status send(RuntimeState* state, RowBatch* batch); - - // Flush all buffered data and close all existing channels to destination - // hosts. Further send() calls are illegal after calling close(). - virtual Status close(RuntimeState* state, Status exec_status); - - virtual RuntimeProfile* profile() { return _profile; } - -private: - ObjectPool* _pool; - const RowDescriptor& _row_desc; - const std::vector& _t_output_expr; - - std::vector _output_expr_ctxs; - ODBCConnectorParam _odbc_param; - std::string _odbc_tbl; - std::unique_ptr _writer; - // whether use transaction - bool _use_transaction; - - RuntimeProfile* _profile; -}; - -} // namespace doris diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index 5f3b2c47ee..d7ae0556e7 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -121,8 +121,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _is_report_success = request.query_options.is_report_success; } - RETURN_IF_ERROR(_runtime_state->create_block_mgr()); - // set up desc tbl DescriptorTbl* desc_tbl = nullptr; if (fragments_ctx != nullptr) { @@ -149,7 +147,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, if (_runtime_state->enable_vectorized_exec()) { static_cast(exch_node)->set_num_senders(num_senders); } else { - return Status::NotSupported("Non-vectorized engine is not supported since Doris 1.3+."); + RETURN_ERROR_IF_NON_VEC; } } @@ -213,8 +211,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request, _rows_produced_counter = ADD_COUNTER(profile(), "RowsProduced", TUnit::UNIT); _fragment_cpu_timer = ADD_TIMER(profile(), "FragmentCpuTime"); - _row_batch.reset(new RowBatch(_plan->row_desc(), _runtime_state->batch_size())); - // _row_batch->tuple_data_pool()->set_limits(*_runtime_state->mem_trackers()); VLOG_NOTICE << "plan_root=\n" << _plan->debug_string(); _prepared = true; @@ -247,7 +243,7 @@ Status PlanFragmentExecutor::open() { if (_runtime_state->enable_vectorized_exec()) { status = open_vectorized_internal(); } else { - status = open_internal(); + 
RETURN_ERROR_IF_NON_VEC; } if (!status.ok() && !status.is() && _runtime_state->log_has_space()) { @@ -358,90 +354,6 @@ Status PlanFragmentExecutor::get_vectorized_internal(::doris::vectorized::Block* return Status::OK(); } -Status PlanFragmentExecutor::open_internal() { - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - SCOPED_TIMER(profile()->total_time_counter()); - RETURN_IF_ERROR(_plan->open(_runtime_state.get())); - } - - if (_sink == nullptr) { - return Status::OK(); - } - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - RETURN_IF_ERROR(_sink->open(runtime_state())); - } - - // If there is a sink, do all the work of driving it here, so that - // when this returns the query has actually finished - RowBatch* batch = nullptr; - while (true) { - { - SCOPED_CPU_TIMER(_fragment_cpu_timer); - RETURN_IF_ERROR(get_next_internal(&batch)); - } - - if (batch == nullptr) { - break; - } - - if (VLOG_ROW_IS_ON) { - VLOG_ROW << "open_internal: #rows=" << batch->num_rows() - << " desc=" << row_desc().debug_string(); - - for (int i = 0; i < batch->num_rows(); ++i) { - TupleRow* row = batch->get_row(i); - VLOG_ROW << row->to_string(row_desc()); - } - } - - SCOPED_TIMER(profile()->total_time_counter()); - SCOPED_CPU_TIMER(_fragment_cpu_timer); - // Collect this plan and sub plan statistics, and send to parent plan. - if (_collect_query_statistics_with_every_batch) { - _collect_query_statistics(); - } - const Status& st = _sink->send(runtime_state(), batch); - if (st.is()) { - break; - } - RETURN_IF_ERROR(st); - } - - // Close the sink *before* stopping the report thread. Close may - // need to add some important information to the last report that - // gets sent. (e.g. table sinks record the files they have written - // to in this method) - // The coordinator report channel waits until all backends are - // either in error or have returned a status report with done = - // true, so tearing down any data stream state (a separate - // channel) in Close is safe. - - // TODO: If this returns an error, the d'tor will call Close again. We should - // audit the sinks to check that this is ok, or change that behaviour. - { - SCOPED_TIMER(profile()->total_time_counter()); - _collect_query_statistics(); - Status status; - { - std::lock_guard l(_status_lock); - status = _status; - } - status = _sink->close(runtime_state(), status); - RETURN_IF_ERROR(status); - } - - // Setting to nullptr ensures that the d'tor won't double-close the sink. 
- _sink.reset(nullptr); - _done = true; - - stop_report_thread(); - send_report(true); - - return Status::OK(); -} - void PlanFragmentExecutor::_collect_query_statistics() { _query_statistics->clear(); _plan->collect_query_statistics(_query_statistics.get()); @@ -556,46 +468,6 @@ void PlanFragmentExecutor::stop_report_thread() { _report_thread.join(); } -Status PlanFragmentExecutor::get_next(RowBatch** batch) { - VLOG_FILE << "GetNext(): instance_id=" << _runtime_state->fragment_instance_id(); - Status status = get_next_internal(batch); - update_status(status); - - if (_done) { - LOG_INFO("PlanFragmentExecutor::get_next finished") - .tag("query_id", _query_id) - .tag("instance_id", _runtime_state->fragment_instance_id()); - // Query is done, return the thread token - stop_report_thread(); - send_report(true); - } - - return status; -} - -Status PlanFragmentExecutor::get_next_internal(RowBatch** batch) { - if (_done) { - *batch = nullptr; - return Status::OK(); - } - - while (!_done) { - _row_batch->reset(); - SCOPED_TIMER(profile()->total_time_counter()); - RETURN_IF_ERROR(_plan->get_next(_runtime_state.get(), _row_batch.get(), &_done)); - - if (_row_batch->num_rows() > 0) { - COUNTER_UPDATE(_rows_produced_counter, _row_batch->num_rows()); - *batch = _row_batch.get(); - break; - } - - *batch = nullptr; - } - - return Status::OK(); -} - void PlanFragmentExecutor::update_status(const Status& new_status) { if (new_status.ok()) { return; @@ -656,8 +528,6 @@ void PlanFragmentExecutor::close() { return; } - _row_batch.reset(nullptr); - // Prepare may not have been called, which sets _runtime_state if (_runtime_state != nullptr) { // _runtime_state init failed diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h index 7abf9fdc89..013c56471f 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -108,13 +108,6 @@ public: // time when open() returns, and the status-reporting thread will have been stopped. Status open(); - // Return results through 'batch'. Sets '*batch' to nullptr if no more results. - // '*batch' is owned by PlanFragmentExecutor and must not be deleted. - // When *batch == nullptr, get_next() should not be called anymore. Also, report_status_cb - // will have been called for the final time and the status-reporting thread - // will have been stopped. - Status get_next(RowBatch** batch); - // Closes the underlying plan fragment and frees up all resources allocated // in open()/get_next(). void close(); @@ -187,7 +180,6 @@ private: // returned via get_next's row batch // Created in prepare (if required), owned by this object. std::unique_ptr _sink; - std::unique_ptr _row_batch; // Number of rows returned by this fragment RuntimeProfile::Counter* _rows_produced_counter; @@ -230,11 +222,9 @@ private: // error condition, all rows will have been sent to the sink, the sink will // have been closed, a final report will have been sent and the report thread will // have been stopped. _sink will be set to nullptr after successful execution. - Status open_internal(); Status open_vectorized_internal(); // Executes get_next() logic and returns resulting status. - Status get_next_internal(RowBatch** batch); Status get_vectorized_internal(::doris::vectorized::Block* block, bool* eos); // Stops report thread, if one is running. Blocks until report thread terminates. 
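The open_internal()/get_next_internal() pair removed above implemented a pull-push driver: pull a batch from the plan root, push it into the sink, and close the sink before the final report so sink-side results (for example, files written by table sinks) can be included in that report. A compressed sketch of the control flow, with toy stand-ins for ExecNode and DataSink (the real code also collects query statistics and handles end-of-sink between these steps):

#include <cstdint>

// Toy stand-ins; Status and Batch are simplified placeholders.
struct Status { bool ok = true; static Status OK() { return {}; } };
struct Batch { int rows = 0; };

struct Plan {
    int batches = 3; // pretend the plan produces three non-empty batches
    Status get_next(Batch* b, bool* eos) {
        if (batches-- > 0) { b->rows = 1024; *eos = false; }
        else { b->rows = 0; *eos = true; }
        return Status::OK();
    }
};

struct Sink {
    Status send(const Batch&) { return Status::OK(); }
    Status close(Status final_status) { return final_status; }
};

// Pull batches from the plan root and push them into the sink. The sink is
// closed before the caller sends the final status report, matching the
// ordering constraint the deleted comment explains.
Status drive(Plan& plan, Sink& sink) {
    bool eos = false;
    while (!eos) {
        Batch batch;
        Status st = plan.get_next(&batch, &eos);
        if (!st.ok) return sink.close(st);
        if (batch.rows > 0) st = sink.send(batch);
        if (!st.ok) return sink.close(st);
    }
    return sink.close(Status::OK());
}

The vectorized open path retains this shape, just over vectorized Blocks instead of RowBatches.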
diff --git a/be/src/runtime/result_writer.h b/be/src/runtime/result_writer.h index 7d669e1b4f..a77956c0c4 100644 --- a/be/src/runtime/result_writer.h +++ b/be/src/runtime/result_writer.h @@ -39,12 +39,6 @@ public: ~ResultWriter() {}; virtual Status init(RuntimeState* state) = 0; - // convert and write one row batch - virtual Status append_row_batch(const RowBatch* batch) = 0; - - // virtual Status append_block(const vectorized::Block& block) { - // return Status::InternalError("Not support append vec block now."); - // } virtual Status close() = 0; diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index 52860bdd09..0c2a6826a8 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -26,7 +26,6 @@ #include "common/utils.h" #include "gen_cpp/Data_types.h" #include "gen_cpp/data.pb.h" -#include "runtime/buffered_tuple_stream2.inline.h" #include "runtime/collection_value.h" #include "runtime/exec_env.h" #include "runtime/runtime_state.h" @@ -200,10 +199,6 @@ void RowBatch::clear() { ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(buffer_info.client, &buffer_info.buffer); } - close_tuple_streams(); - for (int i = 0; i < _blocks.size(); ++i) { - _blocks[i]->del(); - } DCHECK(_tuple_ptrs != nullptr); free(_tuple_ptrs); _tuple_ptrs = nullptr; @@ -348,18 +343,6 @@ Status RowBatch::resize_and_allocate_tuple_buffer(RuntimeState* state, int64_t* return Status::OK(); } -void RowBatch::add_tuple_stream(BufferedTupleStream2* stream) { - DCHECK(stream != nullptr); - _tuple_streams.push_back(stream); - _auxiliary_mem_usage += stream->byte_size(); -} - -void RowBatch::add_block(BufferedBlockMgr2::Block* block) { - DCHECK(block != nullptr); - _blocks.push_back(block); - _auxiliary_mem_usage += block->buffer_len(); -} - void RowBatch::reset() { _num_rows = 0; _capacity = _tuple_ptrs_size / (_num_tuples_per_row * sizeof(Tuple*)); @@ -378,25 +361,12 @@ void RowBatch::reset() { } _buffers.clear(); - close_tuple_streams(); - for (int i = 0; i < _blocks.size(); ++i) { - _blocks[i]->del(); - } - _blocks.clear(); _auxiliary_mem_usage = 0; _need_to_return = false; _flush = FlushMode::NO_FLUSH_RESOURCES; _needs_deep_copy = false; } -void RowBatch::close_tuple_streams() { - for (int i = 0; i < _tuple_streams.size(); ++i) { - _tuple_streams[i]->close(); - delete _tuple_streams[i]; - } - _tuple_streams.clear(); -} - void RowBatch::transfer_resource_ownership(RowBatch* dest) { dest->_auxiliary_mem_usage += _tuple_data_pool.total_allocated_bytes(); dest->_tuple_data_pool.acquire_data(&_tuple_data_pool, false); @@ -414,21 +384,6 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) { } _buffers.clear(); - for (int i = 0; i < _tuple_streams.size(); ++i) { - dest->_tuple_streams.push_back(_tuple_streams[i]); - dest->_auxiliary_mem_usage += _tuple_streams[i]->byte_size(); - } - // Resource release should be done by dest RowBatch. if we don't clear the corresponding resources. 
- // This Rowbatch calls the reset() method, dest Rowbatch will also call the reset() method again, - // which will cause the core problem of double delete - _tuple_streams.clear(); - - for (int i = 0; i < _blocks.size(); ++i) { - dest->_blocks.push_back(_blocks[i]); - dest->_auxiliary_mem_usage += _blocks[i]->buffer_len(); - } - _blocks.clear(); - dest->_need_to_return |= _need_to_return; if (_needs_deep_copy) { @@ -517,9 +472,6 @@ void RowBatch::acquire_state(RowBatch* src) { src->_io_buffers.clear(); src->_auxiliary_mem_usage = 0; - DCHECK(src->_tuple_streams.empty()); - DCHECK(src->_blocks.empty()); - _has_in_flight_row = src->_has_in_flight_row; _num_rows = src->_num_rows; _capacity = src->_capacity; diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h index e73ee320c1..9cb1f310e5 100644 --- a/be/src/runtime/row_batch.h +++ b/be/src/runtime/row_batch.h @@ -24,7 +24,6 @@ #include #include "common/logging.h" -#include "runtime/buffered_block_mgr2.h" // for BufferedBlockMgr2::Block #include "runtime/bufferpool/buffer_pool.h" #include "runtime/descriptors.h" #include "runtime/disk_io_mgr.h" @@ -142,7 +141,7 @@ public: // enough memory. bool at_capacity() const { return _num_rows == _capacity || _auxiliary_mem_usage >= AT_CAPACITY_MEM_USAGE || - num_tuple_streams() > 0 || _need_to_return; + _need_to_return; } // Returns true if the row batch has filled all the rows or has accumulated @@ -238,7 +237,6 @@ public: MemPool* tuple_data_pool() { return &_tuple_data_pool; } ObjectPool* agg_object_pool() { return &_agg_object_pool; } int num_io_buffers() const { return _io_buffers.size(); } - int num_tuple_streams() const { return _tuple_streams.size(); } // increase # of uncommitted rows void increase_uncommitted_rows(); @@ -263,10 +261,6 @@ public: void add_buffer(BufferPool::ClientHandle* client, BufferPool::BufferHandle&& buffer, FlushMode flush); - // Adds a block to this row batch. The block must be pinned. The blocks must be - // deleted when freeing resources. - void add_block(BufferedBlockMgr2::Block* block); - // Called to indicate this row batch must be returned up the operator tree. // This is used to control memory management for streaming rows. // TODO: consider using this mechanism instead of add_io_buffer/add_tuple_stream. This is @@ -393,9 +387,6 @@ public: std::string to_string(); private: - // Close owned tuple streams and delete if needed. - void close_tuple_streams(); - // All members need to be handled in RowBatch::swap() bool _has_in_flight_row; // if true, last row hasn't been committed yet @@ -460,12 +451,6 @@ private: }; /// Pages attached to this row batch. See AddBuffer() for ownership semantics. std::vector _buffers; - // Tuple streams currently owned by this row batch. - std::vector _tuple_streams; - - // Blocks attached to this row batch. The underlying memory and block manager client - // are owned by the BufferedBlockMgr2. - std::vector _blocks; // String to write compressed tuple data to in serialize(). 
// This is a string so we can swap() with the string in the PRowBatch we're serializing diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 9688bbf441..044a619e03 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -30,7 +30,6 @@ #include "common/object_pool.h" #include "common/status.h" #include "exec/exec_node.h" -#include "runtime/buffered_block_mgr2.h" #include "runtime/exec_env.h" #include "runtime/load_path_mgr.h" #include "runtime/memory/mem_tracker.h" @@ -222,14 +221,6 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) { return Status::OK(); } -Status RuntimeState::create_block_mgr() { - DCHECK(_block_mgr2.get() == nullptr); - RETURN_IF_ERROR(BufferedBlockMgr2::create(this, runtime_profile(), _exec_env->tmp_file_mgr(), - _exec_env->disk_io_mgr()->max_read_buffer_size(), - &_block_mgr2)); - return Status::OK(); -} - bool RuntimeState::error_log_is_empty() { std::lock_guard l(_error_log_lock); return (_error_log.size() > 0); diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index f1180155d9..dedef5340d 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -84,9 +84,6 @@ public: // for ut and non-query. Status init_mem_trackers(const TUniqueId& query_id = TUniqueId()); - // Gets/Creates the query wide block mgr. - Status create_block_mgr(); - Status create_load_dir(); const TQueryOptions& query_options() const { return _query_options; } diff --git a/be/src/runtime/sorter.h b/be/src/runtime/sorter.h deleted file mode 100644 index 2b1a4d4fe4..0000000000 --- a/be/src/runtime/sorter.h +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/sorter.h -// and modified by Doris - -#pragma once - -#include "common/status.h" - -namespace doris { - -class RowBatch; -class RuntimeState; -// Interface to sort rows -// 1. create one sorter -// 2. add data need be sorted through 'add_batch' -// 3. call 'input_done' when all data were added. -// 4. call 'get_next' fetch data which is sorted. -class Sorter { -public: - virtual ~Sorter() {} - - virtual Status prepare(RuntimeState* state) { return Status::OK(); } - - // Add data to be sorted. 
- virtual Status add_batch(RowBatch* batch) { return Status::OK(); } - - // call when all data be added - virtual Status input_done() = 0; - - // fetch data already sorted, - // client must insure that call this function AFTER call input_done - virtual Status get_next(RowBatch* batch, bool* eos) = 0; - - virtual Status close(RuntimeState* state) { return Status::OK(); } -}; - -} // namespace doris diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index fc2a7ed4dc..497f60b828 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -448,10 +448,6 @@ Status HashJoinNode::close(RuntimeState* state) { return VJoinNodeBase::close(state); } -Status HashJoinNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented HashJoin Node::get_next scalar"); -} - bool HashJoinNode::need_more_input_data() { return (_probe_block.rows() == 0 || _probe_index == _probe_block.rows()) && !_probe_eos && !_short_circuit_for_null_in_probe_side; diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index b4b49d7b61..76fa064903 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -199,7 +199,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, Block* block, bool* eos) override; Status close(RuntimeState* state) override; void add_hash_buckets_info(const std::string& info); diff --git a/be/src/vec/exec/join/vnested_loop_join_node.h b/be/src/vec/exec/join/vnested_loop_join_node.h index da2cd73915..23eba8fb96 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.h +++ b/be/src/vec/exec/join/vnested_loop_join_node.h @@ -57,10 +57,6 @@ public: Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VNestedLoopJoinNode::get_next scalar"); - } - void debug_string(int indentation_level, std::stringstream* out) const override; const RowDescriptor& intermediate_row_desc() const override { diff --git a/be/src/vec/exec/scan/vscan_node.h b/be/src/vec/exec/scan/vscan_node.h index c2de829808..3a91d91b91 100644 --- a/be/src/vec/exec/scan/vscan_node.h +++ b/be/src/vec/exec/scan/vscan_node.h @@ -65,10 +65,6 @@ public: virtual void set_scan_ranges(const std::vector& scan_ranges) {} - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not implement"); - } - // Get next block. // If eos is true, no more data will be read and block should be empty. 
Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/vec/exec/vaggregation_node.cpp b/be/src/vec/exec/vaggregation_node.cpp index ad035e21e8..6884d7b17d 100644 --- a/be/src/vec/exec/vaggregation_node.cpp +++ b/be/src/vec/exec/vaggregation_node.cpp @@ -505,10 +505,6 @@ Status AggregationNode::open(RuntimeState* state) { return Status::OK(); } -Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented Aggregation Node::get_next scalar"); -} - Status AggregationNode::do_pre_agg(vectorized::Block* input_block, vectorized::Block* output_block) { RETURN_IF_ERROR(_executor.pre_agg(input_block, output_block)); diff --git a/be/src/vec/exec/vaggregation_node.h b/be/src/vec/exec/vaggregation_node.h index ac34ca84f6..bb40c7f509 100644 --- a/be/src/vec/exec/vaggregation_node.h +++ b/be/src/vec/exec/vaggregation_node.h @@ -777,7 +777,6 @@ public: virtual Status prepare(RuntimeState* state) override; virtual Status open(RuntimeState* state) override; virtual Status alloc_resource(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; virtual Status close(RuntimeState* state) override; virtual void release_resource(RuntimeState* state) override; diff --git a/be/src/vec/exec/vanalytic_eval_node.cpp b/be/src/vec/exec/vanalytic_eval_node.cpp index bfac9e5d00..f748886f31 100644 --- a/be/src/vec/exec/vanalytic_eval_node.cpp +++ b/be/src/vec/exec/vanalytic_eval_node.cpp @@ -305,10 +305,6 @@ bool VAnalyticEvalNode::can_read() { return true; } -Status VAnalyticEvalNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VAnalyticEvalNode::get_next."); -} - Status VAnalyticEvalNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VAnalyticEvalNode::get_next"); diff --git a/be/src/vec/exec/vanalytic_eval_node.h b/be/src/vec/exec/vanalytic_eval_node.h index 9957221193..54becbba88 100644 --- a/be/src/vec/exec/vanalytic_eval_node.h +++ b/be/src/vec/exec/vanalytic_eval_node.h @@ -44,7 +44,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; Status close(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; diff --git a/be/src/vec/exec/varrow_scanner.h b/be/src/vec/exec/varrow_scanner.h index e67300332d..5779dbb372 100644 --- a/be/src/vec/exec/varrow_scanner.h +++ b/be/src/vec/exec/varrow_scanner.h @@ -50,11 +50,6 @@ public: // Open this scanner, will initialize information need to virtual Status open() override; - virtual Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get next"); - } - virtual Status get_next(Block* block, bool* eof) override; // Update file predicate filter profile diff --git a/be/src/vec/exec/vassert_num_rows_node.h b/be/src/vec/exec/vassert_num_rows_node.h index 0f6ffcb9de..6b0432b6db 100644 --- a/be/src/vec/exec/vassert_num_rows_node.h +++ 
b/be/src/vec/exec/vassert_num_rows_node.h @@ -27,10 +27,6 @@ class VAssertNumRowsNode : public ExecNode { public: VAssertNumRowsNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VAnalyticEvalNode::get_next."); - } - Status open(RuntimeState* state) override; Status get_next(RuntimeState* state, Block* block, bool* eos) override; Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vbroker_scan_node.h b/be/src/vec/exec/vbroker_scan_node.h index 452415014f..9c5e436b19 100644 --- a/be/src/vec/exec/vbroker_scan_node.h +++ b/be/src/vec/exec/vbroker_scan_node.h @@ -45,11 +45,6 @@ public: // Start broker scan using ParquetScanner or BrokerScanner. Status open(RuntimeState* state) override; - // Fill the next row batch by calling next() on the scanner, - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VBrokerScanNode::get_next."); - } - Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Close the scanner, and report errors. diff --git a/be/src/vec/exec/vbroker_scanner.h b/be/src/vec/exec/vbroker_scanner.h index 2e26eb58b0..3283530462 100644 --- a/be/src/vec/exec/vbroker_scanner.h +++ b/be/src/vec/exec/vbroker_scanner.h @@ -47,11 +47,6 @@ public: Status open() override; - virtual Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get next"); - } - Status get_next(Block* block, bool* eof) override; void close() override; diff --git a/be/src/vec/exec/vdata_gen_scan_node.cpp b/be/src/vec/exec/vdata_gen_scan_node.cpp index 26ae698795..ae3d7d0986 100644 --- a/be/src/vec/exec/vdata_gen_scan_node.cpp +++ b/be/src/vec/exec/vdata_gen_scan_node.cpp @@ -86,11 +86,6 @@ Status VDataGenFunctionScanNode::open(RuntimeState* state) { return Status::OK(); } -Status VDataGenFunctionScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - LOG(FATAL) << "VDataGenFunctionScanNode only support vectorized execution"; - return Status::OK(); -} - Status VDataGenFunctionScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { if (state == nullptr || block == nullptr || eos == nullptr) { diff --git a/be/src/vec/exec/vdata_gen_scan_node.h b/be/src/vec/exec/vdata_gen_scan_node.h index 73470e8d49..18ca2c040f 100644 --- a/be/src/vec/exec/vdata_gen_scan_node.h +++ b/be/src/vec/exec/vdata_gen_scan_node.h @@ -45,10 +45,6 @@ public: // Start MySQL scan using _mysql_scanner. Status open(RuntimeState* state) override; - // Fill the next row batch by calling next() on the _mysql_scanner, - // converting text data in MySQL cells to binary data. - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; - Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Close the _mysql_scanner, and report errors. 
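
Every hunk in this stretch is the same mechanical edit: the per-node scalar get_next() stubs that only returned a NotSupported status are deleted, leaving the Block overload as the node's sole data path, while the legacy entry point now fails once in the shared base class (see RETURN_ERROR_IF_NON_VEC earlier in this patch). A minimal sketch of the resulting class shape, with stand-in types rather than the real ExecNode hierarchy:

    #include <string>

    struct Status {
        bool ok = true;
        std::string msg;
        static Status OK() { return {}; }
        static Status NotSupported(std::string m) { return {false, std::move(m)}; }
    };

    struct RowBatch;  // legacy scalar batch (being removed)
    struct Block;     // vectorized batch

    class ExecNode {
    public:
        virtual ~ExecNode() = default;
        // Legacy path: one shared failure instead of a stub in every subclass.
        virtual Status get_next(RowBatch** /*batch*/, bool* /*eos*/) {
            return Status::NotSupported("non-vectorized engine is no longer supported");
        }
        // Vectorized path: the only overload subclasses still implement.
        virtual Status get_next(Block* block, bool* eos) = 0;
    };

    class EmptySetNodeSketch final : public ExecNode {
    public:
        using ExecNode::get_next;  // keep the base RowBatch overload visible
        Status get_next(Block* /*block*/, bool* eos) override {
            *eos = true;  // produces no rows, reports end-of-stream immediately
            return Status::OK();
        }
    };
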
diff --git a/be/src/vec/exec/vempty_set_node.h b/be/src/vec/exec/vempty_set_node.h index 900f0c6016..80b1d2775f 100644 --- a/be/src/vec/exec/vempty_set_node.h +++ b/be/src/vec/exec/vempty_set_node.h @@ -26,9 +26,6 @@ namespace vectorized { class VEmptySetNode : public ExecNode { public: VEmptySetNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented get RowBatch in vecorized execution."); - } virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; }; } // namespace vectorized diff --git a/be/src/vec/exec/vexchange_node.cpp b/be/src/vec/exec/vexchange_node.cpp index 8e34b1e444..7196c8a5c1 100644 --- a/be/src/vec/exec/vexchange_node.cpp +++ b/be/src/vec/exec/vexchange_node.cpp @@ -88,9 +88,6 @@ Status VExchangeNode::open(RuntimeState* state) { return Status::OK(); } -Status VExchangeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VExchange Node::get_next scalar"); -} Status VExchangeNode::get_next(RuntimeState* state, Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VExchangeNode::get_next"); diff --git a/be/src/vec/exec/vexchange_node.h b/be/src/vec/exec/vexchange_node.h index 1d767c8cb2..68b778aade 100644 --- a/be/src/vec/exec/vexchange_node.h +++ b/be/src/vec/exec/vexchange_node.h @@ -39,7 +39,6 @@ public: Status prepare(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, Block* row_batch, bool* eos) override; void release_resource(RuntimeState* state) override; Status collect_query_statistics(QueryStatistics* statistics) override; diff --git a/be/src/vec/exec/vjson_scanner.h b/be/src/vec/exec/vjson_scanner.h index 710d540d14..3f32648f29 100644 --- a/be/src/vec/exec/vjson_scanner.h +++ b/be/src/vec/exec/vjson_scanner.h @@ -59,10 +59,6 @@ public: // Open this scanner, will initialize information needed Status open() override; - Status get_next(doris::Tuple* tuple, MemPool* tuple_pool, bool* eof, - bool* fill_tuple) override { - return Status::NotSupported("Not Implemented get tuple"); - } Status get_next(vectorized::Block* output_block, bool* eof) override; void close() override; diff --git a/be/src/vec/exec/vmysql_scan_node.h b/be/src/vec/exec/vmysql_scan_node.h index 2fd8240956..5bea0fb388 100644 --- a/be/src/vec/exec/vmysql_scan_node.h +++ b/be/src/vec/exec/vmysql_scan_node.h @@ -43,9 +43,6 @@ public: // Start MySQL scan using mysql_scanner. Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VMysqlScanNode Node::get_next scalar"); - } // Fill the next block by calling next() on the mysql_scanner, // converting text data in MySQL cells to binary data. 
Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; diff --git a/be/src/vec/exec/vschema_scan_node.h b/be/src/vec/exec/vschema_scan_node.h index acb725baa4..57117b23af 100644 --- a/be/src/vec/exec/vschema_scan_node.h +++ b/be/src/vec/exec/vschema_scan_node.h @@ -37,9 +37,6 @@ public: VSchemaScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); ~VSchemaScanNode(); Status prepare(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented VSchemaScanNode Node::get_next scalar"); - } virtual Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; // Prepare conjuncts, create Schema columns to slots mapping diff --git a/be/src/vec/exec/vselect_node.cpp b/be/src/vec/exec/vselect_node.cpp index 469d72ad73..b8f10d57be 100644 --- a/be/src/vec/exec/vselect_node.cpp +++ b/be/src/vec/exec/vselect_node.cpp @@ -38,10 +38,6 @@ Status VSelectNode::open(RuntimeState* state) { return Status::OK(); } -Status VSelectNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - return Status::NotSupported("Not Implemented VSelectNode::get_next."); -} - Status VSelectNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VSelectNode::get_next"); SCOPED_TIMER(_runtime_profile->total_time_counter()); diff --git a/be/src/vec/exec/vselect_node.h b/be/src/vec/exec/vselect_node.h index afff5cb734..984e4578fb 100644 --- a/be/src/vec/exec/vselect_node.h +++ b/be/src/vec/exec/vselect_node.h @@ -27,7 +27,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; Status close(RuntimeState* state) override; Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h index 8b8d10e9e3..1e339e3a80 100644 --- a/be/src/vec/exec/vset_operation_node.h +++ b/be/src/vec/exec/vset_operation_node.h @@ -39,9 +39,6 @@ public: Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status get_next(RuntimeState* /*state*/, RowBatch* /*row_batch*/, bool* /*eos*/) override { - return Status::NotSupported("Not implemented get RowBatch in vectorized execution."); - } Status get_next(RuntimeState* state, Block* output_block, bool* eos) override; diff --git a/be/src/vec/exec/vsort_node.cpp b/be/src/vec/exec/vsort_node.cpp index 2596c0eb03..241c99c287 100644 --- a/be/src/vec/exec/vsort_node.cpp +++ b/be/src/vec/exec/vsort_node.cpp @@ -135,11 +135,6 @@ Status VSortNode::open(RuntimeState* state) { return Status::OK(); } -Status VSortNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { - *eos = true; - return Status::NotSupported("Not Implemented VSortNode::get_next scalar"); -} - Status VSortNode::pull(doris::RuntimeState* state, vectorized::Block* output_block, bool* eos) { RETURN_IF_ERROR(_sorter->get_next(state, output_block, eos)); reached_limit(output_block, eos); diff --git a/be/src/vec/exec/vsort_node.h b/be/src/vec/exec/vsort_node.h 
index 0a29d08c54..ff7b692096 100644 --- a/be/src/vec/exec/vsort_node.h +++ b/be/src/vec/exec/vsort_node.h @@ -37,30 +37,28 @@ public: ~VSortNode() override = default; - virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; + Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; - virtual Status prepare(RuntimeState* state) override; + Status prepare(RuntimeState* state) override; - virtual Status alloc_resource(RuntimeState* state) override; + Status alloc_resource(RuntimeState* state) override; - virtual Status open(RuntimeState* state) override; + Status open(RuntimeState* state) override; - virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override; + Status get_next(RuntimeState* state, Block* block, bool* eos) override; - virtual Status get_next(RuntimeState* state, Block* block, bool* eos) override; + Status reset(RuntimeState* state) override; - virtual Status reset(RuntimeState* state) override; + Status close(RuntimeState* state) override; - virtual Status close(RuntimeState* state) override; + void release_resource(RuntimeState* state) override; - virtual void release_resource(RuntimeState* state) override; + Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; - virtual Status pull(RuntimeState* state, vectorized::Block* output_block, bool* eos) override; - - virtual Status sink(RuntimeState* state, vectorized::Block* input_block, bool eos) override; + Status sink(RuntimeState* state, vectorized::Block* input_block, bool eos) override; protected: - virtual void debug_string(int indentation_level, std::stringstream* out) const override; + void debug_string(int indentation_level, std::stringstream* out) const override; private: // Number of rows to skip. 
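
Besides dropping the RowBatch overload, this vsort_node.h hunk strips the redundant virtual keyword from declarations already marked override: override by itself implies virtual dispatch and still turns any signature mismatch into a compile error. A small illustration:

    struct Base {
        virtual ~Base() = default;
        virtual void run(int x) = 0;
    };

    struct Derived final : Base {
        void run(int x) override;      // preferred: 'override' alone is enough
        // void run(long x) override;  // would fail to compile: overrides nothing
    };

    void Derived::run(int) {}
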
diff --git a/be/src/vec/exec/vunion_node.h b/be/src/vec/exec/vunion_node.h index ccc683452b..fa517df2ff 100644 --- a/be/src/vec/exec/vunion_node.h +++ b/be/src/vec/exec/vunion_node.h @@ -29,9 +29,6 @@ public: Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; - Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) override { - return Status::NotSupported("Not Implemented get RowBatch in vecorized execution."); - } Status close(RuntimeState* state) override; Status alloc_resource(RuntimeState* state) override; diff --git a/be/src/vec/runtime/vfile_result_writer.h b/be/src/vec/runtime/vfile_result_writer.h index 37f37ae159..0c305ef4dd 100644 --- a/be/src/vec/runtime/vfile_result_writer.h +++ b/be/src/vec/runtime/vfile_result_writer.h @@ -37,9 +37,6 @@ public: virtual ~VFileResultWriter() = default; Status append_block(Block& block) override; - Status append_row_batch(const RowBatch* batch) override { - return Status::NotSupported("append_row_batch is not supported in VFileResultWriter!"); - }; Status init(RuntimeState* state) override; Status close() override; diff --git a/be/src/vec/runtime/vsorted_run_merger.cpp b/be/src/vec/runtime/vsorted_run_merger.cpp index e2654e6b60..418b5f990d 100644 --- a/be/src/vec/runtime/vsorted_run_merger.cpp +++ b/be/src/vec/runtime/vsorted_run_merger.cpp @@ -21,7 +21,6 @@ #include "runtime/descriptors.h" #include "runtime/row_batch.h" -#include "runtime/sorter.h" #include "util/debug_util.h" #include "util/defer_op.h" #include "util/runtime_profile.h" diff --git a/be/src/vec/sink/vdata_stream_sender.cpp b/be/src/vec/sink/vdata_stream_sender.cpp index 30ce7be6ac..3b32023b6a 100644 --- a/be/src/vec/sink/vdata_stream_sender.cpp +++ b/be/src/vec/sink/vdata_stream_sender.cpp @@ -480,10 +480,6 @@ Status VDataStreamSender::open(RuntimeState* state) { return Status::OK(); } -Status VDataStreamSender::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented VOlapScanNode Node::get_next scalar"); -} - Status VDataStreamSender::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VDataStreamSender::send") SCOPED_TIMER(_profile->total_time_counter()); diff --git a/be/src/vec/sink/vdata_stream_sender.h b/be/src/vec/sink/vdata_stream_sender.h index 7cc13162ef..69cd1ecc9b 100644 --- a/be/src/vec/sink/vdata_stream_sender.h +++ b/be/src/vec/sink/vdata_stream_sender.h @@ -74,7 +74,6 @@ public: Status prepare(RuntimeState* state) override; Status open(RuntimeState* state) override; - Status send(RuntimeState* state, RowBatch* batch) override; Status send(RuntimeState* state, Block* block, bool eos = false) override; Status close(RuntimeState* state, Status exec_status) override; diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index e5f2c258e3..1beacdfaa8 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -413,10 +413,6 @@ int VMysqlResultWriter::_add_one_cell(const ColumnPtr& column_ptr, size_t row_id } } -Status VMysqlResultWriter::append_row_batch(const RowBatch* batch) { - return Status::RuntimeError("Not Implemented MysqlResultWriter::append_row_batch scalar"); -} - Status VMysqlResultWriter::append_block(Block& input_block) { SCOPED_TIMER(_append_row_batch_timer); Status status = Status::OK(); diff --git a/be/src/vec/sink/vmysql_result_writer.h 
b/be/src/vec/sink/vmysql_result_writer.h index e17d41b7fa..e566a30213 100644 --- a/be/src/vec/sink/vmysql_result_writer.h +++ b/be/src/vec/sink/vmysql_result_writer.h @@ -39,8 +39,6 @@ public: virtual Status init(RuntimeState* state) override; - virtual Status append_row_batch(const RowBatch* batch) override; - virtual Status append_block(Block& block) override; virtual bool can_sink() override; diff --git a/be/src/vec/sink/vmysql_table_writer.cpp b/be/src/vec/sink/vmysql_table_writer.cpp index f302513c7a..cbba836377 100644 --- a/be/src/vec/sink/vmysql_table_writer.cpp +++ b/be/src/vec/sink/vmysql_table_writer.cpp @@ -33,6 +33,14 @@ namespace doris { namespace vectorized { +std::string MysqlConnInfo::debug_string() const { + std::stringstream ss; + + ss << "(host=" << host << ",port=" << port << ",user=" << user << ",db=" << db + << ",passwd=" << passwd << ",charset=" << charset << ")"; + return ss.str(); +} + VMysqlTableWriter::VMysqlTableWriter(const std::vector& output_expr_ctxs) : _vec_output_expr_ctxs(output_expr_ctxs) {} diff --git a/be/src/vec/sink/vmysql_table_writer.h b/be/src/vec/sink/vmysql_table_writer.h index 6379896e63..5c494262c2 100644 --- a/be/src/vec/sink/vmysql_table_writer.h +++ b/be/src/vec/sink/vmysql_table_writer.h @@ -24,11 +24,21 @@ #include #include "common/status.h" -#include "runtime/mysql_table_writer.h" namespace doris { namespace vectorized { +struct MysqlConnInfo { + std::string host; + std::string user; + std::string passwd; + std::string db; + int port; + std::string charset; + + std::string debug_string() const; +}; + class VExprContext; class Block; class VMysqlTableWriter { diff --git a/be/src/vec/sink/vresult_file_sink.cpp b/be/src/vec/sink/vresult_file_sink.cpp index b63ebf160f..a25683e6da 100644 --- a/be/src/vec/sink/vresult_file_sink.cpp +++ b/be/src/vec/sink/vresult_file_sink.cpp @@ -137,10 +137,6 @@ Status VResultFileSink::open(RuntimeState* state) { return VExpr::open(_output_vexpr_ctxs, state); } -Status VResultFileSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented VResultFileSink Node::get_next scalar"); -} - Status VResultFileSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VResultFileSink::send"); RETURN_IF_ERROR(_writer->append_block(*block)); diff --git a/be/src/vec/sink/vresult_file_sink.h b/be/src/vec/sink/vresult_file_sink.h index 33d454f0bc..ba63cad517 100644 --- a/be/src/vec/sink/vresult_file_sink.h +++ b/be/src/vec/sink/vresult_file_sink.h @@ -40,7 +40,6 @@ public: Status open(RuntimeState* state) override; // send data in 'batch' to this backend stream mgr // Blocks until all rows in batch are placed in the buffer - Status send(RuntimeState* state, RowBatch* batch) override; Status send(RuntimeState* state, Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). 
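
With runtime/mysql_table_writer.h deleted, the MysqlConnInfo struct and its debug_string() helper now live with the vectorized writer, as the two hunks above show. A stand-alone, compilable rendering of that struct for reference (the field values in main are invented; note that debug_string() prints the password too):

    #include <iostream>
    #include <sstream>
    #include <string>

    struct MysqlConnInfo {
        std::string host;
        std::string user;
        std::string passwd;
        std::string db;
        int port = 0;
        std::string charset;

        std::string debug_string() const {
            std::stringstream ss;
            ss << "(host=" << host << ",port=" << port << ",user=" << user
               << ",db=" << db << ",passwd=" << passwd << ",charset=" << charset << ")";
            return ss.str();
        }
    };

    int main() {
        MysqlConnInfo info {"127.0.0.1", "root", "***", "demo_db", 3306, "utf8"};
        std::cout << info.debug_string() << "\n";
        // prints: (host=127.0.0.1,port=3306,user=root,db=demo_db,passwd=***,charset=utf8)
    }
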
diff --git a/be/src/vec/sink/vresult_sink.cpp b/be/src/vec/sink/vresult_sink.cpp index 2521636c6a..467fb6c82c 100644 --- a/be/src/vec/sink/vresult_sink.cpp +++ b/be/src/vec/sink/vresult_sink.cpp @@ -83,10 +83,6 @@ Status VResultSink::open(RuntimeState* state) { return VExpr::open(_output_vexpr_ctxs, state); } -Status VResultSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported("Not Implemented Result Sink::send scalar"); -} - Status VResultSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VResultSink::send"); // The memory consumption in the process of sending the results is not check query memory limit. diff --git a/be/src/vec/sink/vresult_sink.h b/be/src/vec/sink/vresult_sink.h index 4b63c48f95..63441e3179 100644 --- a/be/src/vec/sink/vresult_sink.h +++ b/be/src/vec/sink/vresult_sink.h @@ -115,8 +115,6 @@ public: virtual Status prepare(RuntimeState* state) override; virtual Status open(RuntimeState* state) override; - // not implement - virtual Status send(RuntimeState* state, RowBatch* batch) override; virtual Status send(RuntimeState* state, Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). diff --git a/be/src/vec/sink/vtable_sink.cpp b/be/src/vec/sink/vtable_sink.cpp index 4bf4d64147..a09fb2cb5f 100644 --- a/be/src/vec/sink/vtable_sink.cpp +++ b/be/src/vec/sink/vtable_sink.cpp @@ -54,11 +54,6 @@ Status VTableSink::open(RuntimeState* state) { return Status::OK(); } -Status VTableSink::send(RuntimeState* state, RowBatch* batch) { - return Status::NotSupported( - "Not Implemented VTableSink::send(RuntimeState* state, RowBatch* batch)"); -} - Status VTableSink::send(RuntimeState* state, Block* block, bool eos) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VTableSink::send"); return Status::OK(); diff --git a/be/src/vec/sink/vtable_sink.h b/be/src/vec/sink/vtable_sink.h index 62d2e6fde6..339df25f5c 100644 --- a/be/src/vec/sink/vtable_sink.h +++ b/be/src/vec/sink/vtable_sink.h @@ -38,8 +38,6 @@ public: Status open(RuntimeState* state) override; - Status send(RuntimeState* state, RowBatch* batch) override; - Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) override; // Flush all buffered data and close all existing channels to destination // hosts. Further send() calls are illegal after calling close(). 
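
The vresult_sink and vtable_sink hunks repeat the interface narrowing applied to every sink in this patch: the RowBatch send() disappears, end-of-stream rides on the final Block-based send() (eos defaults to false), and a single close() flushes whatever is buffered, after which further send() calls are illegal. A minimal sketch of the remaining contract, with stand-in types rather than the actual Doris DataSink API:

    #include <vector>

    struct Status { bool ok = true; static Status OK() { return {}; } };
    struct Block { std::vector<int> rows; };
    struct RuntimeState;  // opaque here

    class SinkSketch {
    public:
        virtual ~SinkSketch() = default;
        virtual Status open(RuntimeState* state) = 0;
        // The only send() left; the last call carries eos = true.
        virtual Status send(RuntimeState* state, Block* block, bool eos = false) = 0;
        // Flushes buffered data; send() must not be called afterwards.
        virtual Status close(RuntimeState* state, Status exec_status) = 0;
    };
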
diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp
index ae799e46d9..13aa7661ea 100644
--- a/be/src/vec/sink/vtablet_sink.cpp
+++ b/be/src/vec/sink/vtablet_sink.cpp
@@ -17,12 +17,29 @@
 #include "vec/sink/vtablet_sink.h"
 
+#include
+#include
+#include
+#include
+
+#include "exec/tablet_info.h"
+#include "exprs/expr.h"
+#include "exprs/expr_context.h"
+#include "olap/hll.h"
+#include "runtime/exec_env.h"
+#include "runtime/row_batch.h"
+#include "runtime/runtime_state.h"
 #include "runtime/thread_context.h"
+#include "runtime/tuple_row.h"
+#include "service/backend_options.h"
 #include "util/brpc_client_cache.h"
 #include "util/debug/sanitizer_scopes.h"
+#include "util/defer_op.h"
 #include "util/doris_metrics.h"
 #include "util/proto_util.h"
+#include "util/threadpool.h"
 #include "util/time.h"
+#include "util/uid_util.h"
 #include "vec/columns/column_array.h"
 #include "vec/core/block.h"
 #include "vec/exprs/vexpr.h"
@@ -31,9 +48,131 @@ namespace doris {
 namespace stream_load {
 
-VNodeChannel::VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id)
-        : NodeChannel(parent, index_channel, node_id) {
-    _is_vectorized = true;
+Status IndexChannel::init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets) {
+    SCOPED_CONSUME_MEM_TRACKER(_index_channel_tracker.get());
+    for (auto& tablet : tablets) {
+        auto location = _parent->_location->find_tablet(tablet.tablet_id);
+        if (location == nullptr) {
+            LOG(WARNING) << "unknown tablet, tablet_id=" << tablet.tablet_id;
+            return Status::InternalError("unknown tablet");
+        }
+        std::vector<std::shared_ptr<VNodeChannel>> channels;
+        for (auto& node_id : location->node_ids) {
+            std::shared_ptr<VNodeChannel> channel;
+            auto it = _node_channels.find(node_id);
+            if (it == _node_channels.end()) {
+                // NodeChannel is not added to the _parent->_pool.
+                // Because the deconstruction of NodeChannel may take a long time to wait rpc finish.
+                // but the ObjectPool will hold a spin lock to delete objects.
+                channel = std::make_shared<VNodeChannel>(_parent, this, node_id);
+                _node_channels.emplace(node_id, channel);
+            } else {
+                channel = it->second;
+            }
+            channel->add_tablet(tablet);
+            if (_parent->_write_single_replica) {
+                auto slave_location = _parent->_slave_location->find_tablet(tablet.tablet_id);
+                if (slave_location != nullptr) {
+                    channel->add_slave_tablet_nodes(tablet.tablet_id, slave_location->node_ids);
+                }
+            }
+            channels.push_back(channel);
+            _tablets_by_channel[node_id].insert(tablet.tablet_id);
+        }
+        _channels_by_tablet.emplace(tablet.tablet_id, std::move(channels));
+    }
+    for (auto& it : _node_channels) {
+        RETURN_IF_ERROR(it.second->init(state));
+    }
+    return Status::OK();
+}
+
+void IndexChannel::mark_as_failed(int64_t node_id, const std::string& host, const std::string& err,
+                                  int64_t tablet_id) {
+    VLOG_PROGRESS << "mark node_id:" << node_id << " tablet_id: " << tablet_id
+                  << " as failed, err: " << err;
+    const auto& it = _tablets_by_channel.find(node_id);
+    if (it == _tablets_by_channel.end()) {
+        return;
+    }
+
+    {
+        std::lock_guard l(_fail_lock);
+        if (tablet_id == -1) {
+            for (const auto the_tablet_id : it->second) {
+                _failed_channels[the_tablet_id].insert(node_id);
+                _failed_channels_msgs.emplace(the_tablet_id, err + ", host: " + host);
+                if (_failed_channels[the_tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) {
+                    _intolerable_failure_status =
+                            Status::InternalError(_failed_channels_msgs[the_tablet_id]);
+                }
+            }
+        } else {
+            _failed_channels[tablet_id].insert(node_id);
+            _failed_channels_msgs.emplace(tablet_id, err + ", host: " + host);
+            if (_failed_channels[tablet_id].size() >= ((_parent->_num_replicas + 1) / 2)) {
+                _intolerable_failure_status =
+                        Status::InternalError(_failed_channels_msgs[tablet_id]);
+            }
+        }
+    }
+}
+
+Status IndexChannel::check_intolerable_failure() {
+    std::lock_guard l(_fail_lock);
+    return _intolerable_failure_status;
+}
+
+void IndexChannel::set_error_tablet_in_state(RuntimeState* state) {
+    std::vector<TErrorTabletInfo>& error_tablet_infos = state->error_tablet_infos();
+
+    std::lock_guard l(_fail_lock);
+    for (const auto& it : _failed_channels_msgs) {
+        TErrorTabletInfo error_info;
+        error_info.__set_tabletId(it.first);
+        error_info.__set_msg(it.second);
+        error_tablet_infos.emplace_back(error_info);
+    }
+}
+
+void IndexChannel::set_tablets_received_rows(
+        const std::vector<std::pair<int64_t, int64_t>>& tablets_received_rows, int64_t node_id) {
+    for (const auto& [tablet_id, rows_num] : tablets_received_rows) {
+        _tablets_received_rows[tablet_id].emplace_back(node_id, rows_num);
+    }
+}
+
+Status IndexChannel::check_tablet_received_rows_consistency() {
+    for (auto& tablet : _tablets_received_rows) {
+        for (size_t i = 0; i < tablet.second.size(); i++) {
+            VLOG_NOTICE << "check_tablet_received_rows_consistency, load_id: " << _parent->_load_id
+                        << ", txn_id: " << std::to_string(_parent->_txn_id)
+                        << ", tablet_id: " << tablet.first
+                        << ", node_id: " << tablet.second[i].first
+                        << ", rows_num: " << tablet.second[i].second;
+            if (i == 0) {
+                continue;
+            }
+            if (tablet.second[i].second != tablet.second[0].second) {
+                LOG(WARNING) << "rows num doesn't match, load_id: " << _parent->_load_id
+                             << ", txn_id: " << std::to_string(_parent->_txn_id)
+                             << ", tablet_id: " << tablet.first
+                             << ", node_id: " << tablet.second[i].first
+                             << ", rows_num: " << tablet.second[i].second
+                             << ", node_id: " << tablet.second[0].first
+                             << ", rows_num: " << tablet.second[0].second;
+                return Status::InternalError("rows num written by multi replicas doesn't match");
+            }
+        }
+    }
+    return Status::OK();
+}
+
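
The quorum arithmetic in mark_as_failed() above is the load's fault-tolerance rule: a tablet becomes an intolerable failure as soon as the replicas that failed on it reach a majority, i.e. failed_count >= (num_replicas + 1) / 2. A toy model of just that accounting (illustrative names, no locking):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>

    struct FailureTracker {
        int num_replicas;
        std::map<int64_t, std::set<int64_t>> failed_nodes_by_tablet;

        // Records a replica failure; returns true once the tablet has lost
        // a majority of its replicas and the load can no longer succeed.
        bool mark_as_failed(int64_t tablet_id, int64_t node_id) {
            auto& failed = failed_nodes_by_tablet[tablet_id];
            failed.insert(node_id);
            return failed.size() >= static_cast<size_t>((num_replicas + 1) / 2);
        }
    };

    int main() {
        FailureTracker t {3, {}};                        // 3 replicas: majority is 2
        std::cout << t.mark_as_failed(1001, 10) << "\n"; // 0: one failure is tolerated
        std::cout << t.mark_as_failed(1001, 11) << "\n"; // 1: second replica fails, intolerable
    }
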
+VNodeChannel::VNodeChannel(VOlapTableSink* parent, IndexChannel* index_channel, int64_t node_id) + : _parent(parent), _index_channel(index_channel), _node_id(node_id) { + _node_channel_tracker = std::make_shared(fmt::format( + "NodeChannel:indexID={}:threadId={}", std::to_string(_index_channel->_index_id), + thread_context()->get_thread_id())); } VNodeChannel::~VNodeChannel() { @@ -55,7 +194,34 @@ void VNodeChannel::clear_all_blocks() { // no need to set _cancel_msg because the error will be // returned directly via "TabletSink::prepare()" method. Status VNodeChannel::init(RuntimeState* state) { - RETURN_IF_ERROR(NodeChannel::init(state)); + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + _tuple_desc = _parent->_output_tuple_desc; + _state = state; + auto node = _parent->_nodes_info->find_node(_node_id); + if (node == nullptr) { + _cancelled = true; + return Status::InternalError("unknown node id, id={}", _node_id); + } + + _node_info = *node; + + _load_info = "load_id=" + print_id(_parent->_load_id) + + ", txn_id=" + std::to_string(_parent->_txn_id); + + _row_desc.reset(new RowDescriptor(_tuple_desc, false)); + _batch_size = state->batch_size(); + + _stub = state->exec_env()->brpc_internal_client_cache()->get_client(_node_info.host, + _node_info.brpc_port); + if (_stub == nullptr) { + LOG(WARNING) << "Get rpc stub failed, host=" << _node_info.host + << ", port=" << _node_info.brpc_port << ", " << channel_info(); + _cancelled = true; + return Status::InternalError("get rpc stub failed"); + } + + _rpc_timeout_ms = state->query_options().query_timeout * 1000; + _timeout_watch.start(); _cur_mutable_block.reset(new vectorized::MutableBlock({_tuple_desc})); @@ -77,9 +243,67 @@ Status VNodeChannel::init(RuntimeState* state) { return Status::OK(); } +void VNodeChannel::open() { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + PTabletWriterOpenRequest request; + request.set_allocated_id(&_parent->_load_id); + request.set_index_id(_index_channel->_index_id); + request.set_txn_id(_parent->_txn_id); + request.set_allocated_schema(_parent->_schema->to_protobuf()); + for (auto& tablet : _all_tablets) { + auto ptablet = request.add_tablets(); + ptablet->set_partition_id(tablet.partition_id); + ptablet->set_tablet_id(tablet.tablet_id); + } + request.set_num_senders(_parent->_num_senders); + request.set_need_gen_rollup(false); // Useless but it is a required field in pb + request.set_load_mem_limit(_parent->_load_mem_limit); + request.set_load_channel_timeout_s(_parent->_load_channel_timeout_s); + request.set_is_high_priority(_parent->_is_high_priority); + request.set_sender_ip(BackendOptions::get_localhost()); + request.set_is_vectorized(true); + + _open_closure = new RefCountClosure(); + _open_closure->ref(); + + // This ref is for RPC's reference + _open_closure->ref(); + _open_closure->cntl.set_timeout_ms(config::tablet_writer_open_rpc_timeout_sec * 1000); + if (config::tablet_writer_ignore_eovercrowded) { + _open_closure->cntl.ignore_eovercrowded(); + } + _stub->tablet_writer_open(&_open_closure->cntl, &request, &_open_closure->result, + _open_closure); + request.release_id(); + request.release_schema(); +} + Status VNodeChannel::open_wait() { - Status status = NodeChannel::open_wait(); + _open_closure->join(); + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + if (_open_closure->cntl.Failed()) { + if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( + _stub, _node_info.host, _node_info.brpc_port)) { + 
ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( + _open_closure->cntl.remote_side()); + } + std::stringstream ss; + ss << "failed to open tablet writer, error=" << berror(_open_closure->cntl.ErrorCode()) + << ", error_text=" << _open_closure->cntl.ErrorText(); + _cancelled = true; + LOG(WARNING) << ss.str() << " " << channel_info(); + return Status::InternalError("failed to open tablet writer, error={}, error_text={}", + berror(_open_closure->cntl.ErrorCode()), + _open_closure->cntl.ErrorText()); + } + Status status(_open_closure->result.status()); + if (_open_closure->unref()) { + delete _open_closure; + } + _open_closure = nullptr; + if (!status.ok()) { + _cancelled = true; return status; } @@ -184,7 +408,7 @@ Status VNodeChannel::add_block(vectorized::Block* block, auto st = none_of({_cancelled, _eos_is_produced}); if (!st.ok()) { if (_cancelled) { - std::lock_guard l(_cancel_msg_lock); + std::lock_guard l(_cancel_msg_lock); return Status::InternalError("add row failed. {}", _cancel_msg); } else { return std::move(st.prepend("already stopped, can't add row. cancelled/eos: ")); @@ -257,6 +481,33 @@ int VNodeChannel::try_send_and_fetch_status(RuntimeState* state, return _send_finished ? 0 : 1; } +void VNodeChannel::_cancel_with_msg(const std::string& msg) { + LOG(WARNING) << "cancel node channel " << channel_info() << ", error message: " << msg; + { + std::lock_guard l(_cancel_msg_lock); + if (_cancel_msg == "") { + _cancel_msg = msg; + } + } + _cancelled = true; +} + +Status VNodeChannel::none_of(std::initializer_list vars) { + bool none = std::none_of(vars.begin(), vars.end(), [](bool var) { return var; }); + Status st = Status::OK(); + if (!none) { + std::string vars_str; + std::for_each(vars.begin(), vars.end(), + [&vars_str](bool var) -> void { vars_str += (var ? "1/" : "0/"); }); + if (!vars_str.empty()) { + vars_str.pop_back(); // 0/1/0/ -> 0/1/0 + } + st = Status::InternalError(vars_str); + } + + return st; +} + void VNodeChannel::try_send_block(RuntimeState* state) { SCOPED_ATTACH_TASK(state); SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker); @@ -384,6 +635,84 @@ void VNodeChannel::try_send_block(RuntimeState* state) { _next_packet_seq++; } +void VNodeChannel::cancel(const std::string& cancel_msg) { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + // set _is_closed to true finally + Defer set_closed {[&]() { + std::lock_guard l(_closed_lock); + _is_closed = true; + }}; + // we don't need to wait last rpc finished, cause closure's release/reset will join. + // But do we need brpc::StartCancel(call_id)? 
+ _cancel_with_msg(cancel_msg); + + PTabletWriterCancelRequest request; + request.set_allocated_id(&_parent->_load_id); + request.set_index_id(_index_channel->_index_id); + request.set_sender_id(_parent->_sender_id); + + auto closure = new RefCountClosure(); + + closure->ref(); + int remain_ms = _rpc_timeout_ms - _timeout_watch.elapsed_time() / NANOS_PER_MILLIS; + if (UNLIKELY(remain_ms < config::min_load_rpc_timeout_ms)) { + remain_ms = config::min_load_rpc_timeout_ms; + } + closure->cntl.set_timeout_ms(remain_ms); + if (config::tablet_writer_ignore_eovercrowded) { + closure->cntl.ignore_eovercrowded(); + } + _stub->tablet_writer_cancel(&closure->cntl, &request, &closure->result, closure); + request.release_id(); +} + +Status VNodeChannel::close_wait(RuntimeState* state) { + SCOPED_CONSUME_MEM_TRACKER(_node_channel_tracker.get()); + // set _is_closed to true finally + Defer set_closed {[&]() { + std::lock_guard l(_closed_lock); + _is_closed = true; + }}; + + auto st = none_of({_cancelled, !_eos_is_produced}); + if (!st.ok()) { + if (_cancelled) { + std::lock_guard l(_cancel_msg_lock); + return Status::InternalError("wait close failed. {}", _cancel_msg); + } else { + return std::move( + st.prepend("already stopped, skip waiting for close. cancelled/!eos: ")); + } + } + + // waiting for finished, it may take a long time, so we couldn't set a timeout + while (!_add_batches_finished && !_cancelled) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + _close_time_ms = UnixMillis() - _close_time_ms; + + if (_add_batches_finished) { + _close_check(); + state->tablet_commit_infos().insert(state->tablet_commit_infos().end(), + std::make_move_iterator(_tablet_commit_infos.begin()), + std::make_move_iterator(_tablet_commit_infos.end())); + + _index_channel->set_error_tablet_in_state(state); + _index_channel->set_tablets_received_rows(_tablets_received_rows, _node_id); + return Status::OK(); + } + + std::stringstream ss; + ss << "close wait failed coz rpc error"; + { + std::lock_guard l(_cancel_msg_lock); + if (_cancel_msg != "") { + ss << ". " << _cancel_msg; + } + } + return Status::InternalError(ss.str()); +} + void VNodeChannel::_close_check() { std::lock_guard lg(_pending_batches_lock); CHECK(_pending_blocks.empty()) << name(); @@ -413,11 +742,14 @@ void VNodeChannel::mark_close() { VOlapTableSink::VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector& texprs, Status* status) - : OlapTableSink(pool, row_desc, texprs, status) { - _is_vectorized = true; + : _pool(pool), + _input_row_desc(row_desc), + _filter_bitmap(1024), + _stop_background_threads_latch(1) { // From the thrift expressions create the real exprs. - vectorized::VExpr::create_expr_trees(pool, texprs, &_output_vexpr_ctxs); + *status = vectorized::VExpr::create_expr_trees(pool, texprs, &_output_vexpr_ctxs); _name = "VOlapTableSink"; + _transfer_large_data_by_brpc = config::transfer_large_data_by_brpc; } VOlapTableSink::~VOlapTableSink() { @@ -426,18 +758,150 @@ VOlapTableSink::~VOlapTableSink() { // But their destructions are after OlapTableSink's. 
for (const auto& index_channel : _channels) { index_channel->for_each_node_channel( - [](const std::shared_ptr& ch) { ch->clear_all_blocks(); }); + [](const std::shared_ptr& ch) { ch->clear_all_blocks(); }); } } -Status VOlapTableSink::init(const TDataSink& sink) { - RETURN_IF_ERROR(OlapTableSink::init(sink)); - _vpartition = _pool->add(new VOlapTablePartitionParam(_schema, sink.olap_table_sink.partition)); +Status VOlapTableSink::init(const TDataSink& t_sink) { + DCHECK(t_sink.__isset.olap_table_sink); + auto& table_sink = t_sink.olap_table_sink; + _load_id.set_hi(table_sink.load_id.hi); + _load_id.set_lo(table_sink.load_id.lo); + _txn_id = table_sink.txn_id; + _num_replicas = table_sink.num_replicas; + _tuple_desc_id = table_sink.tuple_id; + _schema.reset(new OlapTableSchemaParam()); + RETURN_IF_ERROR(_schema->init(table_sink.schema)); + _partition = _pool->add(new OlapTablePartitionParam(_schema, table_sink.partition)); + RETURN_IF_ERROR(_partition->init()); + _location = _pool->add(new OlapTableLocationParam(table_sink.location)); + _nodes_info = _pool->add(new DorisNodesInfo(table_sink.nodes_info)); + if (table_sink.__isset.write_single_replica && table_sink.write_single_replica) { + _write_single_replica = true; + _slave_location = _pool->add(new OlapTableLocationParam(table_sink.slave_location)); + if (!config::enable_single_replica_load) { + return Status::InternalError("single replica load is disabled on BE."); + } + } + + if (table_sink.__isset.load_channel_timeout_s) { + _load_channel_timeout_s = table_sink.load_channel_timeout_s; + } else { + _load_channel_timeout_s = config::streaming_load_rpc_max_alive_time_sec; + } + if (table_sink.__isset.send_batch_parallelism && table_sink.send_batch_parallelism > 1) { + _send_batch_parallelism = table_sink.send_batch_parallelism; + } + // if distributed column list is empty, we can ensure that tablet is with random distribution info + // and if load_to_single_tablet is set and set to true, we should find only one tablet in one partition + // for the whole olap table sink + if (table_sink.partition.distributed_columns.empty()) { + if (table_sink.__isset.load_to_single_tablet && table_sink.load_to_single_tablet) { + findTabletMode = FindTabletMode::FIND_TABLET_EVERY_SINK; + } else { + findTabletMode = FindTabletMode::FIND_TABLET_EVERY_BATCH; + } + } + _vpartition = _pool->add( + new doris::VOlapTablePartitionParam(_schema, t_sink.olap_table_sink.partition)); return _vpartition->init(); } Status VOlapTableSink::prepare(RuntimeState* state) { - RETURN_IF_ERROR(OlapTableSink::prepare(state)); + RETURN_IF_ERROR(DataSink::prepare(state)); + + _sender_id = state->per_fragment_instance_idx(); + _num_senders = state->num_per_fragment_instances(); + _is_high_priority = (state->query_options().query_timeout <= + config::load_task_high_priority_threshold_second); + + // profile must add to state's object pool + _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); + _mem_tracker = + std::make_shared("OlapTableSink:" + std::to_string(state->load_job_id())); + SCOPED_TIMER(_profile->total_time_counter()); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); + + // get table's tuple descriptor + _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); + if (_output_tuple_desc == nullptr) { + LOG(WARNING) << "unknown destination tuple descriptor, id=" << _tuple_desc_id; + return Status::InternalError("unknown destination tuple descriptor"); + } + + _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, 
false)); + + _max_decimalv2_val.resize(_output_tuple_desc->slots().size()); + _min_decimalv2_val.resize(_output_tuple_desc->slots().size()); + // check if need validate batch + for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { + auto slot = _output_tuple_desc->slots()[i]; + switch (slot->type().type) { + // For DECIMAL32,DECIMAL64,DECIMAL128, we have done precision and scale conversion so just + // skip data validation here. + case TYPE_DECIMALV2: + _max_decimalv2_val[i].to_max_decimal(slot->type().precision, slot->type().scale); + _min_decimalv2_val[i].to_min_decimal(slot->type().precision, slot->type().scale); + _need_validate_data = true; + break; + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_DATE: + case TYPE_DATETIME: + case TYPE_DATEV2: + case TYPE_DATETIMEV2: + case TYPE_HLL: + case TYPE_OBJECT: + case TYPE_STRING: + case TYPE_ARRAY: + _need_validate_data = true; + break; + default: + break; + } + } + + // add all counter + _input_rows_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT); + _output_rows_counter = ADD_COUNTER(_profile, "RowsReturned", TUnit::UNIT); + _filtered_rows_counter = ADD_COUNTER(_profile, "RowsFiltered", TUnit::UNIT); + _send_data_timer = ADD_TIMER(_profile, "SendDataTime"); + _wait_mem_limit_timer = ADD_CHILD_TIMER(_profile, "WaitMemLimitTime", "SendDataTime"); + _validate_data_timer = ADD_TIMER(_profile, "ValidateDataTime"); + _open_timer = ADD_TIMER(_profile, "OpenTime"); + _close_timer = ADD_TIMER(_profile, "CloseWaitTime"); + _non_blocking_send_timer = ADD_TIMER(_profile, "NonBlockingSendTime"); + _non_blocking_send_work_timer = + ADD_CHILD_TIMER(_profile, "NonBlockingSendWorkTime", "NonBlockingSendTime"); + _serialize_batch_timer = + ADD_CHILD_TIMER(_profile, "SerializeBatchTime", "NonBlockingSendWorkTime"); + _total_add_batch_exec_timer = ADD_TIMER(_profile, "TotalAddBatchExecTime"); + _max_add_batch_exec_timer = ADD_TIMER(_profile, "MaxAddBatchExecTime"); + _add_batch_number = ADD_COUNTER(_profile, "NumberBatchAdded", TUnit::UNIT); + _num_node_channels = ADD_COUNTER(_profile, "NumberNodeChannels", TUnit::UNIT); + _load_mem_limit = state->get_load_mem_limit(); + + // open all channels + const auto& partitions = _partition->get_partitions(); + for (int i = 0; i < _schema->indexes().size(); ++i) { + // collect all tablets belong to this rollup + std::vector tablets; + auto index = _schema->indexes()[i]; + for (const auto& part : partitions) { + for (const auto& tablet : part->indexes[i].tablets) { + TTabletWithPartition tablet_with_partition; + tablet_with_partition.partition_id = part->id; + tablet_with_partition.tablet_id = tablet; + tablets.emplace_back(std::move(tablet_with_partition)); + } + } + if (UNLIKELY(tablets.empty())) { + LOG(WARNING) << "load job:" << state->load_job_id() << " index: " << index->index_id + << " would open 0 tablet"; + } + _channels.emplace_back(new IndexChannel(this, index->index_id)); + RETURN_IF_ERROR(_channels.back()->init(state, tablets)); + } // Prepare the exprs to run. RETURN_IF_ERROR(vectorized::VExpr::prepare(_output_vexpr_ctxs, state, _input_row_desc)); return Status::OK(); @@ -447,7 +911,65 @@ Status VOlapTableSink::open(RuntimeState* state) { START_AND_SCOPE_SPAN(state->get_tracer(), span, "VOlapTableSink::open"); // Prepare the exprs to run. 
RETURN_IF_ERROR(vectorized::VExpr::open(_output_vexpr_ctxs, state)); - return OlapTableSink::open(state); + SCOPED_TIMER(_profile->total_time_counter()); + SCOPED_TIMER(_open_timer); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); + + for (auto index_channel : _channels) { + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->open(); }); + } + + for (auto index_channel : _channels) { + index_channel->for_each_node_channel([&index_channel]( + const std::shared_ptr& ch) { + auto st = ch->open_wait(); + if (!st.ok()) { + // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. + // This phase will not fail due to a single tablet. + // Therefore, if the open() phase fails, all tablets corresponding to the node need to be marked as failed. + index_channel->mark_as_failed( + ch->node_id(), ch->host(), + fmt::format("{}, open failed, err: {}", ch->channel_info(), st.to_string()), + -1); + } + }); + + RETURN_IF_ERROR(index_channel->check_intolerable_failure()); + } + int32_t send_batch_parallelism = + MIN(_send_batch_parallelism, config::max_send_batch_parallelism_per_job); + _send_batch_thread_pool_token = state->exec_env()->send_batch_thread_pool()->new_token( + ThreadPool::ExecutionMode::CONCURRENT, send_batch_parallelism); + RETURN_IF_ERROR(Thread::create( + "OlapTableSink", "send_batch_process", + [this, state]() { this->_send_batch_process(state); }, &_sender_thread)); + + return Status::OK(); +} + +void VOlapTableSink::_send_batch_process(RuntimeState* state) { + SCOPED_TIMER(_non_blocking_send_timer); + SCOPED_ATTACH_TASK(state); + SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); + do { + int running_channels_num = 0; + for (auto index_channel : _channels) { + index_channel->for_each_node_channel([&running_channels_num, this, + state](const std::shared_ptr& ch) { + running_channels_num += + ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); + }); + } + + if (running_channels_num == 0) { + LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " + "sender thread exit. 
" + << print_id(_load_id); + return; + } + } while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::olap_table_sink_send_interval_ms))); } size_t VOlapTableSink::get_pending_bytes() const { @@ -552,81 +1074,51 @@ Status VOlapTableSink::send(RuntimeState* state, vectorized::Block* input_block, _partition_to_tablet_map.clear(); } - bool use_vec = _is_vectorized && state->be_exec_version() > 0; - if (use_vec) { - std::vector, std::vector>>> - channel_to_payload; - channel_to_payload.resize(_channels.size()); - for (int i = 0; i < num_rows; ++i) { - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const VOlapTablePartition* partition = nullptr; - uint32_t tablet_index = 0; - bool is_continue = false; - RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, - is_continue)); - if (is_continue) { - continue; - } - for (int j = 0; j < partition->indexes.size(); ++j) { - auto tid = partition->indexes[j].tablets[tablet_index]; - auto it = _channels[j]->_channels_by_tablet.find(tid); - DCHECK(it != _channels[j]->_channels_by_tablet.end()) - << "unknown tablet, tablet_id=" << tablet_index; - for (const auto& channel : it->second) { - if (channel_to_payload[j].count(channel.get()) < 1) { - channel_to_payload[j].insert( - {channel.get(), - std::pair, - std::vector> { - std::unique_ptr( - new vectorized::IColumn::Selector()), - std::vector()}}); - } - channel_to_payload[j][channel.get()].first->push_back(i); - channel_to_payload[j][channel.get()].second.push_back(tid); + std::vector, std::vector>>> + channel_to_payload; + channel_to_payload.resize(_channels.size()); + for (int i = 0; i < num_rows; ++i) { + if (filtered_rows > 0 && _filter_bitmap.Get(i)) { + continue; + } + const VOlapTablePartition* partition = nullptr; + uint32_t tablet_index = 0; + bool is_continue = false; + RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, + is_continue)); + if (is_continue) { + continue; + } + for (int j = 0; j < partition->indexes.size(); ++j) { + auto tid = partition->indexes[j].tablets[tablet_index]; + auto it = _channels[j]->_channels_by_tablet.find(tid); + DCHECK(it != _channels[j]->_channels_by_tablet.end()) + << "unknown tablet, tablet_id=" << tablet_index; + for (const auto& channel : it->second) { + if (channel_to_payload[j].count(channel.get()) < 1) { + channel_to_payload[j].insert( + {channel.get(), + std::pair, + std::vector> { + std::unique_ptr( + new vectorized::IColumn::Selector()), + std::vector()}}); } - _number_output_rows++; + channel_to_payload[j][channel.get()].first->push_back(i); + channel_to_payload[j][channel.get()].second.push_back(tid); } + _number_output_rows++; } - for (size_t i = 0; i < _channels.size(); i++) { - for (const auto& entry : channel_to_payload[i]) { - // if this node channel is already failed, this add_row will be skipped - auto st = entry.first->add_block(&block, entry.second); - if (!st.ok()) { - _channels[i]->mark_as_failed(entry.first->node_id(), entry.first->host(), - st.to_string()); - } - } - } - } else { - size_t MAX_PENDING_BYTES = _load_mem_limit / 3; - while (get_pending_bytes() > MAX_PENDING_BYTES && !state->is_cancelled()) { - std::this_thread::sleep_for(std::chrono::microseconds(100)); - } - - for (int i = 0; i < num_rows; ++i) { - if (filtered_rows > 0 && _filter_bitmap.Get(i)) { - continue; - } - const VOlapTablePartition* partition = nullptr; - uint32_t tablet_index = 0; - BlockRow block_row; - block_row = {&block, i}; - bool is_continue 
= false; - RETURN_IF_ERROR(find_tablet(state, &block, i, &partition, tablet_index, stop_processing, - is_continue)); - if (is_continue) { - continue; - } - - for (int j = 0; j < partition->indexes.size(); ++j) { - int64_t tablet_id = partition->indexes[j].tablets[tablet_index]; - _channels[j]->add_row(block_row, tablet_id); - _number_output_rows++; + } + for (size_t i = 0; i < _channels.size(); i++) { + for (const auto& entry : channel_to_payload[i]) { + // if this node channel is already failed, this add_row will be skipped + auto st = entry.first->add_block(&block, entry.second); + if (!st.ok()) { + _channels[i]->mark_as_failed(entry.first->node_id(), entry.first->host(), + st.to_string()); } } } @@ -642,7 +1134,124 @@ Status VOlapTableSink::close(RuntimeState* state, Status exec_status) { if (_closed) return _close_status; START_AND_SCOPE_SPAN(state->get_tracer(), span, "VOlapTableSink::close"); vectorized::VExpr::close(_output_vexpr_ctxs, state); - return OlapTableSink::close(state, exec_status); + Status status = exec_status; + if (status.ok()) { + // only if status is ok can we call this _profile->total_time_counter(). + // if status is not ok, this sink may not be prepared, so that _profile is null + SCOPED_TIMER(_profile->total_time_counter()); + // BE id -> add_batch method counter + std::unordered_map node_add_batch_counter_map; + int64_t serialize_batch_ns = 0, mem_exceeded_block_ns = 0, queue_push_lock_ns = 0, + actual_consume_ns = 0, total_add_batch_exec_time_ns = 0, + max_add_batch_exec_time_ns = 0, total_add_batch_num = 0, num_node_channels = 0; + { + SCOPED_TIMER(_close_timer); + for (auto index_channel : _channels) { + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->mark_close(); }); + num_node_channels += index_channel->num_node_channels(); + } + + for (auto index_channel : _channels) { + int64_t add_batch_exec_time = 0; + index_channel->for_each_node_channel( + [&index_channel, &state, &node_add_batch_counter_map, &serialize_batch_ns, + &mem_exceeded_block_ns, &queue_push_lock_ns, &actual_consume_ns, + &total_add_batch_exec_time_ns, &add_batch_exec_time, + &total_add_batch_num](const std::shared_ptr& ch) { + auto s = ch->close_wait(state); + if (!s.ok()) { + auto err_msg = s.to_string(); + index_channel->mark_as_failed(ch->node_id(), ch->host(), err_msg, + -1); + // cancel the node channel in best effort + ch->cancel(err_msg); + LOG(WARNING) << ch->channel_info() + << ", close channel failed, err: " << err_msg; + } + ch->time_report(&node_add_batch_counter_map, &serialize_batch_ns, + &mem_exceeded_block_ns, &queue_push_lock_ns, + &actual_consume_ns, &total_add_batch_exec_time_ns, + &add_batch_exec_time, &total_add_batch_num); + }); + + if (add_batch_exec_time > max_add_batch_exec_time_ns) { + max_add_batch_exec_time_ns = add_batch_exec_time; + } + + // check if index has intolerable failure + Status index_st = index_channel->check_intolerable_failure(); + if (!index_st.ok()) { + status = index_st; + } else if (Status st = index_channel->check_tablet_received_rows_consistency(); + !st.ok()) { + status = st; + } + } // end for index channels + } + // TODO need to be improved + LOG(INFO) << "total mem_exceeded_block_ns=" << mem_exceeded_block_ns + << ", total queue_push_lock_ns=" << queue_push_lock_ns + << ", total actual_consume_ns=" << actual_consume_ns + << ", load id=" << print_id(_load_id); + + COUNTER_SET(_input_rows_counter, _number_input_rows); + COUNTER_SET(_output_rows_counter, _number_output_rows); + COUNTER_SET(_filtered_rows_counter, 
_number_filtered_rows); + COUNTER_SET(_send_data_timer, _send_data_ns); + COUNTER_SET(_wait_mem_limit_timer, mem_exceeded_block_ns); + COUNTER_SET(_validate_data_timer, _validate_data_ns); + COUNTER_SET(_serialize_batch_timer, serialize_batch_ns); + COUNTER_SET(_non_blocking_send_work_timer, actual_consume_ns); + COUNTER_SET(_total_add_batch_exec_timer, total_add_batch_exec_time_ns); + COUNTER_SET(_max_add_batch_exec_timer, max_add_batch_exec_time_ns); + COUNTER_SET(_add_batch_number, total_add_batch_num); + COUNTER_SET(_num_node_channels, num_node_channels); + // _number_input_rows don't contain num_rows_load_filtered and num_rows_load_unselected in scan node + int64_t num_rows_load_total = _number_input_rows + state->num_rows_load_filtered() + + state->num_rows_load_unselected(); + state->set_num_rows_load_total(num_rows_load_total); + state->update_num_rows_load_filtered(_number_filtered_rows); + + // print log of add batch time of all node, for tracing load performance easily + std::stringstream ss; + ss << "finished to close olap table sink. load_id=" << print_id(_load_id) + << ", txn_id=" << _txn_id + << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; + for (auto const& pair : node_add_batch_counter_map) { + ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) + << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" + << pair.second.close_wait_time_ms << ")(" << pair.second.add_batch_num << ")} "; + } + LOG(INFO) << ss.str(); + } else { + for (auto channel : _channels) { + channel->for_each_node_channel([&status](const std::shared_ptr& ch) { + ch->cancel(status.to_string()); + }); + } + LOG(INFO) << "finished to close olap table sink. load_id=" << print_id(_load_id) + << ", txn_id=" << _txn_id + << ", canceled all node channels due to error: " << status; + } + + // Sender join() must put after node channels mark_close/cancel. + // But there is no specific sequence required between sender join() & close_wait(). + _stop_background_threads_latch.count_down(); + if (_sender_thread) { + _sender_thread->join(); + // We have to wait all task in _send_batch_thread_pool_token finished, + // because it is difficult to handle concurrent problem if we just + // shutdown it. + _send_batch_thread_pool_token->wait(); + } + + Expr::close(_output_expr_ctxs, state); + _output_batch.reset(); + + _close_status = status; + DataSink::close(state, exec_status); + return status; } Status VOlapTableSink::_validate_column(RuntimeState* state, const TypeDescriptor& type, diff --git a/be/src/vec/sink/vtablet_sink.h b/be/src/vec/sink/vtablet_sink.h index 8f7697f409..097bb1e6b7 100644 --- a/be/src/vec/sink/vtablet_sink.h +++ b/be/src/vec/sink/vtablet_sink.h @@ -16,10 +16,31 @@ // under the License. 
 #pragma once
 
+#include
-#include "exec/tablet_sink.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/object_pool.h"
+#include "common/status.h"
+#include "exec/data_sink.h"
+#include "exec/tablet_info.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/internal_service.pb.h"
 #include "runtime/row_batch.h"
+#include "runtime/thread_context.h"
+#include "util/bitmap.h"
+#include "util/countdown_latch.h"
+#include "util/ref_count_closure.h"
+#include "util/spinlock.h"
+#include "util/thread.h"
 #include "vec/columns/column.h"
+#include "vec/core/block.h"
 
 namespace doris {
 
@@ -29,36 +50,265 @@ class VExprContext;
 
 namespace stream_load {
 
-class VNodeChannel : public NodeChannel {
+// The counters of the add_batch rpc of a single node
+struct AddBatchCounter {
+    // total execution time of an add_batch rpc
+    int64_t add_batch_execution_time_us = 0;
+    // lock waiting time in an add_batch rpc
+    int64_t add_batch_wait_execution_time_us = 0;
+    // number of add_batch calls
+    int64_t add_batch_num = 0;
+    // time passed between close being marked and close finishing
+    int64_t close_wait_time_ms = 0;
+
+    AddBatchCounter& operator+=(const AddBatchCounter& rhs) {
+        add_batch_execution_time_us += rhs.add_batch_execution_time_us;
+        add_batch_wait_execution_time_us += rhs.add_batch_wait_execution_time_us;
+        add_batch_num += rhs.add_batch_num;
+        close_wait_time_ms += rhs.close_wait_time_ms;
+        return *this;
+    }
+    friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) {
+        AddBatchCounter sum = lhs;
+        sum += rhs;
+        return sum;
+    }
+};
+
+// It's very error-prone to guarantee the destruction order of the handlers' captured vars
+// and of this closure. So using create() to get the closure pointer is recommended; we can
+// then delete the closure pointer before the captured vars are destroyed.
+// Deleting this pointer is safe: the RPC callback will not run after the ReusableClosure
+// has been deleted. (A standalone sketch of this lifecycle is appended at the end of this patch.)
+template <typename T>
+class ReusableClosure final : public google::protobuf::Closure {
 public:
-    VNodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id);
+    ReusableClosure() : cid(INVALID_BTHREAD_ID) {}
+    ~ReusableClosure() override {
+        // don't delete while Run() is executing or about to be called; wait for the current Run() to finish.
+        join();
+        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker());
+        cntl.Reset();
+    }
 
-    ~VNodeChannel() override;
+    static ReusableClosure* create() { return new ReusableClosure(); }
 
-    Status init(RuntimeState* state) override;
+    void addFailedHandler(const std::function<void(bool)>& fn) { failed_handler = fn; }
+    void addSuccessHandler(const std::function<void(const T&, bool)>& fn) { success_handler = fn; }
 
-    Status open_wait() override;
+    void join() {
+        // We rely on _packet_in_flight to assure that one rpc is running,
+        // while cid is not reliable due to memory ordering:
+        // _packet_in_flight is written before the call id is fetched,
+        // so we can not use a memory fence to synchronize.
+        while (_packet_in_flight) {
+            // cid here is complicated
+            if (cid != INVALID_BTHREAD_ID) {
+                // actually cid may be the last rpc call id.
+ brpc::Join(cid); + } + if (_packet_in_flight) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } + } + + // plz follow this order: reset() -> set_in_flight() -> send brpc batch + void reset() { + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); + cntl.Reset(); + cid = cntl.call_id(); + } + + bool try_set_in_flight() { + bool value = false; + return _packet_in_flight.compare_exchange_strong(value, true); + } + + void clear_in_flight() { _packet_in_flight = false; } + + bool is_packet_in_flight() { return _packet_in_flight; } + + void end_mark() { + DCHECK(_is_last_rpc == false); + _is_last_rpc = true; + } + + void Run() override { + DCHECK(_packet_in_flight); + if (cntl.Failed()) { + LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl.ErrorCode()) + << ", error_text=" << cntl.ErrorText(); + failed_handler(_is_last_rpc); + } else { + success_handler(result, _is_last_rpc); + } + clear_in_flight(); + } + + brpc::Controller cntl; + T result; + +private: + brpc::CallId cid; + std::atomic _packet_in_flight {false}; + std::atomic _is_last_rpc {false}; + std::function failed_handler; + std::function success_handler; +}; + +class IndexChannel; +class VOlapTableSink; + +class VNodeChannel { +public: + VNodeChannel(VOlapTableSink* parent, IndexChannel* index_channel, int64_t node_id); + + ~VNodeChannel(); + + // called before open, used to add tablet located in this backend + void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); } + + void add_slave_tablet_nodes(int64_t tablet_id, const std::vector& slave_nodes) { + _slave_tablet_nodes[tablet_id] = slave_nodes; + } + + void open(); + + Status init(RuntimeState* state); + + Status open_wait(); Status add_block(vectorized::Block* block, const std::pair, - std::vector>& payload) override; + std::vector>& payload); int try_send_and_fetch_status(RuntimeState* state, - std::unique_ptr& thread_pool_token) override; + std::unique_ptr& thread_pool_token); void try_send_block(RuntimeState* state); - void clear_all_blocks() override; + void clear_all_blocks(); // two ways to stop channel: // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. // 2. just cancel() - void mark_close() override; + void mark_close(); + + // two ways to stop channel: + // 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response. + // 2. 
just cancel() + Status close_wait(RuntimeState* state); + + void cancel(const std::string& cancel_msg); + + void time_report(std::unordered_map* add_batch_counter_map, + int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns, + int64_t* queue_push_lock_ns, int64_t* actual_consume_ns, + int64_t* total_add_batch_exec_time_ns, int64_t* add_batch_exec_time_ns, + int64_t* total_add_batch_num) const { + (*add_batch_counter_map)[_node_id] += _add_batch_counter; + (*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms; + *serialize_batch_ns += _serialize_batch_ns; + *mem_exceeded_block_ns += _mem_exceeded_block_ns; + *queue_push_lock_ns += _queue_push_lock_ns; + *actual_consume_ns += _actual_consume_ns; + *add_batch_exec_time_ns = (_add_batch_counter.add_batch_execution_time_us * 1000); + *total_add_batch_exec_time_ns += *add_batch_exec_time_ns; + *total_add_batch_num += _add_batch_counter.add_batch_num; + } + + int64_t node_id() const { return _node_id; } + std::string host() const { return _node_info.host; } + std::string name() const { return _name; } + + Status none_of(std::initializer_list vars); + + std::string channel_info() const { + return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host, + _node_info.brpc_port); + } + + size_t get_pending_bytes() { return _pending_batches_bytes; } protected: - void _close_check() override; + void _close_check(); + void _cancel_with_msg(const std::string& msg); + + VOlapTableSink* _parent = nullptr; + IndexChannel* _index_channel = nullptr; + int64_t _node_id = -1; + std::string _load_info; + std::string _name; + + std::shared_ptr _node_channel_tracker; + + TupleDescriptor* _tuple_desc = nullptr; + NodeInfo _node_info; + + // this should be set in init() using config + int _rpc_timeout_ms = 60000; + int64_t _next_packet_seq = 0; + MonotonicStopWatch _timeout_watch; + + // the timestamp when this node channel be marked closed and finished closed + uint64_t _close_time_ms = 0; + + // user cancel or get some errors + std::atomic _cancelled {false}; + doris::SpinLock _cancel_msg_lock; + std::string _cancel_msg; + + // send finished means the consumer thread which send the rpc can exit + std::atomic _send_finished {false}; + + // add batches finished means the last rpc has be response, used to check whether this channel can be closed + std::atomic _add_batches_finished {false}; // reuse for vectorized + + bool _eos_is_produced {false}; // only for restricting producer behaviors + + std::unique_ptr _row_desc; + int _batch_size = 0; + + // limit _pending_batches size + std::atomic _pending_batches_bytes {0}; + size_t _max_pending_batches_bytes {(size_t)config::nodechannel_pending_queue_max_bytes}; + std::mutex _pending_batches_lock; // reuse for vectorized + std::atomic _pending_batches_num {0}; // reuse for vectorized + + std::shared_ptr _stub = nullptr; + RefCountClosure* _open_closure = nullptr; + + std::vector _all_tablets; + // map from tablet_id to node_id where slave replicas locate in + std::unordered_map> _slave_tablet_nodes; + std::vector _tablet_commit_infos; + + AddBatchCounter _add_batch_counter; + std::atomic _serialize_batch_ns {0}; + std::atomic _mem_exceeded_block_ns {0}; + std::atomic _queue_push_lock_ns {0}; + std::atomic _actual_consume_ns {0}; + + // lock to protect _is_closed. + // The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel. + // However, this rpc callback may occur after the whole task is finished (e.g. 
due to network latency),
+    // and by that time the IndexChannel may have been destructed, so we should not call the
+    // IndexChannel methods anymore, otherwise the BE will crash.
+    // Therefore, we use _is_closed and _closed_lock to ensure that the RPC callback
+    // function will not call any IndexChannel method after the NodeChannel is closed.
+    // The IndexChannel is definitely accessible until the NodeChannel is closed.
+    std::mutex _closed_lock;
+    bool _is_closed = false;
+
+    RuntimeState* _state;
+    // rows number received per tablet, tablet_id -> rows_num
+    std::vector<std::pair<int64_t, int64_t>> _tablets_received_rows;
+
+    std::unique_ptr<RowBatch> _cur_batch;
+    PTabletWriterAddBatchRequest _cur_add_batch_request;
+    using AddBatchReq = std::pair<std::unique_ptr<RowBatch>, PTabletWriterAddBatchRequest>;
+    std::queue<AddBatchReq> _pending_batches;
+    ReusableClosure<PTabletWriterAddBatchResult>* _add_batch_closure = nullptr;
 
-private:
     std::unique_ptr<vectorized::MutableBlock> _cur_mutable_block;
     PTabletWriterAddBlockRequest _cur_add_block_request;
@@ -68,13 +318,84 @@ private:
     ReusableClosure<PTabletWriterAddBlockResult>* _add_block_closure = nullptr;
 };
 
-class OlapTableSink;
+class IndexChannel {
+public:
+    IndexChannel(VOlapTableSink* parent, int64_t index_id) : _parent(parent), _index_id(index_id) {
+        _index_channel_tracker =
+                std::make_unique<MemTracker>("IndexChannel:indexID=" + std::to_string(_index_id));
+    }
+    ~IndexChannel() = default;
+
+    Status init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets);
+
+    void for_each_node_channel(
+            const std::function<void(const std::shared_ptr<VNodeChannel>&)>& func) {
+        for (auto& it : _node_channels) {
+            func(it.second);
+        }
+    }
+
+    void mark_as_failed(int64_t node_id, const std::string& host, const std::string& err,
+                        int64_t tablet_id = -1);
+    Status check_intolerable_failure();
+
+    // set error tablet info in runtime state, so that it can be returned to FE.
+    void set_error_tablet_in_state(RuntimeState* state);
+
+    size_t num_node_channels() const { return _node_channels.size(); }
+
+    size_t get_pending_bytes() const {
+        size_t mem_consumption = 0;
+        for (auto& kv : _node_channels) {
+            mem_consumption += kv.second->get_pending_bytes();
+        }
+        return mem_consumption;
+    }
+
+    void set_tablets_received_rows(
+            const std::vector<std::pair<int64_t, int64_t>>& tablets_received_rows,
+            int64_t node_id);
+
+    // check whether the rows num written by different replicas is consistent
+    // (a standalone sketch of this check is appended at the end of this patch)
+    Status check_tablet_received_rows_consistency();
+
+private:
+    friend class VNodeChannel;
+    friend class VOlapTableSink;
+
+    VOlapTableSink* _parent;
+    int64_t _index_id;
+
+    // from backend channel to tablet_id
+    // ATTN: must be placed before `_node_channels` and `_channels_by_tablet`,
+    // because the destruction order of objects is opposite to the creation order.
+    // So the NodeChannels will be destructed first.
+    // And the destructor of a NodeChannel waits for all RPCs to finish.
+    // This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC.
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
+    // BeId -> channel
+    std::unordered_map<int64_t, std::shared_ptr<VNodeChannel>> _node_channels;
+    // from tablet_id to backend channel
+    std::unordered_map<int64_t, std::vector<std::shared_ptr<VNodeChannel>>> _channels_by_tablet;
+
+    // lock to protect _failed_channels and _failed_channels_msgs
+    mutable doris::SpinLock _fail_lock;
+    // key is tablet_id, value is a set of failed node ids
+    std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
+    // key is tablet_id, value is error message
+    std::unordered_map<int64_t, std::string> _failed_channels_msgs;
+    Status _intolerable_failure_status = Status::OK();
+
+    std::unique_ptr<MemTracker> _index_channel_tracker;
+    // rows num received by DeltaWriter per tablet, tablet_id -> (node_id, rows_num),
+    // used to verify whether the rows num received by different replicas is consistent
+    std::map<int64_t, std::vector<std::pair<int64_t, int64_t>>> _tablets_received_rows;
+};
 
 // Write block data to Olap Table.
 // When OlapTableSink::open() is called, there will be a consumer thread running in the background.
 // When you call VOlapTableSink::send(), you will be the producer who produces pending batches.
 // Join the consumer thread in close().
-class VOlapTableSink : public OlapTableSink {
+class VOlapTableSink final : public DataSink {
 public:
     // Construct from thrift struct which is generated by FE.
     VOlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc,
@@ -89,14 +410,24 @@ public:
     Status open(RuntimeState* state) override;
 
     Status close(RuntimeState* state, Status close_status) override;
-    using OlapTableSink::send;
     Status send(RuntimeState* state, vectorized::Block* block, bool eos = false) override;
 
     size_t get_pending_bytes() const;
 
     const RowDescriptor& row_desc() { return _input_row_desc; }
 
+    // Returns the runtime profile for the sink.
+    RuntimeProfile* profile() override { return _profile; }
+
+    // the consumer func of sending pending batches in every NodeChannel.
+    // use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending.
+    // only focus on pending batches and channel status; the internal errors of NodeChannels
+    // will be handled by the producer.
+    void _send_batch_process(RuntimeState* state);
+
 private:
+    friend class VNodeChannel;
+    friend class IndexChannel;
+
     // make input data valid for OLAP table
     // return number of invalid/filtered rows.
     // invalid row number is set in Bitmap
@@ -116,6 +447,104 @@ private:
                        const VOlapTablePartition** partition, uint32_t& tablet_index,
                        bool& stop_processing, bool& is_continue);
 
+    std::shared_ptr<MemTracker> _mem_tracker;
+
+    ObjectPool* _pool;
+    const RowDescriptor& _input_row_desc;
+
+    // unique load id
+    PUniqueId _load_id;
+    int64_t _txn_id = -1;
+    int _num_replicas = -1;
+    int _tuple_desc_id = -1;
+
+    // this is the tuple descriptor of the destination OLAP table
+    TupleDescriptor* _output_tuple_desc = nullptr;
+    RowDescriptor* _output_row_desc = nullptr;
+
+    bool _need_validate_data = false;
+
+    // Number of senders used to insert into the OlapTable. If we only supported single-node
+    // insert, all data from the select would be collected and then sent to the OlapTable.
+    // To support multiple senders, we maintain a channel for each sender.
+ int _sender_id = -1; + int _num_senders = -1; + bool _is_high_priority = false; + + // TODO(zc): think about cache this data + std::shared_ptr _schema; + OlapTableLocationParam* _location = nullptr; + bool _write_single_replica = false; + OlapTableLocationParam* _slave_location = nullptr; + DorisNodesInfo* _nodes_info = nullptr; + + RuntimeProfile* _profile = nullptr; + + std::set _partition_ids; + // only used for partition with random distribution + std::map _partition_to_tablet_map; + + Bitmap _filter_bitmap; + + // index_channel + std::vector> _channels; + + CountDownLatch _stop_background_threads_latch; + scoped_refptr _sender_thread; + std::unique_ptr _send_batch_thread_pool_token; + + std::vector _max_decimalv2_val; + std::vector _min_decimalv2_val; + + // Stats for this + int64_t _validate_data_ns = 0; + int64_t _send_data_ns = 0; + int64_t _number_input_rows = 0; + int64_t _number_output_rows = 0; + int64_t _number_filtered_rows = 0; + + RuntimeProfile::Counter* _input_rows_counter = nullptr; + RuntimeProfile::Counter* _output_rows_counter = nullptr; + RuntimeProfile::Counter* _filtered_rows_counter = nullptr; + RuntimeProfile::Counter* _send_data_timer = nullptr; + RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr; + RuntimeProfile::Counter* _validate_data_timer = nullptr; + RuntimeProfile::Counter* _open_timer = nullptr; + RuntimeProfile::Counter* _close_timer = nullptr; + RuntimeProfile::Counter* _non_blocking_send_timer = nullptr; + RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr; + RuntimeProfile::Counter* _serialize_batch_timer = nullptr; + RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr; + RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr; + RuntimeProfile::Counter* _add_batch_number = nullptr; + RuntimeProfile::Counter* _num_node_channels = nullptr; + + // load mem limit is for remote load channel + int64_t _load_mem_limit = -1; + + // the timeout of load channels opened by this tablet sink. in second + int64_t _load_channel_timeout_s = 0; + + int32_t _send_batch_parallelism = 1; + // Save the status of close() method + Status _close_status; + + // User can change this config at runtime, avoid it being modified during query or loading process. 
+ bool _transfer_large_data_by_brpc = false; + + // FIND_TABLET_EVERY_ROW is used for both hash and random distribution info, which indicates that we + // should compute tablet index for every row + // FIND_TABLET_EVERY_BATCH is only used for random distribution info, which indicates that we should + // compute tablet index for every row batch + // FIND_TABLET_EVERY_SINK is only used for random distribution info, which indicates that we should + // only compute tablet index in the corresponding partition once for the whole time in olap table sink + enum FindTabletMode { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK }; + FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW; + + OlapTablePartitionParam* _partition = nullptr; + std::vector _output_expr_ctxs; + std::unique_ptr _output_batch; + VOlapTablePartitionParam* _vpartition = nullptr; std::vector _output_vexpr_ctxs; }; diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index eb710a5eb5..3a93ebd9c7 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -150,7 +150,6 @@ set(OLAP_TEST_FILES ) set(RUNTIME_TEST_FILES - # runtime/buffered_tuple_stream_test.cpp # runtime/buffer_control_block_test.cpp # runtime/result_buffer_mgr_test.cpp # runtime/result_sink_test.cpp @@ -163,8 +162,6 @@ set(RUNTIME_TEST_FILES # runtime/tmp_file_mgr_test.cpp # runtime/disk_io_mgr_test.cpp # runtime/thread_resource_mgr_test.cpp - # runtime/buffered_block_mgr2_test.cpp - # runtime/buffered_tuple_stream2_test.cpp # runtime/export_task_mgr_test.cpp runtime/mem_pool_test.cpp runtime/string_buffer_test.cpp diff --git a/be/test/runtime/buffered_block_mgr2_test.cpp b/be/test/runtime/buffered_block_mgr2_test.cpp deleted file mode 100644 index 09994c0c42..0000000000 --- a/be/test/runtime/buffered_block_mgr2_test.cpp +++ /dev/null @@ -1,1246 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "runtime/buffered_block_mgr2.h" - -#include -#include - -#include -#include -#include - -#include "runtime/disk_io_mgr.h" -#include "runtime/exec_env.h" -#include "runtime/runtime_state.h" -#include "runtime/test_env.h" -#include "runtime/tmp_file_mgr.h" -#include "util/cpu_info.h" -#include "util/disk_info.h" -#include "util/filesystem_util.h" -#include "util/monotime.h" -#include "util/thread_group.h" - -using std::filesystem::directory_iterator; -using std::filesystem::remove; -using std::unique_ptr; -using std::unordered_map; -using std::thread; - -using std::string; -using std::stringstream; -using std::vector; - -// Note: This is the default scratch dir created by doris. -// config::query_scratch_dirs + TmpFileMgr::_s_tmp_sub_dir_name. 
-const static string SCRATCH_DIR = "/tmp/doris-scratch"; - -// This suffix is appended to a tmp dir -const static string SCRATCH_SUFFIX = "/doris-scratch"; - -// Number of milliseconds to wait to ensure write completes -const static int WRITE_WAIT_MILLIS = 500; - -// How often to check for write completion -const static int WRITE_CHECK_INTERVAL_MILLIS = 10; - -namespace doris { - -class BufferedBlockMgrTest : public ::testing::Test { -protected: - const static int _block_size = 1024; - - virtual void SetUp() { _test_env.reset(new TestEnv()); } - - virtual void TearDown() { - TearDownMgrs(); - _test_env.reset(); - - // Tests modify permissions, so make sure we can delete if they didn't clean up. - for (int i = 0; i < _created_tmp_dirs.size(); ++i) { - chmod((_created_tmp_dirs[i] + SCRATCH_SUFFIX).c_str(), S_IRWXU); - } - FileSystemUtil::remove_paths(_created_tmp_dirs); - _created_tmp_dirs.clear(); - } - - // Reinitialize _test_env to have multiple temporary directories. - std::vector InitMultipleTmpDirs(int num_dirs) { - std::vector tmp_dirs; - for (int i = 0; i < num_dirs; ++i) { - std::stringstream dir_str; - dir_str << "/tmp/buffered-block-mgr-test." << i; - const string& dir = dir_str.str(); - // Fix permissions in case old directories were left from previous runs of test. - chmod((dir + SCRATCH_SUFFIX).c_str(), S_IRWXU); - EXPECT_TRUE(FileSystemUtil::create_directory(dir).ok()); - tmp_dirs.push_back(dir); - _created_tmp_dirs.push_back(dir); - } - _test_env->init_tmp_file_mgr(tmp_dirs, false); - EXPECT_EQ(num_dirs, _test_env->tmp_file_mgr()->num_active_tmp_devices()); - return tmp_dirs; - } - - static void ValidateBlock(BufferedBlockMgr2::Block* block, int32_t data) { - EXPECT_TRUE(block->valid_data_len() == sizeof(int32_t)); - EXPECT_TRUE(*reinterpret_cast(block->buffer()) == data); - } - - static int32_t* MakeRandomSizeData(BufferedBlockMgr2::Block* block) { - // Format is int32_t size, followed by size bytes of data - int32_t size = (rand() % 252) + 4; // So blocks have 4-256 bytes of data - uint8_t* data = block->allocate(size); - *(reinterpret_cast(data)) = size; - int i = 0; - for (i = 4; i < size - 5; ++i) { - data[i] = i; - } - for (; i < size; ++i) { // End marker of at least 5 0xff's - data[i] = 0xff; - } - return reinterpret_cast(data); // Really returns a pointer to size - } - - static void ValidateRandomSizeData(BufferedBlockMgr2::Block* block, int32_t size) { - int32_t bsize = *(reinterpret_cast(block->buffer())); - uint8_t* data = reinterpret_cast(block->buffer()); - int i = 0; - EXPECT_EQ(block->valid_data_len(), size); - EXPECT_EQ(size, bsize); - for (i = 4; i < size - 5; ++i) { - EXPECT_EQ(data[i], i); - } - for (; i < size; ++i) { - EXPECT_EQ(data[i], 0xff); - } - } - - /// Helper to create a simple block manager. 
- BufferedBlockMgr2* CreateMgr(int64_t query_id, int max_buffers, int block_size, - RuntimeState** query_state = nullptr) { - RuntimeState* state = nullptr; - EXPECT_TRUE(_test_env->create_query_state(query_id, max_buffers, block_size, &state).ok()); - if (query_state != nullptr) { - *query_state = state; - } - return state->block_mgr2(); - } - - BufferedBlockMgr2* CreateMgrAndClient(int64_t query_id, int max_buffers, int block_size, - int reserved_blocks, BufferedBlockMgr2::Client** client) { - RuntimeState* state = nullptr; - BufferedBlockMgr2* mgr = CreateMgr(query_id, max_buffers, block_size, &state); - EXPECT_TRUE(mgr->register_client(reserved_blocks, state, client).ok()); - EXPECT_TRUE(client != nullptr); - return mgr; - } - - void CreateMgrsAndClients(int64_t start_query_id, int num_mgrs, int buffers_per_mgr, - int block_size, int reserved_blocks_per_client, - std::vector* mgrs, - std::vector* clients) { - for (int i = 0; i < num_mgrs; ++i) { - BufferedBlockMgr2::Client* client; - BufferedBlockMgr2* mgr = - CreateMgrAndClient(start_query_id + i, buffers_per_mgr, _block_size, - reserved_blocks_per_client, &client); - mgrs->push_back(mgr); - clients->push_back(client); - } - } - - // Destroy all created query states and associated block managers. - void TearDownMgrs() { - // Freeing all block managers should clean up all consumed memory. - _test_env->tear_down_query_states(); - } - - void AllocateBlocks(BufferedBlockMgr2* block_mgr, BufferedBlockMgr2::Client* client, - int num_blocks, std::vector* blocks) { - int32_t* data = nullptr; - Status status; - BufferedBlockMgr2::Block* new_block; - for (int i = 0; i < num_blocks; ++i) { - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(new_block != nullptr); - data = new_block->allocate(sizeof(int32_t)); - *data = blocks->size(); - blocks->push_back(new_block); - } - } - - // Pin all blocks, expecting they are pinned successfully. - void PinBlocks(const std::vector& blocks) { - for (int i = 0; i < blocks.size(); ++i) { - bool pinned = false; - EXPECT_TRUE(blocks[i]->pin(&pinned).ok()); - EXPECT_TRUE(pinned); - } - } - - // Pin all blocks, expecting no errors from unpin() calls. - void UnpinBlocks(const std::vector& blocks) { - for (int i = 0; i < blocks.size(); ++i) { - EXPECT_TRUE(blocks[i]->unpin().ok()); - } - } - - static void WaitForWrites(BufferedBlockMgr2* block_mgr) { - std::vector block_mgrs; - block_mgrs.push_back(block_mgr); - WaitForWrites(block_mgrs); - } - - // Wait for writes issued through block managers to complete. - static void WaitForWrites(const std::vector& block_mgrs) { - int max_attempts = WRITE_WAIT_MILLIS / WRITE_CHECK_INTERVAL_MILLIS; - for (int i = 0; i < max_attempts; ++i) { - SleepFor(MonoDelta::FromMilliseconds(WRITE_CHECK_INTERVAL_MILLIS)); - if (AllWritesComplete(block_mgrs)) { - return; - } - } - EXPECT_TRUE(false) << "Writes did not complete after " << WRITE_WAIT_MILLIS << "ms"; - } - - static bool AllWritesComplete(const std::vector& block_mgrs) { - for (int i = 0; i < block_mgrs.size(); ++i) { - RuntimeProfile::Counter* writes_outstanding = - block_mgrs[i]->profile()->get_counter("BlockWritesOutstanding"); - if (writes_outstanding->value() != 0) { - return false; - } - } - return true; - } - - // Delete the temporary file backing a block - all subsequent writes to the file - // should fail. Expects backing file has already been allocated. 
- static void DeleteBackingFile(BufferedBlockMgr2::Block* block) { - const string& path = block->tmp_file_path(); - EXPECT_GT(path.size(), 0); - EXPECT_TRUE(remove(path)); - LOG(INFO) << "Injected fault by deleting file " << path; - } - - // Check that the file backing the block has dir as a prefix of its path. - static bool BlockInDir(BufferedBlockMgr2::Block* block, const string& dir) { - return block->tmp_file_path().find(dir) == 0; - } - - // Find a block in the list that is backed by a file with the given directory as prefix - // of its path. - static BufferedBlockMgr2::Block* FindBlockForDir( - const std::vector& blocks, const string& dir) { - for (int i = 0; i < blocks.size(); ++i) { - if (BlockInDir(blocks[i], dir)) { - return blocks[i]; - } - } - return nullptr; - } - - void TestGetNewBlockImpl(int block_size) { - Status status; - int max_num_blocks = 5; - BufferedBlockMgr2* block_mgr = nullptr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), 0); - - // Allocate blocks until max_num_blocks, they should all succeed and memory - // usage should go up. - BufferedBlockMgr2::Block* new_block; - BufferedBlockMgr2::Block* first_block = nullptr; - for (int i = 0; i < max_num_blocks; ++i) { - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), (i + 1) * block_size); - if (first_block == nullptr) { - first_block = new_block; - } - } - - // Trying to allocate a new one should fail. - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block == nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - - // We can allocate a new block by transferring an already allocated one. - uint8_t* old_buffer = first_block->buffer(); - status = block_mgr->get_new_block(client, first_block, &new_block); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(old_buffer == new_block->buffer()); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - EXPECT_TRUE(!first_block->is_pinned()); - - // Trying to allocate a new one should still fail. - status = block_mgr->get_new_block(client, nullptr, &new_block); - EXPECT_TRUE(new_block == nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size); - - EXPECT_EQ(block_mgr->writes_issued(), 1); - TearDownMgrs(); - } - - void TestEvictionImpl(int block_size) { - Status status; - DCHECK_GT(block_size, 0); - int max_num_buffers = 5; - BufferedBlockMgr2* block_mgr = nullptr; - BufferedBlockMgr2::Client* client = nullptr; - block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, - block_mgr->get_tracker(client), &client); - - // Check counters. 
- RuntimeProfile* profile = block_mgr->profile(); - RuntimeProfile::Counter* buffered_pin = profile->get_counter("BufferedPins"); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - - EXPECT_EQ(block_mgr->bytes_allocated(), max_num_buffers * block_size); - for (BufferedBlockMgr2::Block* block : blocks) { - block->unpin(); - } - - // Re-pinning all blocks - for (int i = 0; i < blocks.size(); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - int buffered_pins_expected = blocks.size(); - EXPECT_EQ(buffered_pin->value(), buffered_pins_expected); - - // Unpin all blocks - for (BufferedBlockMgr2::Block* block : blocks) { - block->unpin(); - } - // Get two new blocks. - AllocateBlocks(block_mgr, client, 2, &blocks); - // At least two writes must be issued. The first (num_blocks - 2) must be in memory. - EXPECT_GE(block_mgr->writes_issued(), 2); - for (int i = 0; i < (max_num_buffers - 2); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - EXPECT_GE(buffered_pin->value(), buffered_pins_expected); - - // can not pin any more - for (int i = (max_num_buffers - 2); i < max_num_buffers; ++i) { - bool pinned = true; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - } - - // the last 2 block has already been pinned - for (int i = max_num_buffers; i < blocks.size(); ++i) { - bool pinned = false; - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateBlock(blocks[i], i); - } - - TearDownMgrs(); - } - - // Test that randomly issues GetFreeBlock(), pin(), unpin(), del() and Close() - // calls. All calls made are legal - error conditions are not expected until the first - // call to Close(). This is called 2 times with encryption+integrity on/off. - // When executed in single-threaded mode 'tid' should be SINGLE_THREADED_TID. - static const int SINGLE_THREADED_TID = -1; - void TestRandomInternalImpl(RuntimeState* state, BufferedBlockMgr2* block_mgr, int num_buffers, - int tid) { - DCHECK(block_mgr != nullptr); - const int num_iterations = 100000; - const int iters_before_close = num_iterations - 5000; - bool close_called = false; - unordered_map pinned_block_map; - std::vector> pinned_blocks; - unordered_map unpinned_block_map; - std::vector> unpinned_blocks; - - typedef enum { Pin, New, Unpin, Delete, Close } ApiFunction; - ApiFunction api_function; - - BufferedBlockMgr2::Client* client; - Status status = block_mgr->register_client(0, state, &client); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(client != nullptr); - - pinned_blocks.reserve(num_buffers); - BufferedBlockMgr2::Block* new_block; - for (int i = 0; i < num_iterations; ++i) { - if ((i % 20000) == 0) { - LOG(ERROR) << " Iteration " << i << std::endl; - } - if (i > iters_before_close && (rand() % 5 == 0)) { - api_function = Close; - } else if (pinned_blocks.size() == 0 && unpinned_blocks.size() == 0) { - api_function = New; - } else if (pinned_blocks.size() == 0) { - // Pin or New. Can't unpin or delete. - api_function = static_cast(rand() % 2); - } else if (pinned_blocks.size() >= num_buffers) { - // Unpin or delete. Can't pin or get new. - api_function = static_cast(2 + (rand() % 2)); - } else if (unpinned_blocks.size() == 0) { - // Can't pin. Unpin, new or delete. 
- api_function = static_cast(1 + (rand() % 3)); - } else { - // Any api function. - api_function = static_cast(rand() % 4); - } - - std::pair block_data; - int rand_pick = 0; - int32_t* data = nullptr; - bool pinned = false; - switch (api_function) { - case New: - status = block_mgr->get_new_block(client, nullptr, &new_block); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(new_block == nullptr); - EXPECT_TRUE(status.is_cancelled()); - continue; - } - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(new_block != nullptr); - data = MakeRandomSizeData(new_block); - block_data = std::make_pair(new_block, *data); - - pinned_blocks.push_back(block_data); - pinned_block_map.insert(std::make_pair(block_data.first, pinned_blocks.size() - 1)); - break; - case Pin: - rand_pick = rand() % unpinned_blocks.size(); - block_data = unpinned_blocks[rand_pick]; - status = block_data.first->pin(&pinned); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(status.is_cancelled()); - // In single-threaded runs the block should not have been pinned. - // In multi-threaded runs pin() may return the block pinned but the status to - // be cancelled. In this case we could move the block from unpinned_blocks - // to pinned_blocks. We do not do that because after is_cancelled() no actual - // block operations should take place. - // reason: when block_mgr is cancelled in one thread, the same block_mgr - // is waiting for scan-range to be ready. - if (tid == SINGLE_THREADED_TID) { - EXPECT_FALSE(pinned); - } - continue; - } - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - ValidateRandomSizeData(block_data.first, block_data.second); - unpinned_blocks[rand_pick] = unpinned_blocks.back(); - unpinned_blocks.pop_back(); - unpinned_block_map[unpinned_blocks[rand_pick].first] = rand_pick; - - pinned_blocks.push_back(block_data); - pinned_block_map.insert(std::make_pair(block_data.first, pinned_blocks.size() - 1)); - break; - case Unpin: - rand_pick = rand() % pinned_blocks.size(); - block_data = pinned_blocks[rand_pick]; - status = block_data.first->unpin(); - if (close_called || (tid != SINGLE_THREADED_TID && status.is_cancelled())) { - EXPECT_TRUE(status.is_cancelled()); - continue; - } - EXPECT_TRUE(status.ok()); - pinned_blocks[rand_pick] = pinned_blocks.back(); - pinned_blocks.pop_back(); - pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick; - - unpinned_blocks.push_back(block_data); - unpinned_block_map.insert( - std::make_pair(block_data.first, unpinned_blocks.size() - 1)); - break; - case Delete: - rand_pick = rand() % pinned_blocks.size(); - block_data = pinned_blocks[rand_pick]; - block_data.first->del(); - pinned_blocks[rand_pick] = pinned_blocks.back(); - pinned_blocks.pop_back(); - pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick; - break; - case Close: - block_mgr->cancel(); - close_called = true; - break; - } // end switch (apiFunction) - } // end for () - } - - // Single-threaded execution of the TestRandomInternalImpl. - void TestRandomInternalSingle(int block_size) { - DCHECK_GT(block_size, 0); - DCHECK(_test_env.get() != nullptr); - const int max_num_buffers = 100; - RuntimeState* state = nullptr; - BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &state); - TestRandomInternalImpl(state, block_mgr, max_num_buffers, SINGLE_THREADED_TID); - TearDownMgrs(); - } - - // Multi-threaded execution of the TestRandomInternalImpl. 
- void TestRandomInternalMulti(int num_threads, int block_size) { - DCHECK_GT(num_threads, 0); - DCHECK_GT(block_size, 0); - DCHECK(_test_env.get() != nullptr); - const int max_num_buffers = 100; - RuntimeState* state = nullptr; - BufferedBlockMgr2* block_mgr = - CreateMgr(0, num_threads * max_num_buffers, block_size, &state); - - ThreadGroup workers; - for (int i = 0; i < num_threads; ++i) { - thread* t = new thread(std::bind(&BufferedBlockMgrTest::TestRandomInternalImpl, this, - state, block_mgr, max_num_buffers, i)); - workers.add_thread(t); - } - workers.join_all(); - TearDownMgrs(); - } - - // Repeatedly call BufferedBlockMgr2::Create() and BufferedBlockMgr2::~BufferedBlockMgr2(). - void CreateDestroyThread(int index, RuntimeState* state) { - const int num_buffers = 10; - const int iters = 100; - for (int i = 0; i < iters; ++i) { - LOG(WARNING) << "CreateDestroyThread thread " << index << " begin " << i << std::endl; - std::shared_ptr mgr; - Status status = BufferedBlockMgr2::create(state, -1, state->runtime_profile(), - _test_env->tmp_file_mgr(), - _block_size * num_buffers, _block_size, &mgr); - LOG(WARNING) << "CreateDestroyThread thread " << index << " end " << i << std::endl; - } - } - - // IMPALA-2286: Test for races between BufferedBlockMgr2::Create() and - // BufferedBlockMgr2::~BufferedBlockMgr2(). - void CreateDestroyMulti() { - const int num_threads = 4; - ThreadGroup workers; - // Create a shared RuntimeState with no BufferedBlockMgr2. - RuntimeState* shared_state = new RuntimeState(TUniqueId(), TQueryOptions(), TQueryGlobals(), - _test_env->exec_env()); - for (int i = 0; i < num_threads; ++i) { - thread* t = new thread( - std::bind(&BufferedBlockMgrTest::CreateDestroyThread, this, i, shared_state)); - workers.add_thread(t); - } - workers.join_all(); - } - - std::unique_ptr _test_env; - std::vector _created_tmp_dirs; -}; - -TEST_F(BufferedBlockMgrTest, get_new_block) { - TestGetNewBlockImpl(1024); - TestGetNewBlockImpl(8 * 1024); - TestGetNewBlockImpl(8 * 1024 * 1024); - LOG(WARNING) << "finish test get_new_block." << std::endl; -} - -TEST_F(BufferedBlockMgrTest, GetNewBlockSmallBlocks) { - const int block_size = 1024; - int max_num_blocks = 3; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - EXPECT_EQ(0, block_mgr->mem_tracker()->consumption()); - - std::vector blocks; - - // Allocate a small block. 
- BufferedBlockMgr2::Block* new_block = nullptr; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, 128).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), 0); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), 0); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), 128); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), 128); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Allocate a normal block - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), 128 + block_mgr->max_block_size()); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), block_mgr->max_block_size()); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Allocate another small block. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, 512).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size()); - EXPECT_EQ(block_mgr->get_tracker(client)->consumption(), - 128 + 512 + block_mgr->max_block_size()); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(new_block->bytes_remaining(), 512); - EXPECT_TRUE(new_block->buffer() != nullptr); - blocks.push_back(new_block); - - // Should be able to unpin and pin the middle block - EXPECT_TRUE(blocks[1]->unpin().ok()); - - bool pinned; - EXPECT_TRUE(blocks[1]->pin(&pinned).ok()); - EXPECT_TRUE(pinned); - - for (int i = 0; i < blocks.size(); ++i) { - blocks[i]->del(); - } - - TearDownMgrs(); -} - -// Test that pinning more blocks than the max available buffers. -TEST_F(BufferedBlockMgrTest, Pin) { - Status status; - int max_num_blocks = 5; - const int block_size = 1024; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, &client); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_blocks, &blocks); - - // Unpin them all. - for (int i = 0; i < blocks.size(); ++i) { - status = blocks[i]->unpin(); - EXPECT_TRUE(status.ok()); - } - - // Allocate more, this should work since we just unpinned some blocks. - AllocateBlocks(block_mgr, client, max_num_blocks, &blocks); - - // Try to pin a unpinned block, this should not be possible. - bool pinned; - status = blocks[0]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - - // Unpin all blocks. - for (int i = 0; i < blocks.size(); ++i) { - status = blocks[i]->unpin(); - EXPECT_TRUE(status.ok()); - } - - // Should be able to pin max_num_blocks blocks. - for (int i = 0; i < max_num_blocks; ++i) { - status = blocks[i]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(pinned); - } - - // Can't pin any more though. - status = blocks[max_num_blocks]->pin(&pinned); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(pinned); - - TearDownMgrs(); -} - -// Test the eviction policy of the block mgr. No writes issued until more than -// the max available buffers are allocated. Writes must be issued in LIFO order. 
-TEST_F(BufferedBlockMgrTest, Eviction) { - TestEvictionImpl(1024); - TestEvictionImpl(8 * 1024 * 1024); -} - -// Test deletion and reuse of blocks. -TEST_F(BufferedBlockMgrTest, Deletion) { - int max_num_buffers = 5; - const int block_size = 1024; - BufferedBlockMgr2* block_mgr; - BufferedBlockMgr2::Client* client; - block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client); - - // Check counters. - RuntimeProfile* profile = block_mgr->profile(); - RuntimeProfile::Counter* recycled_cnt = profile->get_counter("BlocksRecycled"); - RuntimeProfile::Counter* created_cnt = profile->get_counter("BlocksCreated"); - - std::vector blocks; - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - EXPECT_TRUE(created_cnt->value() == max_num_buffers); - - for (BufferedBlockMgr2::Block* block : blocks) { - block->del(); - } - AllocateBlocks(block_mgr, client, max_num_buffers, &blocks); - EXPECT_TRUE(created_cnt->value() == max_num_buffers); - EXPECT_TRUE(recycled_cnt->value() == max_num_buffers); - - TearDownMgrs(); -} - -// Delete blocks of various sizes and statuses to exercise the different code paths. -// This relies on internal validation in block manager to detect many errors. -TEST_F(BufferedBlockMgrTest, DeleteSingleBlocks) { - int max_num_buffers = 16; - BufferedBlockMgr2::Client* client; - BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client); - - // Pinned I/O block. - BufferedBlockMgr2::Block* new_block; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->del(); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - // Pinned non-I/O block. - int small_block_size = 128; - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block, small_block_size).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_EQ(small_block_size, block_mgr->get_tracker(client)->consumption()); - new_block->del(); - EXPECT_EQ(0, block_mgr->get_tracker(client)->consumption()); - - // Unpinned I/O block - delete after written to disk. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->unpin(); - EXPECT_FALSE(new_block->is_pinned()); - WaitForWrites(block_mgr); - new_block->del(); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - // Unpinned I/O block - delete before written to disk. - EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &new_block).ok()); - EXPECT_TRUE(new_block != nullptr); - EXPECT_TRUE(new_block->is_pinned()); - EXPECT_TRUE(new_block->is_max_size()); - new_block->unpin(); - EXPECT_FALSE(new_block->is_pinned()); - new_block->del(); - WaitForWrites(block_mgr); - EXPECT_TRUE(block_mgr->get_tracker(client)->consumption() == 0); - - TearDownMgrs(); -} - -// Test that all APIs return cancelled after close. 
-TEST_F(BufferedBlockMgrTest, Close) {
-    int max_num_buffers = 5;
-    const int block_size = 1024;
-    BufferedBlockMgr2* block_mgr;
-    BufferedBlockMgr2::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-    block_mgr->cancel();
-
-    BufferedBlockMgr2::Block* new_block;
-    Status status = block_mgr->get_new_block(client, nullptr, &new_block);
-    EXPECT_TRUE(status.is_cancelled());
-    EXPECT_TRUE(new_block == nullptr);
-    status = blocks[0]->unpin();
-    EXPECT_TRUE(status.is_cancelled());
-    bool pinned;
-    status = blocks[0]->pin(&pinned);
-    EXPECT_TRUE(status.is_cancelled());
-    blocks[1]->del();
-
-    TearDownMgrs();
-}
-
-// Clear the scratch directory. Returns the number of files deleted.
-static int clear_scratch_dir() {
-    int num_files = 0;
-    directory_iterator dir_it(SCRATCH_DIR);
-    for (; dir_it != directory_iterator(); ++dir_it) {
-        ++num_files;
-        remove_all(dir_it->path());
-    }
-    return num_files;
-}
-
-// Test that the block manager behaves correctly after a write error. Delete the scratch
-// directory before an operation that would cause a write and test that subsequent API
-// calls return 'CANCELLED' correctly.
-TEST_F(BufferedBlockMgrTest, WriteError) {
-    Status status;
-    int max_num_buffers = 2;
-    const int block_size = 1024;
-    BufferedBlockMgr2* block_mgr;
-    BufferedBlockMgr2::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    // Unpin two blocks here, to ensure that backing storage is allocated in a tmp file.
-    for (int i = 0; i < 2; ++i) {
-        status = blocks[i]->unpin();
-        EXPECT_TRUE(status.ok());
-    }
-    WaitForWrites(block_mgr);
-    // Repin the blocks.
-    for (int i = 0; i < 2; ++i) {
-        bool pinned;
-        status = blocks[i]->pin(&pinned);
-        EXPECT_TRUE(status.ok());
-        EXPECT_TRUE(pinned);
-    }
-    // Remove the backing storage so that future writes will fail.
-    int num_files = clear_scratch_dir();
-    EXPECT_TRUE(num_files > 0);
-    for (int i = 0; i < 2; ++i) {
-        status = blocks[i]->unpin();
-        EXPECT_TRUE(status.ok());
-    }
-    WaitForWrites(block_mgr);
-    // Subsequent calls should fail.
-    for (int i = 0; i < 2; ++i) {
-        blocks[i]->del();
-    }
-    BufferedBlockMgr2::Block* new_block;
-    status = block_mgr->get_new_block(client, nullptr, &new_block);
-    EXPECT_TRUE(status.is_cancelled());
-    EXPECT_TRUE(new_block == nullptr);
-
-    TearDownMgrs();
-}
-
-// Test block manager error handling when temporary file space cannot be allocated to
-// back an unpinned buffer.
-TEST_F(BufferedBlockMgrTest, TmpFileAllocateError) {
-    Status status;
-    int max_num_buffers = 2;
-    BufferedBlockMgr2::Client* client;
-    BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client);
-
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    // Unpin a block, forcing a write.
-    status = blocks[0]->unpin();
-    EXPECT_TRUE(status.ok());
-    WaitForWrites(block_mgr);
-    // Remove temporary files - subsequent operations will fail.
-    int num_files = clear_scratch_dir();
-    EXPECT_TRUE(num_files > 0);
-    // The current implementation will fail here because it tries to expand the tmp file
-    // immediately. This behavior is not contractual, but we want to know if it changes
-    // accidentally.
-    status = blocks[1]->unpin();
-    EXPECT_FALSE(status.ok());
-
-    TearDownMgrs();
-}
-
-// Test that the block manager is able to blacklist a temporary device correctly after a
-// write error. We should not allocate more blocks on that device, but existing blocks
-// on the device will remain in use.
-// Disabled because blacklisting was disabled as a workaround for IMPALA-2305.
-TEST_F(BufferedBlockMgrTest, DISABLED_WriteErrorBlacklist) {
-    // TEST_F(BufferedBlockMgrTest, WriteErrorBlacklist) {
-    // Set up two buffered block managers with two temporary dirs.
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    // Simulate two concurrent queries.
-    const int NUM_BLOCK_MGRS = 2;
-    const int MAX_NUM_BLOCKS = 4;
-    int blocks_per_mgr = MAX_NUM_BLOCKS / NUM_BLOCK_MGRS;
-    std::vector<BufferedBlockMgr2*> block_mgrs;
-    std::vector<BufferedBlockMgr2::Client*> clients;
-    CreateMgrsAndClients(0, NUM_BLOCK_MGRS, blocks_per_mgr, _block_size, 0, &block_mgrs, &clients);
-
-    // Allocate files for all 2x2 combinations by unpinning blocks.
-    std::vector<std::vector<BufferedBlockMgr2::Block*>> blocks;
-    std::vector<BufferedBlockMgr2::Block*> all_blocks;
-    for (int i = 0; i < NUM_BLOCK_MGRS; ++i) {
-        std::vector<BufferedBlockMgr2::Block*> mgr_blocks;
-        AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-        UnpinBlocks(mgr_blocks);
-        for (int j = 0; j < blocks_per_mgr; ++j) {
-            LOG(INFO) << "Manager " << i << " Block " << j << " backed by file "
-                      << mgr_blocks[j]->tmp_file_path();
-        }
-        blocks.push_back(mgr_blocks);
-        all_blocks.insert(all_blocks.end(), mgr_blocks.begin(), mgr_blocks.end());
-    }
-    WaitForWrites(block_mgrs);
-    int error_mgr = 0;
-    int no_error_mgr = 1;
-    const string& error_dir = tmp_dirs[0];
-    const string& good_dir = tmp_dirs[1];
-    // Delete one file from the first scratch dir for the first block manager.
-    BufferedBlockMgr2::Block* error_block = FindBlockForDir(blocks[error_mgr], error_dir);
-    EXPECT_TRUE(error_block != nullptr) << "Expected a tmp file in dir " << error_dir;
-    PinBlocks(all_blocks);
-    DeleteBackingFile(error_block);
-    UnpinBlocks(all_blocks); // Should succeed since tmp file space was already allocated.
-    WaitForWrites(block_mgrs);
-    EXPECT_TRUE(block_mgrs[error_mgr]->is_cancelled());
-    EXPECT_FALSE(block_mgrs[no_error_mgr]->is_cancelled());
-    // The temporary device with the error should no longer be active.
-    std::vector<TmpFileMgr::DeviceId> active_tmp_devices =
-            _test_env->tmp_file_mgr()->active_tmp_devices();
-    EXPECT_EQ(tmp_dirs.size() - 1, active_tmp_devices.size());
-    for (int i = 0; i < active_tmp_devices.size(); ++i) {
-        const string& device_path =
-                _test_env->tmp_file_mgr()->get_tmp_dir_path(active_tmp_devices[i]);
-        EXPECT_EQ(string::npos, error_dir.find(device_path));
-    }
-    // The second block manager should continue using allocated scratch space, since it
-    // didn't encounter a write error itself. In future this could change, but for now it
-    // is the intended behaviour.
-    PinBlocks(blocks[no_error_mgr]);
-    UnpinBlocks(blocks[no_error_mgr]);
-    EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], good_dir) != nullptr);
-    EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], error_dir) != nullptr);
-    // The second block manager should avoid using the bad directory for new blocks.
-    std::vector<BufferedBlockMgr2::Block*> no_error_new_blocks;
-    AllocateBlocks(block_mgrs[no_error_mgr], clients[no_error_mgr], blocks_per_mgr,
-                   &no_error_new_blocks);
-    UnpinBlocks(no_error_new_blocks);
-    for (int i = 0; i < no_error_new_blocks.size(); ++i) {
-        LOG(INFO) << "Newly created block backed by file "
-                  << no_error_new_blocks[i]->tmp_file_path();
-        EXPECT_TRUE(BlockInDir(no_error_new_blocks[i], good_dir));
-    }
-    // A new block manager should only use the good dir for backing storage.
-    BufferedBlockMgr2::Client* new_client;
-    BufferedBlockMgr2* new_block_mgr =
-            CreateMgrAndClient(9999, blocks_per_mgr, _block_size, 0, &new_client);
-    std::vector<BufferedBlockMgr2::Block*> new_mgr_blocks;
-    AllocateBlocks(new_block_mgr, new_client, blocks_per_mgr, &new_mgr_blocks);
-    UnpinBlocks(new_mgr_blocks);
-    for (int i = 0; i < blocks_per_mgr; ++i) {
-        LOG(INFO) << "New manager Block " << i << " backed by file "
-                  << new_mgr_blocks[i]->tmp_file_path();
-        EXPECT_TRUE(BlockInDir(new_mgr_blocks[i], good_dir));
-    }
-}
-
-// Check that an allocation error resulting from the removal of a directory results in
-// blocks being allocated in other directories.
-TEST_F(BufferedBlockMgrTest, AllocationErrorHandling) {
-    // Set up two buffered block managers with two temporary dirs.
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    // Simulate two concurrent queries.
-    int num_block_mgrs = 2;
-    int max_num_blocks = 4;
-    int blocks_per_mgr = max_num_blocks / num_block_mgrs;
-    // std::vector<RuntimeState*> runtime_states;
-    std::vector<BufferedBlockMgr2*> block_mgrs;
-    std::vector<BufferedBlockMgr2::Client*> clients;
-    CreateMgrsAndClients(0, num_block_mgrs, blocks_per_mgr, _block_size, 0, &block_mgrs, &clients);
-
-    // Allocate files for all 2x2 combinations by unpinning blocks.
-    std::vector<std::vector<BufferedBlockMgr2::Block*>> blocks;
-    for (int i = 0; i < num_block_mgrs; ++i) {
-        std::vector<BufferedBlockMgr2::Block*> mgr_blocks;
-        LOG(INFO) << "Iter " << i;
-        AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-        blocks.push_back(mgr_blocks);
-    }
-    const string& bad_dir = tmp_dirs[0];
-    const string& bad_scratch_subdir = bad_dir + SCRATCH_SUFFIX;
-    // const string& good_dir = tmp_dirs[1];
-    // const string& good_scratch_subdir = good_dir + SCRATCH_SUFFIX;
-    chmod(bad_scratch_subdir.c_str(), 0);
-    // The block mgr should attempt to allocate space in the bad dir for one block, which
-    // will cause an error when it tries to create/expand the file. It should recover and
-    // just use the good dir.
-    UnpinBlocks(blocks[0]);
-    // Directories remain on the active list even when they experience errors.
-    EXPECT_EQ(2, _test_env->tmp_file_mgr()->num_active_tmp_devices());
-    // Blocks should not be written to the bad dir even if it remains non-writable.
-    UnpinBlocks(blocks[1]);
-    // All writes should succeed.
-    WaitForWrites(block_mgrs);
-    for (int i = 0; i < blocks.size(); ++i) {
-        for (int j = 0; j < blocks[i].size(); ++j) {
-            blocks[i][j]->del();
-        }
-    }
-}
-
-// Test that the block manager fails cleanly when all directories are inaccessible at
-// runtime.
-TEST_F(BufferedBlockMgrTest, NoDirsAllocationError) {
-    std::vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-    int max_num_buffers = 2;
-    BufferedBlockMgr2::Client* client;
-    BufferedBlockMgr2* block_mgr = CreateMgrAndClient(0, max_num_buffers, _block_size, 0, &client);
-    std::vector<BufferedBlockMgr2::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-    for (int i = 0; i < tmp_dirs.size(); ++i) {
-        const string& tmp_scratch_subdir = tmp_dirs[i] + SCRATCH_SUFFIX;
-        chmod(tmp_scratch_subdir.c_str(), 0);
-    }
-    for (int i = 0; i < blocks.size(); ++i) {
-        EXPECT_FALSE(blocks[i]->unpin().ok());
-    }
-}
-
-// Create two clients with different numbers of reserved buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClients) {
-    Status status;
-    int client1_buffers = 3;
-    int client2_buffers = 5;
-    int max_num_buffers = client1_buffers + client2_buffers;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    // Reserve client 1's and 2's buffers. They should succeed.
-    bool reserved = block_mgr->try_acquire_tmp_reservation(client1, 1);
-    EXPECT_TRUE(reserved);
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_TRUE(reserved);
-
-    std::vector<BufferedBlockMgr2::Block*> client1_blocks;
-    // Allocate all of client1's reserved blocks; they should all succeed.
-    AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-    // Try allocating one more; that should fail.
-    BufferedBlockMgr2::Block* block;
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Trying to reserve should also fail.
-    reserved = block_mgr->try_acquire_tmp_reservation(client1, 1);
-    EXPECT_FALSE(reserved);
-
-    // Allocate all of client2's reserved blocks; these should succeed.
-    std::vector<BufferedBlockMgr2::Block*> client2_blocks;
-    AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-    // Try allocating one more from client 2; that should fail.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Unpin one block from client 1.
-    status = client1_blocks[0]->unpin();
-    EXPECT_TRUE(status.ok());
-
-    // Client 2 should still not be able to allocate.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Client 2 should still not be able to reserve.
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_FALSE(reserved);
-
-    // Client 1 should be able to, though.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Unpin two of client 1's blocks (client 1 should have 3 unpinned blocks now).
-    status = client1_blocks[1]->unpin();
-    EXPECT_TRUE(status.ok());
-    status = client1_blocks[2]->unpin();
-    EXPECT_TRUE(status.ok());
-
-    // Clear client 1's reservation.
-    block_mgr->clear_reservations(client1);
-
-    // Client 2 should be able to reserve 1 buffer now (there are 2 left).
-    reserved = block_mgr->try_acquire_tmp_reservation(client2, 1);
-    EXPECT_TRUE(reserved);
-
-    // Client 1 can only pin one block.
-    bool pinned;
-    status = client1_blocks[0]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-    // Can't get this one.
-    status = client1_blocks[1]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_FALSE(pinned);
-
-    // Client 2 can pick up the one reserved buffer.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-    // But not a second one.
-    BufferedBlockMgr2::Block* block2;
-    status = block_mgr->get_new_block(client2, nullptr, &block2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block2 == nullptr);
-
-    // Unpin the block client 2 got from the reservation. Since this is a tmp
-    // reservation, client 1 can pick it up again (it is no longer reserved).
-    status = block->unpin();
-    EXPECT_TRUE(status.ok());
-    status = client1_blocks[1]->pin(&pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-
-    TearDownMgrs();
-}
-
-// Create two clients with different numbers of reserved buffers plus some extra buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClientsExtraBuffers) {
-    Status status;
-    int client1_buffers = 1;
-    int client2_buffers = 1;
-    int max_num_buffers = client1_buffers + client2_buffers + 2;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    BufferedBlockMgr2::Block* block = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    std::vector<BufferedBlockMgr2::Block*> client1_blocks;
-    // Allocate all of client1's reserved blocks; they should all succeed.
-    AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-    // Allocate all of client2's reserved blocks; these should succeed.
-    std::vector<BufferedBlockMgr2::Block*> client2_blocks;
-    AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-    // We have two spare buffers now. Each client should be able to allocate one.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Now we are completely full; no one should be able to allocate a new block.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    TearDownMgrs();
-}
-
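MultipleClients and MultipleClientsExtraBuffers above encode the reservation arithmetic being removed: buffers registered for a client are guaranteed to it, temporary reservations can be acquired on top, and anything beyond that is best-effort. A compressed sketch of that accounting, using only the pre-deletion calls exercised above (`block_mgr` and `runtime_state` are assumed fixture state):

    BufferedBlockMgr2::Client* client = nullptr;
    // Registering with N reserved buffers guarantees the client N pinned blocks.
    EXPECT_TRUE(block_mgr->register_client(3, runtime_state, &client).ok());
    // try_acquire_tmp_reservation() grabs an extra buffer if one is free; it
    // simply returns false (no error Status) when the manager is fully committed.
    bool reserved = block_mgr->try_acquire_tmp_reservation(client, 1);
    // Past the reservation, get_new_block() returns OK but a null block;
    // clear_reservations() releases the guarantee so other clients can use it.
    BufferedBlockMgr2::Block* block = nullptr;
    EXPECT_TRUE(block_mgr->get_new_block(client, nullptr, &block).ok());
    if (block == nullptr) block_mgr->clear_reservations(client);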
-// Create two clients causing oversubscription.
-TEST_F(BufferedBlockMgrTest, ClientOversubscription) {
-    Status status;
-    int client1_buffers = 1;
-    int client2_buffers = 2;
-    int max_num_buffers = 2;
-    const int block_size = 1024;
-    RuntimeState* runtime_state;
-    BufferedBlockMgr2* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-    BufferedBlockMgr2::Client* client1 = nullptr;
-    BufferedBlockMgr2::Client* client2 = nullptr;
-    BufferedBlockMgr2::Block* block = nullptr;
-    status = block_mgr->register_client(client1_buffers, runtime_state, &client1);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client1 != nullptr);
-    status = block_mgr->register_client(client2_buffers, runtime_state, &client2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(client2 != nullptr);
-
-    // Client 1 allocates its first block; this should work.
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // Client 2 allocates its first block; this should work.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block != nullptr);
-
-    // At this point we've used both buffers. Client 1 reserved only one, so subsequent
-    // calls should fail with no error (but return no block).
-    status = block_mgr->get_new_block(client1, nullptr, &block);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(block == nullptr);
-
-    // Allocate with client 2. Since client 2 reserved 2 buffers, this should fail
-    // with MEM_LIMIT_EXCEEDED.
-    status = block_mgr->get_new_block(client2, nullptr, &block);
-    EXPECT_TRUE(status.is_mem_limit_exceeded());
-
-    TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_plain) {
-    TestRandomInternalSingle(1024);
-    TestRandomInternalSingle(8 * 1024);
-    TestRandomInternalSingle(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_plain) {
-    TestRandomInternalMulti(2, 1024);
-    TestRandomInternalMulti(2, 8 * 1024);
-    TestRandomInternalMulti(2, 8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_plain) {
-    TestRandomInternalMulti(4, 1024);
-    TestRandomInternalMulti(4, 8 * 1024);
-    TestRandomInternalMulti(4, 8 * 1024 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-TEST_F(BufferedBlockMgrTest, DISABLED_Multi8Random_plain) {
-    TestRandomInternalMulti(8, 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, CreateDestroyMulti) {
-    CreateDestroyMulti();
-}
-
-} // end namespace doris
diff --git a/be/test/runtime/buffered_tuple_stream2_test.cpp b/be/test/runtime/buffered_tuple_stream2_test.cpp
deleted file mode 100644
index 3bf3a7ed7a..0000000000
--- a/be/test/runtime/buffered_tuple_stream2_test.cpp
+++ /dev/null
@@ -1,821 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include
-#include
-#include <limits> // for std::numeric_limits<int>::max()
-#include
-
-#include "gen_cpp/Types_types.h"
-#include "runtime/buffered_tuple_stream2.inline.h"
-#include "runtime/row_batch.h"
-#include "runtime/string_value.hpp"
-#include "runtime/test_env.h"
-#include "runtime/tmp_file_mgr.h"
-#include "runtime/types.h"
-#include "testutil/desc_tbl_builder.h"
-#include "util/cpu_info.h"
-#include "util/debug_util.h"
-#include "util/disk_info.h"
-
-using std::vector;
-
-using std::unique_ptr;
-
-static const int BATCH_SIZE = 250;
-static const uint32_t PRIME = 479001599;
-
-namespace doris {
-
-static const StringValue STRINGS[] = {
-        StringValue("ABC"),
-        StringValue("HELLO"),
-        StringValue("123456789"),
-        StringValue("FOOBAR"),
-        StringValue("ONE"),
-        StringValue("THREE"),
-        StringValue("abcdefghijklmno"),
-        StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-        StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-};
-
-static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
-
-class SimpleTupleStreamTest : public testing::Test {
-public:
-    SimpleTupleStreamTest() {}
-    // A null dtor to pass codestyle check
-    ~SimpleTupleStreamTest() {}
-
-protected:
-    virtual void SetUp() {
-        _test_env.reset(new TestEnv());
-        create_descriptors();
-        _mem_pool.reset(new MemPool());
-    }
-
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples(1, false);
-        std::vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        // string_builder.declare_tuple() << TYPE_STRING;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-
-    virtual void TearDown() {
-        _runtime_state = nullptr;
-        _client = nullptr;
-        _pool.clear();
-        _mem_pool->free_all();
-        _test_env.reset();
-    }
-
-    // Set up a block manager with the provided settings and a client with no
-    // reservation, tracked by _tracker.
-    void InitBlockMgr(int64_t limit, int block_size) {
-        Status status = _test_env->create_query_state(0, limit, block_size, &_runtime_state);
-        EXPECT_TRUE(status.ok());
-        status = _runtime_state->block_mgr2()->register_client(0, _runtime_state, &_client);
-        EXPECT_TRUE(status.ok());
-    }
-
-    // Generate the ith element of a sequence of int values.
-    int GenIntValue(int i) {
-        // Multiply by a large prime to get varied bit patterns.
-        return i * PRIME;
-    }
-
-    // Generate the ith element of a sequence of bool values.
-    bool GenBoolValue(int i) {
-        // Use a middle bit of the int value.
-        return ((GenIntValue(i) >> 8) & 0x1) != 0;
-    }
-
-    virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
-        RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows));
-        int tuple_size = _int_desc->tuple_descriptors()[0]->byte_size();
-        uint8_t* tuple_mem = reinterpret_cast<uint8_t*>(
-                batch->tuple_data_pool()->allocate(tuple_size * num_rows));
-        memset(tuple_mem, 0, tuple_size * num_rows);
-
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        for (int i = 0; i < num_rows; ++i) {
-            int idx = batch->add_row();
-            TupleRow* row = batch->get_row(idx);
-            Tuple* int_tuple = reinterpret_cast<Tuple*>(tuple_mem + i * tuple_size);
-            // *reinterpret_cast<int*>(int_tuple + 1) = GenIntValue(i + offset);
-            *reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(int_tuple) + 1) =
-                    GenIntValue(i + offset);
-            for (int j = 0; j < int_tuples; ++j) {
-                int idx = (i + offset) * int_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    row->set_tuple(j, int_tuple);
-                } else {
-                    row->set_tuple(j, nullptr);
-                }
-            }
-            batch->commit_last_row();
-        }
-        return batch;
-    }
-
-    virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
-        int tuple_size = sizeof(StringValue) + 1;
-        RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows));
-        uint8_t* tuple_mem = batch->tuple_data_pool()->allocate(tuple_size * num_rows);
-        memset(tuple_mem, 0, tuple_size * num_rows);
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        for (int i = 0; i < num_rows; ++i) {
-            TupleRow* row = batch->get_row(batch->add_row());
-            *reinterpret_cast<StringValue*>(tuple_mem + 1) = STRINGS[(i + offset) % NUM_STRINGS];
-            for (int j = 0; j < string_tuples; ++j) {
-                int idx = (i + offset) * string_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    row->set_tuple(j, reinterpret_cast<Tuple*>(tuple_mem));
-                } else {
-                    row->set_tuple(j, nullptr);
-                }
-            }
-            batch->commit_last_row();
-            tuple_mem += tuple_size;
-        }
-        return batch;
-    }
-
-    void AppendRowTuples(TupleRow* row, std::vector<int>* results) {
-        DCHECK(row != nullptr);
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        for (int i = 0; i < int_tuples; ++i) {
-            AppendValue(row->get_tuple(i), results);
-        }
-    }
-
-    void AppendRowTuples(TupleRow* row, std::vector<StringValue>* results) {
-        DCHECK(row != nullptr);
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        for (int i = 0; i < string_tuples; ++i) {
-            AppendValue(row->get_tuple(i), results);
-        }
-    }
-
-    void AppendValue(Tuple* t, std::vector<int>* results) {
-        if (t == nullptr) {
-            // The tests indicate nullability using the max int value.
-            results->push_back(std::numeric_limits<int>::max());
-        } else {
-            results->push_back(*reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(t) + 1));
-        }
-    }
-
-    void AppendValue(Tuple* t, std::vector<StringValue>* results) {
-        if (t == nullptr) {
-            results->push_back(StringValue());
-        } else {
-            uint8_t* mem = reinterpret_cast<uint8_t*>(t);
-            StringValue sv = *reinterpret_cast<StringValue*>(mem + 1);
-            uint8_t* copy = _mem_pool->allocate(sv.len);
-            memcpy(copy, sv.ptr, sv.len);
-            sv.ptr = reinterpret_cast<char*>(copy);
-            results->push_back(sv);
-        }
-    }
-
-    template <typename T>
-    void ReadValues(BufferedTupleStream2* stream, RowDescriptor* desc, std::vector<T>* results,
-                    int num_batches = -1) {
-        bool eos = false;
-        RowBatch batch(*desc, BATCH_SIZE);
-        int batches_read = 0;
-        do {
-            batch.reset();
-            Status status = stream->get_next(&batch, &eos);
-            EXPECT_TRUE(status.ok());
-            ++batches_read;
-            for (int i = 0; i < batch.num_rows(); ++i) {
-                AppendRowTuples(batch.get_row(i), results);
-            }
-        } while (!eos && (num_batches < 0 || batches_read <= num_batches));
-    }
-
-    virtual void VerifyResults(const std::vector<int>& results, int exp_rows, bool gen_null) {
-        const int int_tuples = _int_desc->tuple_descriptors().size();
-        EXPECT_EQ(results.size(), exp_rows * int_tuples);
-        for (int i = 0; i < exp_rows; ++i) {
-            for (int j = 0; j < int_tuples; ++j) {
-                int idx = i * int_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    EXPECT_EQ(results[idx], GenIntValue(i))
-                            << " results[" << idx << "]: " << results[idx]
-                            << " != " << GenIntValue(i) << " gen_null=" << gen_null;
-                } else {
-                    EXPECT_TRUE(results[idx] == std::numeric_limits<int>::max())
-                            << "i: " << i << " j: " << j << " results[" << idx
-                            << "]: " << results[idx] << " != " << std::numeric_limits<int>::max();
-                }
-            }
-        }
-    }
-
-    virtual void VerifyResults(const std::vector<StringValue>& results, int exp_rows,
-                               bool gen_null) {
-        const int string_tuples = _string_desc->tuple_descriptors().size();
-        EXPECT_EQ(results.size(), exp_rows * string_tuples);
-        for (int i = 0; i < exp_rows; ++i) {
-            for (int j = 0; j < string_tuples; ++j) {
-                int idx = i * string_tuples + j;
-                if (!gen_null || GenBoolValue(idx)) {
-                    EXPECT_TRUE(results[idx] == STRINGS[i % NUM_STRINGS])
-                            << "results[" << idx << "] " << results[idx]
-                            << " != " << STRINGS[i % NUM_STRINGS] << " i=" << i
-                            << " gen_null=" << gen_null;
-                } else {
-                    EXPECT_TRUE(results[idx] == StringValue())
-                            << "results[" << idx << "] " << results[idx] << " not nullptr";
-                }
-            }
-        }
-    }
-
-    // Test adding num_batches of ints to the stream and reading them back.
-    template <typename T>
-    void TestValues(int num_batches, RowDescriptor* desc, bool gen_null) {
-        BufferedTupleStream2 stream(_runtime_state, *desc, _runtime_state->block_mgr2(), _client,
-                                    true, false);
-        Status status = stream.init(-1, nullptr, true);
-        EXPECT_TRUE(status.ok()) << status;
-        status = stream.unpin_stream();
-        EXPECT_TRUE(status.ok());
-
-        // Add rows to the stream.
-        int offset = 0;
-        for (int i = 0; i < num_batches; ++i) {
-            RowBatch* batch = nullptr;
-            if (sizeof(T) == sizeof(int)) {
-                batch = CreateIntBatch(offset, BATCH_SIZE, gen_null);
-            } else if (sizeof(T) == sizeof(StringValue)) {
-                batch = CreateStringBatch(offset, BATCH_SIZE, gen_null);
-            } else {
-                DCHECK(false);
-            }
-            for (int j = 0; j < batch->num_rows(); ++j) {
-                bool b = stream.add_row(batch->get_row(j), &status);
-                EXPECT_TRUE(status.ok());
-                if (!b) {
-                    EXPECT_TRUE(stream.using_small_buffers());
-                    bool got_buffer;
-                    status = stream.switch_to_io_buffers(&got_buffer);
-                    EXPECT_TRUE(status.ok());
-                    EXPECT_TRUE(got_buffer);
-                    b = stream.add_row(batch->get_row(j), &status);
-                    EXPECT_TRUE(status.ok());
-                }
-                EXPECT_TRUE(b);
-            }
-            offset += batch->num_rows();
-            // Reset the batch to make sure the stream handles the memory correctly.
-            batch->reset();
-        }
-
-        status = stream.prepare_for_read(false);
-        EXPECT_TRUE(status.ok());
-
-        // Read all the rows back.
-        std::vector<T> results;
-        ReadValues(&stream, desc, &results);
-
-        // Verify the result.
-        VerifyResults(results, BATCH_SIZE * num_batches, gen_null);
-
-        stream.close();
-    }
-
-    void TestIntValuesInterleaved(int num_batches, int num_batches_before_read) {
-        for (int small_buffers = 0; small_buffers < 2; ++small_buffers) {
-            BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(),
-                                        _client, small_buffers == 0, // initial small buffers
-                                        true);                       // read_write
-            Status status = stream.init(-1, nullptr, true);
-            EXPECT_TRUE(status.ok());
-            status = stream.prepare_for_read(true);
-            EXPECT_TRUE(status.ok());
-            status = stream.unpin_stream();
-            EXPECT_TRUE(status.ok());
-
-            std::vector<int> results;
-
-            for (int i = 0; i < num_batches; ++i) {
-                RowBatch* batch = CreateIntBatch(i * BATCH_SIZE, BATCH_SIZE, false);
-                for (int j = 0; j < batch->num_rows(); ++j) {
-                    bool b = stream.add_row(batch->get_row(j), &status);
-                    EXPECT_TRUE(b);
-                    EXPECT_TRUE(status.ok());
-                }
-                // Reset the batch to make sure the stream handles the memory correctly.
-                batch->reset();
-                if (i % num_batches_before_read == 0) {
-                    ReadValues(&stream, _int_desc, &results,
-                               (rand() % num_batches_before_read) + 1);
-                }
-            }
-            ReadValues(&stream, _int_desc, &results);
-
-            VerifyResults(results, BATCH_SIZE * num_batches, false);
-
-            stream.close();
-        }
-    }
-
-    std::unique_ptr<TestEnv> _test_env;
-    RuntimeState* _runtime_state;
-    BufferedBlockMgr2::Client* _client;
-
-    ObjectPool _pool;
-    RowDescriptor* _int_desc;
-    RowDescriptor* _string_desc;
-    std::unique_ptr<MemPool> _mem_pool;
-};
-
-// Tests with a NULLable tuple per row.
-class SimpleNullStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples(1, true);
-        std::vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-}; // SimpleNullStreamTest
-
-// Tests with multiple non-NULLable tuples per row.
-class MultiTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples;
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(false);
-
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        tuple_ids.push_back(static_cast<TTupleId>(2));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-
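The TestValues helper above captures the stream's core write/read protocol. Stripped of the test assertions, the pattern is roughly as follows; this is a sketch using only the pre-deletion API calls exercised above, with `state`, `row_desc`, `client`, `row`, and `batch` assumed to come from the surrounding context:

    BufferedTupleStream2 stream(state, row_desc, state->block_mgr2(), client,
                                true /* use_initial_small_buffers */, false /* read_write */);
    Status status = stream.init(-1, nullptr, true);
    // add_row() returns false (with an OK Status) while the stream is still on
    // small buffers; switch to full I/O-sized buffers and retry once.
    if (!stream.add_row(row, &status) && status.ok()) {
        bool got_buffer = false;
        status = stream.switch_to_io_buffers(&got_buffer);
        if (status.ok() && got_buffer) stream.add_row(row, &status);
    }
    // Reading starts with prepare_for_read(); delete_on_read == false allows
    // the stream to be re-read from the beginning later.
    status = stream.prepare_for_read(false /* delete_on_read */);
    bool eos = false;
    while (status.ok() && !eos) {
        batch.reset();
        status = stream.get_next(&batch, &eos);
        // ... consume the batch ...
    }
    stream.close();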
-// Tests with multiple NULLable tuples per row.
-class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    virtual void create_descriptors() {
-        std::vector<bool> nullable_tuples;
-        nullable_tuples.push_back(false);
-        nullable_tuples.push_back(true);
-        nullable_tuples.push_back(true);
-
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        tuple_ids.push_back(static_cast<TTupleId>(2));
-
-        DescriptorTblBuilder int_builder(&_pool);
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        int_builder.declare_tuple() << TYPE_INT;
-        _int_desc = _pool.add(new RowDescriptor(*int_builder.build(), tuple_ids, nullable_tuples));
-
-        DescriptorTblBuilder string_builder(&_pool);
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        string_builder.declare_tuple() << TYPE_VARCHAR;
-        _string_desc =
-                _pool.add(new RowDescriptor(*string_builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-
-#if 0
-// Tests with collection types.
-class ArrayTupleStreamTest : public SimpleTupleStreamTest {
-protected:
-    RowDescriptor* _array_desc;
-
-    virtual void create_descriptors() {
-        // tuples: (array<string>, array<array<int>>) (array<int>)
-        std::vector<bool> nullable_tuples(2, true);
-        std::vector<TTupleId> tuple_ids;
-        tuple_ids.push_back(static_cast<TTupleId>(0));
-        tuple_ids.push_back(static_cast<TTupleId>(1));
-        TypeDescriptor string_array_type;
-        string_array_type.type = TYPE_ARRAY;
-        string_array_type.children.push_back(TYPE_VARCHAR);
-
-        TypeDescriptor int_array_type;
-        int_array_type.type = TYPE_ARRAY;
-        int_array_type.children.push_back(TYPE_VARCHAR);
-
-        TypeDescriptor nested_array_type;
-        nested_array_type.type = TYPE_ARRAY;
-        nested_array_type.children.push_back(int_array_type);
-
-        DescriptorTblBuilder builder(&_pool);
-        builder.declare_tuple() << string_array_type << nested_array_type;
-        builder.declare_tuple() << int_array_type;
-        _array_desc = _pool.add(new RowDescriptor(
-                *builder.build(), tuple_ids, nullable_tuples));
-    }
-};
-#endif
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleTupleStreamTest, Basic) {
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// #if 0
-// Test with only 1 buffer.
-TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-}
-
-// Test with a few buffers.
-TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPin) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(3 * buffer_size, buffer_size);
-
-    BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(), _client,
-                                true, false);
-    Status status = stream.init(-1, nullptr, true);
-    EXPECT_TRUE(status.ok());
-
-    int offset = 0;
-    bool full = false;
-    while (!full) {
-        RowBatch* batch = CreateIntBatch(offset, BATCH_SIZE, false);
-        int j = 0;
-        for (; j < batch->num_rows(); ++j) {
-            full = !stream.add_row(batch->get_row(j), &status);
-            EXPECT_TRUE(status.ok());
-            if (full) {
-                break;
-            }
-        }
-        offset += j;
-    }
-
-    status = stream.unpin_stream();
-    EXPECT_TRUE(status.ok());
-
-    bool pinned = false;
-    status = stream.pin_stream(false, &pinned);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(pinned);
-
-    std::vector<int> results;
-
-    // Read and verify the result a few times. We should be able to reread the stream
-    // if we don't use delete-on-read mode.
-    int read_iters = 3;
-    for (int i = 0; i < read_iters; ++i) {
-        bool delete_on_read = i == read_iters - 1;
-        status = stream.prepare_for_read(delete_on_read);
-        EXPECT_TRUE(status.ok());
-        results.clear();
-        ReadValues(&stream, _int_desc, &results);
-        VerifyResults(results, offset, false);
-    }
-
-    // After delete_on_read, all blocks aside from the last should be deleted.
-    // Note: this should really be 0, but the BufferedTupleStream2 returns eos before
-    // deleting the last block, rather than after, so the last block isn't deleted
-    // until the stream is closed.
-    DCHECK_EQ(stream.bytes_in_mem(false), buffer_size);
-
-    stream.close();
-
-    DCHECK_EQ(stream.bytes_in_mem(false), 0);
-}
-
-TEST_F(SimpleTupleStreamTest, SmallBuffers) {
-    int buffer_size = 8 * 1024 * 1024;
-    InitBlockMgr(2 * buffer_size, buffer_size);
-
-    BufferedTupleStream2 stream(_runtime_state, *_int_desc, _runtime_state->block_mgr2(), _client,
-                                true, false);
-    Status status = stream.init(-1, nullptr, false);
-    EXPECT_TRUE(status.ok());
-
-    // The initial buffer should be small.
-    EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-
-    RowBatch* batch = CreateIntBatch(0, 1024, false);
-    for (int i = 0; i < batch->num_rows(); ++i) {
-        bool ret = stream.add_row(batch->get_row(i), &status);
-        EXPECT_TRUE(ret);
-        EXPECT_TRUE(status.ok());
-    }
-    EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-    EXPECT_LT(stream.byte_size(), buffer_size);
-    EXPECT_TRUE(stream.using_small_buffers());
-
-    // 40 MB of ints.
-    batch = CreateIntBatch(0, 10 * 1024 * 1024, false);
-    for (int i = 0; i < batch->num_rows(); ++i) {
-        bool ret = stream.add_row(batch->get_row(i), &status);
-        EXPECT_TRUE(status.ok());
-        if (!ret) {
-            EXPECT_TRUE(stream.using_small_buffers());
-            bool got_buffer;
-            status = stream.switch_to_io_buffers(&got_buffer);
-            EXPECT_TRUE(status.ok());
-            EXPECT_TRUE(got_buffer);
-            ret = stream.add_row(batch->get_row(i), &status);
-            EXPECT_TRUE(status.ok());
-        }
-        EXPECT_TRUE(ret);
-    }
-    EXPECT_EQ(stream.bytes_in_mem(false), buffer_size);
-
-    // TODO: Test for IMPALA-2330. In case switch_to_io_buffers() fails to get a buffer,
-    // using_small_buffers() should still return true.
-    stream.close();
-}
-
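UnpinPin and SmallBuffers above are the whole-stream counterparts of the per-block tests earlier in this patch: unpin_stream() lets the stream spill, pin_stream() brings it fully back into memory, and prepare_for_read() decides whether blocks are recycled as they are consumed. In outline (same pre-deletion API; `stream` as constructed in the tests above):

    // Let the stream spill its blocks to scratch files.
    EXPECT_TRUE(stream.unpin_stream().ok());
    // Bring the whole stream back into memory; as with Block::pin(), failure
    // to get buffers is reported through *pinned, not through the Status.
    bool pinned = false;
    EXPECT_TRUE(stream.pin_stream(false, &pinned).ok());
    // With delete_on_read == false the stream can be re-read from the start;
    // with true, each block is freed as soon as it has been returned.
    EXPECT_TRUE(stream.prepare_for_read(false /* delete_on_read */).ok());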
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleNullStreamTest, Basic) {
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-    TestValues<int>(100, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-    TestValues<StringValue>(100, _string_desc, true);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// Test the tuple stream with only 1 buffer and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-}
-
-// Test with a few buffers and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-
-// Test with rows with multiple nullable tuples.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
-    // Each buffer can only hold 100 ints, so this spills quite often.
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(buffer_size, buffer_size);
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-}
-
-// Test with a few buffers.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
-    int buffer_size = 100 * sizeof(int);
-    InitBlockMgr(10 * buffer_size, buffer_size);
-
-    TestValues<int>(1, _int_desc, false);
-    TestValues<int>(10, _int_desc, false);
-    TestValues<int>(100, _int_desc, false);
-    TestValues<int>(1, _int_desc, true);
-    TestValues<int>(10, _int_desc, true);
-    TestValues<int>(100, _int_desc, true);
-
-    TestValues<StringValue>(1, _string_desc, false);
-    TestValues<StringValue>(10, _string_desc, false);
-    TestValues<StringValue>(100, _string_desc, false);
-    TestValues<StringValue>(1, _string_desc, true);
-    TestValues<StringValue>(10, _string_desc, true);
-    TestValues<StringValue>(100, _string_desc, true);
-
-    TestIntValuesInterleaved(1, 1);
-    TestIntValuesInterleaved(10, 5);
-    TestIntValuesInterleaved(100, 15);
-}
-// #endif
-
-#if 0
-// Test that deep copy works with arrays by copying into a BufferedTupleStream2,
-// freeing the original rows, then reading back the rows and verifying the contents.
-TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
-    Status status;
-    InitBlockMgr(-1, 8 * 1024 * 1024);
-    const int NUM_ROWS = 4000;
-    BufferedTupleStream2 stream(_runtime_state, *_array_desc, _runtime_state->block_mgr2(),
-                                _client, false, false);
-    const std::vector<TupleDescriptor*>& tuple_descs = _array_desc->tuple_descriptors();
-    // Write out a predictable pattern of data by iterating over arrays of constants.
-    int strings_index = 0; // we take the mod of this as an index into STRINGS.
-    int array_lens[] = {0, 1, 5, 10, 1000, 2, 49, 20};
-    int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
-    int array_len_index = 0;
-    for (int i = 0; i < NUM_ROWS; ++i) {
-        int expected_row_size = tuple_descs[0]->byte_size() + tuple_descs[1]->byte_size();
-        // gscoped_ptr<TupleRow> row(reinterpret_cast<TupleRow*>(
-        //         malloc(tuple_descs.size() * sizeof(Tuple*))));
-        // gscoped_ptr<Tuple> tuple0(reinterpret_cast<Tuple*>(
-        //         malloc(tuple_descs[0]->byte_size())));
-        // gscoped_ptr<Tuple> tuple1(reinterpret_cast<Tuple*>(
-        //         malloc(tuple_descs[1]->byte_size())));
-        std::unique_ptr<TupleRow> row(reinterpret_cast<TupleRow*>(
-                malloc(tuple_descs.size() * sizeof(Tuple*))));
-        std::unique_ptr<Tuple> tuple0(reinterpret_cast<Tuple*>(
-                malloc(tuple_descs[0]->byte_size())));
-        std::unique_ptr<Tuple> tuple1(reinterpret_cast<Tuple*>(
-                malloc(tuple_descs[1]->byte_size())));
-        memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-        memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-        row->set_tuple(0, tuple0.get());
-        row->set_tuple(1, tuple1.get());
-
-        // Only the array is non-null.
-        tuple0->set_null(tuple_descs[0]->slots()[1]->null_indicator_offset());
-        tuple1->set_null(tuple_descs[1]->slots()[0]->null_indicator_offset());
-        const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-        const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-
-        int array_len = array_lens[array_len_index++ % num_array_lens];
-        CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-        cv->ptr = nullptr;
-        cv->num_tuples = 0;
-        CollectionValueBuilder builder(cv, *item_desc, _mem_pool.get(), array_len);
-        Tuple* array_data;
-        builder.GetFreeMemory(&array_data);
-        expected_row_size += item_desc->byte_size() * array_len;
-
-        // Fill the array with pointers to our constant strings.
-        for (int j = 0; j < array_len; ++j) {
-            const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
-            array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-            RawValue::Write(string, array_data, item_desc->slots()[0], _mem_pool.get());
-            array_data += item_desc->byte_size();
-            expected_row_size += string->len;
-        }
-        builder.CommitTuples(array_len);
-
-        // Check that the internal row size computation gives the correct result.
-        EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-        bool b = stream.add_row(row.get(), &status);
-        EXPECT_TRUE(b);
-        EXPECT_TRUE(status.ok());
-        _mem_pool->FreeAll(); // Free data as soon as possible to smoke out issues.
-    }
-
-    // Read back and verify the data.
-    stream.prepare_for_read(false);
-    strings_index = 0;
-    array_len_index = 0;
-    bool eos = false;
-    int rows_read = 0;
-    RowBatch batch(*_array_desc, BATCH_SIZE);
-    do {
-        batch.reset();
-        EXPECT_TRUE(stream.get_next(&batch, &eos).ok());
-        for (int i = 0; i < batch.num_rows(); ++i) {
-            TupleRow* row = batch.GetRow(i);
-            Tuple* tuple0 = row->get_tuple(0);
-            Tuple* tuple1 = row->get_tuple(1);
-            EXPECT_TRUE(tuple0 != nullptr);
-            EXPECT_TRUE(tuple1 != nullptr);
-            const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-            EXPECT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
-            EXPECT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
-            EXPECT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
-
-            const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-            int expected_array_len = array_lens[array_len_index++ % num_array_lens];
-            CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-            EXPECT_EQ(expected_array_len, cv->num_tuples);
-            for (int j = 0; j < cv->num_tuples; ++j) {
-                Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
-                const SlotDescriptor* string_desc = item_desc->slots()[0];
-                EXPECT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
-                const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
-                const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
-                EXPECT_EQ(*expected, *actual);
-            }
-        }
-        rows_read += batch.num_rows();
-    } while (!eos);
-    EXPECT_EQ(NUM_ROWS, rows_read);
-}
-#endif
-
-// TODO: more tests.
-// - The stream can operate in many modes
-
-} // namespace doris
diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc
index fb8dcc68fe..36c040ca5d 100644
--- a/be/test/runtime/test_env.cc
+++ b/be/test/runtime/test_env.cc
@@ -76,34 +76,6 @@ RuntimeState* TestEnv::create_runtime_state(int64_t query_id) {
     return new RuntimeState(plan_params.params, TQueryOptions(), TQueryGlobals(), _exec_env);
 }
 
-Status TestEnv::create_query_state(int64_t query_id, int max_buffers, int block_size,
-                                   RuntimeState** runtime_state) {
-    *runtime_state = create_runtime_state(query_id);
-    if (*runtime_state == nullptr) {
-        return Status::InternalError("Unexpected error creating RuntimeState");
-    }
-
-    std::shared_ptr<BufferedBlockMgr2> mgr;
-    RETURN_IF_ERROR(BufferedBlockMgr2::create(*runtime_state, (*runtime_state)->runtime_profile(),
-                                              _tmp_file_mgr.get(), block_size, &mgr));
-    (*runtime_state)->set_block_mgr2(mgr);
-    // (*runtime_state)->_block_mgr = mgr;
-
-    _query_states.push_back(std::shared_ptr<RuntimeState>(*runtime_state));
-    return Status::OK();
-}
-
-Status TestEnv::create_query_states(int64_t start_query_id, int num_mgrs, int buffers_per_mgr,
-                                    int block_size, std::vector<RuntimeState*>* runtime_states) {
-    for (int i = 0; i < num_mgrs; ++i) {
-        RuntimeState* runtime_state = nullptr;
-        RETURN_IF_ERROR(create_query_state(start_query_id + i, buffers_per_mgr, block_size,
-                                           &runtime_state));
-        runtime_states->push_back(runtime_state);
-    }
-    return Status::OK();
-}
-
 void TestEnv::tear_down_query_states() {
     _query_states.clear();
 }
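The deleted create_query_state() helper was the glue between the two test files above and the block manager: it created a TestEnv-owned RuntimeState and attached a fresh BufferedBlockMgr2 to it. Its usage, as exercised by InitBlockMgr() in the deleted tuple-stream test, was roughly this (a sketch of the pre-deletion API, not current code):

    TestEnv test_env;
    RuntimeState* state = nullptr;
    // query_id 0, no memory limit, 8 MB blocks; the RuntimeState stays owned
    // by the TestEnv and is torn down via tear_down_query_states().
    EXPECT_TRUE(test_env.create_query_state(0, -1, 8 * 1024 * 1024, &state).ok());
    BufferedBlockMgr2::Client* client = nullptr;
    EXPECT_TRUE(state->block_mgr2()->register_client(0, state, &client).ok());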
"runtime/runtime_state.h" +#include "runtime/tmp_file_mgr.h" namespace doris { @@ -42,16 +42,6 @@ public: // If don't need to open, paths can be empty. void init_storage_engine(bool need_open, const std::vector& paths = {}); - // Create a RuntimeState for a query with a new block manager. The RuntimeState is - // owned by the TestEnv. - Status create_query_state(int64_t query_id, int max_buffers, int block_size, - RuntimeState** runtime_state); - - // Create multiple separate RuntimeStates with associated block managers, e.g. as if - // multiple queries were executing. The RuntimeStates are owned by TestEnv. - Status create_query_states(int64_t start_query_id, int num_mgrs, int buffers_per_mgr, - int block_size, std::vector* runtime_states); - // Destroy all RuntimeStates and block managers created by this TestEnv. void tear_down_query_states();