From 10f822eb4353cbf9769d1ae6ada5632480d5f30e Mon Sep 17 00:00:00 2001 From: HuangWei Date: Fri, 31 Jul 2020 21:57:21 +0800 Subject: [PATCH] [MemTracker] make all MemTrackers shared (#4135) We make all MemTrackers shared, in order to show MemTracker real-time consumptions on the web. As follows: 1. nearly all MemTracker raw ptr -> shared_ptr 2. Use CreateTracker() to create new MemTracker(in order to add itself to its parent) 3. RowBatch & MemPool still use raw ptrs of MemTracker, it's easy to ensure RowBatch & MemPool destructor exec before MemTracker's destructor. So we don't change these code. 4. MemTracker can use RuntimeProfile's counter to calc consumption. So RuntimeProfile's counter need to be shared too. We add a shared counter pool to store the shared counter, don't change other counters of RuntimeProfile. Note that, this PR doesn't change the MemTracker tree structure. So there still have some orphan trackers, e.g. RowBlockV2's MemTracker. If you find some shared MemTrackers are little memory consumption & too time-consuming, you could make them be the orphan, then it's fine to use the raw ptr. 
--- be/src/common/atomic.h | 6 + be/src/common/config.h | 2 + be/src/common/logging.h | 6 + be/src/exec/aggregation_node.cpp | 8 +- be/src/exec/analytic_eval_node.cpp | 22 +- be/src/exec/base_scanner.cpp | 32 +- be/src/exec/base_scanner.h | 2 +- be/src/exec/blocking_join_node.cpp | 4 +- be/src/exec/broker_scan_node.cpp | 4 +- be/src/exec/cross_join_node.cpp | 2 +- be/src/exec/csv_scan_node.cpp | 2 +- be/src/exec/data_sink.cpp | 3 +- be/src/exec/data_sink.h | 4 +- be/src/exec/es_http_scan_node.cpp | 2 +- be/src/exec/es_http_scanner.cpp | 54 +- be/src/exec/es_http_scanner.h | 2 +- be/src/exec/except_node.cpp | 2 +- be/src/exec/exec_node.cpp | 27 +- be/src/exec/exec_node.h | 12 +- be/src/exec/hash_join_node.cpp | 6 +- be/src/exec/hash_table.cpp | 16 +- be/src/exec/hash_table.h | 4 +- be/src/exec/intersect_node.cpp | 2 +- be/src/exec/merge_join_node.h | 9 +- be/src/exec/merge_node.cpp | 2 +- be/src/exec/mysql_scan_node.cpp | 2 +- be/src/exec/olap_rewrite_node.cpp | 2 +- be/src/exec/olap_scan_node.cpp | 2 +- be/src/exec/olap_scanner.cpp | 2 +- be/src/exec/partitioned_aggregation_node.cc | 14 +- be/src/exec/partitioned_hash_table.cc | 105 ++- be/src/exec/partitioned_hash_table.h | 35 +- be/src/exec/repeat_node.cpp | 2 +- be/src/exec/schema_scan_node.cpp | 2 +- be/src/exec/select_node.cpp | 2 +- be/src/exec/set_operation_node.cpp | 4 +- be/src/exec/sort_exec_exprs.cpp | 2 +- be/src/exec/sort_exec_exprs.h | 3 +- be/src/exec/spill_sort_node.cc | 2 +- be/src/exec/tablet_info.cpp | 2 +- be/src/exec/tablet_info.h | 2 +- be/src/exec/tablet_sink.cpp | 15 +- be/src/exec/tablet_sink.h | 4 +- be/src/exec/topn_node.cpp | 6 +- be/src/exec/union_node.cpp | 6 +- be/src/exprs/agg_fn_evaluator.cpp | 8 +- be/src/exprs/agg_fn_evaluator.h | 4 +- be/src/exprs/expr.cpp | 26 +- be/src/exprs/expr.h | 17 +- be/src/exprs/expr_context.cpp | 8 +- be/src/exprs/expr_context.h | 2 +- be/src/exprs/new_agg_fn_evaluator.cc | 39 +- be/src/exprs/new_agg_fn_evaluator.h | 8 +- 
be/src/http/default_path_handlers.cpp | 14 +- be/src/http/default_path_handlers.h | 6 +- be/src/olap/aggregate_func.h | 4 +- be/src/olap/base_compaction.cpp | 2 +- be/src/olap/base_compaction.h | 3 +- be/src/olap/compaction.cpp | 8 +- be/src/olap/compaction.h | 6 +- be/src/olap/cumulative_compaction.cpp | 2 +- be/src/olap/cumulative_compaction.h | 2 +- be/src/olap/delta_writer.cpp | 25 +- be/src/olap/delta_writer.h | 8 +- be/src/olap/fs/file_block_manager.cpp | 2 +- be/src/olap/fs/file_block_manager.h | 2 +- be/src/olap/memtable.cpp | 31 +- be/src/olap/memtable.h | 5 +- be/src/olap/merger.cpp | 2 +- be/src/olap/olap_index.cpp | 15 +- be/src/olap/olap_index.h | 2 +- be/src/olap/reader.h | 2 +- be/src/olap/row_block.cpp | 4 +- be/src/olap/row_block.h | 5 +- be/src/olap/row_block2.cpp | 15 +- be/src/olap/row_block2.h | 2 +- be/src/olap/rowset/alpha_rowset.cpp | 2 +- be/src/olap/rowset/alpha_rowset.h | 2 +- be/src/olap/rowset/alpha_rowset_reader.cpp | 2 +- be/src/olap/rowset/alpha_rowset_reader.h | 4 +- be/src/olap/rowset/beta_rowset.cpp | 2 +- be/src/olap/rowset/beta_rowset.h | 2 +- be/src/olap/rowset/beta_rowset_reader.cpp | 3 +- be/src/olap/rowset/beta_rowset_reader.h | 5 +- be/src/olap/rowset/column_data.cpp | 28 +- be/src/olap/rowset/column_data.h | 8 +- be/src/olap/rowset/rowset.h | 2 +- be/src/olap/rowset/segment_reader.cpp | 11 +- be/src/olap/rowset/segment_reader.h | 4 +- .../rowset/segment_v2/binary_dict_page.cpp | 3 +- .../olap/rowset/segment_v2/binary_dict_page.h | 2 +- .../rowset/segment_v2/bitmap_index_reader.h | 14 +- .../rowset/segment_v2/bitmap_index_writer.cpp | 7 +- .../segment_v2/bloom_filter_index_reader.h | 5 +- .../segment_v2/bloom_filter_index_writer.cpp | 12 +- be/src/olap/rowset/segment_v2/column_reader.h | 20 +- .../segment_v2/indexed_column_writer.cpp | 7 +- .../rowset/segment_v2/indexed_column_writer.h | 2 +- .../olap/rowset/segment_v2/zone_map_index.cpp | 7 +- .../olap/rowset/segment_v2/zone_map_index.h | 2 +- be/src/olap/schema_change.cpp 
| 2 +- be/src/olap/storage_engine.cpp | 10 +- be/src/olap/storage_engine.h | 2 +- be/src/olap/task/engine_checksum_task.cpp | 2 +- be/src/runtime/buffered_block_mgr2.cc | 87 +- be/src/runtime/buffered_block_mgr2.h | 11 +- be/src/runtime/buffered_tuple_stream2.cc | 2 +- be/src/runtime/buffered_tuple_stream3.cc | 4 +- be/src/runtime/buffered_tuple_stream3.h | 2 +- be/src/runtime/bufferpool/buffer_pool.cc | 8 +- be/src/runtime/bufferpool/buffer_pool.h | 4 +- .../runtime/bufferpool/buffer_pool_internal.h | 2 +- .../runtime/bufferpool/reservation_tracker.cc | 14 +- be/src/runtime/data_spliter.cpp | 9 +- be/src/runtime/data_stream_recvr.cc | 12 +- be/src/runtime/data_stream_recvr.h | 6 +- be/src/runtime/data_stream_sender.cpp | 14 +- be/src/runtime/data_stream_sender.h | 2 +- be/src/runtime/disk_io_mgr.cc | 45 +- be/src/runtime/disk_io_mgr.h | 18 +- be/src/runtime/disk_io_mgr_internal.h | 6 +- be/src/runtime/disk_io_mgr_reader_context.cc | 5 +- be/src/runtime/dpp_sink.cpp | 4 +- be/src/runtime/dpp_sink_internal.cpp | 15 +- be/src/runtime/dpp_sink_internal.h | 4 +- be/src/runtime/exec_env.cpp | 9 +- be/src/runtime/exec_env.h | 51 +- be/src/runtime/exec_env_init.cpp | 5 +- be/src/runtime/export_sink.cpp | 4 +- be/src/runtime/export_sink.h | 2 +- be/src/runtime/initial_reservations.cc | 10 +- be/src/runtime/initial_reservations.h | 4 +- be/src/runtime/load_channel.cpp | 6 +- be/src/runtime/load_channel.h | 4 +- be/src/runtime/load_channel_mgr.cpp | 4 +- be/src/runtime/load_channel_mgr.h | 2 +- be/src/runtime/mem_pool.cpp | 18 +- be/src/runtime/mem_tracker.cpp | 744 +++++++++------ be/src/runtime/mem_tracker.h | 879 ++++++++++-------- be/src/runtime/memory_scratch_sink.cpp | 3 +- be/src/runtime/mysql_table_sink.cpp | 15 +- be/src/runtime/mysql_table_sink.h | 2 +- be/src/runtime/plan_fragment_executor.cpp | 14 +- be/src/runtime/plan_fragment_executor.h | 3 +- be/src/runtime/qsorter.cpp | 2 +- be/src/runtime/result_sink.cpp | 3 +- be/src/runtime/row_batch.cpp | 14 +- 
be/src/runtime/runtime_state.cpp | 47 +- be/src/runtime/runtime_state.h | 48 +- be/src/runtime/spill_sorter.cc | 6 +- be/src/runtime/spill_sorter.h | 4 +- be/src/runtime/tablets_channel.cpp | 6 +- be/src/runtime/tablets_channel.h | 4 +- be/src/runtime/test_env.cc | 4 +- be/src/runtime/test_env.h | 8 +- be/src/runtime/vectorized_row_batch.cpp | 4 +- be/src/runtime/vectorized_row_batch.h | 4 +- be/src/testutil/function_utils.cpp | 9 +- be/src/testutil/function_utils.h | 4 +- be/src/util/arrow/row_batch.cpp | 8 +- be/src/util/arrow/row_batch.h | 2 +- be/src/util/runtime_profile.cpp | 24 + be/src/util/runtime_profile.h | 8 + be/test/exec/broker_scan_node_test.cpp | 4 +- be/test/exec/broker_scanner_test.cpp | 30 +- be/test/exec/csv_scan_node_test.cpp | 15 +- be/test/exec/es_scan_node_test.cpp | 2 +- be/test/exec/hash_table_test.cpp | 19 +- be/test/exec/orc_scanner_test.cpp | 15 +- be/test/exec/parquet_scanner_test.cpp | 4 +- be/test/exec/tablet_info_test.cpp | 8 +- be/test/exec/tablet_sink_test.cpp | 20 +- be/test/olap/aggregate_func_test.cpp | 10 +- be/test/olap/column_reader_test.cpp | 2 +- be/test/olap/comparison_predicate_test.cpp | 2 +- be/test/olap/delta_writer_test.cpp | 9 +- be/test/olap/in_list_predicate_test.cpp | 2 +- be/test/olap/key_coder_test.cpp | 4 +- be/test/olap/null_predicate_test.cpp | 2 +- be/test/olap/row_block_v2_test.cpp | 4 +- be/test/olap/row_cursor_test.cpp | 6 +- be/test/olap/rowset/alpha_rowset_test.cpp | 2 +- be/test/olap/rowset/beta_rowset_test.cpp | 4 +- be/test/olap/rowset/rowset_converter_test.cpp | 2 +- .../segment_v2/binary_dict_page_test.cpp | 8 +- .../segment_v2/binary_plain_page_test.cpp | 6 +- .../segment_v2/binary_prefix_page_test.cpp | 8 +- .../rowset/segment_v2/bitmap_index_test.cpp | 4 +- .../segment_v2/bitshuffle_page_test.cpp | 8 +- .../segment_v2/bloom_filter_page_test.cpp | 4 +- .../segment_v2/column_reader_writer_test.cpp | 22 +- .../frame_of_reference_page_test.cpp | 8 +- .../rowset/segment_v2/plain_page_test.cpp | 8 +- 
.../olap/rowset/segment_v2/rle_page_test.cpp | 8 +- .../olap/rowset/segment_v2/segment_test.cpp | 4 +- be/test/olap/schema_change_test.cpp | 2 +- be/test/olap/skiplist_test.cpp | 16 +- be/test/olap/storage_types_test.cpp | 8 +- be/test/runtime/CMakeLists.txt | 11 +- be/test/runtime/buffered_block_mgr2_test.cpp | 47 +- .../runtime/buffered_tuple_stream2_test.cpp | 16 +- be/test/runtime/data_stream_test.cpp | 18 +- be/test/runtime/disk_io_mgr_test.cpp | 140 ++- be/test/runtime/load_channel_mgr_test.cpp | 28 +- be/test/runtime/mem_limit_test.cpp | 206 ++-- be/test/runtime/memory_scratch_sink_test.cpp | 5 +- be/test/util/arrow/arrow_row_batch_test.cpp | 20 +- be/test/util/arrow/arrow_row_block_test.cpp | 24 +- be/test/util/arrow/arrow_work_flow_test.cpp | 25 +- 209 files changed, 2137 insertions(+), 1845 deletions(-) diff --git a/be/src/common/atomic.h b/be/src/common/atomic.h index 766c496f40..a675c2d604 100644 --- a/be/src/common/atomic.h +++ b/be/src/common/atomic.h @@ -201,6 +201,12 @@ public: /// Atomic store with "release" memory-ordering semantic. inline void store(T* val) { _ptr.store(reinterpret_cast(val)); } + /// Store 'new_val' and return the previous value. Implies a Release memory barrier + /// (i.e. the same as Store()). + inline T* swap(T* val) { + return reinterpret_cast(_ptr.swap(reinterpret_cast(val))); + } + private: AtomicInt _ptr; }; diff --git a/be/src/common/config.h b/be/src/common/config.h index d0858aa279..31a62b76a0 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -544,6 +544,8 @@ namespace config { // Whether to continue to start be when load tablet from header failed. CONF_Bool(ignore_rowset_stale_unconsistent_delete, "false"); + // Soft memory limit as a fraction of hard memory limit. 
+ CONF_Double(soft_mem_limit_frac, "0.9"); } // namespace config } // namespace doris diff --git a/be/src/common/logging.h b/be/src/common/logging.h index cad58f39a0..14545bc789 100644 --- a/be/src/common/logging.h +++ b/be/src/common/logging.h @@ -61,4 +61,10 @@ #define VLOG_ROW_IS_ON VLOG_IS_ON(3) #define VLOG_PROGRESS_IS_ON VLOG_IS_ON(2) +/// Define a wrapper around DCHECK for strongly typed enums that print a useful error +/// message on failure. +#define DCHECK_ENUM_EQ(a, b) \ + DCHECK(a == b) << "[ " #a " = " << static_cast(a) << " , " #b " = " \ + << static_cast(b) << " ]" + #endif diff --git a/be/src/exec/aggregation_node.cpp b/be/src/exec/aggregation_node.cpp index bea37b48e4..990fb1641e 100644 --- a/be/src/exec/aggregation_node.cpp +++ b/be/src/exec/aggregation_node.cpp @@ -113,7 +113,7 @@ Status AggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(Expr::prepare( _build_expr_ctxs, state, build_row_desc, expr_mem_tracker())); - _tuple_pool.reset(new MemPool(mem_tracker())); + _tuple_pool.reset(new MemPool(mem_tracker().get())); _agg_fn_ctxs.resize(_aggregate_evaluators.size()); int j = _probe_expr_ctxs.size(); @@ -128,8 +128,8 @@ Status AggregationNode::prepare(RuntimeState* state) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( - state, child(0)->row_desc(), _tuple_pool.get(), - intermediate_slot_desc, output_slot_desc, mem_tracker(), &_agg_fn_ctxs[i])); + state, child(0)->row_desc(), _tuple_pool.get(), intermediate_slot_desc, + output_slot_desc, mem_tracker(), &_agg_fn_ctxs[i])); state->obj_pool()->add(_agg_fn_ctxs[i]); } @@ -160,7 +160,7 @@ Status AggregationNode::open(RuntimeState* state) { RETURN_IF_ERROR(_children[0]->open(state)); - RowBatch batch(_children[0]->row_desc(), state->batch_size(), mem_tracker()); + RowBatch batch(_children[0]->row_desc(), state->batch_size(), 
mem_tracker().get()); int64_t num_input_rows = 0; int64_t num_agg_rows = 0; diff --git a/be/src/exec/analytic_eval_node.cpp b/be/src/exec/analytic_eval_node.cpp index 400faa5a25..b4e97e506e 100644 --- a/be/src/exec/analytic_eval_node.cpp +++ b/be/src/exec/analytic_eval_node.cpp @@ -146,18 +146,18 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); DCHECK(child(0)->row_desc().is_prefix_of(row_desc())); _child_tuple_desc = child(0)->row_desc().tuple_descriptors()[0]; - _curr_tuple_pool.reset(new MemPool(mem_tracker())); - _prev_tuple_pool.reset(new MemPool(mem_tracker())); - _mem_pool.reset(new MemPool(mem_tracker())); + _curr_tuple_pool.reset(new MemPool(mem_tracker().get())); + _prev_tuple_pool.reset(new MemPool(mem_tracker().get())); + _mem_pool.reset(new MemPool(mem_tracker().get())); _evaluation_timer = ADD_TIMER(runtime_profile(), "EvaluationTime"); DCHECK_EQ(_result_tuple_desc->slots().size(), _evaluators.size()); for (int i = 0; i < _evaluators.size(); ++i) { doris_udf::FunctionContext* ctx; - RETURN_IF_ERROR(_evaluators[i]->prepare(state, child(0)->row_desc(), _mem_pool.get(), - _intermediate_tuple_desc->slots()[i], _result_tuple_desc->slots()[i], - mem_tracker(), &ctx)); + RETURN_IF_ERROR(_evaluators[i]->prepare( + state, child(0)->row_desc(), _mem_pool.get(), _intermediate_tuple_desc->slots()[i], + _result_tuple_desc->slots()[i], mem_tracker(), &ctx)); _fn_ctxs.push_back(ctx); state->obj_pool()->add(ctx); } @@ -171,13 +171,13 @@ Status AnalyticEvalNode::prepare(RuntimeState* state) { if (_partition_by_eq_expr_ctx != NULL) { RETURN_IF_ERROR( - _partition_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker())); + _partition_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker())); //AddExprCtxToFree(_partition_by_eq_expr_ctx); } if (_order_by_eq_expr_ctx != NULL) { RETURN_IF_ERROR( - _order_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker())); + 
_order_by_eq_expr_ctx->prepare(state, cmp_row_desc, expr_mem_tracker())); //AddExprCtxToFree(_order_by_eq_expr_ctx); } } @@ -238,8 +238,8 @@ Status AnalyticEvalNode::open(RuntimeState* state) { // Fetch the first input batch so that some _prev_input_row can be set here to avoid // special casing in GetNext(). - _prev_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); - _curr_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); + _prev_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); + _curr_child_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); while (!_input_eos && _prev_input_row == NULL) { RETURN_IF_ERROR(child(0)->get_next(state, _curr_child_batch.get(), &_input_eos)); @@ -744,7 +744,7 @@ Status AnalyticEvalNode::get_next_output_batch(RuntimeState* state, RowBatch* ou ExprContext** ctxs = &_conjunct_ctxs[0]; int num_ctxs = _conjunct_ctxs.size(); - RowBatch input_batch(child(0)->row_desc(), output_batch->capacity(), mem_tracker()); + RowBatch input_batch(child(0)->row_desc(), output_batch->capacity(), mem_tracker().get()); int64_t stream_idx = _input_stream->rows_returned(); RETURN_IF_ERROR(_input_stream->get_next(&input_batch, eos)); diff --git a/be/src/exec/base_scanner.cpp b/be/src/exec/base_scanner.cpp index d72c301462..73e6d5bed7 100644 --- a/be/src/exec/base_scanner.cpp +++ b/be/src/exec/base_scanner.cpp @@ -27,23 +27,25 @@ namespace doris { -BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params, ScannerCounter* counter) : - _state(state), _params(params), _counter(counter), - _src_tuple(nullptr), - _src_tuple_row(nullptr), +BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile, + const TBrokerScanRangeParams& params, ScannerCounter* counter) + : _state(state), + _params(params), + _counter(counter), + _src_tuple(nullptr), 
+ _src_tuple_row(nullptr), #if BE_TEST - _mem_tracker(new MemTracker()), - _mem_pool(_mem_tracker.get()), + _mem_tracker(new MemTracker()), #else - _mem_tracker(new MemTracker(-1, "Broker Scanner", state->instance_mem_tracker())), - _mem_pool(_state->instance_mem_tracker()), + _mem_tracker(MemTracker::CreateTracker(-1, "Broker Scanner", state->instance_mem_tracker())), #endif - _dest_tuple_desc(nullptr), - _strict_mode(false), - _profile(profile), - _rows_read_counter(nullptr), - _read_timer(nullptr), - _materialize_timer(nullptr) { + _mem_pool(_mem_tracker.get()), + _dest_tuple_desc(nullptr), + _strict_mode(false), + _profile(profile), + _rows_read_counter(nullptr), + _read_timer(nullptr), + _materialize_timer(nullptr) { } Status BaseScanner::open() { @@ -113,7 +115,7 @@ Status BaseScanner::init_expr_ctxes() { } ExprContext* ctx = nullptr; RETURN_IF_ERROR(Expr::create_expr_tree(_state->obj_pool(), it->second, &ctx)); - RETURN_IF_ERROR(ctx->prepare(_state, *_row_desc.get(), _mem_tracker.get())); + RETURN_IF_ERROR(ctx->prepare(_state, *_row_desc.get(), _mem_tracker)); RETURN_IF_ERROR(ctx->open(_state)); _dest_expr_ctx.emplace_back(ctx); if (has_slot_id_map) { diff --git a/be/src/exec/base_scanner.h b/be/src/exec/base_scanner.h index 43461c2e4f..d2335020ed 100644 --- a/be/src/exec/base_scanner.h +++ b/be/src/exec/base_scanner.h @@ -76,7 +76,7 @@ protected: Tuple* _src_tuple; TupleRow* _src_tuple_row; - std::unique_ptr _mem_tracker; + std::shared_ptr _mem_tracker; // Mem pool used to allocate _src_tuple and _src_tuple_row MemPool _mem_pool; diff --git a/be/src/exec/blocking_join_node.cpp b/be/src/exec/blocking_join_node.cpp index e174973f1c..7f60b81b09 100644 --- a/be/src/exec/blocking_join_node.cpp +++ b/be/src/exec/blocking_join_node.cpp @@ -50,7 +50,7 @@ Status BlockingJoinNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); - _build_pool.reset(new MemPool(mem_tracker())); + 
_build_pool.reset(new MemPool(mem_tracker().get())); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _left_child_timer = ADD_TIMER(runtime_profile(), "LeftChildTime"); _build_row_counter = ADD_COUNTER(runtime_profile(), "BuildRows", TUnit::UNIT); @@ -74,7 +74,7 @@ Status BlockingJoinNode::prepare(RuntimeState* state) { _probe_tuple_row_size = num_left_tuples * sizeof(Tuple*); _build_tuple_row_size = num_build_tuples * sizeof(Tuple*); - _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); + _left_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); return Status::OK(); } diff --git a/be/src/exec/broker_scan_node.cpp b/be/src/exec/broker_scan_node.cpp index 53212e1da9..dcadbc8154 100644 --- a/be/src/exec/broker_scan_node.cpp +++ b/be/src/exec/broker_scan_node.cpp @@ -325,7 +325,7 @@ Status BrokerScanNode::scanner_scan( while (!scanner_eof) { // Fill one row batch std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker())); + new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); @@ -382,7 +382,7 @@ Status BrokerScanNode::scanner_scan( // 1. too many batches in queue, or // 2. at least one batch in queue and memory exceed limit. 
(_batch_queue.size() >= _max_buffered_batches - || (mem_tracker()->any_limit_exceeded() && !_batch_queue.empty()))) { + || (mem_tracker()->AnyLimitExceeded(MemLimit::HARD) && !_batch_queue.empty()))) { _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); } // Process already set failed, so we just return OK diff --git a/be/src/exec/cross_join_node.cpp b/be/src/exec/cross_join_node.cpp index 3d6736d12b..c85877df15 100644 --- a/be/src/exec/cross_join_node.cpp +++ b/be/src/exec/cross_join_node.cpp @@ -57,7 +57,7 @@ Status CrossJoinNode::construct_build_side(RuntimeState* state) { while (true) { RowBatch* batch = _build_batch_pool->add( - new RowBatch(child(1)->row_desc(), state->batch_size(), mem_tracker())); + new RowBatch(child(1)->row_desc(), state->batch_size(), mem_tracker().get())); RETURN_IF_CANCELLED(state); // TODO(zhaochun): diff --git a/be/src/exec/csv_scan_node.cpp b/be/src/exec/csv_scan_node.cpp index f8ed70bf87..745f8a7476 100644 --- a/be/src/exec/csv_scan_node.cpp +++ b/be/src/exec/csv_scan_node.cpp @@ -209,7 +209,7 @@ Status CsvScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a csv scanner failed."); } - _tuple_pool.reset(new(std::nothrow) MemPool(state->instance_mem_tracker())); + _tuple_pool.reset(new(std::nothrow) MemPool(state->instance_mem_tracker().get())); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); } diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index 7c75b2f538..50a27cbb9e 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -152,7 +152,8 @@ Status DataSink::init(const TDataSink& thrift_sink) { } Status DataSink::prepare(RuntimeState* state) { - _expr_mem_tracker.reset(new MemTracker(-1, "Data sink", state->instance_mem_tracker())); + _expr_mem_tracker = MemTracker::CreateTracker(-1, std::string("DataSink:") + std::to_string(state->load_job_id()), + state->instance_mem_tracker()); return Status::OK(); } diff --git 
a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index eeaf66e544..1c26e2d4fa 100644 --- a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -63,7 +63,7 @@ public: // It must be okay to call this multiple times. Subsequent calls should // be ignored. virtual Status close(RuntimeState* state, Status exec_status) { - _expr_mem_tracker->close(); + _expr_mem_tracker.reset(); _closed = true; return Status::OK(); } @@ -86,7 +86,7 @@ protected: // Set to true after close() has been called. subclasses should check and set this in // close(). bool _closed; - std::unique_ptr _expr_mem_tracker; + std::shared_ptr _expr_mem_tracker; // Maybe this will be transferred to BufferControlBlock. std::shared_ptr _query_statistics; diff --git a/be/src/exec/es_http_scan_node.cpp b/be/src/exec/es_http_scan_node.cpp index fa7b895805..9cc9b25f32 100644 --- a/be/src/exec/es_http_scan_node.cpp +++ b/be/src/exec/es_http_scan_node.cpp @@ -317,7 +317,7 @@ Status EsHttpScanNode::scanner_scan( while (!scanner_eof) { // Fill one row batch std::shared_ptr row_batch( - new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker())); + new RowBatch(row_desc(), _runtime_state->batch_size(), mem_tracker().get())); // create new tuple buffer for row_batch MemPool* tuple_pool = row_batch->tuple_data_pool(); diff --git a/be/src/exec/es_http_scanner.cpp b/be/src/exec/es_http_scanner.cpp index f04408088f..6890010ef8 100644 --- a/be/src/exec/es_http_scanner.cpp +++ b/be/src/exec/es_http_scanner.cpp @@ -30,37 +30,33 @@ namespace doris { -EsHttpScanner::EsHttpScanner( - RuntimeState* state, - RuntimeProfile* profile, - TupleId tuple_id, - const std::map& properties, - const std::vector& conjunct_ctxs, - EsScanCounter* counter, - bool doc_value_mode) : - _state(state), - _profile(profile), - _tuple_id(tuple_id), - _properties(properties), - _conjunct_ctxs(conjunct_ctxs), - _next_range(0), - _line_eof(false), - _batch_eof(false), +EsHttpScanner::EsHttpScanner(RuntimeState* state, RuntimeProfile* 
profile, TupleId tuple_id, + const std::map& properties, + const std::vector& conjunct_ctxs, EsScanCounter* counter, + bool doc_value_mode) + : _state(state), + _profile(profile), + _tuple_id(tuple_id), + _properties(properties), + _conjunct_ctxs(conjunct_ctxs), + _next_range(0), + _line_eof(false), + _batch_eof(false), #if BE_TEST - _mem_tracker(new MemTracker()), - _mem_pool(_mem_tracker.get()), -#else - _mem_tracker(new MemTracker(-1, "EsHttp Scanner", state->instance_mem_tracker())), - _mem_pool(_state->instance_mem_tracker()), + _mem_tracker(new MemTracker()), +#else + _mem_tracker( + MemTracker::CreateTracker(-1, "EsHttp Scanner", state->instance_mem_tracker())), #endif - _tuple_desc(nullptr), - _counter(counter), - _es_reader(nullptr), - _es_scroll_parser(nullptr), - _doc_value_mode(doc_value_mode), - _rows_read_counter(nullptr), - _read_timer(nullptr), - _materialize_timer(nullptr) { + _mem_pool(_mem_tracker.get()), + _tuple_desc(nullptr), + _counter(counter), + _es_reader(nullptr), + _es_scroll_parser(nullptr), + _doc_value_mode(doc_value_mode), + _rows_read_counter(nullptr), + _read_timer(nullptr), + _materialize_timer(nullptr) { } EsHttpScanner::~EsHttpScanner() { diff --git a/be/src/exec/es_http_scanner.h b/be/src/exec/es_http_scanner.h index f94eded5ed..3ffa1eae14 100644 --- a/be/src/exec/es_http_scanner.h +++ b/be/src/exec/es_http_scanner.h @@ -87,7 +87,7 @@ private: std::vector _slot_descs; std::unique_ptr _row_desc; - std::unique_ptr _mem_tracker; + std::shared_ptr _mem_tracker; MemPool _mem_pool; const TupleDescriptor* _tuple_desc; diff --git a/be/src/exec/except_node.cpp b/be/src/exec/except_node.cpp index e6de96189a..1411647b62 100644 --- a/be/src/exec/except_node.cpp +++ b/be/src/exec/except_node.cpp @@ -73,7 +73,7 @@ Status ExceptNode::open(RuntimeState* state) { temp_tbl->close(); } // probe - _probe_batch.reset(new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker())); + _probe_batch.reset(new RowBatch(child(i)->row_desc(), 
state->batch_size(), mem_tracker().get())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index 697dcc6a24..5d94c8cbce 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -131,8 +131,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl init_runtime_profile(print_plan_node_type(tnode.node_type)); } -ExecNode::~ExecNode() { -} +ExecNode::~ExecNode() {} void ExecNode::push_down_predicate( RuntimeState* state, std::list* expr_ctxs) { @@ -150,7 +149,7 @@ void ExecNode::push_down_predicate( if ((*iter)->root()->is_bound(&_tuple_ids)) { // LOG(INFO) << "push down success expr is " << (*iter)->debug_string() // << " and node is " << debug_string(); - (*iter)->prepare(state, row_desc(), _expr_mem_tracker.get()); + (*iter)->prepare(state, row_desc(), _expr_mem_tracker); (*iter)->open(state); _conjunct_ctxs.push_back(*iter); iter = expr_ctxs->erase(iter); @@ -177,8 +176,8 @@ Status ExecNode::prepare(RuntimeState* state) { _rows_returned_counter, runtime_profile()->total_time_counter()), ""); - _mem_tracker.reset(new MemTracker(_runtime_profile.get(), -1, _runtime_profile->name(), state->instance_mem_tracker())); - _expr_mem_tracker.reset(new MemTracker(-1, "Exprs", _mem_tracker.get())); + _mem_tracker = MemTracker::CreateTracker(_runtime_profile.get(), -1, "ExecNode "+ _runtime_profile->name(), state->instance_mem_tracker()); + _expr_mem_tracker = MemTracker::CreateTracker(-1, "ExecNode Exprs", _mem_tracker); _expr_mem_pool.reset(new MemPool(_expr_mem_tracker.get())); // TODO chenhao RETURN_IF_ERROR(Expr::prepare(_conjunct_ctxs, state, row_desc(), expr_mem_tracker())); @@ -246,14 +245,6 @@ Status ExecNode::close(RuntimeState* state) { state->exec_env()->buffer_pool()->DeregisterClient(&_buffer_pool_client); } - if (_expr_mem_tracker != nullptr) { - _expr_mem_tracker->close(); - } - - if (_mem_tracker != 
nullptr) { - _mem_tracker->close(); - } - return result; } @@ -591,12 +582,10 @@ Status ExecNode::claim_buffer_reservation(RuntimeState* state) { } ss << print_plan_node_type(_type) << " id=" << _id << " ptr=" << this; - RETURN_IF_ERROR(buffer_pool->RegisterClient(ss.str(), - state->instance_buffer_reservation(), - mem_tracker(), buffer_pool->GetSystemBytesLimit(), - runtime_profile(), - &_buffer_pool_client)); - + RETURN_IF_ERROR(buffer_pool->RegisterClient(ss.str(), state->instance_buffer_reservation(), + mem_tracker(), buffer_pool->GetSystemBytesLimit(), + runtime_profile(), &_buffer_pool_client)); + state->initial_reservations()->Claim(&_buffer_pool_client, _resource_profile.min_reservation); /* if (debug_action_ == TDebugAction::SET_DENY_RESERVATION_PROBABILITY && diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index ab51166084..03c3eca4d7 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -205,12 +205,12 @@ public: return _memory_used_counter; } - MemTracker* mem_tracker() const { - return _mem_tracker.get(); + std::shared_ptr mem_tracker() const { + return _mem_tracker; } - MemTracker* expr_mem_tracker() const { - return _expr_mem_tracker.get(); + std::shared_ptr expr_mem_tracker() const { + return _expr_mem_tracker; } MemPool* expr_mem_pool() { @@ -313,10 +313,10 @@ protected: boost::scoped_ptr _runtime_profile; /// Account for peak memory used by this node - boost::scoped_ptr _mem_tracker; + std::shared_ptr _mem_tracker; /// MemTracker used by 'expr_mem_pool_'. - boost::scoped_ptr _expr_mem_tracker; + std::shared_ptr _expr_mem_tracker; /// MemPool for allocating data structures used by expression evaluators in this node. /// Created in Prepare(). 
diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index 3065cd8d6e..dea793ecfd 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -81,7 +81,7 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { Status HashJoinNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); - _build_pool.reset(new MemPool(mem_tracker())); + _build_pool.reset(new MemPool(mem_tracker().get())); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _push_down_timer = @@ -139,7 +139,7 @@ Status HashJoinNode::prepare(RuntimeState* state) { _build_expr_ctxs, _probe_expr_ctxs, _build_tuple_size, stores_nulls, _is_null_safe_eq_join, id(), mem_tracker(), 1024)); - _probe_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); + _probe_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); return Status::OK(); } @@ -185,7 +185,7 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) { // The hash join node needs to keep in memory all build tuples, including the tuple // row ptrs. The row ptrs are copied into the hash table's internal structure so they // don't need to be stored in the _build_pool. 
- RowBatch build_batch(child(1)->row_desc(), state->batch_size(), mem_tracker()); + RowBatch build_batch(child(1)->row_desc(), state->batch_size(), mem_tracker().get()); RETURN_IF_ERROR(child(1)->open(state)); while (true) { diff --git a/be/src/exec/hash_table.cpp b/be/src/exec/hash_table.cpp index eab95d4209..48b70b52f8 100644 --- a/be/src/exec/hash_table.cpp +++ b/be/src/exec/hash_table.cpp @@ -33,7 +33,7 @@ HashTable::HashTable(const vector& build_expr_ctxs, int num_build_tuples, bool stores_nulls, const std::vector& finds_nulls, int32_t initial_seed, - MemTracker* mem_tracker, int64_t num_buckets) : + const std::shared_ptr& mem_tracker, int64_t num_buckets) : _build_expr_ctxs(build_expr_ctxs), _probe_expr_ctxs(probe_expr_ctxs), _num_build_tuples(num_build_tuples), @@ -47,14 +47,14 @@ HashTable::HashTable(const vector& build_expr_ctxs, _exceeded_limit(false), _mem_tracker(mem_tracker), _mem_limit_exceeded(false) { - DCHECK(mem_tracker != NULL); + DCHECK(_mem_tracker); DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size()); DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2"; _buckets.resize(num_buckets); _num_buckets = num_buckets; _num_buckets_till_resize = MAX_BUCKET_OCCUPANCY_FRACTION * _num_buckets; - _mem_tracker->consume(_buckets.capacity() * sizeof(Bucket)); + _mem_tracker->Consume(_buckets.capacity() * sizeof(Bucket)); // Compute the layout and buffer size to store the evaluated expr results _results_buffer_size = Expr::compute_results_layout(_build_expr_ctxs, @@ -67,7 +67,7 @@ HashTable::HashTable(const vector& build_expr_ctxs, _nodes = reinterpret_cast(malloc(_nodes_capacity * _node_byte_size)); memset(_nodes, 0, _nodes_capacity * _node_byte_size); - _mem_tracker->consume(_nodes_capacity * _node_byte_size); + _mem_tracker->Consume(_nodes_capacity * _node_byte_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(_nodes_capacity * _node_byte_size); } @@ -81,8 +81,8 @@ void HashTable::close() { delete[] 
_expr_values_buffer; delete[] _expr_value_null_bits; free(_nodes); - _mem_tracker->release(_nodes_capacity * _node_byte_size); - _mem_tracker->release(_buckets.size() * sizeof(Bucket)); + _mem_tracker->Release(_nodes_capacity * _node_byte_size); + _mem_tracker->Release(_buckets.size() * sizeof(Bucket)); } bool HashTable::eval_row(TupleRow* row, const vector& ctxs) { @@ -187,7 +187,7 @@ void HashTable::resize_buckets(int64_t num_buckets) { int64_t old_num_buckets = _num_buckets; int64_t delta_bytes = (num_buckets - old_num_buckets) * sizeof(Bucket); - if (!_mem_tracker->try_consume(delta_bytes)) { + if (!_mem_tracker->TryConsume(delta_bytes)) { mem_limit_exceeded(delta_bytes); return; } @@ -248,7 +248,7 @@ void HashTable::grow_node_array() { free(_nodes); _nodes = new_nodes; - _mem_tracker->consume(new_size - old_size); + _mem_tracker->Consume(new_size - old_size); if (_mem_tracker->limit_exceeded()) { mem_limit_exceeded(new_size - old_size); } diff --git a/be/src/exec/hash_table.h b/be/src/exec/hash_table.h index 06544b34d0..010c9b7d96 100644 --- a/be/src/exec/hash_table.h +++ b/be/src/exec/hash_table.h @@ -94,7 +94,7 @@ public: int num_build_tuples, bool stores_nulls, const std::vector& finds_nulls, int32_t initial_seed, - MemTracker* mem_tracker, + const std::shared_ptr& mem_tracker, int64_t num_buckets); ~HashTable(); @@ -401,7 +401,7 @@ private: bool _exceeded_limit; // true if any of _mem_trackers[].limit_exceeded() - MemTracker* _mem_tracker; + std::shared_ptr _mem_tracker; // Set to true if the hash table exceeds the memory limit. If this is set, // subsequent calls to Insert() will be ignored. 
bool _mem_limit_exceeded; diff --git a/be/src/exec/intersect_node.cpp b/be/src/exec/intersect_node.cpp index 39bd10cfca..59f6c2b178 100755 --- a/be/src/exec/intersect_node.cpp +++ b/be/src/exec/intersect_node.cpp @@ -77,7 +77,7 @@ Status IntersectNode::open(RuntimeState* state) { } } // probe - _probe_batch.reset(new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker())); + _probe_batch.reset(new RowBatch(child(i)->row_desc(), state->batch_size(), mem_tracker().get())); ScopedTimer probe_timer(_probe_timer); RETURN_IF_ERROR(child(i)->open(state)); eos = false; diff --git a/be/src/exec/merge_join_node.h b/be/src/exec/merge_join_node.h index 758c9fa3fb..1dab87fc2b 100644 --- a/be/src/exec/merge_join_node.h +++ b/be/src/exec/merge_join_node.h @@ -66,9 +66,12 @@ private: int row_idx; bool is_eos; TupleRow* current_row; - ChildReaderContext(const RowDescriptor& desc, int batch_size, MemTracker* mem_tracker) : - batch(desc, batch_size, mem_tracker), row_idx(0), is_eos(false), current_row(NULL) { - } + ChildReaderContext(const RowDescriptor& desc, int batch_size, + const std::shared_ptr& mem_tracker) + : batch(desc, batch_size, mem_tracker.get()), + row_idx(0), + is_eos(false), + current_row(NULL) {} }; // _left_batch must be cleared before calling get_next(). used cache child(0)'s data // _rigth_batch must be cleared before calling get_next(). used cache child(1)'s data diff --git a/be/src/exec/merge_node.cpp b/be/src/exec/merge_node.cpp index 9d393b4947..dabde5902a 100644 --- a/be/src/exec/merge_node.cpp +++ b/be/src/exec/merge_node.cpp @@ -138,7 +138,7 @@ Status MergeNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) if (_child_row_batch.get() == NULL) { RETURN_IF_CANCELLED(state); _child_row_batch.reset( - new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), mem_tracker())); + new RowBatch(child(_child_idx)->row_desc(), state->batch_size(), mem_tracker().get())); // Open child and fetch the first row batch. 
RETURN_IF_ERROR(child(_child_idx)->open(state)); RETURN_IF_ERROR(child(_child_idx)->get_next(state, _child_row_batch.get(), diff --git a/be/src/exec/mysql_scan_node.cpp b/be/src/exec/mysql_scan_node.cpp index f211b07707..2da3507177 100644 --- a/be/src/exec/mysql_scan_node.cpp +++ b/be/src/exec/mysql_scan_node.cpp @@ -83,7 +83,7 @@ Status MysqlScanNode::prepare(RuntimeState* state) { return Status::InternalError("new a mysql scanner failed."); } - _tuple_pool.reset(new(std::nothrow) MemPool(mem_tracker())); + _tuple_pool.reset(new(std::nothrow) MemPool(mem_tracker().get())); if (_tuple_pool.get() == NULL) { return Status::InternalError("new a mem pool failed."); diff --git a/be/src/exec/olap_rewrite_node.cpp b/be/src/exec/olap_rewrite_node.cpp index fcfeee6b0c..d113fe7265 100644 --- a/be/src/exec/olap_rewrite_node.cpp +++ b/be/src/exec/olap_rewrite_node.cpp @@ -55,7 +55,7 @@ Status OlapRewriteNode::prepare(RuntimeState* state) { _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_output_tuple_id); // _child_row_batch.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), state->fragment_mem_tracker())); + new RowBatch(child(0)->row_desc(), state->batch_size(), state->fragment_mem_tracker().get())); _max_decimal_val.resize(_column_types.size()); _max_decimalv2_val.resize(_column_types.size()); diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 86df57cc8a..caec9af467 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -1258,7 +1258,7 @@ void OlapScanNode::scanner_thread(OlapScanner* scanner) { break; } RowBatch *row_batch = new RowBatch( - this->row_desc(), state->batch_size(), _runtime_state->fragment_mem_tracker()); + this->row_desc(), state->batch_size(), _runtime_state->fragment_mem_tracker().get()); row_batch->set_scanner_id(scanner->id()); status = scanner->get_batch(_runtime_state, 
row_batch, &eos); if (!status.ok()) { diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index 4888757533..a9bc32e7c7 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -238,7 +238,7 @@ Status OlapScanner::get_batch( bzero(tuple_buf, state->batch_size() * _tuple_desc->byte_size()); Tuple *tuple = reinterpret_cast(tuple_buf); - std::unique_ptr tracker(new MemTracker(state->fragment_mem_tracker()->limit())); + auto tracker = MemTracker::CreateTracker(state->fragment_mem_tracker()->limit(), "OlapScanner"); std::unique_ptr mem_pool(new MemPool(tracker.get())); int64_t raw_rows_threshold = raw_rows_read() + config::doris_scanner_row_num; diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index cec1e0b6fd..2c767e2afe 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -190,8 +190,8 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); state_ = state; - mem_pool_.reset(new MemPool(mem_tracker())); - agg_fn_pool_.reset(new MemPool(expr_mem_tracker())); + mem_pool_.reset(new MemPool(mem_tracker().get())); + agg_fn_pool_.reset(new MemPool(expr_mem_tracker().get())); ht_resize_timer_ = ADD_TIMER(runtime_profile(), "HTResizeTime"); get_results_timer_ = ADD_TIMER(runtime_profile(), "GetResultsTime"); @@ -247,7 +247,7 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(NewAggFnEvaluator::Create(agg_fns_, state, _pool, agg_fn_pool_.get(), &agg_fn_evals_, expr_mem_tracker(), row_desc)); - expr_results_pool_.reset(new MemPool(_expr_mem_tracker.get())); + expr_results_pool_.reset(new MemPool(expr_mem_tracker().get())); if (!grouping_exprs_.empty()) { RowDescriptor build_row_desc(intermediate_tuple_desc_, false); RETURN_IF_ERROR(PartitionedHashTableCtx::Create(_pool, state, build_exprs_, @@ -308,7 +308,7 @@ Status 
PartitionedAggregationNode::open(RuntimeState* state) { // Streaming preaggregations do all processing in GetNext(). if (is_streaming_preagg_) return Status::OK(); - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); + RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); // Read all the rows from the child and process them. bool eos = false; do { @@ -532,7 +532,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state, if (child_batch_ == NULL) { child_batch_.reset(new RowBatch(child(0)->row_desc(), state->batch_size(), - mem_tracker())); + mem_tracker().get())); } do { @@ -722,7 +722,7 @@ PartitionedAggregationNode::Partition::~Partition() { } Status PartitionedAggregationNode::Partition::InitStreams() { - agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker())); + agg_fn_pool.reset(new MemPool(parent->expr_mem_tracker().get())); DCHECK_EQ(agg_fn_evals.size(), 0); NewAggFnEvaluator::ShallowClone(parent->partition_pool_.get(), agg_fn_pool.get(), parent->agg_fn_evals_, &agg_fn_evals); @@ -1363,7 +1363,7 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream3* input_str bool eos = false; const RowDescriptor* desc = AGGREGATED_ROWS ? 
&intermediate_row_desc_ : &(_children[0]->row_desc()); - RowBatch batch(*desc, state_->batch_size(), const_cast(mem_tracker())); + RowBatch batch(*desc, state_->batch_size(), mem_tracker().get()); do { RETURN_IF_ERROR(input_stream->GetNext(&batch, &eos)); RETURN_IF_ERROR( diff --git a/be/src/exec/partitioned_hash_table.cc b/be/src/exec/partitioned_hash_table.cc index ea62abd4a5..e8ab74a2f8 100644 --- a/be/src/exec/partitioned_hash_table.cc +++ b/be/src/exec/partitioned_hash_table.cc @@ -78,19 +78,24 @@ static int64_t NULL_VALUE[] = { }; PartitionedHashTableCtx::PartitionedHashTableCtx(const std::vector& build_exprs, - const std::vector& probe_exprs, bool stores_nulls, - const std::vector& finds_nulls, int32_t initial_seed, - int max_levels, MemPool* mem_pool, MemPool* expr_results_pool) - : build_exprs_(build_exprs), - probe_exprs_(probe_exprs), - stores_nulls_(stores_nulls), - finds_nulls_(finds_nulls), - finds_some_nulls_(std::accumulate( - finds_nulls_.begin(), finds_nulls_.end(), false, std::logical_or())), - level_(0), - scratch_row_(NULL), - mem_pool_(mem_pool), - expr_results_pool_(expr_results_pool) { + const std::vector& probe_exprs, + bool stores_nulls, + const std::vector& finds_nulls, + int32_t initial_seed, int max_levels, + MemPool* mem_pool, MemPool* expr_results_pool, + const std::shared_ptr& tracker) + : tracker_(tracker), + build_exprs_(build_exprs), + probe_exprs_(probe_exprs), + stores_nulls_(stores_nulls), + finds_nulls_(finds_nulls), + finds_some_nulls_(std::accumulate(finds_nulls_.begin(), finds_nulls_.end(), false, + std::logical_or())), + level_(0), + scratch_row_(NULL), + mem_pool_(mem_pool), + expr_results_pool_(expr_results_pool) { + DCHECK(tracker_ != nullptr); DCHECK(!finds_some_nulls_ || stores_nulls_); // Compute the layout and buffer size to store the evaluated expr results DCHECK_EQ(build_exprs_.size(), probe_exprs_.size()); @@ -109,36 +114,38 @@ PartitionedHashTableCtx::PartitionedHashTableCtx(const std::vector& build } Status 
PartitionedHashTableCtx::Init(ObjectPool* pool, RuntimeState* state, int num_build_tuples, - MemTracker* tracker, const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe) { + const RowDescriptor& row_desc, + const RowDescriptor& row_desc_probe) { + int scratch_row_size = sizeof(Tuple*) * num_build_tuples; + scratch_row_ = reinterpret_cast(malloc(scratch_row_size)); + if (UNLIKELY(scratch_row_ == NULL)) { + return Status::InternalError( + Substitute("Failed to allocate $0 bytes for scratch row of " + "PartitionedHashTableCtx.", + scratch_row_size)); + } - int scratch_row_size = sizeof(Tuple*) * num_build_tuples; - scratch_row_ = reinterpret_cast(malloc(scratch_row_size)); - if (UNLIKELY(scratch_row_ == NULL)) { - return Status::InternalError(Substitute("Failed to allocate $0 bytes for scratch row of " - "PartitionedHashTableCtx.", scratch_row_size)); - } + // TODO chenhao replace ExprContext with ScalarFnEvaluator + for (int i = 0; i < build_exprs_.size(); i++) { + ExprContext* context = pool->add(new ExprContext(build_exprs_[i])); + context->prepare(state, row_desc, tracker_); + if (context == nullptr) { + return Status::InternalError("Hashtable init error."); + } + build_expr_evals_.push_back(context); + } + DCHECK_EQ(build_exprs_.size(), build_expr_evals_.size()); - // TODO chenhao replace ExprContext with ScalarFnEvaluator - for (int i = 0; i < build_exprs_.size(); i++) { - ExprContext* context = pool->add(new ExprContext(build_exprs_[i])); - context->prepare(state, row_desc, tracker); - if (context == nullptr) { - return Status::InternalError("Hashtable init error."); - } - build_expr_evals_.push_back(context); - } - DCHECK_EQ(build_exprs_.size(), build_expr_evals_.size()); - - for (int i = 0; i < probe_exprs_.size(); i++) { - ExprContext* context = pool->add(new ExprContext(probe_exprs_[i])); - context->prepare(state, row_desc_probe, tracker); - if (context == nullptr) { - return Status::InternalError("Hashtable init error."); - } - 
probe_expr_evals_.push_back(context); - } - DCHECK_EQ(probe_exprs_.size(), probe_expr_evals_.size()); - return expr_values_cache_.Init(state, mem_pool_->mem_tracker(), build_exprs_); + for (int i = 0; i < probe_exprs_.size(); i++) { + ExprContext* context = pool->add(new ExprContext(probe_exprs_[i])); + context->prepare(state, row_desc_probe, tracker_); + if (context == nullptr) { + return Status::InternalError("Hashtable init error."); + } + probe_expr_evals_.push_back(context); + } + DCHECK_EQ(probe_exprs_.size(), probe_expr_evals_.size()); + return expr_values_cache_.Init(state, tracker_, build_exprs_); } Status PartitionedHashTableCtx::Create(ObjectPool* pool, RuntimeState* state, @@ -146,12 +153,12 @@ Status PartitionedHashTableCtx::Create(ObjectPool* pool, RuntimeState* state, const std::vector& probe_exprs, bool stores_nulls, const std::vector& finds_nulls, int32_t initial_seed, int max_levels, int num_build_tuples, MemPool* mem_pool, MemPool* expr_results_pool, - MemTracker* tracker, const RowDescriptor& row_desc, + const std::shared_ptr& tracker, const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe, scoped_ptr* ht_ctx) { ht_ctx->reset(new PartitionedHashTableCtx(build_exprs, probe_exprs, stores_nulls, - finds_nulls, initial_seed, max_levels, mem_pool, expr_results_pool)); - return (*ht_ctx)->Init(pool, state, num_build_tuples, tracker, row_desc, row_desc_probe); + finds_nulls, initial_seed, max_levels, mem_pool, expr_results_pool, tracker)); + return (*ht_ctx)->Init(pool, state, num_build_tuples, row_desc, row_desc_probe); } Status PartitionedHashTableCtx::Open(RuntimeState* state) { @@ -168,7 +175,7 @@ Status PartitionedHashTableCtx::Open(RuntimeState* state) { void PartitionedHashTableCtx::Close(RuntimeState* state) { free(scratch_row_); scratch_row_ = NULL; - expr_values_cache_.Close(mem_pool_->mem_tracker()); + expr_values_cache_.Close(tracker_); for (int i = 0; i < build_expr_evals_.size(); i++) { build_expr_evals_[i]->close(state); } @@ 
-310,7 +317,7 @@ PartitionedHashTableCtx::ExprValuesCache::ExprValuesCache() null_bitmap_(0) {} Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, - MemTracker* tracker, const std::vector& build_exprs) { + const std::shared_ptr& tracker, const std::vector& build_exprs) { // Initialize the number of expressions. num_exprs_ = build_exprs.size(); // Compute the layout of evaluated values of a row. @@ -328,7 +335,7 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, MAX_EXPR_VALUES_ARRAY_SIZE / expr_values_bytes_per_row_)); int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); - if (UNLIKELY(!tracker->try_consume(mem_usage))) { + if (UNLIKELY(!tracker->TryConsume(mem_usage))) { capacity_ = 0; string details = Substitute("PartitionedHashTableCtx::ExprValuesCache failed to allocate $0 bytes.", mem_usage); @@ -354,7 +361,7 @@ Status PartitionedHashTableCtx::ExprValuesCache::Init(RuntimeState* state, return Status::OK(); } -void PartitionedHashTableCtx::ExprValuesCache::Close(MemTracker* tracker) { +void PartitionedHashTableCtx::ExprValuesCache::Close(const std::shared_ptr& tracker) { if (capacity_ == 0) return; cur_expr_values_ = NULL; cur_expr_values_null_ = NULL; @@ -365,7 +372,7 @@ void PartitionedHashTableCtx::ExprValuesCache::Close(MemTracker* tracker) { expr_values_hash_array_.reset(); null_bitmap_.Reset(0); int mem_usage = MemUsage(capacity_, expr_values_bytes_per_row_, num_exprs_); - tracker->release(mem_usage); + tracker->Release(mem_usage); } int PartitionedHashTableCtx::ExprValuesCache::MemUsage(int capacity, diff --git a/be/src/exec/partitioned_hash_table.h b/be/src/exec/partitioned_hash_table.h index ab78b2efe9..13209d37c8 100644 --- a/be/src/exec/partitioned_hash_table.h +++ b/be/src/exec/partitioned_hash_table.h @@ -110,16 +110,16 @@ class PartitionedHashTableCtx { /// evaluators for the build and probe expressions will also be allocated. 
/// Please see the comments of HashTableCtx constructor and Init() for details /// of other parameters. - static Status Create(ObjectPool* pool, RuntimeState* state, - const std::vector& build_exprs, - const std::vector& probe_exprs, bool stores_nulls, - const std::vector& finds_nulls, int32_t initial_seed, int max_levels, - int num_build_tuples, MemPool* mem_pool, MemPool* expr_results_pool, - MemTracker* tracker, const RowDescriptor& row_desc, - const RowDescriptor& row_desc_probe, - boost::scoped_ptr* ht_ctx); + static Status Create(ObjectPool* pool, RuntimeState* state, + const std::vector& build_exprs, + const std::vector& probe_exprs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, + int max_levels, int num_build_tuples, MemPool* mem_pool, + MemPool* expr_results_pool, const std::shared_ptr& tracker, + const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe, + boost::scoped_ptr* ht_ctx); - /// Initialize the build and probe expression evaluators. + /// Initialize the build and probe expression evaluators. Status Open(RuntimeState* state); /// Call to cleanup any resources allocated by the expression evaluators. @@ -211,12 +211,12 @@ class PartitionedHashTableCtx { /// Allocates memory and initializes various data structures. Return error status /// if memory allocation leads to the memory limits of the exec node to be exceeded. /// 'tracker' is the memory tracker of the exec node which owns this PartitionedHashTableCtx. - Status Init(RuntimeState* state, MemTracker* tracker, - const std::vector& build_exprs); + Status Init(RuntimeState* state, const std::shared_ptr& tracker, + const std::vector& build_exprs); /// Frees up various resources and updates memory tracker with proper accounting. /// 'tracker' should be the same memory tracker which was passed in for Init(). - void Close(MemTracker* tracker); + void Close(const std::shared_ptr& tracker); /// Resets the cache states (iterators, end pointers etc) before writing. 
void Reset() noexcept; @@ -382,9 +382,10 @@ class PartitionedHashTableCtx { /// in which nulls are stored and columns in which they are not, which could save /// space by not storing some rows we know will never match. PartitionedHashTableCtx(const std::vector& build_exprs, - const std::vector& probe_exprs, bool stores_nulls, - const std::vector& finds_nulls, int32_t initial_seed, - int max_levels, MemPool* mem_pool, MemPool* expr_results_pool); + const std::vector& probe_exprs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, + int max_levels, MemPool* mem_pool, MemPool* expr_results_pool, + const std::shared_ptr& tracker); /// Allocate various buffers for storing expression evaluation results, hash values, /// null bits etc. Also allocate evaluators for the build and probe expressions and @@ -392,7 +393,7 @@ class PartitionedHashTableCtx { /// be exceeded or the evaluators fail to initialize. 'num_build_tuples' is the number /// of tuples of a row in the build side, used for computing the size of a scratch row. Status Init(ObjectPool* pool, RuntimeState* state, int num_build_tuples, - MemTracker* tracker, const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe); + const RowDescriptor& row_desc, const RowDescriptor& row_desc_probe); /// Compute the hash of the values in 'expr_values' with nullness 'expr_values_null'. /// This will be replaced by codegen. 
We don't want this inlined for replacing @@ -454,6 +455,8 @@ class PartitionedHashTableCtx { bool IR_NO_INLINE stores_nulls() const { return stores_nulls_; } bool IR_NO_INLINE finds_some_nulls() const { return finds_some_nulls_; } + std::shared_ptr tracker_; + const std::vector& build_exprs_; std::vector build_expr_evals_; diff --git a/be/src/exec/repeat_node.cpp b/be/src/exec/repeat_node.cpp index 274f07b0fe..bfc22f5793 100644 --- a/be/src/exec/repeat_node.cpp +++ b/be/src/exec/repeat_node.cpp @@ -175,7 +175,7 @@ Status RepeatNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) } _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); + new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); RETURN_IF_ERROR(child(0)->get_next(state, _child_row_batch.get(), &_child_eos)); if (_child_row_batch->num_rows() <= 0) { diff --git a/be/src/exec/schema_scan_node.cpp b/be/src/exec/schema_scan_node.cpp index a0521af94b..6abee99938 100644 --- a/be/src/exec/schema_scan_node.cpp +++ b/be/src/exec/schema_scan_node.cpp @@ -103,7 +103,7 @@ Status SchemaScanNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ScanNode::prepare(state)); // new one mem pool - _tuple_pool.reset(new(std::nothrow) MemPool(mem_tracker())); + _tuple_pool.reset(new(std::nothrow) MemPool(mem_tracker().get())); if (NULL == _tuple_pool.get()) { return Status::InternalError("Allocate MemPool failed."); diff --git a/be/src/exec/select_node.cpp b/be/src/exec/select_node.cpp index 933f97064f..015713f17e 100644 --- a/be/src/exec/select_node.cpp +++ b/be/src/exec/select_node.cpp @@ -35,7 +35,7 @@ SelectNode::SelectNode( Status SelectNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); _child_row_batch.reset( - new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker())); + new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker().get())); return Status::OK(); } diff --git 
a/be/src/exec/set_operation_node.cpp b/be/src/exec/set_operation_node.cpp index 627e139f38..f5552f12c1 100644 --- a/be/src/exec/set_operation_node.cpp +++ b/be/src/exec/set_operation_node.cpp @@ -39,7 +39,7 @@ Status SetOperationNode::prepare(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::prepare(state)); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); DCHECK(_tuple_desc != nullptr); - _build_pool.reset(new MemPool(mem_tracker())); + _build_pool.reset(new MemPool(mem_tracker().get())); _build_timer = ADD_TIMER(runtime_profile(), "BuildTime"); _probe_timer = ADD_TIMER(runtime_profile(), "ProbeTime"); SCOPED_TIMER(_runtime_profile->total_time_counter()); @@ -142,7 +142,7 @@ Status SetOperationNode::open(RuntimeState* state) { // initial build hash table used for remove duplicted _hash_tbl.reset(new HashTable(_child_expr_lists[0], _child_expr_lists[1], _build_tuple_size, true, _find_nulls, id(), mem_tracker(), 1024)); - RowBatch build_batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); + RowBatch build_batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); RETURN_IF_ERROR(child(0)->open(state)); bool eos = false; diff --git a/be/src/exec/sort_exec_exprs.cpp b/be/src/exec/sort_exec_exprs.cpp index 5a3d62c064..3c3c52e11b 100644 --- a/be/src/exec/sort_exec_exprs.cpp +++ b/be/src/exec/sort_exec_exprs.cpp @@ -50,7 +50,7 @@ Status SortExecExprs::init(const std::vector& lhs_ordering_expr_ct Status SortExecExprs::prepare(RuntimeState* state, const RowDescriptor& child_row_desc, const RowDescriptor& output_row_desc, - MemTracker* expr_mem_tracker) { + const std::shared_ptr& expr_mem_tracker) { if (_materialize_tuple) { RETURN_IF_ERROR(Expr::prepare( _sort_tuple_slot_expr_ctxs, state, child_row_desc, expr_mem_tracker)); diff --git a/be/src/exec/sort_exec_exprs.h b/be/src/exec/sort_exec_exprs.h index d78f7f7f95..91cb03ce0f 100644 --- a/be/src/exec/sort_exec_exprs.h +++ b/be/src/exec/sort_exec_exprs.h @@ -44,7 +44,8 @@ public: // 
prepare all expressions used for sorting and tuple materialization. Status prepare(RuntimeState* state, const RowDescriptor& child_row_desc, - const RowDescriptor& output_row_desc, MemTracker* mem_tracker); + const RowDescriptor& output_row_desc, + const std::shared_ptr& mem_tracker); // open all expressions used for sorting and tuple materialization. Status open(RuntimeState* state); diff --git a/be/src/exec/spill_sort_node.cc b/be/src/exec/spill_sort_node.cc index 2d49364958..4463fa8d3d 100644 --- a/be/src/exec/spill_sort_node.cc +++ b/be/src/exec/spill_sort_node.cc @@ -157,7 +157,7 @@ void SpillSortNode::debug_string(int indentation_level, stringstream* out) const } Status SpillSortNode::sort_input(RuntimeState* state) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); + RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); bool eos = false; do { batch.reset(); diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index 8188b20fdc..703d941d99 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -150,7 +150,7 @@ OlapTablePartitionParam::OlapTablePartitionParam( std::shared_ptr schema, const TOlapTablePartitionParam& t_param) : _schema(schema), _t_param(t_param), - _mem_tracker(new MemTracker()), + _mem_tracker(MemTracker::CreateTracker(-1, "OlapTablePartitionParam")), _mem_pool(new MemPool(_mem_tracker.get())) { } diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index f80719ffae..dece0bf830 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -182,7 +182,7 @@ private: std::vector _distributed_slot_descs; ObjectPool _obj_pool; - std::unique_ptr _mem_tracker; + std::shared_ptr _mem_tracker; std::unique_ptr _mem_pool; std::vector _partitions; std::unique_ptr< diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index e626081047..b43829e459 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ 
-65,7 +65,7 @@ Status NodeChannel::init(RuntimeState* state) { _row_desc.reset(new RowDescriptor(_tuple_desc, false)); _batch_size = state->batch_size(); - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker)); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); _stub = state->exec_env()->brpc_stub_cache()->get_stub(_node_info->host, _node_info->brpc_port); if (_stub == nullptr) { @@ -187,7 +187,8 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && _parent->_mem_tracker->any_limit_exceeded() && _pending_batches_num > 0) { + while (!_cancelled && _parent->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) && + _pending_batches_num > 0) { SCOPED_RAW_TIMER(&_mem_exceeded_block_ns); SleepFor(MonoDelta::FromMilliseconds(10)); } @@ -202,7 +203,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { _pending_batches_num++; } - _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker)); + _cur_batch.reset(new RowBatch(*_row_desc, _batch_size, _parent->_mem_tracker.get())); _cur_add_batch_request.clear_tablet_ids(); row_no = _cur_batch->add_row(); @@ -420,7 +421,6 @@ OlapTableSink::~OlapTableSink() { // We clear NodeChannels' batches here, cuz NodeChannels' batches destruction will use // OlapTableSink::_mem_tracker and its parents. // But their destructions are after OlapTableSink's. - // TODO: can be remove after all MemTrackers become shared. 
for (auto index_channel : _channels) { index_channel->for_each_node_channel([](NodeChannel* ch) { ch->clear_all_batches(); }); } @@ -463,13 +463,12 @@ Status OlapTableSink::prepare(RuntimeState* state) { // profile must add to state's object pool _profile = state->obj_pool()->add(new RuntimeProfile("OlapTableSink")); - _mem_tracker = _pool->add(new MemTracker(-1, "OlapTableSink", state->instance_mem_tracker())); + _mem_tracker = MemTracker::CreateTracker(-1, "OlapTableSink", state->instance_mem_tracker()); SCOPED_TIMER(_profile->total_time_counter()); // Prepare the exprs to run. - RETURN_IF_ERROR( - Expr::prepare(_output_expr_ctxs, state, _input_row_desc, _expr_mem_tracker.get())); + RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _input_row_desc, _expr_mem_tracker)); // get table's tuple descriptor _output_tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_desc_id); @@ -497,7 +496,7 @@ Status OlapTableSink::prepare(RuntimeState* state) { } _output_row_desc = _pool->add(new RowDescriptor(_output_tuple_desc, false)); - _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker)); + _output_batch.reset(new RowBatch(*_output_row_desc, state->batch_size(), _mem_tracker.get())); _max_decimal_val.resize(_output_tuple_desc->slots().size()); _min_decimal_val.resize(_output_tuple_desc->slots().size()); diff --git a/be/src/exec/tablet_sink.h b/be/src/exec/tablet_sink.h index 4f61ed15cc..3c993d1eef 100644 --- a/be/src/exec/tablet_sink.h +++ b/be/src/exec/tablet_sink.h @@ -186,6 +186,7 @@ public: Status none_of(std::initializer_list vars); + // TODO(HW): remove after mem tracker shared void clear_all_batches(); private: @@ -315,6 +316,8 @@ private: friend class NodeChannel; friend class IndexChannel; + std::shared_ptr _mem_tracker; + ObjectPool* _pool; const RowDescriptor& _input_row_desc; @@ -350,7 +353,6 @@ private: DorisNodesInfo* _nodes_info = nullptr; RuntimeProfile* _profile = nullptr; - MemTracker* _mem_tracker = nullptr; 
std::set _partition_ids; diff --git a/be/src/exec/topn_node.cpp b/be/src/exec/topn_node.cpp index 304820f969..29fd60e1c4 100644 --- a/be/src/exec/topn_node.cpp +++ b/be/src/exec/topn_node.cpp @@ -62,7 +62,7 @@ Status TopNNode::init(const TPlanNode& tnode, RuntimeState* state) { Status TopNNode::prepare(RuntimeState* state) { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(ExecNode::prepare(state)); - _tuple_pool.reset(new MemPool(mem_tracker())); + _tuple_pool.reset(new MemPool(mem_tracker().get())); RETURN_IF_ERROR(_sort_exec_exprs.prepare( state, child(0)->row_desc(), _row_descriptor, expr_mem_tracker())); // AddExprCtxsToFree(_sort_exec_exprs); @@ -99,7 +99,7 @@ Status TopNNode::open(RuntimeState* state) { // Limit of 0, no need to fetch anything from children. if (_limit != 0) { - RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker()); + RowBatch batch(child(0)->row_desc(), state->batch_size(), mem_tracker().get()); bool eos = false; do { @@ -248,7 +248,7 @@ void TopNNode::push_down_predicate( if ((*iter)->root()->is_bound(&_tuple_ids)) { // LOG(INFO) << "push down success expr is " << (*iter)->debug_string(); // (*iter)->get_child(0)->prepare(state, row_desc()); - (*iter)->prepare(state, row_desc(), _expr_mem_tracker.get()); + (*iter)->prepare(state, row_desc(), _expr_mem_tracker); (*iter)->open(state); _conjunct_ctxs.push_back(*iter); iter = expr_ctxs->erase(iter); diff --git a/be/src/exec/union_node.cpp b/be/src/exec/union_node.cpp index 86ff768be3..a6ff9f75ae 100644 --- a/be/src/exec/union_node.cpp +++ b/be/src/exec/union_node.cpp @@ -84,8 +84,8 @@ Status UnionNode::prepare(RuntimeState* state) { // Prepare result expr lists. 
for (int i = 0; i < _child_expr_lists.size(); ++i) { - RETURN_IF_ERROR(Expr::prepare( - _child_expr_lists[i], state, child(i)->row_desc(), expr_mem_tracker())); + RETURN_IF_ERROR(Expr::prepare(_child_expr_lists[i], state, child(i)->row_desc(), + expr_mem_tracker())); // TODO(zc) // AddExprCtxsToFree(_child_expr_lists[i]); DCHECK_EQ(_child_expr_lists[i].size(), _tuple_desc->slots().size()); @@ -152,7 +152,7 @@ Status UnionNode::get_next_materialized(RuntimeState* state, RowBatch* row_batch if (_child_batch.get() == nullptr) { DCHECK_LT(_child_idx, _children.size()); _child_batch.reset(new RowBatch( - child(_child_idx)->row_desc(), state->batch_size(), mem_tracker())); + child(_child_idx)->row_desc(), state->batch_size(), mem_tracker().get())); _child_row_idx = 0; // open the current child unless it's the first child, which was already opened in // UnionNode::open(). diff --git a/be/src/exprs/agg_fn_evaluator.cpp b/be/src/exprs/agg_fn_evaluator.cpp index 02906ac028..12405526b4 100755 --- a/be/src/exprs/agg_fn_evaluator.cpp +++ b/be/src/exprs/agg_fn_evaluator.cpp @@ -148,7 +148,7 @@ Status AggFnEvaluator::prepare( MemPool* pool, const SlotDescriptor* intermediate_slot_desc, const SlotDescriptor* output_slot_desc, - MemTracker* mem_tracker, + const std::shared_ptr& mem_tracker, FunctionContext** agg_fn_ctx) { DCHECK(pool != NULL); DCHECK(intermediate_slot_desc != NULL); @@ -160,7 +160,7 @@ Status AggFnEvaluator::prepare( _string_buffer_len = 0; _mem_tracker = mem_tracker; - Status status = Expr::prepare(_input_exprs_ctxs, state, desc, pool->mem_tracker()); + Status status = Expr::prepare(_input_exprs_ctxs, state, desc, _mem_tracker); RETURN_IF_ERROR(status); ObjectPool* obj_pool = state->obj_pool(); @@ -276,7 +276,7 @@ Status AggFnEvaluator::open(RuntimeState* state, FunctionContext* agg_fn_ctx) { void AggFnEvaluator::close(RuntimeState* state) { Expr::close(_input_exprs_ctxs, state); if (UNLIKELY(_total_mem_consumption > 0)) { - 
_mem_tracker->release(_total_mem_consumption); + _mem_tracker->Release(_total_mem_consumption); } } @@ -459,7 +459,7 @@ void AggFnEvaluator::update_mem_limlits(int len) { _accumulated_mem_consumption += len; // per 16M , update mem_tracker one time if (UNLIKELY(_accumulated_mem_consumption > 16777216)) { - _mem_tracker->consume(_accumulated_mem_consumption); + _mem_tracker->Consume(_accumulated_mem_consumption); _total_mem_consumption += _accumulated_mem_consumption; _accumulated_mem_consumption = 0; } diff --git a/be/src/exprs/agg_fn_evaluator.h b/be/src/exprs/agg_fn_evaluator.h index eac15ab4b1..0f76aa763a 100755 --- a/be/src/exprs/agg_fn_evaluator.h +++ b/be/src/exprs/agg_fn_evaluator.h @@ -85,7 +85,7 @@ public: MemPool* pool, const SlotDescriptor* intermediate_slot_desc, const SlotDescriptor* output_slot_desc, - MemTracker* mem_tracker, + const std::shared_ptr& mem_tracker, FunctionContext** agg_fn_ctx); Status open(RuntimeState* state, FunctionContext* agg_fn_ctx); @@ -213,7 +213,7 @@ private: std::vector _input_exprs_ctxs; boost::scoped_array _string_buffer; //for count distinct int _string_buffer_len; //for count distinct - MemTracker* _mem_tracker; // saved c'tor param + std::shared_ptr _mem_tracker; // saved c'tor param const TypeDescriptor _return_type; const TypeDescriptor _intermediate_type; diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index b5ade12204..ee89a332c5 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -539,7 +539,7 @@ Status Expr::prepare( const std::vector& ctxs, RuntimeState* state, const RowDescriptor& row_desc, - MemTracker* tracker) { + const std::shared_ptr& tracker) { for (int i = 0; i < ctxs.size(); ++i) { RETURN_IF_ERROR(ctxs[i]->prepare(state, row_desc, tracker)); } @@ -868,13 +868,12 @@ void Expr::assign_fn_ctx_idx(int* next_fn_ctx_idx) { _fn_ctx_idx_end = *next_fn_ctx_idx; } - -Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, Expr** 
scalar_expr, - MemTracker* tracker) { - *scalar_expr = nullptr; - Expr* root; - RETURN_IF_ERROR(create_expr(pool, texpr.nodes[0], &root)); +Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, + ObjectPool* pool, Expr** scalar_expr, + const std::shared_ptr& tracker) { + *scalar_expr = nullptr; + Expr* root; + RETURN_IF_ERROR(create_expr(pool, texpr.nodes[0], &root)); RETURN_IF_ERROR(create_tree(texpr, pool, root)); // TODO pengyubing replace by Init() ExprContext* ctx = pool->add(new ExprContext(root)); @@ -893,9 +892,10 @@ Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, return Status::OK(); } -Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, vector* exprs, MemTracker* tracker) { - exprs->clear(); +Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, RuntimeState* state, + ObjectPool* pool, vector* exprs, + const std::shared_ptr& tracker) { + exprs->clear(); for (const TExpr& texpr: texprs) { Expr* expr; RETURN_IF_ERROR(create(texpr, row_desc, state, pool, &expr, tracker)); @@ -906,12 +906,12 @@ Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, } Status Expr::create(const TExpr& texpr, const RowDescriptor& row_desc, - RuntimeState* state, Expr** scalar_expr, MemTracker* tracker) { + RuntimeState* state, Expr** scalar_expr, const std::shared_ptr& tracker) { return Expr::create(texpr, row_desc, state, state->obj_pool(), scalar_expr, tracker); } Status Expr::create(const vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, vector* exprs, MemTracker* tracker) { + RuntimeState* state, vector* exprs, const std::shared_ptr& tracker) { return Expr::create(texprs, row_desc, state, state->obj_pool(), exprs, tracker); } diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index fa72194ad3..292626492a 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -206,27 +206,30 @@ public: 
/// is stored in ObjectPool 'pool' and returned in 'expr' on success. 'row_desc' is the /// tuple row descriptor of the input tuple row. On failure, 'expr' is set to NULL and /// the expr tree (if created) will be closed. Error status will be returned too. - static Status create(const TExpr& texpr, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, Expr** expr, MemTracker* tracker); + static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, + ObjectPool* pool, Expr** expr, const std::shared_ptr& tracker); /// Create a new ScalarExpr based on thrift Expr 'texpr'. The newly created ScalarExpr /// is stored in ObjectPool 'state->obj_pool()' and returned in 'expr'. 'row_desc' is /// the tuple row descriptor of the input tuple row. Returns error status on failure. - static Status create(const TExpr& texpr, const RowDescriptor& row_desc, - RuntimeState* state, Expr** expr, MemTracker* tracker); + static Status create(const TExpr& texpr, const RowDescriptor& row_desc, RuntimeState* state, + Expr** expr, const std::shared_ptr& tracker); /// Convenience functions creating multiple ScalarExpr. static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, ObjectPool* pool, std::vector* exprs, MemTracker* tracker); + RuntimeState* state, ObjectPool* pool, std::vector* exprs, + const std::shared_ptr& tracker); /// Convenience functions creating multiple ScalarExpr. static Status create(const std::vector& texprs, const RowDescriptor& row_desc, - RuntimeState* state, std::vector* exprs, MemTracker* tracker); + RuntimeState* state, std::vector* exprs, + const std::shared_ptr& tracker); /// Convenience function for preparing multiple expr trees. /// Allocations from 'ctxs' will be counted against 'tracker'. 
static Status prepare(const std::vector& ctxs, RuntimeState* state, - const RowDescriptor& row_desc, MemTracker* tracker); + const RowDescriptor& row_desc, + const std::shared_ptr& tracker); /// Convenience function for opening multiple expr trees. static Status open(const std::vector& ctxs, RuntimeState* state); diff --git a/be/src/exprs/expr_context.cpp b/be/src/exprs/expr_context.cpp index 99506f2df7..68ae418bf3 100644 --- a/be/src/exprs/expr_context.cpp +++ b/be/src/exprs/expr_context.cpp @@ -51,13 +51,13 @@ ExprContext::~ExprContext() { // TODO(zc): memory tracker Status ExprContext::prepare(RuntimeState* state, const RowDescriptor& row_desc, - MemTracker* tracker) { - DCHECK(tracker != NULL) << std::endl << get_stack_trace(); + const std::shared_ptr& tracker) { + DCHECK(tracker != nullptr) << std::endl << get_stack_trace(); DCHECK(_pool.get() == NULL); _prepared = true; - // TODO: use param tracker to replace instance_mem_tracker + // TODO: use param tracker to replace instance_mem_tracker, be careful about tracker's life cycle // _pool.reset(new MemPool(new MemTracker(-1))); - _pool.reset(new MemPool(state->instance_mem_tracker())); + _pool.reset(new MemPool(state->instance_mem_tracker().get())); return _root->prepare(state, row_desc, this); } diff --git a/be/src/exprs/expr_context.h b/be/src/exprs/expr_context.h index ebc0822e10..7fe294cb2d 100644 --- a/be/src/exprs/expr_context.h +++ b/be/src/exprs/expr_context.h @@ -52,7 +52,7 @@ public: /// Prepare expr tree for evaluation. /// Allocations from this context will be counted against 'tracker'. Status prepare(RuntimeState* state, const RowDescriptor& row_desc, - MemTracker* tracker); + const std::shared_ptr& tracker); /// Must be called after calling Prepare(). Does not need to be called on clones. 
/// Idempotent (this allows exprs to be opened multiple times in subplans without diff --git a/be/src/exprs/new_agg_fn_evaluator.cc b/be/src/exprs/new_agg_fn_evaluator.cc index 56409d79ab..f1a72897c7 100644 --- a/be/src/exprs/new_agg_fn_evaluator.cc +++ b/be/src/exprs/new_agg_fn_evaluator.cc @@ -89,18 +89,18 @@ typedef AnyVal (*FinalizeFn)(FunctionContext*, const AnyVal&); const int DEFAULT_MULTI_DISTINCT_COUNT_STRING_BUFFER_SIZE = 1024; -NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, MemTracker* tracker, bool is_clone) - : _total_mem_consumption(0), - _accumulated_mem_consumption(0), - is_clone_(is_clone), - agg_fn_(agg_fn), - mem_pool_(mem_pool), - _mem_tracker(tracker) { -} +NewAggFnEvaluator::NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, + const std::shared_ptr& tracker, bool is_clone) + : _total_mem_consumption(0), + _accumulated_mem_consumption(0), + is_clone_(is_clone), + agg_fn_(agg_fn), + mem_pool_(mem_pool), + _mem_tracker(tracker) {} NewAggFnEvaluator::~NewAggFnEvaluator() { if (UNLIKELY(_total_mem_consumption > 0)) { - _mem_tracker->release(_total_mem_consumption); + _mem_tracker->Release(_total_mem_consumption); } DCHECK(closed_); } @@ -114,11 +114,12 @@ const TypeDescriptor& NewAggFnEvaluator::intermediate_type() const { } Status NewAggFnEvaluator::Create(const AggFn& agg_fn, RuntimeState* state, ObjectPool* pool, - MemPool* mem_pool, NewAggFnEvaluator** result, - MemTracker* tracker, const RowDescriptor& row_desc) { - *result = nullptr; + MemPool* mem_pool, NewAggFnEvaluator** result, + const std::shared_ptr& tracker, + const RowDescriptor& row_desc) { + *result = nullptr; - // Create a new AggFn evaluator. + // Create a new AggFn evaluator. 
NewAggFnEvaluator* agg_fn_eval = pool->add(new NewAggFnEvaluator(agg_fn, mem_pool, tracker, false)); agg_fn_eval->agg_fn_ctx_.reset(FunctionContextImpl::create_context(state, mem_pool, @@ -168,11 +169,13 @@ cleanup: } Status NewAggFnEvaluator::Create(const vector& agg_fns, RuntimeState* state, - ObjectPool* pool, MemPool* mem_pool, vector* evals, - MemTracker* tracker, const RowDescriptor& row_desc) { - for (const AggFn* agg_fn : agg_fns) { - NewAggFnEvaluator* agg_fn_eval; - RETURN_IF_ERROR(NewAggFnEvaluator::Create(*agg_fn, state, pool, mem_pool, + ObjectPool* pool, MemPool* mem_pool, + vector* evals, + const std::shared_ptr& tracker, + const RowDescriptor& row_desc) { + for (const AggFn* agg_fn : agg_fns) { + NewAggFnEvaluator* agg_fn_eval; + RETURN_IF_ERROR(NewAggFnEvaluator::Create(*agg_fn, state, pool, mem_pool, &agg_fn_eval, tracker, row_desc)); evals->push_back(agg_fn_eval); } diff --git a/be/src/exprs/new_agg_fn_evaluator.h b/be/src/exprs/new_agg_fn_evaluator.h index 5d482e92cd..7be1c3d825 100644 --- a/be/src/exprs/new_agg_fn_evaluator.h +++ b/be/src/exprs/new_agg_fn_evaluator.h @@ -69,13 +69,13 @@ class NewAggFnEvaluator { /// from 'mem_pool'. Note that it's the responsibility to call Close() all evaluators /// even if this function returns error status on initialization failure. static Status Create(const AggFn& agg_fn, RuntimeState* state, ObjectPool* pool, - MemPool* mem_pool, NewAggFnEvaluator** eval, MemTracker* tracker, + MemPool* mem_pool, NewAggFnEvaluator** eval, const std::shared_ptr& tracker, const RowDescriptor& row_desc) WARN_UNUSED_RESULT; /// Convenience functions for creating evaluators for multiple aggregate functions. 
static Status Create(const std::vector& agg_fns, RuntimeState* state, ObjectPool* pool, MemPool* mem_pool, std::vector* evals, - MemTracker* tracker, const RowDescriptor& row_desc) WARN_UNUSED_RESULT; + const std::shared_ptr& tracker, const RowDescriptor& row_desc) WARN_UNUSED_RESULT; ~NewAggFnEvaluator(); @@ -223,7 +223,7 @@ class NewAggFnEvaluator { /// Owned by the exec node which owns this evaluator. MemPool* mem_pool_ = nullptr; - MemTracker* _mem_tracker; // saved c'tor param + std::shared_ptr _mem_tracker; // saved c'tor param /// This contains runtime state such as constant input arguments to the aggregate /// functions and a FreePool from which the intermediate values are allocated. @@ -245,7 +245,7 @@ class NewAggFnEvaluator { doris_udf::AnyVal* staging_merge_input_val_ = nullptr; /// Use Create() instead. - NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, MemTracker* tracker, bool is_clone); + NewAggFnEvaluator(const AggFn& agg_fn, MemPool* mem_pool, const std::shared_ptr& tracker, bool is_clone); /// Return the intermediate type of the aggregate function. inline const SlotDescriptor& intermediate_slot_desc() const; diff --git a/be/src/http/default_path_handlers.cpp b/be/src/http/default_path_handlers.cpp index dd854007af..bcc798ae86 100644 --- a/be/src/http/default_path_handlers.cpp +++ b/be/src/http/default_path_handlers.cpp @@ -72,9 +72,9 @@ void config_handler(const WebPageHandler::ArgumentMap& args, std::stringstream* } // Registered to handle "/memz", and prints out memory allocation statistics. -void mem_usage_handler(MemTracker* mem_tracker, const WebPageHandler::ArgumentMap& args, - std::stringstream* output) { - if (mem_tracker != NULL) { +void mem_usage_handler(const std::shared_ptr& mem_tracker, + const WebPageHandler::ArgumentMap& args, std::stringstream* output) { + if (mem_tracker != nullptr) { (*output) << "
"
                   << "Mem Limit: " << PrettyPrinter::print(mem_tracker->limit(), TUnit::BYTES)
                   << std::endl
@@ -100,12 +100,14 @@ void mem_usage_handler(MemTracker* mem_tracker, const WebPageHandler::ArgumentMa
 #endif
 }
 
-void add_default_path_handlers(WebPageHandler* web_page_handler, MemTracker* process_mem_tracker) {
+void add_default_path_handlers(WebPageHandler* web_page_handler,
+                               const std::shared_ptr& process_mem_tracker) {
     // TODO(yingchun): logs_handler is not implemented yet, so not show it on navigate bar
     web_page_handler->register_page("/logs", "Logs", logs_handler, false /* is_on_nav_bar */);
     web_page_handler->register_page("/varz", "Configs", config_handler, true /* is_on_nav_bar */);
-    web_page_handler->register_page("/memz", "Memory",
-        boost::bind(&mem_usage_handler, process_mem_tracker, _1, _2), true /* is_on_nav_bar */);
+    web_page_handler->register_page(
+            "/memz", "Memory", boost::bind(&mem_usage_handler, process_mem_tracker, _1, _2),
+            true /* is_on_nav_bar */);
     register_thread_display_page(web_page_handler);
 }
 
diff --git a/be/src/http/default_path_handlers.h b/be/src/http/default_path_handlers.h
index 06f5c3f9bb..93c0ba6bfa 100644
--- a/be/src/http/default_path_handlers.h
+++ b/be/src/http/default_path_handlers.h
@@ -19,6 +19,7 @@
 #define DORIS_BE_SRC_COMMON_UTIL_DEFAULT_PATH_HANDLERS_H
 
 #include 
+#include 
 
 namespace doris {
 
@@ -27,7 +28,8 @@ class WebPageHandler;
 
 // Adds a set of default path handlers to the webserver to display
 // logs and configuration flags
-void add_default_path_handlers(WebPageHandler* web_page_handler, MemTracker* process_mem_tracker);
-}
+void add_default_path_handlers(WebPageHandler* web_page_handler,
+                               const std::shared_ptr& process_mem_tracker);
+} // namespace doris
 
 #endif // IMPALA_UTIL_DEFAULT_PATH_HANDLERS_H
diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h
index 7e7df69175..ecd46e0ce0 100644
--- a/be/src/olap/aggregate_func.h
+++ b/be/src/olap/aggregate_func.h
@@ -460,7 +460,7 @@ struct AggregateFuncTraitsdata = reinterpret_cast(hll);
 
-        mem_pool->mem_tracker()->consume(sizeof(HyperLogLog));
+        mem_pool->mem_tracker()->Consume(sizeof(HyperLogLog));
 
         agg_pool->add(hll);
     }
@@ -507,7 +507,7 @@ struct AggregateFuncTraitsdata = (char*) bitmap;
 
-        mem_pool->mem_tracker()->consume(sizeof(BitmapValue));
+        mem_pool->mem_tracker()->Consume(sizeof(BitmapValue));
 
         agg_pool->add(bitmap);
     }
diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp
index 0ccb4cae7d..2b6343fbf6 100644
--- a/be/src/olap/base_compaction.cpp
+++ b/be/src/olap/base_compaction.cpp
@@ -22,7 +22,7 @@
 namespace doris {
 
 BaseCompaction::BaseCompaction(TabletSharedPtr tablet, const std::string& label,
-                               MemTracker* parent_tracker)
+                               const std::shared_ptr& parent_tracker)
         : Compaction(tablet, label, parent_tracker) {}
 
 BaseCompaction::~BaseCompaction() { }
diff --git a/be/src/olap/base_compaction.h b/be/src/olap/base_compaction.h
index 9ea54a9275..58d662def2 100644
--- a/be/src/olap/base_compaction.h
+++ b/be/src/olap/base_compaction.h
@@ -29,7 +29,8 @@ namespace doris {
 
 class BaseCompaction : public Compaction {
 public:
-    BaseCompaction(TabletSharedPtr tablet, const std::string& label, MemTracker* parent_tracker);
+    BaseCompaction(TabletSharedPtr tablet, const std::string& label,
+                   const std::shared_ptr& parent_tracker);
     ~BaseCompaction() override;
 
     OLAPStatus compact() override;
diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index 5a74671fb2..67b56f6e76 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -27,9 +27,9 @@ namespace doris {
 
 Semaphore Compaction::_concurrency_sem;
 
-Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, MemTracker* parent_tracker)
-        : _mem_tracker(-1, label, parent_tracker, true),
-          _readers_tracker(-1, "readers tracker", &_mem_tracker, true),
+Compaction::Compaction(TabletSharedPtr tablet, const std::string& label, const std::shared_ptr& parent_tracker)
+        : _mem_tracker(MemTracker::CreateTracker(-1, label, parent_tracker)),
+          _readers_tracker(MemTracker::CreateTracker(-1, "readers tracker", _mem_tracker)),
           _tablet(tablet),
           _input_rowsets_size(0),
           _input_row_num(0),
@@ -153,7 +153,7 @@ OLAPStatus Compaction::construct_output_rowset_writer() {
 OLAPStatus Compaction::construct_input_rowset_readers() {
     for (auto& rowset : _input_rowsets) {
         RowsetReaderSharedPtr rs_reader;
-        RETURN_NOT_OK(rowset->create_reader(&_readers_tracker, &rs_reader));
+        RETURN_NOT_OK(rowset->create_reader(_readers_tracker, &rs_reader));
         _input_rs_readers.push_back(std::move(rs_reader));
     }
     return OLAP_SUCCESS;
diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h
index 9507cc2026..6c4b438aae 100644
--- a/be/src/olap/compaction.h
+++ b/be/src/olap/compaction.h
@@ -44,7 +44,7 @@ class Merger;
 //  4. gc unused rowstes
 class Compaction {
 public:
-    Compaction(TabletSharedPtr tablet, const std::string& label, MemTracker* parent_tracker);
+    Compaction(TabletSharedPtr tablet, const std::string& label, const std::shared_ptr& parent_tracker);
     virtual ~Compaction();
 
     virtual OLAPStatus compact() = 0;
@@ -78,10 +78,10 @@ private:
 
 protected:
     // the root tracker for this compaction
-    MemTracker _mem_tracker;
+    std::shared_ptr _mem_tracker;
 
     // the child of root, only track rowset readers mem
-    MemTracker _readers_tracker;
+    std::shared_ptr _readers_tracker;
     TabletSharedPtr _tablet;
 
     std::vector _input_rowsets;
diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp
index 3494561e83..a5f135869b 100755
--- a/be/src/olap/cumulative_compaction.cpp
+++ b/be/src/olap/cumulative_compaction.cpp
@@ -23,7 +23,7 @@
 namespace doris {
 
 CumulativeCompaction::CumulativeCompaction(TabletSharedPtr tablet, const std::string& label,
-                                           MemTracker* parent_tracker)
+                                           const std::shared_ptr& parent_tracker)
         : Compaction(tablet, label, parent_tracker),
           _cumulative_rowset_size_threshold(config::cumulative_compaction_budgeted_bytes) {}
 
diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h
index 49e5f2c2b3..f32268d661 100755
--- a/be/src/olap/cumulative_compaction.h
+++ b/be/src/olap/cumulative_compaction.h
@@ -27,7 +27,7 @@ namespace doris {
 class CumulativeCompaction : public Compaction {
 public:
     CumulativeCompaction(TabletSharedPtr tablet, const std::string& label,
-                         MemTracker* parent_tracker);
+                         const std::shared_ptr& parent_tracker);
     ~CumulativeCompaction() override;
 
     OLAPStatus compact() override;
diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp
index 0a89bc41a0..74c76a2bb9 100644
--- a/be/src/olap/delta_writer.cpp
+++ b/be/src/olap/delta_writer.cpp
@@ -27,17 +27,24 @@
 
 namespace doris {
 
-OLAPStatus DeltaWriter::open(WriteRequest* req, MemTracker* mem_tracker, DeltaWriter** writer) {
-    *writer = new DeltaWriter(req, mem_tracker, StorageEngine::instance());
+OLAPStatus DeltaWriter::open(WriteRequest* req, const std::shared_ptr& parent,
+                             DeltaWriter** writer) {
+    *writer = new DeltaWriter(req, parent, StorageEngine::instance());
     return OLAP_SUCCESS;
 }
 
-DeltaWriter::DeltaWriter(WriteRequest* req, MemTracker* parent, StorageEngine* storage_engine) :
-        _req(*req), _tablet(nullptr), _cur_rowset(nullptr), _new_rowset(nullptr),
-        _new_tablet(nullptr), _rowset_writer(nullptr), _tablet_schema(nullptr),
-        _delta_written_success(false), _storage_engine(storage_engine) {
-    _mem_tracker.reset(new MemTracker(-1, "delta writer", parent));
-}
+DeltaWriter::DeltaWriter(WriteRequest* req, const std::shared_ptr& parent,
+                         StorageEngine* storage_engine)
+        : _req(*req),
+          _tablet(nullptr),
+          _cur_rowset(nullptr),
+          _new_rowset(nullptr),
+          _new_tablet(nullptr),
+          _rowset_writer(nullptr),
+          _tablet_schema(nullptr),
+          _delta_written_success(false),
+          _storage_engine(storage_engine),
+          _mem_tracker(MemTracker::CreateTracker(-1, "DeltaWriter", parent)) {}
 
 DeltaWriter::~DeltaWriter() {
     if (_is_init && !_delta_written_success) {
@@ -195,7 +202,7 @@ OLAPStatus DeltaWriter::flush_memtable_and_wait() {
 void DeltaWriter::_reset_mem_table() {
     _mem_table.reset(new MemTable(_tablet->tablet_id(), _schema.get(), _tablet_schema, _req.slots,
                                   _req.tuple_desc, _tablet->keys_type(), _rowset_writer.get(),
-                                  _mem_tracker.get()));
+                                  _mem_tracker));
 }
 
 OLAPStatus DeltaWriter::close() {
diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h
index 64828d59c9..2c4eb79a69 100644
--- a/be/src/olap/delta_writer.h
+++ b/be/src/olap/delta_writer.h
@@ -56,7 +56,8 @@ struct WriteRequest {
 // This class is NOT thread-safe, external synchronization is required.
 class DeltaWriter {
 public:
-    static OLAPStatus open(WriteRequest* req, MemTracker* mem_tracker, DeltaWriter** writer);
+    static OLAPStatus open(WriteRequest* req, const std::shared_ptr& parent,
+                           DeltaWriter** writer);
 
     ~DeltaWriter();
 
@@ -83,7 +84,8 @@ public:
     int64_t mem_consumption() const;
 
 private:
-    DeltaWriter(WriteRequest* req, MemTracker* parent, StorageEngine* storage_engine);
+    DeltaWriter(WriteRequest* req, const std::shared_ptr& parent,
+                StorageEngine* storage_engine);
 
     // push a full memtable to flush executor
     OLAPStatus _flush_memtable_async();
@@ -107,7 +109,7 @@ private:
 
     StorageEngine* _storage_engine;
     std::unique_ptr _flush_token;
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
 };
 
 }  // namespace doris
diff --git a/be/src/olap/fs/file_block_manager.cpp b/be/src/olap/fs/file_block_manager.cpp
index 255d22e0b2..291dc0ed61 100644
--- a/be/src/olap/fs/file_block_manager.cpp
+++ b/be/src/olap/fs/file_block_manager.cpp
@@ -381,7 +381,7 @@ Status FileReadableBlock::readv(uint64_t offset, const Slice* results, size_t re
 FileBlockManager::FileBlockManager(Env* env, BlockManagerOptions opts) :
         _env(DCHECK_NOTNULL(env)),
         _opts(std::move(opts)),
-        _mem_tracker(new MemTracker(-1, "file_block_manager", _opts.parent_mem_tracker.get())) {
+        _mem_tracker(MemTracker::CreateTracker(-1, "file_block_manager", _opts.parent_mem_tracker)) {
     if (_opts.enable_metric) {
         _metrics.reset(new internal::BlockManagerMetrics());
     }
diff --git a/be/src/olap/fs/file_block_manager.h b/be/src/olap/fs/file_block_manager.h
index 5f12aa4f2f..4a4782b80e 100644
--- a/be/src/olap/fs/file_block_manager.h
+++ b/be/src/olap/fs/file_block_manager.h
@@ -110,7 +110,7 @@ private:
 
     // Tracks memory consumption of any allocations numerous enough to be
     // interesting.
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
 
     // DISALLOW_COPY_AND_ASSIGN(FileBlockManager);
 
diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp
index 2dd059bd92..25a6a81912 100644
--- a/be/src/olap/memtable.cpp
+++ b/be/src/olap/memtable.cpp
@@ -31,22 +31,21 @@ namespace doris {
 
 MemTable::MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema,
                    const std::vector* slot_descs, TupleDescriptor* tuple_desc,
-                   KeysType keys_type, RowsetWriter* rowset_writer, MemTracker* mem_tracker)
-    : _tablet_id(tablet_id),
-      _schema(schema),
-      _tablet_schema(tablet_schema),
-      _tuple_desc(tuple_desc),
-      _slot_descs(slot_descs),
-      _keys_type(keys_type),
-      _row_comparator(_schema),
-      _rowset_writer(rowset_writer) {
-
-    _schema_size = _schema->schema_size();
-    _mem_tracker.reset(new MemTracker(-1, "memtable", mem_tracker));
-    _buffer_mem_pool.reset(new MemPool(_mem_tracker.get()));
-    _table_mem_pool.reset(new MemPool(_mem_tracker.get()));
-    _skip_list = new Table(_row_comparator, _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS);
-}
+                   KeysType keys_type, RowsetWriter* rowset_writer,
+                   const std::shared_ptr& parent_tracker)
+        : _tablet_id(tablet_id),
+          _schema(schema),
+          _tablet_schema(tablet_schema),
+          _tuple_desc(tuple_desc),
+          _slot_descs(slot_descs),
+          _keys_type(keys_type),
+          _row_comparator(_schema),
+          _mem_tracker(MemTracker::CreateTracker(-1, "MemTable", parent_tracker)),
+          _buffer_mem_pool(new MemPool(_mem_tracker.get())),
+          _table_mem_pool(new MemPool(_mem_tracker.get())),
+          _schema_size(_schema->schema_size()),
+          _skip_list(new Table(_row_comparator, _table_mem_pool.get(), _keys_type == KeysType::DUP_KEYS)),
+          _rowset_writer(rowset_writer) {}
 
 MemTable::~MemTable() {
     delete _skip_list;
diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h
index a7142ac512..583eefdb72 100644
--- a/be/src/olap/memtable.h
+++ b/be/src/olap/memtable.h
@@ -39,7 +39,8 @@ class MemTable {
 public:
     MemTable(int64_t tablet_id, Schema* schema, const TabletSchema* tablet_schema,
              const std::vector* slot_descs, TupleDescriptor* tuple_desc,
-             KeysType keys_type, RowsetWriter* rowset_writer, MemTracker* mem_tracker);
+             KeysType keys_type, RowsetWriter* rowset_writer,
+             const std::shared_ptr& parent_tracker);
     ~MemTable();
 
     int64_t tablet_id() const { return _tablet_id; }
@@ -72,7 +73,7 @@ private:
     KeysType _keys_type;
 
     RowCursorComparator _row_comparator;
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
     // This is a buffer, to hold the memory referenced by the rows that have not
     // been inserted into the SkipList
     std::unique_ptr _buffer_mem_pool;
diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp
index e6e6e1b4f8..c8b262c515 100644
--- a/be/src/olap/merger.cpp
+++ b/be/src/olap/merger.cpp
@@ -48,7 +48,7 @@ OLAPStatus Merger::merge_rowsets(TabletSharedPtr tablet,
                  "failed to init row cursor when merging rowsets of tablet " + tablet->full_name());
     row_cursor.allocate_memory_for_string_type(tablet->tablet_schema());
 
-    std::unique_ptr tracker(new MemTracker(-1));
+    std::shared_ptr tracker(new MemTracker(-1));
     std::unique_ptr mem_pool(new MemPool(tracker.get()));
 
     // The following procedure would last for long time, half of one day, etc.
diff --git a/be/src/olap/olap_index.cpp b/be/src/olap/olap_index.cpp
index 3976bc8940..4b549030de 100644
--- a/be/src/olap/olap_index.cpp
+++ b/be/src/olap/olap_index.cpp
@@ -35,14 +35,13 @@ using std::vector;
 namespace doris {
 
 MemIndex::MemIndex()
-    : _key_length(0),
-      _num_entries(0),
-      _index_size(0),
-      _data_size(0),
-      _num_rows(0) {
-    _tracker.reset(new MemTracker(-1));
-    _mem_pool.reset(new MemPool(_tracker.get()));
-}
+        : _key_length(0),
+          _num_entries(0),
+          _index_size(0),
+          _data_size(0),
+          _num_rows(0),
+          _tracker(new MemTracker(-1)),
+          _mem_pool(new MemPool(_tracker.get())) {}
 
 MemIndex::~MemIndex() {
     _num_entries = 0;
diff --git a/be/src/olap/olap_index.h b/be/src/olap/olap_index.h
index 0db0e97ec5..c4d3714d9d 100644
--- a/be/src/olap/olap_index.h
+++ b/be/src/olap/olap_index.h
@@ -330,7 +330,7 @@ private:
     size_t _num_rows;
     std::vector* _short_key_columns;
 
-    std::unique_ptr _tracker;
+    std::shared_ptr _tracker;
     std::unique_ptr _mem_pool;
     DISALLOW_COPY_AND_ASSIGN(MemIndex);
 };
diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h
index f4ffa59ae9..fe3c8ff02d 100644
--- a/be/src/olap/reader.h
+++ b/be/src/olap/reader.h
@@ -201,7 +201,7 @@ private:
     TabletSharedPtr tablet() { return _tablet; }
 
 private:
-    std::unique_ptr _tracker;
+    std::shared_ptr _tracker;
     std::unique_ptr _predicate_mem_pool;
     std::set _load_bf_columns;
     std::vector _return_columns;
diff --git a/be/src/olap/row_block.cpp b/be/src/olap/row_block.cpp
index 09cf48b18e..f50acf5d34 100644
--- a/be/src/olap/row_block.cpp
+++ b/be/src/olap/row_block.cpp
@@ -37,10 +37,10 @@ using std::vector;
 
 namespace doris {
 
-RowBlock::RowBlock(const TabletSchema* schema, MemTracker* parent_tracker) :
+RowBlock::RowBlock(const TabletSchema* schema, const std::shared_ptr& parent_tracker) :
         _capacity(0),
         _schema(schema) {
-    _tracker.reset(new MemTracker(-1, "RowBlock", parent_tracker, true));
+    _tracker = MemTracker::CreateTracker(-1, "RowBlock", parent_tracker);
     _mem_pool.reset(new MemPool(_tracker.get()));
 }
 
diff --git a/be/src/olap/row_block.h b/be/src/olap/row_block.h
index c9d277f629..1292181f80 100644
--- a/be/src/olap/row_block.h
+++ b/be/src/olap/row_block.h
@@ -56,7 +56,8 @@ class RowBlock {
     friend class RowBlockChanger;
     friend class VectorizedRowBatch;
 public:
-    RowBlock(const TabletSchema* schema, MemTracker* parent_tracker = nullptr);
+    RowBlock(const TabletSchema* schema,
+             const std::shared_ptr& parent_tracker = nullptr);
 
     // 注意回收内部buffer
     ~RowBlock();
@@ -137,7 +138,7 @@ private:
     size_t _limit = 0;
     uint8_t _block_status = DEL_PARTIAL_SATISFIED;
 
-    std::unique_ptr _tracker;
+    std::shared_ptr _tracker;
     std::unique_ptr _mem_pool;
     // 由于内部持有内存资源,所以这里禁止拷贝和赋值
     DISALLOW_COPY_AND_ASSIGN(RowBlock);
diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp
index 1ab8782882..c70477dcbd 100644
--- a/be/src/olap/row_block2.cpp
+++ b/be/src/olap/row_block2.cpp
@@ -27,19 +27,20 @@ using strings::Substitute;
 namespace doris {
 
 RowBlockV2::RowBlockV2(const Schema& schema, uint16_t capacity)
-    : _schema(schema),
-      _capacity(capacity),
-      _column_datas(_schema.num_columns(), nullptr),
-      _column_null_bitmaps(_schema.num_columns(), nullptr),
-      _pool(new MemPool(&_tracker)),
-      _selection_vector(nullptr) {
+        : _schema(schema),
+          _capacity(capacity),
+          _column_datas(_schema.num_columns(), nullptr),
+          _column_null_bitmaps(_schema.num_columns(), nullptr),
+          _tracker(new MemTracker(-1, "RowBlockV2")),
+          _pool(new MemPool(_tracker.get())),
+          _selection_vector(nullptr) {
     auto bitmap_size = BitmapSize(capacity);
     for (auto cid : _schema.column_ids()) {
         size_t data_size = _schema.column(cid)->type_info()->size() * _capacity;
         _column_datas[cid] = new uint8_t[data_size];
 
         if (_schema.column(cid)->is_nullable()) {
-            _column_null_bitmaps[cid] = new uint8_t[bitmap_size];;
+            _column_null_bitmaps[cid] = new uint8_t[bitmap_size];
         }
     }
     _selection_vector = new uint16_t[_capacity];
diff --git a/be/src/olap/row_block2.h b/be/src/olap/row_block2.h
index 671659232b..374c00f4b0 100644
--- a/be/src/olap/row_block2.h
+++ b/be/src/olap/row_block2.h
@@ -123,7 +123,7 @@ private:
     std::vector _column_null_bitmaps;
     size_t _num_rows;
     // manages the memory for slice's data
-    MemTracker _tracker;
+    std::shared_ptr _tracker;
     std::unique_ptr _pool;
 
     // index of selected rows for rows passed the predicate
diff --git a/be/src/olap/rowset/alpha_rowset.cpp b/be/src/olap/rowset/alpha_rowset.cpp
index 337a049b62..138b9dbd3a 100644
--- a/be/src/olap/rowset/alpha_rowset.cpp
+++ b/be/src/olap/rowset/alpha_rowset.cpp
@@ -57,7 +57,7 @@ OLAPStatus AlphaRowset::create_reader(std::shared_ptr* result) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus AlphaRowset::create_reader(MemTracker* parent_tracker,
+OLAPStatus AlphaRowset::create_reader(const std::shared_ptr& parent_tracker,
                                       std::shared_ptr* result) {
     result->reset(new AlphaRowsetReader(_schema->num_rows_per_row_block(),
                                         std::static_pointer_cast(shared_from_this()),
diff --git a/be/src/olap/rowset/alpha_rowset.h b/be/src/olap/rowset/alpha_rowset.h
index bff9618dfa..3c21e7dd21 100644
--- a/be/src/olap/rowset/alpha_rowset.h
+++ b/be/src/olap/rowset/alpha_rowset.h
@@ -42,7 +42,7 @@ public:
 
     OLAPStatus create_reader(std::shared_ptr* result) override;
 
-    OLAPStatus create_reader(MemTracker* parent_tracker,
+    OLAPStatus create_reader(const std::shared_ptr& parent_tracker,
                              std::shared_ptr* result) override;
 
     OLAPStatus split_range(const RowCursor& start_key, const RowCursor& end_key,
diff --git a/be/src/olap/rowset/alpha_rowset_reader.cpp b/be/src/olap/rowset/alpha_rowset_reader.cpp
index 906eeeba63..361b945e3b 100644
--- a/be/src/olap/rowset/alpha_rowset_reader.cpp
+++ b/be/src/olap/rowset/alpha_rowset_reader.cpp
@@ -22,7 +22,7 @@
 namespace doris {
 
 AlphaRowsetReader::AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset,
-                                     MemTracker* parent_tracker)
+                                     const std::shared_ptr& parent_tracker)
         : _num_rows_per_row_block(num_rows_per_row_block),
           _rowset(std::move(rowset)),
           _parent_tracker(parent_tracker),
diff --git a/be/src/olap/rowset/alpha_rowset_reader.h b/be/src/olap/rowset/alpha_rowset_reader.h
index 222523a4b3..711f4c2a60 100644
--- a/be/src/olap/rowset/alpha_rowset_reader.h
+++ b/be/src/olap/rowset/alpha_rowset_reader.h
@@ -53,7 +53,7 @@ struct AlphaMergeContextComparator {
 class AlphaRowsetReader : public RowsetReader {
 public:
     AlphaRowsetReader(int num_rows_per_row_block, AlphaRowsetSharedPtr rowset,
-                      MemTracker* parent_tracker = nullptr);
+                      const std::shared_ptr& parent_tracker = nullptr);
 
     ~AlphaRowsetReader() override;
 
@@ -104,7 +104,7 @@ private:
 private:
     int _num_rows_per_row_block;
     AlphaRowsetSharedPtr _rowset;
-    MemTracker* _parent_tracker;
+    std::shared_ptr _parent_tracker;
     std::string _rowset_path;
     AlphaRowsetMeta* _alpha_rowset_meta;
     const std::vector>& _segment_groups;
diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp
index 784872a8fc..cc531493dd 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -67,7 +67,7 @@ OLAPStatus BetaRowset::create_reader(RowsetReaderSharedPtr* result) {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus BetaRowset::create_reader(MemTracker* parent_tracker, std::shared_ptr* result) {
+OLAPStatus BetaRowset::create_reader(const std::shared_ptr<MemTracker>& parent_tracker, std::shared_ptr<RowsetReader>* result) {
     // NOTE: We use std::static_pointer_cast for performance
     result->reset(new BetaRowsetReader(std::static_pointer_cast<BetaRowset>(shared_from_this()), parent_tracker));
     return OLAP_SUCCESS;
diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h
index 6704fd5e8e..b85e8750b7 100644
--- a/be/src/olap/rowset/beta_rowset.h
+++ b/be/src/olap/rowset/beta_rowset.h
@@ -39,7 +39,7 @@ public:
 
     OLAPStatus create_reader(RowsetReaderSharedPtr* result) override;
 
-    OLAPStatus create_reader(MemTracker* parent_tracker,
+    OLAPStatus create_reader(const std::shared_ptr<MemTracker>& parent_tracker,
                              std::shared_ptr<RowsetReader>* result) override;
 
     static std::string segment_file_path(const std::string& segment_dir, const RowsetId& rowset_id,
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp
index dbc2ea16b0..2f6ce76764 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -27,7 +27,8 @@
 
 namespace doris {
 
-BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset, MemTracker* parent_tracker)
+BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset,
+                                   const std::shared_ptr<MemTracker>& parent_tracker)
         : _rowset(std::move(rowset)), _stats(&_owned_stats), _parent_tracker(parent_tracker) {
     _rowset->aquire();
 }
diff --git a/be/src/olap/rowset/beta_rowset_reader.h b/be/src/olap/rowset/beta_rowset_reader.h
index 368354085b..9ad2de4f81 100644
--- a/be/src/olap/rowset/beta_rowset_reader.h
+++ b/be/src/olap/rowset/beta_rowset_reader.h
@@ -29,7 +29,8 @@ namespace doris {
 
 class BetaRowsetReader : public RowsetReader {
 public:
-    BetaRowsetReader(BetaRowsetSharedPtr rowset, MemTracker* parent_tracker = nullptr);
+    BetaRowsetReader(BetaRowsetSharedPtr rowset,
+                     const std::shared_ptr<MemTracker>& parent_tracker = nullptr);
 
     ~BetaRowsetReader() override { _rowset->release(); }
 
@@ -56,7 +57,7 @@ private:
     OlapReaderStatistics _owned_stats;
     OlapReaderStatistics* _stats;
 
-    MemTracker* _parent_tracker;
+    std::shared_ptr<MemTracker> _parent_tracker;
 
     std::unique_ptr<RowwiseIterator> _iterator;
 
diff --git a/be/src/olap/rowset/column_data.cpp b/be/src/olap/rowset/column_data.cpp
index e33025559c..b1b15cba5a 100644
--- a/be/src/olap/rowset/column_data.cpp
+++ b/be/src/olap/rowset/column_data.cpp
@@ -24,23 +24,25 @@
 
 namespace doris {
 
-ColumnData* ColumnData::create(SegmentGroup* segment_group, MemTracker* parent_tracker) {
+ColumnData* ColumnData::create(SegmentGroup* segment_group,
+                               const std::shared_ptr<MemTracker>& parent_tracker) {
     ColumnData* data = new (std::nothrow) ColumnData(segment_group, parent_tracker);
     return data;
 }
 
-ColumnData::ColumnData(SegmentGroup* segment_group, MemTracker* parent_tracker)
-      : _segment_group(segment_group),
-        _parent_tracker(parent_tracker),
-        _eof(false),
-        _conditions(nullptr),
-        _col_predicates(nullptr),
-        _delete_status(DEL_NOT_SATISFIED),
-        _runtime_state(nullptr),
-        _schema(segment_group->get_tablet_schema()),
-        _is_using_cache(false),
-        _segment_reader(nullptr),
-        _lru_cache(nullptr) {
+ColumnData::ColumnData(SegmentGroup* segment_group,
+                       const std::shared_ptr<MemTracker>& parent_tracker)
+        : _segment_group(segment_group),
+          _parent_tracker(parent_tracker),
+          _eof(false),
+          _conditions(nullptr),
+          _col_predicates(nullptr),
+          _delete_status(DEL_NOT_SATISFIED),
+          _runtime_state(nullptr),
+          _schema(segment_group->get_tablet_schema()),
+          _is_using_cache(false),
+          _segment_reader(nullptr),
+          _lru_cache(nullptr) {
     if (StorageEngine::instance() != nullptr) {
         _lru_cache = StorageEngine::instance()->index_stream_lru_cache();
     } else {
diff --git a/be/src/olap/rowset/column_data.h b/be/src/olap/rowset/column_data.h
index 3363a6cadb..7124dda140 100644
--- a/be/src/olap/rowset/column_data.h
+++ b/be/src/olap/rowset/column_data.h
@@ -40,8 +40,10 @@ class SegmentReader;
 // This class is column data reader. this class will be used in two case.
 class ColumnData {
 public:
-    static ColumnData* create(SegmentGroup* segment_group, MemTracker* parent_tracker = nullptr);
-    ColumnData(SegmentGroup* segment_group, MemTracker* parent_tracker = nullptr);
+    static ColumnData* create(SegmentGroup* segment_group,
+                              const std::shared_ptr<MemTracker>& parent_tracker = nullptr);
+    ColumnData(SegmentGroup* segment_group,
+               const std::shared_ptr<MemTracker>& parent_tracker = nullptr);
     ~ColumnData();
 
     // 为了与之前兼容, 暴露部分index的接口
@@ -155,7 +157,7 @@ private:
     }
 private:
     SegmentGroup* _segment_group;
-    MemTracker* _parent_tracker;
+    std::shared_ptr<MemTracker> _parent_tracker;
     // 当到达文件末尾或者到达end key时设置此标志
     bool _eof;
     const Conditions* _conditions;
diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h
index 07a1b9d530..aac920c2f4 100644
--- a/be/src/olap/rowset/rowset.h
+++ b/be/src/olap/rowset/rowset.h
@@ -120,7 +120,7 @@ public:
     virtual OLAPStatus create_reader(std::shared_ptr<RowsetReader>* result) = 0;
 
     // Support adding parent tracker, but should be careful about destruction sequence.
-    virtual OLAPStatus create_reader(MemTracker* parent_tracker,
+    virtual OLAPStatus create_reader(const std::shared_ptr<MemTracker>& parent_tracker,
                                      std::shared_ptr<RowsetReader>* result) = 0;
 
     // Split range denoted by `start_key` and `end_key` into sub-ranges, each contains roughly
diff --git a/be/src/olap/rowset/segment_reader.cpp b/be/src/olap/rowset/segment_reader.cpp
index 4dcf7cd0aa..b01f476c31 100644
--- a/be/src/olap/rowset/segment_reader.cpp
+++ b/be/src/olap/rowset/segment_reader.cpp
@@ -38,7 +38,7 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group
                              const Conditions* conditions, const DeleteHandler* delete_handler,
                              const DelCondSatisfied delete_status, Cache* lru_cache,
                              RuntimeState* runtime_state, OlapReaderStatistics* stats,
-                             MemTracker* parent_tracker)
+                             const std::shared_ptr<MemTracker>& parent_tracker)
         : _file_name(file),
           _segment_group(segment_group),
           _segment_id(segment_id),
@@ -58,13 +58,12 @@ SegmentReader::SegmentReader(const std::string file, SegmentGroup* segment_group
           _is_using_mmap(false),
           _is_data_loaded(false),
           _buffer_size(0),
+          _tracker(MemTracker::CreateTracker(-1, "SegmentReader", parent_tracker)),
+          _mem_pool(new MemPool(_tracker.get())),
           _shared_buffer(NULL),
           _lru_cache(lru_cache),
           _runtime_state(runtime_state),
-          _stats(stats) {
-    _tracker.reset(new MemTracker(-1, "SegmentReader", parent_tracker, true));
-    _mem_pool.reset(new MemPool(_tracker.get()));
-}
+          _stats(stats) {}
 
 SegmentReader::~SegmentReader() {
     SAFE_DELETE(_shared_buffer);
@@ -253,7 +252,7 @@ OLAPStatus SegmentReader::seek_to_block(
 
         if (_runtime_state != NULL) {
             MemTracker::update_limits(_buffer_size, _runtime_state->mem_trackers());
-            if (MemTracker::limit_exceeded(*_runtime_state->mem_trackers())) {
+            if (MemTracker::limit_exceeded(_runtime_state->mem_trackers())) {
                 return OLAP_ERR_FETCH_MEMORY_EXCEEDED;
             }
         }
diff --git a/be/src/olap/rowset/segment_reader.h b/be/src/olap/rowset/segment_reader.h
index 081ed8bc91..f2b723792d 100644
--- a/be/src/olap/rowset/segment_reader.h
+++ b/be/src/olap/rowset/segment_reader.h
@@ -53,7 +53,7 @@ public:
                   const std::set<uint32_t>& load_bf_columns, const Conditions* conditions,
                   const DeleteHandler* delete_handler, const DelCondSatisfied delete_status,
                   Cache* lru_cache, RuntimeState* runtime_state, OlapReaderStatistics* stats,
-                  MemTracker* parent_tracker = nullptr);
+                  const std::shared_ptr<MemTracker>& parent_tracker = nullptr);
 
     ~SegmentReader();
 
@@ -338,7 +338,7 @@ private:
     std::vector<Cache::Handle*> _cache_handle;
     const FileHeader* _file_header;
 
-    std::unique_ptr<MemTracker> _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     std::unique_ptr<MemPool> _mem_pool;
 
     StorageByteBuffer* _shared_buffer;
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index f94c528e4b..961adfc76e 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -33,7 +33,8 @@ BinaryDictPageBuilder::BinaryDictPageBuilder(const PageBuilderOptions& options)
     _data_page_builder(nullptr),
     _dict_builder(nullptr),
     _encoding_type(DICT_ENCODING),
-    _pool(&_tracker) {
+    _tracker(new MemTracker()),
+    _pool(_tracker.get()) {
     // initially use DICT_ENCODING
     // TODO: the data page builder type can be created by Factory according to user config
     _data_page_builder.reset(new BitshufflePageBuilder(options));
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index 4fbe946c93..635fe590c0 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -91,7 +91,7 @@ private:
     // used to remember the insertion order of dict keys
     std::vector _dict_items;
     // TODO(zc): rethink about this mem pool
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
     faststring _buffer;
     faststring _first_value;
diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h
index 8ac3e5e245..e8ff043054 100644
--- a/be/src/olap/rowset/segment_v2/bitmap_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/bitmap_index_reader.h
@@ -73,12 +73,12 @@ private:
 class BitmapIndexIterator {
 public:
     explicit BitmapIndexIterator(BitmapIndexReader* reader)
-        : _reader(reader),
-          _dict_column_iter(reader->_dict_column_reader.get()),
-          _bitmap_column_iter(reader->_bitmap_column_reader.get()),
-          _current_rowid(0),
-          _pool(new MemPool(&_tracker)) {
-    }
+            : _reader(reader),
+              _dict_column_iter(reader->_dict_column_reader.get()),
+              _bitmap_column_iter(reader->_bitmap_column_reader.get()),
+              _current_rowid(0),
+              _tracker(new MemTracker()),
+              _pool(new MemPool(_tracker.get())) {}
 
     bool has_null_bitmap() const { return _reader->_has_null; }
 
@@ -119,7 +119,7 @@ private:
     IndexedColumnIterator _dict_column_iter;
     IndexedColumnIterator _bitmap_column_iter;
     rowid_t _current_rowid;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     std::unique_ptr<MemPool> _pool;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp
index eb98767f72..62c5328f97 100644
--- a/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/bitmap_index_writer.cpp
@@ -65,7 +65,10 @@ public:
     using MemoryIndexType = typename BitmapIndexTraits::MemoryIndexType;
 
     explicit BitmapIndexWriterImpl(const TypeInfo* typeinfo)
-        : _typeinfo(typeinfo), _reverted_index_size(0), _tracker(), _pool(&_tracker) {}
+            : _typeinfo(typeinfo),
+              _reverted_index_size(0),
+              _tracker(new MemTracker()),
+              _pool(_tracker.get()) {}
 
     ~BitmapIndexWriterImpl() = default;
 
@@ -183,7 +186,7 @@ private:
     Roaring _null_bitmap;
     // unique value to its row id list
     MemoryIndexType _mem_index;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h
index d50b101802..cf3e3c23c9 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.h
@@ -72,7 +72,8 @@ public:
     explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader)
         : _reader(reader),
           _bloom_filter_iter(reader->_bloom_filter_reader.get()),
-          _pool(new MemPool(&_tracker)) {
+          _tracker(new MemTracker()),
+          _pool(new MemPool(_tracker.get())) {
     }
 
     // Read bloom filter at the given ordinal into `bf`.
@@ -85,7 +86,7 @@ public:
 private:
     BloomFilterIndexReader* _reader;
     IndexedColumnIterator _bloom_filter_iter;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     std::unique_ptr<MemPool> _pool;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
index 9b0ae13038..0044622567 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
@@ -71,9 +71,13 @@ public:
     using ValueDict = typename BloomFilterTraits::ValueDict;
 
     explicit BloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options,
-            const TypeInfo* typeinfo)
-        : _bf_options(bf_options), _typeinfo(typeinfo),
-          _tracker(), _pool(&_tracker), _has_null(false), _bf_buffer_size(0) { }
+                                        const TypeInfo* typeinfo)
+            : _bf_options(bf_options),
+              _typeinfo(typeinfo),
+              _tracker(new MemTracker(-1, "BloomFilterIndexWriterImpl")),
+              _pool(_tracker.get()),
+              _has_null(false),
+              _bf_buffer_size(0) {}
 
     ~BloomFilterIndexWriterImpl() = default;
 
@@ -164,7 +168,7 @@ private:
 private:
     BloomFilterOptions _bf_options;
     const TypeInfo* _typeinfo;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
     bool _has_null;
     uint64_t _bf_buffer_size;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 1d2ac88058..65d4d4945a 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -309,14 +309,16 @@ private:
 class DefaultValueColumnIterator : public ColumnIterator {
 public:
     DefaultValueColumnIterator(bool has_default_value, const std::string& default_value,
-            bool is_nullable, FieldType type, size_t schema_length) : _has_default_value(has_default_value),
-                                                _default_value(default_value),
-                                                _is_nullable(is_nullable),
-                                                _type(type),
-                                                _schema_length(schema_length),
-                                                _is_default_value_null(false),
-                                                _type_size(0),
-                                                _pool(new MemPool(&_tracker)){ }
+                               bool is_nullable, FieldType type, size_t schema_length)
+            : _has_default_value(has_default_value),
+              _default_value(default_value),
+              _is_nullable(is_nullable),
+              _type(type),
+              _schema_length(schema_length),
+              _is_default_value_null(false),
+              _type_size(0),
+              _tracker(new MemTracker()),
+              _pool(new MemPool(_tracker.get())) {}
 
     Status init(const ColumnIteratorOptions& opts) override;
 
@@ -343,7 +345,7 @@ private:
     bool _is_default_value_null;
     size_t _type_size;
     void* _mem_value = nullptr;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     std::unique_ptr<MemPool> _pool;
 
     // current rowid
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
index 7082204ab4..1a4bf2dc36 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
@@ -37,13 +37,12 @@ namespace doris {
 namespace segment_v2 {
 
 IndexedColumnWriter::IndexedColumnWriter(const IndexedColumnWriterOptions& options,
-                                         const TypeInfo* typeinfo,
-                                         fs::WritableBlock* wblock)
+                                         const TypeInfo* typeinfo, fs::WritableBlock* wblock)
         : _options(options),
           _typeinfo(typeinfo),
           _wblock(wblock),
-          _mem_tracker(-1),
-          _mem_pool(&_mem_tracker),
+          _mem_tracker(new MemTracker()),
+          _mem_pool(_mem_tracker.get()),
           _num_values(0),
           _num_data_pages(0),
           _validx_key_coder(nullptr),
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h
index d84347a59c..d2704d1069 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h
@@ -92,7 +92,7 @@ private:
     const TypeInfo* _typeinfo;
     fs::WritableBlock* _wblock;
     // only used for `_first_value`
-    MemTracker _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     MemPool _mem_pool;
 
     ordinal_t _num_values;
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
index 5206676782..83cd124423 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
@@ -31,7 +31,8 @@ namespace doris {
 
 namespace segment_v2 {
 
-ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field) : _field(field), _pool(&_tracker) {
+ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field)
+        : _field(field), _tracker(new MemTracker(-1, "ZoneMapIndexWriter")), _pool(_tracker.get()) {
     _page_zone_map.min_value = _field->allocate_value(&_pool);
     _page_zone_map.max_value = _field->allocate_value(&_pool);
     _reset_zone_map(&_page_zone_map);
@@ -114,8 +115,8 @@ Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory) {
     RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory));
     IndexedColumnIterator iter(&reader);
 
-    MemTracker tracker;
-    MemPool pool(&tracker);
+    auto tracker = std::make_shared<MemTracker>(-1, "temp in ZoneMapIndexReader");
+    MemPool pool(tracker.get());
     _page_zone_maps.resize(reader.num_values());
 
     // read and cache all page zone maps
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h
index 3070cccd6b..0d47229372 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.h
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.h
@@ -96,7 +96,7 @@ private:
     ZoneMap _segment_zone_map;
     // TODO(zc): we should replace this memory pool later, we only allocate min/max
     // for field. But MemPool allocate 4KB least, it will a waste for most cases.
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
 
     // serialized ZoneMapPB for each data page
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp
index 863652670c..5e84aa5957 100644
--- a/be/src/olap/schema_change.cpp
+++ b/be/src/olap/schema_change.cpp
@@ -806,7 +806,7 @@ bool RowBlockMerger::merge(const vector& row_block_arr, RowsetWriter*
                            uint64_t* merged_rows) {
     uint64_t tmp_merged_rows = 0;
     RowCursor row_cursor;
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     std::unique_ptr<ObjectPool> agg_object_pool(new ObjectPool());
     if (row_cursor.init(_tablet->tablet_schema()) != OLAP_SUCCESS) {
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 09ddb098b9..3472e9e502 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -109,7 +109,7 @@ StorageEngine::StorageEngine(const EngineOptions& options)
           _is_all_cluster_id_exist(true),
           _index_stream_lru_cache(NULL),
           _file_cache(nullptr),
-          _compaction_mem_tracker(-1, "compaction mem tracker(unlimited)"),
+          _compaction_mem_tracker(MemTracker::CreateTracker(-1, "compaction mem tracker(unlimited)")),
           _tablet_manager(new TabletManager(config::tablet_map_shard_size)),
           _txn_manager(new TxnManager(config::txn_map_shard_size, config::txn_shard_size)),
           _rowset_id_generator(new UniqueRowsetIdGenerator(options.backend_uid)),
@@ -125,9 +125,9 @@ StorageEngine::StorageEngine(const EngineOptions& options)
         return _unused_rowsets.size();
     });
     REGISTER_GAUGE_DORIS_METRIC(compaction_mem_current_consumption, [this]() {
-        return _compaction_mem_tracker.consumption();
+        return _compaction_mem_tracker->consumption();
         // We can get each compaction's detail usage
-        LOG(INFO) << _compaction_mem_tracker.LogUsage(2);
+        // LOG(INFO) << _compaction_mem_tracker->LogUsage(2);
     });
 }
 
@@ -539,7 +539,7 @@ void StorageEngine::_perform_cumulative_compaction(DataDir* data_dir) {
     DorisMetrics::instance()->cumulative_compaction_request_total.increment(1);
 
     std::string tracker_label = "cumulative compaction " + std::to_string(syscall(__NR_gettid));
-    CumulativeCompaction cumulative_compaction(best_tablet, tracker_label, &_compaction_mem_tracker);
+    CumulativeCompaction cumulative_compaction(best_tablet, tracker_label, _compaction_mem_tracker);
 
     OLAPStatus res = cumulative_compaction.compact();
     if (res != OLAP_SUCCESS) {
@@ -575,7 +575,7 @@ void StorageEngine::_perform_base_compaction(DataDir* data_dir) {
     DorisMetrics::instance()->base_compaction_request_total.increment(1);
 
     std::string tracker_label = "base compaction " + std::to_string(syscall(__NR_gettid));
-    BaseCompaction base_compaction(best_tablet, tracker_label, &_compaction_mem_tracker);
+    BaseCompaction base_compaction(best_tablet, tracker_label, _compaction_mem_tracker);
     OLAPStatus res = base_compaction.compact();
     if (res != OLAP_SUCCESS) {
         best_tablet->set_last_base_compaction_failure_time(UnixMillis());
diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h
index d8d97dde92..56ac12fd52 100644
--- a/be/src/olap/storage_engine.h
+++ b/be/src/olap/storage_engine.h
@@ -316,7 +316,7 @@ private:
     // map, if we use RowsetId as the key, we need custom hash func
     std::unordered_map _unused_rowsets;
 
-    MemTracker _compaction_mem_tracker;
+    std::shared_ptr<MemTracker> _compaction_mem_tracker;
 
     bool _stop_bg_worker = false;
     std::thread _unused_rowset_monitor_thread;
diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp
index d4624ad200..c201e5aae2 100644
--- a/be/src/olap/task/engine_checksum_task.cpp
+++ b/be/src/olap/task/engine_checksum_task.cpp
@@ -91,7 +91,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
     }
 
     RowCursor row;
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     std::unique_ptr<ObjectPool> agg_object_pool(new ObjectPool());
     res = row.init(tablet->tablet_schema(), reader_params.return_columns);
diff --git a/be/src/runtime/buffered_block_mgr2.cc b/be/src/runtime/buffered_block_mgr2.cc
index d86a39b8c7..37691a831b 100644
--- a/be/src/runtime/buffered_block_mgr2.cc
+++ b/be/src/runtime/buffered_block_mgr2.cc
@@ -53,16 +53,16 @@ SpinLock BufferedBlockMgr2::_s_block_mgrs_lock;
 
 class BufferedBlockMgr2::Client {
 public:
-    Client(BufferedBlockMgr2* mgr, int num_reserved_buffers, MemTracker* tracker,
-            RuntimeState* state) :
-            _mgr(mgr),
-            _state(state),
-            _tracker(tracker),
-            _query_tracker(_mgr->_mem_tracker->parent()),
-            _num_reserved_buffers(num_reserved_buffers),
-            _num_tmp_reserved_buffers(0),
-            _num_pinned_buffers(0) {
-        DCHECK(tracker != NULL);
+    Client(BufferedBlockMgr2* mgr, int num_reserved_buffers,
+           const std::shared_ptr<MemTracker>& tracker, RuntimeState* state)
+            : _mgr(mgr),
+              _state(state),
+              _tracker(tracker),
+              _query_tracker(MemTracker::CreateTracker(-1, "BufferedBlockMgr2", _mgr->_mem_tracker->parent())),
+              _num_reserved_buffers(num_reserved_buffers),
+              _num_tmp_reserved_buffers(0),
+              _num_pinned_buffers(0) {
+        DCHECK(tracker != nullptr);
     }
 
     // A null dtor to pass codestyle check
@@ -81,11 +81,11 @@ public:
     // enforced. Even when we give a buffer to a client, the buffer is still owned and
     // counts against the block mgr tracker (i.e. there is a fixed pool of buffers
     // regardless of if they are in the block mgr or the clients).
-    MemTracker* _tracker;
+    std::shared_ptr<MemTracker> _tracker;
 
     // This is the common ancestor between the block mgr tracker and the client tracker.
     // When memory is transferred to the client, we want it to stop at this tracker.
-    MemTracker* _query_tracker;
+    std::shared_ptr<MemTracker> _query_tracker;
 
     // Number of buffers reserved by this client.
     int _num_reserved_buffers;
@@ -100,8 +100,8 @@ public:
         DCHECK(buffer != NULL);
         if (buffer->len == _mgr->max_block_size()) {
             ++_num_pinned_buffers;
-            _tracker->consume_local(buffer->len, _query_tracker);
-            // _tracker->consume(buffer->len);
+            _tracker->ConsumeLocal(buffer->len, _query_tracker.get());
+            // _tracker->Consume(buffer->len);
         }
     }
 
@@ -110,8 +110,8 @@ public:
         if (buffer->len == _mgr->max_block_size()) {
             DCHECK_GT(_num_pinned_buffers, 0);
             --_num_pinned_buffers;
-            _tracker->release_local(buffer->len, _query_tracker);
-            // _tracker->release(buffer->len);
+            _tracker->ReleaseLocal(buffer->len, _query_tracker.get());
+            // _tracker->Release(buffer->len);
         }
     }
 
@@ -224,11 +224,11 @@ BufferedBlockMgr2::BufferedBlockMgr2(RuntimeState* state, TmpFileMgr* tmp_file_m
 }
 
 Status BufferedBlockMgr2::create(
-        RuntimeState* state, MemTracker* parent,
+        RuntimeState* state, const std::shared_ptr<MemTracker>& parent,
         RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
         int64_t mem_limit, int64_t block_size,
-        shared_ptr<BufferedBlockMgr2>* block_mgr) {
-    DCHECK(parent != NULL);
+        boost::shared_ptr<BufferedBlockMgr2>* block_mgr) {
+    DCHECK(parent != nullptr);
     block_mgr->reset();
     {
         // we do not use global BlockMgrsMap for now, to avoid mem-exceeded different fragments
@@ -264,13 +264,13 @@ int64_t BufferedBlockMgr2::available_buffers(Client* client) const {
 int64_t BufferedBlockMgr2::remaining_unreserved_buffers() const {
     int64_t num_buffers = _free_io_buffers.size() +
         _unpinned_blocks.size() + _non_local_outstanding_writes;
-    num_buffers += _mem_tracker->spare_capacity() / max_block_size();
+    num_buffers += _mem_tracker->SpareCapacity(MemLimit::HARD) / max_block_size();
     num_buffers -= _unfullfilled_reserved_buffers;
     return num_buffers;
 }
 
 Status BufferedBlockMgr2::register_client(
-        int num_reserved_buffers, MemTracker* tracker,
+        int num_reserved_buffers, const std::shared_ptr<MemTracker>& tracker,
         RuntimeState* state, Client** client) {
     DCHECK_GE(num_reserved_buffers, 0);
     Client* a_client = new Client(this, num_reserved_buffers, tracker, state);
@@ -329,10 +329,10 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
     int buffers_needed = BitUtil::ceil(size, max_block_size());
     unique_lock<mutex> lock(_lock);
 
-    if (size < max_block_size() && _mem_tracker->try_consume(size)) {
+    if (size < max_block_size() && _mem_tracker->TryConsume(size)) {
         // For small allocations (less than a block size), just let the allocation through.
-        client->_tracker->consume_local(size, client->_query_tracker);
-        // client->_tracker->consume(size);
+        client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
+        // client->_tracker->Consume(size);
         return true;
     }
 
@@ -341,10 +341,10 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
         return false;
     }
 
-    if (_mem_tracker->try_consume(size)) {
+    if (_mem_tracker->TryConsume(size)) {
         // There was still unallocated memory, don't need to recycle allocated blocks.
-        client->_tracker->consume_local(size, client->_query_tracker);
-        // client->_tracker->consume(size);
+        client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
+        // client->_tracker->Consume(size);
         return true;
     }
 
@@ -389,7 +389,7 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
         }
         client->_num_tmp_reserved_buffers -= additional_tmp_reservations;
         _unfullfilled_reserved_buffers -= additional_tmp_reservations;
-        _mem_tracker->release(buffers_acquired * max_block_size());
+        _mem_tracker->Release(buffers_acquired * max_block_size());
         return false;
     }
 
@@ -397,19 +397,19 @@ bool BufferedBlockMgr2::consume_memory(Client* client, int64_t size) {
     _unfullfilled_reserved_buffers -= buffers_acquired;
 
     DCHECK_GE(buffers_acquired * max_block_size(), size);
-    _mem_tracker->release(buffers_acquired * max_block_size());
-    if (!_mem_tracker->try_consume(size)) {
+    _mem_tracker->Release(buffers_acquired * max_block_size());
+    if (!_mem_tracker->TryConsume(size)) {
         return false;
     }
-    client->_tracker->consume_local(size, client->_query_tracker);
-    // client->_tracker->consume(size);
+    client->_tracker->ConsumeLocal(size, client->_query_tracker.get());
+    // client->_tracker->Consume(size);
     DCHECK(validate()) << endl << debug_internal();
     return true;
 }
 
 void BufferedBlockMgr2::release_memory(Client* client, int64_t size) {
-    _mem_tracker->release(size);
-    client->_tracker->release_local(size, client->_query_tracker);
+    _mem_tracker->Release(size);
+    client->_tracker->ReleaseLocal(size, client->_query_tracker.get());
 }
 
 void BufferedBlockMgr2::cancel() {
@@ -470,7 +470,7 @@ Status BufferedBlockMgr2::get_new_block(
 
         if (len > 0 && len < _max_block_size) {
             DCHECK(unpin_block == NULL);
-            if (client->_tracker->try_consume(len)) {
+            if (client->_tracker->TryConsume(len)) {
                 // TODO: Have a cache of unused blocks of size 'len' (0, _max_block_size)
                 uint8_t* buffer = new uint8_t[len];
                 // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
@@ -599,11 +599,10 @@ BufferedBlockMgr2::~BufferedBlockMgr2() {
 
     // Free memory resources.
     BOOST_FOREACH(BufferDescriptor* buffer, _all_io_buffers) {
-        _mem_tracker->release(buffer->len);
+        _mem_tracker->Release(buffer->len);
         delete[] buffer->buffer;
     }
     DCHECK_EQ(_mem_tracker->consumption(), 0);
-    _mem_tracker->unregister_from_parent();
     _mem_tracker.reset();
 }
 
@@ -619,7 +618,7 @@ int BufferedBlockMgr2::num_reserved_buffers_remaining(Client* client) const {
     return std::max(client->_num_reserved_buffers - client->_num_pinned_buffers, 0);
 }
 
-MemTracker* BufferedBlockMgr2::get_tracker(Client* client) const {
+std::shared_ptr<MemTracker> BufferedBlockMgr2::get_tracker(Client* client) const {
     return client->_tracker;
 }
 
@@ -950,7 +949,7 @@ void BufferedBlockMgr2::delete_block(Block* block) {
         if (block->_buffer_desc->len != _max_block_size) {
             // Just delete the block for now.
             delete[] block->_buffer_desc->buffer;
-            block->_client->_tracker->release(block->_buffer_desc->len);
+            block->_client->_tracker->Release(block->_buffer_desc->len);
             delete block->_buffer_desc;
             block->_buffer_desc = NULL;
         } else {
@@ -1090,7 +1089,7 @@ Status BufferedBlockMgr2::find_buffer(
 
     // First, try to allocate a new buffer.
     if (_free_io_buffers.size() < _block_write_threshold &&
-            _mem_tracker->try_consume(_max_block_size)) {
+            _mem_tracker->TryConsume(_max_block_size)) {
         uint8_t* new_buffer = new uint8_t[_max_block_size];
         *buffer_desc = _obj_pool.add(new BufferDescriptor(new_buffer, _max_block_size));
         (*buffer_desc)->all_buffers_it = _all_io_buffers.insert(
@@ -1257,15 +1256,15 @@ string BufferedBlockMgr2::debug_internal() const {
         << "  Num available buffers: " << remaining_unreserved_buffers() << endl
         << "  Total pinned buffers: " << _total_pinned_buffers << endl
         << "  Unfullfilled reserved buffers: " << _unfullfilled_reserved_buffers << endl
-        << "  Remaining memory: " << _mem_tracker->spare_capacity()
-        << " (#blocks=" << (_mem_tracker->spare_capacity() / _max_block_size) << ")" << endl
+        << "  Remaining memory: " << _mem_tracker->SpareCapacity(MemLimit::HARD)
+        << " (#blocks=" << (_mem_tracker->SpareCapacity(MemLimit::HARD) / _max_block_size) << ")" << endl
         << "  Block write threshold: " << _block_write_threshold;
     return ss.str();
 }
 
 void BufferedBlockMgr2::init(
         DiskIoMgr* io_mgr, RuntimeProfile* parent_profile,
-        MemTracker* parent_tracker, int64_t mem_limit) {
+        const std::shared_ptr<MemTracker>& parent_tracker, int64_t mem_limit) {
     unique_lock<mutex> l(_lock);
     if (_initialized) {
         return;
@@ -1292,7 +1291,7 @@ void BufferedBlockMgr2::init(
     // Create a new mem_tracker and allocate buffers.
     // _mem_tracker.reset(new MemTracker(
     //             profile(), mem_limit, -1, "Block Manager", parent_tracker));
-    _mem_tracker.reset(new MemTracker(mem_limit, "Block Manager", parent_tracker));
+    _mem_tracker = MemTracker::CreateTracker(mem_limit, "Block Manager2", parent_tracker);
 
     _initialized = true;
 }
diff --git a/be/src/runtime/buffered_block_mgr2.h b/be/src/runtime/buffered_block_mgr2.h
index b2d355b1f6..a69383de14 100644
--- a/be/src/runtime/buffered_block_mgr2.h
+++ b/be/src/runtime/buffered_block_mgr2.h
@@ -294,8 +294,7 @@ public:
     // - mem_limit: maximum memory that will be used by the block mgr.
     // - buffer_size: maximum size of each buffer.
     static Status create(
-            // RuntimeState* state, MemTracker* parent,
-            RuntimeState* state, MemTracker* parent,
+            RuntimeState* state, const std::shared_ptr<MemTracker>& parent,
             RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr,
             int64_t mem_limit, int64_t buffer_size,
             boost::shared_ptr<BufferedBlockMgr2>* block_mgr);
@@ -312,7 +311,7 @@ public:
     // Buffers used by this client are reflected in tracker.
     // TODO: The fact that we allow oversubscription is problematic.
     // as the code expects the reservations to always be granted (currently not the case).
-    Status register_client(int num_reserved_buffers, MemTracker* tracker,
+    Status register_client(int num_reserved_buffers, const std::shared_ptr<MemTracker>& tracker,
             RuntimeState* state, Client** client);
 
     // Clears all reservations for this client.
@@ -388,7 +387,7 @@ public:
 
     int num_pinned_buffers(Client* client) const;
     int num_reserved_buffers_remaining(Client* client) const;
-    MemTracker* get_tracker(Client* client) const;
+    std::shared_ptr<MemTracker> get_tracker(Client* client) const;
     int64_t max_block_size() const {
         return _max_block_size;
     }
@@ -425,7 +424,7 @@ private:
 
     // Initializes the block mgr. Idempotent and thread-safe.
     void init(DiskIoMgr* io_mgr, RuntimeProfile* profile,
-            MemTracker* parent_tracker, int64_t mem_limit);
+              const std::shared_ptr<MemTracker>& parent_tracker, int64_t mem_limit);
 
     // Initializes _tmp_files. This is initialized the first time we need to write to disk.
     // Must be called with _lock taken.
@@ -527,7 +526,7 @@ private:
     ObjectPool _obj_pool;
 
     // Track buffers allocated by the block manager.
-    boost::scoped_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // The temporary file manager used to allocate temporary file space.
     TmpFileMgr* _tmp_file_mgr;
diff --git a/be/src/runtime/buffered_tuple_stream2.cc b/be/src/runtime/buffered_tuple_stream2.cc
index 135eb1440a..24444b78e1 100644
--- a/be/src/runtime/buffered_tuple_stream2.cc
+++ b/be/src/runtime/buffered_tuple_stream2.cc
@@ -488,7 +488,7 @@ Status BufferedTupleStream2::get_rows(scoped_ptr* batch, bool* got_row
     }
     RETURN_IF_ERROR(prepare_for_read(false));
     batch->reset(
-            new RowBatch(_desc, num_rows(), _block_mgr->get_tracker(_block_mgr_client)));
+            new RowBatch(_desc, num_rows(), _block_mgr->get_tracker(_block_mgr_client).get()));
     bool eos = false;
     // Loop until get_next fills the entire batch. Each call can stop at block
     // boundaries. We generally want it to stop, so that blocks can be freed
diff --git a/be/src/runtime/buffered_tuple_stream3.cc b/be/src/runtime/buffered_tuple_stream3.cc
index add66234fc..bddbbc4895 100644
--- a/be/src/runtime/buffered_tuple_stream3.cc
+++ b/be/src/runtime/buffered_tuple_stream3.cc
@@ -695,7 +695,7 @@ void BufferedTupleStream3::UnpinStream(UnpinMode mode) {
 }
 */
 Status BufferedTupleStream3::GetRows(
-    MemTracker* tracker, scoped_ptr<RowBatch>* batch, bool* got_rows) {
+    const std::shared_ptr<MemTracker>& tracker, scoped_ptr<RowBatch>* batch, bool* got_rows) {
  if (num_rows() > numeric_limits<int32_t>::max()) {
     // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
     return Status::InternalError(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
@@ -710,7 +710,7 @@ Status BufferedTupleStream3::GetRows(
   // TODO chenhao 
   // capacity in RowBatch use int, but _num_rows is int64_t
   // it may be precision loss
-  batch->reset(new RowBatch(*desc_, num_rows(), tracker));
+  batch->reset(new RowBatch(*desc_, num_rows(), tracker.get()));
   bool eos = false;
   // Loop until GetNext fills the entire batch. Each call can stop at page
   // boundaries. We generally want it to stop, so that pages can be freed
diff --git a/be/src/runtime/buffered_tuple_stream3.h b/be/src/runtime/buffered_tuple_stream3.h
index ebc26d3a8c..d93c8004a4 100644
--- a/be/src/runtime/buffered_tuple_stream3.h
+++ b/be/src/runtime/buffered_tuple_stream3.h
@@ -336,7 +336,7 @@ class BufferedTupleStream3 {
   /// process. If the current unused reservation is not sufficient to pin the stream in
   /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set
   /// to false.
-  Status GetRows(MemTracker* tracker, boost::scoped_ptr<RowBatch>* batch,
+  Status GetRows(const std::shared_ptr<MemTracker>& tracker, boost::scoped_ptr<RowBatch>* batch,
       bool* got_rows) WARN_UNUSED_RESULT;
 
   /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc
index e66bcc232f..da640f6ed6 100644
--- a/be/src/runtime/bufferpool/buffer_pool.cc
+++ b/be/src/runtime/bufferpool/buffer_pool.cc
@@ -118,8 +118,8 @@ BufferPool::BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit,
 
 BufferPool::~BufferPool() {}
 
-Status BufferPool::RegisterClient(const string& name, //TmpFileMgr::FileGroup* file_group,
-    ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+Status BufferPool::RegisterClient(const string& name,
+    ReservationTracker* parent_reservation, const std::shared_ptr<MemTracker>& mem_tracker,
     int64_t reservation_limit, RuntimeProfile* profile, ClientHandle* client) {
   DCHECK(!client->is_registered());
   DCHECK(parent_reservation != NULL);
@@ -375,7 +375,7 @@ void BufferPool::SubReservation::Close() {
 }
 
 BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
-    const string& name, ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+    const string& name, ReservationTracker* parent_reservation, const std::shared_ptr<MemTracker>& mem_tracker,
     int64_t reservation_limit, RuntimeProfile* profile)
   : pool_(pool),
     //file_group_(file_group),
@@ -386,7 +386,7 @@ BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group
   // Set up a child profile with buffer pool info.
   RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true);
   reservation_.InitChildTracker(
-      child_profile, parent_reservation, mem_tracker, reservation_limit);
+      child_profile, parent_reservation, mem_tracker.get(), reservation_limit);
   counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
   counters_.cumulative_allocations =
       ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
diff --git a/be/src/runtime/bufferpool/buffer_pool.h b/be/src/runtime/bufferpool/buffer_pool.h
index 4309c94869..56892c273e 100644
--- a/be/src/runtime/bufferpool/buffer_pool.h
+++ b/be/src/runtime/bufferpool/buffer_pool.h
@@ -176,8 +176,8 @@ class BufferPool : public CacheLineAligned {
   /// The client's reservation is created as a child of 'parent_reservation' with limit
   /// 'reservation_limit' and associated with MemTracker 'mem_tracker'. The initial
   /// reservation is 0 bytes.
-  Status RegisterClient(const std::string& name, //TmpFileMgr::FileGroup* file_group,
-      ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+  Status RegisterClient(const std::string& name,
+      ReservationTracker* parent_reservation, const std::shared_ptr<MemTracker>& mem_tracker,
       int64_t reservation_limit, RuntimeProfile* profile,
       ClientHandle* client) WARN_UNUSED_RESULT;
 
diff --git a/be/src/runtime/bufferpool/buffer_pool_internal.h b/be/src/runtime/bufferpool/buffer_pool_internal.h
index be764c2af8..1ad3b02e01 100644
--- a/be/src/runtime/bufferpool/buffer_pool_internal.h
+++ b/be/src/runtime/bufferpool/buffer_pool_internal.h
@@ -138,7 +138,7 @@ class BufferPool::Client {
  public:
   Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group, 
      const std::string& name,
-      ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+      ReservationTracker* parent_reservation, const std::shared_ptr<MemTracker>& mem_tracker,
       int64_t reservation_limit, RuntimeProfile* profile);
 
   ~Client() {
diff --git a/be/src/runtime/bufferpool/reservation_tracker.cc b/be/src/runtime/bufferpool/reservation_tracker.cc
index 7fcc2bdfd0..41620f3157 100644
--- a/be/src/runtime/bufferpool/reservation_tracker.cc
+++ b/be/src/runtime/bufferpool/reservation_tracker.cc
@@ -75,10 +75,10 @@ void ReservationTracker::InitChildTracker(RuntimeProfile* profile,
     MemTracker* parent_mem_tracker = GetParentMemTracker();
     if (parent_mem_tracker != nullptr) {
       // Make sure the parent links of the MemTrackers correspond to our parent links.
-      DCHECK_EQ(parent_mem_tracker, mem_tracker_->parent());
+      DCHECK_EQ(parent_mem_tracker, mem_tracker_->parent().get());
       // Make sure we don't have a lower limit than the ancestor, since we don't enforce
       // limits at lower links.
-      DCHECK_EQ(mem_tracker_->lowest_limit(), parent_mem_tracker->lowest_limit());
+      DCHECK_EQ(mem_tracker_->GetLowestLimit(MemLimit::HARD), parent_mem_tracker->GetLowestLimit(MemLimit::HARD));
     } else {
       // Make sure we didn't leave a gap in the links. E.g. this tracker's grandparent
       // shouldn't have a MemTracker.
@@ -114,7 +114,7 @@ void ReservationTracker::InitCounters(
     counters_.reservation_limit = ADD_COUNTER(profile, "ReservationLimit", TUnit::BYTES);
     COUNTER_SET(counters_.reservation_limit, reservation_limit);
   }
-  if (mem_tracker_ != nullptr) mem_tracker_->enable_reservation_reporting(counters_);
+  if (mem_tracker_ != nullptr) mem_tracker_->EnableReservationReporting(counters_);
 }
 
 void ReservationTracker::Close() {
@@ -191,12 +191,12 @@ bool ReservationTracker::TryConsumeFromMemTracker(int64_t reservation_increase)
   if (GetParentMemTracker() == nullptr) {
     // At the topmost link, which may be a MemTracker with a limit, we need to use
     // TryConsume() to check the limit.
-    return mem_tracker_->try_consume(reservation_increase);
+    return mem_tracker_->TryConsume(reservation_increase);
   } else {
     // For lower links, there shouldn't be a limit to enforce, so we just need to
     // update the consumption of the linked MemTracker since the reservation is
     // already reflected in its parent.
-    mem_tracker_->consume_local(reservation_increase, GetParentMemTracker());
+    mem_tracker_->ConsumeLocal(reservation_increase, GetParentMemTracker());
     return true;
   }
 }
@@ -205,9 +205,9 @@ void ReservationTracker::ReleaseToMemTracker(int64_t reservation_decrease) {
   DCHECK_GE(reservation_decrease, 0);
   if (mem_tracker_ == nullptr) return;
   if (GetParentMemTracker() == nullptr) {
-    mem_tracker_->release(reservation_decrease);
+    mem_tracker_->Release(reservation_decrease);
   } else {
-    mem_tracker_->release_local(reservation_decrease, GetParentMemTracker());
+    mem_tracker_->ReleaseLocal(reservation_decrease, GetParentMemTracker());
   }
 }
 
diff --git a/be/src/runtime/data_spliter.cpp b/be/src/runtime/data_spliter.cpp
index d06e7cbbdf..778e49e05c 100644
--- a/be/src/runtime/data_spliter.cpp
+++ b/be/src/runtime/data_spliter.cpp
@@ -89,14 +89,13 @@ Status DataSpliter::prepare(RuntimeState* state) {
     std::stringstream title;
     title << "DataSplitSink (dst_fragment_instance_id=" << print_id(state->fragment_instance_id()) << ")";
     RETURN_IF_ERROR(DataSink::prepare(state));
-    RETURN_IF_ERROR(Expr::prepare(
-            _partition_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+    RETURN_IF_ERROR(Expr::prepare(_partition_expr_ctxs, state, _row_desc, _expr_mem_tracker));
     for (auto& iter : _rollup_map) {
-        RETURN_IF_ERROR(iter.second->prepare(state, _row_desc, _expr_mem_tracker.get()));
+        RETURN_IF_ERROR(iter.second->prepare(state, _row_desc, _expr_mem_tracker));
     }
     _profile = state->obj_pool()->add(new RuntimeProfile(title.str()));
     for (auto iter : _partition_infos) {
-        RETURN_IF_ERROR(iter->prepare(state, _row_desc, _expr_mem_tracker.get()));
+        RETURN_IF_ERROR(iter->prepare(state, _row_desc, _expr_mem_tracker));
     }
     return Status::OK();
 }
@@ -327,7 +326,7 @@ Status DataSpliter::close(RuntimeState* state, Status close_status) {
         }
     }
   
-    _expr_mem_tracker->close();
+    _expr_mem_tracker.reset();
     _closed = true;
     if (is_ok) {
         return Status::OK();
diff --git a/be/src/runtime/data_stream_recvr.cc b/be/src/runtime/data_stream_recvr.cc
index 32618901ea..78dd50c9ed 100644
--- a/be/src/runtime/data_stream_recvr.cc
+++ b/be/src/runtime/data_stream_recvr.cc
@@ -242,7 +242,7 @@ void DataStreamRecvr::SenderQueue::add_batch(
         // Note: if this function makes a row batch, the batch *must* be added
         // to _batch_queue. It is not valid to create the row batch and destroy
         // it in this thread.
-        batch = new RowBatch(_recvr->row_desc(), pb_batch, _recvr->mem_tracker());
+        batch = new RowBatch(_recvr->row_desc(), pb_batch, _recvr->mem_tracker().get());
     }
    
     VLOG_ROW << "added #rows=" << batch->num_rows()
@@ -352,7 +352,7 @@ void DataStreamRecvr::transfer_all_resources(RowBatch* transfer_batch) {
 }
 
 DataStreamRecvr::DataStreamRecvr(
-        DataStreamMgr* stream_mgr, MemTracker* parent_tracker,
+        DataStreamMgr* stream_mgr, const std::shared_ptr<MemTracker>& parent_tracker,
         const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id,
         PlanNodeId dest_node_id, int num_senders, bool is_merging, 
         int total_buffer_limit, RuntimeProfile* profile, 
@@ -366,10 +366,7 @@ DataStreamRecvr::DataStreamRecvr(
             _num_buffered_bytes(0),
             _profile(profile),
             _sub_plan_query_statistics_recvr(sub_plan_query_statistics_recvr) {
-    // TODO: Now the parent tracker may cause problem when we need spill to disk, so we
-    // replace parent_tracker with nullptr, fix future
-    _mem_tracker.reset(new MemTracker(_profile, -1, "DataStreamRecvr", nullptr));
-    // _mem_tracker.reset(new MemTracker(_profile.get(), -1, "DataStreamRecvr", parent_tracker));
+    _mem_tracker = MemTracker::CreateTracker(_profile, -1, "DataStreamRecvr", parent_tracker);
 
     // Create one queue per sender if is_merging is true.
     int num_queues = is_merging ? num_senders : 1;
@@ -427,8 +424,7 @@ void DataStreamRecvr::close() {
     _mgr->deregister_recvr(fragment_instance_id(), dest_node_id());
     _mgr = NULL;
     _merger.reset();
-    _mem_tracker->close();
-//    _mem_tracker->unregister_from_parent();
+    // TODO: Maybe shared tracker doesn't need to be reset manually
     _mem_tracker.reset();
 }
 
diff --git a/be/src/runtime/data_stream_recvr.h b/be/src/runtime/data_stream_recvr.h
index 104ee769c3..5fcbde7704 100644
--- a/be/src/runtime/data_stream_recvr.h
+++ b/be/src/runtime/data_stream_recvr.h
@@ -98,7 +98,7 @@ public:
     const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; }
     PlanNodeId dest_node_id() const { return _dest_node_id; }
     const RowDescriptor& row_desc() const { return _row_desc; }
-    MemTracker* mem_tracker() const { return _mem_tracker.get(); }
+    std::shared_ptr<MemTracker> mem_tracker() const { return _mem_tracker; }
 
     void add_sub_plan_statistics(const PQueryStatistics& statistics, int sender_id) {
         _sub_plan_query_statistics_recvr->insert(statistics, sender_id);
@@ -108,7 +108,7 @@ private:
     friend class DataStreamMgr;
     class SenderQueue;
 
-    DataStreamRecvr(DataStreamMgr* stream_mgr, MemTracker* parent_tracker,
+    DataStreamRecvr(DataStreamMgr* stream_mgr, const std::shared_ptr<MemTracker>& parent_tracker,
             const RowDescriptor& row_desc, const TUniqueId& fragment_instance_id,
             PlanNodeId dest_node_id, int num_senders, bool is_merging, 
             int total_buffer_limit, RuntimeProfile* profile, 
@@ -155,7 +155,7 @@ private:
     AtomicInt _num_buffered_bytes;
 
     // Memtracker for batches in the sender queue(s).
-    boost::scoped_ptr _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // One or more queues of row batches received from senders. If _is_merging is true,
     // there is one SenderQueue for each sender. Otherwise, row batches from all senders
diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp
index 8c5d97beb2..73d642d677 100644
--- a/be/src/runtime/data_stream_sender.cpp
+++ b/be/src/runtime/data_stream_sender.cpp
@@ -397,22 +397,18 @@ Status DataStreamSender::prepare(RuntimeState* state) {
     title << "DataStreamSender (dst_id=" << _dest_node_id << ", dst_fragments=[" << instances << "])";
     _profile = _pool->add(new RuntimeProfile(title.str()));
     SCOPED_TIMER(_profile->total_time_counter());
-    _mem_tracker.reset(
-            new MemTracker(_profile, -1, "DataStreamSender", state->instance_mem_tracker()));
+    _mem_tracker = MemTracker::CreateTracker(_profile, -1, "DataStreamSender", state->instance_mem_tracker());
 
-    if (_part_type == TPartitionType::UNPARTITIONED 
-            || _part_type == TPartitionType::RANDOM) {
+    if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM) {
         // Randomize the order we open/transmit to channels to avoid thundering herd problems.
         srand(reinterpret_cast<uint64_t>(this));
         random_shuffle(_channels.begin(), _channels.end());
     } else if (_part_type == TPartitionType::HASH_PARTITIONED) {
-        RETURN_IF_ERROR(Expr::prepare(
-                _partition_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+        RETURN_IF_ERROR(Expr::prepare(_partition_expr_ctxs, state, _row_desc, _expr_mem_tracker));
     } else {
-        RETURN_IF_ERROR(Expr::prepare(
-                _partition_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+        RETURN_IF_ERROR(Expr::prepare(_partition_expr_ctxs, state, _row_desc, _expr_mem_tracker));
         for (auto iter : _partition_infos) {
-            RETURN_IF_ERROR(iter->prepare(state, _row_desc, _expr_mem_tracker.get()));
+            RETURN_IF_ERROR(iter->prepare(state, _row_desc, _expr_mem_tracker));
         }
     }
 
diff --git a/be/src/runtime/data_stream_sender.h b/be/src/runtime/data_stream_sender.h
index 38ce7b815a..c39074bb60 100644
--- a/be/src/runtime/data_stream_sender.h
+++ b/be/src/runtime/data_stream_sender.h
@@ -151,7 +151,7 @@ private:
     RuntimeProfile::Counter* _uncompressed_bytes_counter;
     RuntimeProfile::Counter* _ignore_rows;
 
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // Throughput per total time spent in sender
     RuntimeProfile::Counter* _overall_throughput;
diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc
index 529d5a042c..2372048cc1 100644
--- a/be/src/runtime/disk_io_mgr.cc
+++ b/be/src/runtime/disk_io_mgr.cc
@@ -215,7 +215,6 @@ void DiskIoMgr::BufferDescriptor::reset(RequestContext* reader,
     _len = 0;
     _eosr = false;
     _status = Status::OK();
-    _mem_tracker = NULL;
 }
 
 void DiskIoMgr::BufferDescriptor::return_buffer() {
@@ -223,21 +222,21 @@ void DiskIoMgr::BufferDescriptor::return_buffer() {
     _io_mgr->return_buffer(this);
 }
 
-// void DiskIoMgr::BufferDescriptor::SetMemTracker(MemTracker* tracker) {
-void DiskIoMgr::BufferDescriptor::set_mem_tracker(MemTracker* tracker) {
+void DiskIoMgr::BufferDescriptor::set_mem_tracker(std::shared_ptr<MemTracker> tracker) {
     // Cached buffers don't count towards mem usage.
     if (_scan_range->_cached_buffer != NULL) {
         return;
     }
-    if (_mem_tracker == tracker) {
+    if (_mem_tracker.get() == tracker.get()) {
         return;
     }
-    if (_mem_tracker != NULL) {
-        _mem_tracker->release(_buffer_len);
+    // TODO(yingchun): use TransferTo?
+    if (_mem_tracker != nullptr) {
+        _mem_tracker->Release(_buffer_len);
     }
-    _mem_tracker = tracker;
-    if (_mem_tracker != NULL) {
-        _mem_tracker->consume(_buffer_len);
+    _mem_tracker = std::move(tracker);
+    if (_mem_tracker != nullptr) {
+        _mem_tracker->Consume(_buffer_len);
     }
 }
 
@@ -360,9 +359,8 @@ DiskIoMgr::~DiskIoMgr() {
      */
 }
 
-// Status DiskIoMgr::init(MemTracker* process_mem_tracker) {
-Status DiskIoMgr::init(MemTracker* process_mem_tracker) {
-    DCHECK(process_mem_tracker != NULL);
+Status DiskIoMgr::init(const std::shared_ptr<MemTracker>& process_mem_tracker) {
+    DCHECK(process_mem_tracker != nullptr);
     _process_mem_tracker = process_mem_tracker;
     // If we hit the process limit, see if we can reclaim some memory by removing
     // previously allocated (but unused) io buffers.
@@ -406,11 +404,10 @@ Status DiskIoMgr::init(MemTracker* process_mem_tracker) {
     return Status::OK();
 }
 
-// Status DiskIoMgr::register_context(RequestContext** request_context, MemTracker* mem_tracker) {
-Status DiskIoMgr::register_context(RequestContext** request_context, MemTracker* mem_tracker) {
+Status DiskIoMgr::register_context(RequestContext** request_context, std::shared_ptr<MemTracker> mem_tracker) {
     DCHECK(_request_context_cache.get() != NULL) << "Must call init() first.";
     *request_context = _request_context_cache->get_new_context();
-    (*request_context)->reset(mem_tracker);
+    (*request_context)->reset(std::move(mem_tracker));
     return Status::OK();
 }
 
@@ -720,7 +717,7 @@ char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
         ++_num_allocated_buffers;
         // Update the process mem usage.  This is checked the next time we start
         // a read for the next reader (DiskIoMgr::GetNextScanRange)
-        _process_mem_tracker->consume(*buffer_size);
+        _process_mem_tracker->Consume(*buffer_size);
         buffer = new char[*buffer_size];
     } else {
         buffer = _free_buffers[idx].front();
@@ -738,7 +735,7 @@ void DiskIoMgr::gc_io_buffers() {
        for (list<char*>::iterator iter = _free_buffers[idx].begin();
                 iter != _free_buffers[idx].end(); ++iter) {
             int64_t buffer_size = (1 << idx) * _min_buffer_size;
-            _process_mem_tracker->release(buffer_size);
+            _process_mem_tracker->Release(buffer_size);
             --_num_allocated_buffers;
             delete[] *iter;
 
@@ -751,7 +748,7 @@ void DiskIoMgr::gc_io_buffers() {
 
 void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) {
     return_free_buffer(desc->_buffer, desc->_buffer_len);
-    desc->set_mem_tracker(NULL);
+    desc->set_mem_tracker(nullptr);
     desc->_buffer = NULL;
 }
 
@@ -765,7 +762,7 @@ void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) {
     if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) {
         _free_buffers[idx].push_back(buffer);
     } else {
-        _process_mem_tracker->release(buffer_size);
+        _process_mem_tracker->Release(buffer_size);
         --_num_allocated_buffers;
         delete[] buffer;
     }
@@ -823,8 +820,8 @@ bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** ran
         // TODO: we can do a lot better here.  The reader can likely make progress
         // with fewer io buffers.
         bool process_limit_exceeded = _process_mem_tracker->limit_exceeded();
-        bool reader_limit_exceeded = (*request_context)->_mem_tracker != NULL
-                ? (*request_context)->_mem_tracker->any_limit_exceeded() : false;
+        bool reader_limit_exceeded = (*request_context)->_mem_tracker != nullptr
+                ? (*request_context)->_mem_tracker->AnyLimitExceeded(MemLimit::HARD) : false;
         // bool reader_limit_exceeded = (*request_context)->_mem_tracker != NULL
         //     ? (*request_context)->_mem_tracker->limit_exceeded() : false;
 
@@ -1020,12 +1017,12 @@ void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRa
     DCHECK_GT(bytes_remaining, 0);
     int64_t buffer_size = std::min(bytes_remaining, static_cast<int64_t>(_max_buffer_size));
     bool enough_memory = true;
-    if (reader->_mem_tracker != NULL) {
-        enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY;
+    if (reader->_mem_tracker != nullptr) {
+        enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY;
         if (!enough_memory) {
             // Low memory, GC and try again.
             gc_io_buffers();
-            enough_memory = reader->_mem_tracker->spare_capacity() > LOW_MEMORY;
+            enough_memory = reader->_mem_tracker->SpareCapacity(MemLimit::HARD) > LOW_MEMORY;
         }
     }
 
diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h
index 03229dfd2c..9b66e18751 100644
--- a/be/src/runtime/disk_io_mgr.h
+++ b/be/src/runtime/disk_io_mgr.h
@@ -244,10 +244,9 @@ public:
         // Returns the offset within the scan range that this buffer starts at
         int64_t scan_range_offset() const { return _scan_range_offset; }
 
-        // Updates this buffer buffer to be owned by the new tracker. Consumption is
+        // Updates this buffer to be owned by the new tracker. Consumption is
         // release from the current tracker and added to the new one.
-        // void SetMemTracker(MemTracker* tracker);
-        void set_mem_tracker(MemTracker* mem_tracker);
+        void set_mem_tracker(std::shared_ptr<MemTracker> tracker);
 
         // Returns the buffer to the IoMgr. This must be called for every buffer
         // returned by get_next()/read() that did not return an error. This is non-blocking.
@@ -268,8 +267,7 @@ public:
         RequestContext* _reader;
 
         // The current tracker this buffer is associated with.
-        // MemTracker* _mem_tracker;
-        MemTracker* _mem_tracker;
+        std::shared_ptr<MemTracker> _mem_tracker;
 
         // Scan range that this buffer is for.
         ScanRange* _scan_range;
@@ -548,8 +546,7 @@ public:
     ~DiskIoMgr();
 
     // Initialize the IoMgr. Must be called once before any of the other APIs.
-    // Status init(MemTracker* process_mem_tracker);
-    Status init(MemTracker* process_mem_tracker);
+    Status init(const std::shared_ptr<MemTracker>& process_mem_tracker);
 
     // Allocates tracking structure for a request context.
     // Register a new request context which is returned in *request_context.
@@ -559,10 +556,8 @@ public:
     //    used for this reader will be tracked by this. If the limit is exceeded
     //    the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
     //    get_next().
-    // Status register_context(RequestContext** request_context,
-    //         MemTracker* reader_mem_tracker = NULL);
     Status register_context(RequestContext** request_context,
-            MemTracker* reader_mem_tracker = NULL);
+                            std::shared_ptr<MemTracker> reader_mem_tracker = std::shared_ptr<MemTracker>());
 
     // Unregisters context from the disk IoMgr. This must be called for every
     // register_context() regardless of cancellation and must be called in the
@@ -704,8 +699,7 @@ private:
     ObjectPool _pool;
 
     // Process memory tracker; needed to account for io buffers.
-    // MemTracker* _process_mem_tracker;
-    MemTracker* _process_mem_tracker;
+    std::shared_ptr<MemTracker> _process_mem_tracker;
 
     // Number of worker(read) threads per disk. Also the max depth of queued
     // work to the disk.
diff --git a/be/src/runtime/disk_io_mgr_internal.h b/be/src/runtime/disk_io_mgr_internal.h
index ad212e7fe1..a3b229dead 100644
--- a/be/src/runtime/disk_io_mgr_internal.h
+++ b/be/src/runtime/disk_io_mgr_internal.h
@@ -138,8 +138,7 @@ public:
     RequestContext(DiskIoMgr* parent, int num_disks);
 
     // Resets this object.
-    // void reset(MemTracker* tracker);
-    void reset(MemTracker* tracker);
+    void reset(std::shared_ptr<MemTracker> tracker);
 
     // Decrements the number of active disks for this reader.  If the disk count
     // goes to 0, the disk complete condition variable is signaled.
@@ -196,8 +195,7 @@ private:
     DiskIoMgr* _parent;
 
     // Memory used for this reader.  This is unowned by this object.
-    // MemTracker* _mem_tracker;
-    MemTracker* _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // Total bytes read for this reader
     RuntimeProfile::Counter* _bytes_read_counter;
diff --git a/be/src/runtime/disk_io_mgr_reader_context.cc b/be/src/runtime/disk_io_mgr_reader_context.cc
index 7d34229520..6e349950a5 100644
--- a/be/src/runtime/disk_io_mgr_reader_context.cc
+++ b/be/src/runtime/disk_io_mgr_reader_context.cc
@@ -152,8 +152,7 @@ DiskIoMgr::RequestContext::RequestContext(DiskIoMgr* parent, int num_disks) :
 }
 
 // Resets this object.
-// void DiskIoMgr::RequestContext::reset(MemTracker* tracker) {
-void DiskIoMgr::RequestContext::reset(MemTracker* tracker) {
+void DiskIoMgr::RequestContext::reset(std::shared_ptr<MemTracker> tracker) {
     DCHECK_EQ(_state, Inactive);
     _status = Status::OK();
 
@@ -163,7 +162,7 @@ void DiskIoMgr::RequestContext::reset(MemTracker* tracker) {
     _disks_accessed_bitmap = NULL;
 
     _state = Active;
-    _mem_tracker = tracker;
+    _mem_tracker = std::move(tracker);
 
     _num_unstarted_scan_ranges = 0;
     _num_disks_with_ranges = 0;
diff --git a/be/src/runtime/dpp_sink.cpp b/be/src/runtime/dpp_sink.cpp
index b4aa6c63d6..74953e56c8 100644
--- a/be/src/runtime/dpp_sink.cpp
+++ b/be/src/runtime/dpp_sink.cpp
@@ -594,7 +594,7 @@ Status Translator::prepare(RuntimeState* state) {
 
     // 4. new batch for writer
     _batch_to_write.reset(
-            new RowBatch(_row_desc, state->batch_size(), state->instance_mem_tracker()));
+            new RowBatch(_row_desc, state->batch_size(), state->instance_mem_tracker().get()));
     if (_batch_to_write.get() == nullptr) {
         return Status::InternalError("No memory to allocate RowBatch.");
     }
@@ -828,7 +828,7 @@ Status Translator::process(RuntimeState* state) {
         SCOPED_TIMER(_agg_timer);
         bool eos = false;
         while (!eos) {
-            RowBatch batch(_row_desc, state->batch_size(), state->instance_mem_tracker());
+            RowBatch batch(_row_desc, state->batch_size(), state->instance_mem_tracker().get());
 
             RETURN_IF_ERROR(_sorter->get_next(&batch, &eos));
 
diff --git a/be/src/runtime/dpp_sink_internal.cpp b/be/src/runtime/dpp_sink_internal.cpp
index 2f54b5bea6..d2d5b19ed3 100644
--- a/be/src/runtime/dpp_sink_internal.cpp
+++ b/be/src/runtime/dpp_sink_internal.cpp
@@ -70,11 +70,9 @@ Status RollupSchema::from_thrift(
 }
 
 Status RollupSchema::prepare(
-        RuntimeState* state, const RowDescriptor& row_desc, MemTracker* mem_tracker) {
-    RETURN_IF_ERROR(Expr::prepare(
-            _key_ctxs, state, row_desc, mem_tracker));
-    RETURN_IF_ERROR(Expr::prepare(
-            _value_ctxs, state, row_desc, mem_tracker));
+        RuntimeState* state, const RowDescriptor& row_desc, const std::shared_ptr<MemTracker>& mem_tracker) {
+    RETURN_IF_ERROR(Expr::prepare(_key_ctxs, state, row_desc, mem_tracker));
+    RETURN_IF_ERROR(Expr::prepare(_value_ctxs, state, row_desc, mem_tracker));
     return Status::OK();
 }
 
@@ -224,11 +222,10 @@ Status PartitionInfo::from_thrift(
     return Status::OK();
 }
 
-Status PartitionInfo::prepare(
-        RuntimeState* state, const RowDescriptor& row_desc, MemTracker* mem_tracker) {
+Status PartitionInfo::prepare(RuntimeState* state, const RowDescriptor& row_desc,
+                              const std::shared_ptr<MemTracker>& mem_tracker) {
     if (_distributed_expr_ctxs.size() > 0) {
-        RETURN_IF_ERROR(Expr::prepare(
-                _distributed_expr_ctxs, state, row_desc, mem_tracker));
+        RETURN_IF_ERROR(Expr::prepare(_distributed_expr_ctxs, state, row_desc, mem_tracker));
     }
     return Status::OK();
 }
diff --git a/be/src/runtime/dpp_sink_internal.h b/be/src/runtime/dpp_sink_internal.h
index b11833e38d..af093b8b25 100644
--- a/be/src/runtime/dpp_sink_internal.h
+++ b/be/src/runtime/dpp_sink_internal.h
@@ -49,7 +49,7 @@ public:
                               const TRollupSchema& t_schema,
                               RollupSchema* schema);
 
-    Status prepare(RuntimeState* state, const RowDescriptor& row_desc, MemTracker* mem_tracker);
+    Status prepare(RuntimeState* state, const RowDescriptor& row_desc, const std::shared_ptr<MemTracker>& mem_tracker);
 
     Status open(RuntimeState* state);
 
@@ -259,7 +259,7 @@ public:
                               const TRangePartition& t_partition,
                               PartitionInfo* partition);
 
-    Status prepare(RuntimeState* state, const RowDescriptor& row_desc, MemTracker*);
+    Status prepare(RuntimeState* state, const RowDescriptor& row_desc, const std::shared_ptr<MemTracker>& mem_tracker);
 
     Status open(RuntimeState* state);
 
diff --git a/be/src/runtime/exec_env.cpp b/be/src/runtime/exec_env.cpp
index 8eea2c9afc..d22150da47 100644
--- a/be/src/runtime/exec_env.cpp
+++ b/be/src/runtime/exec_env.cpp
@@ -21,14 +21,11 @@
 
 namespace doris {
 
-ExecEnv::ExecEnv() {
-}
+ExecEnv::ExecEnv() {}
 
-ExecEnv::~ExecEnv() {
-}
+ExecEnv::~ExecEnv() {}
 
 const std::string& ExecEnv::token() const {
     return _master_info->token;
 }
-
-}
+} // namespace doris
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index 245026113e..f13dc3be26 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -59,8 +59,9 @@ class PluginMgr;
 class BackendServiceClient;
 class FrontendServiceClient;
 class TPaloBrokerServiceClient;
-class TExtDataSourceServiceClient; 
-template <class T> class ClientCache;
+class TExtDataSourceServiceClient;
+template <class T>
+class ClientCache;
 class HeartbeatFlags;
 
 // Execution environment for queries/plan fragments.
@@ -89,21 +90,25 @@ public:
     ~ExecEnv();
 
     const std::string& token() const;
-    ExternalScanContextMgr* external_scan_context_mgr() {return _external_scan_context_mgr;}
+    ExternalScanContextMgr* external_scan_context_mgr() { return _external_scan_context_mgr; }
     MetricRegistry* metrics() const { return _metrics; }
     DataStreamMgr* stream_mgr() { return _stream_mgr; }
     ResultBufferMgr* result_mgr() { return _result_mgr; }
-    ResultQueueMgr* result_queue_mgr() {return _result_queue_mgr;}
+    ResultQueueMgr* result_queue_mgr() { return _result_queue_mgr; }
     ClientCache<BackendServiceClient>* client_cache() { return _backend_client_cache; }
     ClientCache<FrontendServiceClient>* frontend_client_cache() { return _frontend_client_cache; }
     ClientCache<TPaloBrokerServiceClient>* broker_client_cache() { return _broker_client_cache; }
-    ClientCache<TExtDataSourceServiceClient>* extdatasource_client_cache() { return _extdatasource_client_cache; }
+    ClientCache<TExtDataSourceServiceClient>* extdatasource_client_cache() {
+        return _extdatasource_client_cache;
+    }
 
     // using template to simplify client cache management
-    template<typename T>
-    ClientCache<T>* get_client_cache() { return nullptr; }
+    template <typename T>
+    ClientCache<T>* get_client_cache() {
+        return nullptr;
+    }
 
-    MemTracker* process_mem_tracker() { return _mem_tracker; }
+    std::shared_ptr<MemTracker> process_mem_tracker() { return _mem_tracker; }
     PoolMemTrackerRegistry* pool_mem_trackers() { return _pool_mem_trackers; }
     ThreadResourceMgr* thread_mgr() { return _thread_mgr; }
     PriorityThreadPool* thread_pool() { return _thread_pool; }
@@ -134,15 +139,14 @@ public:
     HeartbeatFlags* heartbeat_flags() { return _heartbeat_flags; }
 
     PluginMgr* plugin_mgr() { return _plugin_mgr; }
-    
+
 private:
     Status _init(const std::vector<StorePath>& store_paths);
     void _destory();
 
     Status _init_mem_tracker();
     /// Initialise 'buffer_pool_' and 'buffer_reservation_' with given capacity.
-    void _init_buffer_pool(int64_t min_page_len,
-                           int64_t capacity, int64_t clean_pages_limit);
+    void _init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
 
 private:
     std::vector _store_paths;
@@ -156,7 +160,7 @@ private:
     ClientCache<FrontendServiceClient>* _frontend_client_cache = nullptr;
     ClientCache<TPaloBrokerServiceClient>* _broker_client_cache = nullptr;
     ClientCache<TExtDataSourceServiceClient>* _extdatasource_client_cache = nullptr;
-    MemTracker* _mem_tracker = nullptr;
+    std::shared_ptr<MemTracker> _mem_tracker;
     PoolMemTrackerRegistry* _pool_mem_trackers = nullptr;
     ThreadResourceMgr* _thread_mgr = nullptr;
     PriorityThreadPool* _thread_pool = nullptr;
@@ -184,20 +188,29 @@ private:
     RoutineLoadTaskExecutor* _routine_load_task_executor = nullptr;
     SmallFileMgr* _small_file_mgr = nullptr;
     HeartbeatFlags* _heartbeat_flags = nullptr;
-    
+
     PluginMgr* _plugin_mgr = nullptr;
 };
 
-
 template <>
-inline ClientCache<BackendServiceClient>* ExecEnv::get_client_cache<BackendServiceClient>() { return _backend_client_cache; }
+inline ClientCache<BackendServiceClient>* ExecEnv::get_client_cache<BackendServiceClient>() {
+    return _backend_client_cache;
+}
 template <>
-inline ClientCache<FrontendServiceClient>* ExecEnv::get_client_cache<FrontendServiceClient>() { return _frontend_client_cache; }
+inline ClientCache<FrontendServiceClient>* ExecEnv::get_client_cache<FrontendServiceClient>() {
+    return _frontend_client_cache;
+}
 template <>
-inline ClientCache<TPaloBrokerServiceClient>* ExecEnv::get_client_cache<TPaloBrokerServiceClient>() { return _broker_client_cache; }
+inline ClientCache<TPaloBrokerServiceClient>*
+ExecEnv::get_client_cache<TPaloBrokerServiceClient>() {
+    return _broker_client_cache;
+}
 template <>
-inline ClientCache<TExtDataSourceServiceClient>* ExecEnv::get_client_cache<TExtDataSourceServiceClient>() { return _extdatasource_client_cache; }
-
+inline ClientCache<TExtDataSourceServiceClient>*
+ExecEnv::get_client_cache<TExtDataSourceServiceClient>() {
+    return _extdatasource_client_cache;
 }
 
+} // namespace doris
+
 #endif
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index ea0b830d47..7325a8a428 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -80,7 +80,6 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths) {
     _frontend_client_cache = new FrontendServiceClientCache(config::max_client_cache_size_per_host);
     _broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host);
     _extdatasource_client_cache = new ExtDataSourceServiceClientCache(config::max_client_cache_size_per_host);
-    _mem_tracker = nullptr;
     _pool_mem_trackers = new PoolMemTrackerRegistry();
     _thread_mgr = new ThreadResourceMgr();
     _thread_pool = new PriorityThreadPool(
@@ -178,7 +177,8 @@ Status ExecEnv::_init_mem_tracker() {
         return Status::InternalError(ss.str());
     }
 
-    _mem_tracker = new MemTracker(bytes_limit);
+    _mem_tracker =
+            MemTracker::CreateTracker(bytes_limit, "ExecEnv root", MemTracker::GetRootTracker());
 
     LOG(INFO) << "Using global memory limit: " << PrettyPrinter::print(bytes_limit, TUnit::BYTES);
     RETURN_IF_ERROR(_disk_io_mgr->init(_mem_tracker));
@@ -224,7 +224,6 @@ void ExecEnv::_destory() {
     delete _thread_pool;
     delete _thread_mgr;
     delete _pool_mem_trackers;
-    delete _mem_tracker;
     delete _broker_client_cache;
     delete _extdatasource_client_cache;
     delete _frontend_client_cache;
diff --git a/be/src/runtime/export_sink.cpp b/be/src/runtime/export_sink.cpp
index 3aea0348cb..85616fc88d 100644
--- a/be/src/runtime/export_sink.cpp
+++ b/be/src/runtime/export_sink.cpp
@@ -67,10 +67,10 @@ Status ExportSink::prepare(RuntimeState* state) {
     _profile = state->obj_pool()->add(new RuntimeProfile(title.str()));
     SCOPED_TIMER(_profile->total_time_counter());
 
-    _mem_tracker.reset(new MemTracker(-1, "ExportSink", state->instance_mem_tracker()));
+    _mem_tracker = MemTracker::CreateTracker(-1, "ExportSink", state->instance_mem_tracker());
 
     // Prepare the exprs to run.
-    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker.get()));
+    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker));
 
     // TODO(lingbin): add some Counter
     _bytes_written_counter = ADD_COUNTER(profile(), "BytesExported", TUnit::BYTES);
diff --git a/be/src/runtime/export_sink.h b/be/src/runtime/export_sink.h
index 167d0a55d9..ce8e2fbdb5 100644
--- a/be/src/runtime/export_sink.h
+++ b/be/src/runtime/export_sink.h
@@ -79,7 +79,7 @@ private:
 
     RuntimeProfile* _profile;
 
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     RuntimeProfile::Counter* _bytes_written_counter;
     RuntimeProfile::Counter* _rows_written_counter;
diff --git a/be/src/runtime/initial_reservations.cc b/be/src/runtime/initial_reservations.cc
index 1a279f45cc..2daf7aa0a5 100644
--- a/be/src/runtime/initial_reservations.cc
+++ b/be/src/runtime/initial_reservations.cc
@@ -37,13 +37,12 @@ using std::numeric_limits;
 namespace doris {
 
 InitialReservations::InitialReservations(ObjectPool* obj_pool,
-    ReservationTracker* query_reservation, MemTracker* query_mem_tracker,
+    ReservationTracker* query_reservation, std::shared_ptr<MemTracker> query_mem_tracker,
     int64_t initial_reservation_total_claims)
-  : initial_reservation_mem_tracker_(obj_pool->add(
-      new MemTracker(-1, "Unclaimed reservations", query_mem_tracker, false, false))),
+  : initial_reservation_mem_tracker_(MemTracker::CreateTracker(-1, "Unclaimed reservations", query_mem_tracker, false)),
       remaining_initial_reservation_claims_(initial_reservation_total_claims) {
   initial_reservations_.InitChildTracker(nullptr, query_reservation,
-      initial_reservation_mem_tracker_, numeric_limits<int64_t>::max());
+      initial_reservation_mem_tracker_.get(), numeric_limits<int64_t>::max());
 }
 
 Status InitialReservations::Init(
@@ -85,6 +84,7 @@ void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) {
 
 void InitialReservations::ReleaseResources() {
   initial_reservations_.Close();
-  initial_reservation_mem_tracker_->close();
+  // TODO(HW): Close() is private. make this tracker shared later
+  // initial_reservation_mem_tracker_->Close();
 }
 }
diff --git a/be/src/runtime/initial_reservations.h b/be/src/runtime/initial_reservations.h
index 0863e5d445..ad69a33998 100644
--- a/be/src/runtime/initial_reservations.h
+++ b/be/src/runtime/initial_reservations.h
@@ -42,7 +42,7 @@ class InitialReservations {
   /// claimed over the lifetime of the query. The total bytes claimed via Claim()
   /// cannot exceed this. Allocated objects are stored in 'obj_pool'.
   InitialReservations(ObjectPool* obj_pool, ReservationTracker* query_reservation,
-      MemTracker* query_mem_tracker, int64_t initial_reservation_total_claims);
+      std::shared_ptr<MemTracker> query_mem_tracker, int64_t initial_reservation_total_claims);
 
   /// Initialize the query's pool of initial reservations by acquiring the minimum
   /// reservation required for the query on this host. Fails if the reservation could
@@ -70,7 +70,7 @@ class InitialReservations {
   // Return() returns reservations to.
   ReservationTracker initial_reservations_;
 
-  MemTracker* const initial_reservation_mem_tracker_;
+  std::shared_ptr<MemTracker> const initial_reservation_mem_tracker_;
 
   /// The total bytes of additional reservations that we expect to be claimed.
   /// initial_reservations_->GetReservation() <= remaining_initial_reservation_claims_.
diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp
index 44f1815093..72ace23577 100644
--- a/be/src/runtime/load_channel.cpp
+++ b/be/src/runtime/load_channel.cpp
@@ -24,9 +24,9 @@
 namespace doris {
 
 LoadChannel::LoadChannel(const UniqueId& load_id, int64_t mem_limit,
-                         int64_t timeout_s, MemTracker* mem_tracker) :
+                         int64_t timeout_s, const std::shared_ptr<MemTracker>& mem_tracker) :
         _load_id(load_id), _timeout_s(timeout_s) {
-    _mem_tracker.reset(new MemTracker(mem_limit, _load_id.to_string(), mem_tracker));
+    _mem_tracker = MemTracker::CreateTracker(mem_limit, _load_id.to_string(), mem_tracker);
     // _last_updated_time should be set before being inserted to
     // _load_channels in load_channel_mgr, or it may be erased
     // immediately by gc thread.
@@ -50,7 +50,7 @@ Status LoadChannel::open(const PTabletWriterOpenRequest& params) {
         } else {
             // create a new tablets channel
             TabletsChannelKey key(params.id(), index_id);
-            channel.reset(new TabletsChannel(key, _mem_tracker.get()));
+            channel.reset(new TabletsChannel(key, _mem_tracker));
             _tablets_channels.insert({index_id, channel});
         }
     }
diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h
index 50d67d9715..8bea5954bf 100644
--- a/be/src/runtime/load_channel.h
+++ b/be/src/runtime/load_channel.h
@@ -39,7 +39,7 @@ class TabletsChannel;
 class LoadChannel {
 public:
     LoadChannel(const UniqueId& load_id, int64_t mem_limit,
-                int64_t timeout_s, MemTracker* mem_tracker);
+                int64_t timeout_s, const std::shared_ptr<MemTracker>& mem_tracker);
     ~LoadChannel();
 
     // open a new load channel if not exist
@@ -75,7 +75,7 @@ private:
 
     UniqueId _load_id;
     // Tracks the total memory comsupted by current load job on this BE
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // lock protect the tablets channel map
     std::mutex _lock;
diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp
index b9a6eca045..f0237ad777 100644
--- a/be/src/runtime/load_channel_mgr.cpp
+++ b/be/src/runtime/load_channel_mgr.cpp
@@ -79,7 +79,7 @@ LoadChannelMgr::~LoadChannelMgr() {
 
 Status LoadChannelMgr::init(int64_t process_mem_limit) {
     int64_t load_mem_limit = calc_process_max_load_memory(process_mem_limit);
-    _mem_tracker.reset(new MemTracker(load_mem_limit, "load channel mgr"));
+    _mem_tracker = MemTracker::CreateTracker(load_mem_limit, "load channel mgr");
     RETURN_IF_ERROR(_start_bg_worker());
     return Status::OK();
 }
@@ -103,7 +103,7 @@ Status LoadChannelMgr::open(const PTabletWriterOpenRequest& params) {
             int64_t job_timeout_s = calc_job_timeout_s(timeout_in_req_s);
 
             channel.reset(new LoadChannel(load_id, job_max_memory,
-                                          job_timeout_s, _mem_tracker.get()));
+                                          job_timeout_s, _mem_tracker));
             _load_channels.insert({load_id, channel});
         }
     }
diff --git a/be/src/runtime/load_channel_mgr.h b/be/src/runtime/load_channel_mgr.h
index 0f9fb46ddc..9af8435618 100644
--- a/be/src/runtime/load_channel_mgr.h
+++ b/be/src/runtime/load_channel_mgr.h
@@ -70,7 +70,7 @@ private:
     Cache* _lastest_success_channel = nullptr;
 
     // check the total load mem consumption of this Backend
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
 
     // thread to clean timeout load channels
     std::thread _load_channels_clean_thread;
diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp
index 37ca1970c9..4232cde200 100644
--- a/be/src/runtime/mem_pool.cpp
+++ b/be/src/runtime/mem_pool.cpp
@@ -49,7 +49,7 @@ MemPool::~MemPool() {
         total_bytes_released += chunk.chunk.size;
         ChunkAllocator::instance()->free(chunk.chunk);
     }
-    mem_tracker_->release(total_bytes_released);
+    mem_tracker_->Release(total_bytes_released);
     DorisMetrics::instance()->memory_pool_bytes_total.increment(-total_bytes_released);
 }
 
@@ -75,7 +75,7 @@ void MemPool::free_all() {
     total_allocated_bytes_ = 0;
     total_reserved_bytes_ = 0;
 
-    mem_tracker_->release(total_bytes_released);
+    mem_tracker_->Release(total_bytes_released);
     DorisMetrics::instance()->memory_pool_bytes_total.increment(-total_bytes_released);
 }
 
@@ -119,15 +119,15 @@ bool MemPool::find_chunk(size_t min_size, bool check_limits) {
 
     chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size);
     if (check_limits) {
-        if (!mem_tracker_->try_consume(chunk_size)) return false;
+        if (!mem_tracker_->TryConsume(chunk_size)) return false;
     } else {
-        mem_tracker_->consume(chunk_size);
+        mem_tracker_->Consume(chunk_size);
     }
 
     // Allocate a new chunk. Return early if allocate fails.
     Chunk chunk;
     if (!ChunkAllocator::instance()->allocate(chunk_size, &chunk)) {
-        mem_tracker_->release(chunk_size);
+        mem_tracker_->Release(chunk_size);
         return false;
     }
     ASAN_POISON_MEMORY_REGION(chunk.data, chunk_size);
@@ -174,8 +174,8 @@ void MemPool::acquire_data(MemPool* src, bool keep_current) {
 
     // Skip unnecessary atomic ops if the mem_trackers are the same.
     if (src->mem_tracker_ != mem_tracker_) {
-        src->mem_tracker_->release(total_transfered_bytes);
-        mem_tracker_->consume(total_transfered_bytes);
+        src->mem_tracker_->Release(total_transfered_bytes);
+        mem_tracker_->Consume(total_transfered_bytes);
     }
 
     // insert new chunks after current_chunk_idx_
@@ -213,8 +213,8 @@ void MemPool::exchange_data(MemPool* other) {
     std::swap(chunks_, other->chunks_);
 
     // update MemTracker
-    mem_tracker_->consume(delta_size);
-    other->mem_tracker_->release(delta_size);
+    mem_tracker_->Consume(delta_size);
+    other->mem_tracker_->Release(delta_size);
 }
 
 string MemPool::debug_string() {
diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp
index 0f0a0c0667..5e3c90b8be 100644
--- a/be/src/runtime/mem_tracker.cpp
+++ b/be/src/runtime/mem_tracker.cpp
@@ -20,185 +20,307 @@
 #include 
 #include 
 #include 
-//#include 
-//#include 
-//include 
-
-#include "exec/exec_node.h"
-#include "gutil/strings/substitute.h"
-#include "runtime/exec_env.h"
-#include "runtime/runtime_state.h"
-#include "util/debug_util.h"
-#include "util/doris_metrics.h"
-#include "util/mem_info.h"
-#include "util/pretty_printer.h"
-#include "util/stack_util.h"
-#include "util/uid_util.h"
-
-//using std::shared_ptr;
-//using std::weak_ptr;
-//using std::lexical_cast;
 #include 
 
+#include "exec/exec_node.h"
+#include "gutil/once.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/bufferpool/reservation_tracker_counters.h"
+#include "runtime/exec_env.h"
+#include "runtime/runtime_state.h"
+#include "service/backend_options.h"
+#include "util/debug_util.h"
+#include "util/doris_metrics.h"
+#include "util/debug_util.h"
+#include "util/mem_info.h"
+#include "util/pretty_printer.h"
+#include "util/uid_util.h"
+#include "util/stack_util.h"
+
+using boost::join;
+using std::deque;
+using std::endl;
+using std::greater;
+using std::list;
+using std::pair;
+using std::priority_queue;
+using std::shared_ptr;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+using std::weak_ptr;
+using strings::Substitute;
 
 namespace doris {
 
-const std::string MemTracker::COUNTER_NAME = "PeakMemoryUsage";
+const string MemTracker::COUNTER_NAME = "PeakMemoryUsage";
 
 // Name for request pool MemTrackers. '$0' is replaced with the pool name.
-const std::string REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT = "RequestPool=$0";
+const string REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT = "RequestPool=$0";
+
+/// Calculate the soft limit for a MemTracker based on the hard limit 'limit'.
+static int64_t CalcSoftLimit(int64_t limit) {
+  if (limit < 0) return -1;
+  double frac = std::max(0.0, std::min(1.0, config::soft_mem_limit_frac));
+  return static_cast<int64_t>(limit * frac);
+}
+
+// The ancestor for all trackers. Every tracker is visible from the root down.
+static std::shared_ptr<MemTracker> root_tracker;
+static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT;
+
+void MemTracker::CreateRootTracker() {
+  root_tracker.reset(new MemTracker(-1, "root", std::shared_ptr<MemTracker>()));
+  root_tracker->Init();
+}
+
+std::shared_ptr<MemTracker> MemTracker::CreateTracker(
+    int64_t byte_limit,
+    const std::string& label,
+    std::shared_ptr<MemTracker> parent,
+    bool log_usage_if_zero) {
+  shared_ptr<MemTracker> real_parent;
+  if (parent) {
+      real_parent = std::move(parent);
+  } else {
+      real_parent = GetRootTracker();
+  }
+  shared_ptr<MemTracker> tracker(new MemTracker(byte_limit, label, real_parent, log_usage_if_zero));
+  real_parent->AddChildTracker(tracker);
+  tracker->Init();
+
+  return tracker;
+}
+
+std::shared_ptr<MemTracker> MemTracker::CreateTracker(
+    RuntimeProfile* profile, int64_t byte_limit,
+    const std::string& label,
+    const std::shared_ptr<MemTracker>& parent) {
+  shared_ptr<MemTracker> real_parent;
+  if (parent) {
+      real_parent = std::move(parent);
+  } else {
+      real_parent = GetRootTracker();
+  }
+  shared_ptr<MemTracker> tracker(new MemTracker(profile, byte_limit, label, real_parent));
+  real_parent->AddChildTracker(tracker);
+  tracker->Init();
+
+  return tracker;
+}
 
 MemTracker::MemTracker(
-        int64_t byte_limit, const std::string& label, MemTracker* parent, bool auto_unregister, bool log_usage_if_zero)
-    : _limit(byte_limit),
-    _label(label),
-    _parent(parent),
-    _consumption(&_local_counter),
-    _local_counter(TUnit::BYTES),
-    _consumption_metric(NULL),
-    _log_usage_if_zero(log_usage_if_zero),
-    _num_gcs_metric(NULL),
-    _bytes_freed_by_last_gc_metric(NULL),
-    _bytes_over_limit_metric(NULL),
-    _limit_metric(NULL),
-    _auto_unregister(auto_unregister) {
-        if (parent != NULL) _parent->add_child_tracker(this);
-        Init();
-    }
+    int64_t byte_limit, const string& label, const std::shared_ptr<MemTracker>& parent, bool log_usage_if_zero)
+  : limit_(byte_limit),
+    soft_limit_(CalcSoftLimit(byte_limit)),
+    label_(label),
+    parent_(parent),
+    consumption_(std::make_shared<RuntimeProfile::HighWaterMarkCounter>(TUnit::BYTES)),
+    consumption_metric_(nullptr),
+    log_usage_if_zero_(log_usage_if_zero),
+    num_gcs_metric_(nullptr),
+    bytes_freed_by_last_gc_metric_(nullptr),
+    bytes_over_limit_metric_(nullptr),
+    limit_metric_(nullptr) {
+}
 
 MemTracker::MemTracker(RuntimeProfile* profile, int64_t byte_limit,
-        const std::string& label, MemTracker* parent)
-    : _limit(byte_limit),
-    _label(label),
-    _parent(parent),
-    _consumption(profile->AddHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES)),
-    _local_counter(TUnit::BYTES),
-    _consumption_metric(NULL),
-    _log_usage_if_zero(true),
-    _num_gcs_metric(NULL),
-    _bytes_freed_by_last_gc_metric(NULL),
-    _bytes_over_limit_metric(NULL),
-    _limit_metric(NULL) {
-        if (parent != NULL) _parent->add_child_tracker(this);
-        Init();
-    }
+    const std::string& label, const std::shared_ptr<MemTracker>& parent)
+  : limit_(byte_limit),
+    soft_limit_(CalcSoftLimit(byte_limit)),
+    label_(label),
+    parent_(parent),
+    consumption_(profile->AddSharedHighWaterMarkCounter(COUNTER_NAME, TUnit::BYTES)),
+    consumption_metric_(nullptr),
+    log_usage_if_zero_(true),
+    num_gcs_metric_(nullptr),
+    bytes_freed_by_last_gc_metric_(nullptr),
+    bytes_over_limit_metric_(nullptr),
+    limit_metric_(nullptr) {
+}
 
-MemTracker::MemTracker(
-        UIntGauge* consumption_metric, int64_t byte_limit, const std::string& label)
-    : _limit(byte_limit),
-    _label(label),
-    _parent(NULL),
-    _consumption(&_local_counter),
-    _local_counter(TUnit::BYTES),
-    _consumption_metric(consumption_metric),
-    _log_usage_if_zero(true),
-    _num_gcs_metric(NULL),
-    _bytes_freed_by_last_gc_metric(NULL),
-    _bytes_over_limit_metric(NULL),
-    _limit_metric(NULL) {
-        Init();
-    }
+MemTracker::MemTracker(IntGauge* consumption_metric,
+    int64_t byte_limit, const string& label, const std::shared_ptr<MemTracker>& parent)
+  : limit_(byte_limit),
+    soft_limit_(CalcSoftLimit(byte_limit)),
+    label_(label),
+    parent_(parent),
+    consumption_(std::make_shared<RuntimeProfile::HighWaterMarkCounter>(TUnit::BYTES)),
+    consumption_metric_(consumption_metric),
+    log_usage_if_zero_(true),
+    num_gcs_metric_(nullptr),
+    bytes_freed_by_last_gc_metric_(nullptr),
+    bytes_over_limit_metric_(nullptr),
+    limit_metric_(nullptr) {
+}
 
 void MemTracker::Init() {
-    DCHECK_GE(_limit, -1);
-    // populate _all_trackers and _limit_trackers
-    MemTracker* tracker = this;
-    while (tracker != NULL) {
-        _all_trackers.push_back(tracker);
-        if (tracker->has_limit()) _limit_trackers.push_back(tracker);
-        tracker = tracker->_parent;
-    }
-    DCHECK_GT(_all_trackers.size(), 0);
-    DCHECK_EQ(_all_trackers[0], this);
+  DCHECK_GE(limit_, -1);
+  DCHECK_LE(soft_limit_, limit_);
+  // populate all_trackers_ and limit_trackers_
+  MemTracker* tracker = this;
+  while (tracker != nullptr) {
+    all_trackers_.push_back(tracker);
+    if (tracker->has_limit()) limit_trackers_.push_back(tracker);
+    tracker = tracker->parent_.get();
+  }
+  DCHECK_GT(all_trackers_.size(), 0);
+  DCHECK_EQ(all_trackers_[0], this);
 }
 
-// TODO chenhao , set MemTracker close state
-void MemTracker::close() {}
-
-void MemTracker::enable_reservation_reporting(const ReservationTrackerCounters& counters) {
-    ReservationTrackerCounters* new_counters = new ReservationTrackerCounters(counters);
-    _reservation_counters.store(new_counters);
+void MemTracker::AddChildTracker(const std::shared_ptr& tracker) {
+  lock_guard<SpinLock> l(child_trackers_lock_);
+  tracker->child_tracker_it_ = child_trackers_.insert(child_trackers_.end(), tracker);
 }
 
-int64_t MemTracker::GetPoolMemReserved() const {
-    // Pool trackers should have a _pool_name and no limit.
-    DCHECK(!_pool_name.empty());
-    DCHECK_EQ(_limit, -1) << LogUsage(UNLIMITED_DEPTH);
+void MemTracker::EnableReservationReporting(const ReservationTrackerCounters& counters) {
+  delete reservation_counters_.swap(new ReservationTrackerCounters(counters));
+}
 
-    int64_t mem_reserved = 0L;
-    std::lock_guard<SpinLock> l(_child_trackers_lock);
-    for (MemTracker* child : _child_trackers) {
-        int64_t child_limit = child->limit();
-        if (child_limit > 0) {
-            // Make sure we don't overflow if the query limits are set to ridiculous values.
-            mem_reserved += std::min(child_limit, MemInfo::physical_mem());
-        } else {
-            DCHECK_EQ(child_limit, -1) << child->LogUsage(UNLIMITED_DEPTH);
-            mem_reserved += child->consumption();
-        }
+int64_t MemTracker::GetLowestLimit(MemLimit mode) const {
+  if (limit_trackers_.empty()) return -1;
+  int64_t min_limit = numeric_limits<int64_t>::max();
+  for (MemTracker* limit_tracker : limit_trackers_) {
+    DCHECK(limit_tracker->has_limit());
+    min_limit = std::min(min_limit, limit_tracker->GetLimit(mode));
+  }
+  return min_limit;
+}
+
+int64_t MemTracker::SpareCapacity(MemLimit mode) const {
+  int64_t result = std::numeric_limits<int64_t>::max();
+  for (const auto& tracker : limit_trackers_) {
+    int64_t mem_left = tracker->GetLimit(mode) - tracker->consumption();
+    result = std::min(result, mem_left);
+  }
+  return result;
+}
+
+void MemTracker::RefreshConsumptionFromMetric() {
+  DCHECK(consumption_metric_ != nullptr);
+  consumption_->set(consumption_metric_->value());
+}
+
+int64_t MemTracker::GetPoolMemReserved() {
+  // Pool trackers should have a pool_name_ and no limit.
+  DCHECK(!pool_name_.empty());
+  DCHECK_EQ(limit_, -1) << LogUsage(UNLIMITED_DEPTH);
+
+  // Use cache to avoid holding child_trackers_lock_
+  list<weak_ptr<MemTracker>> children;
+  {
+    lock_guard<SpinLock> l(child_trackers_lock_);
+    children = child_trackers_;
+  }
+
+  int64_t mem_reserved = 0L;
+  for (const auto& child_weak : children) {
+    std::shared_ptr<MemTracker> child = child_weak.lock();
+    if (child) {
+      int64_t child_limit = child->limit();
+      bool query_exec_finished = child->query_exec_finished_.load() != 0;
+      if (child_limit > 0 && !query_exec_finished) {
+        // Make sure we don't overflow if the query limits are set to ridiculous values.
+        mem_reserved += std::min(child_limit, MemInfo::physical_mem());
+      } else {
+        DCHECK(query_exec_finished || child_limit == -1)
+            << child->LogUsage(UNLIMITED_DEPTH);
+        mem_reserved += child->consumption();
+      }
     }
-    return mem_reserved;
+  }
+  return mem_reserved;
 }
 
 MemTracker* PoolMemTrackerRegistry::GetRequestPoolMemTracker(
-        const std::string& pool_name, bool create_if_not_present) {
-    DCHECK(!pool_name.empty());
-    std::lock_guard<SpinLock> l(_pool_to_mem_trackers_lock);
-    PoolTrackersMap::iterator it = _pool_to_mem_trackers.find(pool_name);
-    if (it != _pool_to_mem_trackers.end()) {
-        MemTracker* tracker = it->second.get();
-        DCHECK(pool_name == tracker->_pool_name);
-        return tracker;
-    }
-    if (!create_if_not_present) return nullptr;
-    // First time this pool_name registered, make a new object.
-    MemTracker* tracker =
-        new MemTracker(-1, strings::Substitute(REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT, pool_name),
-                ExecEnv::GetInstance()->process_mem_tracker());
-    tracker->_pool_name = pool_name;
-    _pool_to_mem_trackers.emplace(pool_name, std::unique_ptr<MemTracker>(tracker));
+    const string& pool_name, bool create_if_not_present) {
+  DCHECK(!pool_name.empty());
+  lock_guard<SpinLock> l(pool_to_mem_trackers_lock_);
+  PoolTrackersMap::iterator it = pool_to_mem_trackers_.find(pool_name);
+  if (it != pool_to_mem_trackers_.end()) {
+    MemTracker* tracker = it->second.get();
+    DCHECK(pool_name == tracker->pool_name_);
     return tracker;
-}
-
-MemTracker* MemTracker::CreateQueryMemTracker(const TUniqueId& id,
-        const TQueryOptions& query_options, const std::string& pool_name, ObjectPool* obj_pool) {
-    int64_t byte_limit = -1;
-    if (query_options.__isset.mem_limit && query_options.mem_limit > 0) {
-        byte_limit = query_options.mem_limit;
-    }
-    if (byte_limit != -1) {
-        if (byte_limit > MemInfo::physical_mem()) {
-            LOG(WARNING) << "Memory limit " << PrettyPrinter::print(byte_limit, TUnit::BYTES)
-                << " exceeds physical memory of "
-                << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES);
-        }
-        VLOG_QUERY << "Using query memory limit: "
-            << PrettyPrinter::print(byte_limit, TUnit::BYTES);
-    }
-
-    MemTracker* pool_tracker =
-        ExecEnv::GetInstance()->pool_mem_trackers()->GetRequestPoolMemTracker(
-                pool_name, true);
-    return pool_tracker;
+  }
+  if (!create_if_not_present) return nullptr;
+  // First time this pool_name registered, make a new object.
+  MemTracker* tracker =
+      new MemTracker(-1, Substitute(REQUEST_POOL_MEM_TRACKER_LABEL_FORMAT, pool_name),
+          ExecEnv::GetInstance()->process_mem_tracker());
+  tracker->pool_name_ = pool_name;
+  pool_to_mem_trackers_.emplace(pool_name, unique_ptr<MemTracker>(tracker));
+  return tracker;
 }
 
 MemTracker::~MemTracker() {
-    int64_t remaining_bytes = consumption();
-    // work around some scenario where consume() is not paired with release()
-    // e.g., in the initialization of hll and bitmap aggregator (see aggregate_func.h)
-    // TODO(gaodayue) should be replaced with `DCHECK_EQ(consumption(), 0);` when
-    // we fixed thoses invalid usages
-    if (remaining_bytes > 0) {
-        for (auto tracker : _all_trackers) {
-            tracker->_consumption->add(-remaining_bytes);
-        }
-    }
-    delete _reservation_counters.load();
+  delete reservation_counters_.load();
 
-    if (_auto_unregister && parent()) {
-        unregister_from_parent();
+  if (parent()) {
+    DCHECK(consumption() == 0) << "Memory tracker " << debug_string()
+                               << " has unreleased consumption " << consumption();
+    parent_->Release(consumption());
+
+    lock_guard l(parent_->child_trackers_lock_);
+    if (child_tracker_it_ != parent_->child_trackers_.end()) {
+      parent_->child_trackers_.erase(child_tracker_it_);
+      child_tracker_it_ = parent_->child_trackers_.end();
     }
+  }
+}
+
+void MemTracker::ListTrackers(vector<shared_ptr<MemTracker>>* trackers) {
+  trackers->clear();
+  deque<shared_ptr<MemTracker>> to_process;
+  to_process.push_front(GetRootTracker());
+  while (!to_process.empty()) {
+    shared_ptr<MemTracker> t = to_process.back();
+    to_process.pop_back();
+
+    trackers->push_back(t);
+    list<weak_ptr<MemTracker>> children;
+    {
+      lock_guard<SpinLock> l(t->child_trackers_lock_);
+      children = t->child_trackers_;
+    }
+    for (const auto& child_weak : children) {
+      shared_ptr<MemTracker> child = child_weak.lock();
+      if (child) {
+        to_process.emplace_back(std::move(child));
+      }
+    }
+  }
+}
+
+//void MemTracker::RegisterMetrics(MetricGroup* metrics, const string& prefix) {
+//  num_gcs_metric_ = metrics->AddCounter(Substitute("$0.num-gcs", prefix), 0);
+//
+//  // TODO: Consider a total amount of bytes freed counter
+//  bytes_freed_by_last_gc_metric_ = metrics->AddGauge(
+//      Substitute("$0.bytes-freed-by-last-gc", prefix), -1);
+//
+//  bytes_over_limit_metric_ = metrics->AddGauge(
+//      Substitute("$0.bytes-over-limit", prefix), -1);
+//
+//  limit_metric_ = metrics->AddGauge(Substitute("$0.limit", prefix), limit_);
+//}
+
+void MemTracker::TransferTo(MemTracker* dst, int64_t bytes) {
+  DCHECK_EQ(all_trackers_.back(), dst->all_trackers_.back())
+      << "Must have same root";
+  // Find the common ancestor and update trackers between 'this'/'dst' and
+  // the common ancestor. This logic handles all cases, including the
+  // two trackers being the same or being ancestors of each other because
+  // 'all_trackers_' includes the current tracker.
+  int ancestor_idx = all_trackers_.size() - 1;
+  int dst_ancestor_idx = dst->all_trackers_.size() - 1;
+  while (ancestor_idx > 0 && dst_ancestor_idx > 0
+      && all_trackers_[ancestor_idx - 1] == dst->all_trackers_[dst_ancestor_idx - 1]) {
+    --ancestor_idx;
+    --dst_ancestor_idx;
+  }
+  MemTracker* common_ancestor = all_trackers_[ancestor_idx];
+  ReleaseLocal(bytes, common_ancestor);
+  dst->ConsumeLocal(bytes, common_ancestor);
 }
 
 // Calling this on the query tracker results in output like:
@@ -219,154 +341,226 @@ MemTracker::~MemTracker() {
 //      DataStreamSender (dst_id=4): Total=680.00 B Peak=680.00 B
 //
 // If 'reservation_metrics_' are set, we ge a more granular breakdown:
-//   TrackerName: Limit=5.00 MB BufferPoolUsed/Reservation=0/5.00 MB OtherMemory=1.04 MB
+//   TrackerName: Limit=5.00 MB Reservation=5.00 MB OtherMemory=1.04 MB
 //                Total=6.04 MB Peak=6.45 MB
 //
-std::string MemTracker::LogUsage(int max_recursive_depth, const std::string& prefix,
-                                 int64_t* logged_consumption) const {
-    int64_t curr_consumption = consumption();
-    int64_t peak_consumption = _consumption->value();
-    if (logged_consumption != nullptr) *logged_consumption = curr_consumption;
+string MemTracker::LogUsage(int max_recursive_depth, const string& prefix,
+    int64_t* logged_consumption) {
+  // Make sure the consumption is up to date.
+  if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric();
+  int64_t curr_consumption = consumption();
+  int64_t peak_consumption = consumption_->value();
+  if (logged_consumption != nullptr) *logged_consumption = curr_consumption;
 
-    if (!_log_usage_if_zero && curr_consumption == 0) return "";
+  if (!log_usage_if_zero_ && curr_consumption == 0) return "";
 
-    std::stringstream ss;
-    ss << prefix << _label << ":";
-    //if (CheckLimitExceeded()) ss << " memory limit exceeded.";
-    if (limit_exceeded()) ss << " memory limit exceeded.";
-    if (_limit > 0) ss << " Limit=" << PrettyPrinter::print(_limit, TUnit::BYTES);
+  stringstream ss;
+  ss << prefix << label_ << ":";
+  if (CheckLimitExceeded(MemLimit::HARD)) ss << " memory limit exceeded.";
+  if (limit_ > 0) ss << " Limit=" << PrettyPrinter::print(limit_, TUnit::BYTES);
 
-    ReservationTrackerCounters* reservation_counters = _reservation_counters.load();
-    if (reservation_counters != nullptr) {
-        int64_t reservation = reservation_counters->peak_reservation->current_value();
-        int64_t used_reservation = reservation_counters->peak_used_reservation->current_value();
-        int64_t reservation_limit = 0;
-        //TODO chenhao, reservation_limit is null when ReservationTracker
-        // does't have reservation limit
-        if (reservation_counters->reservation_limit != nullptr) {
-            reservation_limit = reservation_counters->reservation_limit->value();
-        }
-        ss << " BufferPoolUsed/Reservation=" << PrettyPrinter::print(used_reservation, TUnit::BYTES)
-           << "/" << PrettyPrinter::print(reservation, TUnit::BYTES);
-        if (reservation_limit != std::numeric_limits<int64_t>::max()) {
-            ss << " BufferPoolLimit=" << PrettyPrinter::print(reservation_limit, TUnit::BYTES);
-        }
-        ss << " OtherMemory=" << PrettyPrinter::print(curr_consumption - reservation, TUnit::BYTES);
+  ReservationTrackerCounters* reservation_counters = reservation_counters_.load();
+  if (reservation_counters != nullptr) {
+    int64_t reservation = reservation_counters->peak_reservation->current_value();
+    ss << " Reservation=" << PrettyPrinter::print(reservation, TUnit::BYTES);
+    if (reservation_counters->reservation_limit != nullptr) {
+      int64_t limit = reservation_counters->reservation_limit->value();
+      ss << " ReservationLimit=" << PrettyPrinter::print(limit, TUnit::BYTES);
     }
-    ss << " Total=" << PrettyPrinter::print(curr_consumption, TUnit::BYTES)
-       << " Peak=" << PrettyPrinter::print(peak_consumption, TUnit::BYTES);
+    ss << " OtherMemory="
+       << PrettyPrinter::print(curr_consumption - reservation, TUnit::BYTES);
+  }
+  ss << " Total=" << PrettyPrinter::print(curr_consumption, TUnit::BYTES);
+  // Peak consumption is not accurate if the metric is lazily updated (i.e.
+  // this is a non-root tracker that exists only for reporting purposes).
+  // Only report peak consumption if we actually call Consume()/Release() on
+  // this tracker or an descendent.
+  if (consumption_metric_ == nullptr || parent_ == nullptr) {
+    ss << " Peak=" << PrettyPrinter::print(peak_consumption, TUnit::BYTES);
+  }
 
-    // This call does not need the children, so return early.
-    if (max_recursive_depth == 0) return ss.str();
+  // This call does not need the children, so return early.
+  if (max_recursive_depth == 0) return ss.str();
 
-    std::string new_prefix = strings::Substitute("  $0", prefix);
-    int64_t child_consumption;
-    std::string child_trackers_usage;
-    {
-        std::lock_guard<SpinLock> l(_child_trackers_lock);
-        child_trackers_usage =
-                LogUsage(max_recursive_depth - 1, new_prefix, _child_trackers, &child_consumption);
-    }
-    if (!child_trackers_usage.empty()) ss << "\n" << child_trackers_usage;
+  // Recurse and get information about the children
+  string new_prefix = Substitute("  $0", prefix);
+  int64_t child_consumption;
+  string child_trackers_usage;
+  list<weak_ptr<MemTracker>> children;
+  {
+    lock_guard<SpinLock> l(child_trackers_lock_);
+    children = child_trackers_;
+  }
+  child_trackers_usage = LogUsage(max_recursive_depth - 1, new_prefix, children, &child_consumption);
+  if (!child_trackers_usage.empty()) ss << "\n" << child_trackers_usage;
 
-    if (_consumption_metric != nullptr) {
-        // Log the difference between the metric value and children as "untracked" memory so
-        // that the values always add up. This value is not always completely accurate because
-        // we did not necessarily get a consistent snapshot of the consumption values for all
-        // children at a single moment in time, but is good enough for our purposes.
-        int64_t untracked_bytes = curr_consumption - child_consumption;
-        ss << "\n" << new_prefix << "Untracked Memory: Total=";
-        ss << "\n"
-           << new_prefix
-           << "Untracked Memory: Total=" << PrettyPrinter::print(untracked_bytes, TUnit::BYTES);
-    }
-
-    return ss.str();
+  if (parent_ == nullptr) {
+    // Log the difference between the metric value and children as "untracked" memory so
+    // that the values always add up. This value is not always completely accurate because
+    // we did not necessarily get a consistent snapshot of the consumption values for all
+    // children at a single moment in time, but is good enough for our purposes.
+    int64_t untracked_bytes = curr_consumption - child_consumption;
+    ss << "\n"
+       << new_prefix << "Untracked Memory: Total="
+       << PrettyPrinter::print(untracked_bytes, TUnit::BYTES);
+  }
+  return ss.str();
 }
 
-std::string MemTracker::LogUsage(int max_recursive_depth, const std::string& prefix,
-                                 const std::list<MemTracker*>& trackers,
-                                 int64_t* logged_consumption) {
-    *logged_consumption = 0;
-    std::vector<std::string> usage_strings;
-    for (MemTracker* tracker : trackers) {
-        int64_t tracker_consumption;
-        std::string usage_string =
-                tracker->LogUsage(max_recursive_depth, prefix, &tracker_consumption);
-        if (!usage_string.empty()) usage_strings.push_back(usage_string);
-        *logged_consumption += tracker_consumption;
+string MemTracker::LogUsage(int max_recursive_depth, const string& prefix,
+    const list<weak_ptr<MemTracker>>& trackers, int64_t* logged_consumption) {
+  *logged_consumption = 0;
+  vector<string> usage_strings;
+  for (const auto& tracker_weak : trackers) {
+    shared_ptr<MemTracker> tracker = tracker_weak.lock();
+    if (tracker) {
+      int64_t tracker_consumption;
+      string usage_string = tracker->LogUsage(max_recursive_depth, prefix,
+          &tracker_consumption);
+      if (!usage_string.empty()) usage_strings.push_back(usage_string);
+      *logged_consumption += tracker_consumption;
     }
-    return boost::join(usage_strings, "\n");
+  }
+  return join(usage_strings, "\n");
 }
 
-Status MemTracker::MemLimitExceeded(RuntimeState* state, const std::string& details,
-        int64_t failed_allocation_size) {
-    DCHECK_GE(failed_allocation_size, 0);
-    std::stringstream ss;
-    if (details.size() != 0) ss << details << std::endl;
-    if (failed_allocation_size != 0) {
-        ss << label() << " could not allocate "
-            << PrettyPrinter::print(failed_allocation_size, TUnit::BYTES)
-            << " without exceeding limit." << std::endl;
-    }
-    //ss << "Error occurred on backend " << GetBackendString();
-    if (state != nullptr) ss << " by fragment " << state->fragment_instance_id();
-    ss << std::endl;
-    ExecEnv* exec_env = ExecEnv::GetInstance();
-    //ExecEnv* exec_env = nullptr;
-    MemTracker* process_tracker = exec_env->process_mem_tracker();
-    const int64_t process_capacity = process_tracker->spare_capacity();
-    ss << "Memory left in process limit: "
-        << PrettyPrinter::print(process_capacity, TUnit::BYTES) << std::endl;
+string MemTracker::LogTopNQueries(int limit) {
+  if (limit == 0) return "";
+  if (this->is_query_mem_tracker_) return LogUsage(0);
+  priority_queue<pair<int64_t, string>, vector<pair<int64_t, string>>,
+      std::greater<pair<int64_t, string>>>
+      min_pq;
+  GetTopNQueries(min_pq, limit);
+  vector<string> usage_strings(min_pq.size());
+  while (!min_pq.empty()) {
+    usage_strings.push_back(min_pq.top().second);
+    min_pq.pop();
+  }
+  std::reverse(usage_strings.begin(), usage_strings.end());
+  return join(usage_strings, "\n");
+}
 
-    // Choose which tracker to log the usage of. Default to the process tracker so we can
-    // get the full view of memory consumption.
-    // FIXME(cmy): call LogUsage() lead to crash here, fix it later
-    // MemTracker* tracker_to_log = process_tracker;
-    // if (state != nullptr && state->query_mem_tracker()->has_limit()) {
-    //     MemTracker* query_tracker = state->query_mem_tracker();
-    //     const int64_t query_capacity = query_tracker->limit() - query_tracker->consumption();
-    //     ss << "Memory left in query limit: "
-    //         << PrettyPrinter::print(query_capacity, TUnit::BYTES) << std::endl;
-    //     // Log the query tracker only if the query limit was closer to being exceeded.
-    //     if (query_capacity < process_capacity) tracker_to_log = query_tracker;
-    // }
-    // ss << tracker_to_log->LogUsage();
-    // Status status = Status::MemLimitExceeded(ss.str());
-    LIMIT_EXCEEDED(this, state, ss.str());
+void MemTracker::GetTopNQueries(
+    priority_queue<pair<int64_t, string>, vector<pair<int64_t, string>>,
+        greater<pair<int64_t, string>>>& min_pq,
+    int limit) {
+  list<weak_ptr<MemTracker>> children;
+  {
+    lock_guard<SpinLock> l(child_trackers_lock_);
+    children = child_trackers_;
+  }
+  for (const auto& child_weak : children) {
+    shared_ptr<MemTracker> child = child_weak.lock();
+    if (child) {
+      if (!child->is_query_mem_tracker_) {
+        child->GetTopNQueries(min_pq, limit);
+      } else {
+        min_pq.push(pair<int64_t, string>(child->consumption(), child->LogUsage(0)));
+        if (min_pq.size() > limit) min_pq.pop();
+      }
+    }
+  }
+}
+
+MemTracker* MemTracker::GetQueryMemTracker() {
+  MemTracker* tracker = this;
+  while (tracker != nullptr && !tracker->is_query_mem_tracker_) {
+    tracker = tracker->parent_.get();
+  }
+  return tracker;
+}
+
+Status MemTracker::MemLimitExceeded(MemTracker* mtracker, RuntimeState* state,
+    const std::string& details, int64_t failed_allocation_size) {
+  DCHECK_GE(failed_allocation_size, 0);
+  stringstream ss;
+  if (!details.empty()) ss << details << endl;
+  if (failed_allocation_size != 0) {
+    if (mtracker != nullptr) ss << mtracker->label();
+    ss << " could not allocate "
+       << PrettyPrinter::print(failed_allocation_size, TUnit::BYTES)
+       << " without exceeding limit." << endl;
+  }
+  ss << "Error occurred on backend " << BackendOptions::get_localhost();
+  if (state != nullptr) ss << " by fragment " << print_id(state->fragment_instance_id());
+  ss << endl;
+  ExecEnv* exec_env = ExecEnv::GetInstance();
+  MemTracker* process_tracker = exec_env->process_mem_tracker().get();
+  const int64_t process_capacity = process_tracker->SpareCapacity(MemLimit::HARD);
+  ss << "Memory left in process limit: "
+     << PrettyPrinter::print(process_capacity, TUnit::BYTES) << endl;
+
+  // Always log the query tracker (if available).
+  MemTracker* query_tracker = nullptr;
+  if (mtracker != nullptr) {
+    query_tracker = mtracker->GetQueryMemTracker();
+    if (query_tracker != nullptr) {
+      if (query_tracker->has_limit()) {
+        const int64_t query_capacity =
+            query_tracker->limit() - query_tracker->consumption();
+        ss << "Memory left in query limit: "
+           << PrettyPrinter::print(query_capacity, TUnit::BYTES) << endl;
+      }
+      ss << query_tracker->LogUsage(UNLIMITED_DEPTH);
+    }
+  }
+
+  // Log the process level if the process tracker is close to the limit or
+  // if this tracker is not within a query's MemTracker hierarchy.
+  if (process_capacity < failed_allocation_size || query_tracker == nullptr) {
+    // IMPALA-5598: For performance reasons, limit the levels of recursion when
+    // dumping the process tracker to only two layers.
+    ss << process_tracker->LogUsage(PROCESS_MEMTRACKER_LIMITED_DEPTH);
+  }
+
+  Status status = Status::MemoryLimitExceeded(ss.str());
+  if (state != nullptr) state->log_error(status.to_string());
+  return status;
 }
 
 void MemTracker::AddGcFunction(GcFunction f) {
-    _gc_functions.push_back(f);
+  gc_functions_.push_back(f);
+}
+
+bool MemTracker::LimitExceededSlow(MemLimit mode) {
+  if (mode == MemLimit::HARD && bytes_over_limit_metric_ != nullptr) {
+    bytes_over_limit_metric_->set_value(consumption() - limit_);
+  }
+  return GcMemory(GetLimit(mode));
 }
 
 bool MemTracker::GcMemory(int64_t max_consumption) {
-    if (max_consumption < 0) return true;
-    std::lock_guard<std::mutex> l(_gc_lock);
-    if (_consumption_metric != NULL) RefreshConsumptionFromMetric();
-    int64_t pre_gc_consumption = consumption();
-    // Check if someone gc'd before us
-    if (pre_gc_consumption < max_consumption) return false;
-    if (_num_gcs_metric != NULL) _num_gcs_metric->increment(1);
+  if (max_consumption < 0) return true;
+  lock_guard<mutex> l(gc_lock_);
+  if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric();
+  int64_t pre_gc_consumption = consumption();
+  // Check if someone gc'd before us
+  if (pre_gc_consumption < max_consumption) return false;
+  if (num_gcs_metric_ != nullptr) num_gcs_metric_->increment(1);
 
-    int64_t curr_consumption = pre_gc_consumption;
-    // Try to free up some memory
-    for (int i = 0; i < _gc_functions.size(); ++i) {
-        // Try to free up the amount we are over plus some extra so that we don't have to
-        // immediately GC again. Don't free all the memory since that can be unnecessarily
-        // expensive.
-        const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L;
-        int64_t bytes_to_free = curr_consumption - max_consumption + EXTRA_BYTES_TO_FREE;
-        _gc_functions[i](bytes_to_free);
-        if (_consumption_metric != NULL) RefreshConsumptionFromMetric();
-        curr_consumption = consumption();
-        if (max_consumption - curr_consumption <= EXTRA_BYTES_TO_FREE) break;
-    }
+  int64_t curr_consumption = pre_gc_consumption;
+  // Try to free up some memory
+  for (int i = 0; i < gc_functions_.size(); ++i) {
+    // Try to free up the amount we are over plus some extra so that we don't have to
+    // immediately GC again. Don't free all the memory since that can be unnecessarily
+    // expensive.
+    const int64_t EXTRA_BYTES_TO_FREE = 512L * 1024L * 1024L;
+    int64_t bytes_to_free = curr_consumption - max_consumption + EXTRA_BYTES_TO_FREE;
+    gc_functions_[i](bytes_to_free);
+    if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric();
+    curr_consumption = consumption();
+    if (max_consumption - curr_consumption <= EXTRA_BYTES_TO_FREE) break;
+  }
 
-    if (_bytes_freed_by_last_gc_metric != NULL) {
-        _bytes_freed_by_last_gc_metric->set_value(pre_gc_consumption - curr_consumption);
-    }
-    return curr_consumption > max_consumption;
+  if (bytes_freed_by_last_gc_metric_ != nullptr) {
+    bytes_freed_by_last_gc_metric_->set_value(pre_gc_consumption - curr_consumption);
+  }
+  return curr_consumption > max_consumption;
 }
 
-} // end namespace doris
+std::shared_ptr<MemTracker> MemTracker::GetRootTracker() {
+  GoogleOnceInit(&root_tracker_once, &MemTracker::CreateRootTracker);
+  return root_tracker;
+}
+
+} // namespace doris
diff --git a/be/src/runtime/mem_tracker.h b/be/src/runtime/mem_tracker.h
index cbcc500ef7..b88d404db1 100644
--- a/be/src/runtime/mem_tracker.h
+++ b/be/src/runtime/mem_tracker.h
@@ -15,26 +15,34 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef DORIS_BE_SRC_QUERY_BE_RUNTIME_MEM_LIMIT_H
-#define DORIS_BE_SRC_QUERY_BE_RUNTIME_MEM_LIMIT_H
-
-#include 
+#pragma once
 
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
 #include 
 
-#include "common/status.h"
-#include "gen_cpp/Types_types.h"
+#include "gen_cpp/Types_types.h" // for TUniqueId
 #include "util/metrics.h"
 #include "util/runtime_profile.h"
 #include "util/spinlock.h"
+#include "common/status.h"
 
 namespace doris {
 
+/// Mode argument passed to various MemTracker methods to indicate whether a soft or hard
+/// limit should be used.
+enum class MemLimit { HARD, SOFT };
+
 class ObjectPool;
 class MemTracker;
-class ReservationTrackerCounters;
+struct ReservationTrackerCounters;
 class RuntimeState;
 class TQueryOptions;
 
@@ -42,6 +50,12 @@ class TQueryOptions;
 /// and can be arranged into a tree structure such that the consumption tracked
 /// by a MemTracker is also tracked by its ancestors.
 ///
+/// A MemTracker has a hard and a soft limit derived from the limit. If the hard limit
+/// is exceeded, all memory allocations and queries should fail until we are under the
+/// limit again. The soft limit can be exceeded without causing query failures, but
+/// consumers of memory that can tolerate running without more memory should not allocate
+/// memory in excess of the soft limit.
+///
 /// We use a five-level hierarchy of mem trackers: process, pool, query, fragment
 /// instance. Specific parts of the fragment (exec nodes, sinks, etc) will add a
 /// fifth level when they are initialized. This function also initializes a user
@@ -49,11 +63,14 @@ class TQueryOptions;
 ///
 /// By default, memory consumption is tracked via calls to Consume()/Release(), either to
 /// the tracker itself or to one of its descendents. Alternatively, a consumption metric
-/// can specified, and then the metric's value is used as the consumption rather than the
-/// tally maintained by Consume() and Release(). A tcmalloc metric is used to track
+/// can be specified, and then the metric's value is used as the consumption rather than
+/// the tally maintained by Consume() and Release(). A tcmalloc metric is used to track
 /// process memory consumption, since the process memory usage may be higher than the
 /// computed total memory (tcmalloc does not release deallocated memory immediately).
-//
+/// Other consumption metrics are used in trackers below the process level to account
+/// for memory (such as free buffer pool buffers) that is not tracked by Consume() and
+/// Release().
+///
 /// GcFunctions can be attached to a MemTracker in order to free up memory if the limit is
 /// reached. If LimitExceeded() is called and the limit is exceeded, it will first call
 /// the GcFunctions to try to free memory and recheck the limit. For example, the process
@@ -64,437 +81,501 @@ class TQueryOptions;
 /// call back into MemTrackers, except to release memory.
 //
 /// This class is thread-safe.
-class MemTracker {
-public:
-    /// 'byte_limit' < 0 means no limit
-    /// 'label' is the label used in the usage string (LogUsage())
-    /// If 'auto_unregister' is true, never call unregister_from_parent().
-    /// If 'log_usage_if_zero' is false, this tracker (and its children) will not be included
-    /// in LogUsage() output if consumption is 0.
-    MemTracker(int64_t byte_limit = -1, const std::string& label = std::string(),
-               MemTracker* parent = NULL, bool auto_unregister = false, bool log_usage_if_zero = true);
+class MemTracker : public std::enable_shared_from_this<MemTracker> {
+ public:
+  // Creates and adds the tracker to the tree so that it can be retrieved with
+  // FindTracker/FindOrCreateTracker.
+  static std::shared_ptr<MemTracker> CreateTracker(
+      int64_t byte_limit = -1,
+      const std::string& label = std::string(),
+      std::shared_ptr<MemTracker> parent = std::shared_ptr<MemTracker>(),
+      bool log_usage_if_zero = true);
 
-    /// C'tor for tracker for which consumption counter is created as part of a profile.
-    /// The counter is created with name COUNTER_NAME.
-    MemTracker(RuntimeProfile* profile, int64_t byte_limit,
-            const std::string& label = std::string(), MemTracker* parent = NULL);
+  static std::shared_ptr<MemTracker> CreateTracker(
+      RuntimeProfile* profile, int64_t byte_limit,
+      const std::string& label = std::string(),
+      const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>());
 
-    /// C'tor for tracker that uses consumption_metric as the consumption value.
-    /// Consume()/Release() can still be called. This is used for the process tracker.
-    MemTracker(UIntGauge* consumption_metric, int64_t byte_limit = -1,
-      const std::string& label = std::string());
+  /// 'byte_limit' < 0 means no limit
+  /// 'label' is the label used in the usage string (LogUsage())
+  /// If 'log_usage_if_zero' is false, this tracker (and its children) will not be
+  /// included
+  /// in LogUsage() output if consumption is 0.
+  MemTracker(int64_t byte_limit = -1, const std::string& label = std::string(),
+             const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>(),
+             bool log_usage_if_zero = true);
 
-    ~MemTracker();
+  /// C'tor for tracker for which consumption counter is created as part of a profile.
+  /// The counter is created with name COUNTER_NAME.
+  MemTracker(RuntimeProfile* profile, int64_t byte_limit,
+      const std::string& label = std::string(), const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>());
 
-    /// Closes this MemTracker. After closing it is invalid to consume memory on this
-    /// tracker and the tracker's consumption counter (which may be owned by a
-    /// RuntimeProfile, not this MemTracker) can be safely destroyed. MemTrackers without
-    /// consumption metrics in the context of a daemon must always be closed.
-    /// Idempotent: calling multiple times has no effect.
-    void close();
+  // TODO(yingchun): not used, remove it later
+  /// C'tor for tracker that uses consumption_metric as the consumption value.
+  /// Consume()/Release() can still be called. This is used for the root process tracker
+  /// (if 'parent' is NULL). It is also to report on other categories of memory under the
+  /// process tracker, e.g. buffer pool free buffers (if 'parent - non-NULL).
+  MemTracker(IntGauge* consumption_metric, int64_t byte_limit = -1,
+      const std::string& label = std::string(), const std::shared_ptr<MemTracker>& parent = std::shared_ptr<MemTracker>());
 
-    // Removes this tracker from _parent->_child_trackers.
-    void unregister_from_parent() {
-        DCHECK(_parent != NULL);
-        std::lock_guard<SpinLock> l(_parent->_child_trackers_lock);
-        _parent->_child_trackers.erase(_child_tracker_it);
-        _child_tracker_it = _parent->_child_trackers.end();
+  ~MemTracker();
+
+  // Returns a list of all the valid trackers.
+  static void ListTrackers(std::vector<std::shared_ptr<MemTracker>>* trackers);
+
+  /// Include counters from a ReservationTracker in logs and other diagnostics.
+  /// The counters should be owned by the fragment's RuntimeProfile.
+  void EnableReservationReporting(const ReservationTrackerCounters& counters);
+
+  // Gets a shared_ptr to the "root" tracker, creating it if necessary.
+  static std::shared_ptr<MemTracker> GetRootTracker();
+
+  // delete static CreateQueryMemTracker(), cuz it cannot use shared tracker
+
+  /// Increases consumption of this tracker and its ancestors by 'bytes'.
+  void Consume(int64_t bytes) {
+    // DCHECK_GE(bytes, 0);
+    if (bytes < 0) {
+      Release(-bytes);
+      return;
+    }
+    if (bytes == 0) {
+      return;
     }
 
-    /// Include counters from a ReservationTracker in logs and other diagnostics.
-    /// The counters should be owned by the fragment's RuntimeProfile.
-    void enable_reservation_reporting(const ReservationTrackerCounters& counters);
-
-    /// Construct a MemTracker object for query 'id'. The query limits are determined based
-    /// on 'query_options'. The MemTracker is a child of the request pool MemTracker for
-    /// 'pool_name', which is created if needed. The returned MemTracker is owned by
-    /// 'obj_pool'.
-    static MemTracker* CreateQueryMemTracker(const TUniqueId& id,
-            const TQueryOptions& query_options, const std::string& pool_name,
-            ObjectPool* obj_pool);
-
-    // Returns a MemTracker object for query 'id'.  Calling this with the same id will
-    // return the same MemTracker object.  An example of how this is used is to pass it
-    // the same query id for all fragments of that query running on this machine.  This
-    // way, we have per-query limits rather than per-fragment.
-    // The first time this is called for an id, a new MemTracker object is created with
-    // 'parent' as the parent tracker.
-    // byte_limit and parent must be the same for all GetMemTracker() calls with the
-    // same id.
-    static std::shared_ptr<MemTracker> get_query_mem_tracker(const TUniqueId& id,
-            int64_t byte_limit, MemTracker* parent);
-
-    void consume(int64_t bytes) {
-        if (bytes <= 0) {
-            if (bytes < 0) release(-bytes);
-            return;
-        }
-
-        if (_consumption_metric != NULL) {
-            RefreshConsumptionFromMetric();
-            return;
-        }
-        for (std::vector<MemTracker*>::iterator tracker = _all_trackers.begin();
-             tracker != _all_trackers.end(); ++tracker) {
-            (*tracker)->_consumption->add(bytes);
-            if ((*tracker)->_consumption_metric == NULL) {
-                DCHECK_GE((*tracker)->_consumption->current_value(), 0);
-            }
-        }
+    if (consumption_metric_ != nullptr) {
+      RefreshConsumptionFromMetric();
+      return;  // TODO(yingchun): why return not update tracker?
     }
-
-    /// Increases/Decreases the consumption of this tracker and the ancestors up to (but
-    /// not including) end_tracker. This is useful if we want to move tracking between
-    /// trackers that share a common (i.e. end_tracker) ancestor. This happens when we want
-    /// to update tracking on a particular mem tracker but the consumption against
-    /// the limit recorded in one of its ancestors already happened.
-    void consume_local(int64_t bytes, MemTracker* end_tracker) {
-        DCHECK(_consumption_metric == NULL) << "Should not be called on root.";
-        for (int i = 0; i < _all_trackers.size(); ++i) {
-            if (_all_trackers[i] == end_tracker) return;
-            DCHECK(!_all_trackers[i]->has_limit());
-            _all_trackers[i]->_consumption->add(bytes);
-        }
-        DCHECK(false) << "end_tracker is not an ancestor";
+    for (auto& tracker : all_trackers_) {
+      tracker->consumption_->add(bytes);
+      if (tracker->consumption_metric_ == nullptr) {
+        DCHECK_GE(tracker->consumption_->current_value(), 0);
+      }
     }
+  }
 
-    void release_local(int64_t bytes, MemTracker* end_tracker) {
-        consume_local(-bytes, end_tracker);
-    }
+  /// Increases the consumption of this tracker and the ancestors up to (but
+  /// not including) end_tracker. This is useful if we want to move tracking between
+  /// trackers that share a common (i.e. end_tracker) ancestor. This happens when we want
+  /// to update tracking on a particular mem tracker but the consumption against
+  /// the limit recorded in one of its ancestors already happened.
+  void ConsumeLocal(int64_t bytes, MemTracker* end_tracker) {
+    DCHECK_GE(bytes, 0);
+    if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG
+    ChangeConsumption(bytes, end_tracker);
+  }
 
-    /// Increases consumption of this tracker and its ancestors by 'bytes' only if
-    /// they can all consume 'bytes'. If this brings any of them over, none of them
-    /// are updated.
-    /// Returns true if the try succeeded.
-    WARN_UNUSED_RESULT
-    bool try_consume(int64_t bytes) {
-        if (_consumption_metric != NULL) RefreshConsumptionFromMetric();
-        if (UNLIKELY(bytes <= 0)) return true;
-        int i;
-        // Walk the tracker tree top-down.
-        for (i = _all_trackers.size() - 1; i >= 0; --i) {
-            MemTracker* tracker = _all_trackers[i];
-            const int64_t limit = tracker->limit();
-            if (limit < 0) {
-                tracker->_consumption->add(bytes); // No limit at this tracker.
-            } else {
-                // If TryConsume fails, we can try to GC, but we may need to try several times if
-                // there are concurrent consumers because we don't take a lock before trying to
-                // update _consumption.
-                while (true) {
-                    if (LIKELY(tracker->_consumption->try_add(bytes, limit))) break;
+  /// Same as above, but it decreases the consumption.
+  void ReleaseLocal(int64_t bytes, MemTracker* end_tracker) {
+    DCHECK_GE(bytes, 0);
+    if (UNLIKELY(bytes < 0)) return; // needed in RELEASE, hits DCHECK in DEBUG
+    ChangeConsumption(-bytes, end_tracker);
+  }
 
-                    VLOG_RPC << "TryConsume failed, bytes=" << bytes
-                        << " consumption=" << tracker->_consumption->current_value()
-                        << " limit=" << limit << " attempting to GC";
-                    if (UNLIKELY(tracker->GcMemory(limit - bytes))) {
-                        DCHECK_GE(i, 0);
-                        // Failed for this mem tracker. Roll back the ones that succeeded.
-                        for (int j = _all_trackers.size() - 1; j > i; --j) {
-                            _all_trackers[j]->_consumption->add(-bytes);
-                        }
-                        return false;
-                    }
-                    VLOG_RPC << "GC succeeded, TryConsume bytes=" << bytes
-                        << " consumption=" << tracker->_consumption->current_value()
-                        << " limit=" << limit;
-                }
-            }
-        }
-        // Everyone succeeded, return.
-        DCHECK_EQ(i, -1);
+  /// Increases consumption of this tracker and its ancestors by 'bytes' only if
+  /// they can all consume 'bytes' without exceeding limit (hard or soft) specified
+  /// by 'mode'. If any limit would be exceeded, no MemTrackers are updated. If the
+  /// caller can tolerate an allocation failing, it should set mode=SOFT so that
+  /// other callers that may not tolerate allocation failures have a better chance
+  /// of success. Returns true if the consumption was successfully updated.
+  WARN_UNUSED_RESULT
+  bool TryConsume(int64_t bytes, MemLimit mode = MemLimit::HARD) {
+    // DCHECK_GE(bytes, 0);
+    if (bytes <= 0) {
+        Release(-bytes);
         return true;
     }
+    // if (UNLIKELY(bytes == 0)) return true;
+    // if (UNLIKELY(bytes < 0)) return false; // needed in RELEASE, hits DCHECK in DEBUG
+    if (consumption_metric_ != nullptr) RefreshConsumptionFromMetric();
+    int i;
+    // Walk the tracker tree top-down.
+    for (i = all_trackers_.size() - 1; i >= 0; --i) {
+      MemTracker* tracker = all_trackers_[i];
+      const int64_t limit = tracker->GetLimit(mode);
+      if (limit < 0) {
+        tracker->consumption_->add(bytes); // No limit at this tracker.
+      } else {
+        // If TryConsume fails, we can try to GC, but we may need to try several times if
+        // there are concurrent consumers because we don't take a lock before trying to
+        // update consumption_.
+        while (true) {
+          if (LIKELY(tracker->consumption_->try_add(bytes, limit))) break;
 
-    /// Decreases consumption of this tracker and its ancestors by 'bytes'.
-    void release(int64_t bytes) {
-        if (bytes <= 0) {
-            if (bytes < 0) consume(-bytes);
-            return;
-        }
-
-        if (_consumption_metric != NULL) {
-            RefreshConsumptionFromMetric();
-            return;
-        }
-        for (std::vector<MemTracker*>::iterator tracker = _all_trackers.begin();
-             tracker != _all_trackers.end(); ++tracker) {
-            (*tracker)->_consumption->add(-bytes);
-            /// If a UDF calls FunctionContext::TrackAllocation() but allocates less than the
-            /// reported amount, the subsequent call to FunctionContext::Free() may cause the
-            /// process mem tracker to go negative until it is synced back to the tcmalloc
-            /// metric. Don't blow up in this case. (Note that this doesn't affect non-process
-            /// trackers since we can enforce that the reported memory usage is internally
-            /// consistent.)
-            if ((*tracker)->_consumption_metric == NULL) {
-                DCHECK_GE((*tracker)->_consumption->current_value(), 0)
-                        << std::endl
-                        << (*tracker)->LogUsage(UNLIMITED_DEPTH);
+          VLOG_RPC << "TryConsume failed, bytes=" << bytes
+                   << " consumption=" << tracker->consumption_->current_value()
+                   << " limit=" << limit << " attempting to GC";
+          if (UNLIKELY(tracker->GcMemory(limit - bytes))) {
+            DCHECK_GE(i, 0);
+            // Failed for this mem tracker. Roll back the ones that succeeded.
+            for (int j = all_trackers_.size() - 1; j > i; --j) {
+              all_trackers_[j]->consumption_->add(-bytes);
             }
+            return false;
+          }
+          VLOG_RPC << "GC succeeded, TryConsume bytes=" << bytes
+                   << " consumption=" << tracker->consumption_->current_value()
+                   << " limit=" << limit;
         }
+      }
+    }
+    // Everyone succeeded, return.
+    DCHECK_EQ(i, -1);
+    return true;
+  }
 
-        /// TODO: Release brokered memory?
+  /// Decreases consumption of this tracker and its ancestors by 'bytes'.
+  void Release(int64_t bytes) {
+    // DCHECK_GE(bytes, 0);
+    if (bytes < 0) {
+      Consume(-bytes);
+      return;
     }
 
-    // Returns true if a valid limit of this tracker or one of its ancestors is exceeded.
-    bool any_limit_exceeded() {
-        for (std::vector<MemTracker*>::iterator tracker = _limit_trackers.begin();
-                tracker != _limit_trackers.end(); ++tracker) {
-            if ((*tracker)->limit_exceeded()) {
-                return true;
-            }
-        }
-        return false;
+    if (bytes == 0) {
+      return;
     }
 
-    // Return limit exceeded tracker or null
-    MemTracker* find_limit_exceeded_tracker() {
-        for (std::vector<MemTracker*>::iterator tracker = _limit_trackers.begin();
-                tracker != _limit_trackers.end(); ++tracker) {
-            if ((*tracker)->limit_exceeded()) {
-                return *tracker;
-            }
-        }
-        return NULL;
+    // if (UNLIKELY(bytes <= 0)) return; // < 0 needed in RELEASE, hits DCHECK in DEBUG
+
+    if (consumption_metric_ != nullptr) {
+      RefreshConsumptionFromMetric();
+      return;
+    }
+    for (auto& tracker : all_trackers_) {
+      tracker->consumption_->add(-bytes);
+      /// If a UDF calls FunctionContext::TrackAllocation() but allocates less than the
+      /// reported amount, the subsequent call to FunctionContext::Free() may cause the
+      /// process mem tracker to go negative until it is synced back to the tcmalloc
+      /// metric. Don't blow up in this case. (Note that this doesn't affect non-process
+      /// trackers since we can enforce that the reported memory usage is internally
+      /// consistent.)
+      if (tracker->consumption_metric_ == nullptr) {
+        DCHECK_GE(tracker->consumption_->current_value(), 0)
+            << std::endl
+            << tracker->LogUsage(UNLIMITED_DEPTH);
+      }
+    }
+  }
+
+  /// Transfer 'bytes' of consumption from this tracker to 'dst', updating
+  /// all ancestors up to the first shared ancestor. Must not be used if
+  /// 'dst' has a limit, or an ancestor with a limit, that is not a common
+  /// ancestor with the tracker, because this does not check memory limits.
+  void TransferTo(MemTracker* dst, int64_t bytes);
+
+  /// Returns true if a valid limit of this tracker or one of its ancestors is
+  /// exceeded.
+  bool AnyLimitExceeded(MemLimit mode) {
+    for (const auto& tracker : limit_trackers_) {
+      if (tracker->LimitExceeded(mode)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// If this tracker has a limit, checks the limit and attempts to free up some memory if
+  /// the hard limit is exceeded by calling any added GC functions. Returns true if the
+  /// limit is exceeded after calling the GC functions. Returns false if there is no limit
+  /// or consumption is under the limit.
+  bool LimitExceeded(MemLimit mode) {
+    if (UNLIKELY(CheckLimitExceeded(mode))) return LimitExceededSlow(mode);
+    return false;
+  }
+  
+  // Return limit exceeded tracker or null
+  MemTracker* find_limit_exceeded_tracker() {
+    for (const auto& tracker : limit_trackers_) {
+      if (tracker->limit_exceeded()) {
+        return tracker;
+      }
+    }
+    return nullptr;
+  }
+
+  /// Returns the maximum consumption that can be made without exceeding the limit on
+  /// this tracker or any of its parents. Returns int64_t::max() if there are no
+  /// limits and a negative value if any limit is already exceeded.
+  int64_t SpareCapacity(MemLimit mode) const;
+
+  /// Refresh the memory consumption value from the consumption metric. Only valid to
+  /// call if this tracker has a consumption metric.
+  void RefreshConsumptionFromMetric();
+
+  // TODO(yingchun): following functions are old style which have no MemLimit parameter
+  bool limit_exceeded() const { return limit_ >= 0 && limit_ < consumption(); }
+
+  int64_t limit() const { return limit_; }
+  bool has_limit() const { return limit_ >= 0; }
+
+  int64_t soft_limit() const { return soft_limit_; }
+  int64_t GetLimit(MemLimit mode) const {
+    if (mode == MemLimit::SOFT) return soft_limit();
+    DCHECK_ENUM_EQ(mode, MemLimit::HARD);
+    return limit();
+  }
+  const std::string& label() const { return label_; }
+
+  /// Returns the lowest limit for this tracker and its ancestors. Returns
+  /// -1 if there is no limit.
+  int64_t GetLowestLimit(MemLimit mode) const;
+
+  /// Returns the memory 'reserved' by this resource pool mem tracker, which is the sum
+  /// of the memory reserved by the queries in it (i.e. its child trackers). The mem
+  /// reserved for a query that is currently executing is its limit_, if set (which
+  /// should be the common case with admission control). Otherwise, if the query has
+  /// no limit or the query is finished executing, the current consumption is used.
+  int64_t GetPoolMemReserved();
+
+  /// Returns the memory consumed in bytes.
+  int64_t consumption() const { return consumption_->current_value(); }
+
+  /// Note that if consumption_ is based on consumption_metric_, this will be the max value
+  /// we've recorded in consumption(), not necessarily the highest value
+  /// consumption_metric_ has ever reached.
+  int64_t peak_consumption() const { return consumption_->value(); }
+
+  std::shared_ptr<MemTracker> parent() const { return parent_; }
+
+  /// Signature for function that can be called to free some memory after limit is
+  /// reached. The function should try to free at least 'bytes_to_free' bytes of
+  /// memory. See the class header for further details on the expected behaviour of
+  /// these functions.
+  typedef std::function<void(int64_t bytes_to_free)> GcFunction;
+
+  /// Add a function 'f' to be called if the limit is reached, if none of the other
+  /// previously-added GC functions were successful at freeing up enough memory.
+  /// 'f' does not need to be thread-safe as long as it is added to only one MemTracker.
+  /// Note that 'f' must be valid for the lifetime of this MemTracker.
+  void AddGcFunction(GcFunction f);
+
+  /// Register this MemTracker's metrics. Each key will be of the form
+  /// "<prefix>.<metric name>".
+  // TODO(yingchun): remove comments
+  //void RegisterMetrics(MetricGroup* metrics, const std::string& prefix);
+
+  /// Logs the usage of this tracker and optionally its children (recursively).
+  /// If 'logged_consumption' is non-NULL, sets the consumption value logged.
+  /// 'max_recursive_depth' specifies the maximum number of levels of children
+  /// to include in the dump. If it is zero, then no children are dumped.
+  /// Limiting the recursive depth reduces the cost of dumping, particularly
+  /// for the process MemTracker.
+  /// TODO: once all memory is accounted in ReservationTracker hierarchy, move
+  /// reporting there.
+  std::string LogUsage(int max_recursive_depth,
+      const std::string& prefix = "", int64_t* logged_consumption = nullptr);
+  /// Dumping the process MemTracker is expensive. Limiting the recursive depth
+  /// to two levels limits the level of detail to a one-line summary for each query
+  /// MemTracker, avoiding all MemTrackers below that level. This provides a summary
+  /// of process usage with substantially lower cost than the full dump.
+  static const int PROCESS_MEMTRACKER_LIMITED_DEPTH = 2;
+  /// Unlimited dumping is useful for query memtrackers or error conditions that
+  /// are not performance sensitive
+  static const int UNLIMITED_DEPTH = INT_MAX;
+
+  /// Logs the usage of 'limit' number of queries based on maximum total memory
+  /// consumption.
+  std::string LogTopNQueries(int limit);
+
+  /// Log the memory usage when memory limit is exceeded and return a status object with
+  /// details of the allocation which caused the limit to be exceeded.
+  /// If 'failed_allocation_size' is greater than zero, logs the allocation size. If
+  /// 'failed_allocation_size' is zero, nothing about the allocation size is logged.
+  /// If 'state' is non-NULL, logs the error to 'state'.
+  Status MemLimitExceeded(RuntimeState* state, const std::string& details,
+      int64_t failed_allocation = 0) WARN_UNUSED_RESULT {
+    return MemLimitExceeded(this, state, details, failed_allocation);
+  }
+
+  /// Makes MemLimitExceeded callable for nullptr MemTrackers.
+  static Status MemLimitExceeded(MemTracker* mtracker, RuntimeState* state,
+      const std::string& details, int64_t failed_allocation = 0) WARN_UNUSED_RESULT;
+
+  void set_query_exec_finished() {
+    DCHECK(is_query_mem_tracker_);
+    query_exec_finished_.store(1);
+  }
+
+  static void update_limits(int64_t bytes, const std::vector<std::shared_ptr<MemTracker>>& trackers) {
+    for (auto& tracker : trackers) {
+      tracker->Consume(bytes);
+    }
+  }
+
+  static bool limit_exceeded(const std::vector<std::shared_ptr<MemTracker>>& trackers) {
+    for (const auto& tracker : trackers) {
+      if (tracker->limit_exceeded()) {
+        // TODO: remove logging
+        LOG(WARNING) << "exceeded limit: limit=" << tracker->limit() << " consumption="
+                     << tracker->consumption();
+        return true;
+      }
     }
 
-    // Returns the maximum consumption that can be made without exceeding the limit on
-    // this tracker or any of its parents. Returns int64_t::max() if there are no
-    // limits and a negative value if any limit is already exceeded.
-    int64_t spare_capacity() const {
-        int64_t result = std::numeric_limits<int64_t>::max();
-        for (std::vector<MemTracker*>::const_iterator tracker = _limit_trackers.begin();
-                tracker != _limit_trackers.end(); ++tracker) {
-            int64_t mem_left = (*tracker)->limit() - (*tracker)->consumption();
-            result = std::min(result, mem_left);
-        }
-        return result;
+    return false;
+  }
+
+  std::string debug_string() {
+    std::stringstream msg;
+    msg << "limit: " << limit_ << "; "
+        << "consumption: " << consumption_->current_value() << "; "
+        << "label: " << label_ << "; "
+        << "all tracker size: " << all_trackers_.size() << "; "
+        << "limit trackers size: " << limit_trackers_.size() << "; "
+        << "parent is null: " << ((parent_ == nullptr) ? "true" : "false") << "; ";
+    return msg.str();
+  }
+
+  bool is_consumption_metric_null() { return consumption_metric_ == nullptr; }
+  
+  static const std::string COUNTER_NAME;
+
+ private:
+  friend class PoolMemTrackerRegistry;
+
+  // TODO(HW): remove later
+  /// Closes this MemTracker. After closing it is invalid to consume memory on this
+  /// tracker and the tracker's consumption counter (which may be owned by a
+  /// RuntimeProfile, not this MemTracker) can be safely destroyed. MemTrackers without
+  /// consumption metrics in the context of a daemon must always be closed.
+  /// Idempotent: calling multiple times has no effect.
+  void Close();
+
+  /// Returns true if the current memory tracker's limit is exceeded.
+  bool CheckLimitExceeded(MemLimit mode) const {
+    int64_t limit = GetLimit(mode);
+    return limit >= 0 && limit < consumption();
+  }
+
+  /// Slow path for LimitExceeded().
+  bool LimitExceededSlow(MemLimit mode);
+
+  /// If consumption is higher than max_consumption, attempts to free memory by calling
+  /// any added GC functions.  Returns true if max_consumption is still exceeded. Takes
+  /// gc_lock. Updates metrics if initialized.
+  bool GcMemory(int64_t max_consumption);
+
+  /// Walks the MemTracker hierarchy and populates all_trackers_ and
+  /// limit_trackers_
+  void Init();
+
+  /// Adds tracker to child_trackers_
+  void AddChildTracker(const std::shared_ptr<MemTracker>& tracker);
+
+  /// Log consumption of all the trackers provided. Returns the sum of consumption in
+  /// 'logged_consumption'. 'max_recursive_depth' specifies the maximum number of levels
+  /// of children to include in the dump. If it is zero, then no children are dumped.
+  static std::string LogUsage(int max_recursive_depth, const std::string& prefix,
+      const std::list<std::shared_ptr<MemTracker>>& trackers, int64_t* logged_consumption);
+
+  /// Helper function for LogTopNQueries that iterates through the MemTracker hierarchy
+  /// and populates 'min_pq' with 'limit' number of elements (that contain state related
+  /// to query MemTrackers) based on maximum total memory consumption.
+  void GetTopNQueries(
+      std::priority_queue<std::pair<int64_t, std::string>, std::vector<std::pair<int64_t, std::string>>,
+          std::greater<std::pair<int64_t, std::string>>>& min_pq,
+      int limit);
+
+  /// If an ancestor of this tracker is a query MemTracker, return that tracker.
+  /// Otherwise return NULL.
+  MemTracker* GetQueryMemTracker();
+
+  /// Increases/Decreases the consumption of this tracker and the ancestors up to (but
+  /// not including) end_tracker.
+  void ChangeConsumption(int64_t bytes, MemTracker* end_tracker) {
+    DCHECK(consumption_metric_ == nullptr) << "Should not be called on root.";
+    for (MemTracker* tracker : all_trackers_) {
+      if (tracker == end_tracker) return;
+      DCHECK(!tracker->has_limit());
+      tracker->consumption_->add(bytes);
     }
+    DCHECK(false) << "end_tracker is not an ancestor";
+  }
 
-    /// Refresh the memory consumption value from the consumption metric. Only valid to
-    /// call if this tracker has a consumption metric.
-    void RefreshConsumptionFromMetric() {
-        DCHECK(_consumption_metric != nullptr);
-        DCHECK(_parent == nullptr);
-        _consumption->set(_consumption_metric->value());
-    }
+  // Creates the root tracker.
+  static void CreateRootTracker();
 
-    bool limit_exceeded() const { return _limit >= 0 && _limit < consumption(); }
+  /// Lock to protect GcMemory(). This prevents many GCs from occurring at once.
+  std::mutex gc_lock_;
 
-    int64_t limit() const { return _limit; }
+  /// True if this is a Query MemTracker returned from CreateQueryMemTracker().
+  bool is_query_mem_tracker_ = false;
 
-    bool has_limit() const { return _limit >= 0; }
+  /// Only used if 'is_query_mem_tracker_' is true.
+  /// 0 if the query is still executing or 1 if it has finished executing. Before
+  /// it has finished executing, the tracker limit is treated as "reserved memory"
+  /// for the purpose of admission control - see GetPoolMemReserved().
+  std::atomic<int32_t> query_exec_finished_{0};
 
-    const std::string& label() const { return _label; }
+  /// Only valid for MemTrackers returned from CreateQueryMemTracker()
+  TUniqueId query_id_;
 
-    /// Returns the lowest limit for this tracker and its ancestors. Returns
-    /// -1 if there is no limit.
-    int64_t lowest_limit() const {
-        if (_limit_trackers.empty()) return -1;
-        int64_t v = std::numeric_limits<int64_t>::max();
-        for (int i = 0; i < _limit_trackers.size(); ++i) {
-            DCHECK(_limit_trackers[i]->has_limit());
-            v = std::min(v, _limit_trackers[i]->limit());
-        }
-        return v;
-    }
+  /// Only valid for MemTrackers returned from GetRequestPoolMemTracker()
+  std::string pool_name_;
 
-    /// Returns the memory 'reserved' by this resource pool mem tracker, which is the sum
-    /// of the memory reserved by the queries in it (i.e. its child trackers). The mem
-    /// reserved for a query is its limit_, if set (which should be the common case with
-    /// admission control). Otherwise the current consumption is used.
-    int64_t GetPoolMemReserved() const;
+  /// Hard limit on memory consumption, in bytes. May not be exceeded. If limit_ == -1,
+  /// there is no consumption limit.
+  const int64_t limit_;
 
-    int64_t consumption() const { return _consumption->current_value(); }
+  /// Soft limit on memory consumption, in bytes. Can be exceeded but callers to
+  /// TryConsume() can opt not to exceed this limit. If -1, there is no consumption limit.
+  const int64_t soft_limit_;
 
-    /// Note that if _consumption is based on _consumption_metric, this will the max value
-    /// we've recorded in consumption(), not necessarily the highest value
-    /// _consumption_metric has ever reached.
-    int64_t peak_consumption() const { return _consumption->value(); }
+  std::string label_;
 
-    MemTracker* parent() const { return _parent; }
+  /// The parent of this tracker. The pointer is never modified, even after this tracker
+  /// is unregistered.
+  std::shared_ptr<MemTracker> parent_;
 
-    /// Signature for function that can be called to free some memory after limit is
-    /// reached. The function should try to free at least 'bytes_to_free' bytes of
-    /// memory. See the class header for further details on the expected behaviour of
-    /// these functions.
-    typedef std::function<void(int64_t bytes_to_free)> GcFunction;
+  /// in bytes
+  std::shared_ptr<RuntimeProfile::HighWaterMarkCounter> consumption_;
 
-    /// Add a function 'f' to be called if the limit is reached, if none of the other
-    /// previously-added GC functions were successful at freeing up enough memory.
-    /// 'f' does not need to be thread-safe as long as it is added to only one MemTracker.
-    /// Note that 'f' must be valid for the lifetime of this MemTracker.
-    void AddGcFunction(GcFunction f);
+  /// If non-NULL, used to measure consumption (in bytes) rather than the values provided
+  /// to Consume()/Release(). Only used for the process tracker, thus parent_ should be
+  /// NULL if consumption_metric_ is set.
+  IntGauge* consumption_metric_;
 
-    /// Register this MemTracker's metrics. Each key will be of the form
-    /// "<prefix>.<metric name>".
-    void RegisterMetrics(MetricRegistry* metrics, const std::string& prefix);
+  /// If non-NULL, counters from a corresponding ReservationTracker that should be
+  /// reported in logs and other diagnostics. Owned by this MemTracker. The counters
+  /// are owned by the fragment's RuntimeProfile.
+  AtomicPtr<ReservationTrackerCounters> reservation_counters_;
 
-    /// Logs the usage of this tracker and optionally its children (recursively).
-    /// If 'logged_consumption' is non-NULL, sets the consumption value logged.
-    /// 'max_recursive_depth' specifies the maximum number of levels of children
-    /// to include in the dump. If it is zero, then no children are dumped.
-    /// Limiting the recursive depth reduces the cost of dumping, particularly
-    /// for the process MemTracker.
-    /// TODO: once all memory is accounted in ReservationTracker hierarchy, move
-    /// reporting there.
-    std::string LogUsage(int max_recursive_depth, const std::string& prefix = "",
-                         int64_t* logged_consumption = nullptr) const;
-    /// Dumping the process MemTracker is expensive. Limiting the recursive depth
-    /// to two levels limits the level of detail to a one-line summary for each query
-    /// MemTracker, avoiding all MemTrackers below that level. This provides a summary
-    /// of process usage with substantially lower cost than the full dump.
-    static const int PROCESS_MEMTRACKER_LIMITED_DEPTH = 2;
-    /// Unlimited dumping is useful for query memtrackers or error conditions that
-    /// are not performance sensitive
-    static const int UNLIMITED_DEPTH = INT_MAX;
+  std::vector all_trackers_;  // this tracker plus all of its ancestors
+  std::vector limit_trackers_;  // all_trackers_ with valid limits
 
-    /// Log the memory usage when memory limit is exceeded and return a status object with
-    /// details of the allocation which caused the limit to be exceeded.
-    /// If 'failed_allocation_size' is greater than zero, logs the allocation size. If
-    /// 'failed_allocation_size' is zero, nothing about the allocation size is logged.
-    Status MemLimitExceeded(RuntimeState* state, const std::string& details,
-            int64_t failed_allocation = 0);
+  // All the child trackers of this tracker. Used for error reporting and
+  // listing only (i.e. updating the consumption of a parent tracker does not
+  // update that of its children).
+  SpinLock child_trackers_lock_;
+  std::list<std::shared_ptr<MemTracker>> child_trackers_;
 
-    static const std::string COUNTER_NAME;
+  /// Iterator into parent_->child_trackers_ for this object. Stored to have O(1)
+  /// remove.
+  std::list<std::shared_ptr<MemTracker>>::iterator child_tracker_it_;
 
-    static void update_limits(int64_t bytes, std::vector<MemTracker*>* limits) {
-        for (std::vector<MemTracker*>::iterator i = limits->begin(); i != limits->end(); ++i) {
-            (*i)->consume(bytes);
-        }
-    }
+  /// Functions to call after the limit is reached to free memory.
+  std::vector<GcFunction> gc_functions_;
 
-    static bool limit_exceeded(const std::vector<MemTracker*>& limits) {
-        for (std::vector<MemTracker*>::const_iterator i = limits.begin(); i != limits.end(); ++i) {
-            if ((*i)->limit_exceeded()) {
-                // TODO: remove logging
-                LOG(WARNING) << "exceeded limit: limit=" << (*i)->limit() << " consumption="
-                             << (*i)->consumption();
-                return true;
-            }
-        }
+  /// If false, this tracker (and its children) will not be included in LogUsage() output
+  /// if consumption is 0.
+  bool log_usage_if_zero_;
 
-        return false;
-    }
+  /// The number of times the GcFunctions were called.
+  IntCounter* num_gcs_metric_;
 
-    std::string debug_string() {
-        std::stringstream msg;
-        msg << "limit: " << _limit << "; "
-            << "consumption: " << _consumption->current_value() << "; "
-            << "label: " << _label << "; "
-            << "all tracker size: " << _all_trackers.size() << "; "
-            << "limit trackers size: " << _limit_trackers.size() << "; "
-            << "parent is null: " << ((_parent == NULL) ? "true" : "false") << "; ";
-        return msg.str();
-    }
+  /// The number of bytes freed by the last round of calling the GcFunctions (-1 before any
+  /// GCs are performed).
+  IntGauge* bytes_freed_by_last_gc_metric_;
 
-    bool is_consumption_metric_null() { return _consumption_metric == nullptr; }
+  /// The number of bytes over the limit we were the last time LimitExceeded() was called
+  /// and the limit was exceeded pre-GC. -1 if there is no limit or the limit was never
+  /// exceeded.
+  IntGauge* bytes_over_limit_metric_;
 
-private:
-    friend class PoolMemTrackerRegistry;
-
-    /// If consumption is higher than max_consumption, attempts to free memory by calling
-    /// any added GC functions.  Returns true if max_consumption is still exceeded. Takes
-    /// gc_lock. Updates metrics if initialized.
-    bool GcMemory(int64_t max_consumption);
-
-    // Walks the MemTracker hierarchy and populates _all_trackers and _limit_trackers
-    void Init();
-
-    // Adds tracker to _child_trackers
-    void add_child_tracker(MemTracker* tracker) {
-        std::lock_guard<std::mutex> l(_child_trackers_lock);
-        tracker->_child_tracker_it = _child_trackers.insert(_child_trackers.end(), tracker);
-    }
-
-    /// Log consumption of all the trackers provided. Returns the sum of consumption in
-    /// 'logged_consumption'. 'max_recursive_depth' specifies the maximum number of levels
-    /// of children to include in the dump. If it is zero, then no children are dumped.
-    static std::string LogUsage(int max_recursive_depth, const std::string& prefix,
-                                const std::list<MemTracker*>& trackers,
-                                int64_t* logged_consumption);
-
-    /// Lock to protect GcMemory(). This prevents many GCs from occurring at once.
-    std::mutex _gc_lock;
-
-    // Protects _request_to_mem_trackers and _pool_to_mem_trackers
-    static std::mutex _s_mem_trackers_lock;
-
-    // All per-request MemTracker objects that are in use.  For memory management, this map
-    // contains only weak ptrs.  MemTrackers that are handed out via get_query_mem_tracker()
-    // are shared ptrs.  When all the shared ptrs are no longer referenced, the MemTracker
-    // d'tor will be called at which point the weak ptr will be removed from the map.
-    typedef std::unordered_map<TUniqueId, std::weak_ptr<MemTracker>> RequestTrackersMap;
-    static RequestTrackersMap _s_request_to_mem_trackers;
-
-    // Only valid for MemTrackers returned from get_query_mem_tracker()
-    /// Only valid for MemTrackers returned from CreateQueryMemTracker()
-    TUniqueId _query_id;
-
-    /// Only valid for MemTrackers returned from GetRequestPoolMemTracker()
-    std::string _pool_name;
-
-    int64_t _limit; // in bytes
-    //int64_t _consumption;  // in bytes
-
-    std::string _label;
-    MemTracker* _parent;
-
-    /// in bytes; not owned
-    RuntimeProfile::HighWaterMarkCounter* _consumption;
-
-    /// holds _consumption counter if not tied to a profile
-    RuntimeProfile::HighWaterMarkCounter _local_counter;
-
-    /// If non-NULL, used to measure consumption (in bytes) rather than the values provided
-    /// to Consume()/Release(). Only used for the process tracker, thus parent_ should be
-    /// NULL if _consumption_metric is set.
-    UIntGauge* _consumption_metric;
-
-    /// If non-NULL, counters from a corresponding ReservationTracker that should be
-    /// reported in logs and other diagnostics. Owned by this MemTracker. The counters
-    /// are owned by the fragment's RuntimeProfile.
-    AtomicPtr<ReservationTrackerCounters> _reservation_counters;
-
-    std::vector _all_trackers;   // this tracker plus all of its ancestors
-    std::vector _limit_trackers; // _all_trackers with valid limits
-
-    // All the child trackers of this tracker. Used for error reporting only.
-    // i.e., Updating a parent tracker does not update the children.
-    mutable std::mutex _child_trackers_lock;
-    std::list<MemTracker*> _child_trackers;
-    // Iterator into _parent->_child_trackers for this object. Stored to have O(1)
-    // remove.
-    std::list<MemTracker*>::iterator _child_tracker_it;
-
-    /// Functions to call after the limit is reached to free memory.
-    std::vector<GcFunction> _gc_functions;
-
-    /// If false, this tracker (and its children) will not be included in LogUsage() output
-    /// if consumption is 0.
-    bool _log_usage_if_zero;
-
-    /// The number of times the GcFunctions were called.
-    IntCounter* _num_gcs_metric;
-
-    /// The number of bytes freed by the last round of calling the GcFunctions (-1 before any
-    /// GCs are performed).
-    IntGauge* _bytes_freed_by_last_gc_metric;
-
-    /// The number of bytes over the limit we were the last time LimitExceeded() was called
-    /// and the limit was exceeded pre-GC. -1 if there is no limit or the limit was never
-    /// exceeded.
-    IntGauge* _bytes_over_limit_metric;
-
-    /// Metric for limit_.
-    IntGauge* _limit_metric;
-
-    // If true, calls unregister_from_parent() in the dtor. This is only used for
-    // the query wide trackers to remove it from the process mem tracker. The
-    // process tracker never gets deleted so it is safe to reference it in the dtor.
-    // The query tracker has lifetime shared by multiple plan fragments so it's hard
-    // to do cleanup another way.
-    bool _auto_unregister = false;
+  /// Metric for limit_.
+  IntGauge* limit_metric_;
 };
 
 /// Global registry for query and pool MemTrackers. Owned by ExecEnv.
@@ -507,19 +588,19 @@ class PoolMemTrackerRegistry {
   /// with the process tracker as its parent. There is no explicit per-pool byte_limit
   /// set at any particular impalad, so newly created trackers will always have a limit
   /// of -1.
-    MemTracker* GetRequestPoolMemTracker(const std::string& pool_name, bool create_if_not_present);
+  MemTracker* GetRequestPoolMemTracker(
+      const std::string& pool_name, bool create_if_not_present);
 
  private:
   /// All per-request pool MemTracker objects. It is assumed that request pools will live
   /// for the entire duration of the process lifetime so MemTrackers are never removed
   /// from this map. Protected by '_pool_to_mem_trackers_lock'
   typedef std::unordered_map> PoolTrackersMap;
-  PoolTrackersMap _pool_to_mem_trackers;
+  PoolTrackersMap pool_to_mem_trackers_;
   /// IMPALA-3068: Use SpinLock instead of std::mutex so that the lock won't
   /// automatically destroy itself as part of process teardown, which could cause races.
-  SpinLock _pool_to_mem_trackers_lock;
+  SpinLock pool_to_mem_trackers_lock_;
 };
 
-} // namespace doris
+} // namespace doris
 
-#endif
diff --git a/be/src/runtime/memory_scratch_sink.cpp b/be/src/runtime/memory_scratch_sink.cpp
index 8093850d2c..fcc20c4948 100644
--- a/be/src/runtime/memory_scratch_sink.cpp
+++ b/be/src/runtime/memory_scratch_sink.cpp
@@ -50,8 +50,7 @@ Status MemoryScratchSink::prepare_exprs(RuntimeState* state) {
     RETURN_IF_ERROR(Expr::create_expr_trees(
             state->obj_pool(), _t_output_expr, &_output_expr_ctxs));
     // Prepare the exprs to run.
-    RETURN_IF_ERROR(Expr::prepare(
-            _output_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker));
     // generate the arrow schema 
     RETURN_IF_ERROR(convert_to_arrow_schema(_row_desc, &_arrow_schema));
     return Status::OK();
diff --git a/be/src/runtime/mysql_table_sink.cpp b/be/src/runtime/mysql_table_sink.cpp
index ec267e93e7..8377d8ad28 100644
--- a/be/src/runtime/mysql_table_sink.cpp
+++ b/be/src/runtime/mysql_table_sink.cpp
@@ -29,14 +29,13 @@
 namespace doris {
 
 MysqlTableSink::MysqlTableSink(ObjectPool* pool, const RowDescriptor& row_desc,
-                               const std::vector& t_exprs) :
-        _pool(pool),
-        _row_desc(row_desc),
-        _t_output_expr(t_exprs) {
-}
+                               const std::vector& t_exprs)
+        : _pool(pool),
+          _row_desc(row_desc),
+          _t_output_expr(t_exprs),
+          _mem_tracker(MemTracker::CreateTracker(-1, "MysqlTableSink")) {}
 
-MysqlTableSink::~MysqlTableSink() {
-}
+MysqlTableSink::~MysqlTableSink() {}
 
 Status MysqlTableSink::init(const TDataSink& t_sink) {
     RETURN_IF_ERROR(DataSink::init(t_sink));
@@ -57,7 +56,7 @@ Status MysqlTableSink::init(const TDataSink& t_sink) {
 Status MysqlTableSink::prepare(RuntimeState* state) {
     RETURN_IF_ERROR(DataSink::prepare(state));
     // Prepare the exprs to run.
-    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker.get()));
+    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _mem_tracker));
     std::stringstream title;
     title << "MysqlTableSink (frag_id=" << state->fragment_instance_id() << ")";
     // create profile
diff --git a/be/src/runtime/mysql_table_sink.h b/be/src/runtime/mysql_table_sink.h
index b6ac5ab766..de63edb4ce 100644
--- a/be/src/runtime/mysql_table_sink.h
+++ b/be/src/runtime/mysql_table_sink.h
@@ -73,7 +73,7 @@ private:
     MysqlTableWriter* _writer;
 
     RuntimeProfile* _profile;
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
 };
 
 }
diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp
index d55dd19e0c..264a6685a8 100644
--- a/be/src/runtime/plan_fragment_executor.cpp
+++ b/be/src/runtime/plan_fragment_executor.cpp
@@ -65,11 +65,6 @@ PlanFragmentExecutor::~PlanFragmentExecutor() {
     // }
     // at this point, the report thread should have been stopped
     DCHECK(!_report_thread_active);
-
-    // fragment mem tracker needs unregister
-    if (_mem_tracker.get() != nullptr) {
-        _mem_tracker->unregister_from_parent();
-    }
 }
 
 Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) {
@@ -134,9 +129,8 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) {
         bytes_limit = _exec_env->process_mem_tracker()->limit();
     }
     // NOTE: this MemTracker only for olap
-    _mem_tracker.reset(
-            new MemTracker(bytes_limit, "fragment mem-limit", _exec_env->process_mem_tracker()));
-    _runtime_state->set_fragment_mem_tracker(_mem_tracker.get());
+    _mem_tracker = MemTracker::CreateTracker(bytes_limit, "fragment mem-limit", _exec_env->process_mem_tracker());
+    _runtime_state->set_fragment_mem_tracker(_mem_tracker);
 
     LOG(INFO) << "Using query memory limit: "
         << PrettyPrinter::print(bytes_limit, TUnit::BYTES);
@@ -221,7 +215,7 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) {
     _row_batch.reset(new RowBatch(
             _plan->row_desc(),
             _runtime_state->batch_size(),
-            _runtime_state->instance_mem_tracker()));
+            _runtime_state->instance_mem_tracker().get()));
     // _row_batch->tuple_data_pool()->set_limits(*_runtime_state->mem_trackers());
     VLOG(3) << "plan_root=\n" << _plan->debug_string();
     _prepared = true;
@@ -574,7 +568,7 @@ void PlanFragmentExecutor::close() {
      
     // _mem_tracker init failed
     if (_mem_tracker.get() != nullptr) {
-        _mem_tracker->release(_mem_tracker->consumption());
+        _mem_tracker->Release(_mem_tracker->consumption());
     }
     _closed = true;
 }
diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h
index bd4b0a2e69..3d36c70a3a 100644
--- a/be/src/runtime/plan_fragment_executor.h
+++ b/be/src/runtime/plan_fragment_executor.h
@@ -154,8 +154,7 @@ private:
     ExecEnv* _exec_env;  // not owned
     ExecNode* _plan;  // lives in _runtime_state->obj_pool()
     TUniqueId _query_id;
-    // MemTracker* _mem_tracker;
-    boost::scoped_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
 
     // profile reporting-related
     report_status_callback _report_status_cb;
diff --git a/be/src/runtime/qsorter.cpp b/be/src/runtime/qsorter.cpp
index eaabebf1bc..e9c6b405f5 100644
--- a/be/src/runtime/qsorter.cpp
+++ b/be/src/runtime/qsorter.cpp
@@ -84,7 +84,7 @@ QSorter::QSorter(
             RuntimeState* state) :
         _row_desc(row_desc),
         _order_expr_ctxs(order_expr_ctxs),
-        _tuple_pool(new MemPool(state->instance_mem_tracker())) {
+        _tuple_pool(new MemPool(state->instance_mem_tracker().get())) {
 }
 
 Status QSorter::prepare(RuntimeState* state) {
diff --git a/be/src/runtime/result_sink.cpp b/be/src/runtime/result_sink.cpp
index af85fb8256..0a39f0672d 100644
--- a/be/src/runtime/result_sink.cpp
+++ b/be/src/runtime/result_sink.cpp
@@ -57,8 +57,7 @@ Status ResultSink::prepare_exprs(RuntimeState* state) {
     RETURN_IF_ERROR(Expr::create_expr_trees(
             state->obj_pool(), _t_output_expr, &_output_expr_ctxs));
     // Prepare the exprs to run.
-    RETURN_IF_ERROR(Expr::prepare(
-            _output_expr_ctxs, state, _row_desc, _expr_mem_tracker.get()));
+    RETURN_IF_ERROR(Expr::prepare(_output_expr_ctxs, state, _row_desc, _expr_mem_tracker));
     return Status::OK();
 }
 
diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp
index 4cfd953996..2fd4fcf366 100644
--- a/be/src/runtime/row_batch.cpp
+++ b/be/src/runtime/row_batch.cpp
@@ -56,7 +56,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, int capacity, MemTracker* mem_
     DCHECK_GT(_tuple_ptrs_size, 0);
     // TODO: switch to Init() pattern so we can check memory limit and return Status.
     if (config::enable_partitioned_aggregation) {
-        _mem_tracker->consume(_tuple_ptrs_size);
+        _mem_tracker->Consume(_tuple_ptrs_size);
         _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
         DCHECK(_tuple_ptrs != NULL);
     } else {
@@ -90,7 +90,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc,
     DCHECK_GT(_tuple_ptrs_size, 0);
     // TODO: switch to Init() pattern so we can check memory limit and return Status.
     if (config::enable_partitioned_aggregation) {
-        _mem_tracker->consume(_tuple_ptrs_size);
+        _mem_tracker->Consume(_tuple_ptrs_size);
         _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
         DCHECK(_tuple_ptrs != nullptr);
     } else {
@@ -188,7 +188,7 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, const TRowBatch& input_batch,
     DCHECK_GT(_tuple_ptrs_size, 0);
     // TODO: switch to Init() pattern so we can check memory limit and return Status.
     if (config::enable_partitioned_aggregation) {
-        _mem_tracker->consume(_tuple_ptrs_size);
+        _mem_tracker->Consume(_tuple_ptrs_size);
         _tuple_ptrs = reinterpret_cast(malloc(_tuple_ptrs_size));
         DCHECK(_tuple_ptrs != NULL);
     } else {
@@ -291,7 +291,7 @@ void RowBatch::clear() {
     if (config::enable_partitioned_aggregation) {
         DCHECK(_tuple_ptrs != NULL);
         free(_tuple_ptrs);
-        _mem_tracker->release(_tuple_ptrs_size);
+        _mem_tracker->Release(_tuple_ptrs_size);
         _tuple_ptrs = NULL;
     }
     _cleared = true;
@@ -438,7 +438,7 @@ void RowBatch::add_io_buffer(DiskIoMgr::BufferDescriptor* buffer) {
     DCHECK(buffer != NULL);
     _io_buffers.push_back(buffer);
     _auxiliary_mem_usage += buffer->buffer_len();
-    buffer->set_mem_tracker(_mem_tracker);
+    buffer->set_mem_tracker(std::shared_ptr(_mem_tracker));  // FIXME(yingchun): wrapping the raw pointer creates a second owning control block (double-delete risk); pass the owner's shared_ptr instead
 }
 
 Status RowBatch::resize_and_allocate_tuple_buffer(RuntimeState* state,
@@ -522,7 +522,7 @@ void RowBatch::transfer_resource_ownership(RowBatch* dest) {
         DiskIoMgr::BufferDescriptor* buffer = _io_buffers[i];
         dest->_io_buffers.push_back(buffer);
         dest->_auxiliary_mem_usage += buffer->buffer_len();
-        buffer->set_mem_tracker(dest->_mem_tracker);
+        buffer->set_mem_tracker(std::shared_ptr(dest->_mem_tracker));   // FIXME(yingchun): wrapping the raw pointer creates a second owning control block (double-delete risk); pass the owner's shared_ptr instead
     }
     _io_buffers.clear();
 
@@ -585,7 +585,7 @@ void RowBatch::acquire_state(RowBatch* src) {
         DiskIoMgr::BufferDescriptor* buffer = src->_io_buffers[i];
         _io_buffers.push_back(buffer);
         _auxiliary_mem_usage += buffer->buffer_len();
-        buffer->set_mem_tracker(_mem_tracker);
+        buffer->set_mem_tracker(std::shared_ptr(_mem_tracker));  // FIXME(yingchun): wrapping the raw pointer creates a second owning control block (double-delete risk); pass the owner's shared_ptr instead
     }
     src->_io_buffers.clear();
     src->_auxiliary_mem_usage = 0;
diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp
index 372c80894f..15781e7cb9 100644
--- a/be/src/runtime/runtime_state.cpp
+++ b/be/src/runtime/runtime_state.cpp
@@ -51,11 +51,11 @@ RuntimeState::RuntimeState(
         const TUniqueId& fragment_instance_id,
         const TQueryOptions& query_options,
         const TQueryGlobals& query_globals, ExecEnv* exec_env) :
+            _fragment_mem_tracker(nullptr),
             _profile("Fragment " + print_id(fragment_instance_id)),
             _obj_pool(new ObjectPool()),
             _data_stream_recvrs_pool(new ObjectPool()),
             _unreported_error_idx(0),
-            _fragment_mem_tracker(NULL),
             _is_cancelled(false),
             _per_fragment_instance_idx(0),
             _root_node_id(-1),
@@ -76,12 +76,12 @@ RuntimeState::RuntimeState(
         const TExecPlanFragmentParams& fragment_params,
         const TQueryOptions& query_options,
         const TQueryGlobals& query_globals, ExecEnv* exec_env) :
+            _fragment_mem_tracker(nullptr),
             _profile("Fragment " + print_id(fragment_params.params.fragment_instance_id)),
             _obj_pool(new ObjectPool()),
             _data_stream_recvrs_pool(new ObjectPool()),
             _unreported_error_idx(0),
             _query_id(fragment_params.params.query_id),
-            _fragment_mem_tracker(NULL),
             _is_cancelled(false),
             _per_fragment_instance_idx(0),
             _root_node_id(-1),
@@ -153,27 +153,6 @@ RuntimeState::~RuntimeState() {
     if (_exec_env != nullptr && _exec_env->thread_mgr() != nullptr) {
         _exec_env->thread_mgr()->unregister_pool(_resource_pool);
     }
-
-#ifndef BE_TEST
-    // _query_mem_tracker must be valid as long as _instance_mem_tracker is so
-    // delete _instance_mem_tracker first.
-    // LogUsage() walks the MemTracker tree top-down when the memory limit is exceeded.
-    // Break the link between the instance_mem_tracker and its parent (_query_mem_tracker)
-    // before the _instance_mem_tracker and its children are destroyed.
-    if (_instance_mem_tracker.get() != NULL) {
-        // May be NULL if InitMemTrackers() is not called, for example from tests.
-        _instance_mem_tracker->unregister_from_parent();
-        _instance_mem_tracker->close();
-    }
-
-    _instance_mem_tracker.reset();
-   
-    if (_query_mem_tracker.get() != NULL) {
-        _query_mem_tracker->unregister_from_parent();
-        _query_mem_tracker->close();
-    }
-    _query_mem_tracker.reset();
-#endif
 }
 
 Status RuntimeState::init(
@@ -235,10 +214,12 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) {
     auto mem_tracker_counter = ADD_COUNTER(&_profile, "MemoryLimit", TUnit::BYTES);
     mem_tracker_counter->set(bytes_limit);
 
-    _query_mem_tracker.reset(
-            new MemTracker(bytes_limit, runtime_profile()->name(), _exec_env->process_mem_tracker()));
-    _instance_mem_tracker.reset(
-            new MemTracker(&_profile, -1, runtime_profile()->name(), _query_mem_tracker.get()));
+    _query_mem_tracker = MemTracker::CreateTracker(
+            bytes_limit, std::string("RuntimeState: query ") + runtime_profile()->name(),
+            _exec_env->process_mem_tracker());
+    _instance_mem_tracker = MemTracker::CreateTracker(
+            &_profile, -1, std::string("RuntimeState: instance ") + runtime_profile()->name(),
+            _query_mem_tracker);
 
     /*
     // TODO: this is a stopgap until we implement ExprContext
@@ -250,9 +231,9 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) {
 
     RETURN_IF_ERROR(init_buffer_poolstate());
 
-    _initial_reservations = _obj_pool->add(new InitialReservations(_obj_pool.get(),
-                      _buffer_reservation, _query_mem_tracker.get(), 
-                      _query_options.initial_reservation_total_claims));
+    _initial_reservations = _obj_pool->add(
+            new InitialReservations(_obj_pool.get(), _buffer_reservation, _query_mem_tracker,
+                                    _query_options.initial_reservation_total_claims));
     RETURN_IF_ERROR(
         _initial_reservations->Init(_query_id, min_reservation()));
     DCHECK_EQ(0, _initial_reservation_refcnt.load());
@@ -267,13 +248,13 @@ Status RuntimeState::init_mem_trackers(const TUniqueId& query_id) {
 }
 
 Status RuntimeState::init_instance_mem_tracker() {
-    _instance_mem_tracker.reset(new MemTracker(-1));
+    _instance_mem_tracker = MemTracker::CreateTracker(-1);
     return Status::OK();
 }
 
 Status RuntimeState::init_buffer_poolstate() {
   ExecEnv* exec_env = ExecEnv::GetInstance();
-  int64_t mem_limit = _query_mem_tracker->lowest_limit();
+  int64_t mem_limit = _query_mem_tracker->GetLowestLimit(MemLimit::HARD);
   int64_t max_reservation;
   if (query_options().__isset.buffer_pool_limit
       && query_options().buffer_pool_limit > 0) {
@@ -303,7 +284,7 @@ Status RuntimeState::create_block_mgr() {
     if (block_mgr_limit < 0) {
         block_mgr_limit = std::numeric_limits::max();
     }
-    RETURN_IF_ERROR(BufferedBlockMgr2::create(this, _query_mem_tracker.get(),
+    RETURN_IF_ERROR(BufferedBlockMgr2::create(this, _query_mem_tracker,
             runtime_profile(), _exec_env->tmp_file_mgr(),
             block_mgr_limit, _exec_env->disk_io_mgr()->max_read_buffer_size(), &_block_mgr2));
     return Status::OK();
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index dcd97e2b3e..8fc07e76ab 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -162,17 +162,15 @@ public:
     ExecEnv* exec_env() {
         return _exec_env;
     }
-    std::vector* mem_trackers() {
-        return &_mem_trackers;
+    const std::vector>& mem_trackers() {
+        return _mem_trackers;
     }
-    MemTracker* fragment_mem_tracker() {
+   std::shared_ptr fragment_mem_tracker() {
         return _fragment_mem_tracker;
     }
-    MemTracker* instance_mem_tracker() { {
-        return _instance_mem_tracker.get(); }
-    }
-    MemTracker* query_mem_tracker() { {
-        return _query_mem_tracker.get(); }
+
+    std::shared_ptr instance_mem_tracker() {
+        return _instance_mem_tracker;
     }
     ThreadResourceMgr::ResourcePool* resource_pool() {
         return _resource_pool;
@@ -226,10 +224,10 @@ public:
         int buffer_size, RuntimeProfile* profile);
 
     // Sets the fragment memory limit and adds it to _mem_trackers
-    void set_fragment_mem_tracker(MemTracker* limit) {
-        DCHECK(_fragment_mem_tracker == NULL);
-        _fragment_mem_tracker = limit;
-        _mem_trackers.push_back(limit);
+    void set_fragment_mem_tracker(std::shared_ptr tracker) {
+        DCHECK(_fragment_mem_tracker == nullptr);
+        _fragment_mem_tracker = tracker;
+        _mem_trackers.push_back(tracker);
     }
 
     // Appends error to the _error_log if there is space
@@ -508,6 +506,19 @@ private:
 
     static const int DEFAULT_BATCH_SIZE = 2048;
 
+    // all mem limits that apply to this query
+    std::vector> _mem_trackers;
+
+    // Fragment memory limit.  Also contained in _mem_trackers
+    std::shared_ptr _fragment_mem_tracker;
+
+    // MemTracker that is shared by all fragment instances running on this host.
+    // The query mem tracker must be released after the _instance_mem_tracker.
+    std::shared_ptr _query_mem_tracker;
+
+    // Memory usage of this fragment instance
+    std::shared_ptr _instance_mem_tracker;
+
     // put runtime state before _obj_pool, so that it will be deconstructed after
     // _obj_pool. Because some of object in _obj_pool will use profile when deconstructing.
     RuntimeProfile _profile;
@@ -551,19 +562,6 @@ private:
     // state is responsible for returning this pool to the thread mgr.
     ThreadResourceMgr::ResourcePool* _resource_pool;
 
-    // all mem limits that apply to this query
-    std::vector _mem_trackers;
-
-    // Fragment memory limit.  Also contained in _mem_trackers
-    MemTracker* _fragment_mem_tracker;
-
-    // MemTracker that is shared by all fragment instances running on this host.
-    // The query mem tracker must be released after the _instance_mem_tracker.
-    boost::shared_ptr _query_mem_tracker;
-
-    // Memory usage of this fragment instance
-    boost::scoped_ptr _instance_mem_tracker;
-
     // if true, execution should stop with a CANCELLED status
     bool _is_cancelled;
 
diff --git a/be/src/runtime/spill_sorter.cc b/be/src/runtime/spill_sorter.cc
index b331731376..80d1b30a98 100644
--- a/be/src/runtime/spill_sorter.cc
+++ b/be/src/runtime/spill_sorter.cc
@@ -646,7 +646,7 @@ Status SpillSorter::Run::prepare_read() {
     //         _sorter->_state->batch_size(), _sorter->_mem_tracker));
     _buffered_batch.reset(
             new RowBatch(
-                *_sorter->_output_row_desc, _sorter->_state->batch_size(), _sorter->_mem_tracker));
+                *_sorter->_output_row_desc, _sorter->_state->batch_size(), _sorter->_mem_tracker.get()));
 
     // If the run is pinned, merge is not invoked, so _buffered_batch is not needed
     // and the individual blocks do not need to be pinned.
@@ -1031,7 +1031,7 @@ inline void SpillSorter::TupleSorter::swap(uint8_t* left, uint8_t* right) {
 // SpillSorter methods
 SpillSorter::SpillSorter(const TupleRowComparator& compare_less_than,
         const vector& slot_materialize_expr_ctxs,
-        RowDescriptor* output_row_desc, MemTracker* mem_tracker,
+        RowDescriptor* output_row_desc, const std::shared_ptr& mem_tracker,
         RuntimeProfile* profile, RuntimeState* state) :
     _state(state),
     _compare_less_than(compare_less_than),
@@ -1258,7 +1258,7 @@ Status SpillSorter::merge_intermediate_runs() {
         int num_runs_to_merge = std::min(max_runs_per_intermediate_merge,
                 _sorted_runs.size() - max_runs_per_intermediate_merge);
         RETURN_IF_ERROR(create_merger(num_runs_to_merge));
-        RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size(), _mem_tracker);
+        RowBatch intermediate_merge_batch(*_output_row_desc, _state->batch_size(), _mem_tracker.get());
         // merged_run is the new sorted run that is produced by the intermediate merge.
         Run* merged_run = _obj_pool.add(
                 new Run(this, _output_row_desc->tuple_descriptors()[0], false));
diff --git a/be/src/runtime/spill_sorter.h b/be/src/runtime/spill_sorter.h
index d8ddab65c0..5bafee93ef 100644
--- a/be/src/runtime/spill_sorter.h
+++ b/be/src/runtime/spill_sorter.h
@@ -94,7 +94,7 @@ public:
     // and retrieve rows from an intermediate merger.
     SpillSorter(const TupleRowComparator& compare_less_than,
             const std::vector& sort_tuple_slot_expr_ctxs,
-            RowDescriptor* output_row_desc, MemTracker* mem_tracker,
+            RowDescriptor* output_row_desc, const std::shared_ptr& mem_tracker,
             RuntimeProfile* profile, RuntimeState* state);
 
     ~SpillSorter();
@@ -174,7 +174,7 @@ private:
     std::vector _sort_tuple_slot_expr_ctxs;
 
     // Mem tracker for batches created during merge. Not owned by SpillSorter.
-    MemTracker* _mem_tracker;
+    std::shared_ptr _mem_tracker;
 
     // Descriptor for the sort tuple. Input rows are materialized into 1 tuple before
     // sorting. Not owned by the SpillSorter.
diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp
index 2a125af94f..57c33a5531 100644
--- a/be/src/runtime/tablets_channel.cpp
+++ b/be/src/runtime/tablets_channel.cpp
@@ -29,9 +29,9 @@ namespace doris {
 
 std::atomic TabletsChannel::_s_tablet_writer_count;
 
-TabletsChannel::TabletsChannel(const TabletsChannelKey& key, MemTracker* mem_tracker):
+TabletsChannel::TabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker):
         _key(key), _state(kInitialized), _closed_senders(64) {
-    _mem_tracker.reset(new MemTracker(-1, "tablets channel", mem_tracker));
+    _mem_tracker = MemTracker::CreateTracker(-1, "tablets channel", mem_tracker);
     static std::once_flag once_flag;
     std::call_once(once_flag, [] {
         REGISTER_GAUGE_DORIS_METRIC(tablet_writer_count, [&]() {
@@ -235,7 +235,7 @@ Status TabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& params)
         request.slots = index_slots;
 
         DeltaWriter* writer = nullptr;
-        auto st = DeltaWriter::open(&request, _mem_tracker.get(),  &writer);
+        auto st = DeltaWriter::open(&request, _mem_tracker,  &writer);
         if (st != OLAP_SUCCESS) {
             std::stringstream ss;
             ss << "open delta writer failed, tablet_id=" << tablet.tablet_id()
diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h
index c9953c3cd3..6e7851152b 100644
--- a/be/src/runtime/tablets_channel.h
+++ b/be/src/runtime/tablets_channel.h
@@ -56,7 +56,7 @@ class OlapTableSchemaParam;
 // Write channel for a particular (load, index).
 class TabletsChannel {
 public:
-    TabletsChannel(const TabletsChannelKey& key, MemTracker* mem_tracker);
+    TabletsChannel(const TabletsChannelKey& key, const std::shared_ptr& mem_tracker);
 
     ~TabletsChannel();
 
@@ -123,7 +123,7 @@ private:
 
     std::unordered_set _partition_ids;
 
-    std::unique_ptr _mem_tracker;
+    std::shared_ptr _mem_tracker;
 
     static std::atomic _s_tablet_writer_count;
 };
diff --git a/be/src/runtime/test_env.cc b/be/src/runtime/test_env.cc
index 136929809e..1289aaad5f 100644
--- a/be/src/runtime/test_env.cc
+++ b/be/src/runtime/test_env.cc
@@ -33,7 +33,7 @@ TestEnv::TestEnv() {
     // _exec_env->init_for_tests();
     _io_mgr_tracker.reset(new MemTracker(-1));
     _block_mgr_parent_tracker.reset(new MemTracker(-1));
-    _exec_env->disk_io_mgr()->init(_io_mgr_tracker.get());
+    _exec_env->disk_io_mgr()->init(_io_mgr_tracker);
     init_metrics();
     _tmp_file_mgr.reset(new TmpFileMgr());
     _tmp_file_mgr->init(_metrics.get());
@@ -77,7 +77,7 @@ Status TestEnv::create_query_state(int64_t query_id, int max_buffers, int block_
 
     shared_ptr mgr;
     RETURN_IF_ERROR(BufferedBlockMgr2::create(
-                *runtime_state, _block_mgr_parent_tracker.get(),
+                *runtime_state, _block_mgr_parent_tracker,
                 (*runtime_state)->runtime_profile(), _tmp_file_mgr.get(),
                 calculate_mem_tracker(max_buffers, block_size), block_size, &mgr));
     (*runtime_state)->set_block_mgr2(mgr);
diff --git a/be/src/runtime/test_env.h b/be/src/runtime/test_env.h
index 1a2c49bf47..1aa937eb2e 100644
--- a/be/src/runtime/test_env.h
+++ b/be/src/runtime/test_env.h
@@ -56,8 +56,8 @@ public:
     ExecEnv* exec_env() {
         return _exec_env.get();
     }
-    MemTracker* block_mgr_parent_tracker() {
-        return _block_mgr_parent_tracker.get();
+    std::shared_ptr block_mgr_parent_tracker() {
+        return _block_mgr_parent_tracker;
     }
     MemTracker* io_mgr_tracker() {
         return _io_mgr_tracker.get();
@@ -80,8 +80,8 @@ private:
     // Global state for test environment.
     static boost::scoped_ptr _s_static_metrics;
     boost::scoped_ptr _exec_env;
-    boost::scoped_ptr _block_mgr_parent_tracker;
-    boost::scoped_ptr _io_mgr_tracker;
+    std::shared_ptr _block_mgr_parent_tracker;
+    std::shared_ptr _io_mgr_tracker;
     boost::scoped_ptr _metrics;
     boost::scoped_ptr _tmp_file_mgr;
 
diff --git a/be/src/runtime/vectorized_row_batch.cpp b/be/src/runtime/vectorized_row_batch.cpp
index 60b309c8bf..68ae2cc098 100644
--- a/be/src/runtime/vectorized_row_batch.cpp
+++ b/be/src/runtime/vectorized_row_batch.cpp
@@ -24,12 +24,12 @@ namespace doris {
 
 VectorizedRowBatch::VectorizedRowBatch(const TabletSchema* schema,
                                        const std::vector& cols, int capacity,
-                                       MemTracker* parent_tracker)
+                                       const std::shared_ptr& parent_tracker)
         : _schema(schema), _cols(cols), _capacity(capacity), _limit(capacity) {
     _selected_in_use = false;
     _size = 0;
 
-    _tracker.reset(new MemTracker(-1, "VectorizedRowBatch", parent_tracker, true));
+    _tracker = MemTracker::CreateTracker(-1, "VectorizedRowBatch", parent_tracker);
     _mem_pool.reset(new MemPool(_tracker.get()));
 
     _selected = reinterpret_cast(new char[sizeof(uint16_t) * _capacity]);
diff --git a/be/src/runtime/vectorized_row_batch.h b/be/src/runtime/vectorized_row_batch.h
index aef23ae701..c3a8ee2f9e 100644
--- a/be/src/runtime/vectorized_row_batch.h
+++ b/be/src/runtime/vectorized_row_batch.h
@@ -73,7 +73,7 @@ private:
 class VectorizedRowBatch {
 public:
     VectorizedRowBatch(const TabletSchema* schema, const std::vector& cols, int capacity,
-                       MemTracker* parent_tracker = nullptr);
+                       const std::shared_ptr& parent_tracker = nullptr);
 
     ~VectorizedRowBatch() {
         for (auto vec: _col_vectors) {
@@ -147,7 +147,7 @@ private:
     bool _selected_in_use = false;
     uint8_t _block_status;
 
-    std::unique_ptr _tracker;
+    std::shared_ptr _tracker;
     std::unique_ptr _mem_pool;
     uint16_t _limit;
 };
diff --git a/be/src/testutil/function_utils.cpp b/be/src/testutil/function_utils.cpp
index 28ac6c1d15..0506dcc462 100644
--- a/be/src/testutil/function_utils.cpp
+++ b/be/src/testutil/function_utils.cpp
@@ -29,8 +29,8 @@ namespace doris {
 FunctionUtils::FunctionUtils() {
     doris_udf::FunctionContext::TypeDesc return_type;
     std::vector arg_types;
-    _mem_tracker = new MemTracker();
-    _memory_pool = new MemPool(_mem_tracker);
+    _mem_tracker.reset(new MemTracker(-1, "function util"));
+    _memory_pool = new MemPool(_mem_tracker.get());
     _fn_ctx = FunctionContextImpl::create_context(
         _state, _memory_pool, return_type, arg_types, 0, false);
 }
@@ -38,8 +38,8 @@ FunctionUtils::FunctionUtils(RuntimeState* state) {
     _state = state;
     doris_udf::FunctionContext::TypeDesc return_type;
     std::vector arg_types;
-    _mem_tracker = new MemTracker();
-    _memory_pool = new MemPool(_mem_tracker);
+    _mem_tracker.reset(new MemTracker(-1, "function util"));
+    _memory_pool = new MemPool(_mem_tracker.get());
     _fn_ctx = FunctionContextImpl::create_context(
         _state, _memory_pool, return_type, arg_types, 0, false);
 }
@@ -48,7 +48,6 @@ FunctionUtils::~FunctionUtils() {
     _fn_ctx->impl()->close();
     delete _fn_ctx;
     delete _memory_pool;
-    delete _mem_tracker;
 }
 
 }
diff --git a/be/src/testutil/function_utils.h b/be/src/testutil/function_utils.h
index d777aedb0c..041f6b2b55 100644
--- a/be/src/testutil/function_utils.h
+++ b/be/src/testutil/function_utils.h
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include 
+
 namespace doris_udf {
 class FunctionContext;
 }
@@ -36,7 +38,7 @@ public:
     }
 private:
     RuntimeState* _state = nullptr;
-    MemTracker* _mem_tracker = nullptr;
+    std::shared_ptr _mem_tracker;
     MemPool* _memory_pool = nullptr;
     doris_udf::FunctionContext* _fn_ctx = nullptr;
 };
diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp
index 4dd9294da9..5c35e44108 100644
--- a/be/src/util/arrow/row_batch.cpp
+++ b/be/src/util/arrow/row_batch.cpp
@@ -363,7 +363,7 @@ public:
 
     ToRowBatchConverter(const arrow::RecordBatch& batch,
                         const RowDescriptor& row_desc,
-                        MemTracker* tracker)
+                        const std::shared_ptr& tracker)
         : _batch(batch), _row_desc(row_desc), _tracker(tracker) { }
 
 #define PRIMITIVE_VISIT(TYPE) \
@@ -407,7 +407,7 @@ private:
 private:
     const arrow::RecordBatch& _batch;
     const RowDescriptor& _row_desc;
-    MemTracker* _tracker;
+    std::shared_ptr _tracker;
 
     std::unique_ptr _cur_slot_ref;
     std::shared_ptr _output;
@@ -427,7 +427,7 @@ Status ToRowBatchConverter:: convert(std::shared_ptr* result) {
     // TODO(zc): check if field type match
 
     size_t num_rows = _batch.num_rows();
-    _output.reset(new RowBatch(_row_desc, num_rows, _tracker));
+    _output.reset(new RowBatch(_row_desc, num_rows, _tracker.get()));
     _output->commit_rows(num_rows);
     auto pool = _output->tuple_data_pool();
     for (size_t row_id = 0; row_id < num_rows; ++row_id) {
@@ -454,7 +454,7 @@ Status ToRowBatchConverter:: convert(std::shared_ptr* result) {
 
 Status convert_to_row_batch(const arrow::RecordBatch& batch,
                             const RowDescriptor& row_desc,
-                            MemTracker* tracker,
+                            const std::shared_ptr& tracker,
                             std::shared_ptr* result) {
     ToRowBatchConverter converter(batch, row_desc, tracker);
     return converter.convert(result);
diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h
index eb68fe46ab..68d03ffcb9 100644
--- a/be/src/util/arrow/row_batch.h
+++ b/be/src/util/arrow/row_batch.h
@@ -68,7 +68,7 @@ Status convert_to_arrow_batch(
 Status convert_to_row_batch(
     const arrow::RecordBatch& batch,
     const RowDescriptor& row_desc,
-    MemTracker* tracker,
+    const std::shared_ptr<MemTracker>& tracker,
     std::shared_ptr<RowBatch>* result);
 
 Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result);
diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp
index 53c5b3431d..2208ede1f7 100644
--- a/be/src/util/runtime_profile.cpp
+++ b/be/src/util/runtime_profile.cpp
@@ -382,10 +382,34 @@ const std::string* RuntimeProfile::get_info_string(const std::string& key) {
 ADD_COUNTER_IMPL(AddHighWaterMarkCounter, HighWaterMarkCounter);
 //ADD_COUNTER_IMPL(AddConcurrentTimerCounter, ConcurrentTimerCounter);
 
+std::shared_ptr<RuntimeProfile::HighWaterMarkCounter> RuntimeProfile::AddSharedHighWaterMarkCounter(
+        const std::string& name, TUnit::type unit, const std::string& parent_counter_name) {
+    DCHECK_EQ(_is_averaged_profile, false);
+    boost::lock_guard<boost::mutex> l(_counter_map_lock);
+    if (_shared_counter_pool.find(name) != _shared_counter_pool.end()) {
+        return _shared_counter_pool[name];
+    }
+    DCHECK(parent_counter_name == ROOT_COUNTER ||
+           _counter_map.find(parent_counter_name) != _counter_map.end());
+    std::shared_ptr<HighWaterMarkCounter> counter = std::make_shared<HighWaterMarkCounter>(unit);
+    _shared_counter_pool[name] = counter;
+
+    DCHECK(_counter_map.find(name) == _counter_map.end())
+            << "already has a raw counter named " << name;
+
+    // it's OK to insert shared counter to _counter_map, cuz _counter_map is not the owner of counters
+    _counter_map[name] = counter.get();
+    std::set<std::string>* child_counters =
+            find_or_insert(&_child_counter_map, parent_counter_name, std::set<std::string>());
+    child_counters->insert(name);
+    return counter;
+}
+
 RuntimeProfile::Counter* RuntimeProfile::add_counter(const std::string& name, TUnit::type type,
                                                      const std::string& parent_counter_name) {
     boost::lock_guard<boost::mutex> l(_counter_map_lock);
 
+    // TODO(yingchun): Can we ensure that 'name' is not exist in '_counter_map'? Use CHECK instead?
     if (_counter_map.find(name) != _counter_map.end()) {
         // TODO: should we make sure that we don't return existing derived counters?
         return _counter_map[name];
diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h
index ec81559d03..ee126dcc5a 100644
--- a/be/src/util/runtime_profile.h
+++ b/be/src/util/runtime_profile.h
@@ -454,6 +454,11 @@ public:
     HighWaterMarkCounter* AddHighWaterMarkCounter(const std::string& name,
             TUnit::type unit, const std::string& parent_counter_name = "");
 
+    // Only for create MemTracker(using profile's counter to calc consumption)
+    std::shared_ptr<HighWaterMarkCounter> AddSharedHighWaterMarkCounter(
+        const std::string& name, TUnit::type unit,
+        const std::string& parent_counter_name = "");
+
     // stops updating the value of 'rate_counter'. Rate counters are updated
     // periodically so should be removed as soon as the underlying counter is
     // no longer going to change.
@@ -480,6 +485,9 @@ private:
     // object, but occasionally allocated in the constructor.
     std::unique_ptr _pool;
 
+    // Pool for allocated counters. These counters are shared with some other objects.
+    std::map<std::string, std::shared_ptr<HighWaterMarkCounter>> _shared_counter_pool;
+
     // True if we have to delete the _pool on destruction.
     bool _own_pool;
 
diff --git a/be/test/exec/broker_scan_node_test.cpp b/be/test/exec/broker_scan_node_test.cpp
index d9804cf1d2..a06ca82359 100644
--- a/be/test/exec/broker_scan_node_test.cpp
+++ b/be/test/exec/broker_scan_node_test.cpp
@@ -457,9 +457,9 @@ TEST_F(BrokerScanNodeTest, normal) {
     status = scan_node.open(&_runtime_state);
     ASSERT_TRUE(status.ok());
 
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     // Get batch
-    RowBatch batch(scan_node.row_desc(), _runtime_state.batch_size(), &tracker);
+    RowBatch batch(scan_node.row_desc(), _runtime_state.batch_size(), tracker.get());
 
     bool eos = false;
     status = scan_node.get_next(&_runtime_state, &batch, &eos);
diff --git a/be/test/exec/broker_scanner_test.cpp b/be/test/exec/broker_scanner_test.cpp
index 1c59c10d79..790f560af0 100644
--- a/be/test/exec/broker_scanner_test.cpp
+++ b/be/test/exec/broker_scanner_test.cpp
@@ -38,7 +38,7 @@ namespace doris {
 
 class BrokerScannerTest : public testing::Test {
 public:
-    BrokerScannerTest() : _runtime_state(TQueryGlobals()) {
+    BrokerScannerTest() : _tracker(new MemTracker()), _runtime_state(TQueryGlobals()) {
         init();
         _profile = _runtime_state.runtime_profile();
         _runtime_state._instance_mem_tracker.reset(new MemTracker());
@@ -59,7 +59,7 @@ private:
     void init_desc_table();
     void init_params();
 
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     RuntimeState _runtime_state;
     RuntimeProfile* _profile;
     ObjectPool _obj_pool;
@@ -361,7 +361,7 @@ TEST_F(BrokerScannerTest, normal) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemPool tuple_pool(&_tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 1,2,3
@@ -413,8 +413,7 @@ TEST_F(BrokerScannerTest, normal2) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 1,2,3
@@ -460,8 +459,7 @@ TEST_F(BrokerScannerTest, normal3) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 1,2,3
@@ -508,8 +506,7 @@ TEST_F(BrokerScannerTest, normal4) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 1,2,3
@@ -540,8 +537,7 @@ TEST_F(BrokerScannerTest, normal5) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // end of file
@@ -565,8 +561,7 @@ TEST_F(BrokerScannerTest, normal6) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 4,5,6
@@ -597,8 +592,7 @@ TEST_F(BrokerScannerTest, normal7) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // end of file
@@ -622,8 +616,7 @@ TEST_F(BrokerScannerTest, normal8) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // 4,5,6
@@ -654,8 +647,7 @@ TEST_F(BrokerScannerTest, normal9) {
     auto st = scanner.open();
     ASSERT_TRUE(st.ok());
 
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+    MemPool tuple_pool(_tracker.get());
     Tuple* tuple = (Tuple*)tuple_pool.allocate(20);
     bool eof = false;
     // end of file
diff --git a/be/test/exec/csv_scan_node_test.cpp b/be/test/exec/csv_scan_node_test.cpp
index 00ae3a778b..a3d845c3ea 100644
--- a/be/test/exec/csv_scan_node_test.cpp
+++ b/be/test/exec/csv_scan_node_test.cpp
@@ -252,7 +252,8 @@ TEST_F(CsvScanNodeTest, NormalUse) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), new MemTracker(-1));
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());
     bool eos = false;
 
     while (!eos) {
@@ -291,7 +292,8 @@ TEST_F(CsvScanNodeTest, continuousDelim) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), new MemTracker(-1));
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());
     bool eos = false;
 
     while (!eos) {
@@ -330,7 +332,8 @@ TEST_F(CsvScanNodeTest, wrong_decimal_format_test) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), new MemTracker(-1));
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());
     bool eos = false;
 
     while (!eos) {
@@ -358,7 +361,8 @@ TEST_F(CsvScanNodeTest, fill_fix_len_stringi_test) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), new MemTracker(-1));
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());
     bool eos = false;
 
     while (!eos) {
@@ -403,7 +407,8 @@ TEST_F(CsvScanNodeTest, wrong_fix_len_string_format_test) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), new MemTracker(-1));
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), tracker.get());
     bool eos = false;
 
     while (!eos) {
diff --git a/be/test/exec/es_scan_node_test.cpp b/be/test/exec/es_scan_node_test.cpp
index c56848f34c..a38db389af 100644
--- a/be/test/exec/es_scan_node_test.cpp
+++ b/be/test/exec/es_scan_node_test.cpp
@@ -134,7 +134,7 @@ TEST_F(EsScanNodeTest, normal_use) {
 
     status = scan_node.open(&_runtime_state);
     ASSERT_TRUE(status.ok());
-    std::unique_ptr<MemTracker> mem_tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(-1));
     RowBatch row_batch(scan_node._row_descriptor, _runtime_state.batch_size(), mem_tracker.get());
     bool eos = false;
     status = scan_node.get_next(&_runtime_state, &row_batch, &eos);
diff --git a/be/test/exec/hash_table_test.cpp b/be/test/exec/hash_table_test.cpp
index bd45684bc8..c01aca3e5a 100644
--- a/be/test/exec/hash_table_test.cpp
+++ b/be/test/exec/hash_table_test.cpp
@@ -28,7 +28,6 @@
 #include "exprs/expr.h"
 #include "runtime/mem_pool.h"
 #include "runtime/string_value.h"
-#include "runtime/mem_limit.hpp"
 #include "util/cpu_info.h"
 #include "util/runtime_profile.h"
 
@@ -277,12 +276,11 @@ TEST_F(HashTableTest, GrowTableTest) {
     int build_row_val = 0;
     int num_to_add = 4;
     int expected_size = 0;
-    MemTracker mem_limit(1024 * 1024);
-    vector<MemTracker*> mem_limits;
-    mem_limits.push_back(&mem_limit);
+
+    auto mem_tracker = std::make_shared<MemTracker>(1024 * 1024);
     HashTable hash_table(
-        _build_expr, _probe_expr, 1, false, 0, mem_limits, num_to_add);
-    EXPECT_TRUE(!mem_limit.limit_exceeded());
+        _build_expr, _probe_expr, 1, false, 0, mem_tracker, num_to_add);
+    EXPECT_FALSE(mem_tracker->limit_exceeded());
 
     // This inserts about 5M entries
     for (int i = 0; i < 20; ++i) {
@@ -295,7 +293,7 @@ TEST_F(HashTableTest, GrowTableTest) {
         EXPECT_EQ(hash_table.size(), expected_size);
     }
 
-    EXPECT_TRUE(mem_limit.limit_exceeded());
+    EXPECT_TRUE(mem_tracker->limit_exceeded());
 
     // Validate that we can find the entries
     for (int i = 0; i < expected_size * 5; i += 100000) {
@@ -316,11 +314,10 @@ TEST_F(HashTableTest, GrowTableTest2) {
     int build_row_val = 0;
     int num_to_add = 1024;
     int expected_size = 0;
-    MemTracker mem_limit(1024 * 1024);
-    vector<MemTracker*> mem_limits;
-    mem_limits.push_back(&mem_limit);
+
+    auto mem_tracker = std::make_shared<MemTracker>(1024 * 1024);
     HashTable hash_table(
-        _build_expr, _probe_expr, 1, false, 0, mem_limits, num_to_add);
+        _build_expr, _probe_expr, 1, false, 0, mem_tracker, num_to_add);
 
     LOG(INFO) << time(NULL);
 
diff --git a/be/test/exec/orc_scanner_test.cpp b/be/test/exec/orc_scanner_test.cpp
index 5e27d74949..270a87f4ba 100644
--- a/be/test/exec/orc_scanner_test.cpp
+++ b/be/test/exec/orc_scanner_test.cpp
@@ -371,8 +371,9 @@ TEST_F(OrcScannerTest, normal) {
 
     ORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, &_counter);
     ASSERT_TRUE(scanner.open().ok());
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool tuple_pool(tracker.get());
 
     Tuple *tuple = (Tuple *) tuple_pool.allocate(_desc_tbl->get_tuple_descriptor(1)->byte_size());
     bool eof = false;
@@ -476,8 +477,9 @@ TEST_F(OrcScannerTest, normal2) {
 
     ORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, &_counter);
     ASSERT_TRUE(scanner.open().ok());
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool tuple_pool(tracker.get());
 
     Tuple *tuple = (Tuple *) tuple_pool.allocate(_desc_tbl->get_tuple_descriptor(1)->byte_size());
     bool eof = false;
@@ -785,8 +787,9 @@ TEST_F(OrcScannerTest, normal3) {
 
     ORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, &_counter);
     ASSERT_TRUE(scanner.open().ok());
-    MemTracker tracker;
-    MemPool tuple_pool(&tracker);
+
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool tuple_pool(tracker.get());
 
     Tuple *tuple = (Tuple *) tuple_pool.allocate(_desc_tbl->get_tuple_descriptor(1)->byte_size());
     bool eof = false;
diff --git a/be/test/exec/parquet_scanner_test.cpp b/be/test/exec/parquet_scanner_test.cpp
index 64941f8782..bdfe5303bb 100644
--- a/be/test/exec/parquet_scanner_test.cpp
+++ b/be/test/exec/parquet_scanner_test.cpp
@@ -459,9 +459,9 @@ TEST_F(ParquetSannerTest, normal) {
     status = scan_node.open(&_runtime_state);
     ASSERT_TRUE(status.ok());
 
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     // Get batch
-    RowBatch batch(scan_node.row_desc(), _runtime_state.batch_size(), &tracker);
+    RowBatch batch(scan_node.row_desc(), _runtime_state.batch_size(), tracker.get());
     bool eof = false;
     for (int i = 0; i < 14; i++) {
         status = scan_node.get_next(&_runtime_state, &batch, &eof);
diff --git a/be/test/exec/tablet_info_test.cpp b/be/test/exec/tablet_info_test.cpp
index acd29b40a4..0a8c09f15f 100644
--- a/be/test/exec/tablet_info_test.cpp
+++ b/be/test/exec/tablet_info_test.cpp
@@ -143,8 +143,8 @@ TEST_F(OlapTablePartitionParamTest, normal) {
     ASSERT_TRUE(st.ok());
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
     TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0);
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     // 12, 9, "abc"
     {
         Tuple* tuple = (Tuple*)batch.tuple_data_pool()->allocate(tuple_desc->byte_size());
@@ -280,8 +280,8 @@ TEST_F(OlapTablePartitionParamTest, unpartitioned) {
     ASSERT_TRUE(st.ok());
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
     TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0);
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     // 12, 9, "abc"
     {
         Tuple* tuple = (Tuple*)batch.tuple_data_pool()->allocate(tuple_desc->byte_size());
diff --git a/be/test/exec/tablet_sink_test.cpp b/be/test/exec/tablet_sink_test.cpp
index 7b73f69371..0428a18234 100644
--- a/be/test/exec/tablet_sink_test.cpp
+++ b/be/test/exec/tablet_sink_test.cpp
@@ -332,8 +332,8 @@ public:
             k_add_batch_status.to_protobuf(response->mutable_status());
 
             if (request->has_row_batch() && _row_desc != nullptr) {
-                MemTracker tracker;
-                RowBatch batch(*_row_desc, request->row_batch(), &tracker);
+                auto tracker = std::make_shared<MemTracker>();
+                RowBatch batch(*_row_desc, request->row_batch(), tracker.get());
                 for (int i = 0; i < batch.num_rows(); ++i) {
                     LOG(INFO) << batch.get_row(i)->to_string(*_row_desc);
                     _output_set->emplace(batch.get_row(i)->to_string(*_row_desc));
@@ -403,8 +403,8 @@ TEST_F(OlapTableSinkTest, normal) {
     st = sink.open(&state);
     ASSERT_TRUE(st.ok());
     // send
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     // 12, 9, "abc"
     {
         Tuple* tuple = (Tuple*)batch.tuple_data_pool()->allocate(tuple_desc->byte_size());
@@ -536,8 +536,8 @@ TEST_F(OlapTableSinkTest, convert) {
     st = sink.open(&state);
     ASSERT_TRUE(st.ok());
     // send
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     // 12, 9, "abc"
     {
         Tuple* tuple = (Tuple*)batch.tuple_data_pool()->allocate(tuple_desc->byte_size());
@@ -844,8 +844,8 @@ TEST_F(OlapTableSinkTest, add_batch_failed) {
     st = sink.open(&state);
     ASSERT_TRUE(st.ok());
     // send
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     TupleDescriptor* tuple_desc = desc_tbl->get_tuple_descriptor(0);
     // 12, 9, "abc"
     {
@@ -925,8 +925,8 @@ TEST_F(OlapTableSinkTest, decimal) {
     st = sink.open(&state);
     ASSERT_TRUE(st.ok());
     // send
-    MemTracker tracker;
-    RowBatch batch(row_desc, 1024, &tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    RowBatch batch(row_desc, 1024, tracker.get());
     // 12, 12.3
     {
         Tuple* tuple = (Tuple*)batch.tuple_data_pool()->allocate(tuple_desc->byte_size());
diff --git a/be/test/olap/aggregate_func_test.cpp b/be/test/olap/aggregate_func_test.cpp
index 5c9d9b1744..a79d7436a7 100644
--- a/be/test/olap/aggregate_func_test.cpp
+++ b/be/test/olap/aggregate_func_test.cpp
@@ -40,7 +40,7 @@ void test_min() {
     static const size_t kValSize = sizeof(CppType) + 1;  // '1' represent the leading bool flag.
     char buf[64];
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     const AggregateInfo* agg = get_aggregate_info(OLAP_FIELD_AGGREGATION_MIN, field_type);
@@ -116,7 +116,7 @@ void test_max() {
 
     char buf[64];
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     const AggregateInfo* agg = get_aggregate_info(OLAP_FIELD_AGGREGATION_MAX, field_type);
@@ -192,7 +192,7 @@ void test_sum() {
     char buf[64];
     RowCursorCell dst(buf);
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     const AggregateInfo* agg = get_aggregate_info(OLAP_FIELD_AGGREGATION_SUM, field_type);
@@ -267,7 +267,7 @@ void test_replace() {
     char buf[64];
     RowCursorCell dst(buf);
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     const AggregateInfo* agg = get_aggregate_info(OLAP_FIELD_AGGREGATION_REPLACE, field_type);
@@ -326,7 +326,7 @@ void test_replace_string() {
     dst_slice->data = nullptr;
     dst_slice->size = 0;
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     const AggregateInfo* agg = get_aggregate_info(OLAP_FIELD_AGGREGATION_REPLACE, field_type);
diff --git a/be/test/olap/column_reader_test.cpp b/be/test/olap/column_reader_test.cpp
index 8a8ac97856..f902a03e66 100644
--- a/be/test/olap/column_reader_test.cpp
+++ b/be/test/olap/column_reader_test.cpp
@@ -216,7 +216,7 @@ public:
 
     ColumnWriter *_column_writer;
     ColumnReader *_column_reader;
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
     std::unique_ptr _col_vector;
 
diff --git a/be/test/olap/comparison_predicate_test.cpp b/be/test/olap/comparison_predicate_test.cpp
index a2bd35702a..dc7db5ca54 100644
--- a/be/test/olap/comparison_predicate_test.cpp
+++ b/be/test/olap/comparison_predicate_test.cpp
@@ -124,7 +124,7 @@ public: \
         _vectorized_batch = new VectorizedRowBatch(tablet_schema, ids, size); \
         _vectorized_batch->set_size(size); \
     } \
-    std::unique_ptr<MemTracker> _mem_tracker; \
+    std::shared_ptr<MemTracker> _mem_tracker; \
     std::unique_ptr<MemPool> _mem_pool; \
     VectorizedRowBatch* _vectorized_batch; \
 }; \
diff --git a/be/test/olap/delta_writer_test.cpp b/be/test/olap/delta_writer_test.cpp
index a14c74d2f8..b6aa07e9e7 100644
--- a/be/test/olap/delta_writer_test.cpp
+++ b/be/test/olap/delta_writer_test.cpp
@@ -47,7 +47,7 @@ static const uint32_t MAX_RETRY_TIMES = 10;
 static const uint32_t MAX_PATH_LEN = 1024;
 
 StorageEngine* k_engine = nullptr;
-MemTracker* k_mem_tracker = nullptr;
+std::shared_ptr<MemTracker> k_mem_tracker = nullptr;
 
 void set_up() {
     char buffer[MAX_PATH_LEN];
@@ -66,7 +66,7 @@ void set_up() {
     ExecEnv* exec_env = doris::ExecEnv::GetInstance();
     exec_env->set_storage_engine(k_engine);
 
-    k_mem_tracker = new MemTracker(-1, "delta writer test");
+    k_mem_tracker.reset(new MemTracker(-1, "delta writer test"));
 }
 
 void tear_down() {
@@ -74,7 +74,6 @@ void tear_down() {
     k_engine = nullptr;
     system("rm -rf ./data_test");
     FileUtils::remove_all(std::string(getenv("DORIS_HOME")) + UNUSED_PREFIX);
-    delete k_mem_tracker;
 }
 
 void create_tablet_request(int64_t tablet_id, int32_t schema_hash, TCreateTabletReq* request) {
@@ -349,8 +348,8 @@ TEST_F(TestDeltaWriter, write) {
     DeltaWriter::open(&write_req, k_mem_tracker, &delta_writer);
     ASSERT_NE(delta_writer, nullptr);
 
-    MemTracker tracker;
-    MemPool pool(&tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool pool(tracker.get());
     // Tuple 1
     {
         Tuple* tuple = reinterpret_cast<Tuple*>(pool.allocate(tuple_desc->byte_size()));
diff --git a/be/test/olap/in_list_predicate_test.cpp b/be/test/olap/in_list_predicate_test.cpp
index c293d744d9..2e4e2dbe02 100644
--- a/be/test/olap/in_list_predicate_test.cpp
+++ b/be/test/olap/in_list_predicate_test.cpp
@@ -127,7 +127,7 @@ public:
         _vectorized_batch = new VectorizedRowBatch(tablet_schema, ids, size);
         _vectorized_batch->set_size(size);
     }
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
     VectorizedRowBatch* _vectorized_batch;
 };
diff --git a/be/test/olap/key_coder_test.cpp b/be/test/olap/key_coder_test.cpp
index c34287f5d2..45bca2b1b0 100644
--- a/be/test/olap/key_coder_test.cpp
+++ b/be/test/olap/key_coder_test.cpp
@@ -29,11 +29,11 @@ namespace doris {
 
 class KeyCoderTest : public testing::Test {
 public:
-    KeyCoderTest() : _pool(&_tracker) { }
+    KeyCoderTest() : _tracker(new MemTracker()), _pool(_tracker.get()) { }
     virtual ~KeyCoderTest() {
     }
 private:
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
 };
 
diff --git a/be/test/olap/null_predicate_test.cpp b/be/test/olap/null_predicate_test.cpp
index 4be0f8d34e..d3270a9b58 100644
--- a/be/test/olap/null_predicate_test.cpp
+++ b/be/test/olap/null_predicate_test.cpp
@@ -95,7 +95,7 @@ public:
         _vectorized_batch = new VectorizedRowBatch(tablet_schema, ids, size);
         _vectorized_batch->set_size(size);
     }
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
     VectorizedRowBatch* _vectorized_batch;
 };
diff --git a/be/test/olap/row_block_v2_test.cpp b/be/test/olap/row_block_v2_test.cpp
index 7fe0b97e39..8c377b48f2 100644
--- a/be/test/olap/row_block_v2_test.cpp
+++ b/be/test/olap/row_block_v2_test.cpp
@@ -92,8 +92,8 @@ TEST_F(TestRowBlockV2, test_convert) {
     block_info.null_supported = true;
     auto res = output_block.init(block_info);
     ASSERT_EQ(OLAP_SUCCESS, res);
-    MemTracker tracker;
-    MemPool pool(&tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool pool(tracker.get());
     for (int i = 0; i < input_block.capacity(); ++i) {
         RowBlockRow row = input_block.row(i);
 
diff --git a/be/test/olap/row_cursor_test.cpp b/be/test/olap/row_cursor_test.cpp
index 2a39e6d3e5..3f931264c5 100644
--- a/be/test/olap/row_cursor_test.cpp
+++ b/be/test/olap/row_cursor_test.cpp
@@ -259,7 +259,7 @@ public:
 
     virtual void TearDown() {}
 
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
 };
 
@@ -471,7 +471,7 @@ TEST_F(TestRowCursor, AggregateWithoutNull) {
     left.set_field_content(4, reinterpret_cast(&l_decimal), _mem_pool.get());
     left.set_field_content(5, reinterpret_cast(&l_varchar), _mem_pool.get());
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     init_row_with_others(&row, left, mem_pool.get(), &agg_object_pool);
@@ -532,7 +532,7 @@ TEST_F(TestRowCursor, AggregateWithNull) {
     left.set_null(4);
     left.set_field_content(5, reinterpret_cast(&l_varchar), _mem_pool.get());
 
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
     ObjectPool agg_object_pool;
     init_row_with_others(&row, left, mem_pool.get(), &agg_object_pool);
diff --git a/be/test/olap/rowset/alpha_rowset_test.cpp b/be/test/olap/rowset/alpha_rowset_test.cpp
index 96fd9f4852..73132ee07b 100644
--- a/be/test/olap/rowset/alpha_rowset_test.cpp
+++ b/be/test/olap/rowset/alpha_rowset_test.cpp
@@ -159,7 +159,7 @@ public:
 
 private:
     std::unique_ptr _alpha_rowset_writer;
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
 };
 
diff --git a/be/test/olap/rowset/beta_rowset_test.cpp b/be/test/olap/rowset/beta_rowset_test.cpp
index f625512839..5a4fe34bb8 100644
--- a/be/test/olap/rowset/beta_rowset_test.cpp
+++ b/be/test/olap/rowset/beta_rowset_test.cpp
@@ -173,8 +173,8 @@ TEST_F(BetaRowsetTest, BasicFunctionTest) {
         // k2 := k1 * 10
         // k3 := 4096 * i + rid
         for (int i = 0; i < num_segments; ++i) {
-            MemTracker mem_tracker(-1);
-            MemPool mem_pool(&mem_tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool mem_pool(tracker.get());
             for (int rid = 0; rid < rows_per_segment; ++rid) {
                 uint32_t k1 = rid * 10 + i;
                 uint32_t k2 = k1 * 10;
diff --git a/be/test/olap/rowset/rowset_converter_test.cpp b/be/test/olap/rowset/rowset_converter_test.cpp
index 5ef600cd2e..1e06d2e79f 100644
--- a/be/test/olap/rowset/rowset_converter_test.cpp
+++ b/be/test/olap/rowset/rowset_converter_test.cpp
@@ -192,7 +192,7 @@ public:
 
 private:
     std::string _schema_hash_path;
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
 };
 
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index 1313061fa8..826ad2984e 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -80,8 +80,8 @@ public:
         ASSERT_EQ(slices.size(), page_decoder.count());
 
         //check values
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         TypeInfo* type_info = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
         size_t size = slices.size();
         Slice* values = reinterpret_cast<Slice*>(pool.allocate(size * sizeof(Slice)));
@@ -170,8 +170,8 @@ public:
             ASSERT_TRUE(status.ok());
 
             //check values
-            MemTracker tracker;
-            MemPool pool(&tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool pool(tracker.get());
             TypeInfo* type_info = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
             Slice* values = reinterpret_cast<Slice*>(pool.allocate(sizeof(Slice)));
             ColumnBlock column_block(type_info, (uint8_t*)values, nullptr, 1, &pool);
diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
index a43ec35d64..d155e94ed5 100644
--- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
@@ -69,10 +69,8 @@ public:
         ASSERT_TRUE(status.ok());
 
         //test1
-        
-
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         size_t size = 3;
         Slice* values = reinterpret_cast<Slice*>(pool.allocate(size * sizeof(Slice)));
         uint8_t* null_bitmap = reinterpret_cast<uint8_t*>(pool.allocate(BitmapSize(size)));
diff --git a/be/test/olap/rowset/segment_v2/binary_prefix_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_prefix_page_test.cpp
index f7fe9264d0..02dd7bf38f 100644
--- a/be/test/olap/rowset/segment_v2/binary_prefix_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_prefix_page_test.cpp
@@ -72,8 +72,8 @@ class BinaryPrefixPageTest : public testing::Test {
         ASSERT_EQ(slices.size(), page_decoder->count());
 
         //check values
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         TypeInfo* type_info = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
         size_t size = slices.size();
         Slice* values = reinterpret_cast<Slice*>(pool.allocate(size * sizeof(Slice)));
@@ -146,8 +146,8 @@ class BinaryPrefixPageTest : public testing::Test {
         ret = page_decoder->init();
         ASSERT_TRUE(ret.ok());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         TypeInfo* type_info = get_type_info(OLAP_FIELD_TYPE_VARCHAR);
         size_t size = slices.size();
         Slice* values = reinterpret_cast<Slice*>(pool.allocate(size * sizeof(Slice)));
diff --git a/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp b/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp
index 4874af6ffa..ff9e27e3f7 100644
--- a/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp
+++ b/be/test/olap/rowset/segment_v2/bitmap_index_test.cpp
@@ -38,7 +38,7 @@ namespace segment_v2 {
 class BitmapIndexTest : public testing::Test {
 public:
     const std::string kTestDir = "./ut_dir/bitmap_index_test";
-    BitmapIndexTest() : _pool(&_tracker) { }
+    BitmapIndexTest() : _tracker(new MemTracker()), _pool(_tracker.get()) {}
 
     void SetUp() override {
         if (FileUtils::check_exist(kTestDir)) {
@@ -53,7 +53,7 @@ public:
     }
 
 private:
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
 };
 
diff --git a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp
index f90c3df121..1a99644f6f 100644
--- a/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/bitshuffle_page_test.cpp
@@ -36,8 +36,8 @@ public:
 
     template
     void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) {
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         uint8_t null_bitmap = 0;
         ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, 1, &pool);
         ColumnBlockView column_block_view(&block);
@@ -72,8 +72,8 @@ public:
         ASSERT_TRUE(status.ok());
         ASSERT_EQ(0, page_decoder.current_index());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
 
         CppType* values = reinterpret_cast<CppType*>(pool.allocate(size * sizeof(CppType)));
         uint8_t* null_bitmap = reinterpret_cast<uint8_t*>(pool.allocate(BitmapSize(size)));
diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_page_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_page_test.cpp
index 4409882c9e..6909b35197 100644
--- a/be/test/olap/rowset/segment_v2/bloom_filter_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/bloom_filter_page_test.cpp
@@ -62,8 +62,8 @@ public:
         status = bf_page_decoder.seek_to_position_in_page(0);
         ASSERT_TRUE(status.ok());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         Slice* values = reinterpret_cast<Slice*>(pool.allocate(sizeof(Slice)));
         ColumnBlock block(get_type_info(OLAP_FIELD_TYPE_VARCHAR), (uint8_t*)values, nullptr, 2, &pool);
         ColumnBlockView column_block_view(&block);
diff --git a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
index c8f6820cd9..a004ea0a81 100644
--- a/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/column_reader_writer_test.cpp
@@ -42,8 +42,8 @@ static const string TEST_DIR = "./ut_dir/column_reader_writer_test";
 
 class ColumnReaderWriterTest : public testing::Test {
 public:
-    ColumnReaderWriterTest() : _pool(&_tracker) { }
-    virtual ~ColumnReaderWriterTest() { }
+    ColumnReaderWriterTest() : _tracker(new MemTracker()), _pool(_tracker.get()) {}
+    virtual ~ColumnReaderWriterTest() {}
 
 protected:
     void SetUp() override {
@@ -60,7 +60,7 @@ protected:
     }
 
 private:
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     MemPool _pool;
 };
 
@@ -146,8 +146,8 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s
             st = iter->seek_to_first();
             ASSERT_TRUE(st.ok()) << st.to_string();
 
-            MemTracker tracker;
-            MemPool pool(&tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool pool(tracker.get());
             Type vals[1024];
             Type* vals_ = vals;
             uint8_t is_null[1024];
@@ -180,8 +180,8 @@ void test_nullable_data(uint8_t* src_data, uint8_t* src_is_null, int num_rows, s
         }
 
         {
-            MemTracker tracker;
-            MemPool pool(&tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool pool(tracker.get());
             Type vals[1024];
             uint8_t is_null[1024];
             ColumnBlock col(type_info, (uint8_t*)vals, is_null, 1024, &pool);
@@ -235,8 +235,8 @@ void test_read_default_value(string value, void* result) {
             st = iter.seek_to_first();
             ASSERT_TRUE(st.ok()) << st.to_string();
 
-            MemTracker tracker;
-            MemPool pool(&tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool pool(tracker.get());
             Type vals[1024];
             Type* vals_ = vals;
             uint8_t is_null[1024];
@@ -264,8 +264,8 @@ void test_read_default_value(string value, void* result) {
         }
 
         {
-            MemTracker tracker;
-            MemPool pool(&tracker);
+            auto tracker = std::make_shared<MemTracker>();
+            MemPool pool(tracker.get());
             Type vals[1024];
             uint8_t is_null[1024];
             ColumnBlock col(type_info, (uint8_t*)vals, is_null, 1024, &pool);
diff --git a/be/test/olap/rowset/segment_v2/frame_of_reference_page_test.cpp b/be/test/olap/rowset/segment_v2/frame_of_reference_page_test.cpp
index da9729b09a..a40a98fc68 100644
--- a/be/test/olap/rowset/segment_v2/frame_of_reference_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/frame_of_reference_page_test.cpp
@@ -35,8 +35,8 @@ class FrameOfReferencePageTest : public testing::Test {
 public:
     template
     void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) {
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         uint8_t null_bitmap = 0;
         ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, 1, &pool);
         ColumnBlockView column_block_view(&block);
@@ -66,8 +66,8 @@ public:
         ASSERT_EQ(0, for_page_decoder.current_index());
         ASSERT_EQ(size, for_page_decoder.count());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         CppType* values = reinterpret_cast<CppType*>(pool.allocate(size * sizeof(CppType)));
         uint8_t* null_bitmap = reinterpret_cast<uint8_t*>(pool.allocate(BitmapSize(size)));
         ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, size, &pool);
diff --git a/be/test/olap/rowset/segment_v2/plain_page_test.cpp b/be/test/olap/rowset/segment_v2/plain_page_test.cpp
index 33a9501c5c..a755dcbc1d 100644
--- a/be/test/olap/rowset/segment_v2/plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/plain_page_test.cpp
@@ -46,8 +46,8 @@ public:
 
     template
     void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) {
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         uint8_t null_bitmap = 0;
         ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, 1, &pool);
         ColumnBlockView column_block_view(&block);
@@ -84,8 +84,8 @@ public:
         
         ASSERT_EQ(0, page_decoder.current_index());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
 
         CppType* values = reinterpret_cast<CppType*>(pool.allocate(size * sizeof(CppType)));
         uint8_t* null_bitmap = reinterpret_cast<uint8_t*>(pool.allocate(BitmapSize(size)));
diff --git a/be/test/olap/rowset/segment_v2/rle_page_test.cpp b/be/test/olap/rowset/segment_v2/rle_page_test.cpp
index bb7050b658..13122d4187 100644
--- a/be/test/olap/rowset/segment_v2/rle_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/rle_page_test.cpp
@@ -37,8 +37,8 @@ public:
 
     template
     void copy_one(PageDecoderType* decoder, typename TypeTraits::CppType* ret) {
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         uint8_t null_bitmap = 0;
         ColumnBlock block(get_type_info(type), (uint8_t*)ret, &null_bitmap, 1, &pool);
         ColumnBlockView column_block_view(&block);
@@ -74,8 +74,8 @@ public:
         ASSERT_EQ(0, rle_page_decoder.current_index());
         ASSERT_EQ(size, rle_page_decoder.count());
 
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         CppType* values = reinterpret_cast<CppType*>(pool.allocate(size * sizeof(CppType)));
         uint8_t* null_bitmap = reinterpret_cast<uint8_t*>(pool.allocate(BitmapSize(size)));
         ColumnBlock block(get_type_info(Type), (uint8_t*)values, null_bitmap, size, &pool);
diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp
index 86bd258972..8d8fc89996 100644
--- a/be/test/olap/rowset/segment_v2/segment_test.cpp
+++ b/be/test/olap/rowset/segment_v2/segment_test.cpp
@@ -752,8 +752,8 @@ TEST_F(SegmentReaderWriterTest, TestDefaultValueColumn) {
 
 TEST_F(SegmentReaderWriterTest, TestStringDict) {
     size_t num_rows_per_block = 10;
-    MemTracker tracker;
-    MemPool pool(&tracker);
+    auto tracker = std::make_shared<MemTracker>();
+    MemPool pool(tracker.get());
 
     std::shared_ptr<TabletSchema> tablet_schema(new TabletSchema());
     tablet_schema->_num_columns = 4;
diff --git a/be/test/olap/schema_change_test.cpp b/be/test/olap/schema_change_test.cpp
index b8e2e4613e..8367fa8bba 100644
--- a/be/test/olap/schema_change_test.cpp
+++ b/be/test/olap/schema_change_test.cpp
@@ -306,7 +306,7 @@ public:
     ColumnWriter *_column_writer;
 
     ColumnReader *_column_reader;
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr _mem_pool;
     std::unique_ptr _col_vector;
 
diff --git a/be/test/olap/skiplist_test.cpp b/be/test/olap/skiplist_test.cpp
index bfaf73819c..8771f36244 100644
--- a/be/test/olap/skiplist_test.cpp
+++ b/be/test/olap/skiplist_test.cpp
@@ -50,7 +50,7 @@ struct TestComparator {
 class SkipTest : public testing::Test {};
 
 TEST_F(SkipTest, Empty) {
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
 
     TestComparator cmp;
@@ -68,7 +68,7 @@ TEST_F(SkipTest, Empty) {
 }
 
 TEST_F(SkipTest, InsertAndLookup) {
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
 
     const int N = 2000;
@@ -150,7 +150,7 @@ TEST_F(SkipTest, InsertAndLookup) {
 
 // Only non-DUP model will use Find() and InsertWithHint().
 TEST_F(SkipTest, InsertWithHintNoneDupModel) {
-    std::unique_ptr<MemTracker> tracker(new MemTracker(-1));
+    std::shared_ptr<MemTracker> tracker(new MemTracker(-1));
     std::unique_ptr<MemPool> mem_pool(new MemPool(tracker.get()));
 
     const int N = 2000;
@@ -260,7 +260,7 @@ private:
     // Current state of the test
     State _current;
 
-    std::unique_ptr<MemTracker> _mem_tracker;
+    std::shared_ptr<MemTracker> _mem_tracker;
     std::unique_ptr<MemPool> _mem_pool;
 
     // SkipList is not protected by _mu.  We just use a single writer
@@ -268,10 +268,10 @@ private:
     SkipList _list;
 
 public:
-    ConcurrentTest():
-        _mem_tracker(new MemTracker(-1)),
-        _mem_pool(new MemPool(_mem_tracker.get())),
-        _list(TestComparator(), _mem_pool.get(), false) { }
+    ConcurrentTest()
+            : _mem_tracker(new MemTracker(-1)),
+              _mem_pool(new MemPool(_mem_tracker.get())),
+              _list(TestComparator(), _mem_pool.get(), false) {}
 
     // REQUIRES: External synchronization
     void write_step(Random* rnd) {
diff --git a/be/test/olap/storage_types_test.cpp b/be/test/olap/storage_types_test.cpp
index d4865a36a5..c49ae9bf5b 100644
--- a/be/test/olap/storage_types_test.cpp
+++ b/be/test/olap/storage_types_test.cpp
@@ -41,8 +41,8 @@ void common_test(typename TypeTraits::CppType src_val) {
     ASSERT_EQ(sizeof(src_val), type->size());
     {
         typename TypeTraits::CppType dst_val;
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         type->deep_copy((char*)&dst_val, (char*)&src_val, &pool);
         ASSERT_TRUE(type->equal((char*)&src_val, (char*)&dst_val));
         ASSERT_EQ(0, type->cmp((char*)&src_val, (char*)&dst_val));
@@ -82,8 +82,8 @@ void test_char(Slice src_val) {
     {
         char buf[64];
         Slice dst_val(buf, sizeof(buf));
-        MemTracker tracker;
-        MemPool pool(&tracker);
+        auto tracker = std::make_shared<MemTracker>();
+        MemPool pool(tracker.get());
         type->deep_copy((char*)&dst_val, (char*)&src_val, &pool);
         ASSERT_TRUE(type->equal((char*)&src_val, (char*)&dst_val));
         ASSERT_EQ(0, type->cmp((char*)&src_val, (char*)&dst_val));
diff --git a/be/test/runtime/CMakeLists.txt b/be/test/runtime/CMakeLists.txt
index e2574b3129..2580380592 100644
--- a/be/test/runtime/CMakeLists.txt
+++ b/be/test/runtime/CMakeLists.txt
@@ -26,8 +26,8 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/runtime")
 ADD_BE_TEST(mem_pool_test)
 ADD_BE_TEST(free_list_test)
 ADD_BE_TEST(string_buffer_test)
-# ADD_BE_TEST(data_stream_test)
-#ADD_BE_TEST(disk_io_mgr_test)
+#ADD_BE_TEST(data_stream_test)
+ADD_BE_TEST(disk_io_mgr_test)
 #ADD_BE_TEST(parallel_executor_test)
 ADD_BE_TEST(datetime_value_test)
 ADD_BE_TEST(decimal_value_test)
@@ -43,10 +43,9 @@ ADD_BE_TEST(fragment_mgr_test)
 #ADD_BE_TEST(etl_job_mgr_test)
 
 # ADD_BE_TEST(tmp_file_mgr_test)
-# ADD_BE_TEST(disk_io_mgr_test)
-# ADD_BE_TEST(mem_limit_test)
-# ADD_BE_TEST(buffered_block_mgr2_test)
-# ADD_BE_TEST(buffered_tuple_stream2_test)
+ADD_BE_TEST(mem_limit_test)
+ADD_BE_TEST(buffered_block_mgr2_test)
+ADD_BE_TEST(buffered_tuple_stream2_test)
 ADD_BE_TEST(stream_load_pipe_test)
 ADD_BE_TEST(load_channel_mgr_test)
 #ADD_BE_TEST(export_task_mgr_test)
diff --git a/be/test/runtime/buffered_block_mgr2_test.cpp b/be/test/runtime/buffered_block_mgr2_test.cpp
index 5304b8e44d..dc0e8b068f 100644
--- a/be/test/runtime/buffered_block_mgr2_test.cpp
+++ b/be/test/runtime/buffered_block_mgr2_test.cpp
@@ -148,7 +148,7 @@ protected:
     }
 
     BufferedBlockMgr2* CreateMgrAndClient(int64_t query_id, int max_buffers, int block_size,
-                                          int reserved_blocks, MemTracker* tracker,
+                                          int reserved_blocks, const std::shared_ptr<MemTracker>& tracker,
                                           BufferedBlockMgr2::Client** client) {
         RuntimeState* state = NULL;
         BufferedBlockMgr2* mgr = CreateMgr(query_id, max_buffers, block_size, &state);
@@ -158,7 +158,7 @@ protected:
     }
 
     void CreateMgrsAndClients(int64_t start_query_id, int num_mgrs, int buffers_per_mgr,
-                              int block_size, int reserved_blocks_per_client, MemTracker* tracker,
+                              int block_size, int reserved_blocks_per_client, const std::shared_ptr<MemTracker>& tracker,
                               vector* mgrs,
                               vector* clients) {
         for (int i = 0; i < num_mgrs; ++i) {
@@ -269,7 +269,7 @@ protected:
         int max_num_blocks = 5;
         BufferedBlockMgr2* block_mgr = NULL;
         BufferedBlockMgr2::Client* client;
-        block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker.get(),
+        block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker,
                                        &client);
         EXPECT_EQ(_test_env->block_mgr_parent_tracker()->consumption(), 0);
 
@@ -314,7 +314,7 @@ protected:
         int max_num_buffers = 5;
         BufferedBlockMgr2* block_mgr = NULL;
         BufferedBlockMgr2::Client* client = NULL;
-        block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker.get(),
+        block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker,
                                        &client);
 
         // Check counters.
@@ -393,7 +393,7 @@ protected:
         ApiFunction api_function;
 
         BufferedBlockMgr2::Client* client;
-        Status status = block_mgr->register_client(0, _client_tracker.get(), state, &client);
+        Status status = block_mgr->register_client(0, _client_tracker, state, &client);
         EXPECT_TRUE(status.ok());
         EXPECT_TRUE(client != NULL);
 
@@ -566,8 +566,7 @@ protected:
     }
 
     scoped_ptr _test_env;
-    // scoped_ptr _client_tracker;
-    scoped_ptr<MemTracker> _client_tracker;
+    std::shared_ptr<MemTracker> _client_tracker;
     vector _created_tmp_dirs;
 };
 
@@ -584,7 +583,7 @@ TEST_F(BufferedBlockMgrTest, GetNewBlockSmallBlocks) {
     BufferedBlockMgr2* block_mgr;
     BufferedBlockMgr2::Client* client;
     block_mgr =
-            CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker, &client);
     EXPECT_EQ(0, _test_env->block_mgr_parent_tracker()->consumption());
 
     vector blocks;
@@ -645,7 +644,7 @@ TEST_F(BufferedBlockMgrTest, Pin) {
     BufferedBlockMgr2* block_mgr;
     BufferedBlockMgr2::Client* client;
     block_mgr =
-            CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_blocks, block_size, 0, _client_tracker, &client);
 
     vector blocks;
     AllocateBlocks(block_mgr, client, max_num_blocks, &blocks);
@@ -700,7 +699,7 @@ TEST_F(BufferedBlockMgrTest, Deletion) {
     BufferedBlockMgr2* block_mgr;
     BufferedBlockMgr2::Client* client;
     block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker, &client);
 
     // Check counters.
     RuntimeProfile* profile = block_mgr->profile();
@@ -725,7 +724,7 @@ TEST_F(BufferedBlockMgrTest, DeleteSingleBlocks) {
     int max_num_buffers = 16;
     BufferedBlockMgr2::Client* client;
     BufferedBlockMgr2* block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker, &client);
 
     // Pinned I/O block.
     BufferedBlockMgr2::Block* new_block;
@@ -777,7 +776,7 @@ TEST_F(BufferedBlockMgrTest, Close) {
     BufferedBlockMgr2* block_mgr;
     BufferedBlockMgr2::Client* client;
     block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker, &client);
 
     vector blocks;
     AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
@@ -819,7 +818,7 @@ TEST_F(BufferedBlockMgrTest, WriteError) {
     BufferedBlockMgr2* block_mgr;
     BufferedBlockMgr2::Client* client;
     block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, block_size, 0, _client_tracker, &client);
 
     vector blocks;
     AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
@@ -863,7 +862,7 @@ TEST_F(BufferedBlockMgrTest, TmpFileAllocateError) {
     int max_num_buffers = 2;
     BufferedBlockMgr2::Client* client;
     BufferedBlockMgr2* block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker, &client);
 
     vector blocks;
     AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
@@ -897,7 +896,7 @@ TEST_F(BufferedBlockMgrTest, DISABLED_WriteErrorBlacklist) {
     int blocks_per_mgr = MAX_NUM_BLOCKS / NUM_BLOCK_MGRS;
     vector block_mgrs;
     vector clients;
-    CreateMgrsAndClients(0, NUM_BLOCK_MGRS, blocks_per_mgr, _block_size, 0, _client_tracker.get(),
+    CreateMgrsAndClients(0, NUM_BLOCK_MGRS, blocks_per_mgr, _block_size, 0, _client_tracker,
                          &block_mgrs, &clients);
 
     // Allocate files for all 2x2 combinations by unpinning blocks.
@@ -957,7 +956,7 @@ TEST_F(BufferedBlockMgrTest, DISABLED_WriteErrorBlacklist) {
     // A new block manager should only use the good dir for backing storage.
     BufferedBlockMgr2::Client* new_client;
     BufferedBlockMgr2* new_block_mgr = CreateMgrAndClient(9999, blocks_per_mgr, _block_size, 0,
-                                                          _client_tracker.get(), &new_client);
+                                                          _client_tracker, &new_client);
     vector new_mgr_blocks;
     AllocateBlocks(new_block_mgr, new_client, blocks_per_mgr, &new_mgr_blocks);
     UnpinBlocks(new_mgr_blocks);
@@ -980,7 +979,7 @@ TEST_F(BufferedBlockMgrTest, AllocationErrorHandling) {
     // vector runtime_states;
     vector block_mgrs;
     vector clients;
-    CreateMgrsAndClients(0, num_block_mgrs, blocks_per_mgr, _block_size, 0, _client_tracker.get(),
+    CreateMgrsAndClients(0, num_block_mgrs, blocks_per_mgr, _block_size, 0, _client_tracker,
                          &block_mgrs, &clients);
 
     // Allocate files for all 2x2 combinations by unpinning blocks.
@@ -1019,7 +1018,7 @@ TEST_F(BufferedBlockMgrTest, NoDirsAllocationError) {
     int max_num_buffers = 2;
     BufferedBlockMgr2::Client* client;
     BufferedBlockMgr2* block_mgr =
-            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker.get(), &client);
+            CreateMgrAndClient(0, max_num_buffers, _block_size, 0, _client_tracker, &client);
     vector blocks;
     AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
     for (int i = 0; i < tmp_dirs.size(); ++i) {
@@ -1043,11 +1042,11 @@ TEST_F(BufferedBlockMgrTest, MultipleClients) {
 
     BufferedBlockMgr2::Client* client1 = NULL;
     BufferedBlockMgr2::Client* client2 = NULL;
-    status = block_mgr->register_client(client1_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client1_buffers, _client_tracker, runtime_state,
                                         &client1);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client1 != NULL);
-    status = block_mgr->register_client(client2_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client2_buffers, _client_tracker, runtime_state,
                                         &client2);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client2 != NULL);
@@ -1156,11 +1155,11 @@ TEST_F(BufferedBlockMgrTest, MultipleClientsExtraBuffers) {
     BufferedBlockMgr2::Client* client1 = NULL;
     BufferedBlockMgr2::Client* client2 = NULL;
     BufferedBlockMgr2::Block* block = NULL;
-    status = block_mgr->register_client(client1_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client1_buffers, _client_tracker, runtime_state,
                                         &client1);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client1 != NULL);
-    status = block_mgr->register_client(client2_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client2_buffers, _client_tracker, runtime_state,
                                         &client2);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client2 != NULL);
@@ -1205,11 +1204,11 @@ TEST_F(BufferedBlockMgrTest, ClientOversubscription) {
     BufferedBlockMgr2::Client* client1 = NULL;
     BufferedBlockMgr2::Client* client2 = NULL;
     BufferedBlockMgr2::Block* block = NULL;
-    status = block_mgr->register_client(client1_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client1_buffers, _client_tracker, runtime_state,
                                         &client1);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client1 != NULL);
-    status = block_mgr->register_client(client2_buffers, _client_tracker.get(), runtime_state,
+    status = block_mgr->register_client(client2_buffers, _client_tracker, runtime_state,
                                         &client2);
     EXPECT_TRUE(status.ok());
     EXPECT_TRUE(client2 != NULL);
diff --git a/be/test/runtime/buffered_tuple_stream2_test.cpp b/be/test/runtime/buffered_tuple_stream2_test.cpp
index 8630ee1267..251e537624 100644
--- a/be/test/runtime/buffered_tuple_stream2_test.cpp
+++ b/be/test/runtime/buffered_tuple_stream2_test.cpp
@@ -64,14 +64,14 @@ static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
 
 class SimpleTupleStreamTest : public testing::Test {
 public:
-    SimpleTupleStreamTest() : _tracker(-1) {}
+    SimpleTupleStreamTest() : _tracker(new MemTracker(-1)) {}
      // A null dtor to pass codestyle check
     ~SimpleTupleStreamTest() {}
 protected:
     virtual void SetUp() {
         _test_env.reset(new TestEnv());
         create_descriptors();
-        _mem_pool.reset(new MemPool(&_tracker));
+        _mem_pool.reset(new MemPool(_tracker.get()));
     }
 
     virtual void create_descriptors() {
@@ -102,7 +102,7 @@ protected:
     void InitBlockMgr(int64_t limit, int block_size) {
         Status status = _test_env->create_query_state(0, limit, block_size, &_runtime_state);
         ASSERT_TRUE(status.ok());
-        status = _runtime_state->block_mgr2()->register_client(0, &_tracker, _runtime_state,
+        status = _runtime_state->block_mgr2()->register_client(0, _tracker, _runtime_state,
                 &_client);
         ASSERT_TRUE(status.ok());
     }
@@ -120,7 +120,7 @@ protected:
     }
 
     virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
-        RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows, &_tracker));
+        RowBatch* batch = _pool.add(new RowBatch(*_int_desc, num_rows, _tracker.get()));
         int tuple_size = _int_desc->tuple_descriptors()[0]->byte_size();
         uint8_t* tuple_mem = reinterpret_cast<uint8_t*>(
                 batch->tuple_data_pool()->allocate(tuple_size * num_rows));
@@ -149,7 +149,7 @@ protected:
 
     virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
         int tuple_size = sizeof(StringValue) + 1;
-        RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows, &_tracker));
+        RowBatch* batch = _pool.add(new RowBatch(*_string_desc, num_rows, _tracker.get()));
         uint8_t* tuple_mem = batch->tuple_data_pool()->allocate(tuple_size * num_rows);
         memset(tuple_mem, 0, tuple_size * num_rows);
         const int string_tuples = _string_desc->tuple_descriptors().size();
@@ -212,7 +212,7 @@ protected:
     void ReadValues(BufferedTupleStream2* stream, RowDescriptor* desc, vector* results,
             int num_batches = -1) {
         bool eos = false;
-        RowBatch batch(*desc, BATCH_SIZE, &_tracker);
+        RowBatch batch(*desc, BATCH_SIZE, _tracker.get());
         int batches_read = 0;
         do {
             batch.reset();
@@ -357,7 +357,7 @@ protected:
     RuntimeState* _runtime_state;
     BufferedBlockMgr2::Client* _client;
 
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _tracker;
     ObjectPool _pool;
     RowDescriptor* _int_desc;
     RowDescriptor* _string_desc;
@@ -791,7 +791,7 @@ TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
     array_len_index = 0;
     bool eos = false;
     int rows_read = 0;
-    RowBatch batch(*_array_desc, BATCH_SIZE, &_tracker);
+    RowBatch batch(*_array_desc, BATCH_SIZE, _tracker.get());
     do {
         batch.reset();
         ASSERT_TRUE(stream.get_next(&batch, &eos).ok());
diff --git a/be/test/runtime/data_stream_test.cpp b/be/test/runtime/data_stream_test.cpp
index e2558f8238..527d2fde35 100644
--- a/be/test/runtime/data_stream_test.cpp
+++ b/be/test/runtime/data_stream_test.cpp
@@ -123,8 +123,7 @@ private:
 class DataStreamTest : public testing::Test {
 protected:
     DataStreamTest()
-            : _limit(-1),
-              _dummy_mem_limit(-1),
+            : _limit(new MemTracker(-1)),
               _runtime_state(TUniqueId(), TQueryOptions(), "", &_exec_env),
               _next_val(0) {
         _exec_env.init_for_tests();
@@ -210,13 +209,11 @@ protected:
     static const int NUM_BATCHES = TOTAL_DATA_SIZE / BATCH_CAPACITY / PER_ROW_DATA;
 
     ObjectPool _obj_pool;
-    MemTracker _limit;
-    MemTracker _tracker;
+    std::shared_ptr<MemTracker> _limit;
+    std::shared_ptr<MemTracker> _tracker;
     DescriptorTbl* _desc_tbl;
     const RowDescriptor* _row_desc;
     TupleRowComparator* _less_than;
-    MemTracker _dummy_mem_limit;
-    MemTracker _dummy_mem_tracker;
     ExecEnv _exec_env;
     RuntimeState _runtime_state;
     TUniqueId _next_instance_id;
@@ -336,8 +333,8 @@ protected:
         SlotRef* rhs_slot = _obj_pool.add(new SlotRef(expr_node));
         _rhs_slot_ctx = _obj_pool.add(new ExprContext(rhs_slot));
 
-        _lhs_slot_ctx->prepare(&_runtime_state, *_row_desc, &_tracker);
-        _rhs_slot_ctx->prepare(&_runtime_state, *_row_desc, &_tracker);
+        _lhs_slot_ctx->prepare(&_runtime_state, *_row_desc, _tracker.get());
+        _rhs_slot_ctx->prepare(&_runtime_state, *_row_desc, _tracker.get());
         _lhs_slot_ctx->open(NULL);
         _rhs_slot_ctx->open(NULL);
         SortExecExprs* sort_exprs = _obj_pool.add(new SortExecExprs());
@@ -349,7 +346,7 @@ protected:
 
     // Create _batch, but don't fill it with data yet. Assumes we created _row_desc.
     RowBatch* create_row_batch() {
-        RowBatch* batch = new RowBatch(*_row_desc, BATCH_CAPACITY, &_limit);
+        RowBatch* batch = new RowBatch(*_row_desc, BATCH_CAPACITY, _limit.get());
         int64_t* tuple_mem =
                 reinterpret_cast<int64_t*>(batch->tuple_data_pool()->allocate(BATCH_CAPACITY * 8));
         bzero(tuple_mem, BATCH_CAPACITY * 8);
@@ -436,8 +433,7 @@ protected:
         if (info->status.is_cancelled()) {
             return;
         }
-        // RowBatch batch(*_row_desc, 1024, &_tracker);
-        RowBatch batch(*_row_desc, 1024, &_limit);
+        RowBatch batch(*_row_desc, 1024, _limit.get());
         VLOG_QUERY << "start reading merging";
         bool eos = false;
         while (!(info->status = info->stream_recvr->get_next(&batch, &eos)).is_cancelled()) {
diff --git a/be/test/runtime/disk_io_mgr_test.cpp b/be/test/runtime/disk_io_mgr_test.cpp
index e23b18c484..b6f99eff20 100644
--- a/be/test/runtime/disk_io_mgr_test.cpp
+++ b/be/test/runtime/disk_io_mgr_test.cpp
@@ -187,8 +187,7 @@ protected:
 // by reading the data back via a separate IoMgr instance. All writes are expected to
 // complete successfully.
 TEST_F(DiskIoMgrTest, SingleWriter) {
-    // MemTracker mem_tracker(LARGE_MEM_LIMIT);
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     _num_ranges_written = 0;
     string tmp_file = "/tmp/disk_io_mgr_test.txt";
     int num_ranges = 100;
@@ -202,21 +201,20 @@ TEST_F(DiskIoMgrTest, SingleWriter) {
     }
 
     scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
-    // MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
-    MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
-    Status status = read_io_mgr->init(&reader_mem_tracker);
+    std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
+    Status status = read_io_mgr->init(reader_mem_tracker);
     ASSERT_TRUE(status.ok());
     DiskIoMgr::RequestContext* reader;
-    status = read_io_mgr->register_context(&reader, &reader_mem_tracker);
+    status = read_io_mgr->register_context(&reader, reader_mem_tracker);
     ASSERT_TRUE(status.ok());
     for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
         for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
             _pool.reset(new ObjectPool);
             DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
-            status = io_mgr.init(&mem_tracker);
+            status = io_mgr.init(mem_tracker);
             ASSERT_TRUE(status.ok());
             DiskIoMgr::RequestContext* writer;
-            io_mgr.register_context(&writer, &mem_tracker);
+            io_mgr.register_context(&writer, mem_tracker);
             for (int i = 0; i < num_ranges; ++i) {
                 int32_t* data = _pool->add(new int32_t);
                 *data = rand();
@@ -250,11 +248,11 @@ TEST_F(DiskIoMgrTest, SingleWriter) {
 // Perform invalid writes (e.g. non-existent file, negative offset) and validate
 // that an error status is returned via the write callback.
 TEST_F(DiskIoMgrTest, InvalidWrite) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     _num_ranges_written = 0;
     string tmp_file = "/tmp/non-existent.txt";
     DiskIoMgr io_mgr(1, 1, 1, 10);
-    Status status = io_mgr.init(&mem_tracker);
+    Status status = io_mgr.init(mem_tracker);
     ASSERT_TRUE(status.ok());
     DiskIoMgr::RequestContext* writer;
     status = io_mgr.register_context(&writer);
@@ -306,7 +304,7 @@ TEST_F(DiskIoMgrTest, InvalidWrite) {
 // add_write_range() is expected to succeed before the cancel and fail after it.
 // The writes themselves may finish with status cancelled or ok.
 TEST_F(DiskIoMgrTest, SingleWriterCancel) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     _num_ranges_written = 0;
     string tmp_file = "/tmp/disk_io_mgr_test.txt";
     int num_ranges = 100;
@@ -321,19 +319,19 @@ TEST_F(DiskIoMgrTest, SingleWriterCancel) {
     }
 
     scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
-    MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
-    Status status = read_io_mgr->init(&reader_mem_tracker);
+    std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
+    Status status = read_io_mgr->init(reader_mem_tracker);
     ASSERT_TRUE(status.ok());
     DiskIoMgr::RequestContext* reader;
-    status = read_io_mgr->register_context(&reader, &reader_mem_tracker);
+    status = read_io_mgr->register_context(&reader, reader_mem_tracker);
     ASSERT_TRUE(status.ok());
     for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
         for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
             _pool.reset(new ObjectPool);
             DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
-            status = io_mgr.init(&mem_tracker);
+            status = io_mgr.init(mem_tracker);
             DiskIoMgr::RequestContext* writer;
-            io_mgr.register_context(&writer, &mem_tracker);
+            io_mgr.register_context(&writer, mem_tracker);
             Status validate_status = Status::OK();
             for (int i = 0; i < num_ranges; ++i) {
                 if (i == num_ranges_before_cancel) {
@@ -373,7 +371,7 @@ TEST_F(DiskIoMgrTest, SingleWriterCancel) {
 // Basic test with a single reader, testing multiple threads, disks and a different
 // number of buffers.
 TEST_F(DiskIoMgrTest, SingleReader) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "abcdefghijklm";
     int len = strlen(data);
@@ -398,11 +396,11 @@ TEST_F(DiskIoMgrTest, SingleReader) {
                     }
                     DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
 
-                    Status status = io_mgr.init(&mem_tracker);
+                    Status status = io_mgr.init(mem_tracker);
                     ASSERT_TRUE(status.ok());
-                    MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+                    std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
                     DiskIoMgr::RequestContext* reader;
-                    status = io_mgr.register_context(&reader, &reader_mem_tracker);
+                    status = io_mgr.register_context(&reader, reader_mem_tracker);
                     ASSERT_TRUE(status.ok());
 
                     vector<DiskIoMgr::ScanRange*> ranges;
@@ -424,17 +422,17 @@ TEST_F(DiskIoMgrTest, SingleReader) {
 
                     EXPECT_EQ(num_ranges_processed, ranges.size());
                     io_mgr.unregister_context(reader);
-                    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+                    EXPECT_EQ(reader_mem_tracker->consumption(), 0);
                 }
             }
         }
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 // This test issues adding additional scan ranges while there are some still in flight.
 TEST_F(DiskIoMgrTest, AddScanRangeTest) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "abcdefghijklm";
     int len = strlen(data);
@@ -455,11 +453,11 @@ TEST_F(DiskIoMgrTest, AddScanRangeTest) {
                 if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
                 DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
 
-                Status status = io_mgr.init(&mem_tracker);
+                Status status = io_mgr.init(mem_tracker);
                 ASSERT_TRUE(status.ok());
-                MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+                std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
                 DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader, &reader_mem_tracker);
+                status = io_mgr.register_context(&reader, reader_mem_tracker);
                 ASSERT_TRUE(status.ok());
 
                 vector<DiskIoMgr::ScanRange*> ranges_first_half;
@@ -499,18 +497,18 @@ TEST_F(DiskIoMgrTest, AddScanRangeTest) {
                 threads.join_all();
                 EXPECT_EQ(num_ranges_processed, len);
                 io_mgr.unregister_context(reader);
-                EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+                EXPECT_EQ(reader_mem_tracker->consumption(), 0);
             }
         }
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 // Test to make sure that sync reads and async reads work together
 // Note: this test is constructed so the number of buffers is greater than the
 // number of scan ranges.
 TEST_F(DiskIoMgrTest, SyncReadTest) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "abcdefghijklm";
     int len = strlen(data);
@@ -534,11 +532,11 @@ TEST_F(DiskIoMgrTest, SyncReadTest) {
                 DiskIoMgr io_mgr(
                         num_disks, num_threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
 
-                Status status = io_mgr.init(&mem_tracker);
+                Status status = io_mgr.init(mem_tracker);
                 ASSERT_TRUE(status.ok());
-                MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+                std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
                 DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader, &reader_mem_tracker);
+                status = io_mgr.register_context(&reader, reader_mem_tracker);
                 ASSERT_TRUE(status.ok());
 
                 DiskIoMgr::ScanRange* complete_range = init_range(1, tmp_file, 0, strlen(data), 0,
@@ -577,16 +575,16 @@ TEST_F(DiskIoMgrTest, SyncReadTest) {
 
                 EXPECT_EQ(num_ranges_processed, ranges.size());
                 io_mgr.unregister_context(reader);
-                EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+                EXPECT_EQ(reader_mem_tracker->consumption(), 0);
             }
         }
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 // Tests a single reader cancelling half way through scan ranges.
 TEST_F(DiskIoMgrTest, SingleReaderCancel) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "abcdefghijklm";
     int len = strlen(data);
@@ -607,11 +605,11 @@ TEST_F(DiskIoMgrTest, SingleReaderCancel) {
                 if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
                 DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
 
-                Status status = io_mgr.init(&mem_tracker);
+                Status status = io_mgr.init(mem_tracker);
                 ASSERT_TRUE(status.ok());
-                MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+                std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
                 DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader, &reader_mem_tracker);
+                status = io_mgr.register_context(&reader, reader_mem_tracker);
                 ASSERT_TRUE(status.ok());
 
                 vector<DiskIoMgr::ScanRange*> ranges;
@@ -645,11 +643,11 @@ TEST_F(DiskIoMgrTest, SingleReaderCancel) {
                 threads.join_all();
                 EXPECT_TRUE(io_mgr.context_status(reader).is_cancelled());
                 io_mgr.unregister_context(reader);
-                EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+                EXPECT_EQ(reader_mem_tracker->consumption(), 0);
             }
         }
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 // Test when the reader goes over the mem limit
@@ -674,14 +672,14 @@ TEST_F(DiskIoMgrTest, MemTrackers) {
             LOG(ERROR) << "Starting iteration " << iters;
         }
 
-        MemTracker mem_tracker(mem_limit_num_buffers * MAX_BUFFER_SIZE);
+        std::shared_ptr<MemTracker> mem_tracker(new MemTracker(mem_limit_num_buffers * MAX_BUFFER_SIZE));
         DiskIoMgr io_mgr(1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
 
-        Status status = io_mgr.init(&mem_tracker);
+        Status status = io_mgr.init(mem_tracker);
         ASSERT_TRUE(status.ok());
-        MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+        std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
         DiskIoMgr::RequestContext* reader;
-        status = io_mgr.register_context(&reader, &reader_mem_tracker);
+        status = io_mgr.register_context(&reader, reader_mem_tracker);
         ASSERT_TRUE(status.ok());
 
         vector<DiskIoMgr::ScanRange*> ranges;
@@ -727,7 +725,7 @@ TEST_F(DiskIoMgrTest, MemTrackers) {
 
         EXPECT_TRUE(io_mgr.context_status(reader).is_mem_limit_exceeded());
         io_mgr.unregister_context(reader);
-        EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+        EXPECT_EQ(reader_mem_tracker->consumption(), 0);
     }
 }
 #if 0
@@ -736,7 +734,7 @@ TEST_F(DiskIoMgrTest, MemTrackers) {
 // only tests the fallback mechanism.
 // TODO: we can fake the cached read path without HDFS
 TEST_F(DiskIoMgrTest, CachedReads) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "abcdefghijklm";
     int len = strlen(data);
@@ -755,11 +753,11 @@ TEST_F(DiskIoMgrTest, CachedReads) {
         if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
         DiskIoMgr io_mgr(num_disks, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
 
-        Status status = io_mgr.init(&mem_tracker);
+        Status status = io_mgr.init(mem_tracker);
         ASSERT_TRUE(status.ok());
-        MemTracker reader_mem_tracker;
+        std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker());
         DiskIoMgr::RequestContext* reader;
-        status = io_mgr.register_context(&reader, &reader_mem_tracker);
+        status = io_mgr.register_context(&reader, reader_mem_tracker);
         ASSERT_TRUE(status.ok());
 
         DiskIoMgr::ScanRange* complete_range =
@@ -798,14 +796,14 @@ TEST_F(DiskIoMgrTest, CachedReads) {
 
         EXPECT_EQ(num_ranges_processed, ranges.size());
         io_mgr.unregister_context(reader);
-        EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+        EXPECT_EQ(reader_mem_tracker->consumption(), 0);
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 #endif // end #if 0
 
 TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const int ITERATIONS = 1;
     const char* data = "abcdefghijklmnopqrstuvwxyz";
     const int num_contexts = 5;
@@ -832,7 +830,7 @@ TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
         for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
             for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
                 DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-                io_mgr.init(&mem_tracker);
+                io_mgr.init(mem_tracker);
                 for (int file_index = 0; file_index < num_contexts; ++file_index) {
                     status = io_mgr.register_context(&contexts[file_index]);
                     ASSERT_TRUE(status.ok());
@@ -896,7 +894,7 @@ TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
 
 // This test will test multiple concurrent reads each reading a different file.
 TEST_F(DiskIoMgrTest, MultipleReader) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const int NUM_READERS = 5;
     const int DATA_LEN = 50;
     const int ITERATIONS = 25;
@@ -950,7 +948,7 @@ TEST_F(DiskIoMgrTest, MultipleReader) {
                     if (++iters % 2500 == 0) LOG(ERROR) << "Starting iteration " << iters;
 
                     DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-                    Status status = io_mgr.init(&mem_tracker);
+                    Status status = io_mgr.init(mem_tracker);
                     ASSERT_TRUE(status.ok());
 
                     for (int i = 0; i < NUM_READERS; ++i) {
@@ -986,7 +984,7 @@ TEST_F(DiskIoMgrTest, MultipleReader) {
             }
         }
     }
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 #if 0
@@ -1004,12 +1002,12 @@ TEST_F(DiskIoMgrTest, Buffers) {
     // Test default min/max buffer size
     int min_buffer_size = 1024;
     int max_buffer_size = 8 * 1024 * 1024; // 8 MB
-    MemTracker mem_tracker(max_buffer_size * 2);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(max_buffer_size * 2));
 
     DiskIoMgr io_mgr(1, 1, min_buffer_size, max_buffer_size);
-    Status status = io_mgr.init(&mem_tracker);
+    Status status = io_mgr.init(mem_tracker);
     ASSERT_TRUE(status.ok());
-    ASSERT_EQ(mem_tracker.consumption(), 0);
+    ASSERT_EQ(mem_tracker->consumption(), 0);
 
     // buffer length should be rounded up to min buffer size
     int64_t buffer_len = 1;
@@ -1017,7 +1015,7 @@ TEST_F(DiskIoMgrTest, Buffers) {
     EXPECT_EQ(buffer_len, min_buffer_size);
     EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
     io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(mem_tracker.consumption(), min_buffer_size);
+    EXPECT_EQ(mem_tracker->consumption(), min_buffer_size);
 
     // reuse buffer
     buffer_len = min_buffer_size;
@@ -1025,19 +1023,19 @@ TEST_F(DiskIoMgrTest, Buffers) {
     EXPECT_EQ(buffer_len, min_buffer_size);
     EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
     io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(mem_tracker.consumption(), min_buffer_size);
+    EXPECT_EQ(mem_tracker->consumption(), min_buffer_size);
 
     // bump up to next buffer size
     buffer_len = min_buffer_size + 1;
     buf = io_mgr.get_free_buffer(&buffer_len);
     EXPECT_EQ(buffer_len, min_buffer_size * 2);
     EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
-    EXPECT_EQ(mem_tracker.consumption(), min_buffer_size * 3);
+    EXPECT_EQ(mem_tracker->consumption(), min_buffer_size * 3);
 
     // gc unused buffer
     io_mgr.gc_io_buffers();
     EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
-    EXPECT_EQ(mem_tracker.consumption(), min_buffer_size * 2);
+    EXPECT_EQ(mem_tracker->consumption(), min_buffer_size * 2);
 
     io_mgr.return_free_buffer(buf, buffer_len);
 
@@ -1047,17 +1045,17 @@ TEST_F(DiskIoMgrTest, Buffers) {
     EXPECT_EQ(buffer_len, max_buffer_size);
     EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
     io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(mem_tracker.consumption(), min_buffer_size * 2 + max_buffer_size);
+    EXPECT_EQ(mem_tracker->consumption(), min_buffer_size * 2 + max_buffer_size);
 
     // gc buffers
     io_mgr.gc_io_buffers();
     EXPECT_EQ(io_mgr._num_allocated_buffers, 0);
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 // IMPALA-2366: handle partial read where range goes past end of file.
 TEST_F(DiskIoMgrTest, PartialRead) {
-    MemTracker mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
     const char* data = "the quick brown fox jumped over the lazy dog";
     int len = strlen(data);
@@ -1071,11 +1069,11 @@ TEST_F(DiskIoMgrTest, PartialRead) {
     _pool.reset(new ObjectPool);
     scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, read_len, read_len));
 
-    Status status = io_mgr->init(&mem_tracker);
+    Status status = io_mgr->init(mem_tracker);
     ASSERT_TRUE(status.ok());
-    MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+    std::shared_ptr<MemTracker> reader_mem_tracker(new MemTracker(LARGE_MEM_LIMIT));
     DiskIoMgr::RequestContext* reader;
-    status = io_mgr->register_context(&reader, &reader_mem_tracker);
+    status = io_mgr->register_context(&reader, reader_mem_tracker);
     ASSERT_TRUE(status.ok());
 
     // We should not read past the end of file.
@@ -1091,8 +1089,8 @@ TEST_F(DiskIoMgrTest, PartialRead) {
     io_mgr->unregister_context(reader);
     _pool.reset();
     io_mgr.reset();
-    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-    EXPECT_EQ(mem_tracker.consumption(), 0);
+    EXPECT_EQ(reader_mem_tracker->consumption(), 0);
+    EXPECT_EQ(mem_tracker->consumption(), 0);
 }
 
 } // end namespace doris
diff --git a/be/test/runtime/load_channel_mgr_test.cpp b/be/test/runtime/load_channel_mgr_test.cpp
index bda51c49cf..386f44a793 100644
--- a/be/test/runtime/load_channel_mgr_test.cpp
+++ b/be/test/runtime/load_channel_mgr_test.cpp
@@ -45,7 +45,7 @@ OLAPStatus close_status;
 int64_t wait_lock_time_ns;
 
 // mock
-DeltaWriter::DeltaWriter(WriteRequest* req, MemTracker* mem_tracker,
+DeltaWriter::DeltaWriter(WriteRequest* req, std::shared_ptr<MemTracker> mem_tracker,
                          StorageEngine* storage_engine) :
         _req(*req) {
 }
@@ -57,7 +57,7 @@ OLAPStatus DeltaWriter::init() {
     return OLAP_SUCCESS;
 }
 
-OLAPStatus DeltaWriter::open(WriteRequest* req, MemTracker* mem_tracker, DeltaWriter** writer) {
+OLAPStatus DeltaWriter::open(WriteRequest* req, std::shared_ptr<MemTracker> mem_tracker, DeltaWriter** writer) {
     if (open_status != OLAP_SUCCESS) {
         return open_status;
     }
@@ -173,7 +173,7 @@ TEST_F(LoadChannelMgrTest, normal) {
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     auto tuple_desc = desc_tbl->get_tuple_descriptor(0);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -208,7 +208,7 @@ TEST_F(LoadChannelMgrTest, normal) {
         request.add_tablet_ids(21);
         request.add_tablet_ids(20);
 
-        RowBatch row_batch(row_desc, 1024, &tracker);
+        RowBatch row_batch(row_desc, 1024, tracker.get());
 
         // row1
         {
@@ -261,7 +261,7 @@ TEST_F(LoadChannelMgrTest, cancel) {
     DescriptorTbl* desc_tbl = nullptr;
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -304,7 +304,7 @@ TEST_F(LoadChannelMgrTest, open_failed) {
     DescriptorTbl* desc_tbl = nullptr;
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -339,7 +339,7 @@ TEST_F(LoadChannelMgrTest, add_failed) {
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     auto tuple_desc = desc_tbl->get_tuple_descriptor(0);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -374,7 +374,7 @@ TEST_F(LoadChannelMgrTest, add_failed) {
         request.add_tablet_ids(21);
         request.add_tablet_ids(20);
 
-        RowBatch row_batch(row_desc, 1024, &tracker);
+        RowBatch row_batch(row_desc, 1024, tracker.get());
 
         // row1
         {
@@ -426,7 +426,7 @@ TEST_F(LoadChannelMgrTest, close_failed) {
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     auto tuple_desc = desc_tbl->get_tuple_descriptor(0);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -464,7 +464,7 @@ TEST_F(LoadChannelMgrTest, close_failed) {
         request.add_partition_ids(10);
         request.add_partition_ids(11);
 
-        RowBatch row_batch(row_desc, 1024, &tracker);
+        RowBatch row_batch(row_desc, 1024, tracker.get());
 
         // row1
         {
@@ -518,7 +518,7 @@ TEST_F(LoadChannelMgrTest, unknown_tablet) {
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     auto tuple_desc = desc_tbl->get_tuple_descriptor(0);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -553,7 +553,7 @@ TEST_F(LoadChannelMgrTest, unknown_tablet) {
         request.add_tablet_ids(22);
         request.add_tablet_ids(20);
 
-        RowBatch row_batch(row_desc, 1024, &tracker);
+        RowBatch row_batch(row_desc, 1024, tracker.get());
 
         // row1
         {
@@ -604,7 +604,7 @@ TEST_F(LoadChannelMgrTest, duplicate_packet) {
     DescriptorTbl::create(&obj_pool, tdesc_tbl, &desc_tbl);
     auto tuple_desc = desc_tbl->get_tuple_descriptor(0);
     RowDescriptor row_desc(*desc_tbl, {0}, {false});
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>();
     PUniqueId load_id;
     load_id.set_hi(2);
     load_id.set_lo(3);
@@ -639,7 +639,7 @@ TEST_F(LoadChannelMgrTest, duplicate_packet) {
         request.add_tablet_ids(21);
         request.add_tablet_ids(20);
 
-        RowBatch row_batch(row_desc, 1024, &tracker);
+        RowBatch row_batch(row_desc, 1024, tracker.get());
 
         // row1
         {
diff --git a/be/test/runtime/mem_limit_test.cpp b/be/test/runtime/mem_limit_test.cpp
index bd1a9a4b26..0af067e5ea 100644
--- a/be/test/runtime/mem_limit_test.cpp
+++ b/be/test/runtime/mem_limit_test.cpp
@@ -27,27 +27,27 @@ namespace doris {
 TEST(MemTestTest, SingleTrackerNoLimit) {
     MemTracker t(-1);
     EXPECT_FALSE(t.has_limit());
-    t.consume(10);
+    t.Consume(10);
     EXPECT_EQ(t.consumption(), 10);
-    t.consume(10);
+    t.Consume(10);
     EXPECT_EQ(t.consumption(), 20);
-    t.release(15);
+    t.Release(15);
     EXPECT_EQ(t.consumption(), 5);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 }
 
 TEST(MemTestTest, SingleTrackerWithLimit) {
     MemTracker t(11);
     EXPECT_TRUE(t.has_limit());
-    t.consume(10);
+    t.Consume(10);
     EXPECT_EQ(t.consumption(), 10);
-    EXPECT_FALSE(t.limit_exceeded());
-    t.consume(10);
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
+    t.Consume(10);
     EXPECT_EQ(t.consumption(), 20);
-    EXPECT_TRUE(t.limit_exceeded());
-    t.release(15);
+    EXPECT_TRUE(t.LimitExceeded(MemLimit::HARD));
+    t.Release(15);
     EXPECT_EQ(t.consumption(), 5);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 }
 
 #if 0
@@ -63,129 +63,129 @@ TEST(MemTestTest, ConsumptionMetric) {
     EXPECT_TRUE(t.has_limit());
     EXPECT_EQ(t.consumption(), 0);
 
-    // consume()/release() arguments have no effect
-    t.consume(150);
+    // Consume()/Release() arguments have no effect
+    t.Consume(150);
     EXPECT_EQ(t.consumption(), 0);
     EXPECT_EQ(t.peak_consumption(), 0);
-    EXPECT_FALSE(t.limit_exceeded());
-    t.release(5);
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
+    t.Release(5);
     EXPECT_EQ(t.consumption(), 0);
     EXPECT_EQ(t.peak_consumption(), 0);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
     metric.Increment(10);
     // _consumption is only updated with _consumption_metric after calls to
-    // consume()/release() with a non-zero value
-    t.consume(1);
+    // Consume()/Release() with a non-zero value
+    t.Consume(1);
     EXPECT_EQ(t.consumption(), 10);
     EXPECT_EQ(t.peak_consumption(), 10);
     metric.Increment(-5);
-    t.consume(-1);
+    t.Consume(-1);
     EXPECT_EQ(t.consumption(), 5);
     EXPECT_EQ(t.peak_consumption(), 10);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
     metric.Increment(150);
-    t.consume(1);
+    t.Consume(1);
     EXPECT_EQ(t.consumption(), 155);
     EXPECT_EQ(t.peak_consumption(), 155);
-    EXPECT_TRUE(t.limit_exceeded());
+    EXPECT_TRUE(t.LimitExceeded(MemLimit::HARD));
     metric.Increment(-150);
-    t.consume(-1);
+    t.Consume(-1);
     EXPECT_EQ(t.consumption(), 5);
     EXPECT_EQ(t.peak_consumption(), 155);
-    EXPECT_FALSE(t.limit_exceeded());
-    // _consumption is not updated when consume()/release() is called with a zero value
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
+    // _consumption is not updated when Consume()/Release() is called with a zero value
     metric.Increment(10);
-    t.consume(0);
+    t.Consume(0);
     EXPECT_EQ(t.consumption(), 5);
     EXPECT_EQ(t.peak_consumption(), 155);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 }
 #endif // #end #if 0
 
 TEST(MemTestTest, TrackerHierarchy) {
-    MemTracker p(100);
-    MemTracker c1(80, "", &p);
-    MemTracker c2(50, "", &p);
+    auto p = std::make_shared<MemTracker>(100);
+    auto c1= std::make_shared<MemTracker>(80, "", p);
+    auto c2= std::make_shared<MemTracker>(50, "", p);
 
     // everything below limits
-    c1.consume(60);
-    EXPECT_EQ(c1.consumption(), 60);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_FALSE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 0);
-    EXPECT_FALSE(c2.limit_exceeded());
-    EXPECT_FALSE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 60);
-    EXPECT_FALSE(p.limit_exceeded());
-    EXPECT_FALSE(p.any_limit_exceeded());
+    c1->Consume(60);
+    EXPECT_EQ(c1->consumption(), 60);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 0);
+    EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 60);
+    EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD));
 
     // p goes over limit
-    c2.consume(50);
-    EXPECT_EQ(c1.consumption(), 60);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_TRUE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 50);
-    EXPECT_FALSE(c2.limit_exceeded());
-    EXPECT_TRUE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 110);
-    EXPECT_TRUE(p.limit_exceeded());
+    c2->Consume(50);
+    EXPECT_EQ(c1->consumption(), 60);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_TRUE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 50);
+    EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 110);
+    EXPECT_TRUE(p->LimitExceeded(MemLimit::HARD));
 
     // c2 goes over limit, p drops below limit
-    c1.release(20);
-    c2.consume(10);
-    EXPECT_EQ(c1.consumption(), 40);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_FALSE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 60);
-    EXPECT_TRUE(c2.limit_exceeded());
-    EXPECT_TRUE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 100);
-    EXPECT_FALSE(p.limit_exceeded());
+    c1->Release(20);
+    c2->Consume(10);
+    EXPECT_EQ(c1->consumption(), 40);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 60);
+    EXPECT_TRUE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_TRUE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 100);
+    EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD));
 }
 
 TEST(MemTestTest, TrackerHierarchyTryConsume) {
-    MemTracker p(100);
-    MemTracker c1(80, "", &p);
-    MemTracker c2(50, "", &p);
+    auto p = std::make_shared<MemTracker>(100);
+    auto c1= std::make_shared<MemTracker>(80, "", p);
+    auto c2= std::make_shared<MemTracker>(50, "", p);
 
     // everything below limits
-    bool consumption = c1.try_consume(60);
+    bool consumption = c1->TryConsume(60);
     EXPECT_EQ(consumption, true);
-    EXPECT_EQ(c1.consumption(), 60);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_FALSE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 0);
-    EXPECT_FALSE(c2.limit_exceeded());
-    EXPECT_FALSE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 60);
-    EXPECT_FALSE(p.limit_exceeded());
-    EXPECT_FALSE(p.any_limit_exceeded());
+    EXPECT_EQ(c1->consumption(), 60);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 0);
+    EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 60);
+    EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD));
 
     // p goes over limit
-    consumption = c2.try_consume(50);
+    consumption = c2->TryConsume(50);
     EXPECT_EQ(consumption, true);
-    EXPECT_EQ(c1.consumption(), 60);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_FALSE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 0);
-    EXPECT_FALSE(c2.limit_exceeded());
-    EXPECT_FALSE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 60);
-    EXPECT_FALSE(p.limit_exceeded());
-    EXPECT_FALSE(p.any_limit_exceeded());
+    EXPECT_EQ(c1->consumption(), 60);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 0);
+    EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 60);
+    EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(p->AnyLimitExceeded(MemLimit::HARD));
 
     // c2 goes over limit, p drops below limit
-    c1.release(20);
-    c2.consume(10);
-    EXPECT_EQ(c1.consumption(), 40);
-    EXPECT_FALSE(c1.limit_exceeded());
-    EXPECT_FALSE(c1.any_limit_exceeded());
-    EXPECT_EQ(c2.consumption(), 10);
-    EXPECT_FALSE(c2.limit_exceeded());
-    EXPECT_FALSE(c2.any_limit_exceeded());
-    EXPECT_EQ(p.consumption(), 50);
-    EXPECT_FALSE(p.limit_exceeded());
+    c1->Release(20);
+    c2->Consume(10);
+    EXPECT_EQ(c1->consumption(), 40);
+    EXPECT_FALSE(c1->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c1->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(c2->consumption(), 10);
+    EXPECT_FALSE(c2->LimitExceeded(MemLimit::HARD));
+    EXPECT_FALSE(c2->AnyLimitExceeded(MemLimit::HARD));
+    EXPECT_EQ(p->consumption(), 50);
+    EXPECT_FALSE(p->LimitExceeded(MemLimit::HARD));
 }
 
 #if 0
@@ -197,7 +197,7 @@ class GcFunctionHelper {
 
         ~GcFunctionHelper() {}
 
-        void gc_func() { _tracker->release(NUM_RELEASE_BYTES); }
+        void gc_func() { _tracker->Release(NUM_RELEASE_BYTES); }
 
     private:
         MemTracker* _tracker;
@@ -207,35 +207,35 @@ TEST(MemTestTest, GcFunctions) {
     MemTracker t(10);
     ASSERT_TRUE(t.has_limit());
 
-    t.consume(9);
-    EXPECT_FALSE(t.limit_exceeded());
+    t.Consume(9);
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
     // Test TryConsume()
     EXPECT_FALSE(t.TryConsume(2));
     EXPECT_EQ(t.consumption(), 9);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
     // Attach GcFunction that releases 1 byte
     GcFunctionHelper gc_func_helper(&t);
     t.AddGcFunction(boost::bind(&GcFunctionHelper::gc_func, &gc_func_helper));
     EXPECT_TRUE(t.TryConsume(2));
     EXPECT_EQ(t.consumption(), 10);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
     // GcFunction will be called even though TryConsume() fails
     EXPECT_FALSE(t.TryConsume(2));
     EXPECT_EQ(t.consumption(), 9);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
     // GcFunction won't be called
     EXPECT_TRUE(t.TryConsume(1));
     EXPECT_EQ(t.consumption(), 10);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
 
-    // Test limit_exceeded()
-    t.consume(1);
+    // Test LimitExceeded(MemLimit::HARD)
+    t.Consume(1);
     EXPECT_EQ(t.consumption(), 11);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
     EXPECT_EQ(t.consumption(), 10);
 
     // Add more GcFunctions, test that we only call them until the limit is no longer
@@ -244,9 +244,9 @@ TEST(MemTestTest, GcFunctions) {
     t.AddGcFunction(boost::bind(&GcFunctionHelper::gc_func, &gc_func_helper2));
     GcFunctionHelper gc_func_helper3(&t);
     t.AddGcFunction(boost::bind(&GcFunctionHelper::gc_func, &gc_func_helper3));
-    t.consume(1);
+    t.Consume(1);
     EXPECT_EQ(t.consumption(), 11);
-    EXPECT_FALSE(t.limit_exceeded());
+    EXPECT_FALSE(t.LimitExceeded(MemLimit::HARD));
     EXPECT_EQ(t.consumption(), 10);
 }
 #endif // enf #if 0
diff --git a/be/test/runtime/memory_scratch_sink_test.cpp b/be/test/runtime/memory_scratch_sink_test.cpp
index eaf602bfc4..a7b2321a15 100644
--- a/be/test/runtime/memory_scratch_sink_test.cpp
+++ b/be/test/runtime/memory_scratch_sink_test.cpp
@@ -67,7 +67,6 @@ public:
 
     ~MemoryScratchSinkTest() {
         delete _state;
-        delete _mem_tracker;
         delete _exec_env->_result_queue_mgr;
         delete _exec_env->_thread_mgr;
         delete _exec_env->_buffer_reservation;
@@ -102,7 +101,7 @@ private:
     TPlanNode _tnode;
     RowDescriptor* _row_desc = nullptr;
     TMemoryScratchSink _tsink;
-    MemTracker *_mem_tracker = nullptr;
+    std::shared_ptr<MemTracker> _mem_tracker = nullptr;
     DescriptorTbl* _desc_tbl = nullptr;
     std::vector<TExpr> _exprs;
 };
@@ -125,7 +124,7 @@ void MemoryScratchSinkTest::init_runtime_state() {
     query_id.hi = 100;
     _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _exec_env);
     _state->init_instance_mem_tracker();
-    _mem_tracker = new MemTracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker());
+    _mem_tracker = MemTracker::CreateTracker(-1, "MemoryScratchSinkTest", _state->instance_mem_tracker());
     _state->set_desc_tbl(_desc_tbl);
     _state->_load_dir = "./test_run/output/";
     _state->init_mem_trackers(TUniqueId());
diff --git a/be/test/util/arrow/arrow_row_batch_test.cpp b/be/test/util/arrow/arrow_row_batch_test.cpp
index 0c8daa4452..29707eb6c0 100644
--- a/be/test/util/arrow/arrow_row_batch_test.cpp
+++ b/be/test/util/arrow/arrow_row_batch_test.cpp
@@ -19,15 +19,15 @@
 
 #include 
 
-#include 
 #include 
+#include 
 
 #include "common/logging.h"
 
 #define ARROW_UTIL_LOGGING_H
+#include 
 #include 
 #include 
-#include 
 #include 
 
 #include "common/object_pool.h"
@@ -37,8 +37,7 @@
 
 namespace doris {
 
-class ArrowRowBatchTest : public testing::Test {
-};
+class ArrowRowBatchTest : public testing::Test {};
 
 std::string test_str() {
     return R"(
@@ -58,10 +57,9 @@ TEST_F(ArrowRowBatchTest, PrettyPrint) {
     std::shared_ptr<arrow::Buffer> buffer;
     MakeBuffer(test_str(), &buffer);
     arrow::json::ParseOptions parse_opts = arrow::json::ParseOptions::Defaults();
-    parse_opts.explicit_schema = arrow::schema(
-        {
-        arrow::field("c1", arrow::int64()),
-        });
+    parse_opts.explicit_schema = arrow::schema({
+            arrow::field("c1", arrow::int64()),
+    });
 
     std::shared_ptr<arrow::RecordBatch> record_batch;
     auto arrow_st = arrow::json::ParseOne(parse_opts, buffer, &record_batch);
@@ -71,9 +69,9 @@ TEST_F(ArrowRowBatchTest, PrettyPrint) {
     RowDescriptor* row_desc;
     auto doris_st = convert_to_row_desc(&obj_pool, *record_batch->schema(), &row_desc);
     ASSERT_TRUE(doris_st.ok());
-    MemTracker tracker;
+    auto tracker = std::make_shared<MemTracker>(-1, "PrettyPrintTest");
     std::shared_ptr<RowBatch> row_batch;
-    doris_st = convert_to_row_batch(*record_batch, *row_desc, &tracker, &row_batch);
+    doris_st = convert_to_row_batch(*record_batch, *row_desc, tracker, &row_batch);
     ASSERT_TRUE(doris_st.ok());
 
     {
@@ -90,7 +88,7 @@ TEST_F(ArrowRowBatchTest, PrettyPrint) {
     }
 }
 
-}
+} // namespace doris
 
 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/be/test/util/arrow/arrow_row_block_test.cpp b/be/test/util/arrow/arrow_row_block_test.cpp
index 111a477824..6efd5ac57b 100644
--- a/be/test/util/arrow/arrow_row_block_test.cpp
+++ b/be/test/util/arrow/arrow_row_block_test.cpp
@@ -23,24 +23,23 @@
 #include "common/logging.h"
 
 #define ARROW_UTIL_LOGGING_H
+#include 
 #include 
 #include 
-#include 
-#include 
 #include 
+#include 
 #include 
 
-#include "olap/tablet_schema_helper.h"
-#include "olap/schema.h"
 #include "olap/row_block2.h"
+#include "olap/schema.h"
+#include "olap/tablet_schema_helper.h"
 
 namespace doris {
 
 class ArrowRowBlockTest : public testing::Test {
 public:
-    ArrowRowBlockTest() { }
-    virtual ~ArrowRowBlockTest() {
-    }
+    ArrowRowBlockTest() {}
+    virtual ~ArrowRowBlockTest() {}
 };
 
 std::string test_str() {
@@ -61,10 +60,9 @@ TEST_F(ArrowRowBlockTest, Normal) {
     std::shared_ptr<arrow::Buffer> buffer;
     MakeBuffer(test_str(), &buffer);
     arrow::json::ParseOptions parse_opts = arrow::json::ParseOptions::Defaults();
-    parse_opts.explicit_schema = arrow::schema(
-        {
-        arrow::field("c1", arrow::int64()),
-        });
+    parse_opts.explicit_schema = arrow::schema({
+            arrow::field("c1", arrow::int64()),
+    });
 
     std::shared_ptr<arrow::RecordBatch> record_batch;
     auto arrow_st = arrow::json::ParseOne(parse_opts, buffer, &record_batch);
@@ -82,7 +80,6 @@ TEST_F(ArrowRowBlockTest, Normal) {
         std::shared_ptr<arrow::Schema> check_schema;
         doris_st = convert_to_arrow_schema(*schema, &check_schema);
         ASSERT_TRUE(doris_st.ok());
-
         arrow::MemoryPool* pool = arrow::default_memory_pool();
         std::shared_ptr<arrow::RecordBatch> check_batch;
         doris_st = convert_to_arrow_batch(*row_block, check_schema, pool, &check_batch);
@@ -92,10 +89,9 @@ TEST_F(ArrowRowBlockTest, Normal) {
     }
 }
 
-}
+} // namespace doris
 
 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
 }
-
diff --git a/be/test/util/arrow/arrow_work_flow_test.cpp b/be/test/util/arrow/arrow_work_flow_test.cpp
index fb6f814c8a..05806f819a 100644
--- a/be/test/util/arrow/arrow_work_flow_test.cpp
+++ b/be/test/util/arrow/arrow_work_flow_test.cpp
@@ -17,13 +17,13 @@
 
 #include "exec/csv_scan_node.h"
 
-#include 
-#include 
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
+#include 
+#include 
 
 #include "common/logging.h"
 #include "gen_cpp/PlanNodes_types.h"
@@ -33,22 +33,22 @@
 #include "runtime/exec_env.h"
 #include "runtime/mem_tracker.h"
 #include "runtime/result_queue_mgr.h"
-#include "runtime/thread_resource_mgr.h"
 #include "runtime/row_batch.h"
 #include "runtime/runtime_state.h"
+#include "runtime/thread_resource_mgr.h"
 #include "runtime/tuple_row.h"
 #include "util/arrow/row_batch.h"
+#include "util/cpu_info.h"
 #include "util/debug_util.h"
 #include "util/disk_info.h"
-#include "util/cpu_info.h"
 #include "util/logging.h"
 
 namespace doris {
 
 class ArrowWorkFlowTest : public testing::Test {
 public:
-    ArrowWorkFlowTest(){}
-    ~ArrowWorkFlowTest(){}
+    ArrowWorkFlowTest() {}
+    ~ArrowWorkFlowTest() {}
 
 protected:
     virtual void SetUp() {
@@ -66,7 +66,6 @@ protected:
         system("rm -rf ./test_run");
 
         delete _state;
-        delete _mem_tracker;
     }
 
     void init();
@@ -80,7 +79,7 @@ private:
     TPlanNode _tnode;
     ExecEnv* _exec_env = nullptr;
     RuntimeState* _state = nullptr;
-    MemTracker *_mem_tracker = nullptr;
+    std::shared_ptr<MemTracker> _mem_tracker;
 }; // end class ArrowWorkFlowTest
 
 void ArrowWorkFlowTest::init() {
@@ -100,7 +99,7 @@ void ArrowWorkFlowTest::init_runtime_state() {
     query_id.hi = 100;
     _state = new RuntimeState(query_id, query_options, TQueryGlobals(), _exec_env);
     _state->init_instance_mem_tracker();
-    _mem_tracker = new MemTracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker());
+    _mem_tracker = MemTracker::CreateTracker(-1, "ArrowWorkFlowTest", _state->instance_mem_tracker());
     _state->set_desc_tbl(_desc_tbl);
     _state->_load_dir = "./test_run/output/";
     _state->init_mem_trackers(TUniqueId());
@@ -318,7 +317,6 @@ void ArrowWorkFlowTest::init_desc_tbl() {
     _tnode.csv_scan_node.__isset.default_values = true;
     _tnode.csv_scan_node.max_filter_ratio = 0.5;
     _tnode.__isset.csv_scan_node = true;
-
 }
 
 TEST_F(ArrowWorkFlowTest, NormalUse) {
@@ -333,7 +331,7 @@ TEST_F(ArrowWorkFlowTest, NormalUse) {
     status = scan_node.open(_state);
     ASSERT_TRUE(status.ok());
 
-    std::unique_ptr<MemTracker> mem_tracker(new MemTracker(-1));
+    auto mem_tracker = std::make_shared<MemTracker>(-1);
     RowBatch row_batch(scan_node._row_descriptor, _state->batch_size(), mem_tracker.get());
     bool eos = false;
 
@@ -347,7 +345,8 @@ TEST_F(ArrowWorkFlowTest, NormalUse) {
         status = convert_to_arrow_schema(scan_node._row_descriptor, &schema);
         ASSERT_TRUE(status.ok());
         std::shared_ptr<arrow::RecordBatch> record_batch;
-        status = convert_to_arrow_batch(row_batch, schema, arrow::default_memory_pool(), &record_batch);
+        status = convert_to_arrow_batch(row_batch, schema, arrow::default_memory_pool(),
+                                        &record_batch);
         ASSERT_TRUE(status.ok());
         ASSERT_EQ(6, record_batch->num_rows());
         ASSERT_EQ(6, record_batch->num_columns());