[feature] (memory) Switch TLS mem tracker to separate more detailed memory usage (#8605)

In PR #8476, all memory usage of a process is recorded in the process mem tracker,
and all memory usage of a query is recorded in the query mem tracker,
but it is still necessary to manually call `transfer_to` to track cached memory sizes.

We hope to separate out more detailed memory usage by hooking TCMalloc new/delete
and using a TLS (thread-local storage) mem tracker.

In this PR, the more detailed mem trackers are switched in TLS, which automatically
and accurately counts more detailed memory usage than before.
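
As context for the diffs below, a minimal sketch of the mechanism this PR builds on:
TCMalloc's new/delete hooks feed a thread-local tracker, so whatever tracker is
currently switched in on a thread absorbs its allocations automatically. The names
here (ThreadMemTrackerStub, tls_tracker, init_hook_sketch) are hypothetical
stand-ins, not the actual implementation, which lives in runtime/thread_context.h
and runtime/thread_mem_tracker_mgr.h.

#include <gperftools/malloc_hook.h>
#include <gperftools/tcmalloc.h>

#include <atomic>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for ThreadMemTrackerMgr: batches deltas in TLS and
// flushes them to the currently switched-in tracker past a threshold.
struct ThreadMemTrackerStub {
    std::atomic<int64_t>* current = nullptr; // tracker currently switched in
    int64_t untracked = 0;                   // batched, not-yet-flushed delta
    void cache_consume(int64_t bytes) {
        untracked += bytes;
        if (untracked > (1 << 20) || untracked < -(1 << 20)) {
            if (current != nullptr) current->fetch_add(untracked);
            untracked = 0;
        }
    }
};
static thread_local ThreadMemTrackerStub tls_tracker;

static void new_hook(const void* /*ptr*/, size_t size) {
    tls_tracker.cache_consume(static_cast<int64_t>(size));
}
static void delete_hook(const void* ptr) {
    // tc_malloc_size recovers the size TCMalloc actually charged for ptr.
    tls_tracker.cache_consume(
            -static_cast<int64_t>(tc_malloc_size(const_cast<void*>(ptr))));
}

void init_hook_sketch() {
    MallocHook::AddNewHook(&new_hook);
    MallocHook::AddDeleteHook(&delete_hook);
}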
Author: Xinyi Zou
Date: 2022-03-24 14:29:34 +08:00 (committed by GitHub)
Parent: 5f606c9d57
Commit: aaaaae53b5
22 changed files with 202 additions and 86 deletions

View File

@ -23,6 +23,7 @@
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
#include "util/debug_util.h"
#include "util/runtime_profile.h"
@ -52,6 +53,7 @@ Status CrossJoinNode::close(RuntimeState* state) {
Status CrossJoinNode::construct_build_side(RuntimeState* state) {
// Do a full scan of child(1) and store all build row batches.
RETURN_IF_ERROR(child(1)->open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Cross join, while getting next from child 1");
while (true) {
RowBatch* batch =
@ -63,9 +65,6 @@ Status CrossJoinNode::construct_build_side(RuntimeState* state) {
bool eos = false;
RETURN_IF_ERROR(child(1)->get_next(state, batch, &eos));
// to prevent using too much memory
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1.");
SCOPED_TIMER(_build_timer);
_build_batches.add_row_batch(batch);
VLOG_ROW << build_list_debug_string();

View File

@ -21,6 +21,7 @@
#include "exprs/expr.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
namespace doris {
ExceptNode::ExceptNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
@ -40,6 +41,7 @@ Status ExceptNode::init(const TPlanNode& tnode, RuntimeState* state) {
Status ExceptNode::open(RuntimeState* state) {
RETURN_IF_ERROR(SetOperationNode::open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Except Node, while probing the hash table.");
// if a table is empty, the result must be empty
if (_hash_tbl->size() == 0) {
_hash_tbl_iterator = _hash_tbl->begin();
@ -62,7 +64,6 @@ Status ExceptNode::open(RuntimeState* state) {
while (!eos) {
RETURN_IF_CANCELLED(state);
RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, " Except , while probing the hash table.");
for (int j = 0; j < _probe_batch->num_rows(); ++j) {
_hash_tbl_iterator = _hash_tbl->find(_probe_batch->get_row(j));
if (_hash_tbl_iterator != _hash_tbl->end()) {

View File

@ -57,6 +57,7 @@
#include "runtime/mem_tracker.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
#include "util/debug_util.h"
#include "util/runtime_profile.h"
#include "vec/core/block.h"
@ -208,6 +209,7 @@ Status ExecNode::prepare(RuntimeState* state) {
_mem_tracker = MemTracker::create_tracker(-1, "ExecNode:" + _runtime_profile->name(),
state->instance_mem_tracker(),
MemTrackerLevel::VERBOSE, _runtime_profile.get());
SCOPED_SWITCH_TASK_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
_expr_mem_tracker = MemTracker::create_tracker(-1, "ExecNode:Exprs:" + _runtime_profile->name(),
_mem_tracker);
@ -226,6 +228,7 @@ Status ExecNode::prepare(RuntimeState* state) {
}
Status ExecNode::open(RuntimeState* state) {
SCOPED_SWITCH_TASK_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN));
if (_vconjunct_ctx_ptr) {
RETURN_IF_ERROR((*_vconjunct_ctx_ptr)->open(state));

View File

@ -186,6 +186,7 @@ Status HashJoinNode::construct_hash_table(RuntimeState* state) {
// The hash join node needs to keep in memory all build tuples, including the tuple
// row ptrs. The row ptrs are copied into the hash table's internal structure so they
// don't need to be stored in the _build_pool.
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Hash join, while constructing the hash table.");
RowBatch build_batch(child(1)->row_desc(), state->batch_size());
RETURN_IF_ERROR(child(1)->open(state));
@ -303,7 +304,7 @@ Status HashJoinNode::get_next(RuntimeState* state, RowBatch* out_batch, bool* eo
// In most cases, no additional memory is allocated at this stage, but if the
// expression evaluation in this node needs to allocate additional memory,
// it may cause the memory limit to be exceeded.
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while execute get_next.");
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Hash join, while execute get_next.");
SCOPED_TIMER(_runtime_profile->total_time_counter());
if (reached_limit()) {
@ -771,11 +772,9 @@ Status HashJoinNode::process_build_batch(RuntimeState* state, RowBatch* build_ba
_build_pool.get(), false);
}
}
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table.");
} else {
// take ownership of tuple data of build_batch
_build_pool->acquire_data(build_batch->tuple_data_pool(), false);
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table.");
RETURN_IF_ERROR(_hash_tbl->resize_buckets_ahead(build_batch->num_rows()));
for (int i = 0; i < build_batch->num_rows(); ++i) {
_hash_tbl->insert_without_check(build_batch->get_row(i));

View File

@ -21,6 +21,7 @@
#include "exprs/expr.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
namespace doris {
IntersectNode::IntersectNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
@ -44,6 +45,7 @@ Status IntersectNode::init(const TPlanNode& tnode, RuntimeState* state) {
// repeat [2] this for all the rest child
Status IntersectNode::open(RuntimeState* state) {
RETURN_IF_ERROR(SetOperationNode::open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Intersect Node, while probing the hash table.");
// if a table is empty, the result must be empty
if (_hash_tbl->size() == 0) {
_hash_tbl_iterator = _hash_tbl->begin();
@ -66,7 +68,6 @@ Status IntersectNode::open(RuntimeState* state) {
while (!eos) {
RETURN_IF_CANCELLED(state);
RETURN_IF_ERROR(child(i)->get_next(state, _probe_batch.get(), &eos));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, " Intersect , while probing the hash table.");
for (int j = 0; j < _probe_batch->num_rows(); ++j) {
_hash_tbl_iterator = _hash_tbl->find(_probe_batch->get_row(j));
if (_hash_tbl_iterator != _hash_tbl->end()) {

View File

@ -23,6 +23,7 @@
#include "runtime/raw_value.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
namespace doris {
SetOperationNode::SetOperationNode(ObjectPool* pool, const TPlanNode& tnode,
@ -137,6 +138,7 @@ bool SetOperationNode::equals(TupleRow* row, TupleRow* other) {
Status SetOperationNode::open(RuntimeState* state) {
RETURN_IF_ERROR(ExecNode::open(state));
RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("SetOperation, while constructing the hash table.");
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_CANCELLED(state);
// open result expr lists.
@ -156,7 +158,6 @@ Status SetOperationNode::open(RuntimeState* state) {
RETURN_IF_ERROR(child(0)->get_next(state, &build_batch, &eos));
// take ownership of tuple data of build_batch
_build_pool->acquire_data(build_batch.tuple_data_pool(), false);
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, " SetOperation, while constructing the hash table.");
// build hash table and remove duplicate items
RETURN_IF_ERROR(_hash_tbl->resize_buckets_ahead(build_batch.num_rows()));
for (int i = 0; i < build_batch.num_rows(); ++i) {

View File

@ -364,13 +364,7 @@ void LRUCache::erase(const CacheKey& key, uint32_t hash, MemTracker* tracker) {
}
// Free the handle outside the mutex; when last_ref is true, e must not be nullptr
if (last_ref) {
size_t charge = e->charge;
e->free();
// The parameter tracker is ShardedLRUCache::_mem_tracker.
// Because the memory released by LRUHandle is recorded in the tls mem tracker,
// this part of the memory is subsidized from ShardedLRUCache::_mem_tracker to the tls mem tracker.
tracker->transfer_to(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker().get(),
charge);
}
}
@ -449,11 +443,15 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity,
: _name(name),
_last_id(1),
_mem_tracker(MemTracker::create_tracker(-1, name, nullptr, MemTrackerLevel::OVERVIEW)) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
const size_t per_shard = (total_capacity + (kNumShards - 1)) / kNumShards;
for (int s = 0; s < kNumShards; s++) {
_shards[s] = new LRUCache(type);
_shards[s]->set_capacity(per_shard);
}
// After the lru cache is created in the main thread, the main thread will not switch to the
// lru cache mem tracker again, so manually clear the untracked mem in tls.
thread_local_ctx.get()->_thread_mem_tracker_mgr->clear_untracked_mems();
_entity = DorisMetrics::instance()->metric_registry()->register_entity(
std::string("lru_cache:") + name, {{"name", name}});
@ -467,6 +465,7 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity,
}
ShardedLRUCache::~ShardedLRUCache() {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
for (int s = 0; s < kNumShards; s++) {
delete _shards[s];
}
@ -481,6 +480,7 @@ Cache::Handle* ShardedLRUCache::insert(const CacheKey& key, void* value, size_t
// transfer the memory ownership of the value to ShardedLRUCache::_mem_tracker.
thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->transfer_to(_mem_tracker.get(),
charge);
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
const uint32_t hash = _hash_slice(key);
return _shards[_shard(hash)]->insert(key, hash, value, charge, deleter, priority);
}
@ -491,11 +491,13 @@ Cache::Handle* ShardedLRUCache::lookup(const CacheKey& key) {
}
void ShardedLRUCache::release(Handle* handle) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
_shards[_shard(h->hash)]->release(handle);
}
void ShardedLRUCache::erase(const CacheKey& key) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
const uint32_t hash = _hash_slice(key);
_shards[_shard(hash)]->erase(key, hash, _mem_tracker.get());
}
@ -514,6 +516,7 @@ uint64_t ShardedLRUCache::new_id() {
}
int64_t ShardedLRUCache::prune() {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
int64_t num_prune = 0;
for (int s = 0; s < kNumShards; s++) {
num_prune += _shards[s]->prune();
@ -522,6 +525,7 @@ int64_t ShardedLRUCache::prune() {
}
int64_t ShardedLRUCache::prune_if(CacheValuePredicate pred) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
int64_t num_prune = 0;
for (int s = 0; s < kNumShards; s++) {
num_prune += _shards[s]->prune_if(pred);
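
The insert()/erase() changes above hinge on transfer_to: the cached value was
allocated while the caller's tracker was switched in, so its bytes must move to
the cache's tracker when the cache takes ownership, and move back when they are
released under a different tracker. A hedged sketch of those semantics with a
hypothetical TrackerStub (the real MemTracker::transfer_to has the same net effect):

#include <atomic>
#include <cstdint>

// Hypothetical minimal tracker illustrating transfer_to semantics.
struct TrackerStub {
    std::atomic<int64_t> consumption{0};
    void consume(int64_t bytes) { consumption.fetch_add(bytes); }
    void release(int64_t bytes) { consumption.fetch_sub(bytes); }
    // Move `bytes` of charge from this tracker to dst; the total tracked
    // across both trackers is unchanged, only the attribution moves.
    void transfer_to(TrackerStub* dst, int64_t bytes) {
        release(bytes);
        dst->consume(bytes);
    }
};

With the scoped switch in place, the cache's own internal allocations land on
_mem_tracker directly, so only the externally allocated value still needs an
explicit transfer.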

View File

@ -192,11 +192,6 @@ OLAPStatus TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id,
tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
tablet_map[tablet_id] = tablet;
_add_tablet_to_partition(tablet);
// TODO: remove the multiply-by-2 of the tablet meta mem size.
// Because the table schema is copied into the tablet, the memory cost is doubled,
// so multiply by 2 here.
thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker()->transfer_to(
_mem_tracker.get(), tablet->tablet_meta()->mem_size() * 2);
VLOG_NOTICE << "add tablet to map successfully." << " tablet_id=" << tablet_id ;
@ -215,6 +210,7 @@ bool TabletManager::_check_tablet_id_exist_unlocked(TTabletId tablet_id) {
OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
std::vector<DataDir*> stores) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
DorisMetrics::instance()->create_tablet_requests_total->increment(1);
int64_t tablet_id = request.tablet_id;
@ -432,6 +428,7 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked(
OLAPStatus TabletManager::drop_tablet(TTabletId tablet_id, SchemaHash schema_hash,
bool keep_files) {
WriteLock wrlock(_get_tablets_shard_lock(tablet_id));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
return _drop_tablet_unlocked(tablet_id, keep_files);
}
@ -460,6 +457,7 @@ OLAPStatus TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, bool keep_f
OLAPStatus TabletManager::drop_tablets_on_error_root_path(
const std::vector<TabletInfo>& tablet_info_vec) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
OLAPStatus res = OLAP_SUCCESS;
if (tablet_info_vec.empty()) { // This is a high probability event
return res;
@ -670,6 +668,7 @@ OLAPStatus TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tab
TSchemaHash schema_hash, const string& meta_binary,
bool update_meta, bool force, bool restore,
bool check_path) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
TabletMetaSharedPtr tablet_meta(new TabletMeta());
OLAPStatus status = tablet_meta->deserialize(meta_binary);
if (status != OLAP_SUCCESS) {
@ -752,6 +751,7 @@ OLAPStatus TabletManager::load_tablet_from_dir(DataDir* store, TTabletId tablet_
SchemaHash schema_hash,
const string& schema_hash_path, bool force,
bool restore) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
LOG(INFO) << "begin to load tablet from dir. "
<< " tablet_id=" << tablet_id << " schema_hash=" << schema_hash
<< " path = " << schema_hash_path << " force = " << force << " restore = " << restore;
@ -1219,11 +1219,6 @@ OLAPStatus TabletManager::_drop_tablet_directly_unlocked(TTabletId tablet_id, bo
}
dropped_tablet->deregister_tablet_from_dir();
// The dropped tablet meta is expected to be released in the TabletManager mem tracker,
// but is actually released in the tls mem tracker,
// so compensate memory from the TabletManager mem tracker to the tls tracker.
_mem_tracker->transfer_to(thread_local_ctx.get()->_thread_mem_tracker_mgr->mem_tracker().get(),
dropped_tablet->tablet_meta()->mem_size() * 2);
return OLAP_SUCCESS;
}

View File

@ -26,6 +26,7 @@
#include "util/cpu_info.h"
#include "util/pretty_printer.h"
#include "util/runtime_profile.h"
#include "runtime/thread_context.h"
//DECLARE_bool(disable_mem_pools);
@ -48,7 +49,7 @@ public:
/// Add a free buffer to the free lists. May free buffers to the system allocator
/// if the list becomes full. Caller should not hold 'lock_'
void AddFreeBuffer(BufferHandle&& buffer);
bool AddFreeBuffer(BufferHandle&& buffer);
/// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns true and
/// sets 'buffer' if found or false if not found. Caller should not hold 'lock_'.
@ -193,7 +194,8 @@ BufferPool::BufferAllocator::BufferAllocator(BufferPool* pool, int64_t min_buffe
clean_page_bytes_limit_(clean_page_bytes_limit),
clean_page_bytes_remaining_(clean_page_bytes_limit),
per_core_arenas_(CpuInfo::get_max_num_cores()),
max_scavenge_attempts_(MAX_SCAVENGE_ATTEMPTS) {
max_scavenge_attempts_(MAX_SCAVENGE_ATTEMPTS),
_mem_tracker(MemTracker::create_virtual_tracker(-1, "BufferAllocator", nullptr, MemTrackerLevel::OVERVIEW)) {
DCHECK(BitUtil::IsPowerOf2(min_buffer_len_)) << min_buffer_len_;
DCHECK(BitUtil::IsPowerOf2(max_buffer_len_)) << max_buffer_len_;
DCHECK_LE(0, min_buffer_len_);
@ -303,6 +305,7 @@ Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle*
system_bytes_remaining_.add(len);
return status;
}
_mem_tracker->consume_cache(len);
return Status::OK();
}
@ -375,7 +378,9 @@ void BufferPool::BufferAllocator::Free(BufferHandle&& handle) {
handle.client_ = nullptr; // Buffer is no longer associated with a client.
FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get();
handle.Poison();
arena->AddFreeBuffer(std::move(handle));
const int64_t len = handle.len(); // capture before the handle is moved away
if (!arena->AddFreeBuffer(std::move(handle))) {
_mem_tracker->release_cache(len);
}
}
void BufferPool::BufferAllocator::AddCleanPage(const std::unique_lock<std::mutex>& client_lock,
@ -426,6 +431,7 @@ int64_t BufferPool::BufferAllocator::FreeToSystem(std::vector<BufferHandle>&& bu
buffer.Unpoison();
system_allocator_->Free(std::move(buffer));
}
_mem_tracker->release_cache(bytes_freed);
return bytes_freed;
}
@ -485,16 +491,17 @@ BufferPool::FreeBufferArena::~FreeBufferArena() {
}
}
void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
bool BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
std::lock_guard<SpinLock> al(lock_);
if (config::disable_mem_pools) {
int64_t len = buffer.len();
parent_->system_allocator_->Free(std::move(buffer));
parent_->system_bytes_remaining_.add(len);
return;
return false;
}
PerSizeLists* lists = GetListsForSize(buffer.len());
lists->AddFreeBuffer(std::move(buffer));
return true;
}
bool BufferPool::FreeBufferArena::RemoveCleanPage(bool claim_buffer, Page* page) {
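
A note on the accounting invariant the BufferAllocator changes above establish:
the virtual tracker should equal bytes obtained from the system allocator and not
yet returned, so it is consumed in AllocateInternal, released in FreeToSystem, and
released in Free only when the arena declines to keep the buffer. A self-contained
sketch with hypothetical stand-in types (BufHandle, Arena, CacheTracker):

#include <cstdint>
#include <utility>

struct BufHandle {
    int64_t length = 0;
    int64_t len() const { return length; }
};
struct Arena {
    bool keep = true;
    bool AddFreeBuffer(BufHandle&&) { return keep; } // false: freed to system
};
struct CacheTracker {
    int64_t bytes = 0;
    void consume_cache(int64_t n) { bytes += n; }
    void release_cache(int64_t n) { bytes -= n; }
};

void free_sketch(BufHandle&& h, Arena* arena, CacheTracker* tracker) {
    const int64_t len = h.len(); // capture before the move below
    if (!arena->AddFreeBuffer(std::move(h))) {
        tracker->release_cache(len); // bytes actually left the allocator
    }
}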

View File

@ -21,6 +21,7 @@
#include "runtime/bufferpool/buffer_pool_internal.h"
#include "runtime/bufferpool/free_list.h"
#include "util/aligned_new.h"
#include "runtime/mem_tracker.h"
namespace doris {
@ -235,6 +236,8 @@ private:
/// all arenas so may fail. The final attempt locks all arenas, which is expensive
/// but is guaranteed to succeed.
int max_scavenge_attempts_;
std::shared_ptr<MemTracker> _mem_tracker;
};
} // namespace doris

View File

@ -99,6 +99,7 @@ public:
// Poison this chunk to make asan can detect invalid access
ASAN_POISON_MEMORY_REGION(ptr, size);
std::lock_guard<SpinLock> l(_lock);
// TODO(zxy) The memory of vector resize is not recorded in chunk allocator mem tracker
_chunk_lists[idx].push_back(ptr);
}
@ -118,9 +119,13 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit)
_arenas(CpuInfo::get_max_num_cores()) {
_mem_tracker =
MemTracker::create_tracker(-1, "ChunkAllocator", nullptr, MemTrackerLevel::OVERVIEW);
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_mem_tracker);
for (int i = 0; i < _arenas.size(); ++i) {
_arenas[i].reset(new ChunkArena());
}
// After the ChunkAllocator is created in the main thread, the main thread will not switch to the
// chunk allocator mem tracker again, so manually clear the untracked mem in tls.
thread_local_ctx.get()->_thread_mem_tracker_mgr->clear_untracked_mems();
_chunk_allocator_metric_entity =
DorisMetrics::instance()->metric_registry()->register_entity("chunk_allocator");
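
The clear_untracked_mems() call above (and the matching one in the ShardedLRUCache
constructor) exists because consumption is batched in TLS and only flushed past a
threshold or on the next switch; a constructor that runs once on the main thread
would otherwise leave its tail of allocations uncounted. A self-contained sketch of
the flush, with hypothetical names (TlsBatch, totals):

#include <cstdint>
#include <string>
#include <unordered_map>

struct TlsBatch {
    // Per-tracker-id deltas not yet applied, mirroring _untracked_mems.
    std::unordered_map<std::string, int64_t> untracked_mems;
    std::unordered_map<std::string, int64_t> totals; // stand-in for trackers
    void clear_untracked_mems() {
        for (auto& kv : untracked_mems) {
            if (kv.second != 0) totals[kv.first] += kv.second; // flush delta
        }
        untracked_mems.clear();
    }
};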

View File

@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <gperftools/malloc_hook.h>
#include <gperftools/nallocx.h>
#include <gperftools/tcmalloc.h>

View File

@ -25,14 +25,29 @@
#include "runtime/runtime_state.h"
#include "runtime/thread_mem_tracker_mgr.h"
#include "runtime/threadlocal.h"
#include "util/doris_metrics.h"
// Attach to task when thread starts
#define SCOPED_ATTACH_TASK_THREAD(type, ...) \
auto VARNAME_LINENUM(attach_task_thread) = AttachTaskThread(type, ##__VA_ARGS__)
// Be careful when stopping the thread mem tracker: the actual order of malloc and free
// may differ from the instruction execution order, which can cause memory to be
// tracked in an unexpected place.
#define SCOPED_STOP_THREAD_LOCAL_MEM_TRACKER() \
auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(true)
#define GLOBAL_STOP_THREAD_LOCAL_MEM_TRACKER() \
auto VARNAME_LINENUM(stop_tracker) = StopThreadMemTracker(false)
// Switch the thread mem tracker during task execution.
// After a non-query thread switches the mem tracker, if it will not switch again in the
// short term, consider manually calling clear_untracked_mems.
// A query thread automatically calls clear_untracked_mems when it detaches the task.
#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(mem_tracker) \
auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker, false)
#define SCOPED_SWITCH_TASK_THREAD_LOCAL_MEM_TRACKER(mem_tracker) \
auto VARNAME_LINENUM(switch_tracker) = SwitchThreadMemTracker(mem_tracker, true);
#define SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB(action_type, ...) \
auto VARNAME_LINENUM(switch_tracker_cb) = \
SwitchThreadMemTrackerErrCallBack(action_type, ##__VA_ARGS__)
namespace doris {
@ -72,7 +87,7 @@ public:
_type = type;
_task_id = task_id;
_fragment_instance_id = fragment_instance_id;
_thread_mem_tracker_mgr->attach_task(task_type_string(_type), task_id, fragment_instance_id,
_thread_mem_tracker_mgr->attach_task(TaskTypeStr[_type], task_id, fragment_instance_id,
mem_tracker);
}
@ -88,10 +103,6 @@ public:
const std::string& thread_id_str() const { return _thread_id_str; }
const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; }
inline static const std::string task_type_string(ThreadContext::TaskType type) {
return TaskTypeStr[type];
}
void consume_mem(int64_t size) {
if (start_thread_mem_tracker) {
_thread_mem_tracker_mgr->cache_consume(size);
@ -166,13 +177,13 @@ public:
explicit AttachTaskThread(const ThreadContext::TaskType& type,
const std::shared_ptr<MemTracker>& mem_tracker) {
DCHECK(mem_tracker != nullptr);
DCHECK(mem_tracker);
thread_local_ctx.get()->attach(type, "", TUniqueId(), mem_tracker);
}
explicit AttachTaskThread(const TQueryType::type& query_type,
const std::shared_ptr<MemTracker>& mem_tracker) {
DCHECK(mem_tracker != nullptr);
DCHECK(mem_tracker);
thread_local_ctx.get()->attach(query_to_task_type(query_type), "", TUniqueId(),
mem_tracker);
}
@ -182,7 +193,7 @@ public:
const std::shared_ptr<MemTracker>& mem_tracker) {
DCHECK(task_id != "");
DCHECK(fragment_instance_id != TUniqueId());
DCHECK(mem_tracker != nullptr);
DCHECK(mem_tracker);
thread_local_ctx.get()->attach(query_to_task_type(query_type), task_id,
fragment_instance_id, mem_tracker);
}
@ -192,7 +203,7 @@ public:
#ifndef BE_TEST
DCHECK(print_id(runtime_state->query_id()) != "");
DCHECK(runtime_state->fragment_instance_id() != TUniqueId());
DCHECK(mem_tracker != nullptr);
DCHECK(mem_tracker);
thread_local_ctx.get()->attach(query_to_task_type(runtime_state->query_type()),
print_id(runtime_state->query_id()),
runtime_state->fragment_instance_id(), mem_tracker);
@ -211,7 +222,12 @@ public:
}
}
~AttachTaskThread() { thread_local_ctx.get()->detach(); }
~AttachTaskThread() {
#ifndef BE_TEST
thread_local_ctx.get()->detach();
DorisMetrics::instance()->attach_task_thread_count->increment(1);
#endif
}
};
class StopThreadMemTracker {
@ -228,4 +244,49 @@ private:
bool _scope;
};
class SwitchThreadMemTracker {
public:
explicit SwitchThreadMemTracker(const std::shared_ptr<MemTracker>& mem_tracker,
bool in_task = true) {
#ifndef BE_TEST
DCHECK(mem_tracker);
// The thread tracker must be switched after attaching the task, otherwise switching
// in the main thread will cause the cached tracker not to be cleaned up in time.
DCHECK(in_task == false ||
thread_local_ctx.get()->_thread_mem_tracker_mgr->is_attach_task());
_old_tracker_id =
thread_local_ctx.get()->_thread_mem_tracker_mgr->update_tracker(mem_tracker);
#endif
}
~SwitchThreadMemTracker() {
#ifndef BE_TEST
thread_local_ctx.get()->_thread_mem_tracker_mgr->update_tracker_id(_old_tracker_id);
DorisMetrics::instance()->switch_thread_mem_tracker_count->increment(1);
#endif
}
private:
std::string _old_tracker_id;
};
class SwitchThreadMemTrackerErrCallBack {
public:
explicit SwitchThreadMemTrackerErrCallBack(const std::string& action_type,
bool cancel_work = true,
ERRCALLBACK err_call_back_func = nullptr) {
DCHECK(action_type != std::string());
_old_tracker_cb = thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_cb(
action_type, cancel_work, err_call_back_func);
}
~SwitchThreadMemTrackerErrCallBack() {
thread_local_ctx.get()->_thread_mem_tracker_mgr->update_consume_err_cb(_old_tracker_cb);
DorisMetrics::instance()->switch_thread_mem_tracker_err_cb_count->increment(1);
}
private:
ConsumeErrCallBackInfo _old_tracker_cb;
};
} // namespace doris
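
A hedged usage sketch of how the guards above compose inside an operator; Status,
RuntimeState, and MemTracker come from the Doris runtime, while open_sketch and
do_build are hypothetical:

#include <memory>

#include "runtime/thread_context.h"

Status do_build(RuntimeState* state); // hypothetical allocation-heavy work

Status open_sketch(RuntimeState* state, const std::shared_ptr<MemTracker>& node_tracker) {
    // Charge all allocations in this scope (including those seen only by the
    // TCMalloc hook) to node_tracker; the previous tracker id is restored when
    // the guard is destroyed.
    SCOPED_SWITCH_TASK_THREAD_LOCAL_MEM_TRACKER(node_tracker);
    // Attach a cancel message so a limit hit inside the hook reports where it
    // happened; by default the running task is also cancelled.
    SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("ExampleNode, while building state.");
    return do_build(state);
}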

View File

@ -22,19 +22,21 @@
namespace doris {
void ThreadMemTrackerMgr::attach_task(const std::string& action_type, const std::string& task_id,
void ThreadMemTrackerMgr::attach_task(const std::string& cancel_msg, const std::string& task_id,
const TUniqueId& fragment_instance_id,
const std::shared_ptr<MemTracker>& mem_tracker) {
_task_id = task_id;
_fragment_instance_id = fragment_instance_id;
_consume_err_call_back.update(action_type, true, nullptr);
_consume_err_cb.cancel_msg = cancel_msg;
if (mem_tracker == nullptr) {
#ifdef BE_TEST
if (ExecEnv::GetInstance()->task_pool_mem_tracker_registry() == nullptr) {
return;
}
#endif
_temp_task_mem_tracker = ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker(task_id);
_temp_task_mem_tracker =
ExecEnv::GetInstance()->task_pool_mem_tracker_registry()->get_task_mem_tracker(
task_id);
update_tracker(_temp_task_mem_tracker);
} else {
update_tracker(mem_tracker);
@ -44,7 +46,7 @@ void ThreadMemTrackerMgr::attach_task(const std::string& action_type, const std:
void ThreadMemTrackerMgr::detach_task() {
_task_id = "";
_fragment_instance_id = TUniqueId();
_consume_err_call_back.init();
_consume_err_cb.init();
clear_untracked_mems();
_tracker_id = "process";
// The following memory changes for the two map operations of _untracked_mems and _mem_trackers
@ -70,12 +72,12 @@ void ThreadMemTrackerMgr::exceeded_cancel_task(const std::string& cancel_details
void ThreadMemTrackerMgr::exceeded(int64_t mem_usage, Status st) {
auto rst = _mem_trackers[_tracker_id]->mem_limit_exceeded(
nullptr, "In TCMalloc Hook, " + _consume_err_call_back.action_type, mem_usage, st);
if (_consume_err_call_back.call_back_func != nullptr) {
_consume_err_call_back.call_back_func();
nullptr, "In TCMalloc Hook, " + _consume_err_cb.cancel_msg, mem_usage, st);
if (_consume_err_cb.cb_func != nullptr) {
_consume_err_cb.cb_func();
}
if (_task_id != "") {
if (_consume_err_call_back.cancel_task == true) {
if (is_attach_task()) {
if (_consume_err_cb.cancel_task == true) {
exceeded_cancel_task(rst.to_string());
} else {
// TODO(zxy) Need other processing, or log (not too often).

View File

@ -28,27 +28,19 @@ namespace doris {
typedef void (*ERRCALLBACK)();
struct ConsumeErrCallBackInfo {
std::string action_type;
std::string cancel_msg;
bool cancel_task; // Whether to cancel the task when the current tracker exceeds the limit
ERRCALLBACK call_back_func;
ERRCALLBACK cb_func;
ConsumeErrCallBackInfo() {
init();
}
ConsumeErrCallBackInfo() { init(); }
ConsumeErrCallBackInfo(std::string action_type, bool cancel_task, ERRCALLBACK call_back_func)
: action_type(action_type), cancel_task(cancel_task), call_back_func(call_back_func) {}
void update(std::string new_action_type, bool new_cancel_task, ERRCALLBACK new_call_back_func) {
action_type = new_action_type;
cancel_task = new_cancel_task;
call_back_func = new_call_back_func;
}
ConsumeErrCallBackInfo(const std::string& cancel_msg, bool cancel_task, ERRCALLBACK cb_func)
: cancel_msg(cancel_msg), cancel_task(cancel_task), cb_func(cb_func) {}
void init() {
action_type = "";
cancel_msg = "";
cancel_task = false;
call_back_func = nullptr;
cb_func = nullptr;
}
};
@ -80,7 +72,7 @@ public:
}
void clear_untracked_mems() {
for(auto untracked_mem : _untracked_mems) {
for (const auto& untracked_mem : _untracked_mems) {
if (untracked_mem.second != 0) {
DCHECK(_mem_trackers[untracked_mem.first]);
_mem_trackers[untracked_mem.first]->consume(untracked_mem.second);
@ -91,7 +83,7 @@ public:
}
// After attach, the TCMalloc hook of the current thread starts to consume/release the task mem_tracker
void attach_task(const std::string& action_type, const std::string& task_id,
void attach_task(const std::string& cancel_msg, const std::string& task_id,
const TUniqueId& fragment_instance_id,
const std::shared_ptr<MemTracker>& mem_tracker);
@ -101,6 +93,27 @@ public:
// update_tracker may be called very frequently on a thread; adding a memory copy would be slow.
std::string update_tracker(const std::shared_ptr<MemTracker>& mem_tracker);
void update_tracker_id(const std::string& tracker_id) {
if (tracker_id != _tracker_id) {
_untracked_mems[_tracker_id] += _untracked_mem;
_untracked_mem = 0;
_tracker_id = tracker_id;
}
}
inline ConsumeErrCallBackInfo update_consume_err_cb(const std::string& cancel_msg,
bool cancel_task, ERRCALLBACK cb_func) {
_temp_consume_err_cb = _consume_err_cb;
_consume_err_cb.cancel_msg = cancel_msg;
_consume_err_cb.cancel_task = cancel_task;
_consume_err_cb.cb_func = cb_func;
return _temp_consume_err_cb;
}
inline void update_consume_err_cb(const ConsumeErrCallBackInfo& consume_err_cb) {
_consume_err_cb = consume_err_cb;
}
// Note: if the TCMalloc new/delete hook itself allocates memory, for example by
// calling LOG/iostream/sstream/stringstream related methods, guards must be added
// to avoid entering infinite recursion, otherwise it may cause a crash or a hang.
@ -108,6 +121,8 @@ public:
void noncache_consume();
bool is_attach_task() { return _task_id != ""; }
std::shared_ptr<MemTracker> mem_tracker() {
DCHECK(_mem_trackers[_tracker_id]);
return _mem_trackers[_tracker_id];
@ -137,15 +152,16 @@ private:
// Avoid memory allocation in functions and fall into an infinite loop
std::string _temp_tracker_id;
ConsumeErrCallBackInfo _temp_consume_err_call_back;
ConsumeErrCallBackInfo _temp_consume_err_cb;
std::shared_ptr<MemTracker> _temp_task_mem_tracker;
std::string _task_id;
TUniqueId _fragment_instance_id;
ConsumeErrCallBackInfo _consume_err_call_back;
ConsumeErrCallBackInfo _consume_err_cb;
};
inline std::string ThreadMemTrackerMgr::update_tracker(const std::shared_ptr<MemTracker>& mem_tracker) {
inline std::string ThreadMemTrackerMgr::update_tracker(
const std::shared_ptr<MemTracker>& mem_tracker) {
DCHECK(mem_tracker);
_temp_tracker_id = mem_tracker->id();
if (_temp_tracker_id == _tracker_id) {
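
The recursion note above is why the consume path must be guarded: flushing into a
tracker can itself allocate (map rehash, logging), which re-enters the hook. A
minimal, self-contained sketch of such a guard, with hypothetical names:

#include <cstdint>

static thread_local bool in_consume = false; // re-entrancy guard

void consume_from_hook_sketch(int64_t size) {
    if (in_consume) return; // nested allocation made while flushing: skip it
    in_consume = true;
    // ... add `size` to the batched delta and flush into the current tracker;
    // any new/delete this triggers re-enters the hook and returns early above.
    in_consume = false;
}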

View File

@ -53,7 +53,6 @@
#include "runtime/exec_env.h"
#include "runtime/heartbeat_flags.h"
#include "runtime/minidump.h"
#include "runtime/tcmalloc_hook.h"
#include "service/backend_options.h"
#include "service/backend_service.h"
#include "service/brpc_service.h"
@ -65,6 +64,11 @@
#include "util/thrift_server.h"
#include "util/uid_util.h"
#if !defined(__SANITIZE_ADDRESS__) && !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && \
!defined(THREAD_SANITIZER)
#include "runtime/tcmalloc_hook.h"
#endif
static void help(const char*);
#include <dlfcn.h>
@ -336,11 +340,8 @@ int main(int argc, char** argv) {
return -1;
}
if (doris::config::track_new_delete) {
init_hook();
}
#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && !defined(THREAD_SANITIZER)
#if !defined(__SANITIZE_ADDRESS__) && !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && \
!defined(THREAD_SANITIZER)
// Aggressive decommit is required so that unused pages in the TCMalloc page heap are
// not backed by physical pages and do not contribute towards memory consumption.
MallocExtension::instance()->SetNumericProperty("tcmalloc.aggressive_memory_decommit", 1);
@ -351,6 +352,9 @@ int main(int argc, char** argv) {
fprintf(stderr, "Failed to change TCMalloc total thread cache size.\n");
return -1;
}
if (doris::config::track_new_delete) {
init_hook();
}
#endif
if (!doris::Env::init()) {

View File

@ -132,6 +132,10 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(load_bytes, MetricUnit::BYTES);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(memtable_flush_total, MetricUnit::OPERATIONS);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(memtable_flush_duration_us, MetricUnit::MICROSECONDS);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(attach_task_thread_count, MetricUnit::NOUNIT);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(switch_thread_mem_tracker_count, MetricUnit::NOUNIT);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(switch_thread_mem_tracker_err_cb_count, MetricUnit::NOUNIT);
DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(memory_pool_bytes_total, MetricUnit::BYTES);
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(process_thread_num, MetricUnit::NOUNIT);
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(process_fd_num_used, MetricUnit::NOUNIT);
@ -275,6 +279,10 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) {
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, load_rows);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, load_bytes);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, attach_task_thread_count);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, switch_thread_mem_tracker_count);
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, switch_thread_mem_tracker_err_cb_count);
_server_metric_entity->register_hook(_s_hook_name, std::bind(&DorisMetrics::_update, this));
INT_UGAUGE_METRIC_REGISTER(_server_metric_entity, query_cache_memory_total_byte);

View File

@ -125,6 +125,10 @@ public:
IntCounter* memtable_flush_total;
IntCounter* memtable_flush_duration_us;
IntCounter* attach_task_thread_count;
IntCounter* switch_thread_mem_tracker_count;
IntCounter* switch_thread_mem_tracker_err_cb_count;
IntGauge* memory_pool_bytes_total;
IntGauge* process_thread_num;
IntGauge* process_fd_num_used;

View File

@ -20,6 +20,7 @@
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/mem_tracker.h"
#include "runtime/runtime_filter_mgr.h"
#include "runtime/thread_context.h"
#include "util/defer_op.h"
#include "vec/core/materialize_block.h"
#include "vec/exprs/vexpr.h"
@ -921,6 +922,7 @@ Status HashJoinNode::open(RuntimeState* state) {
Status HashJoinNode::_hash_table_build(RuntimeState* state) {
RETURN_IF_ERROR(child(1)->open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Hash join, while constructing the hash table.");
SCOPED_TIMER(_build_timer);
MutableBlock mutable_block(child(1)->row_desc().tuple_descriptors());
@ -936,7 +938,6 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) {
RETURN_IF_ERROR(child(1)->get_next(state, &block, &eos));
_hash_table_mem_tracker->consume(block.allocated_bytes());
_mem_used += block.allocated_bytes();
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while getting next from the child 1.");
if (block.rows() != 0) { mutable_block.merge(block); }
@ -947,7 +948,6 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) {
// TODO: Rethink whether we should do the process after we receive all build blocks,
// which may be better.
RETURN_IF_ERROR(_process_build_block(state, _build_blocks[index], index));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table.");
mutable_block = MutableBlock();
++index;
@ -957,7 +957,6 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) {
_build_blocks.emplace_back(mutable_block.to_block());
RETURN_IF_ERROR(_process_build_block(state, _build_blocks[index], index));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Hash join, while constructing the hash table.");
return std::visit(
[&](auto&& arg) -> Status {

View File

@ -22,6 +22,7 @@
#include "exec/exec_node.h"
#include "runtime/mem_pool.h"
#include "runtime/row_batch.h"
#include "runtime/thread_context.h"
#include "util/defer_op.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_nullable.h"
@ -332,6 +333,7 @@ Status AggregationNode::prepare(RuntimeState* state) {
Status AggregationNode::open(RuntimeState* state) {
RETURN_IF_ERROR(ExecNode::open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("aggregator, while execute open.");
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_ERROR(VExpr::open(_probe_expr_ctxs, state));
@ -356,7 +358,6 @@ Status AggregationNode::open(RuntimeState* state) {
}
RETURN_IF_ERROR(_executor.execute(&block));
_executor.update_memusage();
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "aggregator, while execute open.");
}
return Status::OK();
@ -366,7 +367,9 @@ Status AggregationNode::get_next(RuntimeState* state, RowBatch* row_batch, bool*
return Status::NotSupported("Not Implemented Aggregation Node::get_next scalar");
}
Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) { SCOPED_TIMER(_runtime_profile->total_time_counter());
Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("aggregator, while execute get_next.");
SCOPED_TIMER(_runtime_profile->total_time_counter());
if (_is_streaming_preagg) {
bool child_eos = false;
@ -395,7 +398,6 @@ Status AggregationNode::get_next(RuntimeState* state, Block* block, bool* eos) {
}
_executor.update_memusage();
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "aggregator, while execute get_next.");
return Status::OK();
}

View File

@ -23,6 +23,7 @@
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/thread_context.h"
#include "util/runtime_profile.h"
namespace doris::vectorized {
@ -53,6 +54,7 @@ Status VCrossJoinNode::close(RuntimeState* state) {
Status VCrossJoinNode::construct_build_side(RuntimeState* state) {
// Do a full scan of child(1) and store all build row batches.
RETURN_IF_ERROR(child(1)->open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB("Vec Cross join, while getting next from the child 1");
bool eos = false;
while (true) {
@ -70,8 +72,6 @@ Status VCrossJoinNode::construct_build_side(RuntimeState* state) {
_build_blocks.emplace_back(std::move(block));
_block_mem_tracker->consume(mem_usage);
}
// to prevent using too much memory
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Cross join, while getting next from the child 1.");
if (eos) {
break;

View File

@ -17,6 +17,7 @@
#include "vec/exec/vset_operation_node.h"
#include "runtime/thread_context.h"
#include "util/defer_op.h"
#include "vec/exprs/vexpr.h"
namespace doris {
@ -228,6 +229,8 @@ void VSetOperationNode::hash_table_init() {
//build a hash table from child(0)
Status VSetOperationNode::hash_table_build(RuntimeState* state) {
RETURN_IF_ERROR(child(0)->open(state));
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER_ERR_CB(
"Vec Set Operation Node, while constructing the hash table");
Block block;
MutableBlock mutable_block(child(0)->row_desc().tuple_descriptors());
@ -244,7 +247,6 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) {
_hash_table_mem_tracker->consume(allocated_bytes);
_mem_used += allocated_bytes;
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Set Operation Node, while getting next from the child 0.");
if (block.rows() != 0) { mutable_block.merge(block); }
// make one block for each 4 gigabytes
@ -254,7 +256,6 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) {
// TODO: Rethink whether we should do the process after we receive all build blocks,
// which may be better.
RETURN_IF_ERROR(process_build_block(_build_blocks[index], index));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Set Operation Node, while constructing the hash table.");
mutable_block = MutableBlock();
++index;
last_mem_used = _mem_used;
@ -263,7 +264,6 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) {
_build_blocks.emplace_back(mutable_block.to_block());
RETURN_IF_ERROR(process_build_block(_build_blocks[index], index));
RETURN_IF_INSTANCE_LIMIT_EXCEEDED(state, "Set Operation Node, while constructing the hash table.");
return Status::OK();
}