From 4bb2e952955b96e049522bf769ab3cc1b1736c09 Mon Sep 17 00:00:00 2001 From: tushicheng <18829573815@163.com> Date: Mon, 15 Apr 2024 03:20:56 +0000 Subject: [PATCH] diagnose memory bloat --- .../oblib/src/lib/alloc/alloc_failed_reason.h | 6 ++ deps/oblib/src/lib/alloc/memory_dump.cpp | 70 +++++++++++++++++-- deps/oblib/src/lib/alloc/memory_dump.h | 2 + .../src/lib/alloc/ob_malloc_sample_struct.h | 20 +++++- .../src/lib/alloc/ob_tenant_ctx_allocator.cpp | 7 ++ mittest/mtlenv/mock_tenant_module_env.h | 6 -- src/observer/ob_server.cpp | 14 ---- src/observer/omt/ob_multi_tenant.cpp | 2 +- src/share/ob_tenant_mgr.cpp | 5 -- src/share/ob_thread_define.h | 1 - src/storage/tx_storage/ob_tenant_freezer.cpp | 6 -- .../tx_storage/ob_tenant_memory_printer.cpp | 20 ------ 12 files changed, 101 insertions(+), 58 deletions(-) diff --git a/deps/oblib/src/lib/alloc/alloc_failed_reason.h b/deps/oblib/src/lib/alloc/alloc_failed_reason.h index 48e4480990..b8fb82bf76 100644 --- a/deps/oblib/src/lib/alloc/alloc_failed_reason.h +++ b/deps/oblib/src/lib/alloc/alloc_failed_reason.h @@ -64,6 +64,12 @@ public: return reason_ == lib::PHYSICAL_MEMORY_EXHAUST; } + bool reach_limit_except_ctx() const + { + return reason_ == lib::TENANT_HOLD_REACH_LIMIT || + reason_ == lib::SERVER_HOLD_REACH_LIMIT || + reason_ == lib::PHYSICAL_MEMORY_EXHAUST; + } }; char *alloc_failed_msg(); diff --git a/deps/oblib/src/lib/alloc/memory_dump.cpp b/deps/oblib/src/lib/alloc/memory_dump.cpp index f7757f8a32..059ef1bb10 100644 --- a/deps/oblib/src/lib/alloc/memory_dump.cpp +++ b/deps/oblib/src/lib/alloc/memory_dump.cpp @@ -21,8 +21,10 @@ #include "lib/thread/ob_thread_name.h" #include "lib/thread/thread_mgr.h" #include "lib/utility/ob_print_utils.h" +#include "lib/container/ob_vector.h" #include "rpc/obrpc/ob_rpc_packet.h" #include "common/ob_clock_generator.h" +#include "common/ob_smart_var.h" namespace oceanbase { @@ -257,6 +259,57 @@ int ObMemoryDump::load_malloc_sample_map(ObMallocSampleMap &malloc_sample_map) return ret; } +void ObMemoryDump::print_malloc_sample_info() +{ + int ret = OB_SUCCESS; + typedef ObSortedVector MallocSamplePairVector; + ObLatchRGuard guard(iter_lock_, ObLatchIds::MEM_DUMP_ITER_LOCK); + ObMallocSampleMap &map = r_stat_->malloc_sample_map_; + ObMemAttr attr(OB_SERVER_TENANT_ID, "MallocSampleInf", ObCtxIds::DEFAULT_CTX_ID, lib::OB_HIGH_ALLOC); + MallocSamplePairVector vector(map.size(), nullptr, attr); + for (ObMallocSampleIter it = map.begin(); OB_SUCC(ret) && it != map.end(); ++it) { + MallocSamplePairVector::iterator pos; + ret = vector.insert(&(*it), pos, ObMallocSamplePairCmp()); + } + int64_t print_pos = 0; + int64_t tenant_id = OB_SERVER_TENANT_ID; + int64_t ctx_id = ObCtxIds::DEFAULT_CTX_ID; + const char *label = ""; + int64_t bt_cnt = 0; + const int64_t MAX_LABEL_BT_CNT = 5; + for (MallocSamplePairVector::iterator it = vector.begin(); OB_SUCC(ret) && it != vector.end(); ++it) { + if ((*it)->first.tenant_id_ != tenant_id || (*it)->first.ctx_id_ != ctx_id) { + if (print_pos > 0) { + _LOG_INFO("\n[MEMORY][BT] tenant_id=%5ld ctx_id=%25s\n%.*s", + tenant_id, get_global_ctx_info().get_ctx_name(ctx_id), static_cast(print_pos), print_buf_); + print_pos = 0; + } + tenant_id = (*it)->first.tenant_id_; + ctx_id = (*it)->first.ctx_id_; + label = (*it)->first.label_; + bt_cnt = 0; + } else if (0 != STRCMP(label, (*it)->first.label_)) { + label = (*it)->first.label_; + bt_cnt = 0; + } + if (bt_cnt++ < MAX_LABEL_BT_CNT) { + char bt[MAX_BACKTRACE_LENGTH]; + parray(bt, sizeof(bt), (int64_t*)(*it)->first.bt_, AOBJECT_BACKTRACE_COUNT); + ret = databuff_printf(print_buf_, PRINT_BUF_LEN, print_pos, "[MEMORY][BT] mod=%15s, alloc_bytes=% '15ld, alloc_count=% '15ld, bt=%s\n", + label, (*it)->second.alloc_bytes_, (*it)->second.alloc_count_, bt); + if (OB_SUCC(ret) && print_pos > PRINT_BUF_LEN / 2) { + _LOG_INFO("\n[MEMORY][BT] tenant_id=%5ld ctx_id=%25s\n%.*s", + tenant_id, get_global_ctx_info().get_ctx_name(ctx_id), static_cast(print_pos), print_buf_); + print_pos = 0; + } + } + } + if (OB_SUCC(ret) && print_pos > 0) { + _LOG_INFO("\n[MEMORY][BT] tenant_id=%5ld ctx_id=%25s\n%.*s", + tenant_id, get_global_ctx_info().get_ctx_name(ctx_id), static_cast(print_pos), print_buf_); + } +} + void ObMemoryDump::run1() { SANITY_DISABLE_CHECK_RANGE(); // prevent sanity_check_range @@ -522,14 +575,13 @@ void ObMemoryDump::handle(void *task) ObMemoryCheckContext *memory_check_ctx = m_task->memory_check_ctx_; ObSqlMemoryLeakChecker::get_instance().update_check_range(NULL == memory_check_ctx || !memory_check_ctx->is_sql_memory_leak(), min_check_version, max_check_version); + ObMallocAllocator *ma = ObMallocAllocator::get_instance(); for (int tenant_idx = 0; tenant_idx < tenant_cnt; tenant_idx++) { uint64_t tenant_id = tenant_ids_[tenant_idx]; for (int ctx_id = 0; ctx_id < ObCtxIds::MAX_CTX_ID; ctx_id++) { - auto ta = - ObMallocAllocator::get_instance()->get_tenant_ctx_allocator(tenant_id, ctx_id); + ObTenantCtxAllocatorGuard ta = ma->get_tenant_ctx_allocator(tenant_id, ctx_id); if (nullptr == ta) { - ta = ObMallocAllocator::get_instance()->get_tenant_ctx_allocator_unrecycled(tenant_id, - ctx_id); + ta = ma->get_tenant_ctx_allocator_unrecycled(tenant_id, ctx_id); } if (nullptr == ta) { continue; @@ -641,6 +693,16 @@ void ObMemoryDump::handle(void *task) } memory_check_ctx = NULL; } + + for (int tenant_idx = 0; tenant_idx < tenant_cnt; tenant_idx++) { + uint64_t tenant_id = tenant_ids_[tenant_idx]; + ma->print_tenant_memory_usage(tenant_id); + ma->print_tenant_ctx_memory_usage(tenant_id); + } + +#ifdef FATAL_ERROR_HANG + print_malloc_sample_info(); +#endif } else { int fd = -1; if (-1 == (fd = ::open(LOG_FILE, diff --git a/deps/oblib/src/lib/alloc/memory_dump.h b/deps/oblib/src/lib/alloc/memory_dump.h index 7b1b9c25bd..bba769bd12 100644 --- a/deps/oblib/src/lib/alloc/memory_dump.h +++ b/deps/oblib/src/lib/alloc/memory_dump.h @@ -213,6 +213,8 @@ public: private: void run1() override; void handle(void *task); + + void print_malloc_sample_info(); private: AChunk *find_chunk(void *ptr); private: diff --git a/deps/oblib/src/lib/alloc/ob_malloc_sample_struct.h b/deps/oblib/src/lib/alloc/ob_malloc_sample_struct.h index 1ff63d3bc8..de1dc3b97e 100644 --- a/deps/oblib/src/lib/alloc/ob_malloc_sample_struct.h +++ b/deps/oblib/src/lib/alloc/ob_malloc_sample_struct.h @@ -67,7 +67,25 @@ struct ObMallocSampleValue typedef hash::ObHashMap ObMallocSampleMap; - +typedef hash::HashMapPair ObMallocSamplePair; +typedef ObMallocSampleMap::iterator ObMallocSampleIter; +struct ObMallocSamplePairCmp +{ + bool operator()(const ObMallocSamplePair *left, const ObMallocSamplePair *right) + { + bool bret = true; + if (left->first.tenant_id_ != right->first.tenant_id_) { + bret = left->first.tenant_id_ < right->first.tenant_id_; + } else if (left->first.ctx_id_ != right->first.ctx_id_) { + bret = left->first.ctx_id_ < right->first.ctx_id_; + } else if (0 != STRCMP(left->first.label_, right->first.label_)) { + bret = STRCMP(left->first.label_, right->first.label_) < 0; + } else if (left->second.alloc_bytes_ != right->second.alloc_bytes_) { + bret = left->second.alloc_bytes_ > right->second.alloc_bytes_; + } + return bret; + } +}; inline uint64_t ob_malloc_sample_hash(const char* data) { diff --git a/deps/oblib/src/lib/alloc/ob_tenant_ctx_allocator.cpp b/deps/oblib/src/lib/alloc/ob_tenant_ctx_allocator.cpp index 19cfde3a3f..751b272355 100644 --- a/deps/oblib/src/lib/alloc/ob_tenant_ctx_allocator.cpp +++ b/deps/oblib/src/lib/alloc/ob_tenant_ctx_allocator.cpp @@ -481,6 +481,13 @@ void* ObTenantCtxAllocator::common_realloc(const void *ptr, const int64_t size, SANITY_POISON(obj, obj->nobjs_ * AOBJECT_CELL_BYTES); SANITY_UNPOISON(obj->data_, size); } else if (TC_REACH_TIME_INTERVAL(1 * 1000 * 1000)) { +#ifdef FATAL_ERROR_HANG + if (g_alloc_failed_ctx().reach_limit_except_ctx() && + REACH_TIME_INTERVAL(60 * 1000 * 1000)) { + ObMemoryDump::get_instance().generate_mod_stat_task(); + sleep(1); + } +#endif const char *msg = is_errsim ? "[ERRSIM] errsim inject memory error" : alloc_failed_msg(); LOG_DBA_WARN(OB_ALLOCATE_MEMORY_FAILED, "[OOPS]", "alloc failed reason", KCSTRING(msg)); _OB_LOG_RET(WARN, OB_ALLOCATE_MEMORY_FAILED, "oops, alloc failed, tenant_id=%ld, ctx_id=%ld, ctx_name=%s, ctx_hold=%ld, " diff --git a/mittest/mtlenv/mock_tenant_module_env.h b/mittest/mtlenv/mock_tenant_module_env.h index c2ca3bbc24..f7b3951c18 100644 --- a/mittest/mtlenv/mock_tenant_module_env.h +++ b/mittest/mtlenv/mock_tenant_module_env.h @@ -659,8 +659,6 @@ int MockTenantModuleEnv::init_before_start_mtl() STORAGE_LOG(ERROR, "failed to init bandwidth_throttle_", K(ret)); } else if (OB_FAIL(TG_START(lib::TGDefIDs::ServerGTimer))) { STORAGE_LOG(ERROR, "init timer fail", KR(ret)); - } else if (OB_FAIL(TG_START(lib::TGDefIDs::MemDumpTimer))) { - STORAGE_LOG(ERROR, "init memory dump timer fail", KR(ret)); } else { obrpc::ObRpcNetHandler::CLUSTER_ID = 1; oceanbase::palf::election::INIT_TS = 1; @@ -865,10 +863,6 @@ void MockTenantModuleEnv::destroy() TG_WAIT(lib::TGDefIDs::ServerGTimer); TG_DESTROY(lib::TGDefIDs::ServerGTimer); - TG_STOP(lib::TGDefIDs::MemDumpTimer); - TG_WAIT(lib::TGDefIDs::MemDumpTimer); - TG_DESTROY(lib::TGDefIDs::MemDumpTimer); - if (OB_NOT_NULL(THE_IO_DEVICE)) { THE_IO_DEVICE->destroy(); } diff --git a/src/observer/ob_server.cpp b/src/observer/ob_server.cpp index fe02cd3c23..98b0f9c42c 100644 --- a/src/observer/ob_server.cpp +++ b/src/observer/ob_server.cpp @@ -634,10 +634,6 @@ void ObServer::destroy() TG_DESTROY(lib::TGDefIDs::CTASCleanUpTimer); FLOG_INFO("ctas clean up timer destroyed"); - FLOG_INFO("begin to destroy memory dump timer"); - TG_DESTROY(lib::TGDefIDs::MemDumpTimer); - FLOG_INFO("memory dump timer destroyed"); - FLOG_INFO("begin to destroy redef heart beat task"); TG_DESTROY(lib::TGDefIDs::RedefHeartBeatTask); FLOG_INFO("redef heart beat task destroyed"); @@ -1342,10 +1338,6 @@ int ObServer::stop() TG_STOP(lib::TGDefIDs::CTASCleanUpTimer); FLOG_INFO("ctas clean up timer stopped"); - FLOG_INFO("begin to stop memory dump timer"); - TG_STOP(lib::TGDefIDs::MemDumpTimer); - FLOG_INFO("memory dump timer stopped"); - FLOG_INFO("begin to stop ctas clean up timer"); TG_STOP(lib::TGDefIDs::HeartBeatCheckTask); FLOG_INFO("ctas clean up timer stopped"); @@ -1640,10 +1632,6 @@ int ObServer::wait() TG_WAIT(lib::TGDefIDs::CTASCleanUpTimer); FLOG_INFO("wait ctas clean up timer success"); - FLOG_INFO("begin to wait memory dump timer"); - TG_WAIT(lib::TGDefIDs::MemDumpTimer); - FLOG_INFO("wait memory dump timer success"); - FLOG_INFO("begin to wait root service"); root_service_.wait(); FLOG_INFO("wait root service success"); @@ -2056,8 +2044,6 @@ int ObServer::init_config_module() LOG_ERROR("fail to init server trace timer", KR(ret)); } else if (OB_FAIL(TG_START(lib::TGDefIDs::CTASCleanUpTimer))) { LOG_ERROR("fail to init ctas clean up timer", KR(ret)); - } else if (OB_FAIL(TG_START(lib::TGDefIDs::MemDumpTimer))) { - LOG_ERROR("fail to init memory dump timer", KR(ret)); } else if (OB_FAIL(config_mgr_.base_init())) { LOG_ERROR("config_mgr_ base_init failed", KR(ret)); } else if (OB_FAIL(config_mgr_.init(sql_proxy_, self_addr_))) { diff --git a/src/observer/omt/ob_multi_tenant.cpp b/src/observer/omt/ob_multi_tenant.cpp index be85f1f370..57f5ea94a1 100644 --- a/src/observer/omt/ob_multi_tenant.cpp +++ b/src/observer/omt/ob_multi_tenant.cpp @@ -605,7 +605,7 @@ int ObMultiTenant::start() } else if (OB_FAIL(ObTenantNodeBalancer::get_instance().start())) { LOG_ERROR("start tenant node balancer thread failed", K(ret)); // start memstore print timer. - } else if (OB_FAIL(printer.register_timer_task(lib::TGDefIDs::MemDumpTimer))) { + } else if (OB_FAIL(printer.register_timer_task(lib::TGDefIDs::ServerGTimer))) { LOG_ERROR("Fail to register timer task", K(ret)); } else { LOG_INFO("succ to start multi tenant"); diff --git a/src/share/ob_tenant_mgr.cpp b/src/share/ob_tenant_mgr.cpp index 61a00fa7f0..768ddaaf80 100644 --- a/src/share/ob_tenant_mgr.cpp +++ b/src/share/ob_tenant_mgr.cpp @@ -355,7 +355,6 @@ int ObVirtualTenantManager::print_tenant_usage_( int64_t &pos) { int ret = OB_SUCCESS; - lib::ObMallocAllocator *mallocator = lib::ObMallocAllocator::get_instance(); int64_t kv_cache_mem = 0; if (OB_FAIL(get_kv_cache_mem_(node.tenant_id_, kv_cache_mem))) { @@ -372,10 +371,6 @@ int ObVirtualTenantManager::print_tenant_usage_( get_tenant_memory_hold(node.tenant_id_), kv_cache_mem); } - if (!OB_ISNULL(mallocator)) { - mallocator->print_tenant_memory_usage(node.tenant_id_); - mallocator->print_tenant_ctx_memory_usage(node.tenant_id_); - } return ret; } diff --git a/src/share/ob_thread_define.h b/src/share/ob_thread_define.h index efea987ba9..6e2036499e 100755 --- a/src/share/ob_thread_define.h +++ b/src/share/ob_thread_define.h @@ -145,7 +145,6 @@ TG_DEF(CDCService, CDCSrv, THREAD_POOL, 1) TG_DEF(LogUpdater, LogUpdater, TIMER) TG_DEF(HeartBeatCheckTask, HeartBeatCheckTask, TIMER) TG_DEF(RedefHeartBeatTask, RedefHeartBeatTask, TIMER) -TG_DEF(MemDumpTimer, MemDumpTimer, TIMER) TG_DEF(SSTableDefragment, SSTableDefragment, TIMER) TG_DEF(TenantMetaMemMgr, TenantMetaMemMgr, TIMER) TG_DEF(IngressService, IngressService, TIMER) diff --git a/src/storage/tx_storage/ob_tenant_freezer.cpp b/src/storage/tx_storage/ob_tenant_freezer.cpp index 8629358d39..b53dc111d9 100644 --- a/src/storage/tx_storage/ob_tenant_freezer.cpp +++ b/src/storage/tx_storage/ob_tenant_freezer.cpp @@ -1464,7 +1464,6 @@ int ObTenantFreezer::print_tenant_usage( { int ret = OB_SUCCESS; ObTenantStatistic stat; - lib::ObMallocAllocator *mallocator = lib::ObMallocAllocator::get_instance(); if (!is_inited_) { ret = OB_NOT_INIT; @@ -1502,11 +1501,6 @@ int ObTenantFreezer::print_tenant_usage( stat.memstore_reclaimed_pos_); } - if (!OB_ISNULL(mallocator)) { - mallocator->print_tenant_memory_usage(tenant_info_.tenant_id_); - mallocator->print_tenant_ctx_memory_usage(tenant_info_.tenant_id_); - } - return ret; } diff --git a/src/storage/tx_storage/ob_tenant_memory_printer.cpp b/src/storage/tx_storage/ob_tenant_memory_printer.cpp index eea122d18e..af98e10820 100644 --- a/src/storage/tx_storage/ob_tenant_memory_printer.cpp +++ b/src/storage/tx_storage/ob_tenant_memory_printer.cpp @@ -90,26 +90,6 @@ int ObTenantMemoryPrinter::print_tenant_usage() LOG_WARN("print mtl tenant usage failed", K(tmp_ret), K(tenant_id)); } } - int tenant_cnt = 0; - static uint64_t all_tenant_ids[OB_MAX_SERVER_TENANT_CNT] = {0}; - common::get_tenant_ids(all_tenant_ids, OB_MAX_SERVER_TENANT_CNT, tenant_cnt); - lib::ObMallocAllocator *mallocator = lib::ObMallocAllocator::get_instance(); - for (int64_t i = 0; OB_SUCC(ret) && i < tenant_cnt; ++i) { - uint64_t id = all_tenant_ids[i]; - if (!is_virtual_tenant_id(id)) { - bool is_deleted_tenant = true; - for (int j = 0; j < mtl_tenant_ids.count(); ++j) { - if (id == mtl_tenant_ids[j]) { - is_deleted_tenant = false; - break; - } - } - if (is_deleted_tenant) { - mallocator->print_tenant_memory_usage(id); - mallocator->print_tenant_ctx_memory_usage(id); - } - } - } } if (OB_SIZE_OVERFLOW == ret) {