From d2c42ec638f788d6eb23fc02ba45584ca198c8f0 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Wed, 28 Jun 2023 16:49:45 +0800 Subject: [PATCH] [fix](memory) Purge Jemalloc arena dirty pages when memory insufficient (#21237) Jemalloc only applies madvise MADV_FREE to dirty pages, so the memory is not released back to the system immediately and RSS does not drop in time. Therefore, when the process memory exceeds its limit or the system available memory is insufficient, manually transfer the dirty pages to muzzy pages, which will call MADV_DONTNEED to release the physical memory back to the system. https://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms --- be/src/common/daemon.cpp | 4 +- be/src/runtime/memory/mem_tracker_limiter.cpp | 2 +- be/src/util/mem_info.cpp | 17 ++++++--- be/src/util/mem_info.h | 37 +++++++++++++++++++ be/src/util/system_metrics.cpp | 30 +++++++++++++++ 5 files changed, 82 insertions(+), 8 deletions(-) diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 7c64596bac..cfb002a4c4 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -245,7 +245,7 @@ void Daemon::memory_gc_thread() { // No longer full gc and minor gc during sleep. memory_full_gc_sleep_time_ms = config::memory_gc_sleep_time_ms; memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms; - doris::MemTrackerLimiter::print_log_process_usage("process full gc", false); + doris::MemTrackerLimiter::print_log_process_usage("Start Full GC", false); if (doris::MemInfo::process_full_gc()) { // If there is not enough memory to be gc, the process memory usage will not be printed in the next continuous gc. doris::MemTrackerLimiter::enable_print_log_process_usage(); @@ -255,7 +255,7 @@ void Daemon::memory_gc_thread() { proc_mem_no_allocator_cache >= doris::MemInfo::soft_mem_limit())) { // No minor gc during sleep, but full gc is possible. 
memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms; - doris::MemTrackerLimiter::print_log_process_usage("process minor gc", false); + doris::MemTrackerLimiter::print_log_process_usage("Start Minor GC", false); if (doris::MemInfo::process_minor_gc()) { doris::MemTrackerLimiter::enable_print_log_process_usage(); } diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index d03bd1ac00..683971ecac 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -137,7 +137,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector process_mem_sum += it.second->current_value(); } - snapshot.type = "tc/jemalloc_cache"; + snapshot.type = "tc/jemalloc_free_memory"; snapshot.label = ""; snapshot.limit = -1; snapshot.cur_consumption = MemInfo::allocator_cache_mem(); diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index f50f41e198..200d346ded 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -77,14 +77,18 @@ int64_t MemInfo::_s_process_full_gc_size = -1; void MemInfo::refresh_allocator_mem() { #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) #elif defined(USE_JEMALLOC) + // 'epoch' is a special mallctl -- it updates the statistics. Without it, all + // the following calls will return stale values. It increments and returns + // the current epoch number, which might be useful to log as a sanity check. 
uint64_t epoch = 0; size_t sz = sizeof(epoch); jemallctl("epoch", &epoch, &sz, &epoch, sz); // https://jemalloc.net/jemalloc.3.html - _s_allocator_cache_mem = - get_je_metrics(fmt::format("stats.arenas.{}.tcache_bytes", MALLCTL_ARENAS_ALL)) + - get_je_metrics("stats.metadata"); + // https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md + _s_allocator_cache_mem = get_je_all_arena_metrics("tcache_bytes") + + get_je_metrics("stats.metadata") + + get_je_all_arena_metrics("pdirty") * get_page_size(); _s_allocator_cache_mem_str = PrettyPrinter::print(static_cast(_s_allocator_cache_mem), TUnit::BYTES); _s_virtual_memory_used = get_je_metrics("stats.mapped"); @@ -125,6 +129,7 @@ void MemInfo::process_cache_gc(int64_t& freed_mem) { segment_v2::PRIMARY_KEY_INDEX_PAGE); StoragePageCache::instance()->prune(segment_v2::PRIMARY_KEY_INDEX_PAGE); } + je_purge_all_arena_dirty_pages(); } // step1: free all cache @@ -139,7 +144,8 @@ bool MemInfo::process_minor_gc() { std::string mem_available_str = MemInfo::sys_mem_available_str(); Defer defer {[&]() { - LOG(INFO) << fmt::format("Process Minor GC Free Memory {} Bytes. cost(us): {}", freed_mem, + je_purge_all_arena_dirty_pages(); + LOG(INFO) << fmt::format("End Minor GC, Free Memory {} Bytes. cost(us): {}", freed_mem, watch.elapsed_time() / 1000); }}; @@ -181,7 +187,8 @@ bool MemInfo::process_full_gc() { std::string mem_available_str = MemInfo::sys_mem_available_str(); Defer defer {[&]() { - LOG(INFO) << fmt::format("Process Full GC Free Memory {} Bytes. cost(us): {}", freed_mem, + je_purge_all_arena_dirty_pages(); + LOG(INFO) << fmt::format("End Full GC Free, Memory {} Bytes. 
cost(us): {}", freed_mem, watch.elapsed_time() / 1000); }}; diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h index 12c70d8cc4..89a66b0658 100644 --- a/be/src/util/mem_info.h +++ b/be/src/util/mem_info.h @@ -26,6 +26,12 @@ #include #include +#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE) +#include +#else +#include +#endif + #include "common/logging.h" #ifdef USE_JEMALLOC #include "jemalloc/jemalloc.h" @@ -46,6 +52,14 @@ public: static inline bool initialized() { return _s_initialized; } + static int get_page_size() { +#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE) + return getpagesize(); +#else + return vm_page_size; +#endif + } + // Get total physical memory in bytes (if has cgroups memory limits, return the limits). static inline int64_t physical_mem() { DCHECK(_s_initialized); @@ -83,6 +97,22 @@ public: #endif return 0; } + + static inline int64_t get_je_all_arena_metrics(const std::string& name) { +#ifdef USE_JEMALLOC + return get_je_metrics(fmt::format("stats.arenas.{}.{}", MALLCTL_ARENAS_ALL, name)); +#endif + return 0; + } + + static inline void je_purge_all_arena_dirty_pages() { +#ifdef USE_JEMALLOC + // Purge all unused dirty pages for the arena at the given index, or for all arenas if the index equals MALLCTL_ARENAS_ALL. + jemallctl(fmt::format("arena.{}.purge", MALLCTL_ARENAS_ALL).c_str(), nullptr, nullptr, + nullptr, 0); +#endif + } + static inline size_t allocator_virtual_mem() { return _s_virtual_memory_used; } static inline size_t allocator_cache_mem() { return _s_allocator_cache_mem; } static inline std::string allocator_cache_mem_str() { return _s_allocator_cache_mem_str; } @@ -94,6 +124,13 @@ public: // obtained by the process malloc, not the physical memory actually used by the process in the OS. static void refresh_allocator_mem(); + /** jemalloc pdirty is the number of pages within unused extents that are potentially + * dirty, and for which madvise() or similar has not been called. 
+ * + * So they will be subtracted from RSS to make accounting more + * accurate, since those pages are not really RSS but a memory + * that can be used at anytime via jemalloc. + */ static inline void refresh_proc_mem_no_allocator_cache() { _s_proc_mem_no_allocator_cache = PerfCounters::get_vm_rss() - static_cast(_s_allocator_cache_mem); diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index fa8f5a181a..ee7db9494c 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -117,6 +117,12 @@ DEFINE_MEMORY_GAUGE_METRIC(jemalloc_metadata_bytes, MetricUnit::BYTES); DEFINE_MEMORY_GAUGE_METRIC(jemalloc_resident_bytes, MetricUnit::BYTES); DEFINE_MEMORY_GAUGE_METRIC(jemalloc_mapped_bytes, MetricUnit::BYTES); DEFINE_MEMORY_GAUGE_METRIC(jemalloc_retained_bytes, MetricUnit::BYTES); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_tcache_bytes, MetricUnit::BYTES); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pactive_num, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pdirty_num, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pmuzzy_num, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_dirty_purged_num, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(jemalloc_muzzy_purged_num, MetricUnit::NOUNIT); #endif struct MemoryMetrics { @@ -142,6 +148,12 @@ struct MemoryMetrics { INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_resident_bytes); INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_mapped_bytes); INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_retained_bytes); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_tcache_bytes); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pactive_num); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pdirty_num); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pmuzzy_num); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_dirty_purged_num); + INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_muzzy_purged_num); #endif } @@ -167,6 +179,12 @@ struct MemoryMetrics { 
IntGauge* memory_jemalloc_resident_bytes; IntGauge* memory_jemalloc_mapped_bytes; IntGauge* memory_jemalloc_retained_bytes; + IntGauge* memory_jemalloc_tcache_bytes; + IntGauge* memory_jemalloc_pactive_num; + IntGauge* memory_jemalloc_pdirty_num; + IntGauge* memory_jemalloc_pmuzzy_num; + IntGauge* memory_jemalloc_dirty_purged_num; + IntGauge* memory_jemalloc_muzzy_purged_num; #endif }; @@ -457,6 +475,18 @@ void SystemMetrics::update_allocator_metrics() { MemInfo::get_je_metrics("stats.mapped")); _memory_metrics->memory_jemalloc_retained_bytes->set_value( MemInfo::get_je_metrics("stats.retained")); + _memory_metrics->memory_jemalloc_tcache_bytes->set_value( + MemInfo::get_je_all_arena_metrics("tcache_bytes")); + _memory_metrics->memory_jemalloc_pactive_num->set_value( + MemInfo::get_je_all_arena_metrics("pactive")); + _memory_metrics->memory_jemalloc_pdirty_num->set_value( + MemInfo::get_je_all_arena_metrics("pdirty")); + _memory_metrics->memory_jemalloc_pmuzzy_num->set_value( + MemInfo::get_je_all_arena_metrics("pmuzzy")); + _memory_metrics->memory_jemalloc_dirty_purged_num->set_value( + MemInfo::get_je_all_arena_metrics("dirty_purged")); + _memory_metrics->memory_jemalloc_muzzy_purged_num->set_value( + MemInfo::get_je_all_arena_metrics("muzzy_purged")); #else _memory_metrics->memory_tcmalloc_allocated_bytes->set_value( MemInfo::get_tc_metrics("generic.total_physical_bytes"));