[feature](be jvm monitor) Add enable_jvm_monitor to be.conf to control the JVM monitor. (#35608) (#35764)

bp #35608

Co-authored-by: daidai <2017501503@qq.com>
This commit is contained in:
Mingyu Chen
2024-06-02 00:18:44 +08:00
committed by GitHub
parent 72489a04c3
commit e755d64e62
7 changed files with 117 additions and 30 deletions

View File

@ -1222,6 +1222,9 @@ DEFINE_mInt32(thrift_client_open_num_tries, "1");
DEFINE_mBool(ignore_schema_change_check, "false");
// Enables JVM monitoring. It is off by default to prevent BE from crashing due to JVM compatibility issues.
DEFINE_Bool(enable_jvm_monitor, "false");
// clang-format off
#ifdef BE_TEST
// test s3

View File

@ -1303,6 +1303,9 @@ DECLARE_mInt32(thrift_client_open_num_tries);
DECLARE_mBool(ignore_schema_change_check);
// Enables JVM monitoring. It can be turned off to prevent BE from crashing due to JVM compatibility issues.
DECLARE_Bool(enable_jvm_monitor);
#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);

View File

@ -17,10 +17,12 @@
#include "jvm_metrics.h"
#include <util/jni-util.h>
#include <functional>
#include "common/config.h"
#include "util/metrics.h"
namespace doris {
#define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \
@ -76,15 +78,28 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(jvm_gc_g1_old_generation_time_ms, MetricUni
const char* JvmMetrics::_s_hook_name = "jvm_metrics";
JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) {
JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) {
DCHECK(registry != nullptr);
_registry = registry;
_server_entity = _registry->register_entity("server");
DCHECK(_server_entity != nullptr);
if (_jvm_stats.init_complete()) {
do {
if (!doris::config::enable_jvm_monitor) {
break;
}
try {
_jvm_stats.init(env);
} catch (...) {
LOG(WARNING) << "JVM STATS INIT FAIL";
break;
}
if (!_jvm_stats.init_complete()) {
break;
}
_server_entity->register_hook(_s_hook_name, std::bind(&JvmMetrics::update, this));
}
} while (false);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_max);
INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_committed);
@ -117,11 +132,58 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env)
}
void JvmMetrics::update() {
_jvm_stats.refresh(this);
}
#include <util/jni-util.h>
static long fail_count = 0;
bool have_exception = false;
try {
_jvm_stats.refresh(this);
} catch (...) {
have_exception = true;
LOG(WARNING) << "JVM MONITOR UPDATE FAIL!";
fail_count++;
}
jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
// After 30 consecutive exceptions, turn off JVM information collection.
if (!have_exception) {
fail_count = 0;
}
if (fail_count >= 30) {
LOG(WARNING) << "JVM MONITOR CLOSE!";
_jvm_stats.set_complete(false);
_server_entity->deregister_hook(_s_hook_name);
jvm_heap_size_bytes_max->set_value(0);
jvm_heap_size_bytes_committed->set_value(0);
jvm_heap_size_bytes_used->set_value(0);
jvm_non_heap_size_bytes_used->set_value(0);
jvm_non_heap_size_bytes_committed->set_value(0);
jvm_young_size_bytes_used->set_value(0);
jvm_young_size_bytes_peak_used->set_value(0);
jvm_young_size_bytes_max->set_value(0);
jvm_old_size_bytes_used->set_value(0);
jvm_old_size_bytes_peak_used->set_value(0);
jvm_old_size_bytes_max->set_value(0);
jvm_thread_count->set_value(0);
jvm_thread_peak_count->set_value(0);
jvm_thread_new_count->set_value(0);
jvm_thread_runnable_count->set_value(0);
jvm_thread_blocked_count->set_value(0);
jvm_thread_waiting_count->set_value(0);
jvm_thread_timed_waiting_count->set_value(0);
jvm_thread_terminated_count->set_value(0);
jvm_gc_g1_young_generation_count->set_value(0);
jvm_gc_g1_young_generation_time_ms->set_value(0);
jvm_gc_g1_old_generation_count->set_value(0);
jvm_gc_g1_old_generation_time_ms->set_value(0);
}
}
void JvmStats::init(JNIEnv* ENV) {
env = ENV;
_managementFactoryClass = env->FindClass("java/lang/management/ManagementFactory");
if (_managementFactoryClass == nullptr) {
LOG(WARNING)
@ -244,15 +306,19 @@ jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) {
LOG(INFO) << "Start JVM monitoring.";
_init_complete = true;
return;
}
#include "jni.h"
void jvmStats::refresh(JvmMetrics* jvm_metrics) {
void JvmStats::refresh(JvmMetrics* jvm_metrics) {
if (!_init_complete) {
return;
}
static_cast<void>(JniUtil::GetJNIEnv(&env));
Status st = JniUtil::GetJNIEnv(&env);
if (!st.ok()) {
LOG(WARNING) << "JVM STATS GET JNI ENV FAIL";
return;
}
jobject memoryMXBeanObj =
env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod);
@ -302,8 +368,8 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
jstring name =
(jstring)env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod);
const char* nameStr = env->GetStringUTFChars(name, NULL);
if (nameStr != NULL) {
const char* nameStr = env->GetStringUTFChars(name, nullptr);
if (nameStr != nullptr) {
auto it = _memoryPoolName.find(nameStr);
if (it == _memoryPoolName.end()) {
continue;
@ -408,16 +474,22 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) {
env->DeleteLocalRef(threadMXBean);
env->DeleteLocalRef(gcMXBeansList);
}
jvmStats::~jvmStats() {
JvmStats::~JvmStats() {
if (!_init_complete) {
return;
}
env->DeleteLocalRef(_newThreadStateObj);
env->DeleteLocalRef(_runnableThreadStateObj);
env->DeleteLocalRef(_blockedThreadStateObj);
env->DeleteLocalRef(_waitingThreadStateObj);
env->DeleteLocalRef(_timedWaitingThreadStateObj);
env->DeleteLocalRef(_terminatedThreadStateObj);
try {
env->DeleteLocalRef(_newThreadStateObj);
env->DeleteLocalRef(_runnableThreadStateObj);
env->DeleteLocalRef(_blockedThreadStateObj);
env->DeleteLocalRef(_waitingThreadStateObj);
env->DeleteLocalRef(_timedWaitingThreadStateObj);
env->DeleteLocalRef(_terminatedThreadStateObj);
} catch (...) {
// When BE is killed, DeleteLocalRef may fail.
// To exit more gracefully, we catch the exception here.
}
}
} // namespace doris

View File

@ -17,8 +17,6 @@
#pragma once
#include <jni.h>
#include "jni.h"
#include "util/jni-util.h"
#include "util/metrics.h"
@ -27,7 +25,7 @@ namespace doris {
class JvmMetrics;
class jvmStats {
class JvmStats {
private:
JNIEnv* env = nullptr;
jclass _managementFactoryClass = nullptr;
@ -98,16 +96,18 @@ private:
bool _init_complete = false;
public:
jvmStats(JNIEnv* ENV);
bool init_complete() { return _init_complete; }
// JvmStats(JNIEnv* ENV);
void init(JNIEnv* ENV);
bool init_complete() const { return _init_complete; }
void set_complete(bool val) { _init_complete = val; }
void refresh(JvmMetrics* jvm_metrics);
~jvmStats();
~JvmStats();
};
class JvmMetrics {
public:
JvmMetrics(MetricRegistry* registry, JNIEnv* env);
~JvmMetrics() {}
~JvmMetrics() = default;
void update();
IntGauge* jvm_heap_size_bytes_max = nullptr;
@ -140,7 +140,7 @@ public:
IntGauge* jvm_gc_g1_old_generation_time_ms = nullptr;
private:
jvmStats _jvm_stats;
JvmStats _jvm_stats;
std::shared_ptr<MetricEntity> _server_entity;
static const char* _s_hook_name;
MetricRegistry* _registry = nullptr;

View File

@ -70,3 +70,6 @@ fragment_pool_thread_num_max=5000
enable_fuzzy_mode=true
enable_set_in_bitmap_value=true
enable_feature_binlog=true
enable_jvm_monitor = true

View File

@ -82,3 +82,6 @@ user_files_secure_path=/
enable_debug_points=true
# debug scanner context dead loop
enable_debug_log_timeout_secs=0
enable_jvm_monitor = true

View File

@ -71,4 +71,7 @@ fragment_pool_thread_num_max=5000
enable_fuzzy_mode=true
enable_set_in_bitmap_value=true
enable_feature_binlog=true
max_sys_mem_available_low_water_mark_bytes=69206016
max_sys_mem_available_low_water_mark_bytes=69206016
enable_jvm_monitor = true