diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index 1cdb50a563..d78c0588ad 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -88,14 +88,26 @@ struct CpuMetrics { #define DEFINE_MEMORY_GAUGE_METRIC(metric, unit) \ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(memory_##metric, unit); DEFINE_MEMORY_GAUGE_METRIC(allocated_bytes, MetricUnit::BYTES); +DEFINE_MEMORY_GAUGE_METRIC(pgpgin, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(pgpgout, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(pswpin, MetricUnit::NOUNIT); +DEFINE_MEMORY_GAUGE_METRIC(pswpout, MetricUnit::NOUNIT); struct MemoryMetrics { MemoryMetrics(MetricEntity* ent) : entity(ent) { INT_GAUGE_METRIC_REGISTER(entity, memory_allocated_bytes); + INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgin); + INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgout); + INT_GAUGE_METRIC_REGISTER(entity, memory_pswpin); + INT_GAUGE_METRIC_REGISTER(entity, memory_pswpout); } MetricEntity* entity = nullptr; IntGauge* memory_allocated_bytes; + IntGauge* memory_pgpgin; + IntGauge* memory_pgpgout; + IntGauge* memory_pswpin; + IntGauge* memory_pswpout; }; #define DEFINE_DISK_COUNTER_METRIC(metric, unit) \ @@ -214,6 +226,30 @@ struct LoadAverageMetrics { DoubleGauge* load_average_15_minutes; }; +#define DEFINE_PROC_STAT_COUNTER_METRIC(metric) \ + DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(proc_##metric, MetricUnit::NOUNIT, "", proc, \ + Labels({{"mode", #metric}})); +DEFINE_PROC_STAT_COUNTER_METRIC(interrupt); +DEFINE_PROC_STAT_COUNTER_METRIC(ctxt_switch); +DEFINE_PROC_STAT_COUNTER_METRIC(procs_running); +DEFINE_PROC_STAT_COUNTER_METRIC(procs_blocked); + +struct ProcMetrics { + ProcMetrics(MetricEntity* ent) : entity(ent) { + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_interrupt); + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_ctxt_switch); + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_procs_running); + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, proc_procs_blocked); + } + + 
MetricEntity* entity = nullptr; + + IntAtomicCounter* proc_interrupt; + IntAtomicCounter* proc_ctxt_switch; + IntAtomicCounter* proc_procs_running; + IntAtomicCounter* proc_procs_blocked; +}; + const char* SystemMetrics::_s_hook_name = "system_metrics"; SystemMetrics::SystemMetrics(MetricRegistry* registry, const std::set& disk_devices, @@ -223,19 +259,23 @@ SystemMetrics::SystemMetrics(MetricRegistry* registry, const std::setregister_entity("server"); DCHECK(_server_entity != nullptr); _server_entity->register_hook(_s_hook_name, std::bind(&SystemMetrics::update, this)); - _install_cpu_metrics(_server_entity.get()); + _install_cpu_metrics(); _install_memory_metrics(_server_entity.get()); _install_disk_metrics(disk_devices); _install_net_metrics(network_interfaces); _install_fd_metrics(_server_entity.get()); _install_snmp_metrics(_server_entity.get()); _install_load_avg_metrics(_server_entity.get()); + _install_proc_metrics(_server_entity.get()); } SystemMetrics::~SystemMetrics() { DCHECK(_server_entity != nullptr); _server_entity->deregister_hook(_s_hook_name); + for (auto& it : _cpu_metrics) { + delete it.second; + } for (auto& it : _disk_metrics) { delete it.second; } @@ -255,10 +295,16 @@ void SystemMetrics::update() { _update_fd_metrics(); _update_snmp_metrics(); _update_load_avg_metrics(); + _update_proc_metrics(); } -void SystemMetrics::_install_cpu_metrics(MetricEntity* entity) { - _cpu_metrics.reset(new CpuMetrics(entity)); +void SystemMetrics::_install_cpu_metrics() { + get_cpu_name(); + for (auto cpu_name : _cpu_names) { + auto cpu_entity = _registry->register_entity(cpu_name, {{"device", cpu_name}}); + CpuMetrics* metrics = new CpuMetrics(cpu_entity.get()); + _cpu_metrics.emplace(cpu_name, metrics); + } } #ifdef BE_TEST @@ -268,6 +314,7 @@ const char* k_ut_net_dev_path; const char* k_ut_fd_path; const char* k_ut_net_snmp_path; const char* k_ut_load_avg_path; +const char* k_ut_vmstat_path; #endif void SystemMetrics::_update_cpu_metrics() { @@ -283,26 
+330,35 @@ void SystemMetrics::_update_cpu_metrics() { return; } - if (getline(&_line_ptr, &_line_buf_size, fp) < 0) { + while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { + char cpu[16]; + int64_t values[CpuMetrics::cpu_num_metrics]; + memset(values, 0, sizeof(values)); + int num = sscanf(_line_ptr, + "%15s" + " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 + " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, + cpu, &values[0], &values[1], &values[2], &values[3], &values[4], + &values[5], &values[6], &values[7], &values[8], &values[9]); + if (num < 4) { + continue; + } + + std::string cpu_name(cpu); + auto it = _cpu_metrics.find(cpu_name); + if (it == _cpu_metrics.end()) { + continue; + } + + for (int i = 0; i < CpuMetrics::cpu_num_metrics; ++i) { + it->second->metrics[i]->set_value(values[i]); + } + } + + if (ferror(fp) != 0) { char buf[64]; LOG(WARNING) << "getline failed, errno=" << errno << ", message=" << strerror_r(errno, buf, 64); - fclose(fp); - return; - } - - char cpu[16]; - int64_t values[CpuMetrics::cpu_num_metrics]; - memset(values, 0, sizeof(values)); - sscanf(_line_ptr, - "%15s" - " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 - " %" PRId64 " %" PRId64 " %" PRId64, - cpu, &values[0], &values[1], &values[2], &values[3], &values[4], &values[5], &values[6], - &values[7], &values[8], &values[9]); - - for (int i = 0; i < CpuMetrics::cpu_num_metrics; ++i) { - _cpu_metrics->metrics[i]->set_value(values[i]); } fclose(fp); @@ -314,6 +370,7 @@ void SystemMetrics::_install_memory_metrics(MetricEntity* entity) { void SystemMetrics::_update_memory_metrics() { _memory_metrics->memory_allocated_bytes->set_value(MemInfo::current_mem()); + get_metrics_from_proc_vmstat(); } void SystemMetrics::_install_disk_metrics(const std::set& disk_devices) { @@ -718,4 +775,133 @@ void SystemMetrics::get_max_net_traffic(const std::map& ls *send_rate = max_send / interval_sec; *rcv_rate = max_rcv / interval_sec; } + 
+// Installs the ProcMetrics holder, which registers the /proc/stat derived
+// interrupt, context-switch and process metrics on `entity`.
+void SystemMetrics::_install_proc_metrics(MetricEntity* entity) {
+    _proc_metrics.reset(new ProcMetrics(entity));
+}
+
+// Refreshes the intr / ctxt / procs_running / procs_blocked samples from
+// /proc/stat (k_ut_stat_path under BE_TEST). A metric keeps its previous
+// value when the file cannot be opened or its line does not parse.
+void SystemMetrics::_update_proc_metrics() {
+#ifdef BE_TEST
+    FILE* fp = fopen(k_ut_stat_path, "r");
+#else
+    FILE* fp = fopen("/proc/stat", "r");
+#endif
+    if (fp == nullptr) {
+        char buf[64];
+        LOG(WARNING) << "open /proc/stat failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+        return;
+    }
+
+    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
+        // Each key is matched at the start of the line, and a sample is only
+        // published when its number was actually parsed. SCNu64 is the scanf
+        // conversion that matches uint64_t regardless of platform.
+        uint64_t value = 0;
+        if (sscanf(_line_ptr, "intr %" SCNu64, &value) == 1) {
+            _proc_metrics->proc_interrupt->set_value(value);
+        } else if (sscanf(_line_ptr, "ctxt %" SCNu64, &value) == 1) {
+            _proc_metrics->proc_ctxt_switch->set_value(value);
+        } else if (sscanf(_line_ptr, "procs_running %" SCNu64, &value) == 1) {
+            _proc_metrics->proc_procs_running->set_value(value);
+        } else if (sscanf(_line_ptr, "procs_blocked %" SCNu64, &value) == 1) {
+            _proc_metrics->proc_procs_blocked->set_value(value);
+        }
+    }
+
+    if (ferror(fp) != 0) {
+        char buf[64];
+        LOG(WARNING) << "getline failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+    }
+
+    fclose(fp);
+}
+
+// Refreshes the pgpgin / pgpgout / pswpin / pswpout memory gauges from
+// /proc/vmstat (k_ut_vmstat_path under BE_TEST).
+void SystemMetrics::get_metrics_from_proc_vmstat() {
+#ifdef BE_TEST
+    FILE* fp = fopen(k_ut_vmstat_path, "r");
+#else
+    FILE* fp = fopen("/proc/vmstat", "r");
+#endif
+    if (fp == nullptr) {
+        char buf[64];
+        LOG(WARNING) << "open /proc/vmstat failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+        return;
+    }
+
+    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
+        uint64_t value = 0;
+        char name[64];
+        // The field width keeps an over-long key from overrunning `name`.
+        int num = sscanf(_line_ptr, "%63s %" SCNu64, name, &value);
+        if (num < 2) {
+            continue;
+        }
+
+        if (strcmp(name, "pgpgin") == 0) {
+            _memory_metrics->memory_pgpgin->set_value(value);
+        } else if (strcmp(name, "pgpgout") == 0) {
+            _memory_metrics->memory_pgpgout->set_value(value);
+        } else if (strcmp(name, "pswpin") == 0) {
+            _memory_metrics->memory_pswpin->set_value(value);
+        } else if (strcmp(name, "pswpout") == 0) {
+            _memory_metrics->memory_pswpout->set_value(value);
+        }
+    }
+
+    if (ferror(fp) != 0) {
+        char buf[64];
+        LOG(WARNING) << "getline failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+    }
+
+    fclose(fp);
+}
+
+// Appends to _cpu_names the first token of every /proc/stat line that starts
+// with "cpu" (k_ut_stat_path under BE_TEST).
+void SystemMetrics::get_cpu_name() {
+#ifdef BE_TEST
+    FILE* fp = fopen(k_ut_stat_path, "r");
+#else
+    FILE* fp = fopen("/proc/stat", "r");
+#endif
+    if (fp == nullptr) {
+        char buf[64];
+        LOG(WARNING) << "open /proc/stat failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+        return;
+    }
+
+    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
+        // Require the "cpu" prefix: a match elsewhere in the line would
+        // register that line's unrelated first token as a CPU name.
+        if (strncmp(_line_ptr, "cpu", 3) != 0) {
+            continue;
+        }
+        char cpu[16];
+        if (sscanf(_line_ptr, "%15s", cpu) != 1) {
+            continue;
+        }
+        _cpu_names.emplace_back(cpu);
+    }
+
+    if (ferror(fp) != 0) {
+        char buf[64];
+        LOG(WARNING) << "getline failed, errno=" << errno
+                     << ", message=" << strerror_r(errno, buf, 64);
+    }
+
+    fclose(fp);
+}
+
 } // namespace doris
diff --git a/be/src/util/system_metrics.h b/be/src/util/system_metrics.h
index 67225657f8..903588602d 100644
--- a/be/src/util/system_metrics.h
+++ b/be/src/util/system_metrics.h
@@ -31,6 +31,7 @@ struct NetworkMetrics;
 struct FileDescriptorMetrics;
 struct SnmpMetrics;
 struct LoadAverageMetrics;
+struct ProcMetrics;
 
 class SystemMetrics {
 public:
@@ -51,7 +52,7 @@ public:
                              int64_t interval_sec, int64_t* send_rate, int64_t* rcv_rate);
 
 private:
-    void _install_cpu_metrics(MetricEntity* entity);
+    void _install_cpu_metrics();
     // On Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz;
     // read /proc/stat would cost about 170us
     void _update_cpu_metrics();
@@ -74,10 +75,16 @@ private:
     void 
_install_load_avg_metrics(MetricEntity* entity); void _update_load_avg_metrics(); + void _install_proc_metrics(MetricEntity* entity); + void _update_proc_metrics(); + + void get_metrics_from_proc_vmstat(); + void get_cpu_name(); + private: static const char* _s_hook_name; - std::unique_ptr _cpu_metrics; + std::map _cpu_metrics; std::unique_ptr _memory_metrics; std::map _disk_metrics; std::map _network_metrics; @@ -85,7 +92,9 @@ private: std::unique_ptr _load_average_metrics; int _proc_net_dev_version = 0; std::unique_ptr _snmp_metrics; + std::unique_ptr _proc_metrics; + std::vector _cpu_names; char* _line_ptr = nullptr; size_t _line_buf_size = 0; MetricRegistry* _registry = nullptr; diff --git a/be/test/util/system_metrics_test.cpp b/be/test/util/system_metrics_test.cpp index 755cacb7e4..23ac0e1a46 100644 --- a/be/test/util/system_metrics_test.cpp +++ b/be/test/util/system_metrics_test.cpp @@ -39,6 +39,7 @@ extern const char* k_ut_net_dev_path; extern const char* k_ut_fd_path; extern const char* k_ut_net_snmp_path; extern const char* k_ut_load_avg_path; +extern const char* k_ut_vmstat_path; TEST_F(SystemMetricsTest, normal) { std::string dir_path = GetCurrentRunningDir(); @@ -61,6 +62,9 @@ TEST_F(SystemMetricsTest, normal) { std::string load_avg_path(dir_path); load_avg_path += "/util/test_data/load_avg_normal"; k_ut_load_avg_path = load_avg_path.c_str(); + std::string vmstat_path(dir_path); + vmstat_path += "/util/test_data/vmstat_normal"; + k_ut_vmstat_path = vmstat_path.c_str(); MetricRegistry registry("test"); { @@ -75,39 +79,55 @@ TEST_F(SystemMetricsTest, normal) { metrics.update(); // cpu - Metric* cpu_user = entity->get_metric("cpu_user", "cpu"); + auto cpu_entity = registry.get_entity("cpu", {{"device", "cpu"}}); + EXPECT_TRUE(cpu_entity != nullptr); + EXPECT_TRUE("cpu" == cpu_entity->name()); + Metric* cpu_user = cpu_entity->get_metric("cpu_user", "cpu"); EXPECT_TRUE(cpu_user != nullptr); - // EXPECT_STREQ("57199151", cpu_user->to_string().c_str()); - 
Metric* cpu_nice = entity->get_metric("cpu_nice", "cpu"); + EXPECT_STREQ("57199151", cpu_user->to_string().c_str()); + Metric* cpu_nice = cpu_entity->get_metric("cpu_nice", "cpu"); EXPECT_TRUE(cpu_nice != nullptr); EXPECT_STREQ("2616310", cpu_nice->to_string().c_str()); - Metric* cpu_system = entity->get_metric("cpu_system", "cpu"); + Metric* cpu_system = cpu_entity->get_metric("cpu_system", "cpu"); EXPECT_TRUE(cpu_system != nullptr); EXPECT_STREQ("10600935", cpu_system->to_string().c_str()); - Metric* cpu_idle = entity->get_metric("cpu_idle", "cpu"); + Metric* cpu_idle = cpu_entity->get_metric("cpu_idle", "cpu"); EXPECT_TRUE(cpu_idle != nullptr); EXPECT_STREQ("1517505423", cpu_idle->to_string().c_str()); - Metric* cpu_iowait = entity->get_metric("cpu_iowait", "cpu"); + Metric* cpu_iowait = cpu_entity->get_metric("cpu_iowait", "cpu"); EXPECT_TRUE(cpu_iowait != nullptr); EXPECT_STREQ("2137148", cpu_iowait->to_string().c_str()); - Metric* cpu_irq = entity->get_metric("cpu_irq", "cpu"); + Metric* cpu_irq = cpu_entity->get_metric("cpu_irq", "cpu"); EXPECT_TRUE(cpu_irq != nullptr); EXPECT_STREQ("0", cpu_irq->to_string().c_str()); - Metric* cpu_softirq = entity->get_metric("cpu_soft_irq", "cpu"); + Metric* cpu_softirq = cpu_entity->get_metric("cpu_soft_irq", "cpu"); EXPECT_TRUE(cpu_softirq != nullptr); EXPECT_STREQ("108277", cpu_softirq->to_string().c_str()); - Metric* cpu_steal = entity->get_metric("cpu_steal", "cpu"); + Metric* cpu_steal = cpu_entity->get_metric("cpu_steal", "cpu"); EXPECT_TRUE(cpu_steal != nullptr); EXPECT_STREQ("0", cpu_steal->to_string().c_str()); - Metric* cpu_guest = entity->get_metric("cpu_guest", "cpu"); + Metric* cpu_guest = cpu_entity->get_metric("cpu_guest", "cpu"); EXPECT_TRUE(cpu_guest != nullptr); EXPECT_STREQ("0", cpu_guest->to_string().c_str()); - Metric* cpu_guest_nice = entity->get_metric("cpu_guest_nice", "cpu"); + Metric* cpu_guest_nice = cpu_entity->get_metric("cpu_guest_nice", "cpu"); EXPECT_TRUE(cpu_guest_nice != nullptr); 
EXPECT_STREQ("0", cpu_guest_nice->to_string().c_str()); + // memroy Metric* memory_allocated_bytes = entity->get_metric("memory_allocated_bytes"); EXPECT_TRUE(memory_allocated_bytes != nullptr); + Metric* memory_pgpgin = entity->get_metric("memory_pgpgin"); + EXPECT_TRUE(memory_pgpgin != nullptr); + EXPECT_STREQ("21458611100", memory_pgpgin->to_string().c_str()); + Metric* memory_pgpgout = entity->get_metric("memory_pgpgout"); + EXPECT_TRUE(memory_pgpgout != nullptr); + EXPECT_STREQ("149080494692", memory_pgpgout->to_string().c_str()); + Metric* memory_pswpin = entity->get_metric("memory_pswpin"); + EXPECT_TRUE(memory_pswpin != nullptr); + EXPECT_STREQ("167785", memory_pswpin->to_string().c_str()); + Metric* memory_pswpout = entity->get_metric("memory_pswpout"); + EXPECT_TRUE(memory_pswpout != nullptr); + EXPECT_STREQ("203724", memory_pswpout->to_string().c_str()); // network auto net_entity = registry.get_entity("network_metrics.xgbe0", {{"device", "xgbe0"}}); @@ -184,6 +204,20 @@ TEST_F(SystemMetricsTest, normal) { entity->get_metric("load_average_15_minutes", "load_average"); EXPECT_TRUE(fd_metric != nullptr); EXPECT_STREQ("2.020000", load_average_15_minutes->to_string().c_str()); + + // proc + Metric* proc_interrupt = entity->get_metric("proc_interrupt", "proc"); + EXPECT_TRUE(proc_interrupt != nullptr); + EXPECT_STREQ("20935913098", proc_interrupt->to_string().c_str()); + Metric* proc_ctxt_switch = entity->get_metric("proc_ctxt_switch", "proc"); + EXPECT_TRUE(proc_ctxt_switch != nullptr); + EXPECT_STREQ("11043516832", proc_ctxt_switch->to_string().c_str()); + Metric* proc_procs_running = entity->get_metric("proc_procs_running", "proc"); + EXPECT_TRUE(proc_procs_running != nullptr); + EXPECT_STREQ("1", proc_procs_running->to_string().c_str()); + Metric* proc_procs_blocked = entity->get_metric("proc_procs_blocked", "proc"); + EXPECT_TRUE(proc_procs_blocked != nullptr); + EXPECT_STREQ("0", proc_procs_blocked->to_string().c_str()); } } @@ -201,6 +235,9 @@ 
TEST_F(SystemMetricsTest, no_proc_file) { k_ut_net_dev_path = net_dev_path.c_str(); k_ut_fd_path = ""; k_ut_net_snmp_path = ""; + std::string vmstat_path(dir_path); + vmstat_path += "/util/test_data/no_vmstat_normal"; + k_ut_vmstat_path = vmstat_path.c_str(); MetricRegistry registry("test"); { @@ -214,24 +251,52 @@ TEST_F(SystemMetricsTest, no_proc_file) { EXPECT_TRUE(entity != nullptr); // cpu - Metric* cpu_user = entity->get_metric("cpu_user", "cpu"); - EXPECT_TRUE(cpu_user != nullptr); - EXPECT_STREQ("0", cpu_user->to_string().c_str()); + auto cpu_entity = registry.get_entity("cpu", {{"device", "cpu"}}); + EXPECT_TRUE(cpu_entity == nullptr); + // memroy Metric* memory_allocated_bytes = entity->get_metric("memory_allocated_bytes"); EXPECT_TRUE(memory_allocated_bytes != nullptr); + Metric* memory_pgpgin = entity->get_metric("memory_pgpgin"); + EXPECT_TRUE(memory_pgpgin != nullptr); + EXPECT_STREQ("0", memory_pgpgin->to_string().c_str()); + Metric* memory_pgpgout = entity->get_metric("memory_pgpgout"); + EXPECT_TRUE(memory_pgpgout != nullptr); + EXPECT_STREQ("0", memory_pgpgout->to_string().c_str()); + Metric* memory_pswpin = entity->get_metric("memory_pswpin"); + EXPECT_TRUE(memory_pswpin != nullptr); + EXPECT_STREQ("0", memory_pswpin->to_string().c_str()); + Metric* memory_pswpout = entity->get_metric("memory_pswpout"); + EXPECT_TRUE(memory_pswpout != nullptr); + EXPECT_STREQ("0", memory_pswpout->to_string().c_str()); + // network auto net_entity = registry.get_entity("network_metrics.xgbe0", {{"device", "xgbe0"}}); EXPECT_TRUE(net_entity != nullptr); Metric* receive_bytes = net_entity->get_metric("network_receive_bytes"); EXPECT_TRUE(receive_bytes != nullptr); EXPECT_STREQ("0", receive_bytes->to_string().c_str()); + // disk auto disk_entity = registry.get_entity("disk_metrics.sda", {{"device", "sda"}}); EXPECT_TRUE(disk_entity != nullptr); Metric* bytes_read = disk_entity->get_metric("disk_bytes_read"); EXPECT_TRUE(bytes_read != nullptr); EXPECT_STREQ("0", 
bytes_read->to_string().c_str()); + + // proc + Metric* proc_interrupt = entity->get_metric("proc_interrupt", "proc"); + EXPECT_TRUE(proc_interrupt != nullptr); + EXPECT_STREQ("0", proc_interrupt->to_string().c_str()); + Metric* proc_ctxt_switch = entity->get_metric("proc_ctxt_switch", "proc"); + EXPECT_TRUE(proc_ctxt_switch != nullptr); + EXPECT_STREQ("0", proc_ctxt_switch->to_string().c_str()); + Metric* proc_procs_running = entity->get_metric("proc_procs_running", "proc"); + EXPECT_TRUE(proc_procs_running != nullptr); + EXPECT_STREQ("0", proc_procs_running->to_string().c_str()); + Metric* proc_procs_blocked = entity->get_metric("proc_procs_blocked", "proc"); + EXPECT_TRUE(proc_procs_blocked != nullptr); + EXPECT_STREQ("0", proc_procs_blocked->to_string().c_str()); } } diff --git a/be/test/util/test_data/vmstat_normal b/be/test/util/test_data/vmstat_normal new file mode 100644 index 0000000000..e7e528ee69 --- /dev/null +++ b/be/test/util/test_data/vmstat_normal @@ -0,0 +1,12 @@ +nr_free_pages 4631912 +nr_zone_inactive_anon 2399241 +nr_zone_active_anon 62962233 +nr_zone_inactive_file 14358369 +nr_zone_active_file 11459904 +nr_zone_unevictable 0 +pgpgin 21458611100 +pgpgout 149080494692 +pswpin 167785 +pswpout 203724 +swap_ra 3521 +swap_ra_hit 2219 \ No newline at end of file