[Metrics] Add metrics document and 2 new metrics of TCP (#3835)
This commit is contained in:
@ -66,6 +66,14 @@ struct NetMetrics {
|
||||
METRIC_DEFINE_INT_LOCK_COUNTER(send_packets, MetricUnit::NUMBER);
|
||||
};
|
||||
|
||||
// metrics read from /proc/net/snmp
// Both counters below are filled from the "Tcp:" value line of that file.
|
||||
struct SnmpMetrics {
|
||||
// The number of all problematic TCP packets received
// (the InErrs column of the Tcp line).
|
||||
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NUMBER);
|
||||
// All TCP packets retransmitted
// (the RetransSegs column of the Tcp line).
|
||||
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NUMBER);
|
||||
};
|
||||
|
||||
struct FileDescriptorMetrics {
|
||||
METRIC_DEFINE_INT_GAUGE(fd_num_limit, MetricUnit::NUMBER);
|
||||
METRIC_DEFINE_INT_GAUGE(fd_num_used, MetricUnit::NUMBER);
|
||||
@ -103,6 +111,7 @@ void SystemMetrics::install(MetricRegistry* registry,
|
||||
_install_disk_metrics(registry, disk_devices);
|
||||
_install_net_metrics(registry, network_interfaces);
|
||||
_install_fd_metrics(registry);
|
||||
_install_snmp_metrics(registry);
|
||||
_registry = registry;
|
||||
}
|
||||
|
||||
@ -112,6 +121,7 @@ void SystemMetrics::update() {
|
||||
_update_disk_metrics();
|
||||
_update_net_metrics();
|
||||
_update_fd_metrics();
|
||||
_update_snmp_metrics();
|
||||
}
|
||||
|
||||
void SystemMetrics::_install_cpu_metrics(MetricRegistry* registry) {
|
||||
@ -129,6 +139,7 @@ const char* k_ut_stat_path;
|
||||
const char* k_ut_diskstats_path;
|
||||
const char* k_ut_net_dev_path;
|
||||
const char* k_ut_fd_path;
|
||||
const char* k_ut_net_snmp_path;
|
||||
#endif
|
||||
|
||||
void SystemMetrics::_update_cpu_metrics() {
|
||||
@ -304,6 +315,16 @@ void SystemMetrics::_install_net_metrics(MetricRegistry* registry,
|
||||
}
|
||||
}
|
||||
|
||||
// Creates the SnmpMetrics holder and registers its two TCP counters with
// `registry`. Both counters are registered under the metric name "snmp" and are
// told apart by their "name" label ("tcp_in_errs" / "tcp_retrans_segs").
//
// The two registrations are written out explicitly rather than through a
// function-local helper macro, so that no macro stays defined for the rest of
// the translation unit after this function.
void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) {
    _snmp_metrics.reset(new SnmpMetrics());

    registry->register_metric("snmp",
                              MetricLabels().add("name", "tcp_in_errs"),
                              &_snmp_metrics->tcp_in_errs);
    registry->register_metric("snmp",
                              MetricLabels().add("name", "tcp_retrans_segs"),
                              &_snmp_metrics->tcp_retrans_segs);
}
|
||||
|
||||
void SystemMetrics::_update_net_metrics() {
|
||||
#ifdef BE_TEST
|
||||
// to mock proc
|
||||
@ -399,6 +420,65 @@ void SystemMetrics::_update_net_metrics() {
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void SystemMetrics::_update_snmp_metrics() {
|
||||
#ifdef BE_TEST
|
||||
// to mock proc
|
||||
FILE* fp = fopen(k_ut_net_snmp_path, "r");
|
||||
#else
|
||||
FILE* fp = fopen("/proc/net/snmp", "r");
|
||||
#endif
|
||||
if (fp == nullptr) {
|
||||
char buf[64];
|
||||
LOG(WARNING) << "open /proc/net/snmp failed, errno=" << errno
|
||||
<< ", message=" << strerror_r(errno, buf, 64);
|
||||
return;
|
||||
}
|
||||
|
||||
// We only care about Tcp lines, so skip other lines in front of Tcp line
|
||||
int res = 0;
|
||||
while ((res = getline(&_line_ptr, &_line_buf_size, fp)) > 0) {
|
||||
if (strstr(_line_ptr, "Tcp") != nullptr) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (res <= 0) {
|
||||
char buf[64];
|
||||
LOG(WARNING) << "failed to skip lines of /proc/net/snmp, errno=" << errno
|
||||
<< ", message=" << strerror_r(errno, buf, 64);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
|
||||
// skip the Tcp header line
|
||||
// Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
|
||||
if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
|
||||
char buf[64];
|
||||
LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
|
||||
<< ", message=" << strerror_r(errno, buf, 64);
|
||||
fclose(fp);
|
||||
return;
|
||||
}
|
||||
|
||||
// metric line looks like:
|
||||
// Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
|
||||
int64_t retrans_segs = 0;
|
||||
int64_t in_errs = 0;
|
||||
sscanf(_line_ptr,
|
||||
"Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"
|
||||
" %" PRId64 " %" PRId64 " %*d %*d",
|
||||
&retrans_segs, &in_errs);
|
||||
|
||||
_snmp_metrics->tcp_retrans_segs.set_value(retrans_segs);
|
||||
_snmp_metrics->tcp_in_errs.set_value(in_errs);
|
||||
|
||||
if (ferror(fp) != 0) {
|
||||
char buf[64];
|
||||
LOG(WARNING) << "getline failed, errno=" << errno
|
||||
<< ", message=" << strerror_r(errno, buf, 64);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void SystemMetrics::_install_fd_metrics(MetricRegistry* registry) {
|
||||
_fd_metrics.reset(new FileDescriptorMetrics());
|
||||
registry->register_metric("fd_num_limit", &_fd_metrics->fd_num_limit);
|
||||
|
||||
@ -28,6 +28,7 @@ class MemoryMetrics;
|
||||
class DiskMetrics;
|
||||
class NetMetrics;
|
||||
class FileDescriptorMetrics;
|
||||
class SnmpMetrics;
|
||||
|
||||
class SystemMetrics {
|
||||
public:
|
||||
@ -76,6 +77,9 @@ private:
|
||||
|
||||
void _update_fd_metrics();
|
||||
|
||||
void _install_snmp_metrics(MetricRegistry* registry);
|
||||
void _update_snmp_metrics();
|
||||
|
||||
private:
|
||||
static const char* _s_hook_name;
|
||||
|
||||
@ -85,6 +89,7 @@ private:
|
||||
std::map<std::string, NetMetrics*> _net_metrics;
|
||||
std::unique_ptr<FileDescriptorMetrics> _fd_metrics;
|
||||
int _proc_net_dev_version = 0;
|
||||
std::unique_ptr<SnmpMetrics> _snmp_metrics;
|
||||
|
||||
char* _line_ptr = nullptr;
|
||||
size_t _line_buf_size = 0;
|
||||
|
||||
@ -84,6 +84,7 @@ extern const char* k_ut_stat_path;
|
||||
extern const char* k_ut_diskstats_path;
|
||||
extern const char* k_ut_net_dev_path;
|
||||
extern const char* k_ut_fd_path;
|
||||
extern const char* k_ut_net_snmp_path;
|
||||
|
||||
TEST_F(SystemMetricsTest, normal) {
|
||||
MetricRegistry registry("test");
|
||||
@ -104,6 +105,9 @@ TEST_F(SystemMetricsTest, normal) {
|
||||
std::string fd_path(dir_path);
|
||||
fd_path += "/test_data/fd_file_nr";
|
||||
k_ut_fd_path = fd_path.c_str();
|
||||
std::string net_snmp_path(dir_path);
|
||||
net_snmp_path += "/test_data/net_snmp_normal";
|
||||
k_ut_net_snmp_path = net_snmp_path.c_str();
|
||||
|
||||
std::set<std::string> disk_devices;
|
||||
disk_devices.emplace("sda");
|
||||
@ -219,6 +223,14 @@ TEST_F(SystemMetricsTest, normal) {
|
||||
"fd_num_used");
|
||||
ASSERT_TRUE(fd_metric != nullptr);
|
||||
ASSERT_STREQ("19520", fd_metric->to_string().c_str());
|
||||
|
||||
// net snmp
|
||||
Metric* tcp_retrans_segs = registry.get_metric("snmp", MetricLabels().add("name", "tcp_retrans_segs"));
|
||||
ASSERT_TRUE(tcp_retrans_segs != nullptr);
|
||||
Metric* tcp_in_errs = registry.get_metric("snmp", MetricLabels().add("name","tcp_in_errs"));
|
||||
ASSERT_TRUE(tcp_in_errs != nullptr);
|
||||
ASSERT_STREQ("826271", tcp_retrans_segs->to_string().c_str());
|
||||
ASSERT_STREQ("12712", tcp_in_errs->to_string().c_str());
|
||||
}
|
||||
{
|
||||
TestMetricsVisitor visitor;
|
||||
|
||||
12
be/test/util/test_data/net_snmp_normal
Normal file
12
be/test/util/test_data/net_snmp_normal
Normal file
@ -0,0 +1,12 @@
|
||||
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates
|
||||
Ip: 1 64 1049877820 0 0 0 0 0 1049877596 1052780427 0 1317 0 0 0 0 0 0 0
|
||||
Icmp: InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps
|
||||
Icmp: 1142563 126992 0 198790 26 0 0 0 153700 790046 1 0 0 0 1174563 0 198734 0 0 0 0 822128 153700 0 1 0 0
|
||||
IcmpMsg: InType0 InType3 InType8 InType11 InType13 OutType0 OutType3 OutType8 OutType14
|
||||
IcmpMsg: 790046 198790 153700 26 1 153700 198734 822128 1
|
||||
Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
|
||||
Tcp: 1 200 120000 -1 47884867 38628916 3356043 2323781 278 1034019111 1166716939 826271 12712 23260066 0
|
||||
Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
|
||||
Udp: 14706122 9772 0 14917947 0 0 0
|
||||
UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
|
||||
UdpLite: 0 0 0 0 0 0 0
|
||||
@ -89,6 +89,14 @@ module.exports = [
|
||||
"multi-tenant",
|
||||
"tablet-meta-tool",
|
||||
"tablet-repair-and-balance",
|
||||
{
|
||||
title: "Metrics",
|
||||
directoryPath: "monitor-metrics/",
|
||||
children: [
|
||||
"fe-metrics",
|
||||
"be-metrics",
|
||||
],
|
||||
},
|
||||
],
|
||||
sidebarDepth: 2,
|
||||
},
|
||||
|
||||
@ -98,6 +98,14 @@ module.exports = [
|
||||
"tablet-meta-tool",
|
||||
"tablet-repair-and-balance",
|
||||
"tablet-restore-tool",
|
||||
{
|
||||
title: "监控项",
|
||||
directoryPath: "monitor-metrics/",
|
||||
children: [
|
||||
"fe-metrics",
|
||||
"be-metrics",
|
||||
],
|
||||
},
|
||||
],
|
||||
sidebarDepth: 2,
|
||||
},
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
---
|
||||
{
|
||||
"title": "BE Metrics",
|
||||
"language": "en"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<!-- Please sort the metrics alphabetically -->
|
||||
|
||||
# BE Metrics
|
||||
|
||||
This document mainly introduces the monitor metrics of BE.
|
||||
|
||||
## View Metrics
|
||||
|
||||
BE metrics can be viewed by visiting:
|
||||
|
||||
`http://be_host:be_webserver_port/metrics`
|
||||
|
||||
By default, the metrics are returned in [Prometheus](https://prometheus.io/) format.
|
||||
|
||||
To get the metrics in JSON format, visit:
|
||||
|
||||
`http://be_host:be_webserver_port/metrics?type=agent`
|
||||
|
||||
## Metrics List
|
||||
|
||||
### `doris_be_snmp{name="tcp_in_errs"}`
|
||||
|
||||
Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the number of erroneous TCP packets received so far.
|
||||
|
||||
The incidence rate can be calculated in combination with the sampling period.
|
||||
|
||||
Usually used to troubleshoot network problems.
|
||||
|
||||
### `doris_be_snmp{name="tcp_retrans_segs"}`
|
||||
|
||||
Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number of TCP packets retransmitted so far.
|
||||
|
||||
The incidence rate can be calculated in combination with the sampling period.
|
||||
|
||||
Usually used to troubleshoot network problems.
|
||||
@ -0,0 +1,61 @@
|
||||
---
|
||||
{
|
||||
"title": "FE Metrics",
|
||||
"language": "en"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<!-- Please sort the metrics alphabetically -->
|
||||
|
||||
# FE Metrics
|
||||
|
||||
This document mainly introduces the monitor metrics of FE.
|
||||
|
||||
## View Metrics
|
||||
|
||||
FE metrics can be viewed by visiting:
|
||||
|
||||
`http://fe_host:fe_http_port/metrics`
|
||||
|
||||
By default, the metrics are returned in [Prometheus](https://prometheus.io/) format.
|
||||
|
||||
To get the metrics in JSON format, visit:
|
||||
|
||||
`http://fe_host:fe_http_port/metrics?type=agent`
|
||||
|
||||
## Metrics List
|
||||
|
||||
### `doris_fe_snmp{name="tcp_in_errs"}`
|
||||
|
||||
Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the number of erroneous TCP packets received so far.
|
||||
|
||||
The incidence rate can be calculated in combination with the sampling period.
|
||||
|
||||
Usually used to troubleshoot network problems.
|
||||
|
||||
### `doris_fe_snmp{name="tcp_retrans_segs"}`
|
||||
|
||||
Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number of TCP packets retransmitted so far.
|
||||
|
||||
The incidence rate can be calculated in combination with the sampling period.
|
||||
|
||||
Usually used to troubleshoot network problems.
|
||||
@ -0,0 +1,61 @@
|
||||
---
|
||||
{
|
||||
"title": "BE 监控项",
|
||||
"language": "zh-CN"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<!-- Please sort the metrics alphabetically -->
|
||||
|
||||
# BE 监控项
|
||||
|
||||
该文档主要介绍 BE 的相关监控项。
|
||||
|
||||
## 查看监控项
|
||||
|
||||
BE 的监控项可以通过以下方式访问:
|
||||
|
||||
`http://be_host:be_webserver_port/metrics`
|
||||
|
||||
默认显示为 [Prometheus](https://prometheus.io/) 格式。
|
||||
|
||||
通过以下接口可以获取 Json 格式的监控项:
|
||||
|
||||
`http://be_host:be_webserver_port/metrics?type=agent`
|
||||
|
||||
## 监控项列表
|
||||
|
||||
### `doris_be_snmp{name="tcp_in_errs"}`
|
||||
|
||||
该监控项为 `/proc/net/snmp` 中的 `Tcp: InErrs` 字段值。表示当前接收到的错误的 TCP 包的数量。
|
||||
|
||||
结合采样周期可以计算发生率。
|
||||
|
||||
通常用于排查网络问题。
|
||||
|
||||
### `doris_be_snmp{name="tcp_retrans_segs"}`
|
||||
|
||||
该监控项为 `/proc/net/snmp` 中的 `Tcp: RetransSegs` 字段值。表示当前重传的 TCP 包的数量。
|
||||
|
||||
结合采样周期可以计算发生率。
|
||||
|
||||
通常用于排查网络问题。
|
||||
@ -0,0 +1,61 @@
|
||||
---
|
||||
{
|
||||
"title": "FE 监控项",
|
||||
"language": "zh-CN"
|
||||
}
|
||||
---
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<!-- Please sort the metrics alphabetically -->
|
||||
|
||||
# FE 监控项
|
||||
|
||||
该文档主要介绍 FE 的相关监控项。
|
||||
|
||||
## 查看监控项
|
||||
|
||||
FE 的监控项可以通过以下方式访问:
|
||||
|
||||
`http://fe_host:fe_http_port/metrics`
|
||||
|
||||
默认显示为 [Prometheus](https://prometheus.io/) 格式。
|
||||
|
||||
通过以下接口可以获取 Json 格式的监控项:
|
||||
|
||||
`http://fe_host:fe_http_port/metrics?type=agent`
|
||||
|
||||
## 监控项列表
|
||||
|
||||
### `doris_fe_snmp{name="tcp_in_errs"}`
|
||||
|
||||
该监控项为 `/proc/net/snmp` 中的 `Tcp: InErrs` 字段值。表示当前接收到的错误的 TCP 包的数量。
|
||||
|
||||
结合采样周期可以计算发生率。
|
||||
|
||||
通常用于排查网络问题。
|
||||
|
||||
### `doris_fe_snmp{name="tcp_retrans_segs"}`
|
||||
|
||||
该监控项为 `/proc/net/snmp` 中的 `Tcp: RetransSegs` 字段值。表示当前重传的 TCP 包的数量。
|
||||
|
||||
结合采样周期可以计算发生率。
|
||||
|
||||
通常用于排查网络问题。
|
||||
@ -554,12 +554,14 @@ under the License.
|
||||
<artifactId>spark-core_2.12</artifactId>
|
||||
<version>2.4.5</version>
|
||||
</dependency>
|
||||
|
||||
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-launcher_2.12 -->
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-launcher_2.12</artifactId>
|
||||
<version>2.4.5</version>
|
||||
</dependency>
|
||||
|
||||
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql_2.12 -->
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
||||
@ -52,6 +52,7 @@ public final class MetricRepo {
|
||||
private static final DorisMetricRegistry PALO_METRIC_REGISTER = new DorisMetricRegistry();
|
||||
|
||||
public static AtomicBoolean isInit = new AtomicBoolean(false);
|
||||
public static final SystemMetrics SYSTEM_METRICS = new SystemMetrics();
|
||||
|
||||
public static final String TABLET_NUM = "tablet_num";
|
||||
public static final String TABLET_MAX_COMPACTION_SCORE = "tablet_max_compaction_score";
|
||||
@ -241,6 +242,10 @@ public final class MetricRepo {
|
||||
HISTO_QUERY_LATENCY = METRIC_REGISTER.histogram(MetricRegistry.name("query", "latency", "ms"));
|
||||
HISTO_EDIT_LOG_WRITE_LATENCY = METRIC_REGISTER.histogram(MetricRegistry.name("editlog", "write", "latency", "ms"));
|
||||
|
||||
// init system metrics
|
||||
initSystemMetrics();
|
||||
|
||||
updateMetrics();
|
||||
isInit.set(true);
|
||||
|
||||
if (Config.enable_metric_calculator) {
|
||||
@ -248,6 +253,30 @@ public final class MetricRepo {
|
||||
}
|
||||
}
|
||||
|
||||
private static void initSystemMetrics() {
|
||||
// TCP retransSegs
|
||||
GaugeMetric<Long> tcpRetransSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
|
||||
"snmp", MetricUnit.NUMBER, "All TCP packets retransmitted") {
|
||||
@Override
|
||||
public Long getValue() {
|
||||
return SYSTEM_METRICS.tcpRetransSegs;
|
||||
}
|
||||
};
|
||||
tcpRetransSegs.addLabel(new MetricLabel("name", "tcp_retrans_segs"));
|
||||
PALO_METRIC_REGISTER.addPaloMetrics(tcpRetransSegs);
|
||||
|
||||
// TCP inErrs
|
||||
GaugeMetric<Long> tpcInErrs = (GaugeMetric<Long>) new GaugeMetric<Long>(
|
||||
"snmp", MetricUnit.NUMBER, "The number of all problematic TCP packets received") {
|
||||
@Override
|
||||
public Long getValue() {
|
||||
return SYSTEM_METRICS.tcpInErrs;
|
||||
}
|
||||
};
|
||||
tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs"));
|
||||
PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs);
|
||||
}
|
||||
|
||||
// to generate the metrics related to tablets of each backends
|
||||
// this metric is reentrant, so that we can add or remove metric along with the backend add or remove
|
||||
// at runtime.
|
||||
@ -301,6 +330,10 @@ public final class MetricRepo {
|
||||
if (!isInit.get()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// update the metrics first
|
||||
updateMetrics();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
// jvm
|
||||
JvmService jvmService = new JvmService();
|
||||
@ -325,6 +358,11 @@ public final class MetricRepo {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// Refresh the metrics that have to be re-read before they are visited.
// Currently this only calls SYSTEM_METRICS.update().
|
||||
private static void updateMetrics() {
|
||||
SYSTEM_METRICS.update();
|
||||
}
|
||||
|
||||
// Returns the metrics that PALO_METRIC_REGISTER holds under the given name.
public static synchronized List<Metric> getMetricsByName(String name) {
|
||||
return PALO_METRIC_REGISTER.getPaloMetricsByName(name);
|
||||
}
|
||||
|
||||
83
fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
Normal file
83
fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
Normal file
@ -0,0 +1,83 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.metric;
|
||||
|
||||
import org.apache.doris.common.FeConstants;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
|
||||
/**
|
||||
* Save system metrics such as CPU, MEM, IO, Networks.
|
||||
* TODO: Add them gradually
|
||||
*/
|
||||
public class SystemMetrics {
|
||||
private static final Logger LOG = LogManager.getLogger(SystemMetrics.class);
|
||||
|
||||
// NOTICE: The following 2 tcp metrics is got from /proc/net/snmp
|
||||
// So they can only be got on Linux system.
|
||||
// All TCP packets retransmitted
|
||||
protected long tcpRetransSegs = 0;
|
||||
// The number of all problematic TCP packets received
|
||||
protected long tcpInErrs = 0;
|
||||
|
||||
public synchronized void update() {
|
||||
updateSnmpMetrics();
|
||||
}
|
||||
|
||||
private void updateSnmpMetrics() {
|
||||
String procFile = "/proc/net/snmp";
|
||||
if (FeConstants.runningUnitTest) {
|
||||
procFile = getClass().getClassLoader().getResource("data/net_snmp_normal").getFile();
|
||||
}
|
||||
try (FileReader fileReader = new FileReader(procFile);
|
||||
BufferedReader br = new BufferedReader(fileReader)) {
|
||||
String line = null;
|
||||
boolean found = false;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("Tcp: ")) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
throw new Exception("can not find tcp metrics");
|
||||
}
|
||||
// skip tcp header line
|
||||
if ((line = br.readLine()) == null) {
|
||||
throw new Exception("failed to skip tcp metrics header");
|
||||
}
|
||||
|
||||
// eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0
|
||||
String[] parts = line.split(" ");
|
||||
if (parts.length != 16) {
|
||||
throw new Exception("invalid tcp metrics: " + line);
|
||||
}
|
||||
|
||||
tcpRetransSegs = Long.valueOf(parts[12]);
|
||||
tcpInErrs = Long.valueOf(parts[13]);
|
||||
|
||||
} catch (Exception e) {
|
||||
LOG.warn("failed to get /proc/net/snmp", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
51
fe/src/test/java/org/apache/doris/metric/MetricsTest.java
Normal file
51
fe/src/test/java/org/apache/doris/metric/MetricsTest.java
Normal file
@ -0,0 +1,51 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.metric;
|
||||
|
||||
import org.apache.doris.common.FeConstants;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class MetricsTest {
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() {
|
||||
FeConstants.runningUnitTest = true;
|
||||
MetricRepo.init();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTcpMetrics() {
|
||||
List<Metric> metrics = MetricRepo.getMetricsByName("snmp");
|
||||
Assert.assertEquals(2, metrics.size());
|
||||
for (Metric metric : metrics) {
|
||||
GaugeMetric<Long> gm = (GaugeMetric<Long>) metric;
|
||||
if (gm.getLabels().get(0).getValue().equals("tcp_retrans_segs")) {
|
||||
Assert.assertEquals(Long.valueOf(826271L), (Long) gm.getValue());
|
||||
} else if (gm.getLabels().get(0).getValue().equals("tcp_in_errs")) {
|
||||
Assert.assertEquals(Long.valueOf(12712L), (Long) gm.getValue());
|
||||
} else {
|
||||
Assert.fail();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
12
fe/src/test/resources/data/net_snmp_normal
Normal file
12
fe/src/test/resources/data/net_snmp_normal
Normal file
@ -0,0 +1,12 @@
|
||||
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates
|
||||
Ip: 1 64 1049877820 0 0 0 0 0 1049877596 1052780427 0 1317 0 0 0 0 0 0 0
|
||||
Icmp: InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps
|
||||
Icmp: 1142563 126992 0 198790 26 0 0 0 153700 790046 1 0 0 0 1174563 0 198734 0 0 0 0 822128 153700 0 1 0 0
|
||||
IcmpMsg: InType0 InType3 InType8 InType11 InType13 OutType0 OutType3 OutType8 OutType14
|
||||
IcmpMsg: 790046 198790 153700 26 1 153700 198734 822128 1
|
||||
Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
|
||||
Tcp: 1 200 120000 -1 47884867 38628916 3356043 2323781 278 1034019111 1166716939 826271 12712 23260066 0
|
||||
Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
|
||||
Udp: 14706122 9772 0 14917947 0 0 0
|
||||
UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
|
||||
UdpLite: 0 0 0 0 0 0 0
|
||||
Reference in New Issue
Block a user