diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index a572661d684b24..79a59077b0a1df 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -16,6 +16,8 @@ // under the License. #include "util/system_metrics.h" +#include "gutil/strings/split.h" // for string split +#include "gutil/strtoint.h" // for atoi64 #include #include @@ -72,6 +74,10 @@ struct SnmpMetrics { METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NOUNIT); // All TCP packets retransmitted METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NOUNIT); + // All received TCP packets + METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_segs, MetricUnit::NOUNIT); + // All sent TCP packets + METRIC_DEFINE_INT_LOCK_COUNTER(tcp_out_segs, MetricUnit::NOUNIT); }; struct FileDescriptorMetrics { @@ -323,6 +329,8 @@ void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) { &_snmp_metrics->name) REGISTER_SNMP_METRIC(tcp_in_errs); REGISTER_SNMP_METRIC(tcp_retrans_segs); + REGISTER_SNMP_METRIC(tcp_in_segs); + REGISTER_SNMP_METRIC(tcp_out_segs); } void SystemMetrics::_update_net_metrics() { @@ -449,8 +457,16 @@ void SystemMetrics::_update_snmp_metrics() { return; } - // skip the Tcp header line + // parse the Tcp header // Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors + std::vector headers = strings::Split(_line_ptr, " "); + std::unordered_map header_map; + int32_t pos = 0; + for (auto& h : headers) { + header_map.emplace(h, pos++); + } + + // read the metrics of TCP if (getline(&_line_ptr, &_line_buf_size, fp) < 0) { char buf[64]; LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno @@ -461,15 +477,20 @@ void SystemMetrics::_update_snmp_metrics() { // metric line looks like: // Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0 - int64_t 
retrans_segs = 0; - int64_t in_errs = 0; - sscanf(_line_ptr, - "Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d" - " %" PRId64 " %" PRId64 " %*d %*d", - &retrans_segs, &in_errs); - + std::vector metrics = strings::Split(_line_ptr, " "); + if (metrics.size() != headers.size()) { + LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr; + fclose(fp); + return; + } + int64_t retrans_segs = atoi64(metrics[header_map["RetransSegs"]]); + int64_t in_errs = atoi64(metrics[header_map["InErrs"]]); + int64_t in_segs = atoi64(metrics[header_map["InSegs"]]); + int64_t out_segs = atoi64(metrics[header_map["OutSegs"]]); _snmp_metrics->tcp_retrans_segs.set_value(retrans_segs); _snmp_metrics->tcp_in_errs.set_value(in_errs); + _snmp_metrics->tcp_in_segs.set_value(in_segs); + _snmp_metrics->tcp_out_segs.set_value(out_segs); if (ferror(fp) != 0) { char buf[64]; diff --git a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md index 642228feccf931..a2716a742ee344 100644 --- a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md +++ b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md @@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number The incidence rate can be calculated in combination with the sampling period. Usually used to troubleshoot network problems. + +### `doris_be_snmp{name="tcp_in_segs"}` + +Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets. + +Use `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` to calculate the error rate of received TCP packets. + +Usually used to troubleshoot network problems. + +### `doris_be_snmp{name="tcp_out_segs"}` + +Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets. 
+ +Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` to calculate the retransmission rate of TCP packets. + +Usually used to troubleshoot network problems. diff --git a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md index 9c5487fc067049..26ba0ac06128d8 100644 --- a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number The incidence rate can be calculated in combination with the sampling period. Usually used to troubleshoot network problems. + +### `doris_fe_snmp{name="tcp_in_segs"}` + +Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets. + +Use `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` to calculate the error rate of received TCP packets. + +Usually used to troubleshoot network problems. + +### `doris_fe_snmp{name="tcp_out_segs"}` + +Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets. + +Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` to calculate the retransmission rate of TCP packets. + +Usually used to troubleshoot network problems. 
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md index 1a2afde02f1b2f..41533b8b01dd83 100644 --- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md +++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md @@ -59,3 +59,19 @@ BE 的监控项可以通过以下方式访问: 结合采样周期可以计算发生率。 通常用于排查网络问题。 + +### `doris_be_snmp{name="tcp_in_segs"}` + +该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。 + +通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。 + +通常用于排查网络问题。 + +### `doris_be_snmp{name="tcp_out_segs"}` + +该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。 + +通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。 + +通常用于排查网络问题。 diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md index a938ae6fdd0723..aaa18541047d0b 100644 --- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -59,3 +59,19 @@ FE 的监控项可以通过以下方式访问: 结合采样周期可以计算发生率。 通常用于排查网络问题。 + +### `doris_fe_snmp{name="tcp_in_segs"}` + +该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。 + +通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。 + +通常用于排查网络问题。 + +### `doris_fe_snmp{name="tcp_out_segs"}` + +该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。 + +通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。 + +通常用于排查网络问题。 diff --git a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java index 3b2fd54d12317e..1fa58a2f562055 100644 --- 
a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -275,6 +275,28 @@ public Long getValue() { }; tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs")); PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs); + + // TCP inSegs + GaugeMetric tpcInSegs = (GaugeMetric) new GaugeMetric( + "snmp", MetricUnit.NOUNIT, "The number of all TCP packets received") { + @Override + public Long getValue() { + return SYSTEM_METRICS.tcpInSegs; + } + }; + tpcInSegs.addLabel(new MetricLabel("name", "tcp_in_segs")); + PALO_METRIC_REGISTER.addPaloMetrics(tpcInSegs); + + // TCP outSegs + GaugeMetric tpcOutSegs = (GaugeMetric) new GaugeMetric( + "snmp", MetricUnit.NOUNIT, "The number of all TCP packets send with RST") { + @Override + public Long getValue() { + return SYSTEM_METRICS.tcpOutSegs; + } + }; + tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs")); + PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs); } // to generate the metrics related to tablets of each backends diff --git a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java index 78ec70f643dcef..15b221db2c6294 100644 --- a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java +++ b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java @@ -19,11 +19,14 @@ import org.apache.doris.common.FeConstants; +import com.google.common.collect.Maps; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.BufferedReader; import java.io.FileReader; +import java.util.Map; /** * Save system metrics such as CPU, MEM, IO, Networks. 
@@ -38,6 +41,10 @@ public class SystemMetrics { protected long tcpRetransSegs = 0; // The number of all problematic TCP packets received protected long tcpInErrs = 0; + // All received TCP packets + protected long tcpInSegs = 0; + // All sent TCP packets + protected long tcpOutSegs = 0; public synchronized void update() { updateSnmpMetrics(); @@ -61,19 +68,30 @@ private void updateSnmpMetrics() { if (!found) { throw new Exception("can not find tcp metrics"); } - // skip tcp header line + + // parse the header of TCP + String[] headers = line.split(" "); + Map headerMap = Maps.newHashMap(); + int pos = 0; + for (int i = 0; i < headers.length; i++) { + headerMap.put(headers[i], pos++); + } + + // read the metrics of TCP if ((line = br.readLine()) == null) { - throw new Exception("failed to skip tcp metrics header"); + throw new Exception("failed to read metrics of TCP"); } // eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0 String[] parts = line.split(" "); - if (parts.length != 16) { - throw new Exception("invalid tcp metrics: " + line); + if (parts.length != headerMap.size()) { + throw new Exception("invalid tcp metrics: " + line + ". header size: " + headerMap.size()); } - tcpRetransSegs = Long.valueOf(parts[12]); - tcpInErrs = Long.valueOf(parts[13]); + tcpRetransSegs = Long.valueOf(parts[headerMap.get("RetransSegs")]); + tcpInErrs = Long.valueOf(parts[headerMap.get("InErrs")]); + tcpInSegs = Long.valueOf(parts[headerMap.get("InSegs")]); + tcpOutSegs = Long.valueOf(parts[headerMap.get("OutSegs")]); } catch (Exception e) { LOG.warn("failed to get /proc/net/snmp", e);