diff --git a/README.md b/README.md index 7a4b9f407f..d31db6fd01 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_ supervisord | Exposes service status from [supervisord](http://supervisord.org/). | _any_ systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux +udp_queues | Exposes UDP total lengths of the rx_queue and tx_queue from `/proc/net/udp` and `/proc/net/udp6`. | Linux wifi | Exposes WiFi device and station statistics. | Linux perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 6b2e6bd25b..4ac50a17f4 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # TYPE node_infiniband_link_downed_total counter -node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # TYPE node_infiniband_link_error_recovery_total counter -node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) @@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded # TYPE node_infiniband_port_constraint_errors_received_total counter -node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port # TYPE node_infiniband_port_constraint_errors_transmitted_total counter -node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter -node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # TYPE node_infiniband_port_data_transmitted_bytes_total counter -node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_discards_received_total Number of inbound packets 
discarded by the port because the port is down or congested @@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested # TYPE node_infiniband_port_discards_transmitted_total counter -node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 # HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port # TYPE node_infiniband_port_errors_received_total counter -node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) # TYPE node_infiniband_port_packets_received_total counter -node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 # HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) # TYPE node_infiniband_port_packets_transmitted_total counter -node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 # HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick # TYPE node_infiniband_port_transmit_wait_total counter -node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0 node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 # HELP 
node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 7651f53be3..607d250aa7 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # TYPE node_infiniband_link_downed_total counter -node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # TYPE node_infiniband_link_error_recovery_total counter -node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) @@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded # TYPE node_infiniband_port_constraint_errors_received_total counter -node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 # HELP 
node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port # TYPE node_infiniband_port_constraint_errors_transmitted_total counter -node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter -node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # TYPE node_infiniband_port_data_transmitted_bytes_total counter -node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested @@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested # TYPE node_infiniband_port_discards_transmitted_total counter -node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 # HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port # TYPE 
node_infiniband_port_errors_received_total counter -node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) # TYPE node_infiniband_port_packets_received_total counter -node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0 node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 # HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) # TYPE node_infiniband_port_packets_transmitted_total counter -node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0 node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 # HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick # TYPE node_infiniband_port_transmit_wait_total counter -node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0 node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter diff --git a/collector/fixtures/proc/net/tcpstat b/collector/fixtures/proc/net/tcpstat index 8b3777a969..352c00bbf3 100644 --- a/collector/fixtures/proc/net/tcpstat +++ b/collector/fixtures/proc/net/tcpstat @@ -1,3 +1,3 @@ sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode - 0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 - 1: 0F02000A:0016 0202000A:8B6B 01 00000000:00000000 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46 + 0: 00000000:0016 00000000:0000 0A 
00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 + 1: 0F02000A:0016 0202000A:8B6B 01 00000015:00000001 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46 diff --git a/collector/fixtures/proc/net/udp b/collector/fixtures/proc/net/udp new file mode 100644 index 0000000000..3c5052400a --- /dev/null +++ b/collector/fixtures/proc/net/udp @@ -0,0 +1,2 @@ + sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode + 0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 41e062dac7..8502ec6b44 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -112,6 +112,21 @@ Mode: 755 Directory: sys/class/infiniband/i40iw0 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/board_id +Lines: 1 +I40IW Board ID +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/fw_ver +Lines: 1 +0.2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/hca_type +Lines: 1 +I40IW +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/infiniband/i40iw0/ports Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -206,9 +221,39 @@ Lines: 1 N/A (no PMA) Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/rate +Lines: 1 +10 Gb/sec (4X) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: 
sys/class/infiniband/i40iw0/ports/1/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/infiniband/mlx4_0 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/board_id +Lines: 1 +SM_1141000001000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/fw_ver +Lines: 1 +2.31.5050 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/hca_type +Lines: 1 +MT4099 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/infiniband/mlx4_0/ports Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -346,6 +391,21 @@ Lines: 1 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/rate +Lines: 1 +40 Gb/sec (4X QDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/infiniband/mlx4_0/ports/2 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -435,6 +495,21 @@ Lines: 1 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/rate +Lines: 1 +40 Gb/sec (4X 
QDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/net Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go index 16134f3e84..237a913394 100644 --- a/collector/infiniband_linux.go +++ b/collector/infiniband_linux.go @@ -1,4 +1,4 @@ -// Copyright 2017 The Prometheus Authors +// Copyright 2017-2019 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -17,31 +17,16 @@ package collector import ( - "errors" - "os" - "path/filepath" - "strings" + "fmt" + "strconv" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/log" -) - -const infinibandPath = "class/infiniband" - -var ( - errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected") - errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected") + "github.com/prometheus/procfs/sysfs" ) type infinibandCollector struct { - metricDescs map[string]*prometheus.Desc - counters map[string]infinibandMetric - legacyCounters map[string]infinibandMetric -} - -type infinibandMetric struct { - File string - Help string + fs sysfs.FS + metricDescs map[string]*prometheus.Desc } func init() { @@ -51,55 +36,47 @@ func init() { // NewInfiniBandCollector returns a new Collector exposing InfiniBand stats. func NewInfiniBandCollector() (Collector, error) { var i infinibandCollector + var err error - // Filenames of all InfiniBand counter metrics including a detailed description. 
- i.counters = map[string]infinibandMetric{ - "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, - "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"}, - "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"}, - "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"}, - "port_constraint_errors_received_total": {"port_rcv_constraint_errors", "Number of packets received on the switch physical port that are discarded"}, - "port_constraint_errors_transmitted_total": {"port_xmit_constraint_errors", "Number of packets not transmitted from the switch physical port"}, - "port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"}, - "port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"}, - "port_discards_received_total": {"port_rcv_discards", "Number of inbound packets discarded by the port because the port is down or congested"}, - "port_discards_transmitted_total": {"port_xmit_discards", "Number of outbound packets discarded by the port because the port is down or congested"}, - "port_errors_received_total": {"port_rcv_errors", "Number of packets containing an error that were received on this port"}, - "port_packets_received_total": {"port_rcv_packets", "Number of packets received on all VLs by this port (including errors)"}, - "port_packets_transmitted_total": {"port_xmit_packets", "Number of packets transmitted on all VLs from this port (including errors)"}, - "port_transmit_wait_total": {"port_xmit_wait", "Number of ticks during which the port had data to transmit but no data was sent during the entire tick"}, - "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received 
(including errors)"}, - "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"}, + i.fs, err = sysfs.NewFS(*sysPath) + if err != nil { + return nil, fmt.Errorf("failed to open sysfs: %v", err) } - // Deprecated counters for some older versions of InfiniBand drivers. - i.legacyCounters = map[string]infinibandMetric{ - "legacy_multicast_packets_received_total": {"port_multicast_rcv_packets", "Number of multicast packets received"}, - "legacy_multicast_packets_transmitted_total": {"port_multicast_xmit_packets", "Number of multicast packets transmitted"}, - "legacy_data_received_bytes_total": {"port_rcv_data_64", "Number of data octets received on all links"}, - "legacy_packets_received_total": {"port_rcv_packets_64", "Number of data packets received on all links"}, - "legacy_unicast_packets_received_total": {"port_unicast_rcv_packets", "Number of unicast packets received"}, - "legacy_unicast_packets_transmitted_total": {"port_unicast_xmit_packets", "Number of unicast packets transmitted"}, - "legacy_data_transmitted_bytes_total": {"port_xmit_data_64", "Number of data octets transmitted on all links"}, - "legacy_packets_transmitted_total": {"port_xmit_packets_64", "Number of data packets received on all links"}, + // Detailed description for all metrics. 
+ descriptions := map[string]string{ + "legacy_multicast_packets_received_total": "Number of multicast packets received", + "legacy_multicast_packets_transmitted_total": "Number of multicast packets transmitted", + "legacy_data_received_bytes_total": "Number of data octets received on all links", + "legacy_packets_received_total": "Number of data packets received on all links", + "legacy_unicast_packets_received_total": "Number of unicast packets received", + "legacy_unicast_packets_transmitted_total": "Number of unicast packets transmitted", + "legacy_data_transmitted_bytes_total": "Number of data octets transmitted on all links", + "legacy_packets_transmitted_total": "Number of data packets received on all links", + "link_downed_total": "Number of times the link failed to recover from an error state and went down", + "link_error_recovery_total": "Number of times the link successfully recovered from an error state", + "multicast_packets_received_total": "Number of multicast packets received (including errors)", + "multicast_packets_transmitted_total": "Number of multicast packets transmitted (including errors)", + "port_constraint_errors_received_total": "Number of packets received on the switch physical port that are discarded", + "port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port", + "port_data_received_bytes_total": "Number of data octets received on all links", + "port_data_transmitted_bytes_total": "Number of data octets transmitted on all links", + "port_discards_received_total": "Number of inbound packets discarded by the port because the port is down or congested", + "port_discards_transmitted_total": "Number of outbound packets discarded by the port because the port is down or congested", + "port_errors_received_total": "Number of packets containing an error that were received on this port", + "port_packets_received_total": "Number of packets received on all VLs by this port (including errors)", + 
"port_packets_transmitted_total": "Number of packets transmitted on all VLs from this port (including errors)", + "port_transmit_wait_total": "Number of ticks during which the port had data to transmit but no data was sent during the entire tick", + "unicast_packets_received_total": "Number of unicast packets received (including errors)", + "unicast_packets_transmitted_total": "Number of unicast packets transmitted (including errors)", } - subsystem := "infiniband" i.metricDescs = make(map[string]*prometheus.Desc) - for metricName, infinibandMetric := range i.counters { - i.metricDescs[metricName] = prometheus.NewDesc( - prometheus.BuildFQName(namespace, subsystem, metricName), - infinibandMetric.Help, - []string{"device", "port"}, - nil, - ) - } - - for metricName, infinibandMetric := range i.legacyCounters { + for metricName, description := range descriptions { i.metricDescs[metricName] = prometheus.NewDesc( - prometheus.BuildFQName(namespace, subsystem, metricName), - infinibandMetric.Help, + prometheus.BuildFQName(namespace, "infiniband", metricName), + description, []string{"device", "port"}, nil, ) @@ -108,141 +85,50 @@ func NewInfiniBandCollector() (Collector, error) { return &i, nil } -// infinibandDevices retrieves a list of InfiniBand devices. -func infinibandDevices(infinibandPath string) ([]string, error) { - devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*")) - if err != nil { - return nil, err - } - - if len(devices) < 1 { - log.Debugf("Unable to detect InfiniBand devices") - err = errInfinibandNoDevicesFound - return nil, err - } - - // Extract just the filenames which equate to the device names. - for i, device := range devices { - devices[i] = filepath.Base(device) - } - - return devices, nil -} - -// Retrieve a list of ports for the InfiniBand device. 
-func infinibandPorts(infinibandPath, device string) ([]string, error) { - ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*")) - if err != nil { - return nil, err - } - - if len(ports) < 1 { - log.Debugf("Unable to detect ports for %s", device) - err = errInfinibandNoPortsFound - return nil, err - } - - // Extract just the filenames which equates to the port numbers. - for i, port := range ports { - ports[i] = filepath.Base(port) - } - - return ports, nil +func (c *infinibandCollector) pushMetric(ch chan<- prometheus.Metric, name string, value uint64, deviceName string, port string, valueType prometheus.ValueType) { + ch <- prometheus.MustNewConstMetric(c.metricDescs[name], valueType, float64(value), deviceName, port) } -func readMetric(directory, metricFile string) (uint64, error) { - metric, err := readUintFromFile(filepath.Join(directory, metricFile)) - if err != nil { - // Ugly workaround for handling #966, when counters are - // `N/A (not available)`. - // This was already patched and submitted, see - // https://www.spinics.net/lists/linux-rdma/msg68596.html - // Remove this as soon as the fix lands in the enterprise distros. - if strings.Contains(err.Error(), "N/A (no PMA)") { - log.Debugf("%q value is N/A", metricFile) - return 0, nil - } - log.Debugf("Error reading %q file", metricFile) - return 0, err - } - - // According to Mellanox, the following metrics "are divided by 4 unconditionally" - // as they represent the amount of data being transmitted and received per lane. - // Mellanox cards have 4 lanes per port, so all values must be multiplied by 4 - // to get the expected value. 
- switch metricFile { - case "port_rcv_data", "port_xmit_data", "port_rcv_data_64", "port_xmit_data_64": - metric *= 4 +func (c *infinibandCollector) pushCounter(ch chan<- prometheus.Metric, name string, value *uint64, deviceName string, port string) { + if value != nil { + c.pushMetric(ch, name, *value, deviceName, port, prometheus.CounterValue) } - - return metric, nil } func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error { - devices, err := infinibandDevices(sysFilePath(infinibandPath)) - - // If no devices are found or another error is raised while attempting to find devices, - // InfiniBand is likely not installed and the collector should be skipped. - switch err { - case nil: - case errInfinibandNoDevicesFound: - return nil - default: - return err + devices, err := c.fs.InfiniBandClass() + if err != nil { + return fmt.Errorf("error obtaining InfiniBand class info: %s", err) } for _, device := range devices { - ports, err := infinibandPorts(sysFilePath(infinibandPath), device) - - // If no ports are found for the specified device, skip to the next device. - switch err { - case nil: - case errInfinibandNoPortsFound: - continue - default: - return err - } - - for _, port := range ports { - portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port)) - - // Add metrics for the InfiniBand counters. - for metricName, infinibandMetric := range c.counters { - if _, err := os.Stat(filepath.Join(portFiles, "counters", infinibandMetric.File)); os.IsNotExist(err) { - continue - } - metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File) - if err != nil { - return err - } - - ch <- prometheus.MustNewConstMetric( - c.metricDescs[metricName], - prometheus.CounterValue, - float64(metric), - device, - port, - ) - } - - // Add metrics for the legacy InfiniBand counters. 
- for metricName, infinibandMetric := range c.legacyCounters { - if _, err := os.Stat(filepath.Join(portFiles, "counters_ext", infinibandMetric.File)); os.IsNotExist(err) { - continue - } - metric, err := readMetric(filepath.Join(portFiles, "counters_ext"), infinibandMetric.File) - if err != nil { - return err - } - - ch <- prometheus.MustNewConstMetric( - c.metricDescs[metricName], - prometheus.CounterValue, - float64(metric), - device, - port, - ) - } + for _, port := range device.Ports { + portStr := strconv.FormatUint(uint64(port.Port), 10) + + c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr) + c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr) + c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr) + c.pushCounter(ch, "legacy_packets_received_total", port.Counters.LegacyPortRcvPackets64, port.Name, portStr) + c.pushCounter(ch, "legacy_unicast_packets_received_total", port.Counters.LegacyPortUnicastRcvPackets, port.Name, portStr) + c.pushCounter(ch, "legacy_unicast_packets_transmitted_total", port.Counters.LegacyPortUnicastXmitPackets, port.Name, portStr) + c.pushCounter(ch, "legacy_data_transmitted_bytes_total", port.Counters.LegacyPortXmitData64, port.Name, portStr) + c.pushCounter(ch, "legacy_packets_transmitted_total", port.Counters.LegacyPortXmitPackets64, port.Name, portStr) + c.pushCounter(ch, "link_downed_total", port.Counters.LinkDowned, port.Name, portStr) + c.pushCounter(ch, "link_error_recovery_total", port.Counters.LinkErrorRecovery, port.Name, portStr) + c.pushCounter(ch, "multicast_packets_received_total", port.Counters.MulticastRcvPackets, port.Name, portStr) + c.pushCounter(ch, "multicast_packets_transmitted_total", port.Counters.MulticastXmitPackets, port.Name, portStr) + c.pushCounter(ch, "port_constraint_errors_received_total", 
port.Counters.PortRcvConstraintErrors, port.Name, portStr) + c.pushCounter(ch, "port_constraint_errors_transmitted_total", port.Counters.PortXmitConstraintErrors, port.Name, portStr) + c.pushCounter(ch, "port_data_received_bytes_total", port.Counters.PortRcvData, port.Name, portStr) + c.pushCounter(ch, "port_data_transmitted_bytes_total", port.Counters.PortXmitData, port.Name, portStr) + c.pushCounter(ch, "port_discards_received_total", port.Counters.PortRcvDiscards, port.Name, portStr) + c.pushCounter(ch, "port_discards_transmitted_total", port.Counters.PortXmitDiscards, port.Name, portStr) + c.pushCounter(ch, "port_errors_received_total", port.Counters.PortRcvErrors, port.Name, portStr) + c.pushCounter(ch, "port_packets_received_total", port.Counters.PortRcvPackets, port.Name, portStr) + c.pushCounter(ch, "port_packets_transmitted_total", port.Counters.PortXmitPackets, port.Name, portStr) + c.pushCounter(ch, "port_transmit_wait_total", port.Counters.PortXmitWait, port.Name, portStr) + c.pushCounter(ch, "unicast_packets_received_total", port.Counters.UnicastRcvPackets, port.Name, portStr) + c.pushCounter(ch, "unicast_packets_transmitted_total", port.Counters.UnicastXmitPackets, port.Name, portStr) } } diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go deleted file mode 100644 index d2090f834a..0000000000 --- a/collector/infiniband_linux_test.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2017 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package collector - -import ( - "testing" -) - -func TestInfiniBandDevices(t *testing.T) { - devices, err := infinibandDevices("fixtures/sys/class/infiniband") - if err != nil { - t.Fatal(err) - } - - if l := len(devices); l != 2 { - t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) - } -} - -func TestInfiniBandPorts(t *testing.T) { - ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0") - if err != nil { - t.Fatal(err) - } - - if l := len(ports); l != 2 { - t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l) - } -} diff --git a/collector/tcpstat_linux.go b/collector/tcpstat_linux.go index cc4e960b0f..5641111cbf 100644 --- a/collector/tcpstat_linux.go +++ b/collector/tcpstat_linux.go @@ -51,6 +51,10 @@ const ( tcpListen // TCP_CLOSING tcpClosing + // TCP_RX_BUFFER + tcpRxQueuedBytes + // TCP_TX_BUFFER + tcpTxQueuedBytes ) type tcpStatCollector struct { @@ -119,16 +123,34 @@ func parseTCPStats(r io.Reader) (map[tcpConnectionState]float64, error) { if len(parts) == 0 { continue } - if len(parts) < 4 { + if len(parts) < 5 { return nil, fmt.Errorf("invalid TCP stats line: %q", line) } + qu := strings.Split(parts[4], ":") + if len(qu) < 2 { + return nil, fmt.Errorf("cannot parse tx_queues and rx_queues: %q", line) + } + + tx, err := strconv.ParseUint(qu[0], 16, 64) + if err != nil { + return nil, err + } + tcpStats[tcpConnectionState(tcpTxQueuedBytes)] += float64(tx) + + rx, err := strconv.ParseUint(qu[1], 16, 64) + if err != nil { + return nil, err + } + tcpStats[tcpConnectionState(tcpRxQueuedBytes)] += float64(rx) + st, err := strconv.ParseInt(parts[3], 16, 8) if err != nil { return nil, err } tcpStats[tcpConnectionState(st)]++ + } return tcpStats, nil @@ -158,6 +180,10 @@ func (st tcpConnectionState) String() string { return "listen" case tcpClosing: return "closing" + case tcpRxQueuedBytes: + return 
"rx_queued_bytes" + case tcpTxQueuedBytes: + return "tx_queued_bytes" default: return "unknown" } diff --git a/collector/tcpstat_linux_test.go b/collector/tcpstat_linux_test.go index f4c3b36c8c..b609b84679 100644 --- a/collector/tcpstat_linux_test.go +++ b/collector/tcpstat_linux_test.go @@ -28,8 +28,27 @@ func Test_parseTCPStatsError(t *testing.T) { name: "too few fields", in: "sl local_address\n 0: 00000000:0016", }, + { + name: "missing colon in tx-rx field", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 0000000000000001", + }, + { + name: "tx parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 0000000x:00000001", + }, + { + name: "rx parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 00000000:0000000x", + }, + { + name: "state parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 0H 00000000:00000001", + }, } - for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if _, err := parseTCPStats(strings.NewReader(tt.in)); err == nil { @@ -40,6 +59,14 @@ func Test_parseTCPStatsError(t *testing.T) { } func TestTCPStat(t *testing.T) { + + noFile, _ := os.Open("follow the white rabbit") + defer noFile.Close() + + if _, err := parseTCPStats(noFile); err == nil { + t.Fatal("expected an error, but none occurred") + } + file, err := os.Open("fixtures/proc/net/tcpstat") if err != nil { t.Fatal(err) @@ -58,4 +85,39 @@ func TestTCPStat(t *testing.T) { if want, got := 1, int(tcpStats[tcpListen]); want != got { t.Errorf("want tcpstat number of listen state %d, got %d", want, got) } + + if want, got := 42, int(tcpStats[tcpTxQueuedBytes]); want != got { + t.Errorf("want tcpstat number of bytes in tx queue %d, got %d", want, got) + } + if want, got := 1, int(tcpStats[tcpRxQueuedBytes]); want != got { + t.Errorf("want tcpstat number of 
bytes in rx queue %d, got %d", want, got) + } + +} + +func Test_getTCPStats(t *testing.T) { + type args struct { + statsFile string + } + tests := []struct { + name string + args args + wantErr bool + }{ + { + name: "file not found", + args: args{statsFile: "somewhere over the rainbow"}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := getTCPStats(tt.args.statsFile) + if (err != nil) != tt.wantErr { + t.Errorf("getTCPStats() error = %v, wantErr %v", err, tt.wantErr) + return + } + // other cases are covered by TestTCPStat() + }) + } } diff --git a/collector/udpqueues_linux.go b/collector/udpqueues_linux.go new file mode 100644 index 0000000000..504fd038f1 --- /dev/null +++ b/collector/udpqueues_linux.go @@ -0,0 +1,118 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !noudp_queues + +package collector + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "strconv" + "strings" + + "github.com/prometheus/client_golang/prometheus" +) + +type udpQueuesCollector struct { + desc typedDesc +} + +func init() { + registerCollector("udp_queues", defaultDisabled, NewUDPqueuesCollector) +} + +// NewUDPqueuesCollector returns a new Collector exposing network udp queued bytes. 
+func NewUDPqueuesCollector() (Collector, error) { + return &udpQueuesCollector{ + desc: typedDesc{prometheus.NewDesc( + prometheus.BuildFQName(namespace, "udp", "queues"), + "Number of allocated memory in the kernel for UDP datagrams in bytes.", + []string{"queue"}, nil, + ), prometheus.GaugeValue}, + }, nil +} + +func (c *udpQueuesCollector) Update(ch chan<- prometheus.Metric) error { + updQueues, err := getUDPqueues(procFilePath("net/udp")) + if err != nil { + return fmt.Errorf("couldn't get upd queued bytes: %s", err) + } + + // if enabled ipv6 system + udp6File := procFilePath("net/udp6") + if _, hasIPv6 := os.Stat(udp6File); hasIPv6 == nil { + udp6Queues, err := getUDPqueues(udp6File) + if err != nil { + return fmt.Errorf("couldn't get udp6 queued bytes: %s", err) + } + + for qu, value := range udp6Queues { + updQueues[qu] += value + } + } + + for qu, value := range updQueues { + ch <- c.desc.mustNewConstMetric(value, qu) + } + return nil +} + +func getUDPqueues(statsFile string) (map[string]float64, error) { + file, err := os.Open(statsFile) + if err != nil { + return nil, err + } + defer file.Close() + + return parseUDPqueues(file) +} + +func parseUDPqueues(r io.Reader) (map[string]float64, error) { + updQueues := map[string]float64{} + contents, err := ioutil.ReadAll(r) + if err != nil { + return nil, err + } + + for _, line := range strings.Split(string(contents), "\n")[1:] { + fields := strings.Fields(line) + if len(fields) == 0 { + continue + } + if len(fields) < 5 { + return nil, fmt.Errorf("invalid line in file: %q", line) + } + + qu := strings.Split(fields[4], ":") + if len(qu) < 2 { + return nil, fmt.Errorf("cannot parse tx_queues and rx_queues: %q", line) + } + + tx, err := strconv.ParseUint(qu[0], 16, 64) + if err != nil { + return nil, err + } + updQueues["tx_queue"] += float64(tx) + + rx, err := strconv.ParseUint(qu[1], 16, 64) + if err != nil { + return nil, err + } + updQueues["rx_queue"] += float64(rx) + } + + return updQueues, nil +} diff 
--git a/collector/udpqueues_linux_test.go b/collector/udpqueues_linux_test.go new file mode 100644 index 0000000000..d117d7fdb1 --- /dev/null +++ b/collector/udpqueues_linux_test.go @@ -0,0 +1,151 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !noudpqueues + +package collector + +import ( + "io" + "os" + "strings" + "testing" +) + +func Test_parseUDPqueues(t *testing.T) { + noFile, _ := os.Open("follow the white rabbit") + defer noFile.Close() + + if _, err := parseUDPqueues(noFile); err == nil { + t.Fatal("expected an error, but none occurred") + } + + type args struct { + r io.Reader + } + tests := []struct { + name string + args args + want map[string]float64 + wantErr bool + }{ + { + name: "reading valid lines, no issue should happened", + args: args{ + strings.NewReader( + "sl local_address rem_address st tx_queue rx_queue \n" + + "1: 00000000:0000 00000000:0000 07 00000000:00000001 \n" + + "2: 00000000:0000 00000000:0000 07 00000002:00000001 \n"), + }, + want: map[string]float64{"tx_queue": 2, "rx_queue": 2}, + wantErr: false, + }, + { + name: "error case - invalid line - number of fields < 5", + args: args{ + strings.NewReader( + "sl local_address rem_address st tx_queue rx_queue \n" + + "1: 00000000:0000 00000000:0000 07 00000000:00000001 \n" + + "2: 00000000:0000 00000000:0000 07 \n"), + }, + want: nil, + wantErr: true, + }, + { + name: "error case - cannot parse line - missing colon", + args: 
args{ + strings.NewReader( + "sl local_address rem_address st tx_queue rx_queue \n" + + "1: 00000000:0000 00000000:0000 07 00000000:00000001 \n" + + "2: 00000000:0000 00000000:0000 07 0000000200000001 \n"), + }, + want: nil, + wantErr: true, + }, + { + name: "error case - parse tx_queue - not an valid hex", + args: args{ + strings.NewReader( + "sl local_address rem_address st tx_queue rx_queue \n" + + "1: 00000000:0000 00000000:0000 07 0000000G:00000001 \n" + + "2: 00000000:0000 00000000:0000 07 00000002:00000001 \n"), + }, + want: nil, + wantErr: true, + }, + { + name: "error case - parse rx_queue - not an valid hex", + args: args{ + strings.NewReader( + "sl local_address rem_address st tx_queue rx_queue \n" + + "1: 00000000:0000 00000000:0000 07 00000000:00000001 \n" + + "2: 00000000:0000 00000000:0000 07 00000002:0000000G \n"), + }, + want: nil, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseUDPqueues(tt.args.r) + if (err != nil) != tt.wantErr { + t.Errorf("parseUDPqueues() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if len(tt.want) != len(got) { + t.Errorf("parseUDPqueues() = %v, want %v", got, tt.want) + } + for k, v := range tt.want { + if _, ok := got[k]; !ok { + t.Errorf("parseUDPqueues() = %v, want %v", got, tt.want) + } + if got[k] != v { + t.Errorf("parseUDPqueues() = %v, want %v", got, tt.want) + } + } + }) + } +} + +func Test_getUDPqueues(t *testing.T) { + type args struct { + statsFile string + } + tests := []struct { + name string + args args + wantErr bool + }{ + { + name: "file found", + args: args{statsFile: "fixtures/proc/net/udp"}, + wantErr: false, + }, + { + name: "error case - file not found", + args: args{statsFile: "somewhere over the rainbow"}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := getUDPqueues(tt.args.statsFile) + if (err != nil) != tt.wantErr { + t.Errorf("getUDPqueues() error = %v, wantErr 
%v", err, tt.wantErr) + return + } + // other cases are covered by Test_parseUDPqueues() + }) + } +}