From 05cad1fb4b9ed02c96ffb49ba6004adb3daeeb97 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Fri, 20 May 2022 12:08:37 +0400 Subject: [PATCH 1/5] mixin: update units for disk and networking panels (#7) * Update units of network and disk graphs https://prometheus.io/docs/prometheus/latest/querying/functions/#rate rate() calculates per-second average rate, therefore Bps units should be used for disks. In networking, bandwidth throughput is usually measured in bits/s, so units are changed accordingly. Signed-off-by: Vitaly Zhuravlev * Change io time units to %util When applying rate() to seconds we get 'seconds per second', i.e. fractions of a second, so the value can actually range from 0 to 1. Also update intervalFactor to 1 for better rates. Signed-off-by: Vitaly Zhuravlev --- docs/node-mixin/lib/prom-mixin.libsonnet | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet index 7e22fd9a73..6b541ac360 100644 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -192,14 +192,17 @@ local table = grafana70.panel.table; .addTarget(prometheus.target( 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} read', + intervalFactor=1, )) .addTarget(prometheus.target( 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} written', + intervalFactor=1, )) .addTarget(prometheus.target( 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} io time', + intervalFactor=1, )) + { seriesOverrides: [ @@ -213,8 +216,8 @@ local table = grafana70.panel.table; }, ], yaxes: [ - self.yaxe(format='bytes'), - 
self.yaxe(format='s'), + self.yaxe(format='Bps'), + self.yaxe(format='percentunit'), ], }, @@ -410,29 +413,33 @@ local table = grafana70.panel.table; local networkReceived = graphPanel.new( 'Network Received', + description='Network received (bits/s)', datasource='$datasource', span=6, - format='bytes', + format='bps', min=0, fill=0, ) .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % config, + 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, legendFormat='{{device}}', + intervalFactor=1, )), local networkTransmitted = graphPanel.new( 'Network Transmitted', + description='Network transmitted (bits/s)', datasource='$datasource', span=6, - format='bytes', + format='bps', min=0, fill=0, ) .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % config, + 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, legendFormat='{{device}}', + intervalFactor=1, )), local cpuRow = From ef24d54230e1b896a79196512286bd800f0f0890 Mon Sep 17 00:00:00 2001 From: "Ryan J. Geyer" Date: Wed, 26 Oct 2022 14:44:01 -0700 Subject: [PATCH 2/5] Replace mistaken ) with }, resulting in parsable promql Signed-off-by: Ryan J. 
Geyer --- docs/node-mixin/dashboards/use.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 1602c13480..65e96dd8dc 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -453,7 +453,7 @@ local diskSpaceUtilisation = sum ( sum without (device) ( max without (fstype, mountpoint, instance, pod) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s) - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} ) != 0) ) / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) From e9e74722d820c1b3de435b570421b3d5fb15ebb3 Mon Sep 17 00:00:00 2001 From: "Ryan J. Geyer" Date: Thu, 27 Oct 2022 14:00:57 -0700 Subject: [PATCH 3/5] Add some lint exclusions. Add UIDs to all dashboards. Add units and descriptions to all panels which were missing them. Modify alerts descriptions and summaries as needed for linting. Signed-off-by: Ryan J. 
Geyer --- docs/node-mixin/.lint | 56 +++++++++++++++++++ docs/node-mixin/alerts/alerts.libsonnet | 6 +- docs/node-mixin/dashboards.jsonnet | 2 +- .../dashboards/dashboards.libsonnet | 3 +- docs/node-mixin/dashboards/defaults.libsonnet | 8 +++ docs/node-mixin/dashboards/use.libsonnet | 9 +++ docs/node-mixin/lib/prom-mixin.libsonnet | 6 ++ 7 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 docs/node-mixin/.lint create mode 100644 docs/node-mixin/dashboards/defaults.libsonnet diff --git a/docs/node-mixin/.lint b/docs/node-mixin/.lint new file mode 100644 index 0000000000..535c75e6c4 --- /dev/null +++ b/docs/node-mixin/.lint @@ -0,0 +1,56 @@ +--- +exclusions: + template-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + panel-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + target-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + template-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + target-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / USE Method / Node + reason: Dashboard only allows selecting a single instance at a time. 
+ - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Dashboard only allows selecting a single instance at a time. + template-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Ignoring mislabeling of instance template + - dashboard: Node Exporter / USE Method / Node + reason: Ignoring mislabeling of instance template + panel-units-rule: + entries: + - dashboard: Node Exporter / Nodes + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / MacOS + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Node + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. 
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 2382ac292d..4496573204 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -204,7 +204,7 @@ ||| % $._config, annotations: { summary: 'Node Exporter text file collector failed to scrape.', - description: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', }, labels: { severity: 'warning', @@ -260,7 +260,7 @@ severity: 'critical', }, annotations: { - summary: 'RAID Array is degraded', + summary: 'RAID Array is degraded.', description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", }, }, @@ -273,7 +273,7 @@ severity: 'warning', }, annotations: { - summary: 'Failed device in RAID array', + summary: 'Failed device in RAID array.', description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array '{{ $labels.device }}' needs attention and possibly a disk swap.", }, }, diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet index 9d913ed3f1..fb70fdeabe 100644 --- a/docs/node-mixin/dashboards.jsonnet +++ b/docs/node-mixin/dashboards.jsonnet @@ -1,6 +1,6 @@ local dashboards = (import 'mixin.libsonnet').grafanaDashboards; { - [name]: dashboards[name] + [name]: dashboards[name] + { uid: std.md5(name) }, for name in std.objectFields(dashboards) } diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet index e6adbd4fa0..cb340b952a 100644 --- a/docs/node-mixin/dashboards/dashboards.libsonnet +++ b/docs/node-mixin/dashboards/dashboards.libsonnet @@ -1,2 +1,3 @@ (import 'node.libsonnet') + -(import 'use.libsonnet') +(import 'use.libsonnet') + +(import 'defaults.libsonnet') \ No newline at end of file diff --git a/docs/node-mixin/dashboards/defaults.libsonnet b/docs/node-mixin/dashboards/defaults.libsonnet new file mode 100644 index 0000000000..e9197bb0e8 --- /dev/null +++ b/docs/node-mixin/dashboards/defaults.libsonnet @@ -0,0 +1,8 @@ +{ + local grafanaDashboards = super.grafanaDashboards, + grafanaDashboards:: + { + [fname]: grafanaDashboards[fname] { uid: std.md5(fname) } + for fname in std.objectFields(grafanaDashboards) + }, +} diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 1602c13480..9dc1465b28 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -25,6 +25,7 @@ local datasourceTemplate = { local CPUUtilisation = graphPanel.new( 'CPU Utilisation', + description='Total CPU utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -38,6 +39,7 @@ local CPUSaturation = // average relates to the "CPU saturation" in the title. graphPanel.new( 'CPU Saturation (Load1 per CPU)', + description='System load average over the last minute. 
A measurement of how many processes are waiting for CPU cycles. The maximum number is the number of CPU cores for the node.', datasource='$datasource', span=6, format='percentunit', @@ -49,6 +51,7 @@ local CPUSaturation = local memoryUtilisation = graphPanel.new( 'Memory Utilisation', + description='Total memory utilisation in bytes.', datasource='$datasource', span=6, format='percentunit', @@ -60,6 +63,7 @@ local memoryUtilisation = local memorySaturation = graphPanel.new( 'Memory Saturation (Major Page Faults)', + description='Rate of major memory page faults.', datasource='$datasource', span=6, format='rds', @@ -71,6 +75,7 @@ local memorySaturation = local networkUtilisation = graphPanel.new( 'Network Utilisation (Bytes Receive/Transmit)', + description='Network Utilisation (Bytes Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -85,6 +90,7 @@ local networkUtilisation = local networkSaturation = graphPanel.new( 'Network Saturation (Drops Receive/Transmit)', + description='Network Saturation (Drops Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -99,6 +105,7 @@ local networkSaturation = local diskIOUtilisation = graphPanel.new( 'Disk IO Utilisation', + description='Disk total IO seconds.', datasource='$datasource', span=6, format='percentunit', @@ -110,6 +117,7 @@ local diskIOUtilisation = local diskIOSaturation = graphPanel.new( 'Disk IO Saturation', + description='Disk saturation (weighted seconds spent, 1 second rate)', datasource='$datasource', span=6, format='percentunit', @@ -121,6 +129,7 @@ local diskIOSaturation = local diskSpaceUtilisation = graphPanel.new( 'Disk Space Utilisation', + description='Total disk utilisation percent', datasource='$datasource', span=12, format='percentunit', diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet index 6c4d990481..d8e4d9ff07 100644 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ b/docs/node-mixin/lib/prom-mixin.libsonnet 
@@ -47,6 +47,7 @@ local table = grafana70.panel.table; local idleCPU = graphPanel.new( 'CPU Usage', + description='Total CPU utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -69,6 +70,7 @@ local table = grafana70.panel.table; local systemLoad = graphPanel.new( 'Load Average', + description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. The maximum number is the number of CPU cores for the node.', datasource='$datasource', span=6, format='short', @@ -83,6 +85,7 @@ local table = grafana70.panel.table; local memoryGraphPanelPrototype = graphPanel.new( 'Memory Usage', + description='Memory usage by category, measured in bytes.', datasource='$datasource', span=9, format='bytes', @@ -137,6 +140,7 @@ local table = grafana70.panel.table; local memoryGaugePanelPrototype = gaugePanel.new( title='Memory Usage', + description='Total memory utilisation by category, in bytes.', datasource='$datasource', ) .addThresholdStep('rgba(50, 172, 45, 0.97)') @@ -183,6 +187,7 @@ local table = grafana70.panel.table; local diskIO = graphPanel.new( 'Disk I/O', + description='Disk read/writes in bytes, and total IO seconds.', datasource='$datasource', span=6, min=0, @@ -224,6 +229,7 @@ local table = grafana70.panel.table; local diskSpaceUsage = table.new( title='Disk Space Usage', + description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.', datasource='$datasource', ) .setFieldConfig(unit='decbytes') From af1d44b47828847ac78b0538f124c3aee2bce2d8 Mon Sep 17 00:00:00 2001 From: "Ryan J. 
Geyer" Date: Thu, 27 Oct 2022 14:17:36 -0700 Subject: [PATCH 4/5] Add multi-cluster dashboard lint exclusions --- docs/node-mixin/.lint | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/node-mixin/.lint b/docs/node-mixin/.lint index 535c75e6c4..c95289c4dc 100644 --- a/docs/node-mixin/.lint +++ b/docs/node-mixin/.lint @@ -52,5 +52,7 @@ exclusions: reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. - dashboard: Node Exporter / USE Method / Cluster reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. - dashboard: Node Exporter / USE Method / Node reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. From 31daaef9893d6fdb6aa1a1ae1df2742df84ad64b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Dec 2022 01:03:09 +0000 Subject: [PATCH 5/5] build(deps): bump github.com/coreos/go-systemd/v22 from 22.4.0 to 22.5.0 Bumps [github.com/coreos/go-systemd/v22](https://github.com/coreos/go-systemd) from 22.4.0 to 22.5.0. - [Release notes](https://github.com/coreos/go-systemd/releases) - [Commits](https://github.com/coreos/go-systemd/compare/v22.4.0...v22.5.0) --- updated-dependencies: - dependency-name: github.com/coreos/go-systemd/v22 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index a98d3de2d4..511990deb2 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.17 require ( github.com/beevik/ntp v0.3.0 - github.com/coreos/go-systemd/v22 v22.4.0 + github.com/coreos/go-systemd/v22 v22.5.0 github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 github.com/go-kit/log v0.2.1 diff --git a/go.sum b/go.sum index f20b226111..f1d0170a4c 100644 --- a/go.sum +++ b/go.sum @@ -93,8 +93,9 @@ github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/coreos/go-systemd/v22 v22.4.0 h1:y9YHcjnjynCd/DVbg5j9L/33jQM3MxJlbj/zWskzfGU= github.com/coreos/go-systemd/v22 v22.4.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=