diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 662ea3b6ee..136d7d4b96 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -21,6 +21,9 @@ jobs: uses: actions/setup-go@v2 with: go-version: 1.18.x + - name: Install snmp_exporter/generator dependencies + run: sudo apt-get update && sudo apt-get -y install libsnmp-dev + if: github.repository == 'prometheus/snmp_exporter' - name: Lint uses: golangci/golangci-lint-action@v3.1.0 with: diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 9a1aff4127..d325872bdf 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,3 @@ -## Prometheus Community Code of Conduct +# Prometheus Community Code of Conduct -Prometheus follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). +Prometheus follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md). diff --git a/SECURITY.md b/SECURITY.md index 67741f015a..fed02d85c7 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,4 +3,4 @@ The Prometheus security policy, including how to report vulnerabilities, can be found here: -https://prometheus.io/docs/operating/security/ +<https://prometheus.io/docs/operating/security/> diff --git a/collector/arp_linux.go b/collector/arp_linux.go index 7592057d93..84216fe975 100644 --- a/collector/arp_linux.go +++ b/collector/arp_linux.go @@ -34,7 +34,7 @@ var ( ) type arpCollector struct { - deviceFilter netDevFilter + deviceFilter devFilter entries *prometheus.Desc logger log.Logger } @@ -46,7 +46,7 @@ func init() { // NewARPCollector returns a new Collector exposing ARP stats. 
func NewARPCollector(logger log.Logger) (Collector, error) { return &arpCollector{ - deviceFilter: newNetDevFilter(*arpDeviceExclude, *arpDeviceInclude), + deviceFilter: newdevFilter(*arpDeviceExclude, *arpDeviceInclude), entries: prometheus.NewDesc( prometheus.BuildFQName(namespace, "arp", "entries"), "ARP entries by device", diff --git a/collector/diskstats_linux.go b/collector/diskstats_linux.go index f0de40776b..41800996ec 100644 --- a/collector/diskstats_linux.go +++ b/collector/diskstats_linux.go @@ -17,9 +17,8 @@ package collector import ( + "errors" "fmt" - "regexp" - "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" @@ -36,7 +35,9 @@ const ( ) var ( - ignoredDevices = kingpin.Flag("collector.diskstats.ignored-devices", "Regexp of devices to ignore for diskstats.").Default("^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$").String() + diskstatsDeviceExclude = kingpin.Flag("collector.diskstats.device-exclude", "Regexp of diskstats devices to exclude (mutually exclusive to device-include).").String() + oldDiskstatsDeviceExclude = kingpin.Flag("collector.diskstats.ignored-devices", "DEPRECATED: Use collector.diskstats.device-exclude").Default("^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$").Hidden().String() + diskstatsDeviceInclude = kingpin.Flag("collector.diskstats.device-include", "Regexp of diskstats devices to include (mutually exclusive to device-exclude).").String() ) type typedFactorDesc struct { @@ -49,11 +50,11 @@ func (d *typedFactorDesc) mustNewConstMetric(value float64, labels ...string) pr } type diskstatsCollector struct { - ignoredDevicesPattern *regexp.Regexp - fs blockdevice.FS - infoDesc typedFactorDesc - descs []typedFactorDesc - logger log.Logger + deviceFilter devFilter + fs blockdevice.FS + infoDesc typedFactorDesc + descs []typedFactorDesc + logger log.Logger } func init() { @@ -68,10 +69,30 @@ func NewDiskstatsCollector(logger log.Logger) (Collector, error) { if err != nil 
{ return nil, fmt.Errorf("failed to open sysfs: %w", err) } + if *oldDiskstatsDeviceExclude != "" { + if *diskstatsDeviceExclude == "" { + level.Warn(logger).Log("msg", "--collector.diskstats.ignored-devices is DEPRECATED and will be removed in 2.0.0, use --collector.diskstats.device-exclude") + *diskstatsDeviceExclude = *oldDiskstatsDeviceExclude + } else { + return nil, errors.New("--collector.diskstats.ignored-devices and --collector.diskstats.device-exclude are mutually exclusive") + } + } + + if *diskstatsDeviceExclude != "" && *diskstatsDeviceInclude != "" { + return nil, errors.New("device-exclude & device-include are mutually exclusive") + } + + if *diskstatsDeviceExclude != "" { + level.Info(logger).Log("msg", "Parsed flag --collector.diskstats.device-exclude", "flag", *diskstatsDeviceExclude) + } + + if *diskstatsDeviceInclude != "" { + level.Info(logger).Log("msg", "Parsed Flag --collector.diskstats.device-include", "flag", *diskstatsDeviceInclude) + } return &diskstatsCollector{ - ignoredDevicesPattern: regexp.MustCompile(*ignoredDevices), - fs: fs, + deviceFilter: newdevFilter(*diskstatsDeviceExclude, *diskstatsDeviceInclude), + fs: fs, infoDesc: typedFactorDesc{ desc: prometheus.NewDesc(prometheus.BuildFQName(namespace, diskSubsystem, "info"), "Info of /sys/block/<block_device>.", @@ -194,11 +215,9 @@ func (c *diskstatsCollector) Update(ch chan<- prometheus.Metric) error { for _, stats := range diskStats { dev := stats.DeviceName - if c.ignoredDevicesPattern.MatchString(dev) { - level.Debug(c.logger).Log("msg", "Ignoring device", "device", dev, "pattern", c.ignoredDevicesPattern) + if c.deviceFilter.ignored(dev) { continue } - ch <- c.infoDesc.mustNewConstMetric(1.0, dev, fmt.Sprint(stats.MajorNumber), fmt.Sprint(stats.MinorNumber)) statCount := stats.IoStatsCount - 3 // Total diskstats record count, less MajorNumber, MinorNumber and DeviceName @@ -227,6 +246,7 @@ func (c *diskstatsCollector) Update(ch chan<- prometheus.Metric) error { } ch <- 
c.descs[i].mustNewConstMetric(val, dev) } + } return nil } diff --git a/collector/ethtool_linux.go b/collector/ethtool_linux.go index 04a7260d2d..9f5193620e 100644 --- a/collector/ethtool_linux.go +++ b/collector/ethtool_linux.go @@ -76,7 +76,7 @@ type ethtoolCollector struct { entries map[string]*prometheus.Desc entriesMutex sync.Mutex ethtool Ethtool - deviceFilter netDevFilter + deviceFilter devFilter infoDesc *prometheus.Desc metricsPattern *regexp.Regexp logger log.Logger @@ -100,7 +100,7 @@ func makeEthtoolCollector(logger log.Logger) (*ethtoolCollector, error) { return &ethtoolCollector{ fs: fs, ethtool: &ethtoolLibrary{e}, - deviceFilter: newNetDevFilter(*ethtoolDeviceExclude, *ethtoolDeviceInclude), + deviceFilter: newdevFilter(*ethtoolDeviceExclude, *ethtoolDeviceInclude), metricsPattern: regexp.MustCompile(*ethtoolIncludedMetrics), logger: logger, entries: map[string]*prometheus.Desc{ diff --git a/collector/netdev_bsd.go b/collector/netdev_bsd.go index d472b2dfeb..41baa9e4f7 100644 --- a/collector/netdev_bsd.go +++ b/collector/netdev_bsd.go @@ -34,7 +34,7 @@ import ( */ import "C" -func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func getNetDevStats(filter *devFilter, logger log.Logger) (netDevStats, error) { netDev := netDevStats{} var ifap, ifa *C.struct_ifaddrs diff --git a/collector/netdev_common.go b/collector/netdev_common.go index 595b34b464..7bedc415f3 100644 --- a/collector/netdev_common.go +++ b/collector/netdev_common.go @@ -86,7 +86,7 @@ func NewNetDevCollector(logger log.Logger) (Collector, error) { return &netDevCollector{ subsystem: "network", - deviceFilter: newNetDevFilter(*netdevDeviceExclude, *netdevDeviceInclude), + deviceFilter: newdevFilter(*netdevDeviceExclude, *netdevDeviceInclude), metricDescs: map[string]*prometheus.Desc{}, logger: logger, }, nil diff --git a/collector/netdev_darwin.go b/collector/netdev_darwin.go index 0f83a60665..b1789d4b43 100644 --- a/collector/netdev_darwin.go +++ 
b/collector/netdev_darwin.go @@ -27,7 +27,7 @@ import ( "golang.org/x/sys/unix" ) -func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func getNetDevStats(filter *devFilter, logger log.Logger) (netDevStats, error) { netDev := netDevStats{} ifs, err := net.Interfaces() diff --git a/collector/netdev_filter.go b/collector/netdev_filter.go index e809c499e3..25ec5b9a1a 100644 --- a/collector/netdev_filter.go +++ b/collector/netdev_filter.go @@ -17,12 +17,12 @@ import ( "regexp" ) -type netDevFilter struct { +type devFilter struct { ignorePattern *regexp.Regexp acceptPattern *regexp.Regexp } -func newNetDevFilter(ignoredPattern, acceptPattern string) (f netDevFilter) { +func newdevFilter(ignoredPattern, acceptPattern string) (f devFilter) { if ignoredPattern != "" { f.ignorePattern = regexp.MustCompile(ignoredPattern) } @@ -35,7 +35,7 @@ func newNetDevFilter(ignoredPattern, acceptPattern string) (f netDevFilter) { } // ignores returns whether the device should be ignored -func (f *netDevFilter) ignored(name string) bool { +func (f *devFilter) ignored(name string) bool { return ((f.ignorePattern != nil && f.ignorePattern.MatchString(name)) || (f.acceptPattern != nil && !f.acceptPattern.MatchString(name))) } diff --git a/collector/netdev_filter_test.go b/collector/netdev_filter_test.go index 13cebbc68a..182364bdb8 100644 --- a/collector/netdev_filter_test.go +++ b/collector/netdev_filter_test.go @@ -17,7 +17,7 @@ import ( "testing" ) -func TestNetDevFilter(t *testing.T) { +func TestDevFilter(t *testing.T) { tests := []struct { ignore string accept string @@ -33,7 +33,7 @@ func TestNetDevFilter(t *testing.T) { } for _, test := range tests { - filter := newNetDevFilter(test.ignore, test.accept) + filter := newdevFilter(test.ignore, test.accept) result := filter.ignored(test.name) if result != test.expectedResult { diff --git a/collector/netdev_linux.go b/collector/netdev_linux.go index e825db0855..88fa575af6 100644 --- 
a/collector/netdev_linux.go +++ b/collector/netdev_linux.go @@ -34,7 +34,7 @@ var ( procNetDevFieldSep = regexp.MustCompile(` +`) ) -func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func getNetDevStats(filter *devFilter, logger log.Logger) (netDevStats, error) { file, err := os.Open(procFilePath("net/dev")) if err != nil { return nil, err @@ -44,7 +44,7 @@ func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error return parseNetDevStats(file, filter, logger) } -func parseNetDevStats(r io.Reader, filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func parseNetDevStats(r io.Reader, filter *devFilter, logger log.Logger) (netDevStats, error) { scanner := bufio.NewScanner(r) scanner.Scan() // skip first header scanner.Scan() diff --git a/collector/netdev_linux_test.go b/collector/netdev_linux_test.go index 8e227a2463..5032fd1e3b 100644 --- a/collector/netdev_linux_test.go +++ b/collector/netdev_linux_test.go @@ -27,7 +27,7 @@ func TestNetDevStatsIgnore(t *testing.T) { } defer file.Close() - filter := newNetDevFilter("^veth", "") + filter := newdevFilter("^veth", "") netStats, err := parseNetDevStats(file, &filter, log.NewNopLogger()) if err != nil { @@ -70,7 +70,7 @@ func TestNetDevStatsAccept(t *testing.T) { } defer file.Close() - filter := newNetDevFilter("", "^💩0$") + filter := newdevFilter("", "^💩0$") netStats, err := parseNetDevStats(file, &filter, log.NewNopLogger()) if err != nil { t.Fatal(err) diff --git a/collector/netdev_openbsd.go b/collector/netdev_openbsd.go index 2be10a3d6c..eb1f472c72 100644 --- a/collector/netdev_openbsd.go +++ b/collector/netdev_openbsd.go @@ -31,7 +31,7 @@ import ( */ import "C" -func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func getNetDevStats(filter *devFilter, logger log.Logger) (netDevStats, error) { netDev := netDevStats{} var ifap, ifa *C.struct_ifaddrs diff --git a/collector/netdev_openbsd_amd64.go 
b/collector/netdev_openbsd_amd64.go index 8b2bfa0e73..f5ada40cbe 100644 --- a/collector/netdev_openbsd_amd64.go +++ b/collector/netdev_openbsd_amd64.go @@ -24,7 +24,7 @@ import ( "unsafe" ) -func getNetDevStats(filter *netDevFilter, logger log.Logger) (netDevStats, error) { +func getNetDevStats(filter *devFilter, logger log.Logger) (netDevStats, error) { netDev := netDevStats{} mib := [6]_C_int{unix.CTL_NET, unix.AF_ROUTE, 0, 0, unix.NET_RT_IFLIST, 0} diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 57e585b8bc..898c912d56 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -1,256 +1,7 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local promgrafonnet = import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/promgrafonnet/promgrafonnet.libsonnet'; -local gauge = promgrafonnet.gauge; - { + local nodemixin = import '../lib/prom-mixin.libsonnet', grafanaDashboards+:: { - 'nodes.json': - local idleCPU = - graphPanel.new( - 'CPU Usage', - datasource='$datasource', - span=6, - format='percentunit', - max=1, - min=0, - stack=true, - ) - .addTarget(prometheus.target( - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) - ) - ||| % $._config, - legendFormat='{{cpu}}', - intervalFactor=5, - )); - - local systemLoad = - graphPanel.new( - 'Load Average', - datasource='$datasource', - span=6, - format='short', - min=0, - fill=0, - ) - 
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average')) - .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % $._config, legendFormat='logical cores')); - - local memoryGraph = - graphPanel.new( - 'Memory Usage', - datasource='$datasource', - span=9, - format='bytes', - stack=true, - min=0, - ) - .addTarget(prometheus.target( - ||| - ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) - ||| % $._config, legendFormat='memory used' - )) - .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) - .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); - - // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. - // This needs to be added upstream in the promgrafonnet library and then changed here. - // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. 
- local memoryGauge = gauge.new( - 'Memory Usage', - ||| - 100 - - ( - avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) - / - avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - * 100 - ) - ||| % $._config, - ).withLowerBeingBetter(); - - local diskIO = - graphPanel.new( - 'Disk I/O', - datasource='$datasource', - span=6, - min=0, - fill=0, - ) - // TODO: Does it make sense to have those three in the same panel? - .addTarget(prometheus.target( - 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} read', - )) - .addTarget(prometheus.target( - 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} written', - )) - .addTarget(prometheus.target( - 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % $._config, - legendFormat='{{device}} io time', - )) + - { - seriesOverrides: [ - { - alias: '/ read| written/', - yaxis: 1, - }, - { - alias: '/ io time/', - yaxis: 2, - }, - ], - yaxes: [ - self.yaxe(format='bytes'), - self.yaxe(format='s'), - ], - }; - - // TODO: Somehow partition this by device while excluding read-only devices. 
- local diskSpaceUsage = - graphPanel.new( - 'Disk Space Usage', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=1, - stack=true, - ) - .addTarget(prometheus.target( - ||| - sum( - max by (device) ( - node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - - - node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - ) - ) - ||| % $._config, - legendFormat='used', - )) - .addTarget(prometheus.target( - ||| - sum( - max by (device) ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} - ) - ) - ||| % $._config, - legendFormat='available', - )) + - { - seriesOverrides: [ - { - alias: 'used', - color: '#E0B400', - }, - { - alias: 'available', - color: '#73BF69', - }, - ], - }; - - local networkReceived = - graphPanel.new( - 'Network Received', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % $._config, - legendFormat='{{device}}', - )); - - local networkTransmitted = - graphPanel.new( - 'Network Transmitted', - datasource='$datasource', - span=6, - format='bytes', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % $._config, - legendFormat='{{device}}', - )); - - dashboard.new( - '%sNodes' % $._config.dashboardNamePrefix, - time_from='now-1h', - tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' - ) - .addTemplate( - { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - ) - .addTemplate( - template.new( - 'instance', - 
'$datasource', - 'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config, - refresh='time', - ) - ) - .addRow( - row.new() - .addPanel(idleCPU) - .addPanel(systemLoad) - ) - .addRow( - row.new() - .addPanel(memoryGraph) - .addPanel(memoryGauge) - ) - .addRow( - row.new() - .addPanel(diskIO) - .addPanel(diskSpaceUsage) - ) - .addRow( - row.new() - .addPanel(networkReceived) - .addPanel(networkTransmitted) - ), + 'nodes.json': nodemixin.new(config=$._config, platform='Linux').dashboard, + 'nodes-darwin.json': nodemixin.new(config=$._config, platform='Darwin').dashboard, }, } diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 46ebffe47c..721d4833a0 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -13,17 +13,8 @@ { "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "grafana-builder" - } - }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", - "subdir": "lib/promgrafonnet" + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet-7.0" } }, "version": "master" diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet new file mode 100644 index 0000000000..e545651d08 --- /dev/null +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -0,0 +1,353 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; +local gaugePanel = grafana70.panel.gauge; + +{ + + new(config=null, platform=null):: { + + local prometheusDatasourceTemplate = { + current: { + text: 'default', + value: 'default', + 
}, + hide: 0, + label: 'Data Source', + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + + local instanceTemplatePrototype = + template.new( + 'instance', + '$datasource', + '', + refresh='time', + label='Instance', + ), + local instanceTemplate = + if platform == 'Darwin' then + instanceTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, instance)' % config } + else + instanceTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, instance)' % config }, + + + local idleCPU = + graphPanel.new( + 'CPU Usage', + datasource='$datasource', + span=6, + format='percentunit', + max=1, + min=0, + stack=true, + ) + .addTarget(prometheus.target( + ||| + ( + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) + ) + ||| % config, + legendFormat='{{cpu}}', + intervalFactor=5, + )), + + local systemLoad = + graphPanel.new( + 'Load Average', + datasource='$datasource', + span=6, + format='short', + min=0, + fill=0, + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='1m load average')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='5m load average')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='15m load average')) + .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % config, legendFormat='logical cores')), + + local memoryGraphPanelPrototype = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', 
+ span=9, + format='bytes', + min=0, + ), + local memoryGraph = + if platform == 'Linux' then + memoryGraphPanelPrototype { stack: true } + .addTarget(prometheus.target( + ||| + ( + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, + legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory free')) + else if platform == 'Darwin' then + // not useful to stack + memoryGraphPanelPrototype { stack: false } + .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Physical Memory')) + .addTarget(prometheus.target( + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + + node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"} + + node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, legendFormat='Memory Used' + )) + .addTarget(prometheus.target( + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + ) + ||| % config, legendFormat='App Memory' + )) + .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, 
instance="$instance"}' % config, legendFormat='Wired Memory')) + .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Compressed')), + + // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. + local memoryGaugePanelPrototype = + gaugePanel.new( + title='Memory Usage', + datasource='$datasource', + ) + .addThresholdStep('rgba(50, 172, 45, 0.97)') + .addThresholdStep('rgba(237, 129, 40, 0.89)', 80) + .addThresholdStep('rgba(245, 54, 54, 0.9)', 90) + .setFieldConfig(max=100, min=0, unit='percent') + + { + span: 3, + }, + + local memoryGauge = + if platform == 'Linux' then + memoryGaugePanelPrototype + + .addTarget(prometheus.target( + ||| + 100 - + ( + avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) / + avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) + * 100 + ) + ||| % config, + )) + + else if platform == 'Darwin' then + memoryGaugePanelPrototype + .addTarget(prometheus.target( + ||| + ( + ( + avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - + avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"}) + + avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}) + + avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}) + ) / + avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}) + ) + * + 100 + ||| % config + )), + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=6, + min=0, + fill=0, + ) + // TODO: Does it make sense to have those three in the same panel? 
+ .addTarget(prometheus.target( + 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} read', + )) + .addTarget(prometheus.target( + 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} written', + )) + .addTarget(prometheus.target( + 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + legendFormat='{{device}} io time', + )) + + { + seriesOverrides: [ + { + alias: '/ read| written/', + yaxis: 1, + }, + { + alias: '/ io time/', + yaxis: 2, + }, + ], + yaxes: [ + self.yaxe(format='bytes'), + self.yaxe(format='s'), + ], + }, + + // TODO: Somehow partition this by device while excluding read-only devices. + local diskSpaceUsage = + graphPanel.new( + 'Disk Space Usage', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=1, + stack=true, + ) + .addTarget(prometheus.target( + ||| + sum( + max by (device) ( + node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + - + node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + ) + ) + ||| % config, + legendFormat='used', + )) + .addTarget(prometheus.target( + ||| + sum( + max by (device) ( + node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s} + ) + ) + ||| % config, + legendFormat='available', + )) + + { + seriesOverrides: [ + { + alias: 'used', + color: '#E0B400', + }, + { + alias: 'available', + color: '#73BF69', + }, + ], + }, + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=0, + ) + .addTarget(prometheus.target( + 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, 
instance="$instance", device!="lo"}[$__rate_interval])' % config, + legendFormat='{{device}}', + )), + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + min=0, + fill=0, + ) + .addTarget(prometheus.target( + 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval])' % config, + legendFormat='{{device}}', + )), + + local cpuRow = + row.new('CPU') + .addPanel(idleCPU) + .addPanel(systemLoad), + + local memoryRow = + row.new('Memory') + .addPanel(memoryGraph) + .addPanel(memoryGauge), + + local diskRow = + row.new('Disk') + .addPanel(diskIO) + .addPanel(diskSpaceUsage), + + local networkRow = + row.new('Network') + .addPanel(networkReceived) + .addPanel(networkTransmitted), + + local rows = + [ + cpuRow, + memoryRow, + diskRow, + networkRow, + ], + + local templates = + [ + prometheusDatasourceTemplate, + instanceTemplate, + ], + + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNodes' % config.dashboardNamePrefix, + time_from='now-1h', + tags=(config.dashboardTags), + timezone='utc', + refresh='30s', + graphTooltip='shared_crosshair' + ) + .addTemplates(templates) + .addRows(rows) + else if platform == 'Darwin' then + dashboard.new( + '%sMacOS' % config.dashboardNamePrefix, + time_from='now-1h', + tags=(config.dashboardTags), + timezone='utc', + refresh='30s', + graphTooltip='shared_crosshair' + ) + .addTemplates(templates) + .addRows(rows), + + }, +}