From b17a8728b6d37bd14ba5275bb29f3908d3acf53c Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Mon, 14 Oct 2019 14:33:48 -0700 Subject: [PATCH 01/11] add a map of profilers to CPUids `runtime.NumCPU()` returns the number of CPUs that the process can run on. This number does not necessarily correlate to CPU ids if the affinity mask of the process is set. This change maintains the current behavior as default, but also allows the user to specify a range of CPUids to use instead. The CPU id is stored as the value of a map keyed on the profiler object's address. Signed-off-by: Joe Damato --- CHANGELOG.md | 1 + README.md | 7 ++++ collector/perf_linux.go | 80 +++++++++++++++++++++++++++++++++-------- 3 files changed, 74 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eaf77306a..e07e9f5a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Changes +* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats. * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 * [CHANGE] Refactor mdadm collector #1403 * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 diff --git a/README.md b/README.md index 7a4b9f407f..ff54260448 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,13 @@ Depending on the configured value different metrics will be available, for most cases `0` will provide the most complete set. For more information see [`man 2 perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). +By default, the perf collector will only collect metrics of the CPUs that +`node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with +its CPU affinity set to specific CPUs) You can specify a list of alternate CPUs by using the +`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you +would specify: `--collector.perf --collector.perf.cpus=2-6`. + + Name | Description | OS ---------|-------------|---- buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux diff --git a/collector/perf_linux.go b/collector/perf_linux.go index 0ab7b84f98..b6eae0207f 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -15,16 +15,25 @@ package collector import ( "fmt" - "runtime" - perf "github.com/hodgesds/perf-utils" "github.com/prometheus/client_golang/prometheus" + kingpin "gopkg.in/alecthomas/kingpin.v2" + "runtime" + "strconv" + "strings" ) const ( perfSubsystem = "perf" ) +var ( + cpus = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() + hwProfilerCpuMap = make(map[*perf.HardwareProfiler]int) + swProfilerCpuMap = make(map[*perf.SoftwareProfiler]int) + cacheProfilerCpuMap = make(map[*perf.CacheProfiler]int) +) + func init() { registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) } @@ -41,6 +50,14 @@ type perfCollector struct { desc map[string]*prometheus.Desc } +func isValidCPUString(cpus *string) bool { + if cpus == nil || *cpus == "" || !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { + return false + } + + return true +} + // NewPerfCollector returns a new perf based collector, it creates a profiler // per CPU. func NewPerfCollector() (Collector, error) { @@ -49,23 +66,55 @@ func NewPerfCollector() (Collector, error) { perfSwProfilers: map[int]perf.SoftwareProfiler{}, perfCacheProfilers: map[int]perf.CacheProfiler{}, } - ncpus := runtime.NumCPU() - for i := 0; i < ncpus; i++ { + + start := 0 + ncpus := 0 + var err error + + if !isValidCPUString(cpus) { + start = 0 + ncpus = runtime.NumCPU() - 1 + } else { + cpu_range := strings.Split(*cpus, "-") + start, err = strconv.Atoi(cpu_range[0]) + if err != nil { + start = 0 + } + + ncpus, err = strconv.Atoi(cpu_range[1]) + if err != nil { + ncpus = runtime.NumCPU() - 1 + } + } + + for i, idx := start, 0; i <= ncpus; i, idx = i+1, idx+1 { // Use -1 to profile all processes on the CPU, see: // man perf_event_open - collector.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) - if err := collector.perfHwProfilers[i].Start(); err != nil { + p := perf.NewHardwareProfiler(-1, i) + collector.perfHwProfilers[idx] = p + if err := collector.perfHwProfilers[idx].Start(); err != nil { return collector, err + } else { + hwProfilerCpuMap[&p] = i } - collector.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) + + p2 := perf.NewSoftwareProfiler(-1, i) + collector.perfSwProfilers[i] = p2 if err := collector.perfSwProfilers[i].Start(); err != nil { return collector, err + } else { + swProfilerCpuMap[&p2] = i } - collector.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) + + p3 := perf.NewCacheProfiler(-1, i) + collector.perfCacheProfilers[i] = p3 if err := collector.perfCacheProfilers[i].Start(); err != nil { return collector, err + } else { + cacheProfilerCpuMap[&p3] = i } } + collector.desc = map[string]*prometheus.Desc{ "cpucycles_total": prometheus.NewDesc( prometheus.BuildFQName( @@ -330,8 +379,9 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfHwProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfHwProfilers { + cpuid := hwProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) hwProfile, err := profiler.Profile() if err != nil { return err @@ -401,8 +451,9 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfSwProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfSwProfilers { + cpuid := swProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) swProfile, err := profiler.Profile() if err != nil { return err @@ -456,8 +507,9 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfCacheProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfCacheProfilers { + cpuid := cacheProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) cacheProfile, err := profiler.Profile() if err != nil { return err From 27381c4a2e909a86a5ae2b76d54f3b8a8637902d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 11:31:23 -0700 Subject: [PATCH 02/11] Fix scoping on CPU id maps and style cleanup Signed-off-by: Joe Damato --- collector/perf_linux.go | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index b6eae0207f..a4db7f90d7 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -14,6 +14,7 @@ package collector import ( + "errors" "fmt" perf "github.com/hodgesds/perf-utils" "github.com/prometheus/client_golang/prometheus" @@ -28,10 +29,7 @@ const ( ) var ( - cpus = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() - hwProfilerCpuMap = make(map[*perf.HardwareProfiler]int) - swProfilerCpuMap = make(map[*perf.SoftwareProfiler]int) - cacheProfilerCpuMap = make(map[*perf.CacheProfiler]int) + perfCpusFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() ) func init() { @@ -44,14 +42,17 @@ func init() { // settings not all profiler values may be exposed on the target system at any // given time. type perfCollector struct { - perfHwProfilers map[int]perf.HardwareProfiler - perfSwProfilers map[int]perf.SoftwareProfiler - perfCacheProfilers map[int]perf.CacheProfiler - desc map[string]*prometheus.Desc + hwProfilerCpuMap map[*perf.HardwareProfiler]int + swProfilerCpuMap map[*perf.SoftwareProfiler]int + cacheProfilerCpuMap map[*perf.CacheProfiler]int + perfHwProfilers map[int]perf.HardwareProfiler + perfSwProfilers map[int]perf.SoftwareProfiler + perfCacheProfilers map[int]perf.CacheProfiler + desc map[string]*prometheus.Desc } func isValidCPUString(cpus *string) bool { - if cpus == nil || *cpus == "" || !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { + if !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { return false } @@ -71,17 +72,19 @@ func NewPerfCollector() (Collector, error) { ncpus := 0 var err error - if !isValidCPUString(cpus) { + if perfCpusFlag == nil || *perfCpusFlag == "" { start = 0 ncpus = runtime.NumCPU() - 1 + } else if !isValidCPUString(perfCpusFlag) { + return nil, errors.New("--collector.perf.cpus flag value is invalid, it must be a range (e.g. 2-6)") } else { - cpu_range := strings.Split(*cpus, "-") - start, err = strconv.Atoi(cpu_range[0]) + cpuRange := strings.Split(*perfCpusFlag, "-") + start, err = strconv.Atoi(cpuRange[0]) if err != nil { start = 0 } - ncpus, err = strconv.Atoi(cpu_range[1]) + ncpus, err = strconv.Atoi(cpuRange[1]) if err != nil { ncpus = runtime.NumCPU() - 1 } @@ -95,7 +98,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfHwProfilers[idx].Start(); err != nil { return collector, err } else { - hwProfilerCpuMap[&p] = i + collector.hwProfilerCpuMap[&p] = i } p2 := perf.NewSoftwareProfiler(-1, i) @@ -103,7 +106,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfSwProfilers[i].Start(); err != nil { return collector, err } else { - swProfilerCpuMap[&p2] = i + collector.swProfilerCpuMap[&p2] = i } p3 := perf.NewCacheProfiler(-1, i) @@ -111,7 +114,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfCacheProfilers[i].Start(); err != nil { return collector, err } else { - cacheProfilerCpuMap[&p3] = i + collector.cacheProfilerCpuMap[&p3] = i } } @@ -380,7 +383,7 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfHwProfilers { - cpuid := hwProfilerCpuMap[&profiler] + cpuid := c.hwProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) hwProfile, err := profiler.Profile() if err != nil { @@ -452,7 +455,7 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfSwProfilers { - cpuid := swProfilerCpuMap[&profiler] + cpuid := c.swProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) swProfile, err := profiler.Profile() if err != nil { @@ -508,7 +511,7 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfCacheProfilers { - cpuid := cacheProfilerCpuMap[&profiler] + cpuid := c.cacheProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) cacheProfile, err := profiler.Profile() if err != nil { From 341322ef552935ee4ab56e3d1913bf993a7fea0f Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 13:09:07 -0700 Subject: [PATCH 03/11] Initialize CPU maps Signed-off-by: Joe Damato --- collector/perf_linux.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index a4db7f90d7..bde584458f 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -63,9 +63,12 @@ func isValidCPUString(cpus *string) bool { // per CPU. func NewPerfCollector() (Collector, error) { collector := &perfCollector{ - perfHwProfilers: map[int]perf.HardwareProfiler{}, - perfSwProfilers: map[int]perf.SoftwareProfiler{}, - perfCacheProfilers: map[int]perf.CacheProfiler{}, + perfHwProfilers: map[int]perf.HardwareProfiler{}, + perfSwProfilers: map[int]perf.SoftwareProfiler{}, + perfCacheProfilers: map[int]perf.CacheProfiler{}, + hwProfilerCpuMap: map[*perf.HardwareProfiler]int{}, + swProfilerCpuMap: map[*perf.SoftwareProfiler]int{}, + cacheProfilerCpuMap: map[*perf.CacheProfiler]int{}, } start := 0 From 573cf020fe6fa3a195ce322ec7dbed487769701c Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 13:35:30 -0700 Subject: [PATCH 04/11] Use perf Profiler pointers instead Signed-off-by: Joe Damato --- collector/perf_linux.go | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index bde584458f..348a1b51fd 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -45,9 +45,9 @@ type perfCollector struct { hwProfilerCpuMap map[*perf.HardwareProfiler]int swProfilerCpuMap map[*perf.SoftwareProfiler]int cacheProfilerCpuMap map[*perf.CacheProfiler]int - perfHwProfilers map[int]perf.HardwareProfiler - perfSwProfilers map[int]perf.SoftwareProfiler - perfCacheProfilers map[int]perf.CacheProfiler + perfHwProfilers map[int]*perf.HardwareProfiler + perfSwProfilers map[int]*perf.SoftwareProfiler + perfCacheProfilers map[int]*perf.CacheProfiler desc map[string]*prometheus.Desc } @@ -63,9 +63,9 @@ func isValidCPUString(cpus *string) bool { // per CPU. func NewPerfCollector() (Collector, error) { collector := &perfCollector{ - perfHwProfilers: map[int]perf.HardwareProfiler{}, - perfSwProfilers: map[int]perf.SoftwareProfiler{}, - perfCacheProfilers: map[int]perf.CacheProfiler{}, + perfHwProfilers: map[int]*perf.HardwareProfiler{}, + perfSwProfilers: map[int]*perf.SoftwareProfiler{}, + perfCacheProfilers: map[int]*perf.CacheProfiler{}, hwProfilerCpuMap: map[*perf.HardwareProfiler]int{}, swProfilerCpuMap: map[*perf.SoftwareProfiler]int{}, cacheProfilerCpuMap: map[*perf.CacheProfiler]int{}, @@ -97,24 +97,24 @@ func NewPerfCollector() (Collector, error) { // Use -1 to profile all processes on the CPU, see: // man perf_event_open p := perf.NewHardwareProfiler(-1, i) - collector.perfHwProfilers[idx] = p - if err := collector.perfHwProfilers[idx].Start(); err != nil { + collector.perfHwProfilers[idx] = &p + if err := p.Start(); err != nil { return collector, err } else { collector.hwProfilerCpuMap[&p] = i } p2 := perf.NewSoftwareProfiler(-1, i) - collector.perfSwProfilers[i] = p2 - if err := collector.perfSwProfilers[i].Start(); err != nil { + collector.perfSwProfilers[i] = &p2 + if err := p2.Start(); err != nil { return collector, err } else { collector.swProfilerCpuMap[&p2] = i } p3 := perf.NewCacheProfiler(-1, i) - collector.perfCacheProfilers[i] = p3 - if err := collector.perfCacheProfilers[i].Start(); err != nil { + collector.perfCacheProfilers[i] = &p3 + if err := p3.Start(); err != nil { return collector, err } else { collector.cacheProfilerCpuMap[&p3] = i @@ -386,9 +386,9 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfHwProfilers { - cpuid := c.hwProfilerCpuMap[&profiler] + cpuid := c.hwProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - hwProfile, err := profiler.Profile() + hwProfile, err := (*profiler).Profile() if err != nil { return err } @@ -458,9 +458,9 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfSwProfilers { - cpuid := c.swProfilerCpuMap[&profiler] + cpuid := c.swProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - swProfile, err := profiler.Profile() + swProfile, err := (*profiler).Profile() if err != nil { return err } @@ -514,9 +514,9 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfCacheProfilers { - cpuid := c.cacheProfilerCpuMap[&profiler] + cpuid := c.cacheProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - cacheProfile, err := profiler.Profile() + cacheProfile, err := (*profiler).Profile() if err != nil { return err } From b813bf1ebac5bdf12fecc06b6df765636f5f38ce Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 16:04:06 -0700 Subject: [PATCH 05/11] Update docs to include 0-indexed CPU ids Signed-off-by: Joe Damato --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ff54260448..7bd8d0a0a4 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,8 @@ By default, the perf collector will only collect metrics of the CPUs that `node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with its CPU affinity set to specific CPUs) You can specify a list of alternate CPUs by using the `--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you -would specify: `--collector.perf --collector.perf.cpus=2-6`. +would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU ids start +at 0. Name | Description | OS From 0ded6624c1898424b707b2ecf4a2e59d8c9b872f Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Sat, 30 Nov 2019 16:27:05 -0500 Subject: [PATCH 06/11] Fix linting issues Signed-off-by: Daniel Hodges --- collector/perf_linux.go | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index 348a1b51fd..3c97e44e74 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -16,12 +16,13 @@ package collector import ( "errors" "fmt" - perf "github.com/hodgesds/perf-utils" - "github.com/prometheus/client_golang/prometheus" - kingpin "gopkg.in/alecthomas/kingpin.v2" "runtime" "strconv" "strings" + + perf "github.com/hodgesds/perf-utils" + "github.com/prometheus/client_golang/prometheus" + kingpin "gopkg.in/alecthomas/kingpin.v2" ) const ( @@ -42,9 +43,9 @@ func init() { // settings not all profiler values may be exposed on the target system at any // given time. type perfCollector struct { - hwProfilerCpuMap map[*perf.HardwareProfiler]int - swProfilerCpuMap map[*perf.SoftwareProfiler]int - cacheProfilerCpuMap map[*perf.CacheProfiler]int + hwProfilerCPUMap map[*perf.HardwareProfiler]int + swProfilerCPUMap map[*perf.SoftwareProfiler]int + cacheProfilerCPUMap map[*perf.CacheProfiler]int perfHwProfilers map[int]*perf.HardwareProfiler perfSwProfilers map[int]*perf.SoftwareProfiler perfCacheProfilers map[int]*perf.CacheProfiler @@ -66,9 +67,9 @@ func NewPerfCollector() (Collector, error) { perfHwProfilers: map[int]*perf.HardwareProfiler{}, perfSwProfilers: map[int]*perf.SoftwareProfiler{}, perfCacheProfilers: map[int]*perf.CacheProfiler{}, - hwProfilerCpuMap: map[*perf.HardwareProfiler]int{}, - swProfilerCpuMap: map[*perf.SoftwareProfiler]int{}, - cacheProfilerCpuMap: map[*perf.CacheProfiler]int{}, + hwProfilerCPUMap: map[*perf.HardwareProfiler]int{}, + swProfilerCPUMap: map[*perf.SoftwareProfiler]int{}, + cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, } start := 0 @@ -100,25 +101,22 @@ func NewPerfCollector() (Collector, error) { collector.perfHwProfilers[idx] = &p if err := p.Start(); err != nil { return collector, err - } else { - collector.hwProfilerCpuMap[&p] = i } + collector.hwProfilerCPUMap[&p] = i p2 := perf.NewSoftwareProfiler(-1, i) collector.perfSwProfilers[i] = &p2 if err := p2.Start(); err != nil { return collector, err - } else { - collector.swProfilerCpuMap[&p2] = i } + collector.swProfilerCPUMap[&p2] = i p3 := perf.NewCacheProfiler(-1, i) collector.perfCacheProfilers[i] = &p3 if err := p3.Start(); err != nil { return collector, err - } else { - collector.cacheProfilerCpuMap[&p3] = i } + collector.cacheProfilerCPUMap[&p3] = i } collector.desc = map[string]*prometheus.Desc{ @@ -386,7 +384,7 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfHwProfilers { - cpuid := c.hwProfilerCpuMap[profiler] + cpuid := c.hwProfilerCPUMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) hwProfile, err := (*profiler).Profile() if err != nil { @@ -458,7 +456,7 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfSwProfilers { - cpuid := c.swProfilerCpuMap[profiler] + cpuid := c.swProfilerCPUMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) swProfile, err := (*profiler).Profile() if err != nil { @@ -514,7 +512,7 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfCacheProfilers { - cpuid := c.cacheProfilerCpuMap[profiler] + cpuid := c.cacheProfilerCPUMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) cacheProfile, err := (*profiler).Profile() if err != nil { From b073530bab4f8801abcdc224d7861e4983ea205a Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Sat, 30 Nov 2019 16:51:46 -0500 Subject: [PATCH 07/11] Update readme and add tests Signed-off-by: Daniel Hodges --- README.md | 12 ++-- collector/perf_linux.go | 133 ++++++++++++++++++++++------------- collector/perf_linux_test.go | 73 +++++++++++++++++++ 3 files changed, 166 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 7bd8d0a0a4..6ea96eb5f6 100644 --- a/README.md +++ b/README.md @@ -83,11 +83,15 @@ cases `0` will provide the most complete set. For more information see [`man 2 perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). By default, the perf collector will only collect metrics of the CPUs that -`node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with -its CPU affinity set to specific CPUs) You can specify a list of alternate CPUs by using the +`node_exporter` is running on (ie +[`runtime.NumCPU`](https://golang.org/pkg/runtime/#NumCPU). If this is +insufficient (e.g. if you run `node_exporter` with its CPU affinity set to +specific CPUs) You can specify a list of alternate CPUs by using the `--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you -would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU ids start -at 0. +would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU +configuration is zero indexed and can also take a stride value +`--collector.perf --collector.perf.cpus=1-4:10`, would collect on CPUs +1,2,3,4,10. Name | Description | OS diff --git a/collector/perf_linux.go b/collector/perf_linux.go index 3c97e44e74..c9fbe7f684 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -14,7 +14,6 @@ package collector import ( - "errors" "fmt" "runtime" "strconv" @@ -30,7 +29,7 @@ const ( ) var ( - perfCpusFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() + perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() ) func init() { @@ -52,12 +51,49 @@ type perfCollector struct { desc map[string]*prometheus.Desc } -func isValidCPUString(cpus *string) bool { - if !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { - return false +// perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor. +func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) { + var err error + cpus := []int{} + for _, subset := range strings.Split(cpuFlag, ",") { + // First parse a single CPU. + if !strings.Contains(subset, "-") { + cpu, err := strconv.Atoi(subset) + if err != nil { + return nil, err + } + cpus = append(cpus, cpu) + continue + } + + stride := 1 + // Handle strides, ie 1-10:5 should yield 1,5,10 + strideSet := strings.Split(subset, ":") + if len(strideSet) == 2 { + stride, err = strconv.Atoi(strideSet[1]) + if err != nil { + return nil, err + } + } + + rangeSet := strings.Split(strideSet[0], "-") + if len(rangeSet) != 2 { + return nil, fmt.Errorf("invalid flag value %q", cpuFlag) + } + start, err := strconv.Atoi(rangeSet[0]) + if err != nil { + return nil, err + } + end, err := strconv.Atoi(rangeSet[1]) + if err != nil { + return nil, err + } + for i := start; i <= end; i += stride { + cpus = append(cpus, i) + } } - return true + return cpus, nil } // NewPerfCollector returns a new perf based collector, it creates a profiler @@ -72,51 +108,52 @@ func NewPerfCollector() (Collector, error) { cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, } - start := 0 - ncpus := 0 - var err error - - if perfCpusFlag == nil || *perfCpusFlag == "" { - start = 0 - ncpus = runtime.NumCPU() - 1 - } else if !isValidCPUString(perfCpusFlag) { - return nil, errors.New("--collector.perf.cpus flag value is invalid, it must be a range (e.g. 2-6)") - } else { - cpuRange := strings.Split(*perfCpusFlag, "-") - start, err = strconv.Atoi(cpuRange[0]) - if err != nil { - start = 0 - } - - ncpus, err = strconv.Atoi(cpuRange[1]) + if perfCPUsFlag != nil && *perfCPUsFlag != "" { + cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag) if err != nil { - ncpus = runtime.NumCPU() - 1 - } - } - - for i, idx := start, 0; i <= ncpus; i, idx = i+1, idx+1 { - // Use -1 to profile all processes on the CPU, see: - // man perf_event_open - p := perf.NewHardwareProfiler(-1, i) - collector.perfHwProfilers[idx] = &p - if err := p.Start(); err != nil { - return collector, err - } - collector.hwProfilerCPUMap[&p] = i - - p2 := perf.NewSoftwareProfiler(-1, i) - collector.perfSwProfilers[i] = &p2 - if err := p2.Start(); err != nil { - return collector, err + return nil, err + } + for _, cpu := range cpus { + // Use -1 to profile all processes on the CPU, see: + // man perf_event_open + hwProf := perf.NewHardwareProfiler(-1, cpu) + if err := hwProf.Start(); err != nil { + return nil, err + } + collector.perfHwProfilers[cpu] = &hwProf + + swProf := perf.NewSoftwareProfiler(-1, cpu) + if err := swProf.Start(); err != nil { + return nil, err + } + collector.perfSwProfilers[cpu] = &swProf + + cacheProf := perf.NewCacheProfiler(-1, cpu) + if err := cacheProf.Start(); err != nil { + return nil, err + } + collector.perfCacheProfilers[cpu] = &cacheProf } - collector.swProfilerCPUMap[&p2] = i - - p3 := perf.NewCacheProfiler(-1, i) - collector.perfCacheProfilers[i] = &p3 - if err := p3.Start(); err != nil { - return collector, err + } else { + for i := 0; i < runtime.NumCPU(); i++ { + hwProf := perf.NewHardwareProfiler(-1, i) + if err := hwProf.Start(); err != nil { + return nil, err + } + collector.perfHwProfilers[i] = &hwProf + + swProf := perf.NewSoftwareProfiler(-1, i) + if err := swProf.Start(); err != nil { + return nil, err + } + collector.perfSwProfilers[i] = &swProf + + cacheProf := perf.NewCacheProfiler(-1, i) + if err := cacheProf.Start(); err != nil { + return nil, err + } + collector.perfCacheProfilers[i] = &cacheProf } - collector.cacheProfilerCPUMap[&p3] = i } collector.desc = map[string]*prometheus.Desc{ diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go index 0b57d101dd..05aa0b5546 100644 --- a/collector/perf_linux_test.go +++ b/collector/perf_linux_test.go @@ -53,3 +53,76 @@ func TestPerfCollector(t *testing.T) { t.Fatal(err) } } + +func TestPerfCPUFlagToCPUs(t *testing.T) { + tests := []struct { + name string + flag string + exCpus []int + errStr string + }{ + { + name: "valid single cpu", + flag: "1", + exCpus: []int{1}, + }, + { + name: "valid range cpus", + flag: "1-5", + exCpus: []int{1, 2, 3, 4, 5}, + }, + { + name: "valid double digit", + flag: "10", + exCpus: []int{10}, + }, + { + name: "valid double digit range", + flag: "10-12", + exCpus: []int{10, 11, 12}, + }, + { + name: "valid double digit stride", + flag: "10-20:5", + exCpus: []int{10, 15, 20}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + cpus, err := perfCPUFlagToCPUs(test.flag) + if test.errStr != "" { + if err != nil { + t.Fatal("expected error to not be nil") + } + if test.errStr != err.Error() { + t.Fatalf( + "expected error %q, got %q", + test.errStr, + err.Error(), + ) + } + return + } + if err != nil { + t.Fatal(err) + } + if len(cpus) != len(test.exCpus) { + t.Fatalf( + "expected cpus %v, got %v", + test.exCpus, + cpus, + ) + } + for i := range cpus { + if test.exCpus[i] != cpus[i] { + t.Fatalf( + "expected cpus %v, got %v", + test.exCpus, + cpus, + ) + } + } + }) + } +} From 1fe2b5b5b555063f836aa8c2821936b7b7ccc24c Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Sat, 30 Nov 2019 16:54:12 -0500 Subject: [PATCH 08/11] Update readme Signed-off-by: Daniel Hodges --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ea96eb5f6..cb9016091c 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ specific CPUs) You can specify a list of alternate CPUs by using the would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU configuration is zero indexed and can also take a stride value `--collector.perf --collector.perf.cpus=1-4:10`, would collect on CPUs -1,2,3,4,10. +1, 2, 3, 4, and 10. Name | Description | OS From 1f58ab455af9272b7fb49ce973244131637b55cf Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Sat, 30 Nov 2019 16:58:44 -0500 Subject: [PATCH 09/11] Update readme Signed-off-by: Daniel Hodges --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb9016091c..1a3c3d64e9 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,8 @@ specific CPUs) You can specify a list of alternate CPUs by using the `--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU configuration is zero indexed and can also take a stride value -`--collector.perf --collector.perf.cpus=1-4:10`, would collect on CPUs -1, 2, 3, 4, and 10. +`--collector.perf --collector.perf.cpus=1-10:5`, would collect on CPUs +1, 5, and 10. Name | Description | OS From df73c1d419dc72c3358362b6e192a07c68b0d19b Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Wed, 19 Feb 2020 09:31:33 -0500 Subject: [PATCH 10/11] fix format issues from GH ui Signed-off-by: Daniel Hodges --- collector/perf_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index eb773e8d56..99c8dd0c5d 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -49,7 +49,7 @@ type perfCollector struct { perfSwProfilers map[int]*perf.SoftwareProfiler perfCacheProfilers map[int]*perf.CacheProfiler desc map[string]*prometheus.Desc - logger log.Logger + logger log.Logger } // perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor. From 4db63094e4b9bf30ac3b4f761e31da02cfb9defb Mon Sep 17 00:00:00 2001 From: Daniel Hodges Date: Wed, 19 Feb 2020 11:16:58 -0500 Subject: [PATCH 11/11] fix perf collection initialization Signed-off-by: Daniel Hodges --- collector/perf_linux.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index 99c8dd0c5d..b67f970d7d 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -14,6 +14,7 @@ package collector import ( + "fmt" "runtime" "strconv" "strings" @@ -99,7 +100,7 @@ func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) { // NewPerfCollector returns a new perf based collector, it creates a profiler // per CPU. -func NewPerfCollector() (Collector, error) { +func NewPerfCollector(logger log.Logger) (Collector, error) { collector := &perfCollector{ perfHwProfilers: map[int]*perf.HardwareProfiler{}, perfSwProfilers: map[int]*perf.SoftwareProfiler{}, @@ -107,6 +108,7 @@ func NewPerfCollector() (Collector, error) { hwProfilerCPUMap: map[*perf.HardwareProfiler]int{}, swProfilerCPUMap: map[*perf.SoftwareProfiler]int{}, cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, + logger: logger, } if perfCPUsFlag != nil && *perfCPUsFlag != "" { @@ -400,7 +402,7 @@ func NewPerfCollector() (Collector, error) { ), } - return c, nil + return collector, nil } // Update implements the Collector interface and will collect metrics per CPU.