Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

### Changes

* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats.
* [CHANGE] Add `--collector.netdev.device-whitelist`. #1279
* [CHANGE] Refactor mdadm collector #1403
* [CHANGE] Add `mountaddr` label to NFS metrics. #1417
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ Depending on the configured value different metrics will be available, for most
cases `0` will provide the most complete set. For more information see [`man 2
perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html).

By default, the perf collector will only collect metrics of the CPUs that
`node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with
its CPU affinity set to specific CPUs), you can specify a list of alternate CPUs by using the
`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you
would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU ids start
at 0.


Name | Description | OS
---------|-------------|----
buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux
Expand Down
110 changes: 84 additions & 26 deletions collector/perf_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,24 @@
package collector

import (
"errors"
"fmt"
"runtime"

perf "github.com/hodgesds/perf-utils"
"github.com/prometheus/client_golang/prometheus"
kingpin "gopkg.in/alecthomas/kingpin.v2"
"runtime"
"strconv"
"strings"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally we group all the built-in imports at the top, can you re-format these using goimports?

go get golang.org/x/tools/cmd/goimports
goimports -w ./collector/perf_linux.go

)

// perfSubsystem is the name under which the perf collector is registered
// (it is the first argument passed to registerCollector in init).
const (
perfSubsystem = "perf"
)

var (
// perfCpusFlag holds the value of the --collector.perf.cpus command-line
// flag. It defaults to the empty string; NewPerfCollector treats an empty
// value as "CPUs 0 through runtime.NumCPU()-1" and otherwise expects a
// single "start-end" range such as "2-6".
perfCpusFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String()
)

// init registers the perf collector under the perfSubsystem name, with
// NewPerfCollector as its constructor. It is registered with defaultDisabled,
// so presumably it must be switched on explicitly (e.g. --collector.perf).
func init() {
registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
}
Expand All @@ -35,37 +42,85 @@ func init() {
// settings not all profiler values may be exposed on the target system at any
// given time.
type perfCollector struct {
perfHwProfilers map[int]perf.HardwareProfiler
perfSwProfilers map[int]perf.SoftwareProfiler
perfCacheProfilers map[int]perf.CacheProfiler
desc map[string]*prometheus.Desc
hwProfilerCpuMap map[*perf.HardwareProfiler]int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't expect to keep a map of pointers to interfaces. For example NewSoftwareProfiler returns an interface that can be used directly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, but I think you need them; otherwise you can't build a map from profiler pointers to CPU IDs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confirmed that this change (pointers to interfaces) works correctly, as the address pointed to can be used as a key for the CPU ID map. the code without this change does not work as mentioned here: #1518 (comment)

I think keeping this is correct -- I am using this in a lab setting and am getting the expected results now.

swProfilerCpuMap map[*perf.SoftwareProfiler]int
cacheProfilerCpuMap map[*perf.CacheProfiler]int
perfHwProfilers map[int]*perf.HardwareProfiler
perfSwProfilers map[int]*perf.SoftwareProfiler
perfCacheProfilers map[int]*perf.CacheProfiler
desc map[string]*prometheus.Desc
}

// isValidCPUString reports whether cpus holds a well-formed CPU range of the
// form "start-end" (for example "2-6").
//
// A value is accepted only when it is non-nil, contains exactly one "-",
// both sides parse as integers, and start does not exceed end. Rejecting
// malformed bounds here lets callers fail fast instead of silently
// substituting defaults for input the user mistyped.
func isValidCPUString(cpus *string) bool {
	if cpus == nil {
		return false
	}

	// Splitting on "-" yields exactly two parts only when there is a single
	// separator, which also rules out values such as "2" or "2-4-6".
	bounds := strings.Split(*cpus, "-")
	if len(bounds) != 2 {
		return false
	}

	// An empty side (as in "-3" or "2-") fails to parse and is rejected.
	start, err := strconv.Atoi(bounds[0])
	if err != nil {
		return false
	}
	end, err := strconv.Atoi(bounds[1])
	if err != nil {
		return false
	}

	// An inverted range would select no CPUs at all; treat it as invalid.
	return start <= end
}

// NewPerfCollector returns a new perf based collector, it creates a profiler
// per CPU.
func NewPerfCollector() (Collector, error) {
collector := &perfCollector{
perfHwProfilers: map[int]perf.HardwareProfiler{},
perfSwProfilers: map[int]perf.SoftwareProfiler{},
perfCacheProfilers: map[int]perf.CacheProfiler{},
perfHwProfilers: map[int]*perf.HardwareProfiler{},
perfSwProfilers: map[int]*perf.SoftwareProfiler{},
perfCacheProfilers: map[int]*perf.CacheProfiler{},
hwProfilerCpuMap: map[*perf.HardwareProfiler]int{},
swProfilerCpuMap: map[*perf.SoftwareProfiler]int{},
cacheProfilerCpuMap: map[*perf.CacheProfiler]int{},
}

start := 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use a var block here:

var (
  start = 0
  ncpus = 0
  ..
)

ncpus := 0
var err error

if perfCpusFlag == nil || *perfCpusFlag == "" {
start = 0
ncpus = runtime.NumCPU() - 1
} else if !isValidCPUString(perfCpusFlag) {
return nil, errors.New("--collector.perf.cpus flag value is invalid, it must be a range (e.g. 2-6)")
} else {
cpuRange := strings.Split(*perfCpusFlag, "-")
start, err = strconv.Atoi(cpuRange[0])
if err != nil {
start = 0
}

ncpus, err = strconv.Atoi(cpuRange[1])
if err != nil {
ncpus = runtime.NumCPU() - 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should fall back to the default when a user provides a non-integer value. This should fail instead.

}
}
ncpus := runtime.NumCPU()
for i := 0; i < ncpus; i++ {

for i, idx := start, 0; i <= ncpus; i, idx = i+1, idx+1 {
// Use -1 to profile all processes on the CPU, see:
// man perf_event_open
collector.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i)
if err := collector.perfHwProfilers[i].Start(); err != nil {
p := perf.NewHardwareProfiler(-1, i)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's rename the p's here:

  • ph = hardware profiler
  • ps = software profiler
  • pc = cache profiler

collector.perfHwProfilers[idx] = &p
if err := p.Start(); err != nil {
return collector, err
} else {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for the else here, if err != nil it returns anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(same below)

collector.hwProfilerCpuMap[&p] = i
}
collector.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i)
if err := collector.perfSwProfilers[i].Start(); err != nil {

p2 := perf.NewSoftwareProfiler(-1, i)
collector.perfSwProfilers[i] = &p2
if err := p2.Start(); err != nil {
return collector, err
} else {
collector.swProfilerCpuMap[&p2] = i
}
collector.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i)
if err := collector.perfCacheProfilers[i].Start(); err != nil {

p3 := perf.NewCacheProfiler(-1, i)
collector.perfCacheProfilers[i] = &p3
if err := p3.Start(); err != nil {
return collector, err
} else {
collector.cacheProfilerCpuMap[&p3] = i
}
}

collector.desc = map[string]*prometheus.Desc{
"cpucycles_total": prometheus.NewDesc(
prometheus.BuildFQName(
Expand Down Expand Up @@ -330,9 +385,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfHwProfilers {
cpuStr := fmt.Sprintf("%d", cpu)
hwProfile, err := profiler.Profile()
for _, profiler := range c.perfHwProfilers {
cpuid := c.hwProfilerCpuMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
hwProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down Expand Up @@ -401,9 +457,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfSwProfilers {
cpuStr := fmt.Sprintf("%d", cpu)
swProfile, err := profiler.Profile()
for _, profiler := range c.perfSwProfilers {
cpuid := c.swProfilerCpuMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
swProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down Expand Up @@ -456,9 +513,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfCacheProfilers {
cpuStr := fmt.Sprintf("%d", cpu)
cacheProfile, err := profiler.Profile()
for _, profiler := range c.perfCacheProfilers {
cpuid := c.cacheProfilerCpuMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
cacheProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down