Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
steps:
- checkout
- run: sudo pip install codespell
- run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem" -L uint,packages\',uptodate
- run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem,./collector/fixtures" -L uint,packages\',uptodate

build:
machine:
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* [CHANGE]
* [FEATURE]
* [ENHANCEMENT]
* [BUGFIX]
* [BUGFIX] Linux CPU: Cache CPU metrics to make them monotonically increasing #1711

## 1.0.0-rc.1 / 2020-05-14

Expand Down
85 changes: 84 additions & 1 deletion collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"path/filepath"
"strconv"
"sync"

"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
Expand All @@ -35,6 +36,8 @@ type cpuCollector struct {
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStatsMutex sync.Mutex
}

var (
Expand Down Expand Up @@ -203,7 +206,12 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
return err
}

for cpuID, cpuStat := range stats.CPU {
c.updateCPUStats(stats.CPU)

// Acquire a lock to read the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
for cpuID, cpuStat := range c.cpuStats {
cpuNum := strconv.Itoa(cpuID)
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice")
Expand All @@ -221,3 +229,78 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {

return nil
}

// updateCPUStats folds newStats into the per-CPU cache held in c.cpuStats,
// holding c.cpuStatsMutex for the whole update.
//
// The cache is reallocated (and therefore zeroed) whenever the number of CPUs
// in newStats differs from the cached length. For each CPU, an Idle value
// lower than the cached one is treated as a possible hotplug event: the whole
// cached entry is cleared before the new values are applied. Every other
// field is overwritten only when the new value is not lower than the cached
// one; otherwise the cached value is kept and a warning is logged.
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
	c.cpuStatsMutex.Lock()
	defer c.cpuStatsMutex.Unlock()

	// A different CPU count invalidates the whole cache.
	if len(newStats) != len(c.cpuStats) {
		c.cpuStats = make([]procfs.CPUStat, len(newStats))
	}

	for cpu := range newStats {
		fresh := newStats[cpu]
		cached := &c.cpuStats[cpu]

		// advance stores next in *field when it is at least the cached value;
		// otherwise it leaves the cache untouched and logs msg as a warning.
		advance := func(msg string, field *float64, next float64) {
			if next >= *field {
				*field = next
				return
			}
			level.Warn(c.logger).Log("msg", msg, "cpu", cpu, "old_value", *field, "new_value", next)
		}

		// Idle going backwards is taken as the signal that this CPU was
		// re-plugged, so everything cached for it is dropped first.
		if fresh.Idle < cached.Idle {
			level.Warn(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", cpu, "old_value", cached.Idle, "new_value", fresh.Idle)
			*cached = procfs.CPUStat{}
		}
		cached.Idle = fresh.Idle

		advance("CPU User counter jumped backwards", &cached.User, fresh.User)
		advance("CPU Nice counter jumped backwards", &cached.Nice, fresh.Nice)
		advance("CPU System counter jumped backwards", &cached.System, fresh.System)
		advance("CPU Iowait counter jumped backwards", &cached.Iowait, fresh.Iowait)
		advance("CPU IRQ counter jumped backwards", &cached.IRQ, fresh.IRQ)
		advance("CPU SoftIRQ counter jumped backwards", &cached.SoftIRQ, fresh.SoftIRQ)
		advance("CPU Steal counter jumped backwards", &cached.Steal, fresh.Steal)
		advance("CPU Guest counter jumped backwards", &cached.Guest, fresh.Guest)
		advance("CPU GuestNice counter jumped backwards", &cached.GuestNice, fresh.GuestNice)
	}
}