From 65b2bf6105132029a4a90ba2f91777c095f2ab4d Mon Sep 17 00:00:00 2001 From: Nikolay Sivko Date: Fri, 30 May 2025 19:24:07 +0300 Subject: [PATCH] add GPU metrics --- Dockerfile | 2 +- containers/app.go | 2 + containers/container.go | 25 ++++ containers/metrics.go | 6 + containers/process.go | 50 +++++++ containers/registry.go | 13 +- go.mod | 1 + go.sum | 2 + gpu/gpu.go | 290 ++++++++++++++++++++++++++++++++++++++++ main.go | 11 +- 10 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 gpu/gpu.go diff --git a/Dockerfile b/Dockerfile index 2918c89..580f747 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ COPY go.sum . RUN go mod download COPY . . ARG VERSION=unknown -RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . +RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-extldflags='-Wl,-z,lazy' -X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . 
 
 FROM registry.access.redhat.com/ubi9/ubi
diff --git a/containers/app.go b/containers/app.go
index c6fa309..9e252f1 100644
--- a/containers/app.go
+++ b/containers/app.go
@@ -120,6 +120,8 @@ func guessApplicationTypeByCmdline(cmdline []byte) string {
 		return "nats"
 	case bytes.HasSuffix(cmd, []byte("java")):
 		return "java"
+	case bytes.HasSuffix(cmd, []byte("ollama")):
+		return "ollama"
 	case bytes.Contains(cmd, []byte("victoria-metrics")) ||
 		bytes.Contains(cmd, []byte("vmstorage")) ||
 		bytes.Contains(cmd, []byte("vminsert")) ||
diff --git a/containers/container.go b/containers/container.go
index c9b8e41..a35eb4c 100644
--- a/containers/container.go
+++ b/containers/container.go
@@ -29,6 +29,7 @@ var (
 	gcInterval                = 10 * time.Minute
 	pingTimeout               = 300 * time.Millisecond
 	multilineCollectorTimeout = time.Second
+	gpuStatsWindow            = 15 * time.Second
 )
 
 type ContainerID string
@@ -132,6 +133,8 @@ type Container struct {
 	l7Stats  L7Stats
 	dnsStats *L7Metrics
 
+	gpuStats map[string]*GpuUsage
+
 	oomKills int
 
 	pythonThreadLockWaitTime time.Duration
@@ -181,6 +184,8 @@ func NewContainer(id ContainerID, cg *cgroup.Cgroup, md *ContainerMetadata, pid
 		l7Stats:  L7Stats{},
 		dnsStats: &L7Metrics{},
 
+		gpuStats: map[string]*GpuUsage{},
+
 		mounts: map[string]proc.MountInfo{},
 
 		seenMounts: map[uint64]struct{}{},
@@ -370,7 +375,26 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 				process.dotNetMonitor.Collect(ch)
 			}
 		}
+
+		if usage := process.getGPUUsage(); usage != nil {
+			for uuid, u := range usage {
+				tu := c.gpuStats[uuid]
+				if tu == nil {
+					tu = &GpuUsage{}
+					c.gpuStats[uuid] = tu
+				}
+				tu.GPU += u.GPU
+				tu.Memory += u.Memory
+			}
+		}
 	}
+	// Reset only after emitting, so usage accumulated from all processes is reported.
+	for uuid, usage := range c.gpuStats {
+		ch <- gauge(metrics.GpuUsagePercent, usage.GPU, uuid)
+		ch <- gauge(metrics.GpuMemoryUsagePercent, usage.Memory, uuid)
+		usage.Reset()
+	}
+
 	for appType := range appTypes {
 		ch <- gauge(metrics.ApplicationType, 1, appType)
 	}
diff --git a/containers/metrics.go
b/containers/metrics.go index dcc17cd..2d0df76 100644 --- a/containers/metrics.go +++ b/containers/metrics.go @@ -51,6 +51,9 @@ var metrics = struct { PythonThreadLockWaitTime *prometheus.Desc + GpuUsagePercent *prometheus.Desc + GpuMemoryUsagePercent *prometheus.Desc + Ip2Fqdn *prometheus.Desc }{ ContainerInfo: metric("container_info", "Meta information about the container", "image", "systemd_triggered_by"), @@ -100,6 +103,9 @@ var metrics = struct { Ip2Fqdn: metric("ip_to_fqdn", "Mapping IP addresses to FQDNs based on DNS requests initiated by containers", "ip", "fqdn"), PythonThreadLockWaitTime: metric("container_python_thread_lock_wait_time_seconds", "Time spent waiting acquiring GIL in seconds"), + + GpuUsagePercent: metric("container_resources_gpu_usage_percent", "Percent of GPU compute resources used by the container", "gpu_uuid"), + GpuMemoryUsagePercent: metric("container_resources_gpu_memory_usage_percent", "Percent of GPU memory used by the container", "gpu_uuid"), } var ( diff --git a/containers/process.go b/containers/process.go index ddf5ac2..9ee7242 100644 --- a/containers/process.go +++ b/containers/process.go @@ -8,11 +8,22 @@ import ( "github.com/cilium/ebpf/link" "github.com/coroot/coroot-node-agent/ebpftracer" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/jpillora/backoff" "github.com/mdlayher/taskstats" ) +type GpuUsage struct { + GPU float64 + Memory float64 +} + +func (gu *GpuUsage) Reset() { + gu.Memory = 0 + gu.GPU = 0 +} + type Process struct { Pid uint32 StartedAt time.Time @@ -29,6 +40,8 @@ type Process struct { goTlsUprobesChecked bool openSslUprobesChecked bool pythonGilChecked bool + + gpuUsageSamples []gpu.ProcessUsageSample } func NewProcess(pid uint32, stats *taskstats.Stats, tracer *ebpftracer.Tracer) *Process { @@ -97,6 +110,43 @@ func (p *Process) instrumentPython(cmdline []byte, tracer *ebpftracer.Tracer) { p.uprobes = append(p.uprobes, 
tracer.AttachPythonThreadLockProbes(p.Pid)...) } +func (p *Process) addGpuUsageSample(sample gpu.ProcessUsageSample) { + p.removeOldGpuUsageSamples(sample.Timestamp.Add(-gpuStatsWindow)) + p.gpuUsageSamples = append(p.gpuUsageSamples, sample) +} + +func (p *Process) getGPUUsage() map[string]*GpuUsage { + p.removeOldGpuUsageSamples(time.Now().Add(-gpuStatsWindow)) + if len(p.gpuUsageSamples) == 0 { + return nil + } + gpuStatsWindowSeconds := gpuStatsWindow.Seconds() + res := make(map[string]*GpuUsage) + for _, sample := range p.gpuUsageSamples { + u := res[sample.UUID] + if u == nil { + u = &GpuUsage{} + res[sample.UUID] = u + } + u.GPU += float64(sample.GPUPercent) / gpuStatsWindowSeconds + u.Memory += float64(sample.MemoryPercent) / gpuStatsWindowSeconds + } + return res +} + +func (p *Process) removeOldGpuUsageSamples(cutoff time.Time) { + i := 0 + for ; i < len(p.gpuUsageSamples); i++ { + if p.gpuUsageSamples[i].Timestamp.After(cutoff) { + break + } + } + if i > 0 { + copy(p.gpuUsageSamples, p.gpuUsageSamples[i:]) + p.gpuUsageSamples = p.gpuUsageSamples[:len(p.gpuUsageSamples)-i] + } +} + func (p *Process) Close() { p.cancelFunc() for _, u := range p.uprobes { diff --git a/containers/registry.go b/containers/registry.go index aee9f76..2eba30b 100644 --- a/containers/registry.go +++ b/containers/registry.go @@ -14,6 +14,7 @@ import ( "github.com/coroot/coroot-node-agent/common" "github.com/coroot/coroot-node-agent/ebpftracer" "github.com/coroot/coroot-node-agent/flags" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/prometheus/client_golang/prometheus" "github.com/vishvananda/netns" @@ -59,9 +60,11 @@ type Registry struct { trafficStatsLastUpdated time.Time trafficStatsLock sync.Mutex trafficStatsUpdateCh chan *TrafficStatsUpdate + + gpuProcessUsageSampleChan chan gpu.ProcessUsageSample } -func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (*Registry, error) { +func NewRegistry(reg 
prometheus.Registerer, processInfoCh chan<- ProcessInfo, gpuProcessUsageSampleChan chan gpu.ProcessUsageSample) (*Registry, error) { ns, err := proc.GetSelfNetNs() if err != nil { return nil, err @@ -113,6 +116,8 @@ func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (* tracer: ebpftracer.NewTracer(hostNetNs, selfNetNs, *flags.DisableL7Tracing), trafficStatsUpdateCh: make(chan *TrafficStatsUpdate), + + gpuProcessUsageSampleChan: gpuProcessUsageSampleChan, } if err = reg.Register(r); err != nil { return nil, err @@ -205,6 +210,12 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) { if c := r.containersByPid[u.Pid]; c != nil { c.updateTrafficStats(u) } + case sample := <-r.gpuProcessUsageSampleChan: + if c := r.containersByPid[sample.Pid]; c != nil { + if p := c.processes[sample.Pid]; p != nil { + p.addGpuUsageSample(sample) + } + } case e, more := <-ch: if !more { return diff --git a/go.mod b/go.mod index 0fe4c53..c8a1c23 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.23.8 require ( cloud.google.com/go/compute/metadata v0.5.2 github.com/ClickHouse/ch-go v0.62.0 + github.com/NVIDIA/go-nvml v0.12.4-1 github.com/agoda-com/opentelemetry-logs-go v0.4.1 github.com/cilium/cilium v1.17.2 github.com/cilium/ebpf v0.17.3 diff --git a/go.sum b/go.sum index 079f40b..5e6d21c 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,8 @@ github.com/Microsoft/hcsshim v0.9.12 h1:0Wgl1fRF4WmBuqP6EnHk2w3m7CCCumD/KUumZxp7 github.com/Microsoft/hcsshim v0.9.12/go.mod h1:qAiPvMgZoM0wpkVg6qMdSEu+1VtI6/qHOOPkTGt8ftQ= github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5hlzMzRKMLyo42nCZ9oml8AdTlq/0cvIaBv6tK1RehU= github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY= +github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= +github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= 
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= diff --git a/gpu/gpu.go b/gpu/gpu.go new file mode 100644 index 0000000..f91d690 --- /dev/null +++ b/gpu/gpu.go @@ -0,0 +1,290 @@ +package gpu + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "os" + "runtime" + "strings" + "sync" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/coroot/coroot-node-agent/proc" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" +) + +var ( + gpuInfo = prometheus.NewDesc( + "node_gpu_info", + "Meta information about the GPU", + []string{"gpu_uuid", "name"}, nil, + ) + gpuMemoryTotal = prometheus.NewDesc( + "node_resources_gpu_memory_total_bytes", + "Total memory available on the GPU in bytes", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsed = prometheus.NewDesc( + "node_resources_gpu_memory_used_bytes", + "GPU memory currently in use in bytes", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsageAvg = prometheus.NewDesc( + "node_resources_gpu_memory_utilization_percent_avg", + "Average GPU memory utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuTemperature = prometheus.NewDesc( + "node_resources_gpu_temperature_celsius", + "Current temperature of the GPU in Celsius", + []string{"gpu_uuid"}, nil, + ) + gpuPowerWatts = prometheus.NewDesc( + "node_resources_gpu_power_usage_watts", + "Current power usage of the GPU in watts", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsagePeak = prometheus.NewDesc( + "node_resources_gpu_memory_utilization_percent_peak", + "Peak GPU memory utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuUsageAvg = prometheus.NewDesc( + "node_resources_gpu_utilization_percent_avg", + "Average 
GPU core utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuUsagePeak = prometheus.NewDesc( + "node_resources_gpu_utilization_percent_peak", + "Peak GPU core utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) +) + +type Collector struct { + ProcessUsageSampleCh chan ProcessUsageSample + iface nvml.Interface + devices []*Device + lock sync.Mutex +} + +type Device struct { + UUID string + Name string + device nvml.Device + lastSampleTime map[nvml.SamplingType]uint64 +} + +type ProcessUsageSample struct { + UUID string + Pid uint32 + Timestamp time.Time + GPUPercent uint32 + MemoryPercent uint32 +} + +func NewCollector() (*Collector, error) { + c := &Collector{ + ProcessUsageSampleCh: make(chan ProcessUsageSample, 100), + } + + libPath, err := findNvidiaMLLib() + if err != nil { + klog.Infoln(err) + return c, nil + } + klog.Infof("found NVML lib at %s", libPath) + + c.iface = nvml.New(nvml.WithLibraryPath(libPath)) + if ret := c.iface.Init(); ret != nvml.SUCCESS { + return c, fmt.Errorf("unable to initialize NVML: %s", nvml.ErrorString(ret)) + } + count, ret := c.iface.DeviceGetCount() + if ret != nvml.SUCCESS { + return c, fmt.Errorf("unable to get device count: %s", nvml.ErrorString(ret)) + } + var names []string + for i := 0; i < count; i++ { + device, ret := c.iface.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + dev := Device{ + lastSampleTime: map[nvml.SamplingType]uint64{}, + device: device, + } + if dev.UUID, ret = device.GetUUID(); ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + if dev.Name, ret = device.GetName(); ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + names = append(names, dev.Name) + c.devices = append(c.devices, &dev) + } + if len(names) > 0 { + klog.Infof("found %d GPU: %s", len(names), strings.Join(names, ", ")) + } + go c.processUtilizationPoller() + return 
c, nil +} + +func (c *Collector) processUtilizationPoller() { + ticker := time.NewTicker(1 * time.Second) + lastTs := uint64(time.Now().UnixMicro()) + for range ticker.C { + for _, dev := range c.devices { + samples, _ := dev.device.GetProcessUtilization(lastTs) + for _, sample := range samples { + if sample.TimeStamp <= lastTs { + continue + } + if sample.SmUtil > 0 { + c.ProcessUsageSampleCh <- ProcessUsageSample{ + UUID: dev.UUID, + Pid: sample.Pid, + GPUPercent: sample.SmUtil, + MemoryPercent: sample.MemUtil, + Timestamp: time.UnixMicro(int64(sample.TimeStamp)), + } + } + lastTs = sample.TimeStamp + } + } + } +} + +func (c *Collector) Describe(ch chan<- *prometheus.Desc) { + ch <- gpuInfo + ch <- gpuMemoryTotal + ch <- gpuMemoryUsed + ch <- gpuMemoryUsageAvg + ch <- gpuMemoryUsagePeak + ch <- gpuUsageAvg + ch <- gpuUsagePeak + ch <- gpuTemperature + ch <- gpuPowerWatts +} + +func (c *Collector) Collect(ch chan<- prometheus.Metric) { + c.lock.Lock() + defer c.lock.Unlock() + for _, dev := range c.devices { + ch <- gauge(gpuInfo, 1, dev.UUID, dev.Name) + + mi, ret := dev.device.GetMemoryInfo() + if ret == nvml.SUCCESS { + ch <- gauge(gpuMemoryTotal, float64(mi.Total), dev.UUID) + ch <- gauge(gpuMemoryUsed, float64(mi.Used), dev.UUID) + } + if t, ret := dev.device.GetTemperature(nvml.TEMPERATURE_GPU); ret == nvml.SUCCESS { + ch <- gauge(gpuTemperature, float64(t), dev.UUID) + } + if mw, ret := dev.device.GetPowerUsage(); ret == nvml.SUCCESS { + ch <- gauge(gpuPowerWatts, float64(mw)/1000., dev.UUID) + } + for _, st := range []nvml.SamplingType{nvml.GPU_UTILIZATION_SAMPLES, nvml.MEMORY_UTILIZATION_SAMPLES} { + lastTs := dev.lastSampleTime[st] + valtype, samples, ret := dev.device.GetSamples(st, lastTs) + if ret != nvml.SUCCESS { + continue + } + total := float64(0) + count := float64(0) + peak := float64(0) + for _, sample := range samples { + if sample.TimeStamp <= lastTs { + continue + } + value, err := valueToFloat(valtype, sample.SampleValue) + if err != nil { 
+ continue + } + total += value + if value > peak { + peak = value + } + count++ + lastTs = sample.TimeStamp + } + if count > 0 { + switch st { + case nvml.GPU_UTILIZATION_SAMPLES: + ch <- gauge(gpuUsageAvg, total/count, dev.UUID) + ch <- gauge(gpuUsagePeak, peak, dev.UUID) + case nvml.MEMORY_UTILIZATION_SAMPLES: + ch <- gauge(gpuMemoryUsageAvg, total/count, dev.UUID) + ch <- gauge(gpuMemoryUsagePeak, peak, dev.UUID) + } + } + dev.lastSampleTime[st] = lastTs + } + } +} + +func (c *Collector) Close() { + c.iface.Shutdown() +} + +func findNvidiaMLLib() (string, error) { + paths := []string{ + // gpu-operator + "/run/nvidia/driver/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", + "/run/nvidia/driver/usr/lib64/libnvidia-ml.so.1", + "/home/kubernetes/bin/nvidia/lib64/libnvidia-ml.so.1", //GKE + + "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", + "/usr/lib64/libnvidia-ml.so.1", + "/usr/local/cuda/lib64/libnvidia-ml.so.1", + "/usr/lib/libnvidia-ml.so.1", + } + if runtime.GOARCH == "arm64" { + paths = append(paths, + "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1", + "/run/nvidia/driver/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1", + "/home/kubernetes/bin/nvidia/lib64-aarch64/libnvidia-ml.so.1", //GKE + ) + } + for _, p := range paths { + if _, err := os.Stat(proc.HostPath(p)); err == nil { + return proc.HostPath(p), nil + } + } + return "", fmt.Errorf("libnvidia-ml.so.1 not found in known paths") +} + +func valueToFloat(valueType nvml.ValueType, value [8]byte) (float64, error) { + r := bytes.NewReader(value[:]) + switch valueType { + case nvml.VALUE_TYPE_DOUBLE: + var v float64 + err := binary.Read(r, binary.LittleEndian, &v) + return v, err + case nvml.VALUE_TYPE_UNSIGNED_INT: + var v uint32 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_UNSIGNED_LONG, nvml.VALUE_TYPE_UNSIGNED_LONG_LONG: + var v uint64 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_SIGNED_LONG_LONG: + var v 
int64 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_SIGNED_INT: + var v int32 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + default: + return 0, fmt.Errorf("unsupported value type %d", valueType) + } +} + +func gauge(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric { + return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...) +} diff --git a/main.go b/main.go index 6f47e0d..173dd88 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,7 @@ import ( "github.com/coroot/coroot-node-agent/common" "github.com/coroot/coroot-node-agent/containers" "github.com/coroot/coroot-node-agent/flags" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/logs" "github.com/coroot/coroot-node-agent/node" "github.com/coroot/coroot-node-agent/proc" @@ -141,6 +142,14 @@ func main() { if err := registerer.Register(nodeCollector); err != nil { klog.Exitln(err) } + + gpuCollector, err := gpu.NewCollector() + if err != nil { + klog.Warningln("failed to initialize GPU collector:", err) + } + if err := registerer.Register(gpuCollector); err != nil { + klog.Exitln(err) + } registerer.MustRegister(info("node_agent_info", version)) if md := nodeCollector.Metadata(); md != nil { @@ -151,7 +160,7 @@ func main() { } } processInfoCh := profiling.Init(machineId, hostname) - cr, err := containers.NewRegistry(registerer, processInfoCh) + cr, err := containers.NewRegistry(registerer, processInfoCh, gpuCollector.ProcessUsageSampleCh) if err != nil { klog.Exitln(err) }