From 65b2bf6105132029a4a90ba2f91777c095f2ab4d Mon Sep 17 00:00:00 2001 From: Nikolay Sivko Date: Fri, 30 May 2025 19:24:07 +0300 Subject: [PATCH] add GPU metrics --- Dockerfile | 2 +- containers/app.go | 2 + containers/container.go | 25 ++++ containers/metrics.go | 6 + containers/process.go | 50 +++++++ containers/registry.go | 13 +- go.mod | 1 + go.sum | 2 + gpu/gpu.go | 290 ++++++++++++++++++++++++++++++++++++++++ main.go | 11 +- 10 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 gpu/gpu.go diff --git a/Dockerfile b/Dockerfile index 2918c89..580f747 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ COPY go.sum . RUN go mod download COPY . . ARG VERSION=unknown -RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . +RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-extldflags='-Wl,-z,lazy' -X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . 
 
 FROM registry.access.redhat.com/ubi9/ubi
diff --git a/containers/app.go b/containers/app.go
index c6fa309..9e252f1 100644
--- a/containers/app.go
+++ b/containers/app.go
@@ -120,6 +120,8 @@ func guessApplicationTypeByCmdline(cmdline []byte) string {
 		return "nats"
 	case bytes.HasSuffix(cmd, []byte("java")):
 		return "java"
+	case bytes.HasSuffix(cmd, []byte("ollama")):
+		return "ollama"
 	case bytes.Contains(cmd, []byte("victoria-metrics")) ||
 		bytes.Contains(cmd, []byte("vmstorage")) ||
 		bytes.Contains(cmd, []byte("vminsert")) ||
diff --git a/containers/container.go b/containers/container.go
index c9b8e41..a35eb4c 100644
--- a/containers/container.go
+++ b/containers/container.go
@@ -29,6 +29,7 @@ var (
 	gcInterval                = 10 * time.Minute
 	pingTimeout               = 300 * time.Millisecond
 	multilineCollectorTimeout = time.Second
+	gpuStatsWindow            = 15 * time.Second
 )
 
 type ContainerID string
@@ -132,6 +133,8 @@ type Container struct {
 	l7Stats  L7Stats
 	dnsStats *L7Metrics
 
+	gpuStats map[string]*GpuUsage
+
 	oomKills int
 
 	pythonThreadLockWaitTime time.Duration
@@ -181,6 +184,8 @@ func NewContainer(id ContainerID, cg *cgroup.Cgroup, md *ContainerMetadata, pid
 		l7Stats:  L7Stats{},
 		dnsStats: &L7Metrics{},
 
+		gpuStats: map[string]*GpuUsage{},
+
 		mounts: map[string]proc.MountInfo{},
 
 		seenMounts: map[uint64]struct{}{},
@@ -370,7 +375,26 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
 				process.dotNetMonitor.Collect(ch)
 			}
 		}
+
+		if usage := process.getGPUUsage(); usage != nil {
+			for uuid, u := range usage {
+				tu := c.gpuStats[uuid]
+				if tu == nil {
+					tu = &GpuUsage{}
+					c.gpuStats[uuid] = tu
+				}
+				tu.GPU += u.GPU
+				tu.Memory += u.Memory
+			}
+		}
 	}
+	// Reset only after emitting, so usage accumulated from all processes is reported.
+	for uuid, usage := range c.gpuStats {
+		ch <- gauge(metrics.GpuUsagePercent, usage.GPU, uuid)
+		ch <- gauge(metrics.GpuMemoryUsagePercent, usage.Memory, uuid)
+		usage.Reset()
+	}
+
 	for appType := range appTypes {
 		ch <- gauge(metrics.ApplicationType, 1, appType)
 	}
diff --git a/containers/metrics.go
b/containers/metrics.go index dcc17cd..2d0df76 100644 --- a/containers/metrics.go +++ b/containers/metrics.go @@ -51,6 +51,9 @@ var metrics = struct { PythonThreadLockWaitTime *prometheus.Desc + GpuUsagePercent *prometheus.Desc + GpuMemoryUsagePercent *prometheus.Desc + Ip2Fqdn *prometheus.Desc }{ ContainerInfo: metric("container_info", "Meta information about the container", "image", "systemd_triggered_by"), @@ -100,6 +103,9 @@ var metrics = struct { Ip2Fqdn: metric("ip_to_fqdn", "Mapping IP addresses to FQDNs based on DNS requests initiated by containers", "ip", "fqdn"), PythonThreadLockWaitTime: metric("container_python_thread_lock_wait_time_seconds", "Time spent waiting acquiring GIL in seconds"), + + GpuUsagePercent: metric("container_resources_gpu_usage_percent", "Percent of GPU compute resources used by the container", "gpu_uuid"), + GpuMemoryUsagePercent: metric("container_resources_gpu_memory_usage_percent", "Percent of GPU memory used by the container", "gpu_uuid"), } var ( diff --git a/containers/process.go b/containers/process.go index ddf5ac2..9ee7242 100644 --- a/containers/process.go +++ b/containers/process.go @@ -8,11 +8,22 @@ import ( "github.com/cilium/ebpf/link" "github.com/coroot/coroot-node-agent/ebpftracer" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/jpillora/backoff" "github.com/mdlayher/taskstats" ) +type GpuUsage struct { + GPU float64 + Memory float64 +} + +func (gu *GpuUsage) Reset() { + gu.Memory = 0 + gu.GPU = 0 +} + type Process struct { Pid uint32 StartedAt time.Time @@ -29,6 +40,8 @@ type Process struct { goTlsUprobesChecked bool openSslUprobesChecked bool pythonGilChecked bool + + gpuUsageSamples []gpu.ProcessUsageSample } func NewProcess(pid uint32, stats *taskstats.Stats, tracer *ebpftracer.Tracer) *Process { @@ -97,6 +110,43 @@ func (p *Process) instrumentPython(cmdline []byte, tracer *ebpftracer.Tracer) { p.uprobes = append(p.uprobes, 
tracer.AttachPythonThreadLockProbes(p.Pid)...) } +func (p *Process) addGpuUsageSample(sample gpu.ProcessUsageSample) { + p.removeOldGpuUsageSamples(sample.Timestamp.Add(-gpuStatsWindow)) + p.gpuUsageSamples = append(p.gpuUsageSamples, sample) +} + +func (p *Process) getGPUUsage() map[string]*GpuUsage { + p.removeOldGpuUsageSamples(time.Now().Add(-gpuStatsWindow)) + if len(p.gpuUsageSamples) == 0 { + return nil + } + gpuStatsWindowSeconds := gpuStatsWindow.Seconds() + res := make(map[string]*GpuUsage) + for _, sample := range p.gpuUsageSamples { + u := res[sample.UUID] + if u == nil { + u = &GpuUsage{} + res[sample.UUID] = u + } + u.GPU += float64(sample.GPUPercent) / gpuStatsWindowSeconds + u.Memory += float64(sample.MemoryPercent) / gpuStatsWindowSeconds + } + return res +} + +func (p *Process) removeOldGpuUsageSamples(cutoff time.Time) { + i := 0 + for ; i < len(p.gpuUsageSamples); i++ { + if p.gpuUsageSamples[i].Timestamp.After(cutoff) { + break + } + } + if i > 0 { + copy(p.gpuUsageSamples, p.gpuUsageSamples[i:]) + p.gpuUsageSamples = p.gpuUsageSamples[:len(p.gpuUsageSamples)-i] + } +} + func (p *Process) Close() { p.cancelFunc() for _, u := range p.uprobes { diff --git a/containers/registry.go b/containers/registry.go index aee9f76..2eba30b 100644 --- a/containers/registry.go +++ b/containers/registry.go @@ -14,6 +14,7 @@ import ( "github.com/coroot/coroot-node-agent/common" "github.com/coroot/coroot-node-agent/ebpftracer" "github.com/coroot/coroot-node-agent/flags" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/prometheus/client_golang/prometheus" "github.com/vishvananda/netns" @@ -59,9 +60,11 @@ type Registry struct { trafficStatsLastUpdated time.Time trafficStatsLock sync.Mutex trafficStatsUpdateCh chan *TrafficStatsUpdate + + gpuProcessUsageSampleChan chan gpu.ProcessUsageSample } -func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (*Registry, error) { +func NewRegistry(reg 
prometheus.Registerer, processInfoCh chan<- ProcessInfo, gpuProcessUsageSampleChan chan gpu.ProcessUsageSample) (*Registry, error) { ns, err := proc.GetSelfNetNs() if err != nil { return nil, err @@ -113,6 +116,8 @@ func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (* tracer: ebpftracer.NewTracer(hostNetNs, selfNetNs, *flags.DisableL7Tracing), trafficStatsUpdateCh: make(chan *TrafficStatsUpdate), + + gpuProcessUsageSampleChan: gpuProcessUsageSampleChan, } if err = reg.Register(r); err != nil { return nil, err @@ -205,6 +210,12 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) { if c := r.containersByPid[u.Pid]; c != nil { c.updateTrafficStats(u) } + case sample := <-r.gpuProcessUsageSampleChan: + if c := r.containersByPid[sample.Pid]; c != nil { + if p := c.processes[sample.Pid]; p != nil { + p.addGpuUsageSample(sample) + } + } case e, more := <-ch: if !more { return diff --git a/go.mod b/go.mod index 0fe4c53..c8a1c23 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.23.8 require ( cloud.google.com/go/compute/metadata v0.5.2 github.com/ClickHouse/ch-go v0.62.0 + github.com/NVIDIA/go-nvml v0.12.4-1 github.com/agoda-com/opentelemetry-logs-go v0.4.1 github.com/cilium/cilium v1.17.2 github.com/cilium/ebpf v0.17.3 diff --git a/go.sum b/go.sum index 079f40b..5e6d21c 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,8 @@ github.com/Microsoft/hcsshim v0.9.12 h1:0Wgl1fRF4WmBuqP6EnHk2w3m7CCCumD/KUumZxp7 github.com/Microsoft/hcsshim v0.9.12/go.mod h1:qAiPvMgZoM0wpkVg6qMdSEu+1VtI6/qHOOPkTGt8ftQ= github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5hlzMzRKMLyo42nCZ9oml8AdTlq/0cvIaBv6tK1RehU= github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY= +github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= +github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= 
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= diff --git a/gpu/gpu.go b/gpu/gpu.go new file mode 100644 index 0000000..f91d690 --- /dev/null +++ b/gpu/gpu.go @@ -0,0 +1,290 @@ +package gpu + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "os" + "runtime" + "strings" + "sync" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/coroot/coroot-node-agent/proc" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" +) + +var ( + gpuInfo = prometheus.NewDesc( + "node_gpu_info", + "Meta information about the GPU", + []string{"gpu_uuid", "name"}, nil, + ) + gpuMemoryTotal = prometheus.NewDesc( + "node_resources_gpu_memory_total_bytes", + "Total memory available on the GPU in bytes", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsed = prometheus.NewDesc( + "node_resources_gpu_memory_used_bytes", + "GPU memory currently in use in bytes", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsageAvg = prometheus.NewDesc( + "node_resources_gpu_memory_utilization_percent_avg", + "Average GPU memory utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuTemperature = prometheus.NewDesc( + "node_resources_gpu_temperature_celsius", + "Current temperature of the GPU in Celsius", + []string{"gpu_uuid"}, nil, + ) + gpuPowerWatts = prometheus.NewDesc( + "node_resources_gpu_power_usage_watts", + "Current power usage of the GPU in watts", + []string{"gpu_uuid"}, nil, + ) + gpuMemoryUsagePeak = prometheus.NewDesc( + "node_resources_gpu_memory_utilization_percent_peak", + "Peak GPU memory utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuUsageAvg = prometheus.NewDesc( + "node_resources_gpu_utilization_percent_avg", + "Average 
GPU core utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) + gpuUsagePeak = prometheus.NewDesc( + "node_resources_gpu_utilization_percent_peak", + "Peak GPU core utilization (percentage) over the collection interval", + []string{"gpu_uuid"}, nil, + ) +) + +type Collector struct { + ProcessUsageSampleCh chan ProcessUsageSample + iface nvml.Interface + devices []*Device + lock sync.Mutex +} + +type Device struct { + UUID string + Name string + device nvml.Device + lastSampleTime map[nvml.SamplingType]uint64 +} + +type ProcessUsageSample struct { + UUID string + Pid uint32 + Timestamp time.Time + GPUPercent uint32 + MemoryPercent uint32 +} + +func NewCollector() (*Collector, error) { + c := &Collector{ + ProcessUsageSampleCh: make(chan ProcessUsageSample, 100), + } + + libPath, err := findNvidiaMLLib() + if err != nil { + klog.Infoln(err) + return c, nil + } + klog.Infof("found NVML lib at %s", libPath) + + c.iface = nvml.New(nvml.WithLibraryPath(libPath)) + if ret := c.iface.Init(); ret != nvml.SUCCESS { + return c, fmt.Errorf("unable to initialize NVML: %s", nvml.ErrorString(ret)) + } + count, ret := c.iface.DeviceGetCount() + if ret != nvml.SUCCESS { + return c, fmt.Errorf("unable to get device count: %s", nvml.ErrorString(ret)) + } + var names []string + for i := 0; i < count; i++ { + device, ret := c.iface.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + dev := Device{ + lastSampleTime: map[nvml.SamplingType]uint64{}, + device: device, + } + if dev.UUID, ret = device.GetUUID(); ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + if dev.Name, ret = device.GetName(); ret != nvml.SUCCESS { + return c, errors.New(nvml.ErrorString(ret)) + } + names = append(names, dev.Name) + c.devices = append(c.devices, &dev) + } + if len(names) > 0 { + klog.Infof("found %d GPU: %s", len(names), strings.Join(names, ", ")) + } + go c.processUtilizationPoller() + return 
c, nil +} + +func (c *Collector) processUtilizationPoller() { + ticker := time.NewTicker(1 * time.Second) + lastTs := uint64(time.Now().UnixMicro()) + for range ticker.C { + for _, dev := range c.devices { + samples, _ := dev.device.GetProcessUtilization(lastTs) + for _, sample := range samples { + if sample.TimeStamp <= lastTs { + continue + } + if sample.SmUtil > 0 { + c.ProcessUsageSampleCh <- ProcessUsageSample{ + UUID: dev.UUID, + Pid: sample.Pid, + GPUPercent: sample.SmUtil, + MemoryPercent: sample.MemUtil, + Timestamp: time.UnixMicro(int64(sample.TimeStamp)), + } + } + lastTs = sample.TimeStamp + } + } + } +} + +func (c *Collector) Describe(ch chan<- *prometheus.Desc) { + ch <- gpuInfo + ch <- gpuMemoryTotal + ch <- gpuMemoryUsed + ch <- gpuMemoryUsageAvg + ch <- gpuMemoryUsagePeak + ch <- gpuUsageAvg + ch <- gpuUsagePeak + ch <- gpuTemperature + ch <- gpuPowerWatts +} + +func (c *Collector) Collect(ch chan<- prometheus.Metric) { + c.lock.Lock() + defer c.lock.Unlock() + for _, dev := range c.devices { + ch <- gauge(gpuInfo, 1, dev.UUID, dev.Name) + + mi, ret := dev.device.GetMemoryInfo() + if ret == nvml.SUCCESS { + ch <- gauge(gpuMemoryTotal, float64(mi.Total), dev.UUID) + ch <- gauge(gpuMemoryUsed, float64(mi.Used), dev.UUID) + } + if t, ret := dev.device.GetTemperature(nvml.TEMPERATURE_GPU); ret == nvml.SUCCESS { + ch <- gauge(gpuTemperature, float64(t), dev.UUID) + } + if mw, ret := dev.device.GetPowerUsage(); ret == nvml.SUCCESS { + ch <- gauge(gpuPowerWatts, float64(mw)/1000., dev.UUID) + } + for _, st := range []nvml.SamplingType{nvml.GPU_UTILIZATION_SAMPLES, nvml.MEMORY_UTILIZATION_SAMPLES} { + lastTs := dev.lastSampleTime[st] + valtype, samples, ret := dev.device.GetSamples(st, lastTs) + if ret != nvml.SUCCESS { + continue + } + total := float64(0) + count := float64(0) + peak := float64(0) + for _, sample := range samples { + if sample.TimeStamp <= lastTs { + continue + } + value, err := valueToFloat(valtype, sample.SampleValue) + if err != nil { 
+ continue + } + total += value + if value > peak { + peak = value + } + count++ + lastTs = sample.TimeStamp + } + if count > 0 { + switch st { + case nvml.GPU_UTILIZATION_SAMPLES: + ch <- gauge(gpuUsageAvg, total/count, dev.UUID) + ch <- gauge(gpuUsagePeak, peak, dev.UUID) + case nvml.MEMORY_UTILIZATION_SAMPLES: + ch <- gauge(gpuMemoryUsageAvg, total/count, dev.UUID) + ch <- gauge(gpuMemoryUsagePeak, peak, dev.UUID) + } + } + dev.lastSampleTime[st] = lastTs + } + } +} + +func (c *Collector) Close() { + c.iface.Shutdown() +} + +func findNvidiaMLLib() (string, error) { + paths := []string{ + // gpu-operator + "/run/nvidia/driver/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", + "/run/nvidia/driver/usr/lib64/libnvidia-ml.so.1", + "/home/kubernetes/bin/nvidia/lib64/libnvidia-ml.so.1", //GKE + + "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", + "/usr/lib64/libnvidia-ml.so.1", + "/usr/local/cuda/lib64/libnvidia-ml.so.1", + "/usr/lib/libnvidia-ml.so.1", + } + if runtime.GOARCH == "arm64" { + paths = append(paths, + "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1", + "/run/nvidia/driver/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1", + "/home/kubernetes/bin/nvidia/lib64-aarch64/libnvidia-ml.so.1", //GKE + ) + } + for _, p := range paths { + if _, err := os.Stat(proc.HostPath(p)); err == nil { + return proc.HostPath(p), nil + } + } + return "", fmt.Errorf("libnvidia-ml.so.1 not found in known paths") +} + +func valueToFloat(valueType nvml.ValueType, value [8]byte) (float64, error) { + r := bytes.NewReader(value[:]) + switch valueType { + case nvml.VALUE_TYPE_DOUBLE: + var v float64 + err := binary.Read(r, binary.LittleEndian, &v) + return v, err + case nvml.VALUE_TYPE_UNSIGNED_INT: + var v uint32 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_UNSIGNED_LONG, nvml.VALUE_TYPE_UNSIGNED_LONG_LONG: + var v uint64 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_SIGNED_LONG_LONG: + var v 
int64 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + case nvml.VALUE_TYPE_SIGNED_INT: + var v int32 + err := binary.Read(r, binary.LittleEndian, &v) + return float64(v), err + default: + return 0, fmt.Errorf("unsupported value type %d", valueType) + } +} + +func gauge(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric { + return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...) +} diff --git a/main.go b/main.go index 6f47e0d..173dd88 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,7 @@ import ( "github.com/coroot/coroot-node-agent/common" "github.com/coroot/coroot-node-agent/containers" "github.com/coroot/coroot-node-agent/flags" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/logs" "github.com/coroot/coroot-node-agent/node" "github.com/coroot/coroot-node-agent/proc" @@ -141,6 +142,14 @@ func main() { if err := registerer.Register(nodeCollector); err != nil { klog.Exitln(err) } + + gpuCollector, err := gpu.NewCollector() + if err != nil { + klog.Warningln("failed to initialize GPU collector:", err) + } + if err := registerer.Register(gpuCollector); err != nil { + klog.Exitln(err) + } registerer.MustRegister(info("node_agent_info", version)) if md := nodeCollector.Metadata(); md != nil { @@ -151,7 +160,7 @@ func main() { } } processInfoCh := profiling.Init(machineId, hostname) - cr, err := containers.NewRegistry(registerer, processInfoCh) + cr, err := containers.NewRegistry(registerer, processInfoCh, gpuCollector.ProcessUsageSampleCh) if err != nil { klog.Exitln(err) }