diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 012126b..401b021 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ Below are some basic guidelines. ## Requirements -* Linux ≥v4.16 (amd64, arm64) +* Linux ≥v5.1 (amd64, arm64) * Go v1.23 diff --git a/Dockerfile b/Dockerfile index 2918c89..580f747 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ COPY go.sum . RUN go mod download COPY . . ARG VERSION=unknown -RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . +RUN CGO_ENABLED=1 go build -mod=readonly -ldflags "-extldflags='-Wl,-z,lazy' -X 'github.com/coroot/coroot-node-agent/flags.Version=${VERSION}'" -o coroot-node-agent . FROM registry.access.redhat.com/ubi9/ubi diff --git a/README.md b/README.md index 0877509..bcc2609 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ The agent gathers metrics related to a node and the containers running on it, and it exposes them in the Prometheus format. -It uses eBPF to track container related events such as TCP connects, so the minimum supported Linux kernel version is 4.16. +It uses eBPF to track container related events such as TCP connects, so the minimum supported Linux kernel version is 5.1. 
diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index cee8aa5..1dd1df5 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -25,6 +25,7 @@ var ( lxcIdRegexp = regexp.MustCompile(`/lxc/([^/]+)`) systemSliceIdRegexp = regexp.MustCompile(`(/(system|runtime|reserved)\.slice/([^/]+))`) talosIdRegexp = regexp.MustCompile(`/(system|podruntime)/([^/]+)`) + lxcPayloadRegexp = regexp.MustCompile(`/lxc\.payload\.([^/]+)`) ) type ContainerType uint8 @@ -129,7 +130,14 @@ func NewFromProcessCgroupFile(filePath string) (*Cgroup, error) { continue } for _, cgType := range strings.Split(parts[1], ",") { - p := path.Join(baseCgroupPath, parts[2]) + cgPath := parts[2] + if strings.HasPrefix(parts[2], "/lxc.payload.") { + pp := strings.Split(cgPath, "/") + if len(parts) > 2 { + cgPath = "/" + pp[1] + } + } + p := path.Join(baseCgroupPath, cgPath) switch p { case "/", "/init.scope": continue @@ -147,24 +155,22 @@ func NewFromProcessCgroupFile(filePath string) (*Cgroup, error) { func containerByCgroup(cgroupPath string) (ContainerType, string, error) { parts := strings.Split(strings.TrimLeft(cgroupPath, "/"), "/") - if cgroupPath == "/init" { - return ContainerTypeTalosRuntime, "/talos/init", nil - } - if len(parts) < 2 { + if len(parts) == 0 { return ContainerTypeStandaloneProcess, "", nil } prefix := parts[0] - if prefix == "user.slice" || prefix == "init.scope" { + switch { + case cgroupPath == "/init": + return ContainerTypeTalosRuntime, "/talos/init", nil + case prefix == "user.slice" || prefix == "init.scope": return ContainerTypeStandaloneProcess, "", nil - } - if prefix == "docker" || (prefix == "system.slice" && strings.HasPrefix(parts[1], "docker-")) { + case prefix == "docker" || (prefix == "system.slice" && len(parts) > 1 && strings.HasPrefix(parts[1], "docker-")): matches := dockerIdRegexp.FindStringSubmatch(cgroupPath) if matches == nil { return ContainerTypeUnknown, "", fmt.Errorf("invalid docker cgroup %s", cgroupPath) } return ContainerTypeDocker, matches[1], nil - 
} - if strings.Contains(cgroupPath, "kubepods") { + case strings.Contains(cgroupPath, "kubepods"): crioMatches := crioIdRegexp.FindStringSubmatch(cgroupPath) if crioMatches != nil { return ContainerTypeCrio, crioMatches[1], nil @@ -181,27 +187,33 @@ func containerByCgroup(cgroupPath string) (ContainerType, string, error) { return ContainerTypeSandbox, "", nil } return ContainerTypeDocker, matches[1], nil - } - if prefix == "lxc" { - matches := lxcIdRegexp.FindStringSubmatch(cgroupPath) - if matches == nil { - return ContainerTypeUnknown, "", fmt.Errorf("invalid lxc cgroup %s", cgroupPath) - } - return ContainerTypeLxc, matches[1], nil - } - if prefix == "system" || prefix == "podruntime" { + case prefix == "system" || prefix == "podruntime": matches := talosIdRegexp.FindStringSubmatch(cgroupPath) if matches == nil { return ContainerTypeUnknown, "", fmt.Errorf("invalid talos runtime cgroup %s", cgroupPath) } return ContainerTypeTalosRuntime, path.Join("/talos/", matches[2]), nil - } - if prefix == "system.slice" || prefix == "runtime.slice" || prefix == "reserved.slice" { + case prefix == "system.slice" || prefix == "runtime.slice" || prefix == "reserved.slice": matches := systemSliceIdRegexp.FindStringSubmatch(cgroupPath) if matches == nil { return ContainerTypeUnknown, "", fmt.Errorf("invalid systemd cgroup %s", cgroupPath) } return ContainerTypeSystemdService, strings.Replace(matches[1], "\\x2d", "-", -1), nil + case prefix == "lxc": + matches := lxcIdRegexp.FindStringSubmatch(cgroupPath) + if matches == nil { + return ContainerTypeUnknown, "", fmt.Errorf("invalid lxc cgroup %s", cgroupPath) + } + return ContainerTypeLxc, matches[1], nil + case strings.HasPrefix(prefix, "lxc.payload."): + matches := lxcPayloadRegexp.FindStringSubmatch(cgroupPath) + if matches == nil { + return ContainerTypeUnknown, "", fmt.Errorf("invalid lxc payload cgroup %s", cgroupPath) + } + return ContainerTypeLxc, "/lxc/" + matches[1], nil + case len(parts) < 2: + return 
ContainerTypeStandaloneProcess, "", nil } + return ContainerTypeUnknown, "", fmt.Errorf("unknown container: %s", cgroupPath) } diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 2ee5b1b..b7adbfc 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNewFromProcessCgroupFile(t *testing.T) { @@ -75,6 +76,12 @@ func TestNewFromProcessCgroupFile(t *testing.T) { assert.Equal(t, "95cbe853416f52d927dec41f1406dd75015ea131244a1ca875a7cd4ebe927ac8", cg.ContainerId) assert.Equal(t, ContainerTypeDocker, cg.ContainerType) + cg, err = NewFromProcessCgroupFile(path.Join("fixtures/proc/3000/cgroup")) + require.Nil(t, err) + assert.Equal(t, "/lxc.payload.first", cg.Id) + assert.Equal(t, "/lxc/first", cg.ContainerId) + assert.Equal(t, ContainerTypeLxc, cg.ContainerType) + baseCgroupPath = "/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podc83d0428_58af_41eb_8dba_b9e6eddffe7b.slice/docker-0e612005fd07e7f47e2cd07df99a2b4e909446814d71d0b5e4efc7159dd51252.scope" defer func() { baseCgroupPath = "" @@ -178,4 +185,14 @@ func TestContainerByCgroup(t *testing.T) { as.Equal(typ, ContainerTypeTalosRuntime) as.Equal("/talos/init", id) as.Nil(err) + + typ, id, err = containerByCgroup("/lxc.payload.first") + as.Equal(typ, ContainerTypeLxc) + as.Equal("/lxc/first", id) + as.Nil(err) + + typ, id, err = containerByCgroup("/lxc.monitor.first") + as.Equal(ContainerTypeStandaloneProcess, typ) + as.Equal("", id) + as.Nil(err) } diff --git a/cgroup/fixtures/proc/3000/cgroup b/cgroup/fixtures/proc/3000/cgroup new file mode 100644 index 0000000..9cd26be --- /dev/null +++ b/cgroup/fixtures/proc/3000/cgroup @@ -0,0 +1 @@ +0::/lxc.payload.first/system.slice/systemd-logind.service diff --git a/containers/app.go b/containers/app.go index c6fa309..9e252f1 100644 --- a/containers/app.go +++ b/containers/app.go @@ -120,6 +120,8 @@ func 
guessApplicationTypeByCmdline(cmdline []byte) string { return "nats" case bytes.HasSuffix(cmd, []byte("java")): return "java" + case bytes.HasSuffix(cmd, []byte("ollama")): + return "ollama" case bytes.Contains(cmd, []byte("victoria-metrics")) || bytes.Contains(cmd, []byte("vmstorage")) || bytes.Contains(cmd, []byte("vminsert")) || diff --git a/containers/container.go b/containers/container.go index c9b8e41..a35eb4c 100644 --- a/containers/container.go +++ b/containers/container.go @@ -29,6 +29,7 @@ var ( gcInterval = 10 * time.Minute pingTimeout = 300 * time.Millisecond multilineCollectorTimeout = time.Second + gpuStatsWindow = 15 * time.Second ) type ContainerID string @@ -132,6 +133,8 @@ type Container struct { l7Stats L7Stats dnsStats *L7Metrics + gpuStats map[string]*GpuUsage + oomKills int pythonThreadLockWaitTime time.Duration @@ -181,6 +184,8 @@ func NewContainer(id ContainerID, cg *cgroup.Cgroup, md *ContainerMetadata, pid l7Stats: L7Stats{}, dnsStats: &L7Metrics{}, + gpuStats: map[string]*GpuUsage{}, + mounts: map[string]proc.MountInfo{}, seenMounts: map[uint64]struct{}{}, @@ -370,7 +375,27 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) { process.dotNetMonitor.Collect(ch) } } + + for _, usage := range c.gpuStats { + usage.Reset() + } + if usage := process.getGPUUsage(); usage != nil { + for uuid, u := range usage { + tu := c.gpuStats[uuid] + if tu == nil { + tu = &GpuUsage{} + c.gpuStats[uuid] = tu + } + tu.GPU += u.GPU + tu.Memory += u.Memory + } + } } + for uuid, usage := range c.gpuStats { + ch <- gauge(metrics.GpuUsagePercent, usage.GPU, uuid) + ch <- gauge(metrics.GpuMemoryUsagePercent, usage.Memory, uuid) + } + for appType := range appTypes { ch <- gauge(metrics.ApplicationType, 1, appType) } diff --git a/containers/metrics.go b/containers/metrics.go index dcc17cd..2d0df76 100644 --- a/containers/metrics.go +++ b/containers/metrics.go @@ -51,6 +51,9 @@ var metrics = struct { PythonThreadLockWaitTime *prometheus.Desc + GpuUsagePercent 
*prometheus.Desc + GpuMemoryUsagePercent *prometheus.Desc + Ip2Fqdn *prometheus.Desc }{ ContainerInfo: metric("container_info", "Meta information about the container", "image", "systemd_triggered_by"), @@ -100,6 +103,9 @@ var metrics = struct { Ip2Fqdn: metric("ip_to_fqdn", "Mapping IP addresses to FQDNs based on DNS requests initiated by containers", "ip", "fqdn"), PythonThreadLockWaitTime: metric("container_python_thread_lock_wait_time_seconds", "Time spent waiting acquiring GIL in seconds"), + + GpuUsagePercent: metric("container_resources_gpu_usage_percent", "Percent of GPU compute resources used by the container", "gpu_uuid"), + GpuMemoryUsagePercent: metric("container_resources_gpu_memory_usage_percent", "Percent of GPU memory used by the container", "gpu_uuid"), } var ( diff --git a/containers/process.go b/containers/process.go index ddf5ac2..9ee7242 100644 --- a/containers/process.go +++ b/containers/process.go @@ -8,11 +8,22 @@ import ( "github.com/cilium/ebpf/link" "github.com/coroot/coroot-node-agent/ebpftracer" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/jpillora/backoff" "github.com/mdlayher/taskstats" ) +type GpuUsage struct { + GPU float64 + Memory float64 +} + +func (gu *GpuUsage) Reset() { + gu.Memory = 0 + gu.GPU = 0 +} + type Process struct { Pid uint32 StartedAt time.Time @@ -29,6 +40,8 @@ type Process struct { goTlsUprobesChecked bool openSslUprobesChecked bool pythonGilChecked bool + + gpuUsageSamples []gpu.ProcessUsageSample } func NewProcess(pid uint32, stats *taskstats.Stats, tracer *ebpftracer.Tracer) *Process { @@ -97,6 +110,43 @@ func (p *Process) instrumentPython(cmdline []byte, tracer *ebpftracer.Tracer) { p.uprobes = append(p.uprobes, tracer.AttachPythonThreadLockProbes(p.Pid)...) 
} +func (p *Process) addGpuUsageSample(sample gpu.ProcessUsageSample) { + p.removeOldGpuUsageSamples(sample.Timestamp.Add(-gpuStatsWindow)) + p.gpuUsageSamples = append(p.gpuUsageSamples, sample) +} + +func (p *Process) getGPUUsage() map[string]*GpuUsage { + p.removeOldGpuUsageSamples(time.Now().Add(-gpuStatsWindow)) + if len(p.gpuUsageSamples) == 0 { + return nil + } + gpuStatsWindowSeconds := gpuStatsWindow.Seconds() + res := make(map[string]*GpuUsage) + for _, sample := range p.gpuUsageSamples { + u := res[sample.UUID] + if u == nil { + u = &GpuUsage{} + res[sample.UUID] = u + } + u.GPU += float64(sample.GPUPercent) / gpuStatsWindowSeconds + u.Memory += float64(sample.MemoryPercent) / gpuStatsWindowSeconds + } + return res +} + +func (p *Process) removeOldGpuUsageSamples(cutoff time.Time) { + i := 0 + for ; i < len(p.gpuUsageSamples); i++ { + if p.gpuUsageSamples[i].Timestamp.After(cutoff) { + break + } + } + if i > 0 { + copy(p.gpuUsageSamples, p.gpuUsageSamples[i:]) + p.gpuUsageSamples = p.gpuUsageSamples[:len(p.gpuUsageSamples)-i] + } +} + func (p *Process) Close() { p.cancelFunc() for _, u := range p.uprobes { diff --git a/containers/registry.go b/containers/registry.go index aee9f76..2eba30b 100644 --- a/containers/registry.go +++ b/containers/registry.go @@ -14,6 +14,7 @@ import ( "github.com/coroot/coroot-node-agent/common" "github.com/coroot/coroot-node-agent/ebpftracer" "github.com/coroot/coroot-node-agent/flags" + "github.com/coroot/coroot-node-agent/gpu" "github.com/coroot/coroot-node-agent/proc" "github.com/prometheus/client_golang/prometheus" "github.com/vishvananda/netns" @@ -59,9 +60,11 @@ type Registry struct { trafficStatsLastUpdated time.Time trafficStatsLock sync.Mutex trafficStatsUpdateCh chan *TrafficStatsUpdate + + gpuProcessUsageSampleChan chan gpu.ProcessUsageSample } -func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (*Registry, error) { +func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- 
ProcessInfo, gpuProcessUsageSampleChan chan gpu.ProcessUsageSample) (*Registry, error) { ns, err := proc.GetSelfNetNs() if err != nil { return nil, err @@ -113,6 +116,8 @@ func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo) (* tracer: ebpftracer.NewTracer(hostNetNs, selfNetNs, *flags.DisableL7Tracing), trafficStatsUpdateCh: make(chan *TrafficStatsUpdate), + + gpuProcessUsageSampleChan: gpuProcessUsageSampleChan, } if err = reg.Register(r); err != nil { return nil, err @@ -205,6 +210,12 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) { if c := r.containersByPid[u.Pid]; c != nil { c.updateTrafficStats(u) } + case sample := <-r.gpuProcessUsageSampleChan: + if c := r.containersByPid[sample.Pid]; c != nil { + if p := c.processes[sample.Pid]; p != nil { + p.addGpuUsageSample(sample) + } + } case e, more := <-ch: if !more { return diff --git a/go.mod b/go.mod index 0fe4c53..f16c2a0 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,11 @@ module github.com/coroot/coroot-node-agent go 1.23.8 require ( - cloud.google.com/go/compute/metadata v0.5.2 + cloud.google.com/go/compute/metadata v0.6.0 github.com/ClickHouse/ch-go v0.62.0 + github.com/NVIDIA/go-nvml v0.12.4-1 github.com/agoda-com/opentelemetry-logs-go v0.4.1 - github.com/cilium/cilium v1.17.2 + github.com/cilium/cilium v1.17.3 github.com/cilium/ebpf v0.17.3 github.com/containerd/cgroups v1.0.4 github.com/containerd/containerd v1.6.38 @@ -28,19 +29,19 @@ require ( github.com/prometheus/prometheus v0.51.2 github.com/pyroscope-io/dotnetdiag v1.2.1 github.com/stretchr/testify v1.10.0 - github.com/vishvananda/netlink v1.3.1-0.20250221194427-0af32151e72b + github.com/vishvananda/netlink v1.3.1-0.20250303224720-0e7078ed04c8 github.com/vishvananda/netns v0.0.5 github.com/xin053/hsperfdata v0.2.3 go.mongodb.org/mongo-driver v1.14.0 go.opentelemetry.io/otel v1.34.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0 - 
go.opentelemetry.io/otel/sdk v1.31.0 + go.opentelemetry.io/otel/sdk v1.34.0 go.opentelemetry.io/otel/trace v1.34.0 golang.org/x/arch v0.4.0 golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa - golang.org/x/net v0.36.0 - golang.org/x/sys v0.30.0 + golang.org/x/net v0.38.0 + golang.org/x/sys v0.32.0 golang.org/x/time v0.8.0 gopkg.in/alecthomas/kingpin.v2 v2.2.6 gopkg.in/yaml.v2 v2.4.0 @@ -60,7 +61,7 @@ require ( github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cilium/hive v0.0.0-20250121145729-e67f66eb0375 // indirect - github.com/cilium/proxy v0.0.0-20241115112946-fb67566cbd95 // indirect + github.com/cilium/proxy v0.0.0-20250408103314-d3be7c547203 // indirect github.com/cilium/statedb v0.3.6 // indirect github.com/cilium/stream v0.0.0-20241203114243-53c3e5d79744 // indirect github.com/cilium/workerpool v1.2.0 // indirect @@ -106,7 +107,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/gopacket/gopacket v1.3.1 // indirect github.com/grafana/regexp v0.0.0-20221123153739-15dc172cd2db // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hashicorp/hcl v1.0.1-vault-5 // indirect github.com/ianlancetaylor/demangle v0.0.0-20240312041847-bd984b5ce465 // indirect @@ -165,7 +166,7 @@ require ( go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect go.opentelemetry.io/otel/metric v1.34.0 // indirect - go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.opentelemetry.io/proto/otlp v1.5.0 // indirect go.uber.org/dig v1.17.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect @@ -173,14 +174,14 @@ require ( go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2 // 
indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.11.0 // indirect - golang.org/x/term v0.29.0 // indirect - golang.org/x/text v0.22.0 // indirect + golang.org/x/sync v0.12.0 // indirect + golang.org/x/term v0.30.0 // indirect + golang.org/x/text v0.23.0 // indirect golang.org/x/tools v0.28.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d // indirect - google.golang.org/grpc v1.69.2 // indirect - google.golang.org/protobuf v1.36.5 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250407143221-ac9807e6c755 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250407143221-ac9807e6c755 // indirect + google.golang.org/grpc v1.71.1 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect diff --git a/go.sum b/go.sum index 079f40b..ac97851 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,6 @@ bazil.org/fuse v0.0.0-20160811212531-371fbbdaa898/go.mod h1:Xbm+BRKSBEpa4q4hTSxohYNQpsxXPbPry4JJWOB3LB8= -cel.dev/expr v0.18.0 h1:CJ6drgk+Hf96lkLikr4rFf19WrU0BOWEihyZnI2TAzo= -cel.dev/expr v0.18.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= +cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= +cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -16,8 +16,8 @@ cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZ cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod 
h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/compute/metadata v0.5.2 h1:UxK4uu/Tn+I3p2dYWTfiX4wva7aYlKixAHn3fyqngqo= -cloud.google.com/go/compute/metadata v0.5.2/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k= +cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4j01OwKxG9I= +cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= @@ -70,6 +70,8 @@ github.com/Microsoft/hcsshim v0.9.12 h1:0Wgl1fRF4WmBuqP6EnHk2w3m7CCCumD/KUumZxp7 github.com/Microsoft/hcsshim v0.9.12/go.mod h1:qAiPvMgZoM0wpkVg6qMdSEu+1VtI6/qHOOPkTGt8ftQ= github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5hlzMzRKMLyo42nCZ9oml8AdTlq/0cvIaBv6tK1RehU= github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY= +github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc= +github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= @@ -128,8 +130,8 @@ github.com/checkpoint-restore/go-criu/v5 v5.0.0/go.mod h1:cfwC0EG7HMUenopBsUf9d8 github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod 
h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/cilium/cilium v1.17.2 h1:hMahLKho06pzcAk8X+Co7jXhhwrj79rkTsy5FFehr8Q= -github.com/cilium/cilium v1.17.2/go.mod h1:BMwiENNHcSrSsx59fmzyu9V5pBKzav8bWawiA2PcA7U= +github.com/cilium/cilium v1.17.3 h1:puIIQhN4GnDZmlvEyMy4BTniffMHirFgaHeFZksavpg= +github.com/cilium/cilium v1.17.3/go.mod h1:fIBDMpEU/adT00vwvSR2ULvG0as10rNd0CzhJuU3Kao= github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmEg9bt0VpxxWqJlO4iwu3FBdHUzV7wQVg= github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc= github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= @@ -142,8 +144,8 @@ github.com/cilium/fake v0.6.1 h1:cLkNx1nkF0b0pPW79JaQxaI5oG2/rBzRKpp0YUg1fTA= github.com/cilium/fake v0.6.1/go.mod h1:V9lCbbcsnSf3vB6sdOP7Q0bsUUJ/jyHPZxnFAw5nPUc= github.com/cilium/hive v0.0.0-20250121145729-e67f66eb0375 h1:EhoCO0AI3qJavnhfAls4w7VpVVpAr12wIh293sNA0hQ= github.com/cilium/hive v0.0.0-20250121145729-e67f66eb0375/go.mod h1:pI2GJ1n3SLKIQVFrKF7W6A6gb6BQkZ+3Hp4PAEo5SuI= -github.com/cilium/proxy v0.0.0-20241115112946-fb67566cbd95 h1:iMn0++U3CDqoDINY5JLOhlPcjj3kW/xCmse+d+EZkOM= -github.com/cilium/proxy v0.0.0-20241115112946-fb67566cbd95/go.mod h1:/UoCz3gByKwF5gCHFMUhwmIN5/Pgmb8LTIrfBlmjGCo= +github.com/cilium/proxy v0.0.0-20250408103314-d3be7c547203 h1:ncS+290YywupX+tmbB5bFbzcsZcrlK+go27Aihs36eQ= +github.com/cilium/proxy v0.0.0-20250408103314-d3be7c547203/go.mod h1:Kksp75+KLYLFwTLA6qcrUlIOQkXLRqQm+Tm6foOTJyM= github.com/cilium/statedb v0.3.6 h1:dGwzZTJgVWlnG7io0Wl0XsI7ULsz2TbNqH8Ag+dP6is= github.com/cilium/statedb v0.3.6/go.mod h1:n2lNVxi8vz5Up1Y1rRD++aQP2izQA932fUwTkedKSV0= github.com/cilium/stream v0.0.0-20241203114243-53c3e5d79744 h1:f+CgYUy2YyZ2EX31QSqf3vwFiJJQSAMIQLn4d3QQYno= @@ -154,8 +156,8 @@ github.com/client9/misspell 
v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f h1:C5bqEmzEPLsHm9Mv73lSE9e9bKV23aB1vxOsmZrkl3k= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE= github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU= @@ -337,8 +339,8 @@ github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1m github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6Uu2PdjCQwWCJ3bM= -github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4= +github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= 
+github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= @@ -537,8 +539,8 @@ github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgf github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3ArSgIyScOAyMRqBxRg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ= github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v0.0.0-20161216184304-ed905158d874/go.mod h1:JMRHfdO9jKNzS/+BTlxCjKNQHg/jZAft8U7LloJvN7I= @@ -919,8 +921,8 @@ github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtX github.com/vishvananda/netlink v0.0.0-20181108222139-023a6dafdcdf/go.mod h1:+SR5DhBJrl6ZM7CoCKvpw5BKroDKQ+PJqOg65H/2ktk= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= github.com/vishvananda/netlink v1.1.1-0.20201029203352-d40f9887b852/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= 
-github.com/vishvananda/netlink v1.3.1-0.20250221194427-0af32151e72b h1:hYWtmuzlR0jpWu+ljWfPMi7oNiZ9x/D3GbBqgZTOhyI= -github.com/vishvananda/netlink v1.3.1-0.20250221194427-0af32151e72b/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs= +github.com/vishvananda/netlink v1.3.1-0.20250303224720-0e7078ed04c8 h1:Y4egeTrP7sccowz2GWTJVtHlwkZippgBTpUmMteFUWQ= +github.com/vishvananda/netlink v1.3.1-0.20250303224720-0e7078ed04c8/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs= github.com/vishvananda/netns v0.0.0-20180720170159-13995c7128cc/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= @@ -976,15 +978,15 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0 h1:j9+03 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.28.0/go.mod h1:Y5+XiUG4Emn1hTfciPzGPJaSI+RpDts6BnCIir0SLqk= go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= -go.opentelemetry.io/otel/sdk v1.31.0 h1:xLY3abVHYZ5HSfOg3l2E5LUj2Cwva5Y7yGxnSW9H5Gk= -go.opentelemetry.io/otel/sdk v1.31.0/go.mod h1:TfRbMdhvxIIr/B2N2LQW2S5v9m3gOQ/08KsbbO5BPT0= -go.opentelemetry.io/otel/sdk/metric v1.31.0 h1:i9hxxLJF/9kkvfHppyLL55aW7iIJz4JjxTeYusH7zMc= -go.opentelemetry.io/otel/sdk/metric v1.31.0/go.mod h1:CRInTMVvNhUKgSAMbKyTMxqOBC0zgyxzW55lZzX43Y8= +go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= +go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= +go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= +go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= 
go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= -go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= -go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= +go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/dig v1.17.1 h1:Tga8Lz8PcYNsWsyHMZ1Vm0OQOUaJNDyvPImgbAu9YSc= @@ -1097,8 +1099,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210825183410-e898025ed96a/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA= -golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I= +golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= +golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1117,8 +1119,8 @@ golang.org/x/sync 
v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= -golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= +golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1203,11 +1205,11 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= +golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= -golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= +golang.org/x/term v0.30.0 
h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= +golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1215,8 +1217,8 @@ golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= +golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1320,10 +1322,10 @@ google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfG google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20200527145253-8367513e4ece/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= google.golang.org/genproto v0.0.0-20201110150050-8816d57aaa9a/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28 h1:M0KvPgPmDZHPlbRbaNU1APr28TvwvvdUPlSv7PUvy8g= 
-google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28/go.mod h1:dguCy7UOdZhTvLzDyt15+rOrawrpM4q7DD9dQ1P11P4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d h1:xJJRGY7TJcvIlpSrN3K6LAWgNFUILlO+OMAqtg9aqnw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250102185135-69823020774d/go.mod h1:3ENsm/5D1mzDyhpzeRi1NR784I0BcofWBoSc5QqqMK4= +google.golang.org/genproto/googleapis/api v0.0.0-20250407143221-ac9807e6c755 h1:AMLTAunltONNuzWgVPZXrjLWtXpsG6A3yLLPEoJ/IjU= +google.golang.org/genproto/googleapis/api v0.0.0-20250407143221-ac9807e6c755/go.mod h1:2R6XrVC8Oc08GlNh8ujEpc7HkLiEZ16QeY7FxIs20ac= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250407143221-ac9807e6c755 h1:TwXJCGVREgQ/cl18iY0Z4wJCTL/GmW+Um2oSwZiZPnc= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250407143221-ac9807e6c755/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v0.0.0-20160317175043-d3ddb4469d5a/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= @@ -1342,8 +1344,8 @@ google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTp google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.40.0/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9KAK34= -google.golang.org/grpc v1.69.2 h1:U3S9QEtbXC0bYNvRtcoklF3xGtLViumSYxWykJS+7AU= -google.golang.org/grpc v1.69.2/go.mod h1:vyjdE6jLBI76dgpDojsFGNaHlxdjXN9ghpnd2o7JGZ4= +google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= +google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod 
h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
 google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
 google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -1357,8 +1359,8 @@ google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlba
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
 google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
-google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM=
-google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
+google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
+google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
 gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U=
 gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc=
 gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
diff --git a/gpu/gpu.go b/gpu/gpu.go
new file mode 100644
index 0000000..f91d690
--- /dev/null
+++ b/gpu/gpu.go
@@ -0,0 +1,332 @@
+// Package gpu collects NVIDIA GPU metrics through NVML and exposes them as a
+// Prometheus collector.
+package gpu
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"runtime"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/coroot/coroot-node-agent/proc"
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/klog/v2"
+)
+
+// Descriptors of the per-GPU metrics emitted by Collector.
+var (
+	gpuInfo = prometheus.NewDesc(
+		"node_gpu_info",
+		"Meta information about the GPU",
+		[]string{"gpu_uuid", "name"}, nil,
+	)
+	gpuMemoryTotal = prometheus.NewDesc(
+		"node_resources_gpu_memory_total_bytes",
+		"Total memory available on the GPU in bytes",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuMemoryUsed = prometheus.NewDesc(
+		"node_resources_gpu_memory_used_bytes",
+		"GPU memory currently in use in bytes",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuMemoryUsageAvg = prometheus.NewDesc(
+		"node_resources_gpu_memory_utilization_percent_avg",
+		"Average GPU memory utilization (percentage) over the collection interval",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuTemperature = prometheus.NewDesc(
+		"node_resources_gpu_temperature_celsius",
+		"Current temperature of the GPU in Celsius",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuPowerWatts = prometheus.NewDesc(
+		"node_resources_gpu_power_usage_watts",
+		"Current power usage of the GPU in watts",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuMemoryUsagePeak = prometheus.NewDesc(
+		"node_resources_gpu_memory_utilization_percent_peak",
+		"Peak GPU memory utilization (percentage) over the collection interval",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuUsageAvg = prometheus.NewDesc(
+		"node_resources_gpu_utilization_percent_avg",
+		"Average GPU core utilization (percentage) over the collection interval",
+		[]string{"gpu_uuid"}, nil,
+	)
+	gpuUsagePeak = prometheus.NewDesc(
+		"node_resources_gpu_utilization_percent_peak",
+		"Peak GPU core utilization (percentage) over the collection interval",
+		[]string{"gpu_uuid"}, nil,
+	)
+)
+
+// Collector exposes GPU metrics in the Prometheus format and streams
+// per-process utilization samples to ProcessUsageSampleCh.
+type Collector struct {
+	ProcessUsageSampleCh chan ProcessUsageSample
+	iface                nvml.Interface
+	devices              []*Device
+	lock                 sync.Mutex
+}
+
+// Device is a GPU discovered through NVML. lastSampleTime keeps, per sampling
+// type, the timestamp of the newest sample already consumed by Collect.
+type Device struct {
+	UUID           string
+	Name           string
+	device         nvml.Device
+	lastSampleTime map[nvml.SamplingType]uint64
+}
+
+// ProcessUsageSample is one per-process utilization reading of a device.
+type ProcessUsageSample struct {
+	UUID          string
+	Pid           uint32
+	Timestamp     time.Time
+	GPUPercent    uint32
+	MemoryPercent uint32
+}
+
+// NewCollector locates the NVML library on the host, initializes it and
+// enumerates the GPUs. If the library is not found, a collector without
+// devices is returned and no error is reported. The returned collector is
+// never nil, even when an error is returned, so it can always be registered.
+func NewCollector() (*Collector, error) {
+	c := &Collector{
+		ProcessUsageSampleCh: make(chan ProcessUsageSample, 100),
+	}
+
+	libPath, err := findNvidiaMLLib()
+	if err != nil {
+		klog.Infoln(err)
+		return c, nil
+	}
+	klog.Infof("found NVML lib at %s", libPath)
+
+	c.iface = nvml.New(nvml.WithLibraryPath(libPath))
+	if ret := c.iface.Init(); ret != nvml.SUCCESS {
+		return c, fmt.Errorf("unable to initialize NVML: %s", nvml.ErrorString(ret))
+	}
+	count, ret := c.iface.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		return c, fmt.Errorf("unable to get device count: %s", nvml.ErrorString(ret))
+	}
+	var names []string
+	for i := 0; i < count; i++ {
+		device, ret := c.iface.DeviceGetHandleByIndex(i)
+		if ret != nvml.SUCCESS {
+			return c, fmt.Errorf("unable to get handle of device %d: %s", i, nvml.ErrorString(ret))
+		}
+		dev := Device{
+			lastSampleTime: map[nvml.SamplingType]uint64{},
+			device:         device,
+		}
+		if dev.UUID, ret = device.GetUUID(); ret != nvml.SUCCESS {
+			return c, fmt.Errorf("unable to get UUID of device %d: %s", i, nvml.ErrorString(ret))
+		}
+		if dev.Name, ret = device.GetName(); ret != nvml.SUCCESS {
+			return c, fmt.Errorf("unable to get name of device %d: %s", i, nvml.ErrorString(ret))
+		}
+		names = append(names, dev.Name)
+		c.devices = append(c.devices, &dev)
+	}
+	if len(names) > 0 {
+		klog.Infof("found %d GPU: %s", len(names), strings.Join(names, ", "))
+	}
+	go c.processUtilizationPoller()
+	return c, nil
+}
+
+// processUtilizationPoller polls every device once per second and forwards
+// the per-process samples with non-zero SM utilization to
+// ProcessUsageSampleCh. It never returns.
+func (c *Collector) processUtilizationPoller() {
+	ticker := time.NewTicker(1 * time.Second)
+	// Each device gets its own cursor: a timestamp shared by all devices would
+	// make a sample from one GPU hide older, still unseen samples of another.
+	now := uint64(time.Now().UnixMicro())
+	lastTs := make([]uint64, len(c.devices))
+	for i := range lastTs {
+		lastTs[i] = now
+	}
+	for range ticker.C {
+		for i, dev := range c.devices {
+			since := lastTs[i]
+			samples, ret := dev.device.GetProcessUtilization(since)
+			if ret != nvml.SUCCESS {
+				continue
+			}
+			for _, sample := range samples {
+				// Compare with the cursor of the previous poll, not with a running
+				// maximum, so the order of the samples does not matter.
+				if sample.TimeStamp <= since {
+					continue
+				}
+				if sample.SmUtil > 0 {
+					c.ProcessUsageSampleCh <- ProcessUsageSample{
+						UUID:          dev.UUID,
+						Pid:           sample.Pid,
+						GPUPercent:    sample.SmUtil,
+						MemoryPercent: sample.MemUtil,
+						Timestamp:     time.UnixMicro(int64(sample.TimeStamp)),
+					}
+				}
+				if sample.TimeStamp > lastTs[i] {
+					lastTs[i] = sample.TimeStamp
+				}
+			}
+		}
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- gpuInfo
+	ch <- gpuMemoryTotal
+	ch <- gpuMemoryUsed
+	ch <- gpuMemoryUsageAvg
+	ch <- gpuMemoryUsagePeak
+	ch <- gpuUsageAvg
+	ch <- gpuUsagePeak
+	ch <- gpuTemperature
+	ch <- gpuPowerWatts
+}
+
+// Collect implements prometheus.Collector. Utilization gauges are computed
+// from the samples NVML buffered since the previous call and are omitted when
+// there are none.
+func (c *Collector) Collect(ch chan<- prometheus.Metric) {
+	c.lock.Lock()
+	defer c.lock.Unlock()
+	for _, dev := range c.devices {
+		ch <- gauge(gpuInfo, 1, dev.UUID, dev.Name)
+
+		mi, ret := dev.device.GetMemoryInfo()
+		if ret == nvml.SUCCESS {
+			ch <- gauge(gpuMemoryTotal, float64(mi.Total), dev.UUID)
+			ch <- gauge(gpuMemoryUsed, float64(mi.Used), dev.UUID)
+		}
+		if t, ret := dev.device.GetTemperature(nvml.TEMPERATURE_GPU); ret == nvml.SUCCESS {
+			ch <- gauge(gpuTemperature, float64(t), dev.UUID)
+		}
+		// The reading is divided by 1000 to convert it to watts.
+		if mw, ret := dev.device.GetPowerUsage(); ret == nvml.SUCCESS {
+			ch <- gauge(gpuPowerWatts, float64(mw)/1000., dev.UUID)
+		}
+		for _, st := range []nvml.SamplingType{nvml.GPU_UTILIZATION_SAMPLES, nvml.MEMORY_UTILIZATION_SAMPLES} {
+			lastTs := dev.lastSampleTime[st]
+			valtype, samples, ret := dev.device.GetSamples(st, lastTs)
+			if ret != nvml.SUCCESS {
+				continue
+			}
+			total := float64(0)
+			count := float64(0)
+			peak := float64(0)
+			for _, sample := range samples {
+				if sample.TimeStamp <= lastTs {
+					continue
+				}
+				value, err := valueToFloat(valtype, sample.SampleValue)
+				if err != nil {
+					continue
+				}
+				total += value
+				if value > peak {
+					peak = value
+				}
+				count++
+				lastTs = sample.TimeStamp
+			}
+			if count > 0 {
+				switch st {
+				case nvml.GPU_UTILIZATION_SAMPLES:
+					ch <- gauge(gpuUsageAvg, total/count, dev.UUID)
+					ch <- gauge(gpuUsagePeak, peak, dev.UUID)
+				case nvml.MEMORY_UTILIZATION_SAMPLES:
+					ch <- gauge(gpuMemoryUsageAvg, total/count, dev.UUID)
+					ch <- gauge(gpuMemoryUsagePeak, peak, dev.UUID)
+				}
+			}
+			dev.lastSampleTime[st] = lastTs
+		}
+	}
+}
+
+// Close shuts NVML down. It is a no-op if the library was never loaded.
+func (c *Collector) Close() {
+	if c.iface == nil {
+		return
+	}
+	c.iface.Shutdown()
+}
+
+// findNvidiaMLLib returns the host path of the first libnvidia-ml.so.1 found
+// in the known locations.
+func findNvidiaMLLib() (string, error) {
+	paths := []string{
+		// gpu-operator
+		"/run/nvidia/driver/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
+		"/run/nvidia/driver/usr/lib64/libnvidia-ml.so.1",
+		"/home/kubernetes/bin/nvidia/lib64/libnvidia-ml.so.1", //GKE
+
+		"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
+		"/usr/lib64/libnvidia-ml.so.1",
+		"/usr/local/cuda/lib64/libnvidia-ml.so.1",
+		"/usr/lib/libnvidia-ml.so.1",
+	}
+	if runtime.GOARCH == "arm64" {
+		paths = append(paths,
+			"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
+			"/run/nvidia/driver/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
+			"/home/kubernetes/bin/nvidia/lib64-aarch64/libnvidia-ml.so.1", //GKE
+		)
+	}
+	for _, p := range paths {
+		hostPath := proc.HostPath(p)
+		if _, err := os.Stat(hostPath); err == nil {
+			return hostPath, nil
+		}
+	}
+	return "", fmt.Errorf("libnvidia-ml.so.1 not found in known paths")
+}
+
+// valueToFloat decodes a raw little-endian NVML sample value of the given type.
+func valueToFloat(valueType nvml.ValueType, value [8]byte) (float64, error) {
+	r := bytes.NewReader(value[:])
+	switch valueType {
+	case nvml.VALUE_TYPE_DOUBLE:
+		var v float64
+		err := binary.Read(r, binary.LittleEndian, &v)
+		return v, err
+	case nvml.VALUE_TYPE_UNSIGNED_INT:
+		var v uint32
+		err := binary.Read(r, binary.LittleEndian, &v)
+		return float64(v), err
+	case nvml.VALUE_TYPE_UNSIGNED_LONG, nvml.VALUE_TYPE_UNSIGNED_LONG_LONG:
+		var v uint64
+		err := binary.Read(r, binary.LittleEndian, &v)
+		return float64(v), err
+	case nvml.VALUE_TYPE_SIGNED_LONG_LONG:
+		var v int64
+		err := binary.Read(r, binary.LittleEndian, &v)
+		return float64(v), err
+	case nvml.VALUE_TYPE_SIGNED_INT:
+		var v int32
+		err := binary.Read(r, binary.LittleEndian, &v)
+		return float64(v), err
+	default:
+		return 0, fmt.Errorf("unsupported value type %d", valueType)
+	}
+}
+
+// gauge builds a constant gauge metric.
+func gauge(desc *prometheus.Desc, value float64, labelValues ...string) prometheus.Metric {
+	return prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, labelValues...)
+}
diff --git a/main.go b/main.go
index 6f47e0d..173dd88 100644
--- a/main.go
+++ b/main.go
@@ -11,6 +11,7 @@ import (
 	"github.com/coroot/coroot-node-agent/common"
 	"github.com/coroot/coroot-node-agent/containers"
 	"github.com/coroot/coroot-node-agent/flags"
+	"github.com/coroot/coroot-node-agent/gpu"
 	"github.com/coroot/coroot-node-agent/logs"
 	"github.com/coroot/coroot-node-agent/node"
 	"github.com/coroot/coroot-node-agent/proc"
@@ -141,6 +142,14 @@ func main() {
 	if err := registerer.Register(nodeCollector); err != nil {
 		klog.Exitln(err)
 	}
+
+	gpuCollector, err := gpu.NewCollector()
+	if err != nil {
+		klog.Warningln("failed to initialize GPU collector:", err)
+	}
+	if err := registerer.Register(gpuCollector); err != nil {
+		klog.Exitln(err)
+	}
 	registerer.MustRegister(info("node_agent_info", version))
 
 	if md := nodeCollector.Metadata(); md != nil {
@@ -151,7 +160,7 @@ func main() {
 		}
 	}
 	processInfoCh := profiling.Init(machineId, hostname)
-	cr, err := containers.NewRegistry(registerer, processInfoCh)
+	cr, err := containers.NewRegistry(registerer, processInfoCh, gpuCollector.ProcessUsageSampleCh)
 	if err != nil {
 		klog.Exitln(err)
 	}
diff --git a/prom/remote_writer.go b/prom/remote_writer.go
index 4a71f2d..7e9c567 100644
--- a/prom/remote_writer.go
+++ b/prom/remote_writer.go
@@ -35,8 +35,6 @@ type Agent struct {
 
 	spoolDir     string
 	maxSpoolSize int64
-
-	compressionBuf []byte
 }
 
 func StartAgent(reg *prometheus.Registry, machineId string) error {
@@ -139,6 +137,7 @@ func (a *Agent) send(fPath string) error {
 	req.Header.Set("Content-Type", "application/x-protobuf")
 	req.Header.Set("Content-Encoding", "snappy")
 	req.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
+	t := time.Now()
 	resp, err := a.httpClient.Do(req)
 	if err != nil {
 		return err
@@ -147,6 +146,7 @@ func (a *Agent) send(fPath string) error {
 	if resp.StatusCode >= 300 {
 		return errors.New(resp.Status)
 	}
+	klog.Infof("sent metrics in %s", time.Since(t).Truncate(time.Millisecond))
 	return nil
 }
 
@@ -167,9 +167,8 @@
func (a *Agent) scrape() error { return err } - a.compressionBuf = snappy.Encode(a.compressionBuf, decompressed) - err = a.writeToSpool(timestamp, a.compressionBuf) - a.compressionBuf = a.compressionBuf[:cap(a.compressionBuf)] + compressed := snappy.Encode(nil, decompressed) + err = a.writeToSpool(timestamp, compressed) return err }