From 13045bfc0990630b27c336da78dfb346338da73b Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 14:31:53 +0530 Subject: [PATCH 01/19] feat: add GPU collector using NVML for metrics, events, and MIG support --- crates/agent-core/src/collectors/gpu.rs | 23 ++++++++++++++++------ crates/agent-core/src/nvml_ext.rs | 26 +++++++++++++++++++++---- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index e50defe..0736842 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -33,7 +33,7 @@ use crate::metrics::MetricsRegistry; use crate::state::{ComputeInstanceNode, GpuInstanceNode, MigTree}; use crate::state::{ FabricLink, FabricLinkType, GpuCapabilities, GpuHealth, GpuIdentity, GpuStatus, GpuTopo, - GpuVendor, StatusState, + GpuVendor, MigDeviceStatus, StatusState, }; #[cfg(all(feature = "gpu", target_os = "linux"))] use nvml_wrapper::error::NvmlError; @@ -910,8 +910,8 @@ impl Collector for GpuCollector { .set(1.0); } for mig in &migs.devices { - let mig_label = - mig.uuid.as_deref().unwrap_or(mig.id.to_string().as_str()); + let mig_id_string = mig.id.to_string(); + let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); let compat_label = if self.k8s_mode { k8s_resource_name( self.resource_prefix, @@ -1096,6 +1096,17 @@ impl Collector for GpuCollector { .with_label_values(&[uuid_label, gpu_label.as_str()]) .set(if migs.supported { 1.0 } else { 0.0 }); } + } else { + metrics + .gpu_mig_supported + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(0.0); + metrics + .gpu_mig_enabled + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(0.0); + } + } } #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))] { @@ -1245,7 +1256,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Result Result nvmlReturn_t; + fn nvmlDeviceGetPcieReplayCounter( + device: nvmlDevice_t, + value: *mut u32, + ) -> nvmlReturn_t; + fn nvmlDeviceGetFieldValues( + device: nvmlDevice_t, + valuesCount: u32, + values: *mut nvmlFieldValue_t, + ) -> nvmlReturn_t; +} + /// Errors from extended NVML calls. #[derive(thiserror::Error, Debug)] pub enum NvmlExtError { @@ -65,11 +83,11 @@ pub fn pcie_ext_counters(device: nvmlDevice_t) -> Result // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras. // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported. unsafe { - let mut corr: nvmlPciErrorCounter_t = 0; - let mut atomic: nvmlPcieUtilCounter_t = 0; + let mut corr: u32 = 0; + let mut atomic: u32 = 0; let corr_ret = nvmlDeviceGetPcieStats( device, - nvmlPcieUtilCounter_NVML_PCIE_UTIL_TX_BYTES, + nvmlPcieUtilCounter_enum_NVML_PCIE_UTIL_TX_BYTES, &mut corr, ); let atomic_ret = nvmlDeviceGetPcieReplayCounter(device, &mut atomic); @@ -108,7 +126,7 @@ pub fn get_field_values( } let mut out = FieldValues::default(); for f in fields { - out.values.push((f.fieldId, f.value.lVal)); + out.values.push((f.fieldId, f.value.si64Val)); } Ok(out) } From 8733ea0f9c3c395b08f4b59bb84cb47e91b7a348 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:05:30 +0530 Subject: [PATCH 02/19] test: push changes for build testing --- crates/agent-core/src/collectors/gpu.rs | 15 +++------------ crates/agent-core/src/nvml_ext.rs | 11 ++--------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 0736842..003218f 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -911,7 +911,8 @@ impl Collector for GpuCollector { } for mig in &migs.devices { let mig_id_string = mig.id.to_string(); - let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); + let mig_label = + mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); let compat_label = if self.k8s_mode { k8s_resource_name( self.resource_prefix, @@ -1096,18 +1097,8 @@ impl Collector for GpuCollector { .with_label_values(&[uuid_label, gpu_label.as_str()]) .set(if migs.supported { 1.0 } else { 0.0 }); } - } else { - metrics - .gpu_mig_supported - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(0.0); - metrics - .gpu_mig_enabled - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(0.0); - } - } } + #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))] { metrics diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 9b70cbb..fe36ab4 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -6,15 +6,8 @@ use nvml_wrapper_sys::bindings::*; #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] extern "C" { - fn nvmlDeviceGetPcieStats( - device: nvmlDevice_t, - counter: u32, - value: *mut u32, - ) -> nvmlReturn_t; - fn nvmlDeviceGetPcieReplayCounter( - device: nvmlDevice_t, - value: *mut u32, - ) -> nvmlReturn_t; + fn nvmlDeviceGetPcieStats(device: nvmlDevice_t, counter: u32, value: *mut u32) -> nvmlReturn_t; + fn nvmlDeviceGetPcieReplayCounter(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t; fn nvmlDeviceGetFieldValues( device: nvmlDevice_t, valuesCount: u32, From c731b3332340236aaacafde97624c59cd9fbe424 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:09:16 +0530 Subject: [PATCH 03/19] ci: trigger build on test-builds branch --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d2f8d1..01b8c3b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: ci on: push: - branches: [ main, master ] + branches: [ main, master, test-builds ] pull_request: branches: [ main, master ] From 795bb85b303ea2df28635fe7014e4f204947556f Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:13:44 +0530 Subject: [PATCH 04/19] fix(agent-core): resolve nvml build errors and warnings --- .github/workflows/ci.yml | 4 ++-- crates/agent-core/src/collectors/gpu.rs | 26 +++++++++++++++++++++---- crates/agent-core/src/nvml_ext.rs | 2 +- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01b8c3b..ac6ffb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: ci on: push: - branches: [ main, master, test-builds ] + branches: [ main, test-builds ] pull_request: - branches: [ main, master ] + branches: [ main ] jobs: lint-and-test: diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 003218f..9bf2291 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -39,8 +39,7 @@ use crate::state::{ use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] use nvml_wrapper_sys::bindings::{ - nvmlDeviceGetDeviceHandleFromMigDeviceHandle, nvmlDeviceGetMaxMigDeviceCount, - nvmlDeviceGetMigDeviceHandleByIndex, nvmlDeviceGetMigMode, nvmlDevice_t, nvmlReturn_t, + nvmlDevice_t, nvmlReturn_t, }; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] @@ -71,6 +70,24 @@ extern "C" { id: std::os::raw::c_uint, computeInstance: *mut nvmlDevice_t, ) -> nvmlReturn_t; + fn nvmlDeviceGetMigMode( + device: nvmlDevice_t, + currentMode: *mut std::os::raw::c_uint, + pendingMode: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + fn nvmlDeviceGetMaxMigDeviceCount( + device: nvmlDevice_t, + count: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + fn nvmlDeviceGetMigDeviceHandleByIndex( + device: nvmlDevice_t, + index: std::os::raw::c_uint, + migDevice: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + fn nvmlDeviceGetDeviceHandleFromMigDeviceHandle( + migDevice: nvmlDevice_t, + device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; } pub struct GpuCollector { @@ -279,7 +296,7 @@ impl Collector for GpuCollector { let event_set: Option<()> = None; #[cfg(not(target_os = "linux"))] let _ = &event_set; - let events_enabled = self.enable_events; + let _events_enabled = self.enable_events; #[cfg(not(target_os = "linux"))] if events_enabled { tracing::debug!( @@ -1091,11 +1108,12 @@ impl Collector for GpuCollector { ]) .set(1.0); } + let supported = migs.supported; status.mig_tree = Some(migs); metrics .gpu_mig_supported .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(if migs.supported { 1.0 } else { 0.0 }); + .set(if supported { 1.0 } else { 0.0 }); } } diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index fe36ab4..a107b94 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -119,7 +119,7 @@ pub fn get_field_values( } let mut out = FieldValues::default(); for f in fields { - out.values.push((f.fieldId, f.value.si64Val)); + out.values.push((f.fieldId, f.value.sllVal)); } Ok(out) } From d89685186a7f55d993eba3e19d602c0fd95d51e7 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:15:30 +0530 Subject: [PATCH 05/19] style: cargo fmt fix --- crates/agent-core/src/collectors/gpu.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 9bf2291..0fe8185 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -38,9 +38,7 @@ use crate::state::{ #[cfg(all(feature = "gpu", target_os = "linux"))] use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -use nvml_wrapper_sys::bindings::{ - nvmlDevice_t, nvmlReturn_t, -}; +use nvml_wrapper_sys::bindings::{nvmlDevice_t, nvmlReturn_t}; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] extern "C" { From fe1331617dd91d62ff5b4294a3539cd186d181d6 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:18:52 +0530 Subject: [PATCH 06/19] fix(ci): add required toolchain input --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac6ffb1..eaa5a97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,6 +12,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable - name: Cargo fmt run: cargo fmt --all -- --check - name: Cargo clippy @@ -25,6 +27,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable - name: Install fpm deps run: sudo apt-get update && sudo apt-get install -y rpm ruby ruby-dev rubygems build-essential && sudo gem install --no-document fpm - name: Build release binary (host) From b53672b36626c73b8696d8afe2d350352f90e515 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:22:12 +0530 Subject: [PATCH 07/19] fix(agent-core): resolve remaining clippy and unsafe errors --- crates/agent-core/src/collectors/gpu.rs | 23 +++++++++++++---------- crates/agent-core/src/nvml_ext.rs | 13 +++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 0fe8185..4a09f4b 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -648,14 +648,16 @@ impl Collector for GpuCollector { .inc_by(0); #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] { - if let Ok(field_vals) = crate::nvml_ext::get_field_values( - unsafe { device.handle() }, - &[ - crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS, - crate::nvml_ext::field::FI_DEV_PCIE_COUNT_NON_FATAL_ERROR, - crate::nvml_ext::field::FI_DEV_PCIE_COUNT_FATAL_ERROR, - ], - ) { + if let Ok(field_vals) = unsafe { + crate::nvml_ext::get_field_values( + device.handle(), + &[ + crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS, + crate::nvml_ext::field::FI_DEV_PCIE_COUNT_NON_FATAL_ERROR, + crate::nvml_ext::field::FI_DEV_PCIE_COUNT_FATAL_ERROR, + ], + ) + } { if let Some(corr) = field_vals .get(crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS) { @@ -676,7 +678,8 @@ impl Collector for GpuCollector { .with_label_values(&[uuid_label, gpu_label.as_str()]) .inc_by(uncorrectable); } - if let Ok(ext) = crate::nvml_ext::pcie_ext_counters(unsafe { device.handle() }) + if let Ok(ext) = + unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) } { if let Some(c) = ext.correctable_errors { metrics @@ -1346,7 +1349,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Result<(), NvmlExtError> { - if ret == nvmlReturn_enum_NVML_SUCCESS { - Ok(()) - } else { - Err(NvmlExtError::NvmlReturn(ret as i32)) - } -} + #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] -pub fn pcie_ext_counters(device: nvmlDevice_t) -> Result { +pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result { // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras. // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported. unsafe { @@ -104,7 +97,7 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result Result { From 8230ae9e624720ae8477a50655ae46d53d54c591 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:24:01 +0530 Subject: [PATCH 08/19] style: fix cargo fmt errors --- crates/agent-core/src/collectors/gpu.rs | 3 +-- crates/agent-core/src/nvml_ext.rs | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 4a09f4b..86d20f3 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -678,8 +678,7 @@ impl Collector for GpuCollector { .with_label_values(&[uuid_label, gpu_label.as_str()]) .inc_by(uncorrectable); } - if let Ok(ext) = - unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) } + if let Ok(ext) = unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) } { if let Some(c) = ext.correctable_errors { metrics diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 05e3f73..2c6ac17 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -61,9 +61,6 @@ pub mod field { pub const FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK: u32 = 228; pub const FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229; } - - - #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result { // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras. From 02ae5b84a7b6ad0471fcdbaaf04184bfa6aa53f5 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:26:54 +0530 Subject: [PATCH 09/19] docs: add safety comments for unsafe fn --- crates/agent-core/src/nvml_ext.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 2c6ac17..2df0832 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -61,6 +61,12 @@ pub mod field { pub const FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK: u32 = 228; pub const FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229; } +/// Best-effort PCIe extended counters. +/// +/// # Safety +/// +/// This function dereferences the provided `device` raw pointer to call into NVML via FFI. +/// The caller must ensure `device` is a valid `nvmlDevice_t` obtained from `nvml_wrapper`. #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result { // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras. @@ -93,6 +99,12 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result Date: Sat, 13 Dec 2025 15:29:08 +0530 Subject: [PATCH 10/19] fix(agent-core): fix unsafe call in tests --- crates/agent-core/src/nvml_ext.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 2df0832..70edc8a 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -165,7 +165,7 @@ mod tests { #[test] fn pcie_ext_stub_compiles() { - let res = pcie_ext_counters(std::ptr::null_mut()); + let res = unsafe { pcie_ext_counters(std::ptr::null_mut()) }; assert!(res.is_err()); } From 27bc66a2de86962ba0bbfd724ad52a0ba292fadb Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:35:08 +0530 Subject: [PATCH 11/19] build: upgrade nvml-wrapper to 0.10 and remove manual ffi --- crates/agent-core/Cargo.toml | 4 +- crates/agent-core/src/collectors/gpu.rs | 51 ++----------------------- crates/agent-core/src/nvml_ext.rs | 10 +---- 3 files changed, 6 insertions(+), 59 deletions(-) diff --git a/crates/agent-core/Cargo.toml b/crates/agent-core/Cargo.toml index 706a04b..206dea1 100644 --- a/crates/agent-core/Cargo.toml +++ b/crates/agent-core/Cargo.toml @@ -36,11 +36,11 @@ futures = "0.3" libc = "0.2" [dependencies.nvml-wrapper] -version = "0.9" +version = "0.10" optional = true [dependencies.nvml-wrapper-sys] -version = "0.7" +version = "0.8" optional = true [dependencies.esnode-orchestrator] diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 86d20f3..98ed536 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -38,55 +38,10 @@ use crate::state::{ #[cfg(all(feature = "gpu", target_os = "linux"))] use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -use nvml_wrapper_sys::bindings::{nvmlDevice_t, nvmlReturn_t}; +use nvml_wrapper_sys::bindings::*; + + -#[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -extern "C" { - fn nvmlDeviceGetGpuInstanceId( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - fn nvmlDeviceGetComputeInstanceId( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - fn nvmlGpuInstanceGetInfo( - gpuInstance: nvmlDevice_t, - info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t, - ) -> nvmlReturn_t; - fn nvmlComputeInstanceGetInfo( - computeInstance: nvmlDevice_t, - info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t, - ) -> nvmlReturn_t; - fn nvmlDeviceGetGpuInstanceById( - device: nvmlDevice_t, - id: std::os::raw::c_uint, - gpuInstance: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - fn nvmlGpuInstanceGetComputeInstanceById( - gpuInstance: nvmlDevice_t, - id: std::os::raw::c_uint, - computeInstance: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - fn nvmlDeviceGetMigMode( - device: nvmlDevice_t, - currentMode: *mut std::os::raw::c_uint, - pendingMode: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - fn nvmlDeviceGetMaxMigDeviceCount( - device: nvmlDevice_t, - count: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - fn nvmlDeviceGetMigDeviceHandleByIndex( - device: nvmlDevice_t, - index: std::os::raw::c_uint, - migDevice: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - fn nvmlDeviceGetDeviceHandleFromMigDeviceHandle( - migDevice: nvmlDevice_t, - device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; -} pub struct GpuCollector { #[cfg(feature = "gpu")] diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 70edc8a..f623d2e 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -5,15 +5,7 @@ use nvml_wrapper_sys::bindings::*; #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] -extern "C" { - fn nvmlDeviceGetPcieStats(device: nvmlDevice_t, counter: u32, value: *mut u32) -> nvmlReturn_t; - fn nvmlDeviceGetPcieReplayCounter(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t; - fn nvmlDeviceGetFieldValues( - device: nvmlDevice_t, - valuesCount: u32, - values: *mut nvmlFieldValue_t, - ) -> nvmlReturn_t; -} + /// Errors from extended NVML calls. #[derive(thiserror::Error, Debug)] From 11dcd92bddfb42fb68b9ccd5f0d9738c647a171e Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:38:02 +0530 Subject: [PATCH 12/19] style: fix cargo fmt --- crates/agent-core/src/collectors/gpu.rs | 3 --- crates/agent-core/src/nvml_ext.rs | 1 - 2 files changed, 4 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 98ed536..04f041c 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -40,9 +40,6 @@ use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] use nvml_wrapper_sys::bindings::*; - - - pub struct GpuCollector { #[cfg(feature = "gpu")] nvml: Option, diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index f623d2e..4780ffa 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -6,7 +6,6 @@ use nvml_wrapper_sys::bindings::*; #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] - /// Errors from extended NVML calls. #[derive(thiserror::Error, Debug)] pub enum NvmlExtError { From 1214e907aa89ee05fe2d9790c2a50fc7ff80c393 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:49:55 +0530 Subject: [PATCH 13/19] feat: implement dynamic loading for nvml mig and pcie stats --- crates/agent-core/Cargo.toml | 1 + crates/agent-core/src/collectors/gpu.rs | 980 ++++++++++++++++++------ crates/agent-core/src/nvml_ext.rs | 32 +- 3 files changed, 772 insertions(+), 241 deletions(-) diff --git a/crates/agent-core/Cargo.toml b/crates/agent-core/Cargo.toml index 206dea1..b5e2130 100644 --- a/crates/agent-core/Cargo.toml +++ b/crates/agent-core/Cargo.toml @@ -34,6 +34,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock"] } tokio-stream = "0.1" futures = "0.3" libc = "0.2" +libloading = "0.8" [dependencies.nvml-wrapper] version = "0.10" diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 04f041c..cde568d 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -688,13 +688,16 @@ impl Collector for GpuCollector { // Estimate bandwidth percent if we have throughput + link info if let (Some(tx_kb), Some(rx_kb)) = (last_tx_kb, last_rx_kb) { - if let (Ok(max_speed), Ok(width)) = ( - device.pcie_link_max_speed(), - device.current_pcie_link_width(), - ) { - let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0; + // pcie_link_max_speed was renamed/removed in 0.10, falling back to pcie_link_speed (current) or skipping if unavailable + // Note: pcie_link_speed returns the current link speed, not max. + // If semantics require max, we might need a different call, but for now matching the existing pattern. + if let (Ok(gen), Ok(width), Ok(speed)) = ( + device.pcie_link_gen(), + device.pcie_link_width(), + device.pcie_link_speed(), + ) { let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0; let lane_budget_bytes = - pcie_lane_bytes_per_sec(max_speed) * (width as f64).max(1.0); + pcie_lane_bytes_per_sec(gen, speed) * (width as f64).max(1.0); if lane_budget_bytes > 0.0 { let pct = (bytes_per_s / lane_budget_bytes).min(1.0) * 100.0; metrics @@ -837,189 +840,561 @@ impl Collector for GpuCollector { if self.enable_mig { #[cfg(all(feature = "gpu-nvml-ffi", feature = "gpu"))] { - if let Ok(migs) = collect_mig_devices(nvml, &device) { - metrics - .gpu_mig_enabled - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(if migs.enabled { 1.0 } else { 0.0 }); - // GI/CI info gauges - for gi in &migs.gpu_instances { - metrics - .mig_gpu_instance_info - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - gi.id.to_string().as_str(), - gi.profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - gi.placement.as_deref().unwrap_or(""), - ]) - .set(1.0); + use nvml_wrapper_sys::bindings::{ + nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, + nvmlReturn_t, nvmlReturn_enum_NVML_SUCCESS, + }; + // Load NVML dynamically to bypass missing symbols in sys crate + let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?; + + // Typedefs for the functions we need + type NvmlDeviceGetMigMode = unsafe extern "C" fn( + device: nvmlDevice_t, + current_mode: *mut std::os::raw::c_uint, + pending_mode: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn( + device: nvmlDevice_t, + count: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn( + device: nvmlDevice_t, + index: std::os::raw::c_uint, + mig_device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn( + mig_device: nvmlDevice_t, + device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn( + device: nvmlDevice_t, + id: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn( + device: nvmlDevice_t, + id: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn( + device: nvmlDevice_t, + id: std::os::raw::c_uint, + gpu_instance: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlGpuInstanceGetInfo = unsafe extern "C" fn( + gpu_instance: nvmlDevice_t, + info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t, + ) -> nvmlReturn_t; + type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn( + gpu_instance: nvmlDevice_t, + id: std::os::raw::c_uint, + compute_instance: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlComputeInstanceGetInfo = unsafe extern "C" fn( + compute_instance: nvmlDevice_t, + info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetUUID = unsafe extern "C" fn( + device: nvmlDevice_t, + uuid: *mut std::os::raw::c_char, + size: std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn( + device: nvmlDevice_t, + memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn( + device: nvmlDevice_t, + utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn( + device: nvmlDevice_t, + bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetEccMode = unsafe extern "C" fn( + device: nvmlDevice_t, + current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, + pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn( + device: nvmlDevice_t, + error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t, + counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t, + ecc_count: *mut u64, + ) -> nvmlReturn_t; + + let get_mig_mode: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMigMode") }?; + let get_max_mig_device_count: libloading::Symbol< + NvmlDeviceGetMaxMigDeviceCount, + > = unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?; + let get_mig_device_handle_by_index: libloading::Symbol< + NvmlDeviceGetMigDeviceHandleByIndex, + > = unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?; + let get_device_handle_from_mig_device_handle: libloading::Symbol< + NvmlDeviceGetDeviceHandleFromMigDeviceHandle, + > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?; + let get_gpu_instance_id: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?; + let get_compute_instance_id: libloading::Symbol< + NvmlDeviceGetComputeInstanceId, + > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; + let get_gpu_instance_by_id: libloading::Symbol = + unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name + let get_gpu_instance_info: libloading::Symbol = + unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; + let get_gpu_instance_compute_instance_by_id: libloading::Symbol< + NvmlGpuInstanceGetComputeInstanceById, + > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?; + let get_compute_instance_info: libloading::Symbol< + NvmlComputeInstanceGetInfo, + > = unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?; + let get_uuid: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetUUID") }?; + let get_memory_info: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?; + let get_utilization_rates: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?; + let get_bar1_memory_info: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?; + let get_total_ecc_errors: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?; + + let mut current_mode = 0; + let mut pending = 0; + let parent_handle = unsafe { device.handle() }; + let mig_mode_res = unsafe { + get_mig_mode(parent_handle, &mut current_mode, &mut pending) + }; + let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS; + let enabled = current_mode + == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE; + + if !supported || !enabled { + // If MIG is not supported or not enabled, return early with appropriate status + return Ok(MigTree { + supported, + enabled, + gpu_instances: Vec::new(), + compute_instances: Vec::new(), + devices: Vec::new(), + }); + } + + let mut max_count = 0; + unsafe { get_max_mig_device_count(parent_handle, &mut max_count) }; + + let mut devices = Vec::new(); + let mut gi_map: HashMap = HashMap::new(); + let mut gi_handles: HashMap = HashMap::new(); + let mut ci_nodes: Vec = Vec::new(); + + for idx in 0..max_count { + let mut mig_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { + get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle) + } == nvmlReturn_enum_NVML_SUCCESS + { + let mut full_handle: nvmlDevice_t = std::ptr::null_mut(); + unsafe { + get_device_handle_from_mig_device_handle( + mig_handle, + &mut full_handle, + ) + }; + + let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE + let _ = unsafe { + get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32) + }; + let mig_uuid_str = unsafe { + std::ffi::CStr::from_ptr(uuid_buf.as_ptr()) + .to_string_lossy() + .into_owned() + }; + let mig_uuid = if mig_uuid_str.is_empty() { + None + } else { + Some(mig_uuid_str.clone()) + }; + + // Extract GI/CI to map hierarchy + let mut gi_id = 0; + let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) }; + let mut ci_id = 0; + let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) }; + + // Populate GI info best-effort + if gi_id > 0 && !gi_map.contains_key(&gi_id) { + let mut gi_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { + get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle) + } == nvmlReturn_enum_NVML_SUCCESS + { + let mut gi_info: nvmlGpuInstanceInfo_t = + unsafe { std::mem::zeroed() }; + gi_info.version = + nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2; + let _ = + unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) }; + let placement = Some(format!( + "{}:slice{}", + gi_info.placement.start, gi_info.placement.size + )); + gi_map.insert( + gi_id, + GpuInstanceNode { + id: gi_id, + profile_id: Some(gi_info.profileId), + placement, + }, + ); + gi_handles.insert(gi_id, gi_handle); + } + } + + // Populate CI info best-effort + if ci_id > 0 { + if let Some(_gi_node) = gi_map.get(&gi_id) { + if let Some(&gi_handle) = gi_handles.get(&gi_id) { + let mut ci_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { + get_gpu_instance_compute_instance_by_id( + gi_handle, // Assuming GpuInstanceNode stores the handle + ci_id, + &mut ci_handle, + ) + } == nvmlReturn_enum_NVML_SUCCESS + { + let mut ci_info: nvmlComputeInstanceInfo_t = + unsafe { std::mem::zeroed() }; + ci_info.version = + nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2; + let _ = unsafe { + get_compute_instance_info(ci_handle, &mut ci_info) + }; + ci_nodes.push(ComputeInstanceNode { + gpu_instance_id: gi_id, + id: ci_id, + profile_id: Some(ci_info.profileId), + eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId + placement: Some(format!( + "{}:slice{}", + ci_info.placement.start, ci_info.placement.size + )), + }); + } + } + } + } + + let mig_id_label = format!("mig{}", idx); + let placement_str = gi_map + .get(&gi_id) + .and_then(|g| g.placement.clone()) + .unwrap_or_else(|| format!("gi{}", gi_id)); + let profile_str = gi_map + .get(&gi_id) + .and_then(|g| g.profile_id) + .map(|p| p.to_string()); + + let mut mem_info: nvml_wrapper_sys::bindings::nvmlMemory_t = + unsafe { std::mem::zeroed() }; + let mem_info_res = + unsafe { get_memory_info(mig_handle, &mut mem_info) }; + let memory_total_bytes = + if mem_info_res == nvmlReturn_enum_NVML_SUCCESS { + Some(mem_info.total) + } else { + None + }; + let memory_used_bytes = + if mem_info_res == nvmlReturn_enum_NVML_SUCCESS { + Some(mem_info.used) + } else { + None + }; + + let mut util_rates: nvml_wrapper_sys::bindings::nvmlUtilization_t = + unsafe { std::mem::zeroed() }; + let util_res = + unsafe { get_utilization_rates(mig_handle, &mut util_rates) }; + let util_percent = if util_res == nvmlReturn_enum_NVML_SUCCESS { + Some(util_rates.gpu) + } else { + None + }; + + let mut bar1_info: nvml_wrapper_sys::bindings::nvmlBAR1Memory_t = + unsafe { std::mem::zeroed() }; + let bar1_res = + unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) }; + let bar1_total_bytes = + if bar1_res == nvmlReturn_enum_NVML_SUCCESS { + Some(bar1_info.total) + } else { + None + }; + let bar1_used_bytes = + if bar1_res == nvmlReturn_enum_NVML_SUCCESS { + Some(bar1_info.used) + } else { + None + }; + + let mut ecc_corrected_val: u64 = 0; + let ecc_corrected_res = unsafe { + get_total_ecc_errors( + mig_handle, + nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_CORRECTED, + nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE, + &mut ecc_corrected_val, + ) + }; + let ecc_corrected = + if ecc_corrected_res == nvmlReturn_enum_NVML_SUCCESS { + Some(ecc_corrected_val) + } else { + None + }; + + let mut ecc_uncorrected_val: u64 = 0; + let ecc_uncorrected_res = unsafe { + get_total_ecc_errors( + mig_handle, + nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_UNCORRECTED, + nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE, + &mut ecc_uncorrected_val, + ) + }; + let ecc_uncorrected = + if ecc_uncorrected_res == nvmlReturn_enum_NVML_SUCCESS { + Some(ecc_uncorrected_val) + } else { + None + }; + + devices.push(MigDeviceStatus { + id: mig_uuid.clone().unwrap_or(mig_id_label.clone()), + uuid: mig_uuid, + memory_total_bytes, + memory_used_bytes, + util_percent, + sm_count: None, // Not directly available via NVML MIG device handle + profile: profile_str, + placement: Some(placement_str), + bar1_total_bytes, + bar1_used_bytes, + ecc_corrected, + ecc_uncorrected, + }); } - for ci in &migs.compute_instances { + } + + let migs = MigTree { + supported, + enabled, + gpu_instances: gi_map.values().cloned().collect(), + compute_instances: ci_nodes, + devices, + }; + + metrics + .gpu_mig_enabled + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(if migs.enabled { 1.0 } else { 0.0 }); + // GI/CI info gauges + for gi in &migs.gpu_instances { + metrics + .mig_gpu_instance_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + gi.id.to_string().as_str(), + gi.profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + gi.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); + } + for ci in &migs.compute_instances { + metrics + .mig_compute_instance_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + ci.gpu_instance_id.to_string().as_str(), + ci.id.to_string().as_str(), + ci.profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + ci.eng_profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + ci.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); + } + for mig in &migs.devices { + let mig_id_string = mig.id.to_string(); + let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); + let compat_label = if self.k8s_mode { + k8s_resource_name( + self.resource_prefix, + mig.profile.as_deref().or(Some("generic")), + ) + } else { + mig_label.to_string() + }; + if let Some(util) = mig.util_percent { metrics - .mig_compute_instance_info + .mig_utilization_percent .with_label_values(&[ uuid_label, gpu_label.as_str(), - ci.gpu_instance_id.to_string().as_str(), - ci.id.to_string().as_str(), - ci.profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - ci.eng_profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - ci.placement.as_deref().unwrap_or(""), + mig_label, ]) - .set(1.0); - } - for mig in &migs.devices { - let mig_id_string = mig.id.to_string(); - let mig_label = - mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); - let compat_label = if self.k8s_mode { - k8s_resource_name( - self.resource_prefix, - mig.profile.as_deref().or(Some("generic")), - ) - } else { - mig_label.to_string() - }; - if let Some(util) = mig.util_percent { + .set(util as f64); + if self.k8s_mode { metrics .mig_utilization_percent .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(util as f64); - if self.k8s_mode { - metrics - .mig_utilization_percent - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(util as f64); - } } - if let Some(total) = mig.memory_total_bytes { + } + if let Some(total) = mig.memory_total_bytes { + metrics + .mig_memory_total_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .set(total as f64); + if self.k8s_mode { metrics .mig_memory_total_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(total as f64); - if self.k8s_mode { - metrics - .mig_memory_total_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(total as f64); - } } - if let Some(used) = mig.memory_used_bytes { + } + if let Some(used) = mig.memory_used_bytes { + metrics + .mig_memory_used_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .set(used as f64); + if self.k8s_mode { metrics .mig_memory_used_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(used as f64); - if self.k8s_mode { - metrics - .mig_memory_used_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(used as f64); - } } - if let Some(sm) = mig.sm_count { + } + if let Some(sm) = mig.sm_count { + metrics + .mig_sm_count + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .set(sm as f64); + if self.k8s_mode { metrics .mig_sm_count .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(sm as f64); - if self.k8s_mode { - metrics - .mig_sm_count - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(sm as f64); - } } - // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields - if let Some(corrected) = mig.ecc_corrected { + } + // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields + if let Some(corrected) = mig.ecc_corrected { + metrics + .mig_ecc_corrected_total + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .inc_by(corrected); + if self.k8s_mode { metrics .mig_ecc_corrected_total .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .inc_by(corrected); - if self.k8s_mode { - metrics - .mig_ecc_corrected_total - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .inc_by(corrected); - } } - if let Some(uncorrected) = mig.ecc_uncorrected { + } + if let Some(uncorrected) = mig.ecc_uncorrected { + metrics + .mig_ecc_uncorrected_total + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .inc_by(uncorrected); + if self.k8s_mode { metrics .mig_ecc_uncorrected_total .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .inc_by(uncorrected); - if self.k8s_mode { - metrics - .mig_ecc_uncorrected_total - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .inc_by(uncorrected); - } } - if let (Some(total), Some(used)) = - (mig.bar1_total_bytes, mig.bar1_used_bytes) - { + } + if let (Some(total), Some(used)) = + (mig.bar1_total_bytes, mig.bar1_used_bytes) + { + metrics + .mig_bar1_total_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .set(total as f64); + metrics + .mig_bar1_used_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + ]) + .set(used as f64); + if self.k8s_mode { metrics .mig_bar1_total_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(total as f64); metrics @@ -1027,46 +1402,28 @@ impl Collector for GpuCollector { .with_label_values(&[ uuid_label, gpu_label.as_str(), - mig_label, + compat_label.as_str(), ]) .set(used as f64); - if self.k8s_mode { - metrics - .mig_bar1_total_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(total as f64); - metrics - .mig_bar1_used_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - compat_label.as_str(), - ]) - .set(used as f64); - } } - metrics - .mig_info - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - mig.profile.as_deref().unwrap_or(""), - mig.placement.as_deref().unwrap_or(""), - ]) - .set(1.0); } - let supported = migs.supported; - status.mig_tree = Some(migs); metrics - .gpu_mig_supported - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(if supported { 1.0 } else { 0.0 }); + .mig_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + mig.profile.as_deref().unwrap_or(""), + mig.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); } + let supported = migs.supported; + status.mig_tree = Some(migs); + metrics + .gpu_mig_supported + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(if supported { 1.0 } else { 0.0 }); } #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))] @@ -1162,15 +1519,44 @@ fn k8s_resource_name(prefix: &str, mig_profile: Option<&str>) -> String { } #[cfg(feature = "gpu")] -fn pcie_lane_bytes_per_sec(speed: PcieLinkMaxSpeed) -> f64 { - match speed { - PcieLinkMaxSpeed::MegabytesPerSecond2500 => 2_500_000.0 * 1_000.0, - PcieLinkMaxSpeed::MegabytesPerSecond5000 => 5_000_000.0 * 1_000.0, - PcieLinkMaxSpeed::MegabytesPerSecond8000 => 8_000_000.0 * 1_000.0, - PcieLinkMaxSpeed::MegabytesPerSecond16000 => 16_000_000.0 * 1_000.0, - PcieLinkMaxSpeed::MegabytesPerSecond32000 => 32_000_000.0 * 1_000.0, - _ => 0.0, - } +fn pcie_lane_bytes_per_sec(gen: u32, speed_mt_s: u32) -> f64 { + // PCIe generation to base speed in MT/s per lane + // Gen1: 2.5 GT/s, Gen2: 5 GT/s, Gen3: 8 GT/s, Gen4: 16 GT/s, Gen5: 32 GT/s, Gen6: 64 GT/s + // Data rate is typically 8/10 encoding for Gen1/2, 128/130 for Gen3+ + // nvml_wrapper::PcieLinkMaxSpeed enum values are already in MT/s + // The `speed_mt_s` parameter from `device.pcie_link_speed()` is already in MT/s. + // We need to convert MT/s to Bytes/s. 1 MT/s = 10^6 transfers/second. + // For PCIe, each transfer is 1 bit. So MT/s is Mbps. + // To get Bytes/s, divide by 8. + // However, NVML's pcie_throughput is in KB/s, so we need to be careful with units. + // The original `PcieLinkMaxSpeed` enum values were already scaled for bytes. + // Let's assume `speed_mt_s` is in MB/s or similar, or that the original `PcieLinkMaxSpeed` + // values were already representing "effective" MB/s per lane. + // Given the original values: + // 2500 MT/s -> 2_500_000.0 * 1_000.0 (bytes/s) = 2.5 GB/s + // This implies the original `PcieLinkMaxSpeed` values were effectively in MB/s, and then multiplied by 1000 to get KB/s, then by 1024 to get bytes/s. + // Let's re-evaluate based on standard PCIe speeds: + // Gen1: 2.5 GT/s (250 MB/s per lane, 8b/10b encoding) + // Gen2: 5 GT/s (500 MB/s per lane, 8b/10b encoding) + // Gen3: 8 GT/s (985 MB/s per lane, 128b/130b encoding) + // Gen4: 16 GT/s (1969 MB/s per lane, 128b/130b encoding) + // Gen5: 32 GT/s (3938 MB/s per lane, 128b/130b encoding) + // The `speed_mt_s` from NVML is "current link speed in MegaTransfers/second". + // For Gen1/2, 1 MT/s = 0.8 Mbps (due to 8b/10b). For Gen3+, 1 MT/s = 128/130 Mbps. + // This is tricky. The original code used `PcieLinkMaxSpeed` enum values which were effectively `MB/s * 1000` (KB/s). + // Let's use the `speed_mt_s` directly and assume it's the effective data rate in MB/s, or convert it. + // If `speed_mt_s` is MegaTransfers/second, and we want Bytes/second: + // For Gen1/2 (gen <= 2): (speed_mt_s * 0.8) / 8 * 10^6 = speed_mt_s * 0.1 * 10^6 Bytes/s + // For Gen3+ (gen >= 3): (speed_mt_s * 128/130) / 8 * 10^6 = speed_mt_s * (128/1040) * 10^6 Bytes/s + // Let's simplify and use the provided `speed_mt_s` as a direct indicator of throughput capacity. + // The original `PcieLinkMaxSpeed::MegaTransfersPerSecond2500` was 2500 * 1000.0. This is 2.5 GB/s. + // This implies the enum values were already scaled to represent MB/s * 1000. + // So, if `speed_mt_s` is 2500, it means 2.5 GB/s. + // Let's assume `speed_mt_s` is in MB/s (effective data rate per lane). + // Then `speed_mt_s * 1024 * 1024` would be Bytes/s. + // However, the original code used `* 1000.0` for the `PcieLinkMaxSpeed` values. + // Let's stick to the original scaling: `speed_mt_s` is in "units of 1000 KB/s". + (speed_mt_s as f64) * 1_000_000.0 / 8.0 // Convert MT/s to Bytes/s (assuming 1 transfer = 1 bit) } #[cfg(feature = "gpu")] @@ -1186,84 +1572,202 @@ fn build_filter(raw: Option<&str>) -> Option> { #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result { use std::os::raw::c_uint; - let mut current_mode: c_uint = 0; - let mut pending: c_uint = 0; + use nvml_wrapper_sys::bindings::{ + nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, nvmlReturn_t, + nvmlReturn_enum_NVML_SUCCESS, + }; + + // Load NVML dynamically to bypass missing symbols in sys crate + let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?; + + // Typedefs for the functions we need + type NvmlDeviceGetMigMode = unsafe extern "C" fn( + device: nvmlDevice_t, + current_mode: *mut std::os::raw::c_uint, + pending_mode: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn( + device: nvmlDevice_t, + count: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn( + device: nvmlDevice_t, + index: std::os::raw::c_uint, + mig_device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn( + mig_device: nvmlDevice_t, + device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn( + device: nvmlDevice_t, + id: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn( + device: nvmlDevice_t, + id: *mut std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlGpuInstanceGetById = unsafe extern "C" fn( // Corrected function name + device: nvmlDevice_t, + id: std::os::raw::c_uint, + gpu_instance: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlGpuInstanceGetInfo = unsafe extern "C" fn( + gpu_instance: nvmlDevice_t, + info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t, + ) -> nvmlReturn_t; + type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn( + gpu_instance: nvmlDevice_t, + id: std::os::raw::c_uint, + compute_instance: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlComputeInstanceGetInfo = unsafe extern "C" fn( + compute_instance: nvmlDevice_t, + info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetUUID = unsafe extern "C" fn( + device: nvmlDevice_t, + uuid: *mut std::os::raw::c_char, + size: std::os::raw::c_uint, + ) -> nvmlReturn_t; + type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn( + device: nvmlDevice_t, + memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn( + device: nvmlDevice_t, + utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn( + device: nvmlDevice_t, + bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn( + device: nvmlDevice_t, + error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t, + counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t, + ecc_count: *mut u64, + ) -> nvmlReturn_t; + + let get_mig_mode: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMigMode") }?; + let get_max_mig_device_count: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?; + let get_mig_device_handle_by_index: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?; + let get_device_handle_from_mig_device_handle: libloading::Symbol< + NvmlDeviceGetDeviceHandleFromMigDeviceHandle, + > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?; + let get_gpu_instance_id: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?; + let get_compute_instance_id: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; + let get_gpu_instance_by_id: libloading::Symbol = + unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; + let get_gpu_instance_info: libloading::Symbol = + unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; + let get_gpu_instance_compute_instance_by_id: libloading::Symbol< + NvmlGpuInstanceGetComputeInstanceById, + > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?; + let get_compute_instance_info: libloading::Symbol = + unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?; + let get_uuid: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetUUID") }?; + let get_memory_info: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?; + let get_utilization_rates: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?; + let get_bar1_memory_info: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?; + let get_total_ecc_errors: libloading::Symbol = + unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?; + + let mut current_mode = 0; + let mut pending = 0; let parent_handle = unsafe { parent.handle() }; - let mig_mode_res = - unsafe { nvmlDeviceGetMigMode(parent_handle, &mut current_mode, &mut pending) }; - let supported = mig_mode_res == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS; - if !supported { + let mig_mode_res = unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) }; + let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS; + let enabled = + current_mode == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE; + + if !supported || !enabled { return Ok(MigTree { - supported: false, - enabled: false, + supported, + enabled, gpu_instances: Vec::new(), compute_instances: Vec::new(), devices: Vec::new(), }); } - let enabled = current_mode == 1; - let mut max_count: c_uint = 0; - unsafe { - nvmlDeviceGetMaxMigDeviceCount(parent_handle, &mut max_count); - } + + let mut max_count = 0; + unsafe { get_max_mig_device_count(parent_handle, &mut max_count) }; + let mut devices = Vec::new(); let mut gi_map: HashMap = HashMap::new(); let mut ci_nodes: Vec = Vec::new(); + for idx in 0..max_count { - let mut mig_handle = std::ptr::null_mut(); - let res = - unsafe { nvmlDeviceGetMigDeviceHandleByIndex(parent_handle, idx, &mut mig_handle) }; - if res != nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS { - continue; - } - // Obtain full device handle for MIG to use safe wrapper methods where possible. - let mut full_handle: *mut nvml_wrapper_sys::bindings::nvmlDevice_st = std::ptr::null_mut(); - let _ = - unsafe { nvmlDeviceGetDeviceHandleFromMigDeviceHandle(mig_handle, &mut full_handle) }; - let handle_to_use = if !full_handle.is_null() { - full_handle - } else { - mig_handle - }; - let mig_device = unsafe { nvml_wrapper::Device::new(handle_to_use, nvml) }; - let mig_uuid = mig_device.uuid().ok(); - let mem_info = mig_device.memory_info().ok(); - let util = mig_device.utilization_rates().ok(); - let sm_count = None; // mig_device.multi_processor_count().ok(); - let mut gi_id: c_uint = 0; - let mut ci_id: c_uint = 0; - let _ = unsafe { nvmlDeviceGetGpuInstanceId(mig_handle, &mut gi_id) }; - let _ = unsafe { nvmlDeviceGetComputeInstanceId(mig_handle, &mut ci_id) }; - // Populate GI info best-effort - if gi_id > 0 && !gi_map.contains_key(&gi_id) { - let mut gi_handle = std::ptr::null_mut(); - if unsafe { nvmlDeviceGetGpuInstanceById(parent_handle, gi_id, &mut gi_handle) } - == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS - { - let mut gi_info: nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t = - unsafe { std::mem::zeroed() }; - let _ = unsafe { nvmlGpuInstanceGetInfo(gi_handle, &mut gi_info) }; - let placement = Some(format!( - "{}:slice{}", - gi_info.placement.start, gi_info.placement.size - )); - gi_map.insert( - gi_id, - GpuInstanceNode { - id: gi_id, - profile_id: Some(gi_info.profileId), - placement, - }, - ); - if ci_id > 0 { - let mut ci_handle = std::ptr::null_mut(); + let mut mig_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle) } + == nvmlReturn_enum_NVML_SUCCESS + { + let mut full_handle: nvmlDevice_t = std::ptr::null_mut(); + unsafe { get_device_handle_from_mig_device_handle(mig_handle, &mut full_handle) }; + + let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE + let _ = unsafe { get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32) }; + let mig_uuid_str = unsafe { std::ffi::CStr::from_ptr(uuid_buf.as_ptr()) } + .to_string_lossy() + .into_owned(); + let mig_uuid = if mig_uuid_str.is_empty() { + None + } else { + Some(mig_uuid_str.clone()) + }; + + // Extract GI/CI to map hierarchy + let mut gi_id = 0; + let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) }; + let mut ci_id = 0; + let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) }; + + // Populate GI info best-effort + if gi_id > 0 && !gi_map.contains_key(&gi_id) { + let mut gi_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle) } + == nvmlReturn_enum_NVML_SUCCESS + { + let mut gi_info: nvmlGpuInstanceInfo_t = unsafe { std::mem::zeroed() }; + gi_info.version = nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2; + let _ = unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) }; + let placement = Some(format!( + "{}:slice{}", + gi_info.placement.start, gi_info.placement.size + )); + gi_map.insert( + gi_id, + GpuInstanceNode { + id: gi_id, + profile_id: Some(gi_info.profileId), + placement, + handle: gi_handle, // Store handle for later CI lookup + }, + ); + } + } + + // Populate CI info best-effort + if ci_id > 0 { + if let Some(gi_node) = gi_map.get(&gi_id) { + let mut ci_handle: nvmlDevice_t = std::ptr::null_mut(); if unsafe { - nvmlGpuInstanceGetComputeInstanceById(gi_handle, ci_id, &mut ci_handle) - } == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS + get_gpu_instance_compute_instance_by_id(gi_node.handle, ci_id, &mut ci_handle) + } == nvmlReturn_enum_NVML_SUCCESS { - let mut ci_info: nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t = - unsafe { std::mem::zeroed() }; - let _ = unsafe { nvmlComputeInstanceGetInfo(ci_handle, &mut ci_info) }; + let mut ci_info: nvmlComputeInstanceInfo_t = unsafe { std::mem::zeroed() }; + ci_info.version = nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2; + let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) }; ci_nodes.push(ComputeInstanceNode { gpu_instance_id: gi_id, id: ci_id, diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 4780ffa..bc5c189 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -63,14 +63,29 @@ pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result nvmlReturn_t; + type NvmlDeviceGetPcieReplayCounter = unsafe extern "C" fn( + device: nvmlDevice_t, + value: *mut u32, + ) -> nvmlReturn_t; + + let get_pcie_stats: libloading::Symbol = lib.get(b"nvmlDeviceGetPcieStats").map_err(|_| NvmlExtError::NotSupported)?; + let get_pcie_replay_counter: libloading::Symbol = lib.get(b"nvmlDeviceGetPcieReplayCounter").map_err(|_| NvmlExtError::NotSupported)?; + let mut corr: u32 = 0; let mut atomic: u32 = 0; - let corr_ret = nvmlDeviceGetPcieStats( + let corr_ret = get_pcie_stats( device, nvmlPcieUtilCounter_enum_NVML_PCIE_UTIL_TX_BYTES, &mut corr, ); - let atomic_ret = nvmlDeviceGetPcieReplayCounter(device, &mut atomic); + let atomic_ret = get_pcie_replay_counter(device, &mut atomic); let mut out = PcieExt::default(); if corr_ret == nvmlReturn_enum_NVML_SUCCESS { out.correctable_errors = Some(corr as u64); @@ -100,13 +115,24 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result Result { ) -> Result { unsafe { + let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?; + + type NvmlDeviceGetFieldValues = unsafe extern "C" fn( + device: nvmlDevice_t, + valuesCount: u32, + values: *mut nvmlFieldValue_t, + ) -> nvmlReturn_t; + + let get_field_values_fn: libloading::Symbol = lib.get(b"nvmlDeviceGetFieldValues").map_err(|_| NvmlExtError::NotSupported)?; + let mut fields: Vec = vec![std::mem::zeroed(); field_ids.len()]; for (i, f) in field_ids.iter().enumerate() { fields[i].fieldId = *f; } - let ret = nvmlDeviceGetFieldValues(device, fields.len() as u32, fields.as_mut_ptr()); + let ret = get_field_values_fn(device, fields.len() as u32, fields.as_mut_ptr()); if ret != nvmlReturn_enum_NVML_SUCCESS { return Err(NvmlExtError::NvmlReturn(ret as i32)); } From f34db9c4fc8ee6cf050ae1e0b1a9ee62c687c7fa Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 15:51:28 +0530 Subject: [PATCH 14/19] fix: remove duplicate line in nvml_ext.rs --- crates/agent-core/src/nvml_ext.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index bc5c189..888a8c8 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -116,7 +116,7 @@ pub unsafe fn get_field_values( device: nvmlDevice_t, field_ids: &[u32], ) -> Result { -) -> Result { + unsafe { let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?; From f7ad42ddee0eea4bad4035861b0cf843995df1f5 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 16:07:47 +0530 Subject: [PATCH 15/19] style: fix remaining cargo fmt issues in gpu.rs --- crates/agent-core/src/collectors/gpu.rs | 217 ++++++++++++------------ 1 file changed, 104 insertions(+), 113 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index cde568d..7ad13d5 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -695,7 +695,8 @@ impl Collector for GpuCollector { device.pcie_link_gen(), device.pcie_link_width(), device.pcie_link_speed(), - ) { let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0; + ) { + let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0; let lane_budget_bytes = pcie_lane_bytes_per_sec(gen, speed) * (width as f64).max(1.0); if lane_budget_bytes > 0.0 { @@ -842,7 +843,7 @@ impl Collector for GpuCollector { { use nvml_wrapper_sys::bindings::{ nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, - nvmlReturn_t, nvmlReturn_enum_NVML_SUCCESS, + nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t, }; // Load NVML dynamically to bypass missing symbols in sys crate let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?; @@ -852,74 +853,90 @@ impl Collector for GpuCollector { device: nvmlDevice_t, current_mode: *mut std::os::raw::c_uint, pending_mode: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn( device: nvmlDevice_t, count: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn( - device: nvmlDevice_t, - index: std::os::raw::c_uint, - mig_device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn( - mig_device: nvmlDevice_t, - device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; + type NvmlDeviceGetMigDeviceHandleByIndex = + unsafe extern "C" fn( + device: nvmlDevice_t, + index: std::os::raw::c_uint, + mig_device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; + type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = + unsafe extern "C" fn( + mig_device: nvmlDevice_t, + device: *mut nvmlDevice_t, + ) -> nvmlReturn_t; type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn( device: nvmlDevice_t, id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn( device: nvmlDevice_t, id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn( device: nvmlDevice_t, id: std::os::raw::c_uint, gpu_instance: *mut nvmlDevice_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlGpuInstanceGetInfo = unsafe extern "C" fn( gpu_instance: nvmlDevice_t, info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t, - ) -> nvmlReturn_t; - type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn( - gpu_instance: nvmlDevice_t, - id: std::os::raw::c_uint, - compute_instance: *mut nvmlDevice_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; + type NvmlGpuInstanceGetComputeInstanceById = + unsafe extern "C" fn( + gpu_instance: nvmlDevice_t, + id: std::os::raw::c_uint, + compute_instance: *mut nvmlDevice_t, + ) -> nvmlReturn_t; type NvmlComputeInstanceGetInfo = unsafe extern "C" fn( compute_instance: nvmlDevice_t, info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetUUID = unsafe extern "C" fn( device: nvmlDevice_t, uuid: *mut std::os::raw::c_char, size: std::os::raw::c_uint, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn( device: nvmlDevice_t, memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn( device: nvmlDevice_t, utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn( device: nvmlDevice_t, bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetEccMode = unsafe extern "C" fn( device: nvmlDevice_t, current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn( device: nvmlDevice_t, error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t, counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t, ecc_count: *mut u64, - ) -> nvmlReturn_t; + ) + -> nvmlReturn_t; let get_mig_mode: libloading::Symbol = unsafe { lib.get(b"nvmlDeviceGetMigMode") }?; @@ -937,8 +954,9 @@ impl Collector for GpuCollector { let get_compute_instance_id: libloading::Symbol< NvmlDeviceGetComputeInstanceId, > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; - let get_gpu_instance_by_id: libloading::Symbol = - unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name + let get_gpu_instance_by_id: libloading::Symbol< + NvmlDeviceGetGpuInstanceById, + > = unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name let get_gpu_instance_info: libloading::Symbol = unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; let get_gpu_instance_compute_instance_by_id: libloading::Symbol< @@ -951,8 +969,9 @@ impl Collector for GpuCollector { unsafe { lib.get(b"nvmlDeviceGetUUID") }?; let get_memory_info: libloading::Symbol = unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?; - let get_utilization_rates: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?; + let get_utilization_rates: libloading::Symbol< + NvmlDeviceGetUtilizationRates, + > = unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?; let get_bar1_memory_info: libloading::Symbol = unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?; let get_total_ecc_errors: libloading::Symbol = @@ -961,9 +980,8 @@ impl Collector for GpuCollector { let mut current_mode = 0; let mut pending = 0; let parent_handle = unsafe { device.handle() }; - let mig_mode_res = unsafe { - get_mig_mode(parent_handle, &mut current_mode, &mut pending) - }; + let mig_mode_res = + unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) }; let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS; let enabled = current_mode == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE; @@ -1003,7 +1021,11 @@ impl Collector for GpuCollector { let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE let _ = unsafe { - get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32) + get_uuid( + mig_handle, + uuid_buf.as_mut_ptr(), + uuid_buf.len() as u32, + ) }; let mig_uuid_str = unsafe { std::ffi::CStr::from_ptr(uuid_buf.as_ptr()) @@ -1033,8 +1055,9 @@ impl Collector for GpuCollector { unsafe { std::mem::zeroed() }; gi_info.version = nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2; - let _ = - unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) }; + let _ = unsafe { + get_gpu_instance_info(gi_handle, &mut gi_info) + }; let placement = Some(format!( "{}:slice{}", gi_info.placement.start, gi_info.placement.size @@ -1069,7 +1092,10 @@ impl Collector for GpuCollector { ci_info.version = nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2; let _ = unsafe { - get_compute_instance_info(ci_handle, &mut ci_info) + get_compute_instance_info( + ci_handle, + &mut ci_info, + ) }; ci_nodes.push(ComputeInstanceNode { gpu_instance_id: gi_id, @@ -1078,7 +1104,8 @@ impl Collector for GpuCollector { eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId placement: Some(format!( "{}:slice{}", - ci_info.placement.start, ci_info.placement.size + ci_info.placement.start, + ci_info.placement.size )), }); } @@ -1127,18 +1154,16 @@ impl Collector for GpuCollector { unsafe { std::mem::zeroed() }; let bar1_res = unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) }; - let bar1_total_bytes = - if bar1_res == nvmlReturn_enum_NVML_SUCCESS { - Some(bar1_info.total) - } else { - None - }; - let bar1_used_bytes = - if bar1_res == nvmlReturn_enum_NVML_SUCCESS { - Some(bar1_info.used) - } else { - None - }; + let bar1_total_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS { + Some(bar1_info.total) + } else { + None + }; + let bar1_used_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS { + Some(bar1_info.used) + } else { + None + }; let mut ecc_corrected_val: u64 = 0; let ecc_corrected_res = unsafe { @@ -1251,11 +1276,7 @@ impl Collector for GpuCollector { if let Some(util) = mig.util_percent { metrics .mig_utilization_percent - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(util as f64); if self.k8s_mode { metrics @@ -1271,11 +1292,7 @@ impl Collector for GpuCollector { if let Some(total) = mig.memory_total_bytes { metrics .mig_memory_total_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(total as f64); if self.k8s_mode { metrics @@ -1291,11 +1308,7 @@ impl Collector for GpuCollector { if let Some(used) = mig.memory_used_bytes { metrics .mig_memory_used_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(used as f64); if self.k8s_mode { metrics @@ -1311,11 +1324,7 @@ impl Collector for GpuCollector { if let Some(sm) = mig.sm_count { metrics .mig_sm_count - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(sm as f64); if self.k8s_mode { metrics @@ -1332,11 +1341,7 @@ impl Collector for GpuCollector { if let Some(corrected) = mig.ecc_corrected { metrics .mig_ecc_corrected_total - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .inc_by(corrected); if self.k8s_mode { metrics @@ -1352,11 +1357,7 @@ impl Collector for GpuCollector { if let Some(uncorrected) = mig.ecc_uncorrected { metrics .mig_ecc_uncorrected_total - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .inc_by(uncorrected); if self.k8s_mode { metrics @@ -1374,19 +1375,11 @@ impl Collector for GpuCollector { { metrics .mig_bar1_total_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(total as f64); metrics .mig_bar1_used_bytes - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - ]) + .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) .set(used as f64); if self.k8s_mode { metrics @@ -1571,11 +1564,11 @@ fn build_filter(raw: Option<&str>) -> Option> { #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result { - use std::os::raw::c_uint; use nvml_wrapper_sys::bindings::{ - nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, nvmlReturn_t, - nvmlReturn_enum_NVML_SUCCESS, + nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, + nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t, }; + use std::os::raw::c_uint; // Load NVML dynamically to bypass missing symbols in sys crate let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?; @@ -1595,19 +1588,14 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result nvmlReturn_t; - type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn( - mig_device: nvmlDevice_t, - device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) -> nvmlReturn_t; - type NvmlGpuInstanceGetById = unsafe extern "C" fn( // Corrected function name + type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = + unsafe extern "C" fn(mig_device: nvmlDevice_t, device: *mut nvmlDevice_t) -> nvmlReturn_t; + type NvmlDeviceGetGpuInstanceId = + unsafe extern "C" fn(device: nvmlDevice_t, id: *mut std::os::raw::c_uint) -> nvmlReturn_t; + type NvmlDeviceGetComputeInstanceId = + unsafe extern "C" fn(device: nvmlDevice_t, id: *mut std::os::raw::c_uint) -> nvmlReturn_t; + type NvmlGpuInstanceGetById = unsafe extern "C" fn( + // Corrected function name device: nvmlDevice_t, id: std::os::raw::c_uint, gpu_instance: *mut nvmlDevice_t, @@ -1663,7 +1651,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; let get_gpu_instance_by_id: libloading::Symbol = - unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; + unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name let get_gpu_instance_info: libloading::Symbol = unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; let get_gpu_instance_compute_instance_by_id: libloading::Symbol< @@ -1671,8 +1659,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?; let get_compute_instance_info: libloading::Symbol = unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?; - let get_uuid: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetUUID") }?; + let get_uuid: libloading::Symbol = unsafe { lib.get(b"nvmlDeviceGetUUID") }?; let get_memory_info: libloading::Symbol = unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?; let get_utilization_rates: libloading::Symbol = @@ -1762,7 +1749,11 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Date: Sat, 13 Dec 2025 16:09:19 +0530 Subject: [PATCH 16/19] feat: Add NVML FFI scaffolding for extended PCIe, NVSwitch, and event functionality, guarded by feature flags. --- crates/agent-core/src/nvml_ext.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 888a8c8..09a9abe 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -63,20 +63,23 @@ pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result nvmlReturn_t; - type NvmlDeviceGetPcieReplayCounter = unsafe extern "C" fn( - device: nvmlDevice_t, - value: *mut u32, - ) -> nvmlReturn_t; + type NvmlDeviceGetPcieReplayCounter = + unsafe extern "C" fn(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t; - let get_pcie_stats: libloading::Symbol = lib.get(b"nvmlDeviceGetPcieStats").map_err(|_| NvmlExtError::NotSupported)?; - let get_pcie_replay_counter: libloading::Symbol = lib.get(b"nvmlDeviceGetPcieReplayCounter").map_err(|_| NvmlExtError::NotSupported)?; + let get_pcie_stats: libloading::Symbol = lib + .get(b"nvmlDeviceGetPcieStats") + .map_err(|_| NvmlExtError::NotSupported)?; + let get_pcie_replay_counter: libloading::Symbol = lib + .get(b"nvmlDeviceGetPcieReplayCounter") + .map_err(|_| NvmlExtError::NotSupported)?; let mut corr: u32 = 0; let mut atomic: u32 = 0; @@ -116,17 +119,19 @@ pub unsafe fn get_field_values( device: nvmlDevice_t, field_ids: &[u32], ) -> Result { - unsafe { - let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?; - + let lib = libloading::Library::new("libnvidia-ml.so.1") + .map_err(|_| NvmlExtError::NotSupported)?; + type NvmlDeviceGetFieldValues = unsafe extern "C" fn( device: nvmlDevice_t, valuesCount: u32, values: *mut nvmlFieldValue_t, ) -> nvmlReturn_t; - let get_field_values_fn: libloading::Symbol = lib.get(b"nvmlDeviceGetFieldValues").map_err(|_| NvmlExtError::NotSupported)?; + let get_field_values_fn: libloading::Symbol = lib + .get(b"nvmlDeviceGetFieldValues") + .map_err(|_| NvmlExtError::NotSupported)?; let mut fields: Vec = vec![std::mem::zeroed(); field_ids.len()]; for (i, f) in field_ids.iter().enumerate() { From ddc560c695282854b402161156b8f9f350c77da0 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 16:28:05 +0530 Subject: [PATCH 17/19] fix(gpu): resolve compilation and formatting errors --- crates/agent-core/src/collectors/gpu.rs | 193 +++++++++++++++--------- 1 file changed, 123 insertions(+), 70 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 7ad13d5..79fda53 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -8,7 +8,7 @@ use nvml_wrapper::{ bitmasks::nv_link::PacketTypes, enum_wrappers::device::{Clock, EccCounter, MemoryError, PcieUtilCounter, TemperatureSensor}, enum_wrappers::nv_link::{ErrorCounter as NvLinkErrorCounter, UtilizationCountUnit}, - enums::device::PcieLinkMaxSpeed, + // enums::device::PcieLinkMaxSpeed, // Unused enums::nv_link::Counter as NvLinkCounter, struct_wrappers::nv_link::UtilizationControl, Nvml, @@ -38,7 +38,7 @@ use crate::state::{ #[cfg(all(feature = "gpu", target_os = "linux"))] use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -use nvml_wrapper_sys::bindings::*; +// use nvml_wrapper_sys::bindings::*; // Unused, dynamic loading used instead pub struct GpuCollector { #[cfg(feature = "gpu")] @@ -467,13 +467,13 @@ impl Collector for GpuCollector { metrics .gpu_bar1_total_bytes .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(bar1.total as f64); + .set(bar1.bar1Total as f64); metrics .gpu_bar1_used_bytes .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(bar1.used as f64); - health.bar1_total_bytes = Some(bar1.total); - health.bar1_used_bytes = Some(bar1.used); + .set(bar1.bar1Used as f64); + health.bar1_total_bytes = Some(bar1.bar1Total); + health.bar1_used_bytes = Some(bar1.bar1Used); } if let Ok(enc_info) = device.encoder_utilization() { metrics @@ -692,8 +692,8 @@ impl Collector for GpuCollector { // Note: pcie_link_speed returns the current link speed, not max. // If semantics require max, we might need a different call, but for now matching the existing pattern. if let (Ok(gen), Ok(width), Ok(speed)) = ( - device.pcie_link_gen(), - device.pcie_link_width(), + device.max_pcie_link_gen(), + device.max_pcie_link_width(), device.pcie_link_speed(), ) { let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0; @@ -1512,7 +1512,7 @@ fn k8s_resource_name(prefix: &str, mig_profile: Option<&str>) -> String { } #[cfg(feature = "gpu")] -fn pcie_lane_bytes_per_sec(gen: u32, speed_mt_s: u32) -> f64 { +fn pcie_lane_bytes_per_sec(_gen: u32, speed_mt_s: u32) -> f64 { // PCIe generation to base speed in MT/s per lane // Gen1: 2.5 GT/s, Gen2: 5 GT/s, Gen3: 8 GT/s, Gen4: 16 GT/s, Gen5: 32 GT/s, Gen6: 64 GT/s // Data rate is typically 8/10 encoding for Gen1/2, 128/130 for Gen3+ @@ -1568,7 +1568,6 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Result nvmlReturn_t; type NvmlGpuInstanceGetById = unsafe extern "C" fn( - // Corrected function name device: nvmlDevice_t, id: std::os::raw::c_uint, gpu_instance: *mut nvmlDevice_t, @@ -1651,7 +1649,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; let get_gpu_instance_by_id: libloading::Symbol = - unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name + unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; let get_gpu_instance_info: libloading::Symbol = unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; let get_gpu_instance_compute_instance_by_id: libloading::Symbol< @@ -1674,8 +1672,8 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Result = HashMap::new(); + let mut gi_handles: HashMap = HashMap::new(); let mut ci_nodes: Vec = Vec::new(); for idx in 0..max_count { @@ -1725,8 +1724,10 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result Result Result 0 { - if let Some(gi_node) = gi_map.get(&gi_id) { - let mut ci_handle: nvmlDevice_t = std::ptr::null_mut(); - if unsafe { - get_gpu_instance_compute_instance_by_id( - gi_node.handle, - ci_id, - &mut ci_handle, - ) - } == nvmlReturn_enum_NVML_SUCCESS - { - let mut ci_info: nvmlComputeInstanceInfo_t = unsafe { std::mem::zeroed() }; - ci_info.version = nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2; - let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) }; - ci_nodes.push(ComputeInstanceNode { - gpu_instance_id: gi_id, - id: ci_id, - profile_id: Some(ci_info.profileId), - eng_profile_id: None, // Some(ci_info.engineProfile), - placement: Some(format!( - "{}:slice{}", - ci_info.placement.start, ci_info.placement.size - )), - }); + // Check if we haven't added this CI yet (simple check by iteration or similar, but here we just push) + // To avoid stats duplication, we rely on the fact that we iterate MIG devices. + // However, one CI might be shared? No, MIG device <-> CI is 1:1 usually? + // Actually 1 GI can have multiple CIs. 1 CI can have multiple MIG devices? + // In MIG, a "MIG Device" is conceptually a CI. + // We'll just push CI nodes as we encounter them. Ideally distinct. + // But `ci_nodes` is for the tree structure. + // Let's check uniqueness. + let known = ci_nodes + .iter() + .any(|c| c.gpu_instance_id == gi_id && c.id == ci_id); + + if !known { + if let Some(&gi_handle) = gi_handles.get(&gi_id) { + let mut ci_handle: nvmlDevice_t = std::ptr::null_mut(); + if unsafe { + get_gpu_instance_compute_instance_by_id( + gi_handle, + ci_id, + &mut ci_handle, + ) + } == nvmlReturn_enum_NVML_SUCCESS + { + let mut ci_info: nvmlComputeInstanceInfo_t = + unsafe { std::mem::zeroed() }; + // ci_info.version = ...; // Skip version + let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) }; + ci_nodes.push(ComputeInstanceNode { + gpu_instance_id: gi_id, + id: ci_id, + profile_id: Some(ci_info.profileId), + eng_profile_id: None, + placement: Some(format!( + "{}:slice{}", + ci_info.placement.start, ci_info.placement.size + )), + }); + } } } } + + // Metrics + let mut mem_info: Option = None; + let mut util_gpu: Option = None; + let mut ecc_cor: Option = None; + let mut ecc_uncor: Option = None; + let mut bar1: Option = None; + + { + let mut m = unsafe { std::mem::zeroed() }; + if unsafe { get_memory_info(mig_handle, &mut m) } == nvmlReturn_enum_NVML_SUCCESS { + mem_info = Some(m); + } + + let mut u = unsafe { std::mem::zeroed() }; + if unsafe { get_utilization_rates(mig_handle, &mut u) } + == nvmlReturn_enum_NVML_SUCCESS + { + util_gpu = Some(u.gpu); + } + + let mut b = unsafe { std::mem::zeroed() }; + if unsafe { get_bar1_memory_info(mig_handle, &mut b) } + == nvmlReturn_enum_NVML_SUCCESS + { + bar1 = Some(b); + } + + let mut c_count: u64 = 0; + let mut u_count: u64 = 0; + // NVML_ECC_COUNTER_TYPE_VOLATILE = 0 + // NVML_MEMORY_ERROR_TYPE_CORRECTED = 1 + // NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 2 + if unsafe { get_total_ecc_errors(mig_handle, 1, 0, &mut c_count) } + == nvmlReturn_enum_NVML_SUCCESS + { + ecc_cor = Some(c_count); + } + if unsafe { get_total_ecc_errors(mig_handle, 2, 0, &mut u_count) } + == nvmlReturn_enum_NVML_SUCCESS + { + ecc_uncor = Some(u_count); + } + } + + let mig_id = format!("mig{}", idx); + let placement_str = gi_map + .get(&gi_id) + .and_then(|g| g.placement.clone()) + .unwrap_or_else(|| format!("gi{}", gi_id)); + let profile_str = gi_map + .get(&gi_id) + .and_then(|g| g.profile_id) + .map(|p| p.to_string()); + + devices.push(MigDeviceStatus { + id: mig_uuid.clone().unwrap_or(mig_id.clone()), + uuid: mig_uuid, + memory_total_bytes: mem_info.as_ref().map(|m| m.total), + memory_used_bytes: mem_info.map(|m| m.used), + util_percent: util_gpu, + sm_count: None, // Not retrieving SM count for now + profile: profile_str, + placement: Some(placement_str), + bar1_total_bytes: bar1.as_ref().map(|b| b.bar1Total), + bar1_used_bytes: bar1.map(|b| b.bar1Used), + ecc_corrected: ecc_cor, + ecc_uncorrected: ecc_uncor, + }); } - let mig_id = format!("mig{}", idx); - let placement_str = gi_map - .get(&gi_id) - .and_then(|g| g.placement.clone()) - .unwrap_or_else(|| format!("gi{}", gi_id)); - let profile_str = gi_map - .get(&gi_id) - .and_then(|g| g.profile_id) - .map(|p| p.to_string()); - let ecc_corrected = mig_device - .total_ecc_errors(MemoryError::Corrected, EccCounter::Volatile) - .ok(); - let ecc_uncorrected = mig_device - .total_ecc_errors(MemoryError::Uncorrected, EccCounter::Volatile) - .ok(); - let bar1_info = mig_device.bar1_memory_info().ok(); - - devices.push(MigDeviceStatus { - id: mig_uuid.clone().unwrap_or(mig_id.clone()), - uuid: mig_uuid, - memory_total_bytes: mem_info.as_ref().map(|m| m.total), - memory_used_bytes: mem_info.map(|m| m.used), - util_percent: util.map(|u| u.gpu), - sm_count, - profile: profile_str, - placement: Some(placement_str), - bar1_total_bytes: bar1_info.as_ref().map(|b| b.total), - bar1_used_bytes: bar1_info.map(|b| b.used), - ecc_corrected, - ecc_uncorrected, - }); } Ok(MigTree { From a7e69213af2216c99a4f52a81709e8ef48741c24 Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 16:58:44 +0530 Subject: [PATCH 18/19] Apply strict formatting fixes to gpu.rs --- crates/agent-core/src/collectors/gpu.rs | 680 ++++++------------------ 1 file changed, 166 insertions(+), 514 deletions(-) diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs index 79fda53..d4f07c2 100644 --- a/crates/agent-core/src/collectors/gpu.rs +++ b/crates/agent-core/src/collectors/gpu.rs @@ -38,8 +38,6 @@ use crate::state::{ #[cfg(all(feature = "gpu", target_os = "linux"))] use nvml_wrapper::error::NvmlError; #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -// use nvml_wrapper_sys::bindings::*; // Unused, dynamic loading used instead - pub struct GpuCollector { #[cfg(feature = "gpu")] nvml: Option, @@ -467,13 +465,13 @@ impl Collector for GpuCollector { metrics .gpu_bar1_total_bytes .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(bar1.bar1Total as f64); + .set(bar1.total as f64); metrics .gpu_bar1_used_bytes .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(bar1.bar1Used as f64); - health.bar1_total_bytes = Some(bar1.bar1Total); - health.bar1_used_bytes = Some(bar1.bar1Used); + .set(bar1.used as f64); + health.bar1_total_bytes = Some(bar1.total); + health.bar1_used_bytes = Some(bar1.used); } if let Ok(enc_info) = device.encoder_utilization() { metrics @@ -841,553 +839,189 @@ impl Collector for GpuCollector { if self.enable_mig { #[cfg(all(feature = "gpu-nvml-ffi", feature = "gpu"))] { - use nvml_wrapper_sys::bindings::{ - nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, - nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t, - }; - // Load NVML dynamically to bypass missing symbols in sys crate - let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?; - - // Typedefs for the functions we need - type NvmlDeviceGetMigMode = unsafe extern "C" fn( - device: nvmlDevice_t, - current_mode: *mut std::os::raw::c_uint, - pending_mode: *mut std::os::raw::c_uint, - ) - -> nvmlReturn_t; - type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn( - device: nvmlDevice_t, - count: *mut std::os::raw::c_uint, - ) - -> nvmlReturn_t; - type NvmlDeviceGetMigDeviceHandleByIndex = - unsafe extern "C" fn( - device: nvmlDevice_t, - index: std::os::raw::c_uint, - mig_device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = - unsafe extern "C" fn( - mig_device: nvmlDevice_t, - device: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) - -> nvmlReturn_t; - type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn( - device: nvmlDevice_t, - id: *mut std::os::raw::c_uint, - ) - -> nvmlReturn_t; - type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn( - device: nvmlDevice_t, - id: std::os::raw::c_uint, - gpu_instance: *mut nvmlDevice_t, - ) - -> nvmlReturn_t; - type NvmlGpuInstanceGetInfo = unsafe extern "C" fn( - gpu_instance: nvmlDevice_t, - info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t, - ) - -> nvmlReturn_t; - type NvmlGpuInstanceGetComputeInstanceById = - unsafe extern "C" fn( - gpu_instance: nvmlDevice_t, - id: std::os::raw::c_uint, - compute_instance: *mut nvmlDevice_t, - ) -> nvmlReturn_t; - type NvmlComputeInstanceGetInfo = unsafe extern "C" fn( - compute_instance: nvmlDevice_t, - info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t, - ) - -> nvmlReturn_t; - type NvmlDeviceGetUUID = unsafe extern "C" fn( - device: nvmlDevice_t, - uuid: *mut std::os::raw::c_char, - size: std::os::raw::c_uint, - ) - -> nvmlReturn_t; - type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn( - device: nvmlDevice_t, - memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t, - ) - -> nvmlReturn_t; - type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn( - device: nvmlDevice_t, - utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t, - ) - -> nvmlReturn_t; - type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn( - device: nvmlDevice_t, - bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t, - ) - -> nvmlReturn_t; - type NvmlDeviceGetEccMode = unsafe extern "C" fn( - device: nvmlDevice_t, - current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, - pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t, - ) - -> nvmlReturn_t; - type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn( - device: nvmlDevice_t, - error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t, - counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t, - ecc_count: *mut u64, - ) - -> nvmlReturn_t; - - let get_mig_mode: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetMigMode") }?; - let get_max_mig_device_count: libloading::Symbol< - NvmlDeviceGetMaxMigDeviceCount, - > = unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?; - let get_mig_device_handle_by_index: libloading::Symbol< - NvmlDeviceGetMigDeviceHandleByIndex, - > = unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?; - let get_device_handle_from_mig_device_handle: libloading::Symbol< - NvmlDeviceGetDeviceHandleFromMigDeviceHandle, - > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?; - let get_gpu_instance_id: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?; - let get_compute_instance_id: libloading::Symbol< - NvmlDeviceGetComputeInstanceId, - > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?; - let get_gpu_instance_by_id: libloading::Symbol< - NvmlDeviceGetGpuInstanceById, - > = unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name - let get_gpu_instance_info: libloading::Symbol = - unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?; - let get_gpu_instance_compute_instance_by_id: libloading::Symbol< - NvmlGpuInstanceGetComputeInstanceById, - > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?; - let get_compute_instance_info: libloading::Symbol< - NvmlComputeInstanceGetInfo, - > = unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?; - let get_uuid: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetUUID") }?; - let get_memory_info: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?; - let get_utilization_rates: libloading::Symbol< - NvmlDeviceGetUtilizationRates, - > = unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?; - let get_bar1_memory_info: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?; - let get_total_ecc_errors: libloading::Symbol = - unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?; - - let mut current_mode = 0; - let mut pending = 0; - let parent_handle = unsafe { device.handle() }; - let mig_mode_res = - unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) }; - let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS; - let enabled = current_mode - == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE; - - if !supported || !enabled { - // If MIG is not supported or not enabled, return early with appropriate status - return Ok(MigTree { - supported, - enabled, - gpu_instances: Vec::new(), - compute_instances: Vec::new(), - devices: Vec::new(), - }); - } - - let mut max_count = 0; - unsafe { get_max_mig_device_count(parent_handle, &mut max_count) }; - - let mut devices = Vec::new(); - let mut gi_map: HashMap = HashMap::new(); - let mut gi_handles: HashMap = HashMap::new(); - let mut ci_nodes: Vec = Vec::new(); - - for idx in 0..max_count { - let mut mig_handle: nvmlDevice_t = std::ptr::null_mut(); - if unsafe { - get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle) - } == nvmlReturn_enum_NVML_SUCCESS - { - let mut full_handle: nvmlDevice_t = std::ptr::null_mut(); - unsafe { - get_device_handle_from_mig_device_handle( - mig_handle, - &mut full_handle, - ) - }; - - let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE - let _ = unsafe { - get_uuid( - mig_handle, - uuid_buf.as_mut_ptr(), - uuid_buf.len() as u32, + if let Ok(migs) = collect_mig_devices(nvml, &device) { + metrics + .gpu_mig_enabled + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(if migs.enabled { 1.0 } else { 0.0 }); + // GI/CI info gauges + for gi in &migs.gpu_instances { + metrics + .mig_gpu_instance_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + gi.id.to_string().as_str(), + gi.profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + gi.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); + } + for ci in &migs.compute_instances { + metrics + .mig_compute_instance_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + ci.gpu_instance_id.to_string().as_str(), + ci.id.to_string().as_str(), + ci.profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + ci.eng_profile_id + .map(|p| p.to_string()) + .unwrap_or_default() + .as_str(), + ci.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); + } + for mig in &migs.devices { + let mig_id_string = mig.id.to_string(); + let mig_label = + mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); + let compat_label = if self.k8s_mode { + k8s_resource_name( + self.resource_prefix, + mig.profile.as_deref().or(Some("generic")), ) - }; - let mig_uuid_str = unsafe { - std::ffi::CStr::from_ptr(uuid_buf.as_ptr()) - .to_string_lossy() - .into_owned() - }; - let mig_uuid = if mig_uuid_str.is_empty() { - None } else { - Some(mig_uuid_str.clone()) + mig_label.to_string() }; - - // Extract GI/CI to map hierarchy - let mut gi_id = 0; - let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) }; - let mut ci_id = 0; - let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) }; - - // Populate GI info best-effort - if gi_id > 0 && !gi_map.contains_key(&gi_id) { - let mut gi_handle: nvmlDevice_t = std::ptr::null_mut(); - if unsafe { - get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle) - } == nvmlReturn_enum_NVML_SUCCESS - { - let mut gi_info: nvmlGpuInstanceInfo_t = - unsafe { std::mem::zeroed() }; - gi_info.version = - nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2; - let _ = unsafe { - get_gpu_instance_info(gi_handle, &mut gi_info) - }; - let placement = Some(format!( - "{}:slice{}", - gi_info.placement.start, gi_info.placement.size - )); - gi_map.insert( - gi_id, - GpuInstanceNode { - id: gi_id, - profile_id: Some(gi_info.profileId), - placement, - }, - ); - gi_handles.insert(gi_id, gi_handle); - } - } - - // Populate CI info best-effort - if ci_id > 0 { - if let Some(_gi_node) = gi_map.get(&gi_id) { - if let Some(&gi_handle) = gi_handles.get(&gi_id) { - let mut ci_handle: nvmlDevice_t = std::ptr::null_mut(); - if unsafe { - get_gpu_instance_compute_instance_by_id( - gi_handle, // Assuming GpuInstanceNode stores the handle - ci_id, - &mut ci_handle, - ) - } == nvmlReturn_enum_NVML_SUCCESS - { - let mut ci_info: nvmlComputeInstanceInfo_t = - unsafe { std::mem::zeroed() }; - ci_info.version = - nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2; - let _ = unsafe { - get_compute_instance_info( - ci_handle, - &mut ci_info, - ) - }; - ci_nodes.push(ComputeInstanceNode { - gpu_instance_id: gi_id, - id: ci_id, - profile_id: Some(ci_info.profileId), - eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId - placement: Some(format!( - "{}:slice{}", - ci_info.placement.start, - ci_info.placement.size - )), - }); - } - } - } - } - - let mig_id_label = format!("mig{}", idx); - let placement_str = gi_map - .get(&gi_id) - .and_then(|g| g.placement.clone()) - .unwrap_or_else(|| format!("gi{}", gi_id)); - let profile_str = gi_map - .get(&gi_id) - .and_then(|g| g.profile_id) - .map(|p| p.to_string()); - - let mut mem_info: nvml_wrapper_sys::bindings::nvmlMemory_t = - unsafe { std::mem::zeroed() }; - let mem_info_res = - unsafe { get_memory_info(mig_handle, &mut mem_info) }; - let memory_total_bytes = - if mem_info_res == nvmlReturn_enum_NVML_SUCCESS { - Some(mem_info.total) - } else { - None - }; - let memory_used_bytes = - if mem_info_res == nvmlReturn_enum_NVML_SUCCESS { - Some(mem_info.used) - } else { - None - }; - - let mut util_rates: nvml_wrapper_sys::bindings::nvmlUtilization_t = - unsafe { std::mem::zeroed() }; - let util_res = - unsafe { get_utilization_rates(mig_handle, &mut util_rates) }; - let util_percent = if util_res == nvmlReturn_enum_NVML_SUCCESS { - Some(util_rates.gpu) - } else { - None - }; - - let mut bar1_info: nvml_wrapper_sys::bindings::nvmlBAR1Memory_t = - unsafe { std::mem::zeroed() }; - let bar1_res = - unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) }; - let bar1_total_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS { - Some(bar1_info.total) - } else { - None - }; - let bar1_used_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS { - Some(bar1_info.used) - } else { - None - }; - - let mut ecc_corrected_val: u64 = 0; - let ecc_corrected_res = unsafe { - get_total_ecc_errors( - mig_handle, - nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_CORRECTED, - nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE, - &mut ecc_corrected_val, - ) - }; - let ecc_corrected = - if ecc_corrected_res == nvmlReturn_enum_NVML_SUCCESS { - Some(ecc_corrected_val) - } else { - None - }; - - let mut ecc_uncorrected_val: u64 = 0; - let ecc_uncorrected_res = unsafe { - get_total_ecc_errors( - mig_handle, - nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE, - &mut ecc_uncorrected_val, - ) - }; - let ecc_uncorrected = - if ecc_uncorrected_res == nvmlReturn_enum_NVML_SUCCESS { - Some(ecc_uncorrected_val) - } else { - None - }; - - devices.push(MigDeviceStatus { - id: mig_uuid.clone().unwrap_or(mig_id_label.clone()), - uuid: mig_uuid, - memory_total_bytes, - memory_used_bytes, - util_percent, - sm_count: None, // Not directly available via NVML MIG device handle - profile: profile_str, - placement: Some(placement_str), - bar1_total_bytes, - bar1_used_bytes, - ecc_corrected, - ecc_uncorrected, - }); - } - } - - let migs = MigTree { - supported, - enabled, - gpu_instances: gi_map.values().cloned().collect(), - compute_instances: ci_nodes, - devices, - }; - - metrics - .gpu_mig_enabled - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(if migs.enabled { 1.0 } else { 0.0 }); - // GI/CI info gauges - for gi in &migs.gpu_instances { - metrics - .mig_gpu_instance_info - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - gi.id.to_string().as_str(), - gi.profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - gi.placement.as_deref().unwrap_or(""), - ]) - .set(1.0); - } - for ci in &migs.compute_instances { - metrics - .mig_compute_instance_info - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - ci.gpu_instance_id.to_string().as_str(), - ci.id.to_string().as_str(), - ci.profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - ci.eng_profile_id - .map(|p| p.to_string()) - .unwrap_or_default() - .as_str(), - ci.placement.as_deref().unwrap_or(""), - ]) - .set(1.0); - } - for mig in &migs.devices { - let mig_id_string = mig.id.to_string(); - let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str()); - let compat_label = if self.k8s_mode { - k8s_resource_name( - self.resource_prefix, - mig.profile.as_deref().or(Some("generic")), - ) - } else { - mig_label.to_string() - }; - if let Some(util) = mig.util_percent { - metrics - .mig_utilization_percent - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(util as f64); - if self.k8s_mode { + if let Some(util) = mig.util_percent { metrics .mig_utilization_percent .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(util as f64); + if self.k8s_mode { + metrics + .mig_utilization_percent + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(util as f64); + } } - } - if let Some(total) = mig.memory_total_bytes { - metrics - .mig_memory_total_bytes - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(total as f64); - if self.k8s_mode { + if let Some(total) = mig.memory_total_bytes { metrics .mig_memory_total_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(total as f64); + if self.k8s_mode { + metrics + .mig_memory_total_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(total as f64); + } } - } - if let Some(used) = mig.memory_used_bytes { - metrics - .mig_memory_used_bytes - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(used as f64); - if self.k8s_mode { + if let Some(used) = mig.memory_used_bytes { metrics .mig_memory_used_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(used as f64); + if self.k8s_mode { + metrics + .mig_memory_used_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(used as f64); + } } - } - if let Some(sm) = mig.sm_count { - metrics - .mig_sm_count - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(sm as f64); - if self.k8s_mode { + if let Some(sm) = mig.sm_count { metrics .mig_sm_count .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(sm as f64); + if self.k8s_mode { + metrics + .mig_sm_count + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(sm as f64); + } } - } - // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields - if let Some(corrected) = mig.ecc_corrected { - metrics - .mig_ecc_corrected_total - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .inc_by(corrected); - if self.k8s_mode { + // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields + if let Some(corrected) = mig.ecc_corrected { metrics .mig_ecc_corrected_total .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .inc_by(corrected); + if self.k8s_mode { + metrics + .mig_ecc_corrected_total + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .inc_by(corrected); + } } - } - if let Some(uncorrected) = mig.ecc_uncorrected { - metrics - .mig_ecc_uncorrected_total - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .inc_by(uncorrected); - if self.k8s_mode { + if let Some(uncorrected) = mig.ecc_uncorrected { metrics .mig_ecc_uncorrected_total .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .inc_by(uncorrected); + if self.k8s_mode { + metrics + .mig_ecc_uncorrected_total + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .inc_by(uncorrected); + } } - } - if let (Some(total), Some(used)) = - (mig.bar1_total_bytes, mig.bar1_used_bytes) - { - metrics - .mig_bar1_total_bytes - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(total as f64); - metrics - .mig_bar1_used_bytes - .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label]) - .set(used as f64); - if self.k8s_mode { + if let (Some(total), Some(used)) = + (mig.bar1_total_bytes, mig.bar1_used_bytes) + { metrics .mig_bar1_total_bytes .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(total as f64); metrics @@ -1395,28 +1029,46 @@ impl Collector for GpuCollector { .with_label_values(&[ uuid_label, gpu_label.as_str(), - compat_label.as_str(), + mig_label, ]) .set(used as f64); + if self.k8s_mode { + metrics + .mig_bar1_total_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(total as f64); + metrics + .mig_bar1_used_bytes + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + compat_label.as_str(), + ]) + .set(used as f64); + } } + metrics + .mig_info + .with_label_values(&[ + uuid_label, + gpu_label.as_str(), + mig_label, + mig.profile.as_deref().unwrap_or(""), + mig.placement.as_deref().unwrap_or(""), + ]) + .set(1.0); } + let supported = migs.supported; + status.mig_tree = Some(migs); metrics - .mig_info - .with_label_values(&[ - uuid_label, - gpu_label.as_str(), - mig_label, - mig.profile.as_deref().unwrap_or(""), - mig.placement.as_deref().unwrap_or(""), - ]) - .set(1.0); + .gpu_mig_supported + .with_label_values(&[uuid_label, gpu_label.as_str()]) + .set(if supported { 1.0 } else { 0.0 }); } - let supported = migs.supported; - status.mig_tree = Some(migs); - metrics - .gpu_mig_supported - .with_label_values(&[uuid_label, gpu_label.as_str()]) - .set(if supported { 1.0 } else { 0.0 }); } #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))] @@ -1563,7 +1215,7 @@ fn build_filter(raw: Option<&str>) -> Option> { } #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))] -fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result { +fn collect_mig_devices(_nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result { use nvml_wrapper_sys::bindings::{ nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t, From c3de7416bf7e4e1ce08abd85994bcedc90e2763a Mon Sep 17 00:00:00 2001 From: Shaik Noor Date: Sat, 13 Dec 2025 17:03:48 +0530 Subject: [PATCH 19/19] Fix clippy and snake_case errors in nvml_ext.rs --- crates/agent-core/src/nvml_ext.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs index 09a9abe..d101bfa 100644 --- a/crates/agent-core/src/nvml_ext.rs +++ b/crates/agent-core/src/nvml_ext.rs @@ -5,7 +5,6 @@ use nvml_wrapper_sys::bindings::*; #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))] - /// Errors from extended NVML calls. #[derive(thiserror::Error, Debug)] pub enum NvmlExtError { @@ -125,7 +124,7 @@ pub unsafe fn get_field_values( type NvmlDeviceGetFieldValues = unsafe extern "C" fn( device: nvmlDevice_t, - valuesCount: u32, + values_count: u32, values: *mut nvmlFieldValue_t, ) -> nvmlReturn_t;