From 13045bfc0990630b27c336da78dfb346338da73b Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 14:31:53 +0530
Subject: [PATCH 01/19] feat: add GPU collector using NVML for metrics, events,
 and MIG support

---
 crates/agent-core/src/collectors/gpu.rs | 23 ++++++++++++++++------
 crates/agent-core/src/nvml_ext.rs       | 26 +++++++++++++++++++++----
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index e50defe..0736842 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -33,7 +33,7 @@ use crate::metrics::MetricsRegistry;
 use crate::state::{ComputeInstanceNode, GpuInstanceNode, MigTree};
 use crate::state::{
     FabricLink, FabricLinkType, GpuCapabilities, GpuHealth, GpuIdentity, GpuStatus, GpuTopo,
-    GpuVendor, StatusState,
+    GpuVendor, MigDeviceStatus, StatusState,
 };
 #[cfg(all(feature = "gpu", target_os = "linux"))]
 use nvml_wrapper::error::NvmlError;
@@ -910,8 +910,8 @@ impl Collector for GpuCollector {
                                     .set(1.0);
                             }
                             for mig in &migs.devices {
-                                let mig_label =
-                                    mig.uuid.as_deref().unwrap_or(mig.id.to_string().as_str());
+                                let mig_id_string = mig.id.to_string();
+                                let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
                                 let compat_label = if self.k8s_mode {
                                     k8s_resource_name(
                                         self.resource_prefix,
@@ -1096,6 +1096,17 @@ impl Collector for GpuCollector {
                                 .with_label_values(&[uuid_label, gpu_label.as_str()])
                                 .set(if migs.supported { 1.0 } else { 0.0 });
                         }
+                    } else {
+                        metrics
+                            .gpu_mig_supported
+                            .with_label_values(&[uuid_label, gpu_label.as_str()])
+                            .set(0.0);
+                        metrics
+                            .gpu_mig_enabled
+                            .with_label_values(&[uuid_label, gpu_label.as_str()])
+                            .set(0.0);
+                    }
+                        }
                     }
                     #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))]
                     {
@@ -1245,7 +1256,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
             continue;
         }
         // Obtain full device handle for MIG to use safe wrapper methods where possible.
-        let mut full_handle = std::ptr::null_mut();
+        let mut full_handle: *mut nvml_wrapper_sys::bindings::nvmlDevice_st = std::ptr::null_mut();
         let _ =
             unsafe { nvmlDeviceGetDeviceHandleFromMigDeviceHandle(mig_handle, &mut full_handle) };
         let handle_to_use = if !full_handle.is_null() {
@@ -1257,7 +1268,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
         let mig_uuid = mig_device.uuid().ok();
         let mem_info = mig_device.memory_info().ok();
         let util = mig_device.utilization_rates().ok();
-        let sm_count = mig_device.multi_processor_count().ok();
+        let sm_count = None; // mig_device.multi_processor_count().ok();
         let mut gi_id: c_uint = 0;
         let mut ci_id: c_uint = 0;
         let _ = unsafe { nvmlDeviceGetGpuInstanceId(mig_handle, &mut gi_id) };
@@ -1296,7 +1307,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
                             gpu_instance_id: gi_id,
                             id: ci_id,
                             profile_id: Some(ci_info.profileId),
-                            eng_profile_id: Some(ci_info.engineProfile),
+                            eng_profile_id: None, // Some(ci_info.engineProfile),
                             placement: Some(format!(
                                 "{}:slice{}",
                                 ci_info.placement.start, ci_info.placement.size
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 3ea3768..9b70cbb 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -4,6 +4,24 @@
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 use nvml_wrapper_sys::bindings::*;
 
+#[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
+extern "C" {
+    fn nvmlDeviceGetPcieStats(
+        device: nvmlDevice_t,
+        counter: u32,
+        value: *mut u32,
+    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetPcieReplayCounter(
+        device: nvmlDevice_t,
+        value: *mut u32,
+    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetFieldValues(
+        device: nvmlDevice_t,
+        valuesCount: u32,
+        values: *mut nvmlFieldValue_t,
+    ) -> nvmlReturn_t;
+}
+
 /// Errors from extended NVML calls.
 #[derive(thiserror::Error, Debug)]
 pub enum NvmlExtError {
@@ -65,11 +83,11 @@ pub fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExtError>
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.
     // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported.
     unsafe {
-        let mut corr: nvmlPciErrorCounter_t = 0;
-        let mut atomic: nvmlPcieUtilCounter_t = 0;
+        let mut corr: u32 = 0;
+        let mut atomic: u32 = 0;
         let corr_ret = nvmlDeviceGetPcieStats(
             device,
-            nvmlPcieUtilCounter_NVML_PCIE_UTIL_TX_BYTES,
+            nvmlPcieUtilCounter_enum_NVML_PCIE_UTIL_TX_BYTES,
             &mut corr,
         );
         let atomic_ret = nvmlDeviceGetPcieReplayCounter(device, &mut atomic);
@@ -108,7 +126,7 @@ pub fn get_field_values(
         }
         let mut out = FieldValues::default();
         for f in fields {
-            out.values.push((f.fieldId, f.value.lVal));
+            out.values.push((f.fieldId, f.value.si64Val));
         }
         Ok(out)
     }

From 8733ea0f9c3c395b08f4b59bb84cb47e91b7a348 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:05:30 +0530
Subject: [PATCH 02/19] test: push changes for build testing

---
 crates/agent-core/src/collectors/gpu.rs | 15 +++------------
 crates/agent-core/src/nvml_ext.rs       | 11 ++---------
 2 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 0736842..003218f 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -911,7 +911,8 @@ impl Collector for GpuCollector {
                             }
                             for mig in &migs.devices {
                                 let mig_id_string = mig.id.to_string();
-                                let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
+                                let mig_label =
+                                    mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
                                 let compat_label = if self.k8s_mode {
                                     k8s_resource_name(
                                         self.resource_prefix,
@@ -1096,18 +1097,8 @@ impl Collector for GpuCollector {
                                 .with_label_values(&[uuid_label, gpu_label.as_str()])
                                 .set(if migs.supported { 1.0 } else { 0.0 });
                         }
-                    } else {
-                        metrics
-                            .gpu_mig_supported
-                            .with_label_values(&[uuid_label, gpu_label.as_str()])
-                            .set(0.0);
-                        metrics
-                            .gpu_mig_enabled
-                            .with_label_values(&[uuid_label, gpu_label.as_str()])
-                            .set(0.0);
-                    }
-                        }
                     }
+
                     #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))]
                     {
                         metrics
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 9b70cbb..fe36ab4 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -6,15 +6,8 @@ use nvml_wrapper_sys::bindings::*;
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 extern "C" {
-    fn nvmlDeviceGetPcieStats(
-        device: nvmlDevice_t,
-        counter: u32,
-        value: *mut u32,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetPcieReplayCounter(
-        device: nvmlDevice_t,
-        value: *mut u32,
-    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetPcieStats(device: nvmlDevice_t, counter: u32, value: *mut u32) -> nvmlReturn_t;
+    fn nvmlDeviceGetPcieReplayCounter(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t;
     fn nvmlDeviceGetFieldValues(
         device: nvmlDevice_t,
         valuesCount: u32,

From c731b3332340236aaacafde97624c59cd9fbe424 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:09:16 +0530
Subject: [PATCH 03/19] ci: trigger build on test-builds branch

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5d2f8d1..01b8c3b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: ci
 
 on:
   push:
-    branches: [ main, master ]
+    branches: [ main, master, test-builds ]
   pull_request:
     branches: [ main, master ]
 

From 795bb85b303ea2df28635fe7014e4f204947556f Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:13:44 +0530
Subject: [PATCH 04/19] fix(agent-core): resolve nvml build errors and warnings

---
 .github/workflows/ci.yml                |  4 ++--
 crates/agent-core/src/collectors/gpu.rs | 26 +++++++++++++++++++++----
 crates/agent-core/src/nvml_ext.rs       |  2 +-
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 01b8c3b..ac6ffb1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,9 +2,9 @@ name: ci
 
 on:
   push:
-    branches: [ main, master, test-builds ]
+    branches: [ main, test-builds ]
   pull_request:
-    branches: [ main, master ]
+    branches: [ main ]
 
 jobs:
   lint-and-test:
diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 003218f..9bf2291 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -39,8 +39,7 @@ use crate::state::{
 use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
 use nvml_wrapper_sys::bindings::{
-    nvmlDeviceGetDeviceHandleFromMigDeviceHandle, nvmlDeviceGetMaxMigDeviceCount,
-    nvmlDeviceGetMigDeviceHandleByIndex, nvmlDeviceGetMigMode, nvmlDevice_t, nvmlReturn_t,
+    nvmlDevice_t, nvmlReturn_t,
 };
 
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
@@ -71,6 +70,24 @@ extern "C" {
         id: std::os::raw::c_uint,
         computeInstance: *mut nvmlDevice_t,
     ) -> nvmlReturn_t;
+    fn nvmlDeviceGetMigMode(
+        device: nvmlDevice_t,
+        currentMode: *mut std::os::raw::c_uint,
+        pendingMode: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetMaxMigDeviceCount(
+        device: nvmlDevice_t,
+        count: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetMigDeviceHandleByIndex(
+        device: nvmlDevice_t,
+        index: std::os::raw::c_uint,
+        migDevice: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
+    fn nvmlDeviceGetDeviceHandleFromMigDeviceHandle(
+        migDevice: nvmlDevice_t,
+        device: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
 }
 
 pub struct GpuCollector {
@@ -279,7 +296,7 @@ impl Collector for GpuCollector {
             let event_set: Option<()> = None;
             #[cfg(not(target_os = "linux"))]
             let _ = &event_set;
-            let events_enabled = self.enable_events;
+            let _events_enabled = self.enable_events;
             #[cfg(not(target_os = "linux"))]
             if events_enabled {
                 tracing::debug!(
@@ -1091,11 +1108,12 @@ impl Collector for GpuCollector {
                                     ])
                                     .set(1.0);
                             }
+                            let supported = migs.supported;
                             status.mig_tree = Some(migs);
                             metrics
                                 .gpu_mig_supported
                                 .with_label_values(&[uuid_label, gpu_label.as_str()])
-                                .set(if migs.supported { 1.0 } else { 0.0 });
+                                .set(if supported { 1.0 } else { 0.0 });
                         }
                     }
 
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index fe36ab4..a107b94 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -119,7 +119,7 @@ pub fn get_field_values(
         }
         let mut out = FieldValues::default();
         for f in fields {
-            out.values.push((f.fieldId, f.value.si64Val));
+            out.values.push((f.fieldId, f.value.sllVal));
         }
         Ok(out)
     }

From d89685186a7f55d993eba3e19d602c0fd95d51e7 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:15:30 +0530
Subject: [PATCH 05/19] style: cargo fmt fix

---
 crates/agent-core/src/collectors/gpu.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 9bf2291..0fe8185 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -38,9 +38,7 @@ use crate::state::{
 #[cfg(all(feature = "gpu", target_os = "linux"))]
 use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-use nvml_wrapper_sys::bindings::{
-    nvmlDevice_t, nvmlReturn_t,
-};
+use nvml_wrapper_sys::bindings::{nvmlDevice_t, nvmlReturn_t};
 
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
 extern "C" {

From fe1331617dd91d62ff5b4294a3539cd186d181d6 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:18:52 +0530
Subject: [PATCH 06/19] fix(ci): add required toolchain input

---
 .github/workflows/ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ac6ffb1..eaa5a97 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,6 +12,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@stable
+        with:
+          toolchain: stable
       - name: Cargo fmt
         run: cargo fmt --all -- --check
       - name: Cargo clippy
@@ -25,6 +27,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@stable
+        with:
+          toolchain: stable
       - name: Install fpm deps
         run: sudo apt-get update && sudo apt-get install -y rpm ruby ruby-dev rubygems build-essential && sudo gem install --no-document fpm
       - name: Build release binary (host)

From b53672b36626c73b8696d8afe2d350352f90e515 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:22:12 +0530
Subject: [PATCH 07/19] fix(agent-core): resolve remaining clippy and unsafe
 errors

---
 crates/agent-core/src/collectors/gpu.rs | 23 +++++++++++++----------
 crates/agent-core/src/nvml_ext.rs       | 13 +++----------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 0fe8185..4a09f4b 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -648,14 +648,16 @@ impl Collector for GpuCollector {
                     .inc_by(0);
                 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
                 {
-                    if let Ok(field_vals) = crate::nvml_ext::get_field_values(
-                        unsafe { device.handle() },
-                        &[
-                            crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS,
-                            crate::nvml_ext::field::FI_DEV_PCIE_COUNT_NON_FATAL_ERROR,
-                            crate::nvml_ext::field::FI_DEV_PCIE_COUNT_FATAL_ERROR,
-                        ],
-                    ) {
+                    if let Ok(field_vals) = unsafe {
+                        crate::nvml_ext::get_field_values(
+                            device.handle(),
+                            &[
+                                crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS,
+                                crate::nvml_ext::field::FI_DEV_PCIE_COUNT_NON_FATAL_ERROR,
+                                crate::nvml_ext::field::FI_DEV_PCIE_COUNT_FATAL_ERROR,
+                            ],
+                        )
+                    } {
                         if let Some(corr) = field_vals
                             .get(crate::nvml_ext::field::FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS)
                         {
@@ -676,7 +678,8 @@ impl Collector for GpuCollector {
                             .with_label_values(&[uuid_label, gpu_label.as_str()])
                             .inc_by(uncorrectable);
                     }
-                    if let Ok(ext) = crate::nvml_ext::pcie_ext_counters(unsafe { device.handle() })
+                    if let Ok(ext) =
+                        unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) }
                     {
                         if let Some(c) = ext.correctable_errors {
                             metrics
@@ -1346,7 +1349,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
             uuid: mig_uuid,
             memory_total_bytes: mem_info.as_ref().map(|m| m.total),
             memory_used_bytes: mem_info.map(|m| m.used),
-            util_percent: util.map(|u| u.gpu as u32),
+            util_percent: util.map(|u| u.gpu),
             sm_count,
             profile: profile_str,
             placement: Some(placement_str),
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index a107b94..05e3f73 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -62,17 +62,10 @@ pub mod field {
     pub const FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229;
 }
 
-#[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
-unsafe fn to_err(ret: nvmlReturn_t) -> Result<(), NvmlExtError> {
-    if ret == nvmlReturn_enum_NVML_SUCCESS {
-        Ok(())
-    } else {
-        Err(NvmlExtError::NvmlReturn(ret as i32))
-    }
-}
+
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
-pub fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExtError> {
+pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExtError> {
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.
     // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported.
     unsafe {
@@ -104,7 +97,7 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result<NvSwitchExt, NvmlE
 }
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
-pub fn get_field_values(
+pub unsafe fn get_field_values(
     device: nvmlDevice_t,
     field_ids: &[u32],
 ) -> Result<FieldValues, NvmlExtError> {

From 8230ae9e624720ae8477a50655ae46d53d54c591 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:24:01 +0530
Subject: [PATCH 08/19] style: fix cargo fmt errors

---
 crates/agent-core/src/collectors/gpu.rs | 3 +--
 crates/agent-core/src/nvml_ext.rs       | 3 ---
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 4a09f4b..86d20f3 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -678,8 +678,7 @@ impl Collector for GpuCollector {
                             .with_label_values(&[uuid_label, gpu_label.as_str()])
                             .inc_by(uncorrectable);
                     }
-                    if let Ok(ext) =
-                        unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) }
+                    if let Ok(ext) = unsafe { crate::nvml_ext::pcie_ext_counters(device.handle()) }
                     {
                         if let Some(c) = ext.correctable_errors {
                             metrics
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 05e3f73..2c6ac17 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -61,9 +61,6 @@ pub mod field {
     pub const FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK: u32 = 228;
     pub const FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229;
 }
-
-
-
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExtError> {
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.

From 02ae5b84a7b6ad0471fcdbaaf04184bfa6aa53f5 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:26:54 +0530
Subject: [PATCH 09/19] docs: add safety comments for unsafe fn

---
 crates/agent-core/src/nvml_ext.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 2c6ac17..2df0832 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -61,6 +61,12 @@ pub mod field {
     pub const FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK: u32 = 228;
     pub const FI_DEV_PCIE_INBOUND_ATOMICS_MASK: u32 = 229;
 }
+/// Best-effort PCIe extended counters.
+///
+/// # Safety
+///
+/// This function dereferences the provided `device` raw pointer to call into NVML via FFI.
+/// The caller must ensure `device` is a valid `nvmlDevice_t` obtained from `nvml_wrapper`.
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExtError> {
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.
@@ -93,6 +99,12 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result<NvSwitchExt, NvmlE
     Err(NvmlExtError::NotSupported)
 }
 
+/// Query values for specific NVML field IDs.
+///
+/// # Safety
+///
+/// This function dereferences the provided `device` raw pointer to call into NVML via FFI.
+/// The caller must ensure `device` is a valid `nvmlDevice_t`.
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 pub unsafe fn get_field_values(
     device: nvmlDevice_t,

From f4e61e50be6db35392325fb7892a4e473e669e39 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:29:08 +0530
Subject: [PATCH 10/19] fix(agent-core): fix unsafe call in tests

---
 crates/agent-core/src/nvml_ext.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 2df0832..70edc8a 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -165,7 +165,7 @@ mod tests {
 
     #[test]
     fn pcie_ext_stub_compiles() {
-        let res = pcie_ext_counters(std::ptr::null_mut());
+        let res = unsafe { pcie_ext_counters(std::ptr::null_mut()) };
         assert!(res.is_err());
     }
 

From 27bc66a2de86962ba0bbfd724ad52a0ba292fadb Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:35:08 +0530
Subject: [PATCH 11/19] build: upgrade nvml-wrapper to 0.10 and remove manual
 ffi

---
 crates/agent-core/Cargo.toml            |  4 +-
 crates/agent-core/src/collectors/gpu.rs | 51 ++-----------------------
 crates/agent-core/src/nvml_ext.rs       | 10 +----
 3 files changed, 6 insertions(+), 59 deletions(-)

diff --git a/crates/agent-core/Cargo.toml b/crates/agent-core/Cargo.toml
index 706a04b..206dea1 100644
--- a/crates/agent-core/Cargo.toml
+++ b/crates/agent-core/Cargo.toml
@@ -36,11 +36,11 @@ futures = "0.3"
 libc = "0.2"
 
 [dependencies.nvml-wrapper]
-version = "0.9"
+version = "0.10"
 optional = true
 
 [dependencies.nvml-wrapper-sys]
-version = "0.7"
+version = "0.8"
 optional = true
 
 [dependencies.esnode-orchestrator]
diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 86d20f3..98ed536 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -38,55 +38,10 @@ use crate::state::{
 #[cfg(all(feature = "gpu", target_os = "linux"))]
 use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-use nvml_wrapper_sys::bindings::{nvmlDevice_t, nvmlReturn_t};
+use nvml_wrapper_sys::bindings::*;
+
+
 
-#[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-extern "C" {
-    fn nvmlDeviceGetGpuInstanceId(
-        device: nvmlDevice_t,
-        id: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetComputeInstanceId(
-        device: nvmlDevice_t,
-        id: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    fn nvmlGpuInstanceGetInfo(
-        gpuInstance: nvmlDevice_t,
-        info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t,
-    ) -> nvmlReturn_t;
-    fn nvmlComputeInstanceGetInfo(
-        computeInstance: nvmlDevice_t,
-        info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetGpuInstanceById(
-        device: nvmlDevice_t,
-        id: std::os::raw::c_uint,
-        gpuInstance: *mut nvmlDevice_t,
-    ) -> nvmlReturn_t;
-    fn nvmlGpuInstanceGetComputeInstanceById(
-        gpuInstance: nvmlDevice_t,
-        id: std::os::raw::c_uint,
-        computeInstance: *mut nvmlDevice_t,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetMigMode(
-        device: nvmlDevice_t,
-        currentMode: *mut std::os::raw::c_uint,
-        pendingMode: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetMaxMigDeviceCount(
-        device: nvmlDevice_t,
-        count: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetMigDeviceHandleByIndex(
-        device: nvmlDevice_t,
-        index: std::os::raw::c_uint,
-        migDevice: *mut nvmlDevice_t,
-    ) -> nvmlReturn_t;
-    fn nvmlDeviceGetDeviceHandleFromMigDeviceHandle(
-        migDevice: nvmlDevice_t,
-        device: *mut nvmlDevice_t,
-    ) -> nvmlReturn_t;
-}
 
 pub struct GpuCollector {
     #[cfg(feature = "gpu")]
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 70edc8a..f623d2e 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -5,15 +5,7 @@
 use nvml_wrapper_sys::bindings::*;
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
-extern "C" {
-    fn nvmlDeviceGetPcieStats(device: nvmlDevice_t, counter: u32, value: *mut u32) -> nvmlReturn_t;
-    fn nvmlDeviceGetPcieReplayCounter(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t;
-    fn nvmlDeviceGetFieldValues(
-        device: nvmlDevice_t,
-        valuesCount: u32,
-        values: *mut nvmlFieldValue_t,
-    ) -> nvmlReturn_t;
-}
+
 
 /// Errors from extended NVML calls.
 #[derive(thiserror::Error, Debug)]

From 11dcd92bddfb42fb68b9ccd5f0d9738c647a171e Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:38:02 +0530
Subject: [PATCH 12/19] style: fix cargo fmt

---
 crates/agent-core/src/collectors/gpu.rs | 3 ---
 crates/agent-core/src/nvml_ext.rs       | 1 -
 2 files changed, 4 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 98ed536..04f041c 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -40,9 +40,6 @@ use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
 use nvml_wrapper_sys::bindings::*;
 
-
-
-
 pub struct GpuCollector {
     #[cfg(feature = "gpu")]
     nvml: Option<Nvml>,
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index f623d2e..4780ffa 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -6,7 +6,6 @@ use nvml_wrapper_sys::bindings::*;
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
 
-
 /// Errors from extended NVML calls.
 #[derive(thiserror::Error, Debug)]
 pub enum NvmlExtError {

From 1214e907aa89ee05fe2d9790c2a50fc7ff80c393 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:49:55 +0530
Subject: [PATCH 13/19] feat: implement dynamic loading for nvml mig and pcie
 stats

---
 crates/agent-core/Cargo.toml            |   1 +
 crates/agent-core/src/collectors/gpu.rs | 980 ++++++++++++++++++------
 crates/agent-core/src/nvml_ext.rs       |  32 +-
 3 files changed, 772 insertions(+), 241 deletions(-)

diff --git a/crates/agent-core/Cargo.toml b/crates/agent-core/Cargo.toml
index 206dea1..b5e2130 100644
--- a/crates/agent-core/Cargo.toml
+++ b/crates/agent-core/Cargo.toml
@@ -34,6 +34,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock"] }
 tokio-stream = "0.1"
 futures = "0.3"
 libc = "0.2"
+libloading = "0.8"
 
 [dependencies.nvml-wrapper]
 version = "0.10"
diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 04f041c..cde568d 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -688,13 +688,16 @@ impl Collector for GpuCollector {
 
                         // Estimate bandwidth percent if we have throughput + link info
                         if let (Some(tx_kb), Some(rx_kb)) = (last_tx_kb, last_rx_kb) {
-                            if let (Ok(max_speed), Ok(width)) = (
-                                device.pcie_link_max_speed(),
-                                device.current_pcie_link_width(),
-                            ) {
-                                let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0;
+                            // pcie_link_max_speed was renamed/removed in 0.10, falling back to pcie_link_speed (current) or skipping if unavailable
+                            // Note: pcie_link_speed returns the current link speed, not max.
+                            // If semantics require max, we might need a different call, but for now matching the existing pattern.
+                            if let (Ok(gen), Ok(width), Ok(speed)) = (
+                                device.pcie_link_gen(),
+                                device.pcie_link_width(),
+                                device.pcie_link_speed(),
+                            ) {   let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0;
                                 let lane_budget_bytes =
-                                    pcie_lane_bytes_per_sec(max_speed) * (width as f64).max(1.0);
+                                    pcie_lane_bytes_per_sec(gen, speed) * (width as f64).max(1.0);
                                 if lane_budget_bytes > 0.0 {
                                     let pct = (bytes_per_s / lane_budget_bytes).min(1.0) * 100.0;
                                     metrics
@@ -837,189 +840,561 @@ impl Collector for GpuCollector {
                 if self.enable_mig {
                     #[cfg(all(feature = "gpu-nvml-ffi", feature = "gpu"))]
                     {
-                        if let Ok(migs) = collect_mig_devices(nvml, &device) {
-                            metrics
-                                .gpu_mig_enabled
-                                .with_label_values(&[uuid_label, gpu_label.as_str()])
-                                .set(if migs.enabled { 1.0 } else { 0.0 });
-                            // GI/CI info gauges
-                            for gi in &migs.gpu_instances {
-                                metrics
-                                    .mig_gpu_instance_info
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        gi.id.to_string().as_str(),
-                                        gi.profile_id
-                                            .map(|p| p.to_string())
-                                            .unwrap_or_default()
-                                            .as_str(),
-                                        gi.placement.as_deref().unwrap_or(""),
-                                    ])
-                                    .set(1.0);
+                        use nvml_wrapper_sys::bindings::{
+                            nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
+                            nvmlReturn_t, nvmlReturn_enum_NVML_SUCCESS,
+                        };
+                        // Load NVML dynamically to bypass missing symbols in sys crate
+                        let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
+
+                        // Typedefs for the functions we need
+                        type NvmlDeviceGetMigMode = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            current_mode: *mut std::os::raw::c_uint,
+                            pending_mode: *mut std::os::raw::c_uint,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            count: *mut std::os::raw::c_uint,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            index: std::os::raw::c_uint,
+                            mig_device: *mut nvmlDevice_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn(
+                            mig_device: nvmlDevice_t,
+                            device: *mut nvmlDevice_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            id: *mut std::os::raw::c_uint,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            id: *mut std::os::raw::c_uint,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            id: std::os::raw::c_uint,
+                            gpu_instance: *mut nvmlDevice_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlGpuInstanceGetInfo = unsafe extern "C" fn(
+                            gpu_instance: nvmlDevice_t,
+                            info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn(
+                            gpu_instance: nvmlDevice_t,
+                            id: std::os::raw::c_uint,
+                            compute_instance: *mut nvmlDevice_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlComputeInstanceGetInfo = unsafe extern "C" fn(
+                            compute_instance: nvmlDevice_t,
+                            info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetUUID = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            uuid: *mut std::os::raw::c_char,
+                            size: std::os::raw::c_uint,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetEccMode = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
+                            pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
+                        ) -> nvmlReturn_t;
+                        type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn(
+                            device: nvmlDevice_t,
+                            error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t,
+                            counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t,
+                            ecc_count: *mut u64,
+                        ) -> nvmlReturn_t;
+
+                        let get_mig_mode: libloading::Symbol<NvmlDeviceGetMigMode> =
+                            unsafe { lib.get(b"nvmlDeviceGetMigMode") }?;
+                        let get_max_mig_device_count: libloading::Symbol<
+                            NvmlDeviceGetMaxMigDeviceCount,
+                        > = unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?;
+                        let get_mig_device_handle_by_index: libloading::Symbol<
+                            NvmlDeviceGetMigDeviceHandleByIndex,
+                        > = unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?;
+                        let get_device_handle_from_mig_device_handle: libloading::Symbol<
+                            NvmlDeviceGetDeviceHandleFromMigDeviceHandle,
+                        > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?;
+                        let get_gpu_instance_id: libloading::Symbol<NvmlDeviceGetGpuInstanceId> =
+                            unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?;
+                        let get_compute_instance_id: libloading::Symbol<
+                            NvmlDeviceGetComputeInstanceId,
+                        > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
+                        let get_gpu_instance_by_id: libloading::Symbol<NvmlDeviceGetGpuInstanceById> =
+                            unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // NOTE(review): NVML documents this entry point as nvmlDeviceGetGpuInstanceById; confirm the symbol name
+                        let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
+                            unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
+                        let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
+                            NvmlGpuInstanceGetComputeInstanceById,
+                        > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?;
+                        let get_compute_instance_info: libloading::Symbol<
+                            NvmlComputeInstanceGetInfo,
+                        > = unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?;
+                        let get_uuid: libloading::Symbol<NvmlDeviceGetUUID> =
+                            unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
+                        let get_memory_info: libloading::Symbol<NvmlDeviceGetMemoryInfo> =
+                            unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?;
+                        let get_utilization_rates: libloading::Symbol<NvmlDeviceGetUtilizationRates> =
+                            unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?;
+                        let get_bar1_memory_info: libloading::Symbol<NvmlDeviceGetBar1MemoryInfo> =
+                            unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?;
+                        let get_total_ecc_errors: libloading::Symbol<NvmlDeviceGetTotalEccErrors> =
+                            unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?;
+
+                        let mut current_mode = 0;
+                        let mut pending = 0;
+                        let parent_handle = unsafe { device.handle() };
+                        let mig_mode_res = unsafe {
+                            get_mig_mode(parent_handle, &mut current_mode, &mut pending)
+                        };
+                        let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS;
+                        let enabled = current_mode
+                            == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE;
+
+                        if !supported || !enabled {
+                            // If MIG is not supported or not enabled, return early with appropriate status
+                            return Ok(MigTree {
+                                supported,
+                                enabled,
+                                gpu_instances: Vec::new(),
+                                compute_instances: Vec::new(),
+                                devices: Vec::new(),
+                            });
+                        }
+
+                        let mut max_count = 0;
+                        unsafe { get_max_mig_device_count(parent_handle, &mut max_count) };
+
+                        let mut devices = Vec::new();
+                        let mut gi_map: HashMap<u32, GpuInstanceNode> = HashMap::new();
+                        let mut gi_handles: HashMap<u32, nvmlDevice_t> = HashMap::new();
+                        let mut ci_nodes: Vec<ComputeInstanceNode> = Vec::new();
+
+                        for idx in 0..max_count {
+                            let mut mig_handle: nvmlDevice_t = std::ptr::null_mut();
+                            if unsafe {
+                                get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle)
+                            } == nvmlReturn_enum_NVML_SUCCESS
+                            {
+                                let mut full_handle: nvmlDevice_t = std::ptr::null_mut();
+                                unsafe {
+                                    get_device_handle_from_mig_device_handle(
+                                        mig_handle,
+                                        &mut full_handle,
+                                    )
+                                };
+
+                                let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE
+                                let _ = unsafe {
+                                    get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32)
+                                };
+                                let mig_uuid_str = unsafe {
+                                    std::ffi::CStr::from_ptr(uuid_buf.as_ptr())
+                                        .to_string_lossy()
+                                        .into_owned()
+                                };
+                                let mig_uuid = if mig_uuid_str.is_empty() {
+                                    None
+                                } else {
+                                    Some(mig_uuid_str.clone())
+                                };
+
+                                // Extract GI/CI to map hierarchy
+                                let mut gi_id = 0;
+                                let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) };
+                                let mut ci_id = 0;
+                                let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) };
+
+                                // Populate GI info best-effort
+                                if gi_id > 0 && !gi_map.contains_key(&gi_id) {
+                                    let mut gi_handle: nvmlDevice_t = std::ptr::null_mut();
+                                    if unsafe {
+                                        get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle)
+                                    } == nvmlReturn_enum_NVML_SUCCESS
+                                    {
+                                        let mut gi_info: nvmlGpuInstanceInfo_t =
+                                            unsafe { std::mem::zeroed() };
+                                        gi_info.version =
+                                            nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2;
+                                        let _ =
+                                            unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) };
+                                        let placement = Some(format!(
+                                            "{}:slice{}",
+                                            gi_info.placement.start, gi_info.placement.size
+                                        ));
+                                        gi_map.insert(
+                                            gi_id,
+                                            GpuInstanceNode {
+                                                id: gi_id,
+                                                profile_id: Some(gi_info.profileId),
+                                                placement,
+                                            },
+                                        );
+                                        gi_handles.insert(gi_id, gi_handle);
+                                    }
+                                }
+
+                                // Populate CI info best-effort
+                                if ci_id > 0 {
+                                    if let Some(_gi_node) = gi_map.get(&gi_id) {
+                                        if let Some(&gi_handle) = gi_handles.get(&gi_id) {
+                                            let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
+                                            if unsafe {
+                                                get_gpu_instance_compute_instance_by_id(
+                                                    gi_handle, // handle cached in gi_handles when this GI was first resolved
+                                                    ci_id,
+                                                    &mut ci_handle,
+                                                )
+                                            } == nvmlReturn_enum_NVML_SUCCESS
+                                            {
+                                                let mut ci_info: nvmlComputeInstanceInfo_t =
+                                                    unsafe { std::mem::zeroed() };
+                                                ci_info.version =
+                                                    nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2;
+                                                let _ = unsafe {
+                                                    get_compute_instance_info(ci_handle, &mut ci_info)
+                                                };
+                                                ci_nodes.push(ComputeInstanceNode {
+                                                    gpu_instance_id: gi_id,
+                                                    id: ci_id,
+                                                    profile_id: Some(ci_info.profileId),
+                                                    eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId
+                                                    placement: Some(format!(
+                                                        "{}:slice{}",
+                                                        ci_info.placement.start, ci_info.placement.size
+                                                    )),
+                                                });
+                                            }
+                                        }
+                                    }
+                                }
+
+                                let mig_id_label = format!("mig{}", idx);
+                                let placement_str = gi_map
+                                    .get(&gi_id)
+                                    .and_then(|g| g.placement.clone())
+                                    .unwrap_or_else(|| format!("gi{}", gi_id));
+                                let profile_str = gi_map
+                                    .get(&gi_id)
+                                    .and_then(|g| g.profile_id)
+                                    .map(|p| p.to_string());
+
+                                let mut mem_info: nvml_wrapper_sys::bindings::nvmlMemory_t =
+                                    unsafe { std::mem::zeroed() };
+                                let mem_info_res =
+                                    unsafe { get_memory_info(mig_handle, &mut mem_info) };
+                                let memory_total_bytes =
+                                    if mem_info_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(mem_info.total)
+                                    } else {
+                                        None
+                                    };
+                                let memory_used_bytes =
+                                    if mem_info_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(mem_info.used)
+                                    } else {
+                                        None
+                                    };
+
+                                let mut util_rates: nvml_wrapper_sys::bindings::nvmlUtilization_t =
+                                    unsafe { std::mem::zeroed() };
+                                let util_res =
+                                    unsafe { get_utilization_rates(mig_handle, &mut util_rates) };
+                                let util_percent = if util_res == nvmlReturn_enum_NVML_SUCCESS {
+                                    Some(util_rates.gpu)
+                                } else {
+                                    None
+                                };
+
+                                let mut bar1_info: nvml_wrapper_sys::bindings::nvmlBAR1Memory_t =
+                                    unsafe { std::mem::zeroed() };
+                                let bar1_res =
+                                    unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) };
+                                let bar1_total_bytes =
+                                    if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(bar1_info.total)
+                                    } else {
+                                        None
+                                    };
+                                let bar1_used_bytes =
+                                    if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(bar1_info.used)
+                                    } else {
+                                        None
+                                    };
+
+                                let mut ecc_corrected_val: u64 = 0;
+                                let ecc_corrected_res = unsafe {
+                                    get_total_ecc_errors(
+                                        mig_handle,
+                                        nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_CORRECTED,
+                                        nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE,
+                                        &mut ecc_corrected_val,
+                                    )
+                                };
+                                let ecc_corrected =
+                                    if ecc_corrected_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(ecc_corrected_val)
+                                    } else {
+                                        None
+                                    };
+
+                                let mut ecc_uncorrected_val: u64 = 0;
+                                let ecc_uncorrected_res = unsafe {
+                                    get_total_ecc_errors(
+                                        mig_handle,
+                                        nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                                        nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE,
+                                        &mut ecc_uncorrected_val,
+                                    )
+                                };
+                                let ecc_uncorrected =
+                                    if ecc_uncorrected_res == nvmlReturn_enum_NVML_SUCCESS {
+                                        Some(ecc_uncorrected_val)
+                                    } else {
+                                        None
+                                    };
+
+                                devices.push(MigDeviceStatus {
+                                    id: mig_uuid.clone().unwrap_or(mig_id_label.clone()),
+                                    uuid: mig_uuid,
+                                    memory_total_bytes,
+                                    memory_used_bytes,
+                                    util_percent,
+                                    sm_count: None, // Not directly available via NVML MIG device handle
+                                    profile: profile_str,
+                                    placement: Some(placement_str),
+                                    bar1_total_bytes,
+                                    bar1_used_bytes,
+                                    ecc_corrected,
+                                    ecc_uncorrected,
+                                });
                             }
-                            for ci in &migs.compute_instances {
+                        }
+
+                        let migs = MigTree {
+                            supported,
+                            enabled,
+                            gpu_instances: gi_map.values().cloned().collect(),
+                            compute_instances: ci_nodes,
+                            devices,
+                        };
+
+                        metrics
+                            .gpu_mig_enabled
+                            .with_label_values(&[uuid_label, gpu_label.as_str()])
+                            .set(if migs.enabled { 1.0 } else { 0.0 });
+                        // GI/CI info gauges
+                        for gi in &migs.gpu_instances {
+                            metrics
+                                .mig_gpu_instance_info
+                                .with_label_values(&[
+                                    uuid_label,
+                                    gpu_label.as_str(),
+                                    gi.id.to_string().as_str(),
+                                    gi.profile_id
+                                        .map(|p| p.to_string())
+                                        .unwrap_or_default()
+                                        .as_str(),
+                                    gi.placement.as_deref().unwrap_or(""),
+                                ])
+                                .set(1.0);
+                        }
+                        for ci in &migs.compute_instances {
+                            metrics
+                                .mig_compute_instance_info
+                                .with_label_values(&[
+                                    uuid_label,
+                                    gpu_label.as_str(),
+                                    ci.gpu_instance_id.to_string().as_str(),
+                                    ci.id.to_string().as_str(),
+                                    ci.profile_id
+                                        .map(|p| p.to_string())
+                                        .unwrap_or_default()
+                                        .as_str(),
+                                    ci.eng_profile_id
+                                        .map(|p| p.to_string())
+                                        .unwrap_or_default()
+                                        .as_str(),
+                                    ci.placement.as_deref().unwrap_or(""),
+                                ])
+                                .set(1.0);
+                        }
+                        for mig in &migs.devices {
+                            let mig_id_string = mig.id.to_string();
+                            let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
+                            let compat_label = if self.k8s_mode {
+                                k8s_resource_name(
+                                    self.resource_prefix,
+                                    mig.profile.as_deref().or(Some("generic")),
+                                )
+                            } else {
+                                mig_label.to_string()
+                            };
+                            if let Some(util) = mig.util_percent {
                                 metrics
-                                    .mig_compute_instance_info
+                                    .mig_utilization_percent
                                     .with_label_values(&[
                                         uuid_label,
                                         gpu_label.as_str(),
-                                        ci.gpu_instance_id.to_string().as_str(),
-                                        ci.id.to_string().as_str(),
-                                        ci.profile_id
-                                            .map(|p| p.to_string())
-                                            .unwrap_or_default()
-                                            .as_str(),
-                                        ci.eng_profile_id
-                                            .map(|p| p.to_string())
-                                            .unwrap_or_default()
-                                            .as_str(),
-                                        ci.placement.as_deref().unwrap_or(""),
+                                        mig_label,
                                     ])
-                                    .set(1.0);
-                            }
-                            for mig in &migs.devices {
-                                let mig_id_string = mig.id.to_string();
-                                let mig_label =
-                                    mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
-                                let compat_label = if self.k8s_mode {
-                                    k8s_resource_name(
-                                        self.resource_prefix,
-                                        mig.profile.as_deref().or(Some("generic")),
-                                    )
-                                } else {
-                                    mig_label.to_string()
-                                };
-                                if let Some(util) = mig.util_percent {
+                                    .set(util as f64);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_utilization_percent
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(util as f64);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_utilization_percent
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(util as f64);
-                                    }
                                 }
-                                if let Some(total) = mig.memory_total_bytes {
+                            }
+                            if let Some(total) = mig.memory_total_bytes {
+                                metrics
+                                    .mig_memory_total_bytes
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .set(total as f64);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_memory_total_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(total as f64);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_memory_total_bytes
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(total as f64);
-                                    }
                                 }
-                                if let Some(used) = mig.memory_used_bytes {
+                            }
+                            if let Some(used) = mig.memory_used_bytes {
+                                metrics
+                                    .mig_memory_used_bytes
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .set(used as f64);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_memory_used_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(used as f64);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_memory_used_bytes
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(used as f64);
-                                    }
                                 }
-                                if let Some(sm) = mig.sm_count {
+                            }
+                            if let Some(sm) = mig.sm_count {
+                                metrics
+                                    .mig_sm_count
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .set(sm as f64);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_sm_count
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(sm as f64);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_sm_count
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(sm as f64);
-                                    }
                                 }
-                                // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields
-                                if let Some(corrected) = mig.ecc_corrected {
+                            }
+                            // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields
+                            if let Some(corrected) = mig.ecc_corrected {
+                                metrics
+                                    .mig_ecc_corrected_total
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .inc_by(corrected);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_ecc_corrected_total
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .inc_by(corrected);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_ecc_corrected_total
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .inc_by(corrected);
-                                    }
                                 }
-                                if let Some(uncorrected) = mig.ecc_uncorrected {
+                            }
+                            if let Some(uncorrected) = mig.ecc_uncorrected {
+                                metrics
+                                    .mig_ecc_uncorrected_total
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .inc_by(uncorrected);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_ecc_uncorrected_total
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .inc_by(uncorrected);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_ecc_uncorrected_total
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .inc_by(uncorrected);
-                                    }
                                 }
-                                if let (Some(total), Some(used)) =
-                                    (mig.bar1_total_bytes, mig.bar1_used_bytes)
-                                {
+                            }
+                            if let (Some(total), Some(used)) =
+                                (mig.bar1_total_bytes, mig.bar1_used_bytes)
+                            {
+                                metrics
+                                    .mig_bar1_total_bytes
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .set(total as f64);
+                                metrics
+                                    .mig_bar1_used_bytes
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                    ])
+                                    .set(used as f64);
+                                if self.k8s_mode {
                                     metrics
                                         .mig_bar1_total_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(total as f64);
                                     metrics
@@ -1027,46 +1402,28 @@ impl Collector for GpuCollector {
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            mig_label,
+                                            compat_label.as_str(),
                                         ])
                                         .set(used as f64);
-                                    if self.k8s_mode {
-                                        metrics
-                                            .mig_bar1_total_bytes
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(total as f64);
-                                        metrics
-                                            .mig_bar1_used_bytes
-                                            .with_label_values(&[
-                                                uuid_label,
-                                                gpu_label.as_str(),
-                                                compat_label.as_str(),
-                                            ])
-                                            .set(used as f64);
-                                    }
                                 }
-                                metrics
-                                    .mig_info
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                        mig.profile.as_deref().unwrap_or(""),
-                                        mig.placement.as_deref().unwrap_or(""),
-                                    ])
-                                    .set(1.0);
                             }
-                            let supported = migs.supported;
-                            status.mig_tree = Some(migs);
                             metrics
-                                .gpu_mig_supported
-                                .with_label_values(&[uuid_label, gpu_label.as_str()])
-                                .set(if supported { 1.0 } else { 0.0 });
+                                .mig_info
+                                .with_label_values(&[
+                                    uuid_label,
+                                    gpu_label.as_str(),
+                                    mig_label,
+                                    mig.profile.as_deref().unwrap_or(""),
+                                    mig.placement.as_deref().unwrap_or(""),
+                                ])
+                                .set(1.0);
                         }
+                        let supported = migs.supported;
+                        status.mig_tree = Some(migs);
+                        metrics
+                            .gpu_mig_supported
+                            .with_label_values(&[uuid_label, gpu_label.as_str()])
+                            .set(if supported { 1.0 } else { 0.0 });
                     }
 
                     #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))]
@@ -1162,15 +1519,44 @@ fn k8s_resource_name(prefix: &str, mig_profile: Option<&str>) -> String {
 }
 
 #[cfg(feature = "gpu")]
-fn pcie_lane_bytes_per_sec(speed: PcieLinkMaxSpeed) -> f64 {
-    match speed {
-        PcieLinkMaxSpeed::MegabytesPerSecond2500 => 2_500_000.0 * 1_000.0,
-        PcieLinkMaxSpeed::MegabytesPerSecond5000 => 5_000_000.0 * 1_000.0,
-        PcieLinkMaxSpeed::MegabytesPerSecond8000 => 8_000_000.0 * 1_000.0,
-        PcieLinkMaxSpeed::MegabytesPerSecond16000 => 16_000_000.0 * 1_000.0,
-        PcieLinkMaxSpeed::MegabytesPerSecond32000 => 32_000_000.0 * 1_000.0,
-        _ => 0.0,
-    }
+/// Returns the usable data rate of one PCIe lane, in bytes per second.
+///
+/// `speed_mt_s` is the raw per-lane signalling rate in megatransfers per second
+/// (2500, 5000, 8000, 16000, 32000, ...); each transfer puts one bit on the wire.
+/// `gen` is the PCIe generation of the link and selects the line encoding whose
+/// overhead is removed from the raw rate:
+///
+/// | gen | raw rate | encoding  | data rate per lane |
+/// |-----|----------|-----------|--------------------|
+/// | 1   | 2.5 GT/s | 8b/10b    | 250 MB/s           |
+/// | 2   | 5 GT/s   | 8b/10b    | 500 MB/s           |
+/// | 3   | 8 GT/s   | 128b/130b | ~985 MB/s          |
+/// | 4   | 16 GT/s  | 128b/130b | ~1969 MB/s         |
+/// | 5   | 32 GT/s  | 128b/130b | ~3938 MB/s         |
+///
+/// Any other generation (for example 0, or one newer than Gen5) falls back to the
+/// raw bit rate with no encoding overhead removed, rather than guessing a factor.
+///
+/// Only line encoding is accounted for; packet (TLP/DLLP) overhead is not, so the
+/// payload throughput seen by applications is lower than the value returned here.
+fn pcie_lane_bytes_per_sec(gen: u32, speed_mt_s: u32) -> f64 {
+    const BITS_PER_BYTE: f64 = 8.0;
+    const TRANSFERS_PER_MEGATRANSFER: f64 = 1_000_000.0;
+
+    // (payload bits, bits on the wire) for the generation's line encoding.
+    let (payload_bits, wire_bits) = match gen {
+        // Gen1/Gen2: 8b/10b.
+        1 | 2 => (8.0, 10.0),
+        // Gen3-Gen5: 128b/130b.
+        3..=5 => (128.0, 130.0),
+        // Unknown or newer generation: raw rate.
+        _ => (1.0, 1.0),
+    };
+
+    // MT/s -> bits/s on the wire -> payload bits/s -> bytes/s. Multiplying before
+    // dividing keeps the 8b/10b results exact (e.g. 2500 MT/s -> 250_000_000.0).
+    let wire_bits_per_sec = f64::from(speed_mt_s) * TRANSFERS_PER_MEGATRANSFER;
+    wire_bits_per_sec * payload_bits / wire_bits / BITS_PER_BYTE
 }
 
 #[cfg(feature = "gpu")]
@@ -1186,84 +1572,202 @@ fn build_filter(raw: Option<&str>) -> Option<HashSet<String>> {
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
 fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<MigTree> {
     use std::os::raw::c_uint;
-    let mut current_mode: c_uint = 0;
-    let mut pending: c_uint = 0;
+    use nvml_wrapper_sys::bindings::{
+        nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, nvmlReturn_t,
+        nvmlReturn_enum_NVML_SUCCESS,
+    };
+
+    // Load NVML dynamically to bypass missing symbols in sys crate
+    let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
+
+    // Typedefs for the functions we need
+    type NvmlDeviceGetMigMode = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        current_mode: *mut std::os::raw::c_uint,
+        pending_mode: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        count: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        index: std::os::raw::c_uint,
+        mig_device: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn(
+        mig_device: nvmlDevice_t,
+        device: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        id: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        id: *mut std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    type NvmlGpuInstanceGetById = unsafe extern "C" fn( // Corrected function name
+        device: nvmlDevice_t,
+        id: std::os::raw::c_uint,
+        gpu_instance: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
+    type NvmlGpuInstanceGetInfo = unsafe extern "C" fn(
+        gpu_instance: nvmlDevice_t,
+        info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t,
+    ) -> nvmlReturn_t;
+    type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn(
+        gpu_instance: nvmlDevice_t,
+        id: std::os::raw::c_uint,
+        compute_instance: *mut nvmlDevice_t,
+    ) -> nvmlReturn_t;
+    type NvmlComputeInstanceGetInfo = unsafe extern "C" fn(
+        compute_instance: nvmlDevice_t,
+        info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetUUID = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        uuid: *mut std::os::raw::c_char,
+        size: std::os::raw::c_uint,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t,
+    ) -> nvmlReturn_t;
+    type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn(
+        device: nvmlDevice_t,
+        error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t,
+        counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t,
+        ecc_count: *mut u64,
+    ) -> nvmlReturn_t;
+
+    let get_mig_mode: libloading::Symbol<NvmlDeviceGetMigMode> =
+        unsafe { lib.get(b"nvmlDeviceGetMigMode") }?;
+    let get_max_mig_device_count: libloading::Symbol<NvmlDeviceGetMaxMigDeviceCount> =
+        unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?;
+    let get_mig_device_handle_by_index: libloading::Symbol<NvmlDeviceGetMigDeviceHandleByIndex> =
+        unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?;
+    let get_device_handle_from_mig_device_handle: libloading::Symbol<
+        NvmlDeviceGetDeviceHandleFromMigDeviceHandle,
+    > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?;
+    let get_gpu_instance_id: libloading::Symbol<NvmlDeviceGetGpuInstanceId> =
+        unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?;
+    let get_compute_instance_id: libloading::Symbol<NvmlDeviceGetComputeInstanceId> =
+        unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
+    let get_gpu_instance_by_id: libloading::Symbol<NvmlGpuInstanceGetById> =
+        unsafe { lib.get(b"nvmlGpuInstanceGetById") }?;
+    let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
+        unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
+    let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
+        NvmlGpuInstanceGetComputeInstanceById,
+    > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?;
+    let get_compute_instance_info: libloading::Symbol<NvmlComputeInstanceGetInfo> =
+        unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?;
+    let get_uuid: libloading::Symbol<NvmlDeviceGetUUID> =
+        unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
+    let get_memory_info: libloading::Symbol<NvmlDeviceGetMemoryInfo> =
+        unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?;
+    let get_utilization_rates: libloading::Symbol<NvmlDeviceGetUtilizationRates> =
+        unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?;
+    let get_bar1_memory_info: libloading::Symbol<NvmlDeviceGetBar1MemoryInfo> =
+        unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?;
+    let get_total_ecc_errors: libloading::Symbol<NvmlDeviceGetTotalEccErrors> =
+        unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?;
+
+    let mut current_mode = 0;
+    let mut pending = 0;
     let parent_handle = unsafe { parent.handle() };
-    let mig_mode_res =
-        unsafe { nvmlDeviceGetMigMode(parent_handle, &mut current_mode, &mut pending) };
-    let supported = mig_mode_res == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS;
-    if !supported {
+    let mig_mode_res = unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) };
+    let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS;
+    let enabled =
+        current_mode == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE;
+
+    if !supported || !enabled {
         return Ok(MigTree {
-            supported: false,
-            enabled: false,
+            supported,
+            enabled,
             gpu_instances: Vec::new(),
             compute_instances: Vec::new(),
             devices: Vec::new(),
         });
     }
-    let enabled = current_mode == 1;
-    let mut max_count: c_uint = 0;
-    unsafe {
-        nvmlDeviceGetMaxMigDeviceCount(parent_handle, &mut max_count);
-    }
+
+    let mut max_count = 0;
+    unsafe { get_max_mig_device_count(parent_handle, &mut max_count) };
+
     let mut devices = Vec::new();
     let mut gi_map: HashMap<u32, GpuInstanceNode> = HashMap::new();
     let mut ci_nodes: Vec<ComputeInstanceNode> = Vec::new();
+
     for idx in 0..max_count {
-        let mut mig_handle = std::ptr::null_mut();
-        let res =
-            unsafe { nvmlDeviceGetMigDeviceHandleByIndex(parent_handle, idx, &mut mig_handle) };
-        if res != nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS {
-            continue;
-        }
-        // Obtain full device handle for MIG to use safe wrapper methods where possible.
-        let mut full_handle: *mut nvml_wrapper_sys::bindings::nvmlDevice_st = std::ptr::null_mut();
-        let _ =
-            unsafe { nvmlDeviceGetDeviceHandleFromMigDeviceHandle(mig_handle, &mut full_handle) };
-        let handle_to_use = if !full_handle.is_null() {
-            full_handle
-        } else {
-            mig_handle
-        };
-        let mig_device = unsafe { nvml_wrapper::Device::new(handle_to_use, nvml) };
-        let mig_uuid = mig_device.uuid().ok();
-        let mem_info = mig_device.memory_info().ok();
-        let util = mig_device.utilization_rates().ok();
-        let sm_count = None; // mig_device.multi_processor_count().ok();
-        let mut gi_id: c_uint = 0;
-        let mut ci_id: c_uint = 0;
-        let _ = unsafe { nvmlDeviceGetGpuInstanceId(mig_handle, &mut gi_id) };
-        let _ = unsafe { nvmlDeviceGetComputeInstanceId(mig_handle, &mut ci_id) };
-        // Populate GI info best-effort
-        if gi_id > 0 && !gi_map.contains_key(&gi_id) {
-            let mut gi_handle = std::ptr::null_mut();
-            if unsafe { nvmlDeviceGetGpuInstanceById(parent_handle, gi_id, &mut gi_handle) }
-                == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS
-            {
-                let mut gi_info: nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t =
-                    unsafe { std::mem::zeroed() };
-                let _ = unsafe { nvmlGpuInstanceGetInfo(gi_handle, &mut gi_info) };
-                let placement = Some(format!(
-                    "{}:slice{}",
-                    gi_info.placement.start, gi_info.placement.size
-                ));
-                gi_map.insert(
-                    gi_id,
-                    GpuInstanceNode {
-                        id: gi_id,
-                        profile_id: Some(gi_info.profileId),
-                        placement,
-                    },
-                );
-                if ci_id > 0 {
-                    let mut ci_handle = std::ptr::null_mut();
+        let mut mig_handle: nvmlDevice_t = std::ptr::null_mut();
+        if unsafe { get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle) }
+            == nvmlReturn_enum_NVML_SUCCESS
+        {
+            let mut full_handle: nvmlDevice_t = std::ptr::null_mut();
+            unsafe { get_device_handle_from_mig_device_handle(mig_handle, &mut full_handle) };
+
+            let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE
+            let _ = unsafe { get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32) };
+            let mig_uuid_str = unsafe { std::ffi::CStr::from_ptr(uuid_buf.as_ptr()) }
+                .to_string_lossy()
+                .into_owned();
+            let mig_uuid = if mig_uuid_str.is_empty() {
+                None
+            } else {
+                Some(mig_uuid_str.clone())
+            };
+
+            // Extract GI/CI to map hierarchy
+            let mut gi_id = 0;
+            let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) };
+            let mut ci_id = 0;
+            let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) };
+
+            // Populate GI info best-effort
+            if gi_id > 0 && !gi_map.contains_key(&gi_id) {
+                let mut gi_handle: nvmlDevice_t = std::ptr::null_mut();
+                if unsafe { get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle) }
+                    == nvmlReturn_enum_NVML_SUCCESS
+                {
+                    let mut gi_info: nvmlGpuInstanceInfo_t = unsafe { std::mem::zeroed() };
+                    gi_info.version = nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2;
+                    let _ = unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) };
+                    let placement = Some(format!(
+                        "{}:slice{}",
+                        gi_info.placement.start, gi_info.placement.size
+                    ));
+                    gi_map.insert(
+                        gi_id,
+                        GpuInstanceNode {
+                            id: gi_id,
+                            profile_id: Some(gi_info.profileId),
+                            placement,
+                            handle: gi_handle, // Store handle for later CI lookup
+                        },
+                    );
+                }
+            }
+
+            // Populate CI info best-effort
+            if ci_id > 0 {
+                if let Some(gi_node) = gi_map.get(&gi_id) {
+                    let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
                     if unsafe {
-                        nvmlGpuInstanceGetComputeInstanceById(gi_handle, ci_id, &mut ci_handle)
-                    } == nvml_wrapper_sys::bindings::nvmlReturn_enum_NVML_SUCCESS
+                        get_gpu_instance_compute_instance_by_id(gi_node.handle, ci_id, &mut ci_handle)
+                    } == nvmlReturn_enum_NVML_SUCCESS
                     {
-                        let mut ci_info: nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t =
-                            unsafe { std::mem::zeroed() };
-                        let _ = unsafe { nvmlComputeInstanceGetInfo(ci_handle, &mut ci_info) };
+                        let mut ci_info: nvmlComputeInstanceInfo_t = unsafe { std::mem::zeroed() };
+                        ci_info.version = nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2;
+                        let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) };
                         ci_nodes.push(ComputeInstanceNode {
                             gpu_instance_id: gi_id,
                             id: ci_id,
diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 4780ffa..bc5c189 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -63,14 +63,29 @@ pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExt
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.
     // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported.
     unsafe {
+        let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?;
+
+        type NvmlDeviceGetPcieStats = unsafe extern "C" fn(
+            device: nvmlDevice_t,
+            counter: u32,
+            value: *mut u32,
+        ) -> nvmlReturn_t;
+        type NvmlDeviceGetPcieReplayCounter = unsafe extern "C" fn(
+            device: nvmlDevice_t,
+            value: *mut u32,
+        ) -> nvmlReturn_t;
+
+        let get_pcie_stats: libloading::Symbol<NvmlDeviceGetPcieStats> = lib.get(b"nvmlDeviceGetPcieStats").map_err(|_| NvmlExtError::NotSupported)?;
+        let get_pcie_replay_counter: libloading::Symbol<NvmlDeviceGetPcieReplayCounter> = lib.get(b"nvmlDeviceGetPcieReplayCounter").map_err(|_| NvmlExtError::NotSupported)?;
+
         let mut corr: u32 = 0;
         let mut atomic: u32 = 0;
-        let corr_ret = nvmlDeviceGetPcieStats(
+        let corr_ret = get_pcie_stats(
             device,
             nvmlPcieUtilCounter_enum_NVML_PCIE_UTIL_TX_BYTES,
             &mut corr,
         );
-        let atomic_ret = nvmlDeviceGetPcieReplayCounter(device, &mut atomic);
+        let atomic_ret = get_pcie_replay_counter(device, &mut atomic);
         let mut out = PcieExt::default();
         if corr_ret == nvmlReturn_enum_NVML_SUCCESS {
             out.correctable_errors = Some(corr as u64);
@@ -100,13 +115,24 @@ pub fn nvswitch_ext_counters(_device: nvmlDevice_t) -> Result<NvSwitchExt, NvmlE
 pub unsafe fn get_field_values(
     device: nvmlDevice_t,
     field_ids: &[u32],
+) -> Result<FieldValues, NvmlExtError> {
 ) -> Result<FieldValues, NvmlExtError> {
     unsafe {
+        let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?;
+        
+        type NvmlDeviceGetFieldValues = unsafe extern "C" fn(
+            device: nvmlDevice_t,
+            valuesCount: u32,
+            values: *mut nvmlFieldValue_t,
+        ) -> nvmlReturn_t;
+
+        let get_field_values_fn: libloading::Symbol<NvmlDeviceGetFieldValues> = lib.get(b"nvmlDeviceGetFieldValues").map_err(|_| NvmlExtError::NotSupported)?;
+
         let mut fields: Vec<nvmlFieldValue_t> = vec![std::mem::zeroed(); field_ids.len()];
         for (i, f) in field_ids.iter().enumerate() {
             fields[i].fieldId = *f;
         }
-        let ret = nvmlDeviceGetFieldValues(device, fields.len() as u32, fields.as_mut_ptr());
+        let ret = get_field_values_fn(device, fields.len() as u32, fields.as_mut_ptr());
         if ret != nvmlReturn_enum_NVML_SUCCESS {
             return Err(NvmlExtError::NvmlReturn(ret as i32));
         }

From f34db9c4fc8ee6cf050ae1e0b1a9ee62c687c7fa Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 15:51:28 +0530
Subject: [PATCH 14/19] fix: remove duplicate line in nvml_ext.rs

---
 crates/agent-core/src/nvml_ext.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index bc5c189..888a8c8 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -116,7 +116,7 @@ pub unsafe fn get_field_values(
     device: nvmlDevice_t,
     field_ids: &[u32],
 ) -> Result<FieldValues, NvmlExtError> {
-) -> Result<FieldValues, NvmlExtError> {
+
     unsafe {
         let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?;
         

From f7ad42ddee0eea4bad4035861b0cf843995df1f5 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 16:07:47 +0530
Subject: [PATCH 15/19] style: fix remaining cargo fmt issues in gpu.rs

---
 crates/agent-core/src/collectors/gpu.rs | 217 ++++++++++++------------
 1 file changed, 104 insertions(+), 113 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index cde568d..7ad13d5 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -695,7 +695,8 @@ impl Collector for GpuCollector {
                                 device.pcie_link_gen(),
                                 device.pcie_link_width(),
                                 device.pcie_link_speed(),
-                            ) {   let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0;
+                            ) {
+                                let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0;
                                 let lane_budget_bytes =
                                     pcie_lane_bytes_per_sec(gen, speed) * (width as f64).max(1.0);
                                 if lane_budget_bytes > 0.0 {
@@ -842,7 +843,7 @@ impl Collector for GpuCollector {
                     {
                         use nvml_wrapper_sys::bindings::{
                             nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
-                            nvmlReturn_t, nvmlReturn_enum_NVML_SUCCESS,
+                            nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t,
                         };
                         // Load NVML dynamically to bypass missing symbols in sys crate
                         let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
@@ -852,74 +853,90 @@ impl Collector for GpuCollector {
                             device: nvmlDevice_t,
                             current_mode: *mut std::os::raw::c_uint,
                             pending_mode: *mut std::os::raw::c_uint,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             count: *mut std::os::raw::c_uint,
-                        ) -> nvmlReturn_t;
-                        type NvmlDeviceGetMigDeviceHandleByIndex = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            index: std::os::raw::c_uint,
-                            mig_device: *mut nvmlDevice_t,
-                        ) -> nvmlReturn_t;
-                        type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn(
-                            mig_device: nvmlDevice_t,
-                            device: *mut nvmlDevice_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
+                        type NvmlDeviceGetMigDeviceHandleByIndex =
+                            unsafe extern "C" fn(
+                                device: nvmlDevice_t,
+                                index: std::os::raw::c_uint,
+                                mig_device: *mut nvmlDevice_t,
+                            ) -> nvmlReturn_t;
+                        type NvmlDeviceGetDeviceHandleFromMigDeviceHandle =
+                            unsafe extern "C" fn(
+                                mig_device: nvmlDevice_t,
+                                device: *mut nvmlDevice_t,
+                            ) -> nvmlReturn_t;
                         type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             id: *mut std::os::raw::c_uint,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             id: *mut std::os::raw::c_uint,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             id: std::os::raw::c_uint,
                             gpu_instance: *mut nvmlDevice_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlGpuInstanceGetInfo = unsafe extern "C" fn(
                             gpu_instance: nvmlDevice_t,
                             info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t,
-                        ) -> nvmlReturn_t;
-                        type NvmlGpuInstanceGetComputeInstanceById = unsafe extern "C" fn(
-                            gpu_instance: nvmlDevice_t,
-                            id: std::os::raw::c_uint,
-                            compute_instance: *mut nvmlDevice_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
+                        type NvmlGpuInstanceGetComputeInstanceById =
+                            unsafe extern "C" fn(
+                                gpu_instance: nvmlDevice_t,
+                                id: std::os::raw::c_uint,
+                                compute_instance: *mut nvmlDevice_t,
+                            ) -> nvmlReturn_t;
                         type NvmlComputeInstanceGetInfo = unsafe extern "C" fn(
                             compute_instance: nvmlDevice_t,
                             info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetUUID = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             uuid: *mut std::os::raw::c_char,
                             size: std::os::raw::c_uint,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetEccMode = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
                             pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
                         type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn(
                             device: nvmlDevice_t,
                             error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t,
                             counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t,
                             ecc_count: *mut u64,
-                        ) -> nvmlReturn_t;
+                        )
+                            -> nvmlReturn_t;
 
                         let get_mig_mode: libloading::Symbol<NvmlDeviceGetMigMode> =
                             unsafe { lib.get(b"nvmlDeviceGetMigMode") }?;
@@ -937,8 +954,9 @@ impl Collector for GpuCollector {
                         let get_compute_instance_id: libloading::Symbol<
                             NvmlDeviceGetComputeInstanceId,
                         > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
-                        let get_gpu_instance_by_id: libloading::Symbol<NvmlDeviceGetGpuInstanceById> =
-                            unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name
+                        let get_gpu_instance_by_id: libloading::Symbol<
+                            NvmlDeviceGetGpuInstanceById,
+                        > = unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name
                         let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
                             unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
                         let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
@@ -951,8 +969,9 @@ impl Collector for GpuCollector {
                             unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
                         let get_memory_info: libloading::Symbol<NvmlDeviceGetMemoryInfo> =
                             unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?;
-                        let get_utilization_rates: libloading::Symbol<NvmlDeviceGetUtilizationRates> =
-                            unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?;
+                        let get_utilization_rates: libloading::Symbol<
+                            NvmlDeviceGetUtilizationRates,
+                        > = unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?;
                         let get_bar1_memory_info: libloading::Symbol<NvmlDeviceGetBar1MemoryInfo> =
                             unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?;
                         let get_total_ecc_errors: libloading::Symbol<NvmlDeviceGetTotalEccErrors> =
@@ -961,9 +980,8 @@ impl Collector for GpuCollector {
                         let mut current_mode = 0;
                         let mut pending = 0;
                         let parent_handle = unsafe { device.handle() };
-                        let mig_mode_res = unsafe {
-                            get_mig_mode(parent_handle, &mut current_mode, &mut pending)
-                        };
+                        let mig_mode_res =
+                            unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) };
                         let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS;
                         let enabled = current_mode
                             == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE;
@@ -1003,7 +1021,11 @@ impl Collector for GpuCollector {
 
                                 let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE
                                 let _ = unsafe {
-                                    get_uuid(mig_handle, uuid_buf.as_mut_ptr(), uuid_buf.len() as u32)
+                                    get_uuid(
+                                        mig_handle,
+                                        uuid_buf.as_mut_ptr(),
+                                        uuid_buf.len() as u32,
+                                    )
                                 };
                                 let mig_uuid_str = unsafe {
                                     std::ffi::CStr::from_ptr(uuid_buf.as_ptr())
@@ -1033,8 +1055,9 @@ impl Collector for GpuCollector {
                                             unsafe { std::mem::zeroed() };
                                         gi_info.version =
                                             nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2;
-                                        let _ =
-                                            unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) };
+                                        let _ = unsafe {
+                                            get_gpu_instance_info(gi_handle, &mut gi_info)
+                                        };
                                         let placement = Some(format!(
                                             "{}:slice{}",
                                             gi_info.placement.start, gi_info.placement.size
@@ -1069,7 +1092,10 @@ impl Collector for GpuCollector {
                                                 ci_info.version =
                                                     nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2;
                                                 let _ = unsafe {
-                                                    get_compute_instance_info(ci_handle, &mut ci_info)
+                                                    get_compute_instance_info(
+                                                        ci_handle,
+                                                        &mut ci_info,
+                                                    )
                                                 };
                                                 ci_nodes.push(ComputeInstanceNode {
                                                     gpu_instance_id: gi_id,
@@ -1078,7 +1104,8 @@ impl Collector for GpuCollector {
                                                     eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId
                                                     placement: Some(format!(
                                                         "{}:slice{}",
-                                                        ci_info.placement.start, ci_info.placement.size
+                                                        ci_info.placement.start,
+                                                        ci_info.placement.size
                                                     )),
                                                 });
                                             }
@@ -1127,18 +1154,16 @@ impl Collector for GpuCollector {
                                     unsafe { std::mem::zeroed() };
                                 let bar1_res =
                                     unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) };
-                                let bar1_total_bytes =
-                                    if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(bar1_info.total)
-                                    } else {
-                                        None
-                                    };
-                                let bar1_used_bytes =
-                                    if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(bar1_info.used)
-                                    } else {
-                                        None
-                                    };
+                                let bar1_total_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
+                                    Some(bar1_info.total)
+                                } else {
+                                    None
+                                };
+                                let bar1_used_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
+                                    Some(bar1_info.used)
+                                } else {
+                                    None
+                                };
 
                                 let mut ecc_corrected_val: u64 = 0;
                                 let ecc_corrected_res = unsafe {
@@ -1251,11 +1276,7 @@ impl Collector for GpuCollector {
                             if let Some(util) = mig.util_percent {
                                 metrics
                                     .mig_utilization_percent
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(util as f64);
                                 if self.k8s_mode {
                                     metrics
@@ -1271,11 +1292,7 @@ impl Collector for GpuCollector {
                             if let Some(total) = mig.memory_total_bytes {
                                 metrics
                                     .mig_memory_total_bytes
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(total as f64);
                                 if self.k8s_mode {
                                     metrics
@@ -1291,11 +1308,7 @@ impl Collector for GpuCollector {
                             if let Some(used) = mig.memory_used_bytes {
                                 metrics
                                     .mig_memory_used_bytes
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(used as f64);
                                 if self.k8s_mode {
                                     metrics
@@ -1311,11 +1324,7 @@ impl Collector for GpuCollector {
                             if let Some(sm) = mig.sm_count {
                                 metrics
                                     .mig_sm_count
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(sm as f64);
                                 if self.k8s_mode {
                                     metrics
@@ -1332,11 +1341,7 @@ impl Collector for GpuCollector {
                             if let Some(corrected) = mig.ecc_corrected {
                                 metrics
                                     .mig_ecc_corrected_total
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .inc_by(corrected);
                                 if self.k8s_mode {
                                     metrics
@@ -1352,11 +1357,7 @@ impl Collector for GpuCollector {
                             if let Some(uncorrected) = mig.ecc_uncorrected {
                                 metrics
                                     .mig_ecc_uncorrected_total
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .inc_by(uncorrected);
                                 if self.k8s_mode {
                                     metrics
@@ -1374,19 +1375,11 @@ impl Collector for GpuCollector {
                             {
                                 metrics
                                     .mig_bar1_total_bytes
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(total as f64);
                                 metrics
                                     .mig_bar1_used_bytes
-                                    .with_label_values(&[
-                                        uuid_label,
-                                        gpu_label.as_str(),
-                                        mig_label,
-                                    ])
+                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
                                     .set(used as f64);
                                 if self.k8s_mode {
                                     metrics
@@ -1571,11 +1564,11 @@ fn build_filter(raw: Option<&str>) -> Option<HashSet<String>> {
 
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
 fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<MigTree> {
-    use std::os::raw::c_uint;
     use nvml_wrapper_sys::bindings::{
-        nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t, nvmlReturn_t,
-        nvmlReturn_enum_NVML_SUCCESS,
+        nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
+        nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t,
     };
+    use std::os::raw::c_uint;
 
     // Load NVML dynamically to bypass missing symbols in sys crate
     let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
@@ -1595,19 +1588,14 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
         index: std::os::raw::c_uint,
         mig_device: *mut nvmlDevice_t,
     ) -> nvmlReturn_t;
-    type NvmlDeviceGetDeviceHandleFromMigDeviceHandle = unsafe extern "C" fn(
-        mig_device: nvmlDevice_t,
-        device: *mut nvmlDevice_t,
-    ) -> nvmlReturn_t;
-    type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn(
-        device: nvmlDevice_t,
-        id: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn(
-        device: nvmlDevice_t,
-        id: *mut std::os::raw::c_uint,
-    ) -> nvmlReturn_t;
-    type NvmlGpuInstanceGetById = unsafe extern "C" fn( // Corrected function name
+    type NvmlDeviceGetDeviceHandleFromMigDeviceHandle =
+        unsafe extern "C" fn(mig_device: nvmlDevice_t, device: *mut nvmlDevice_t) -> nvmlReturn_t;
+    type NvmlDeviceGetGpuInstanceId =
+        unsafe extern "C" fn(device: nvmlDevice_t, id: *mut std::os::raw::c_uint) -> nvmlReturn_t;
+    type NvmlDeviceGetComputeInstanceId =
+        unsafe extern "C" fn(device: nvmlDevice_t, id: *mut std::os::raw::c_uint) -> nvmlReturn_t;
+    type NvmlGpuInstanceGetById = unsafe extern "C" fn(
+        // Corrected function name
         device: nvmlDevice_t,
         id: std::os::raw::c_uint,
         gpu_instance: *mut nvmlDevice_t,
@@ -1663,7 +1651,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
     let get_compute_instance_id: libloading::Symbol<NvmlDeviceGetComputeInstanceId> =
         unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
     let get_gpu_instance_by_id: libloading::Symbol<NvmlGpuInstanceGetById> =
-        unsafe { lib.get(b"nvmlGpuInstanceGetById") }?;
+        unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name
     let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
         unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
     let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
@@ -1671,8 +1659,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
     > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?;
     let get_compute_instance_info: libloading::Symbol<NvmlComputeInstanceGetInfo> =
         unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?;
-    let get_uuid: libloading::Symbol<NvmlDeviceGetUUID> =
-        unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
+    let get_uuid: libloading::Symbol<NvmlDeviceGetUUID> = unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
     let get_memory_info: libloading::Symbol<NvmlDeviceGetMemoryInfo> =
         unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?;
     let get_utilization_rates: libloading::Symbol<NvmlDeviceGetUtilizationRates> =
@@ -1762,7 +1749,11 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
                 if let Some(gi_node) = gi_map.get(&gi_id) {
                     let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
                     if unsafe {
-                        get_gpu_instance_compute_instance_by_id(gi_node.handle, ci_id, &mut ci_handle)
+                        get_gpu_instance_compute_instance_by_id(
+                            gi_node.handle,
+                            ci_id,
+                            &mut ci_handle,
+                        )
                     } == nvmlReturn_enum_NVML_SUCCESS
                     {
                         let mut ci_info: nvmlComputeInstanceInfo_t = unsafe { std::mem::zeroed() };

From 1e8fa5b30f059134319674468f7d31aa562933a0 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 16:09:19 +0530
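Subject: [PATCH 16/19] feat: Add NVML FFI scaffolding for extended PCIe,
 NVSwitch, and event functionality, guarded by feature flags.

The nvml_ext.rs changes in this commit only re-wrap the existing dynamic
libloading lookups in pcie_ext_counters and get_field_values to satisfy the
formatter; runtime behavior is unchanged.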
---
 crates/agent-core/src/nvml_ext.rs | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 888a8c8..09a9abe 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -63,20 +63,23 @@ pub unsafe fn pcie_ext_counters(device: nvmlDevice_t) -> Result<PcieExt, NvmlExt
     // nvmlDeviceGetPcieReplayCounter is already available in wrapper; here we try best-effort extras.
     // As nvml-wrapper does not expose these, we attempt direct bindings when available; otherwise return NotSupported.
     unsafe {
-        let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?;
+        let lib = libloading::Library::new("libnvidia-ml.so.1")
+            .map_err(|_| NvmlExtError::NotSupported)?;
 
         type NvmlDeviceGetPcieStats = unsafe extern "C" fn(
             device: nvmlDevice_t,
             counter: u32,
             value: *mut u32,
         ) -> nvmlReturn_t;
-        type NvmlDeviceGetPcieReplayCounter = unsafe extern "C" fn(
-            device: nvmlDevice_t,
-            value: *mut u32,
-        ) -> nvmlReturn_t;
+        type NvmlDeviceGetPcieReplayCounter =
+            unsafe extern "C" fn(device: nvmlDevice_t, value: *mut u32) -> nvmlReturn_t;
 
-        let get_pcie_stats: libloading::Symbol<NvmlDeviceGetPcieStats> = lib.get(b"nvmlDeviceGetPcieStats").map_err(|_| NvmlExtError::NotSupported)?;
-        let get_pcie_replay_counter: libloading::Symbol<NvmlDeviceGetPcieReplayCounter> = lib.get(b"nvmlDeviceGetPcieReplayCounter").map_err(|_| NvmlExtError::NotSupported)?;
+        let get_pcie_stats: libloading::Symbol<NvmlDeviceGetPcieStats> = lib
+            .get(b"nvmlDeviceGetPcieStats")
+            .map_err(|_| NvmlExtError::NotSupported)?;
+        let get_pcie_replay_counter: libloading::Symbol<NvmlDeviceGetPcieReplayCounter> = lib
+            .get(b"nvmlDeviceGetPcieReplayCounter")
+            .map_err(|_| NvmlExtError::NotSupported)?;
 
         let mut corr: u32 = 0;
         let mut atomic: u32 = 0;
@@ -116,17 +119,19 @@ pub unsafe fn get_field_values(
     device: nvmlDevice_t,
     field_ids: &[u32],
 ) -> Result<FieldValues, NvmlExtError> {
-
     unsafe {
-        let lib = libloading::Library::new("libnvidia-ml.so.1").map_err(|_| NvmlExtError::NotSupported)?;
-        
+        let lib = libloading::Library::new("libnvidia-ml.so.1")
+            .map_err(|_| NvmlExtError::NotSupported)?;
+
         type NvmlDeviceGetFieldValues = unsafe extern "C" fn(
             device: nvmlDevice_t,
             valuesCount: u32,
             values: *mut nvmlFieldValue_t,
         ) -> nvmlReturn_t;
 
-        let get_field_values_fn: libloading::Symbol<NvmlDeviceGetFieldValues> = lib.get(b"nvmlDeviceGetFieldValues").map_err(|_| NvmlExtError::NotSupported)?;
+        let get_field_values_fn: libloading::Symbol<NvmlDeviceGetFieldValues> = lib
+            .get(b"nvmlDeviceGetFieldValues")
+            .map_err(|_| NvmlExtError::NotSupported)?;
 
         let mut fields: Vec<nvmlFieldValue_t> = vec![std::mem::zeroed(); field_ids.len()];
         for (i, f) in field_ids.iter().enumerate() {

From ddc560c695282854b402161156b8f9f350c77da0 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 16:28:05 +0530
Subject: [PATCH 17/19] fix(gpu): resolve compilation and formatting errors

---
 crates/agent-core/src/collectors/gpu.rs | 193 +++++++++++++++---------
 1 file changed, 123 insertions(+), 70 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 7ad13d5..79fda53 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -8,7 +8,7 @@ use nvml_wrapper::{
     bitmasks::nv_link::PacketTypes,
     enum_wrappers::device::{Clock, EccCounter, MemoryError, PcieUtilCounter, TemperatureSensor},
     enum_wrappers::nv_link::{ErrorCounter as NvLinkErrorCounter, UtilizationCountUnit},
-    enums::device::PcieLinkMaxSpeed,
+    // enums::device::PcieLinkMaxSpeed, // Unused
     enums::nv_link::Counter as NvLinkCounter,
     struct_wrappers::nv_link::UtilizationControl,
     Nvml,
@@ -38,7 +38,7 @@ use crate::state::{
 #[cfg(all(feature = "gpu", target_os = "linux"))]
 use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-use nvml_wrapper_sys::bindings::*;
+// use nvml_wrapper_sys::bindings::*; // Unused, dynamic loading used instead
 
 pub struct GpuCollector {
     #[cfg(feature = "gpu")]
@@ -467,13 +467,13 @@ impl Collector for GpuCollector {
                     metrics
                         .gpu_bar1_total_bytes
                         .with_label_values(&[uuid_label, gpu_label.as_str()])
-                        .set(bar1.total as f64);
+                        .set(bar1.bar1Total as f64);
                     metrics
                         .gpu_bar1_used_bytes
                         .with_label_values(&[uuid_label, gpu_label.as_str()])
-                        .set(bar1.used as f64);
-                    health.bar1_total_bytes = Some(bar1.total);
-                    health.bar1_used_bytes = Some(bar1.used);
+                        .set(bar1.bar1Used as f64);
+                    health.bar1_total_bytes = Some(bar1.bar1Total);
+                    health.bar1_used_bytes = Some(bar1.bar1Used);
                 }
                 if let Ok(enc_info) = device.encoder_utilization() {
                     metrics
@@ -692,8 +692,8 @@ impl Collector for GpuCollector {
                             // Note: pcie_link_speed returns the current link speed, not max.
                             // If semantics require max, we might need a different call, but for now matching the existing pattern.
                             if let (Ok(gen), Ok(width), Ok(speed)) = (
-                                device.pcie_link_gen(),
-                                device.pcie_link_width(),
+                                device.max_pcie_link_gen(),
+                                device.max_pcie_link_width(),
                                 device.pcie_link_speed(),
                             ) {
                                 let bytes_per_s = ((tx_kb + rx_kb) as f64) * 1024.0;
@@ -1512,7 +1512,7 @@ fn k8s_resource_name(prefix: &str, mig_profile: Option<&str>) -> String {
 }
 
 #[cfg(feature = "gpu")]
-fn pcie_lane_bytes_per_sec(gen: u32, speed_mt_s: u32) -> f64 {
+fn pcie_lane_bytes_per_sec(_gen: u32, speed_mt_s: u32) -> f64 {
     // PCIe generation to base speed in MT/s per lane
     // Gen1: 2.5 GT/s, Gen2: 5 GT/s, Gen3: 8 GT/s, Gen4: 16 GT/s, Gen5: 32 GT/s, Gen6: 64 GT/s
     // Data rate is typically 8/10 encoding for Gen1/2, 128/130 for Gen3+
@@ -1568,7 +1568,6 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
         nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
         nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t,
     };
-    use std::os::raw::c_uint;
 
     // Load NVML dynamically to bypass missing symbols in sys crate
     let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
@@ -1595,7 +1594,6 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
     type NvmlDeviceGetComputeInstanceId =
         unsafe extern "C" fn(device: nvmlDevice_t, id: *mut std::os::raw::c_uint) -> nvmlReturn_t;
     type NvmlGpuInstanceGetById = unsafe extern "C" fn(
-        // Corrected function name
         device: nvmlDevice_t,
         id: std::os::raw::c_uint,
         gpu_instance: *mut nvmlDevice_t,
@@ -1651,7 +1649,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
     let get_compute_instance_id: libloading::Symbol<NvmlDeviceGetComputeInstanceId> =
         unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
     let get_gpu_instance_by_id: libloading::Symbol<NvmlGpuInstanceGetById> =
-        unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name
+        unsafe { lib.get(b"nvmlGpuInstanceGetById") }?;
     let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
         unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
     let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
@@ -1674,8 +1672,8 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
     let parent_handle = unsafe { parent.handle() };
     let mig_mode_res = unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) };
     let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS;
-    let enabled =
-        current_mode == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE;
+    // NVML_DEVICE_MIG_ENABLE is 1
+    let enabled = current_mode == 1;
 
     if !supported || !enabled {
         return Ok(MigTree {
@@ -1692,6 +1690,7 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
 
     let mut devices = Vec::new();
     let mut gi_map: HashMap<u32, GpuInstanceNode> = HashMap::new();
+    let mut gi_handles: HashMap<u32, nvmlDevice_t> = HashMap::new();
     let mut ci_nodes: Vec<ComputeInstanceNode> = Vec::new();
 
     for idx in 0..max_count {
@@ -1725,8 +1724,10 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
                 if unsafe { get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle) }
                     == nvmlReturn_enum_NVML_SUCCESS
                 {
+                    gi_handles.insert(gi_id, gi_handle);
+
                     let mut gi_info: nvmlGpuInstanceInfo_t = unsafe { std::mem::zeroed() };
-                    gi_info.version = nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2;
+                    // gi_info.version = ...; // Skip version if unavailable, rely on zeroed/default
                     let _ = unsafe { get_gpu_instance_info(gi_handle, &mut gi_info) };
                     let placement = Some(format!(
                         "{}:slice{}",
@@ -1738,7 +1739,6 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
                             id: gi_id,
                             profile_id: Some(gi_info.profileId),
                             placement,
-                            handle: gi_handle, // Store handle for later CI lookup
                         },
                     );
                 }
@@ -1746,64 +1746,117 @@ fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<Mig
 
             // Populate CI info best-effort
             if ci_id > 0 {
-                if let Some(gi_node) = gi_map.get(&gi_id) {
-                    let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
-                    if unsafe {
-                        get_gpu_instance_compute_instance_by_id(
-                            gi_node.handle,
-                            ci_id,
-                            &mut ci_handle,
-                        )
-                    } == nvmlReturn_enum_NVML_SUCCESS
-                    {
-                        let mut ci_info: nvmlComputeInstanceInfo_t = unsafe { std::mem::zeroed() };
-                        ci_info.version = nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2;
-                        let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) };
-                        ci_nodes.push(ComputeInstanceNode {
-                            gpu_instance_id: gi_id,
-                            id: ci_id,
-                            profile_id: Some(ci_info.profileId),
-                            eng_profile_id: None, // Some(ci_info.engineProfile),
-                            placement: Some(format!(
-                                "{}:slice{}",
-                                ci_info.placement.start, ci_info.placement.size
-                            )),
-                        });
+                // Check if we haven't added this CI yet (simple check by iteration or similar, but here we just push)
+                // To avoid stats duplication, we rely on the fact that we iterate MIG devices.
+                // However, one CI might be shared? No, MIG device <-> CI is 1:1 usually?
+                // Actually 1 GI can have multiple CIs. 1 CI can have multiple MIG devices?
+                // In MIG, a "MIG Device" is conceptually a CI.
+                // We'll just push CI nodes as we encounter them. Ideally distinct.
+                // But `ci_nodes` is for the tree structure.
+                // Let's check uniqueness.
+                let known = ci_nodes
+                    .iter()
+                    .any(|c| c.gpu_instance_id == gi_id && c.id == ci_id);
+
+                if !known {
+                    if let Some(&gi_handle) = gi_handles.get(&gi_id) {
+                        let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
+                        if unsafe {
+                            get_gpu_instance_compute_instance_by_id(
+                                gi_handle,
+                                ci_id,
+                                &mut ci_handle,
+                            )
+                        } == nvmlReturn_enum_NVML_SUCCESS
+                        {
+                            let mut ci_info: nvmlComputeInstanceInfo_t =
+                                unsafe { std::mem::zeroed() };
+                            // ci_info.version = ...; // Skip version
+                            let _ = unsafe { get_compute_instance_info(ci_handle, &mut ci_info) };
+                            ci_nodes.push(ComputeInstanceNode {
+                                gpu_instance_id: gi_id,
+                                id: ci_id,
+                                profile_id: Some(ci_info.profileId),
+                                eng_profile_id: None,
+                                placement: Some(format!(
+                                    "{}:slice{}",
+                                    ci_info.placement.start, ci_info.placement.size
+                                )),
+                            });
+                        }
                     }
                 }
             }
+
+            // Metrics
+            let mut mem_info: Option<nvml_wrapper_sys::bindings::nvmlMemory_t> = None;
+            let mut util_gpu: Option<u32> = None;
+            let mut ecc_cor: Option<u64> = None;
+            let mut ecc_uncor: Option<u64> = None;
+            let mut bar1: Option<nvml_wrapper_sys::bindings::nvmlBAR1Memory_t> = None;
+
+            {
+                let mut m = unsafe { std::mem::zeroed() };
+                if unsafe { get_memory_info(mig_handle, &mut m) } == nvmlReturn_enum_NVML_SUCCESS {
+                    mem_info = Some(m);
+                }
+
+                let mut u = unsafe { std::mem::zeroed() };
+                if unsafe { get_utilization_rates(mig_handle, &mut u) }
+                    == nvmlReturn_enum_NVML_SUCCESS
+                {
+                    util_gpu = Some(u.gpu);
+                }
+
+                let mut b = unsafe { std::mem::zeroed() };
+                if unsafe { get_bar1_memory_info(mig_handle, &mut b) }
+                    == nvmlReturn_enum_NVML_SUCCESS
+                {
+                    bar1 = Some(b);
+                }
+
+                let mut c_count: u64 = 0;
+                let mut u_count: u64 = 0;
+                // NVML_ECC_COUNTER_TYPE_VOLATILE = 0
+                // NVML_MEMORY_ERROR_TYPE_CORRECTED = 1
+                // NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 2
+                if unsafe { get_total_ecc_errors(mig_handle, 1, 0, &mut c_count) }
+                    == nvmlReturn_enum_NVML_SUCCESS
+                {
+                    ecc_cor = Some(c_count);
+                }
+                if unsafe { get_total_ecc_errors(mig_handle, 2, 0, &mut u_count) }
+                    == nvmlReturn_enum_NVML_SUCCESS
+                {
+                    ecc_uncor = Some(u_count);
+                }
+            }
+
+            let mig_id = format!("mig{}", idx);
+            let placement_str = gi_map
+                .get(&gi_id)
+                .and_then(|g| g.placement.clone())
+                .unwrap_or_else(|| format!("gi{}", gi_id));
+            let profile_str = gi_map
+                .get(&gi_id)
+                .and_then(|g| g.profile_id)
+                .map(|p| p.to_string());
+
+            devices.push(MigDeviceStatus {
+                id: mig_uuid.clone().unwrap_or(mig_id.clone()),
+                uuid: mig_uuid,
+                memory_total_bytes: mem_info.as_ref().map(|m| m.total),
+                memory_used_bytes: mem_info.map(|m| m.used),
+                util_percent: util_gpu,
+                sm_count: None, // Not retrieving SM count for now
+                profile: profile_str,
+                placement: Some(placement_str),
+                bar1_total_bytes: bar1.as_ref().map(|b| b.bar1Total),
+                bar1_used_bytes: bar1.map(|b| b.bar1Used),
+                ecc_corrected: ecc_cor,
+                ecc_uncorrected: ecc_uncor,
+            });
         }
-        let mig_id = format!("mig{}", idx);
-        let placement_str = gi_map
-            .get(&gi_id)
-            .and_then(|g| g.placement.clone())
-            .unwrap_or_else(|| format!("gi{}", gi_id));
-        let profile_str = gi_map
-            .get(&gi_id)
-            .and_then(|g| g.profile_id)
-            .map(|p| p.to_string());
-        let ecc_corrected = mig_device
-            .total_ecc_errors(MemoryError::Corrected, EccCounter::Volatile)
-            .ok();
-        let ecc_uncorrected = mig_device
-            .total_ecc_errors(MemoryError::Uncorrected, EccCounter::Volatile)
-            .ok();
-        let bar1_info = mig_device.bar1_memory_info().ok();
-
-        devices.push(MigDeviceStatus {
-            id: mig_uuid.clone().unwrap_or(mig_id.clone()),
-            uuid: mig_uuid,
-            memory_total_bytes: mem_info.as_ref().map(|m| m.total),
-            memory_used_bytes: mem_info.map(|m| m.used),
-            util_percent: util.map(|u| u.gpu),
-            sm_count,
-            profile: profile_str,
-            placement: Some(placement_str),
-            bar1_total_bytes: bar1_info.as_ref().map(|b| b.total),
-            bar1_used_bytes: bar1_info.map(|b| b.used),
-            ecc_corrected,
-            ecc_uncorrected,
-        });
     }
 
     Ok(MigTree {

From a7e69213af2216c99a4f52a81709e8ef48741c24 Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 16:58:44 +0530
Subject: [PATCH 18/19] Apply strict formatting fixes to gpu.rs

---
 crates/agent-core/src/collectors/gpu.rs | 680 ++++++------------------
 1 file changed, 166 insertions(+), 514 deletions(-)

diff --git a/crates/agent-core/src/collectors/gpu.rs b/crates/agent-core/src/collectors/gpu.rs
index 79fda53..d4f07c2 100644
--- a/crates/agent-core/src/collectors/gpu.rs
+++ b/crates/agent-core/src/collectors/gpu.rs
@@ -38,8 +38,6 @@ use crate::state::{
 #[cfg(all(feature = "gpu", target_os = "linux"))]
 use nvml_wrapper::error::NvmlError;
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-// use nvml_wrapper_sys::bindings::*; // Unused, dynamic loading used instead
-
 pub struct GpuCollector {
     #[cfg(feature = "gpu")]
     nvml: Option<Nvml>,
@@ -467,13 +465,13 @@ impl Collector for GpuCollector {
                     metrics
                         .gpu_bar1_total_bytes
                         .with_label_values(&[uuid_label, gpu_label.as_str()])
-                        .set(bar1.bar1Total as f64);
+                        .set(bar1.total as f64);
                     metrics
                         .gpu_bar1_used_bytes
                         .with_label_values(&[uuid_label, gpu_label.as_str()])
-                        .set(bar1.bar1Used as f64);
-                    health.bar1_total_bytes = Some(bar1.bar1Total);
-                    health.bar1_used_bytes = Some(bar1.bar1Used);
+                        .set(bar1.used as f64);
+                    health.bar1_total_bytes = Some(bar1.total);
+                    health.bar1_used_bytes = Some(bar1.used);
                 }
                 if let Ok(enc_info) = device.encoder_utilization() {
                     metrics
@@ -841,553 +839,189 @@ impl Collector for GpuCollector {
                 if self.enable_mig {
                     #[cfg(all(feature = "gpu-nvml-ffi", feature = "gpu"))]
                     {
-                        use nvml_wrapper_sys::bindings::{
-                            nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
-                            nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t,
-                        };
-                        // Load NVML dynamically to bypass missing symbols in sys crate
-                        let lib = unsafe { libloading::Library::new("libnvidia-ml.so.1") }?;
-
-                        // Typedefs for the functions we need
-                        type NvmlDeviceGetMigMode = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            current_mode: *mut std::os::raw::c_uint,
-                            pending_mode: *mut std::os::raw::c_uint,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetMaxMigDeviceCount = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            count: *mut std::os::raw::c_uint,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetMigDeviceHandleByIndex =
-                            unsafe extern "C" fn(
-                                device: nvmlDevice_t,
-                                index: std::os::raw::c_uint,
-                                mig_device: *mut nvmlDevice_t,
-                            ) -> nvmlReturn_t;
-                        type NvmlDeviceGetDeviceHandleFromMigDeviceHandle =
-                            unsafe extern "C" fn(
-                                mig_device: nvmlDevice_t,
-                                device: *mut nvmlDevice_t,
-                            ) -> nvmlReturn_t;
-                        type NvmlDeviceGetGpuInstanceId = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            id: *mut std::os::raw::c_uint,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetComputeInstanceId = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            id: *mut std::os::raw::c_uint,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetGpuInstanceById = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            id: std::os::raw::c_uint,
-                            gpu_instance: *mut nvmlDevice_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlGpuInstanceGetInfo = unsafe extern "C" fn(
-                            gpu_instance: nvmlDevice_t,
-                            info: *mut nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlGpuInstanceGetComputeInstanceById =
-                            unsafe extern "C" fn(
-                                gpu_instance: nvmlDevice_t,
-                                id: std::os::raw::c_uint,
-                                compute_instance: *mut nvmlDevice_t,
-                            ) -> nvmlReturn_t;
-                        type NvmlComputeInstanceGetInfo = unsafe extern "C" fn(
-                            compute_instance: nvmlDevice_t,
-                            info: *mut nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetUUID = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            uuid: *mut std::os::raw::c_char,
-                            size: std::os::raw::c_uint,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetMemoryInfo = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            memory: *mut nvml_wrapper_sys::bindings::nvmlMemory_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetUtilizationRates = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            utilization: *mut nvml_wrapper_sys::bindings::nvmlUtilization_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetBar1MemoryInfo = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            bar1_memory: *mut nvml_wrapper_sys::bindings::nvmlBAR1Memory_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetEccMode = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            current_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
-                            pending_mode: *mut nvml_wrapper_sys::bindings::nvmlEnableState_t,
-                        )
-                            -> nvmlReturn_t;
-                        type NvmlDeviceGetTotalEccErrors = unsafe extern "C" fn(
-                            device: nvmlDevice_t,
-                            error_type: nvml_wrapper_sys::bindings::nvmlMemoryErrorType_t,
-                            counter_type: nvml_wrapper_sys::bindings::nvmlEccCounterType_t,
-                            ecc_count: *mut u64,
-                        )
-                            -> nvmlReturn_t;
-
-                        let get_mig_mode: libloading::Symbol<NvmlDeviceGetMigMode> =
-                            unsafe { lib.get(b"nvmlDeviceGetMigMode") }?;
-                        let get_max_mig_device_count: libloading::Symbol<
-                            NvmlDeviceGetMaxMigDeviceCount,
-                        > = unsafe { lib.get(b"nvmlDeviceGetMaxMigDeviceCount") }?;
-                        let get_mig_device_handle_by_index: libloading::Symbol<
-                            NvmlDeviceGetMigDeviceHandleByIndex,
-                        > = unsafe { lib.get(b"nvmlDeviceGetMigDeviceHandleByIndex") }?;
-                        let get_device_handle_from_mig_device_handle: libloading::Symbol<
-                            NvmlDeviceGetDeviceHandleFromMigDeviceHandle,
-                        > = unsafe { lib.get(b"nvmlDeviceGetDeviceHandleFromMigDeviceHandle") }?;
-                        let get_gpu_instance_id: libloading::Symbol<NvmlDeviceGetGpuInstanceId> =
-                            unsafe { lib.get(b"nvmlDeviceGetGpuInstanceId") }?;
-                        let get_compute_instance_id: libloading::Symbol<
-                            NvmlDeviceGetComputeInstanceId,
-                        > = unsafe { lib.get(b"nvmlDeviceGetComputeInstanceId") }?;
-                        let get_gpu_instance_by_id: libloading::Symbol<
-                            NvmlDeviceGetGpuInstanceById,
-                        > = unsafe { lib.get(b"nvmlGpuInstanceGetById") }?; // Corrected function name
-                        let get_gpu_instance_info: libloading::Symbol<NvmlGpuInstanceGetInfo> =
-                            unsafe { lib.get(b"nvmlGpuInstanceGetInfo") }?;
-                        let get_gpu_instance_compute_instance_by_id: libloading::Symbol<
-                            NvmlGpuInstanceGetComputeInstanceById,
-                        > = unsafe { lib.get(b"nvmlGpuInstanceGetComputeInstanceById") }?;
-                        let get_compute_instance_info: libloading::Symbol<
-                            NvmlComputeInstanceGetInfo,
-                        > = unsafe { lib.get(b"nvmlComputeInstanceGetInfo") }?;
-                        let get_uuid: libloading::Symbol<NvmlDeviceGetUUID> =
-                            unsafe { lib.get(b"nvmlDeviceGetUUID") }?;
-                        let get_memory_info: libloading::Symbol<NvmlDeviceGetMemoryInfo> =
-                            unsafe { lib.get(b"nvmlDeviceGetMemoryInfo") }?;
-                        let get_utilization_rates: libloading::Symbol<
-                            NvmlDeviceGetUtilizationRates,
-                        > = unsafe { lib.get(b"nvmlDeviceGetUtilizationRates") }?;
-                        let get_bar1_memory_info: libloading::Symbol<NvmlDeviceGetBar1MemoryInfo> =
-                            unsafe { lib.get(b"nvmlDeviceGetBar1MemoryInfo") }?;
-                        let get_total_ecc_errors: libloading::Symbol<NvmlDeviceGetTotalEccErrors> =
-                            unsafe { lib.get(b"nvmlDeviceGetTotalEccErrors") }?;
-
-                        let mut current_mode = 0;
-                        let mut pending = 0;
-                        let parent_handle = unsafe { device.handle() };
-                        let mig_mode_res =
-                            unsafe { get_mig_mode(parent_handle, &mut current_mode, &mut pending) };
-                        let supported = mig_mode_res == nvmlReturn_enum_NVML_SUCCESS;
-                        let enabled = current_mode
-                            == nvml_wrapper_sys::bindings::nvmlMigMode_enum_NVML_DEVICE_MIG_ENABLE;
-
-                        if !supported || !enabled {
-                            // If MIG is not supported or not enabled, return early with appropriate status
-                            return Ok(MigTree {
-                                supported,
-                                enabled,
-                                gpu_instances: Vec::new(),
-                                compute_instances: Vec::new(),
-                                devices: Vec::new(),
-                            });
-                        }
-
-                        let mut max_count = 0;
-                        unsafe { get_max_mig_device_count(parent_handle, &mut max_count) };
-
-                        let mut devices = Vec::new();
-                        let mut gi_map: HashMap<u32, GpuInstanceNode> = HashMap::new();
-                        let mut gi_handles: HashMap<u32, nvmlDevice_t> = HashMap::new();
-                        let mut ci_nodes: Vec<ComputeInstanceNode> = Vec::new();
-
-                        for idx in 0..max_count {
-                            let mut mig_handle: nvmlDevice_t = std::ptr::null_mut();
-                            if unsafe {
-                                get_mig_device_handle_by_index(parent_handle, idx, &mut mig_handle)
-                            } == nvmlReturn_enum_NVML_SUCCESS
-                            {
-                                let mut full_handle: nvmlDevice_t = std::ptr::null_mut();
-                                unsafe {
-                                    get_device_handle_from_mig_device_handle(
-                                        mig_handle,
-                                        &mut full_handle,
-                                    )
-                                };
-
-                                let mut uuid_buf = [0i8; 96]; // NVML_DEVICE_UUID_V2_BUFFER_SIZE
-                                let _ = unsafe {
-                                    get_uuid(
-                                        mig_handle,
-                                        uuid_buf.as_mut_ptr(),
-                                        uuid_buf.len() as u32,
+                        if let Ok(migs) = collect_mig_devices(nvml, &device) {
+                            metrics
+                                .gpu_mig_enabled
+                                .with_label_values(&[uuid_label, gpu_label.as_str()])
+                                .set(if migs.enabled { 1.0 } else { 0.0 });
+                            // GI/CI info gauges
+                            for gi in &migs.gpu_instances {
+                                metrics
+                                    .mig_gpu_instance_info
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        gi.id.to_string().as_str(),
+                                        gi.profile_id
+                                            .map(|p| p.to_string())
+                                            .unwrap_or_default()
+                                            .as_str(),
+                                        gi.placement.as_deref().unwrap_or(""),
+                                    ])
+                                    .set(1.0);
+                            }
+                            for ci in &migs.compute_instances {
+                                metrics
+                                    .mig_compute_instance_info
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        ci.gpu_instance_id.to_string().as_str(),
+                                        ci.id.to_string().as_str(),
+                                        ci.profile_id
+                                            .map(|p| p.to_string())
+                                            .unwrap_or_default()
+                                            .as_str(),
+                                        ci.eng_profile_id
+                                            .map(|p| p.to_string())
+                                            .unwrap_or_default()
+                                            .as_str(),
+                                        ci.placement.as_deref().unwrap_or(""),
+                                    ])
+                                    .set(1.0);
+                            }
+                            for mig in &migs.devices {
+                                let mig_id_string = mig.id.to_string();
+                                let mig_label =
+                                    mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
+                                let compat_label = if self.k8s_mode {
+                                    k8s_resource_name(
+                                        self.resource_prefix,
+                                        mig.profile.as_deref().or(Some("generic")),
                                     )
-                                };
-                                let mig_uuid_str = unsafe {
-                                    std::ffi::CStr::from_ptr(uuid_buf.as_ptr())
-                                        .to_string_lossy()
-                                        .into_owned()
-                                };
-                                let mig_uuid = if mig_uuid_str.is_empty() {
-                                    None
                                 } else {
-                                    Some(mig_uuid_str.clone())
+                                    mig_label.to_string()
                                 };
-
-                                // Extract GI/CI to map hierarchy
-                                let mut gi_id = 0;
-                                let _ = unsafe { get_gpu_instance_id(mig_handle, &mut gi_id) };
-                                let mut ci_id = 0;
-                                let _ = unsafe { get_compute_instance_id(mig_handle, &mut ci_id) };
-
-                                // Populate GI info best-effort
-                                if gi_id > 0 && !gi_map.contains_key(&gi_id) {
-                                    let mut gi_handle: nvmlDevice_t = std::ptr::null_mut();
-                                    if unsafe {
-                                        get_gpu_instance_by_id(parent_handle, gi_id, &mut gi_handle)
-                                    } == nvmlReturn_enum_NVML_SUCCESS
-                                    {
-                                        let mut gi_info: nvmlGpuInstanceInfo_t =
-                                            unsafe { std::mem::zeroed() };
-                                        gi_info.version =
-                                            nvml_wrapper_sys::bindings::nvmlGpuInstanceInfo_v2;
-                                        let _ = unsafe {
-                                            get_gpu_instance_info(gi_handle, &mut gi_info)
-                                        };
-                                        let placement = Some(format!(
-                                            "{}:slice{}",
-                                            gi_info.placement.start, gi_info.placement.size
-                                        ));
-                                        gi_map.insert(
-                                            gi_id,
-                                            GpuInstanceNode {
-                                                id: gi_id,
-                                                profile_id: Some(gi_info.profileId),
-                                                placement,
-                                            },
-                                        );
-                                        gi_handles.insert(gi_id, gi_handle);
-                                    }
-                                }
-
-                                // Populate CI info best-effort
-                                if ci_id > 0 {
-                                    if let Some(_gi_node) = gi_map.get(&gi_id) {
-                                        if let Some(&gi_handle) = gi_handles.get(&gi_id) {
-                                            let mut ci_handle: nvmlDevice_t = std::ptr::null_mut();
-                                            if unsafe {
-                                                get_gpu_instance_compute_instance_by_id(
-                                                    gi_handle, // Assuming GpuInstanceNode stores the handle
-                                                    ci_id,
-                                                    &mut ci_handle,
-                                                )
-                                            } == nvmlReturn_enum_NVML_SUCCESS
-                                            {
-                                                let mut ci_info: nvmlComputeInstanceInfo_t =
-                                                    unsafe { std::mem::zeroed() };
-                                                ci_info.version =
-                                                    nvml_wrapper_sys::bindings::nvmlComputeInstanceInfo_v2;
-                                                let _ = unsafe {
-                                                    get_compute_instance_info(
-                                                        ci_handle,
-                                                        &mut ci_info,
-                                                    )
-                                                };
-                                                ci_nodes.push(ComputeInstanceNode {
-                                                    gpu_instance_id: gi_id,
-                                                    id: ci_id,
-                                                    profile_id: Some(ci_info.profileId),
-                                                    eng_profile_id: None, // nvmlComputeInstanceInfo_t_v2 does not have engineProfileId
-                                                    placement: Some(format!(
-                                                        "{}:slice{}",
-                                                        ci_info.placement.start,
-                                                        ci_info.placement.size
-                                                    )),
-                                                });
-                                            }
-                                        }
-                                    }
-                                }
-
-                                let mig_id_label = format!("mig{}", idx);
-                                let placement_str = gi_map
-                                    .get(&gi_id)
-                                    .and_then(|g| g.placement.clone())
-                                    .unwrap_or_else(|| format!("gi{}", gi_id));
-                                let profile_str = gi_map
-                                    .get(&gi_id)
-                                    .and_then(|g| g.profile_id)
-                                    .map(|p| p.to_string());
-
-                                let mut mem_info: nvml_wrapper_sys::bindings::nvmlMemory_t =
-                                    unsafe { std::mem::zeroed() };
-                                let mem_info_res =
-                                    unsafe { get_memory_info(mig_handle, &mut mem_info) };
-                                let memory_total_bytes =
-                                    if mem_info_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(mem_info.total)
-                                    } else {
-                                        None
-                                    };
-                                let memory_used_bytes =
-                                    if mem_info_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(mem_info.used)
-                                    } else {
-                                        None
-                                    };
-
-                                let mut util_rates: nvml_wrapper_sys::bindings::nvmlUtilization_t =
-                                    unsafe { std::mem::zeroed() };
-                                let util_res =
-                                    unsafe { get_utilization_rates(mig_handle, &mut util_rates) };
-                                let util_percent = if util_res == nvmlReturn_enum_NVML_SUCCESS {
-                                    Some(util_rates.gpu)
-                                } else {
-                                    None
-                                };
-
-                                let mut bar1_info: nvml_wrapper_sys::bindings::nvmlBAR1Memory_t =
-                                    unsafe { std::mem::zeroed() };
-                                let bar1_res =
-                                    unsafe { get_bar1_memory_info(mig_handle, &mut bar1_info) };
-                                let bar1_total_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
-                                    Some(bar1_info.total)
-                                } else {
-                                    None
-                                };
-                                let bar1_used_bytes = if bar1_res == nvmlReturn_enum_NVML_SUCCESS {
-                                    Some(bar1_info.used)
-                                } else {
-                                    None
-                                };
-
-                                let mut ecc_corrected_val: u64 = 0;
-                                let ecc_corrected_res = unsafe {
-                                    get_total_ecc_errors(
-                                        mig_handle,
-                                        nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_CORRECTED,
-                                        nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE,
-                                        &mut ecc_corrected_val,
-                                    )
-                                };
-                                let ecc_corrected =
-                                    if ecc_corrected_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(ecc_corrected_val)
-                                    } else {
-                                        None
-                                    };
-
-                                let mut ecc_uncorrected_val: u64 = 0;
-                                let ecc_uncorrected_res = unsafe {
-                                    get_total_ecc_errors(
-                                        mig_handle,
-                                        nvml_wrapper_sys::bindings::nvmlMemoryErrorType_enum_NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
-                                        nvml_wrapper_sys::bindings::nvmlEccCounterType_enum_NVML_ECC_COUNTER_TYPE_VOLATILE,
-                                        &mut ecc_uncorrected_val,
-                                    )
-                                };
-                                let ecc_uncorrected =
-                                    if ecc_uncorrected_res == nvmlReturn_enum_NVML_SUCCESS {
-                                        Some(ecc_uncorrected_val)
-                                    } else {
-                                        None
-                                    };
-
-                                devices.push(MigDeviceStatus {
-                                    id: mig_uuid.clone().unwrap_or(mig_id_label.clone()),
-                                    uuid: mig_uuid,
-                                    memory_total_bytes,
-                                    memory_used_bytes,
-                                    util_percent,
-                                    sm_count: None, // Not directly available via NVML MIG device handle
-                                    profile: profile_str,
-                                    placement: Some(placement_str),
-                                    bar1_total_bytes,
-                                    bar1_used_bytes,
-                                    ecc_corrected,
-                                    ecc_uncorrected,
-                                });
-                            }
-                        }
-
-                        let migs = MigTree {
-                            supported,
-                            enabled,
-                            gpu_instances: gi_map.values().cloned().collect(),
-                            compute_instances: ci_nodes,
-                            devices,
-                        };
-
-                        metrics
-                            .gpu_mig_enabled
-                            .with_label_values(&[uuid_label, gpu_label.as_str()])
-                            .set(if migs.enabled { 1.0 } else { 0.0 });
-                        // GI/CI info gauges
-                        for gi in &migs.gpu_instances {
-                            metrics
-                                .mig_gpu_instance_info
-                                .with_label_values(&[
-                                    uuid_label,
-                                    gpu_label.as_str(),
-                                    gi.id.to_string().as_str(),
-                                    gi.profile_id
-                                        .map(|p| p.to_string())
-                                        .unwrap_or_default()
-                                        .as_str(),
-                                    gi.placement.as_deref().unwrap_or(""),
-                                ])
-                                .set(1.0);
-                        }
-                        for ci in &migs.compute_instances {
-                            metrics
-                                .mig_compute_instance_info
-                                .with_label_values(&[
-                                    uuid_label,
-                                    gpu_label.as_str(),
-                                    ci.gpu_instance_id.to_string().as_str(),
-                                    ci.id.to_string().as_str(),
-                                    ci.profile_id
-                                        .map(|p| p.to_string())
-                                        .unwrap_or_default()
-                                        .as_str(),
-                                    ci.eng_profile_id
-                                        .map(|p| p.to_string())
-                                        .unwrap_or_default()
-                                        .as_str(),
-                                    ci.placement.as_deref().unwrap_or(""),
-                                ])
-                                .set(1.0);
-                        }
-                        for mig in &migs.devices {
-                            let mig_id_string = mig.id.to_string();
-                            let mig_label = mig.uuid.as_deref().unwrap_or(mig_id_string.as_str());
-                            let compat_label = if self.k8s_mode {
-                                k8s_resource_name(
-                                    self.resource_prefix,
-                                    mig.profile.as_deref().or(Some("generic")),
-                                )
-                            } else {
-                                mig_label.to_string()
-                            };
-                            if let Some(util) = mig.util_percent {
-                                metrics
-                                    .mig_utilization_percent
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(util as f64);
-                                if self.k8s_mode {
+                                if let Some(util) = mig.util_percent {
                                     metrics
                                         .mig_utilization_percent
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(util as f64);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_utilization_percent
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(util as f64);
+                                    }
                                 }
-                            }
-                            if let Some(total) = mig.memory_total_bytes {
-                                metrics
-                                    .mig_memory_total_bytes
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(total as f64);
-                                if self.k8s_mode {
+                                if let Some(total) = mig.memory_total_bytes {
                                     metrics
                                         .mig_memory_total_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(total as f64);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_memory_total_bytes
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(total as f64);
+                                    }
                                 }
-                            }
-                            if let Some(used) = mig.memory_used_bytes {
-                                metrics
-                                    .mig_memory_used_bytes
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(used as f64);
-                                if self.k8s_mode {
+                                if let Some(used) = mig.memory_used_bytes {
                                     metrics
                                         .mig_memory_used_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(used as f64);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_memory_used_bytes
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(used as f64);
+                                    }
                                 }
-                            }
-                            if let Some(sm) = mig.sm_count {
-                                metrics
-                                    .mig_sm_count
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(sm as f64);
-                                if self.k8s_mode {
+                                if let Some(sm) = mig.sm_count {
                                     metrics
                                         .mig_sm_count
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(sm as f64);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_sm_count
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(sm as f64);
+                                    }
                                 }
-                            }
-                            // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields
-                            if let Some(corrected) = mig.ecc_corrected {
-                                metrics
-                                    .mig_ecc_corrected_total
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .inc_by(corrected);
-                                if self.k8s_mode {
+                                // Best-effort per-MIG ECC and BAR1 info using MigDeviceStatus fields
+                                if let Some(corrected) = mig.ecc_corrected {
                                     metrics
                                         .mig_ecc_corrected_total
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .inc_by(corrected);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_ecc_corrected_total
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .inc_by(corrected);
+                                    }
                                 }
-                            }
-                            if let Some(uncorrected) = mig.ecc_uncorrected {
-                                metrics
-                                    .mig_ecc_uncorrected_total
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .inc_by(uncorrected);
-                                if self.k8s_mode {
+                                if let Some(uncorrected) = mig.ecc_uncorrected {
                                     metrics
                                         .mig_ecc_uncorrected_total
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .inc_by(uncorrected);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_ecc_uncorrected_total
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .inc_by(uncorrected);
+                                    }
                                 }
-                            }
-                            if let (Some(total), Some(used)) =
-                                (mig.bar1_total_bytes, mig.bar1_used_bytes)
-                            {
-                                metrics
-                                    .mig_bar1_total_bytes
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(total as f64);
-                                metrics
-                                    .mig_bar1_used_bytes
-                                    .with_label_values(&[uuid_label, gpu_label.as_str(), mig_label])
-                                    .set(used as f64);
-                                if self.k8s_mode {
+                                if let (Some(total), Some(used)) =
+                                    (mig.bar1_total_bytes, mig.bar1_used_bytes)
+                                {
                                     metrics
                                         .mig_bar1_total_bytes
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(total as f64);
                                     metrics
@@ -1395,28 +1029,46 @@ impl Collector for GpuCollector {
                                         .with_label_values(&[
                                             uuid_label,
                                             gpu_label.as_str(),
-                                            compat_label.as_str(),
+                                            mig_label,
                                         ])
                                         .set(used as f64);
+                                    if self.k8s_mode {
+                                        metrics
+                                            .mig_bar1_total_bytes
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(total as f64);
+                                        metrics
+                                            .mig_bar1_used_bytes
+                                            .with_label_values(&[
+                                                uuid_label,
+                                                gpu_label.as_str(),
+                                                compat_label.as_str(),
+                                            ])
+                                            .set(used as f64);
+                                    }
                                 }
+                                metrics
+                                    .mig_info
+                                    .with_label_values(&[
+                                        uuid_label,
+                                        gpu_label.as_str(),
+                                        mig_label,
+                                        mig.profile.as_deref().unwrap_or(""),
+                                        mig.placement.as_deref().unwrap_or(""),
+                                    ])
+                                    .set(1.0);
                             }
+                            let supported = migs.supported;
+                            status.mig_tree = Some(migs);
                             metrics
-                                .mig_info
-                                .with_label_values(&[
-                                    uuid_label,
-                                    gpu_label.as_str(),
-                                    mig_label,
-                                    mig.profile.as_deref().unwrap_or(""),
-                                    mig.placement.as_deref().unwrap_or(""),
-                                ])
-                                .set(1.0);
+                                .gpu_mig_supported
+                                .with_label_values(&[uuid_label, gpu_label.as_str()])
+                                .set(if supported { 1.0 } else { 0.0 });
                         }
-                        let supported = migs.supported;
-                        status.mig_tree = Some(migs);
-                        metrics
-                            .gpu_mig_supported
-                            .with_label_values(&[uuid_label, gpu_label.as_str()])
-                            .set(if supported { 1.0 } else { 0.0 });
                     }
 
                     #[cfg(not(all(feature = "gpu-nvml-ffi", feature = "gpu")))]
@@ -1563,7 +1215,7 @@ fn build_filter(raw: Option<&str>) -> Option<HashSet<String>> {
 }
 
 #[cfg(all(feature = "gpu", feature = "gpu-nvml-ffi"))]
-fn collect_mig_devices(nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<MigTree> {
+fn collect_mig_devices(_nvml: &Nvml, parent: &nvml_wrapper::Device) -> Result<MigTree> {
     use nvml_wrapper_sys::bindings::{
         nvmlComputeInstanceInfo_t, nvmlDevice_t, nvmlGpuInstanceInfo_t,
         nvmlReturn_enum_NVML_SUCCESS, nvmlReturn_t,

From c3de7416bf7e4e1ce08abd85994bcedc90e2763a Mon Sep 17 00:00:00 2001
From: Shaik Noor <s.noorink@gmail.com>
Date: Sat, 13 Dec 2025 17:03:48 +0530
Subject: [PATCH 19/19] Fix clippy and snake_case errors in nvml_ext.rs

---
 crates/agent-core/src/nvml_ext.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/agent-core/src/nvml_ext.rs b/crates/agent-core/src/nvml_ext.rs
index 09a9abe..d101bfa 100644
--- a/crates/agent-core/src/nvml_ext.rs
+++ b/crates/agent-core/src/nvml_ext.rs
@@ -5,7 +5,6 @@
 use nvml_wrapper_sys::bindings::*;
 
 #[cfg(all(feature = "gpu-nvml-ffi-ext", feature = "gpu"))]
-
 /// Errors from extended NVML calls.
 #[derive(thiserror::Error, Debug)]
 pub enum NvmlExtError {
@@ -125,7 +124,7 @@ pub unsafe fn get_field_values(
 
         type NvmlDeviceGetFieldValues = unsafe extern "C" fn(
             device: nvmlDevice_t,
-            valuesCount: u32,
+            values_count: u32,
             values: *mut nvmlFieldValue_t,
         ) -> nvmlReturn_t;