From 39be54dd7860f0cffaeaec923a557ca98dd59c8d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 17 Apr 2026 08:22:17 -0400 Subject: [PATCH 1/2] nvbug6084457: Fix device architecture handling and NVLink link count query --- cuda_bindings/tests/nvml/test_init.py | 11 +++++++++-- cuda_bindings/tests/nvml/test_nvlink.py | 2 +- cuda_core/cuda/core/system/_device.pyx | 7 ++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py index 2a04799708..a21564a1bb 100644 --- a/cuda_bindings/tests/nvml/test_init.py +++ b/cuda_bindings/tests/nvml/test_init.py @@ -25,11 +25,18 @@ def test_devices_are_the_same_architecture(all_devices): # they won't be tested properly. This tests for the (hopefully rare) case # where a system has devices of different architectures and produces a warning. - all_arches = {nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices} + def get_architecture_name(arch): + try: + arch = nvml.DeviceArch(arch) + return arch.name + except ValueError: + return f"UNKNOWN({arch})" + + all_arches = {nvml.device_get_architecture(device) for device in all_devices} if len(all_arches) > 1: warnings.warn( - f"System has devices of multiple architectures ({', '.join(x.name for x in all_arches)}). " + f"System has devices of multiple architectures ({', '.join(get_architecture_name(x) for x in all_arches)}). " f" Some tests may be skipped unexpectedly", UserWarning, ) diff --git a/cuda_bindings/tests/nvml/test_nvlink.py b/cuda_bindings/tests/nvml/test_nvlink.py index d8e782831e..be82aa3745 100644 --- a/cuda_bindings/tests/nvml/test_nvlink.py +++ b/cuda_bindings/tests/nvml/test_nvlink.py @@ -26,4 +26,4 @@ def test_nvlink_get_link_count(all_devices): # The feature_nvlink_supported detection is not robust, so we # can't be more specific about how many links we should find. if value.nvml_return == nvml.Return.SUCCESS: - assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}" + assert value.value.ui_val[0] <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val[0]}" diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index f661c4e685..55d3a12b86 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -165,7 +165,12 @@ cdef class Device: "VOLTA"``, and RTX A6000 will report ``DeviceArchitecture.name == "AMPERE"``. """ - return DeviceArch(nvml.device_get_architecture(self._handle)) + arch = nvml.device_get_architecture(self._handle) + try: + arch = DeviceArch(arch) + return arch + except ValueError: + return nvml.DeviceArch.UNKNOWN @property def name(self) -> str: From 9cd877c39e4a37e1d443126f9d4a5f47a7f28d71 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 17 Apr 2026 10:49:41 -0400 Subject: [PATCH 2/2] Apply suggestion from @cpcloud Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- cuda_core/cuda/core/system/_device.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 55d3a12b86..ffa3f1356c 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -167,8 +167,7 @@ cdef class Device: """ arch = nvml.device_get_architecture(self._handle) try: - arch = DeviceArch(arch) - return arch + return DeviceArch(arch) except ValueError: return nvml.DeviceArch.UNKNOWN