diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index f661c4e685..a5a78ee9b7 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -32,10 +32,12 @@ include "_fan.pxi" include "_field_values.pxi" include "_inforom.pxi" include "_memory.pxi" +include "_nvlink.pxi" include "_pci_info.pxi" include "_performance.pxi" include "_repair_status.pxi" include "_temperature.pxi" +include "_utilization.pxi" cdef class Device: @@ -674,6 +676,18 @@ cdef class Device: """ return MemoryInfo(nvml.device_get_memory_info_v2(self._handle)) + ########################################################################## + # NVLINK + # See external class definitions in _nvlink.pxi + + def nvlink(self, link: int) -> NvlinkInfo: + """ + Get information about the NVLink identified by ``link`` on this device. + + For devices with NVLink support. + """ + return NvlinkInfo(self, link) + ########################################################################## # PCI INFO # See external class definitions in _pci_info.pxi @@ -765,6 +779,30 @@ cdef class Device: device._handle = handle yield device + ####################################################################### + # UTILIZATION + + @property + def utilization(self) -> Utilization: + """ + Retrieves the current utilization rates for the device's major subsystems. + + For Fermi™ or newer fully supported devices. + + Note: During driver initialization when ECC is enabled one can see high + GPU and Memory Utilization readings. This is caused by the ECC Memory + Scrubbing mechanism that is performed during driver initialization. + + Note: On MIG-enabled GPUs, querying device utilization rates is not + currently supported. + + Returns + ------- + Utilization + An object containing the current utilization rates for the device. 
+ """ + return Utilization(nvml.device_get_utilization_rates(self._handle)) + def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: """ @@ -853,6 +891,8 @@ __all__ = [ "InforomInfo", "InforomObject", "MemoryInfo", + "NvlinkInfo", + "NvlinkVersion", "PcieUtilCounter", "PciInfo", "Pstates", @@ -864,4 +904,5 @@ __all__ = [ "ThermalSensor", "ThermalSettings", "ThermalTarget", + "Utilization", ] diff --git a/cuda_core/cuda/core/system/_nvlink.pxi b/cuda_core/cuda/core/system/_nvlink.pxi new file mode 100644 index 0000000000..36ae37bd13 --- /dev/null +++ b/cuda_core/cuda/core/system/_nvlink.pxi @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +NvlinkVersion = nvml.NvlinkVersion + + +cdef class NvlinkInfo: + """ + Nvlink information for a device. + """ + cdef Device _device + cdef int _link + + def __init__(self, device: Device, link: int): + self._device = device + self._link = link + + @property + def version(self) -> NvlinkVersion: + """ + Retrieves the NvLink version for the device and link. + + For all products with NvLink support. + + Returns + ------- + NvlinkVersion + The NvLink version. + """ + return NvlinkVersion(nvml.device_get_nvlink_version(self._device._handle, self._link)) + + @property + def state(self) -> bool: + """ + Retrieves the state of the device's NvLink for the device and link specified. + + For Pascal™ or newer fully supported devices. + + For all products with NvLink support. + + Returns + ------- + bool + `True` if the NvLink is active, `False` otherwise. 
+ """ + return ( + nvml.device_get_nvlink_state(self._device._handle, self._link) == nvml.EnableState.FEATURE_ENABLED + ) + + max_links = nvml.NVLINK_MAX_LINKS diff --git a/cuda_core/cuda/core/system/_utilization.pxi b/cuda_core/cuda/core/system/_utilization.pxi new file mode 100644 index 0000000000..689b7dc67f --- /dev/null +++ b/cuda_core/cuda/core/system/_utilization.pxi @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class Utilization: + """ + Utilization rates for a device, as returned by :obj:`Device.utilization`. + + For devices with compute capability 2.0 or higher. + """ + cdef object _utilization + + def __init__(self, utilization: nvml.Utilization): + self._utilization = utilization + + @property + def gpu(self) -> int: + """ + Percent of time over the past sample period during which one or more kernels was executing on the GPU. + """ + return self._utilization.gpu + + @property + def memory(self) -> int: + """ + Percent of time over the past sample period during which global (device) memory was being read or written. + """ + return self._utilization.memory diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 005866ddb2..37ddc8d8df 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -222,6 +222,7 @@ Enums system.FanControlPolicy system.FieldId system.InforomObject + system.NvlinkVersion system.PcieUtilCounter system.Pstates system.TemperatureSensors @@ -256,11 +257,13 @@ Types system.GpuTopologyLevel system.InforomInfo system.MemoryInfo + system.NvlinkInfo system.PciInfo system.RepairStatus system.Temperature system.ThermalSensor system.ThermalSettings + system.Utilization .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 2a094d8211..ff9d80e83b 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -729,3 +729,39 @@ def test_pstates(): assert isinstance(utilization.percentage, int) assert isinstance(utilization.inc_threshold, int) assert isinstance(utilization.dec_threshold, int) + + +def test_nvlink(): + for device in system.Device.get_all_devices(): + max_links = system.NvlinkInfo.max_links + assert isinstance(max_links, int) + assert max_links > 0 + + for link in range(max_links): + with unsupported_before(device, None): + nvlink_info = device.nvlink(link) + assert isinstance(nvlink_info, system.NvlinkInfo) + + with unsupported_before(device, None): + version = nvlink_info.version + assert isinstance(version, system.NvlinkVersion) + + # NvlinkInfo.state is the other property added alongside version; it is annotated to return bool. + with unsupported_before(device, None): + state = nvlink_info.state + assert isinstance(state, bool) + + +def test_utilization(): + for device in system.Device.get_all_devices(): + with unsupported_before(device, None): + utilization = device.utilization + assert isinstance(utilization, system.Utilization) + + gpu = utilization.gpu + assert isinstance(gpu, int) + assert 0 <= gpu <= 100 + + memory = utilization.memory + assert isinstance(memory, int) + assert 0 <= memory <= 100