From 14332cd4ba6404f3248484c3c75e1ae2525b63af Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:05:31 -0500 Subject: [PATCH 01/11] cuda.core.system: More device-related APIs --- cuda_bindings/cuda/bindings/_nvml.pxd | 7 +- cuda_bindings/cuda/bindings/_nvml.pyx | 274 ++++++++++--- cuda_core/cuda/core/system/_device.pyx | 395 ++++++++++++++++++- cuda_core/cuda/core/system/_inforom.pxi | 96 +++++ cuda_core/docs/source/api.rst | 10 + cuda_core/tests/system/test_system_device.py | 175 ++++++++ 6 files changed, 872 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/system/_inforom.pxi diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index d08b087b38..a0e6ed9ad9 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -34,7 +34,6 @@ ctypedef nvmlViolationTime_t ViolationTime ctypedef nvmlUUIDValue_t UUIDValue ctypedef nvmlVgpuPlacementList_v1_t VgpuPlacementList_v1 ctypedef nvmlNvLinkPowerThres_t NvLinkPowerThres -ctypedef nvmlSystemEventData_v1_t SystemEventData_v1 ctypedef nvmlGpuInstanceProfileInfo_t GpuInstanceProfileInfo ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 @@ -174,7 +173,7 @@ cpdef str device_get_inforom_version(intptr_t device, int object) cpdef str device_get_inforom_image_version(intptr_t device) cpdef unsigned int device_get_inforom_configuration_checksum(intptr_t device) except? 0 cpdef device_validate_inforom(intptr_t device) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0 +cpdef tuple device_get_last_bbx_flush_time(intptr_t device) cpdef int device_get_display_mode(intptr_t device) except? -1 cpdef int device_get_display_active(intptr_t device) except? -1 cpdef int device_get_persistence_mode(intptr_t device) except? 
-1 @@ -329,10 +328,6 @@ cpdef device_register_events(intptr_t device, unsigned long long event_types, in cpdef unsigned long long device_get_supported_event_types(intptr_t device) except? 0 cpdef object event_set_wait_v2(intptr_t set, unsigned int timeoutms) cpdef event_set_free(intptr_t set) -cpdef system_event_set_create(intptr_t request) -cpdef system_event_set_free(intptr_t request) -cpdef system_register_events(intptr_t request) -cpdef system_event_set_wait(intptr_t request) cpdef device_modify_drain_state(intptr_t pci_info, int new_state) cpdef int device_query_drain_state(intptr_t pci_info) except? -1 cpdef device_remove_gpu_v2(intptr_t pci_info, int gpu_state, int link_state) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 3a3f01ea7a..a14eb8571c 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -10155,11 +10155,157 @@ cdef class EventData: return obj +cdef _get_system_event_data_v1_dtype_offsets(): + cdef nvmlSystemEventData_v1_t pod = nvmlSystemEventData_v1_t() + return _numpy.dtype({ + 'names': ['event_type', 'gpu_id'], + 'formats': [_numpy.uint64, _numpy.uint32], + 'offsets': [ + (&(pod.eventType)) - (&pod), + (&(pod.gpuId)) - (&pod), + ], + 'itemsize': sizeof(nvmlSystemEventData_v1_t), + }) + +system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets() + +cdef class SystemEventData_v1: + """Empty-initialize an array of `nvmlSystemEventData_v1_t`. + + The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. 
seealso:: `nvmlSystemEventData_v1_t` + """ + cdef: + readonly object _data + + + + def __init__(self, size=1): + arr = _numpy.empty(size, dtype=system_event_data_v1_dtype) + self._data = arr.view(_numpy.recarray) + assert self._data.itemsize == sizeof(nvmlSystemEventData_v1_t), \ + f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlSystemEventData_v1_t) }" + + def __repr__(self): + if self._data.size > 1: + return f"<{__name__}.SystemEventData_v1_Array_{self._data.size} object at {hex(id(self))}>" + else: + return f"<{__name__}.SystemEventData_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return self._data.ctypes.data + + cdef intptr_t _get_ptr(self): + return self._data.ctypes.data + + def __int__(self): + if self._data.size > 1: + raise TypeError("int() argument must be a bytes-like object of size 1. " + "To get the pointer address of an array, use .ptr") + return self._data.ctypes.data + + def __len__(self): + return self._data.size + + def __eq__(self, other): + cdef object self_data = self._data + if (not isinstance(other, SystemEventData_v1)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype: + return False + return bool((self_data == other._data).all()) + + @property + def event_type(self): + """Union[~_numpy.uint64, int]: Information about what specific system event occurred.""" + if self._data.size == 1: + return int(self._data.event_type[0]) + return self._data.event_type + + @event_type.setter + def event_type(self, val): + self._data.event_type = val + + @property + def gpu_id(self): + """Union[~_numpy.uint32, int]: gpuId in PCI format""" + if self._data.size == 1: + return int(self._data.gpu_id[0]) + return self._data.gpu_id + + @gpu_id.setter + def gpu_id(self, val): + self._data.gpu_id = val + + def __getitem__(self, key): + cdef ssize_t key_ + cdef ssize_t size + if isinstance(key, int): + key_ = key + size = 
self._data.size + if key_ >= size or key_ <= -(size+1): + raise IndexError("index is out of bounds") + if key_ < 0: + key_ += size + return SystemEventData_v1.from_data(self._data[key_:key_+1]) + out = self._data[key] + if isinstance(out, _numpy.recarray) and out.dtype == system_event_data_v1_dtype: + return SystemEventData_v1.from_data(out) + return out + + def __setitem__(self, key, val): + self._data[key] = val + + @staticmethod + def from_data(data): + """Create an SystemEventData_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `system_event_data_v1_dtype` holding the data. + """ + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + if not isinstance(data, _numpy.ndarray): + raise TypeError("data argument must be a NumPy ndarray") + if data.ndim != 1: + raise ValueError("data array must be 1D") + if data.dtype != system_event_data_v1_dtype: + raise ValueError("data array must be of dtype system_event_data_v1_dtype") + obj._data = data.view(_numpy.recarray) + + return obj + + @staticmethod + def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False): + """Create an SystemEventData_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE + cdef object buf = cpython.memoryview.PyMemoryView_FromMemory( + ptr, sizeof(nvmlSystemEventData_v1_t) * size, flag) + data = _numpy.ndarray(size, buffer=buf, dtype=system_event_data_v1_dtype) + obj._data = data.view(_numpy.recarray) + + return obj + + cdef _get_accounting_stats_dtype_offsets(): cdef nvmlAccountingStats_t pod = nvmlAccountingStats_t() return _numpy.dtype({ 'names': ['gpu_utilization', 'memory_utilization', 'max_memory_usage', 'time', 'start_time', 'is_running', 'reserved'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, (_numpy.uint32, 5)], 'offsets': [ (&(pod.gpuUtilization)) - (&pod), (&(pod.memoryUtilization)) - (&pod), @@ -22082,23 +22228,26 @@ cpdef device_validate_inforom(intptr_t device): check_status(__status__) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0: +cpdef tuple device_get_last_bbx_flush_time(intptr_t device): """Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. Args: device (intptr_t): The identifier of the target device. - timestamp (intptr_t): The start timestamp of the last BBX Flush. Returns: - unsigned long: The duration (us) of the last BBX Flush. + A 2-tuple containing: + + - unsigned long long: The start timestamp of the last BBX Flush. + - unsigned long: The duration (us) of the last BBX Flush. .. 
seealso:: `nvmlDeviceGetLastBBXFlushTime` """ + cdef unsigned long long timestamp cdef unsigned long duration_us with nogil: - __status__ = nvmlDeviceGetLastBBXFlushTime(device, timestamp, &duration_us) + __status__ = nvmlDeviceGetLastBBXFlushTime(device, &timestamp, &duration_us) check_status(__status__) - return duration_us + return (timestamp, duration_us) cpdef int device_get_display_mode(intptr_t device) except? -1: @@ -24913,58 +25062,6 @@ cpdef event_set_free(intptr_t set): check_status(__status__) -cpdef system_event_set_create(intptr_t request): - """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetCreateRequest_t. - - .. seealso:: `nvmlSystemEventSetCreate` - """ - with nogil: - __status__ = nvmlSystemEventSetCreate(request) - check_status(__status__) - - -cpdef system_event_set_free(intptr_t request): - """Releases system event set. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetFreeRequest_t. - - .. seealso:: `nvmlSystemEventSetFree` - """ - with nogil: - __status__ = nvmlSystemEventSetFree(request) - check_status(__status__) - - -cpdef system_register_events(intptr_t request): - """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. - - Args: - request (intptr_t): Reference to the struct nvmlSystemRegisterEventRequest_t. - - .. seealso:: `nvmlSystemRegisterEvents` - """ - with nogil: - __status__ = nvmlSystemRegisterEvents(request) - check_status(__status__) - - -cpdef system_event_set_wait(intptr_t request): - """Waits on system events and delivers events. - - Args: - request (intptr_t): Reference in which to nvmlSystemEventSetWaitRequest_t. - - .. seealso:: `nvmlSystemEventSetWait` - """ - with nogil: - __status__ = nvmlSystemEventSetWait(request) - check_status(__status__) - - cpdef device_modify_drain_state(intptr_t pci_info, int new_state): """Modify the drain state of a GPU. 
This method forces a GPU to no longer accept new incoming requests. Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before this call is made. Must be called as administrator. For Linux only. @@ -27908,3 +28005,64 @@ cpdef object device_get_nvlink_info(intptr_t device): __status__ = nvmlDeviceGetNvLinkInfo(device, info) check_status(__status__) return info_v1_py + + +cpdef intptr_t system_event_set_create(): + """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``.""" + cdef nvmlSystemEventSetCreateRequest_v1_t[1] request + with nogil: + request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetCreate(request) + check_status(__status__) + return (request[0].set) + + +cpdef system_event_set_free(intptr_t event_set): + """Frees an event set.""" + cdef nvmlSystemEventSetFreeRequest_v1_t[1] request + request[0].set = event_set + with nogil: + request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetFree(request) + check_status(__status__) + + +cpdef system_register_events(unsigned long long event_types, intptr_t event_set): + """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. + + Args: + event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register. + event_set (intptr_t): The system event set handle. + """ + cdef nvmlSystemRegisterEventRequest_v1_t[1] request + request[0].set = event_set + request[0].eventTypes = event_types + with nogil: + request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemRegisterEvents(request) + check_status(__status__) + + +cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, unsigned int buffer_size): + """Waits for events to occur on the system event set. 
+ + Args: + event_set (intptr_t): The system event set handle. + timeout_ms (unsigned int): The maximum amount of time in milliseconds to wait for an event. + buffer_size (unsigned int): The size of the event buffer. + + Returns: + SystemEvent: The system event that occurred. + """ + cdef nvmlSystemEventSetWaitRequest_v1_t[1] request + cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size) + request[0].timeoutms = timeout_ms + request[0].set = event_set + request[0].data = (event_data._get_ptr()) + request[0].dataSize = buffer_size + with nogil: + request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetWait(request) + check_status(__status__) + event_data._data.resize((request[0].numEvent,)) + return event_data diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 2371c09c30..7d647f61aa 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -13,10 +13,17 @@ from cuda.bindings import _nvml as nvml from ._nvml_context cimport initialize include "_device_utils.pxi" +include "_inforom.pxi" +AddressingMode = nvml.DeviceAddressingModeType BrandType = nvml.BrandType FieldId = nvml.FieldId +GpuP2PCapsIndex = nvml.GpuP2PCapsIndex +GpuP2PStatus = nvml.GpuP2PStatus +GpuTopologyLevel = nvml.GpuTopologyLevel +InforomObject = nvml.InforomObject +PcieUtilCounter = nvml.PcieUtilCounter class DeviceArchitecture: @@ -127,52 +134,146 @@ cdef class PciInfo: """ PCI information about a GPU device. 
""" - cdef object _pci_info - def __init__(self, pci_info: nvml.PciInfo): - self._pci_info = pci_info + cdef object _pci_info_ext + cdef intptr_t _handle + + def __init__(self, pci_info_ext: nvml.PciInfoExt_v1, handle: int): + self._pci_info_ext = pci_info_ext + self._handle = handle @property def bus(self) -> int: """ The bus on which the device resides, 0 to 255 """ - return self._pci_info.bus + return self._pci_info_ext.bus @property def bus_id(self) -> str: """ The tuple domain:bus:device.function PCI identifier string """ - return self._pci_info.bus_id + return self._pci_info_ext.bus_id @property def device(self) -> int: """ The device's id on the bus, 0 to 31 """ - return self._pci_info.device_ + return self._pci_info_ext.device_ @property def domain(self) -> int: """ The PCI domain on which the device's bus resides, 0 to 0xffffffff """ - return self._pci_info.domain + return self._pci_info_ext.domain @property def vendor_id(self) -> int: """ The PCI vendor id of the device """ - return self._pci_info.pci_device_id & 0xFFFF + return self._pci_info_ext.pci_device_id & 0xFFFF @property def device_id(self) -> int: """ The PCI device id of the device """ - return self._pci_info.pci_device_id >> 16 + return self._pci_info_ext.pci_device_id >> 16 + + @property + def subsystem_id(self) -> int: + """ + The subsystem device ID + """ + return self._pci_info_ext.pci_sub_system_id + + @property + def base_class(self) -> int: + """ + The 8-bit PCI base class code + """ + return self._pci_info_ext.base_class + + @property + def sub_class(self) -> int: + """ + The 8-bit PCI sub class code + """ + return self._pci_info_ext.sub_class + + def get_max_pcie_link_generation(self) -> int: + """ + Retrieves the maximum PCIe link generation possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a generation 2 PCIe device attached to a generation 1 + PCIe bus, the max link generation this function will report is + generation 1. 
+ """ + return nvml.device_get_max_pcie_link_generation(self._handle) + + def get_gpu_max_pcie_link_generation(self) -> int: + """ + Retrieves the maximum PCIe link generation supported by this GPU device. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_gpu_max_pcie_link_generation(self._handle) + + def get_max_pcie_link_width(self) -> int: + """ + Retrieves the maximum PCIe link width possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a device with a 16x PCIe bus width attached to an 8x + PCIe system bus this function will report + a max link width of 8. + """ + return nvml.device_get_max_pcie_link_width(self._handle) + + def get_current_pcie_link_generation(self) -> int: + """ + Retrieves the current PCIe link generation. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_generation(self._handle) + + def get_current_pcie_link_width(self) -> int: + """ + Retrieves the current PCIe link width. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_width(self._handle) + + def get_pcie_throughput(self, counter: PcieUtilCounter) -> int: + """ + Retrieve PCIe utilization information, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + return nvml.device_get_pcie_throughput(self._handle, counter) + + def get_pcie_replay_counter(self) -> int: + """ + Retrieve the PCIe replay counter. + + For Kepler™ or newer fully supported devices. + """ + return nvml.device_get_pcie_replay_counter(self._handle) cdef class DeviceAttributes: @@ -381,6 +482,30 @@ cdef class FieldValues: return [x.value for x in self] +cdef class RepairStatus: + """ + Repair status for TPC/Channel repair. 
+ """ + cdef object _repair_status + + def __init__(self, handle: int): + self._repair_status = nvml.device_get_repair_status(handle) + + @property + def channel_repair_pending(self) -> bool: + """ + `True` if a channel repair is pending. + """ + return bool(self._repair_status.b_channel_repair_pending) + + @property + def tpc_repair_pending(self) -> bool: + """ + `True` if a TPC repair is pending. + """ + return bool(self._repair_status.b_tpc_repair_pending) + + cdef class Device: """ Representation of a device. @@ -416,16 +541,23 @@ cdef class Device: cdef intptr_t _handle - def __init__(self, index: int | None = None, uuid: bytes | str | None = None, pci_bus_id: bytes | str | None = None): - initialize() - - args = [index, uuid, pci_bus_id] + def __init__( + self, + *, + index: int | None = None, + uuid: bytes | str | None = None, + pci_bus_id: bytes | str | None = None, + handle: int | None = None + ): + args = [index, uuid, pci_bus_id, handle] arg_count = sum(x is not None for x in args) if arg_count > 1: - raise ValueError("Handle requires only one of either device `index`, `uuid` or `pci_bus_id`.") + raise ValueError("Handle requires only one of either device `index`, `uuid`, `pci_bus_id` or `handle`.") if arg_count == 0: - raise ValueError("Handle requires either a device `index`, `uuid` or `pci_bus_id`.") + raise ValueError("Handle requires either a device `index`, `uuid`, `pci_bus_id` or `handle`.") + + initialize() if index is not None: self._handle = nvml.device_get_handle_by_index_v2(index) @@ -437,8 +569,20 @@ cdef class Device: if isinstance(pci_bus_id, bytes): pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) - else: - raise ValueError("Error parsing arguments") + elif handle is not None: + self._handle = handle + + @classmethod + def get_device_count(cls) -> int: + """ + Get the number of available devices. + + Returns + ------- + int + The number of available devices. 
+ """ + return nvml.device_get_count_v2() @classmethod def get_all_devices(cls) -> Iterable[Device]: @@ -450,9 +594,28 @@ cdef class Device: Iterator of Device An iterator over available devices. """ - total = nvml.device_get_count_v2() - for device_id in range(total): - yield cls(device_id) + for device_id in range(nvml.device_get_count_v2()): + yield cls(index=device_id) + + @classmethod + def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]: + """ + Retrieve the set of GPUs that have a CPU affinity with the given CPU number. + + Supported on Linux only. + + Parameters + ---------- + cpu_index: int + The CPU index. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + for handle in nvml.system_get_topology_gpu_set(cpu_index): + yield cls(handle=handle) @property def architecture(self) -> DeviceArchitecture: @@ -539,7 +702,7 @@ cdef class Device: """ The PCI attributes of this device. """ - return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + return PciInfo(nvml.device_get_pci_info_ext(self._handle), self._handle) @property def serial(self) -> str: @@ -559,6 +722,133 @@ cdef class Device: return nvml.device_get_uuid(self._handle) @property + def index(self) -> int: + """ + The NVML index of this device. + + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + return nvml.device_get_index(self._handle) + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. 
+ + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. For non-baseboard products, this ID would always + be 0. + """ + return nvml.device_get_module_id(self._handle) + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. + + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + return nvml.device_get_minor_number(self._handle) + + @property + def board_part_number(self) -> str: + """ + Retrieves the device board part number which is programmed into the board's InfoROM. + """ + return nvml.device_get_board_part_number(self._handle) + + @property + def addressing_mode(self) -> AddressingMode: + """ + Get the addressing mode of the device. + + Addressing modes can be one of: + + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + software-based mirroring of the CPU's page tables, on the GPU. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_ATS`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + Address Translation Services. This means that there is (effectively) a + single set of page tables, and the CPU and GPU both use them. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS + is active. + """ + return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) + + @property + def display_mode(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. monitor) is currently connected to + any of the device's connectors. + """ + return True if nvml.device_get_display_mode(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device. 
For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. + """ + return True if nvml.device_get_display_active(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def repair_status(self) -> RepairStatus: + """ + Get the repair status for TPC/Channel repair. + + For Ampere™ or newer fully supported devices. + """ + return RepairStatus(self._handle) + + @property + def inforom(self) -> InforomInfo: + """ + Accessor for InfoROM information. + + For all products with an InfoROM. + """ + return InforomInfo(self) + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level. + """ + for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): + yield Device(handle=handle) + def attributes(self) -> DeviceAttributes: """ Get various device attributes. @@ -632,7 +922,61 @@ cdef class Device: nvml.device_clear_field_values(self._handle, field_ids) +def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: + """ + Retrieve the common ancestor for two devices. + + For Linux only. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + + Returns + ------- + :class:`GpuTopologyLevel` + The common ancestor level of the two devices. + """ + return GpuTopologyLevel( + nvml.device_get_topology_common_ancestor( + device1._handle, + device2._handle, + ) + ) + + +def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex) -> GpuP2PStatus: + """ + Retrieve the P2P status between two devices. 
+ + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + index: :class:`GpuP2PCapsIndex` + The P2P capability index being looked for between ``device1`` and ``device2``. + + Returns + ------- + :class:`GpuP2PStatus` + The P2P status between the two devices. + """ + return GpuP2PStatus( + nvml.device_get_p2p_status( + device1._handle, + device2._handle, + index, + ) + ) + + __all__ = [ + "AddressingMode", "BAR1MemoryInfo", "BrandType", "Device", @@ -641,6 +985,15 @@ __all__ = [ "FieldId", "FieldValue", "FieldValues", + "GpuP2PCapsIndex", + "GpuP2PStatus", + "GpuTopologyLevel", + "InforomInfo", + "InforomObject", "MemoryInfo", + "PcieUtilCounter", "PciInfo", + "RepairStatus", + "get_p2p_status", + "get_topology_common_ancestor", ] diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi new file mode 100644 index 0000000000..1b2e9325c0 --- /dev/null +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class InforomInfo: + cdef Device _device + + def __init__(self, device: Device): + self._device = device + + def get_version(self, inforom: InforomObject) -> str: + """ + Retrieves the InfoROM version for a given InfoROM object. + + For all products with an InfoROM. + + Fermi™ and higher parts have non-volatile on-board memory for persisting + device info, such as aggregate ECC counts. + + Parameters + ---------- + inforom: :class:`InforomObject` + The InfoROM object to query. + + Returns + ------- + str + The InfoROM version. + """ + return nvml.device_get_inforom_version(self._device._handle, inforom) + + @property + def image_version(self) -> str: + """ + Retrieves the global InfoROM image version. + + For all products with an InfoROM. 
+ + Image version just like VBIOS version uniquely describes the exact + version of the InfoROM flashed on the board in contrast to InfoROM + object version which is only an indicator of supported features. + + Returns + ------- + str + The InfoROM image version. + """ + return nvml.device_get_inforom_image_version(self._device._handle) + + @property + def configuration_checksum(self) -> int: + """ + Retrieves the checksum of the configuration stored in the device's InfoROM. + + For all products with an InfoROM. + + Can be used to make sure that two GPUs have the exact same + configuration. Current checksum takes into account configuration stored + in PWR and ECC InfoROM objects. Checksum can change between driver + releases or when user changes configuration (e.g. disable/enable ECC) + + Returns + ------- + int + The InfoROM checksum. + """ + return nvml.device_get_inforom_configuration_checksum(self._device._handle) + + def validate(self) -> None: + """ + Reads the InfoROM from the flash and verifies the checksums. + + For all products with an InfoROM. + + Raises + ------ + :class:`cuda.core.system.CorruptedInforomError` + If the device's InfoROM is corrupted. + """ + nvml.device_validate_inforom(self._device._handle) + + @property + def bbx_flush_time(self) -> int: + """ + Retrieves the timestamp and duration of the last flush of the BBX (blackbox) InfoROM object during the current run. + + For all products with an InfoROM. 
+ + Returns + ------- + tuple[int, int] + - timestamp: The start timestamp of the last BBX flush + - duration_us: The duration (in μs) of the last BBX flush + """ + return nvml.device_get_last_bbx_flush_time(self._device._handle) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 15338383f6..14845b3e89 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -79,10 +79,13 @@ CUDA system information and NVIDIA Management Library (NVML) system.get_num_devices system.get_nvml_version system.get_process_name + system.get_topology_common_ancestor + system.get_p2p_status :template: autosummary/cyclass.rst system.Device + system.AddressingMode system.BAR1MemoryInfo system.BrandType system.DeviceArchitecture @@ -90,8 +93,15 @@ CUDA system information and NVIDIA Management Library (NVML) system.FieldId system.FieldValue system.FieldValues + system.GpuP2PCapsIndex + system.GpuP2PStatus + system.GpuTopologyLevel + system.InforomInfo + system.InforomObject system.MemoryInfo + system.PcieUtilCounter system.PciInfo + system.RepairStatus .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 52c08533ff..093ce33e3a 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -9,6 +9,7 @@ pytestmark = skip_if_nvml_unsupported import array +import multiprocessing import os import re import sys @@ -28,6 +29,10 @@ def check_gpu_available(): pytest.skip("No GPUs available to run device tests", allow_module_level=True) +def test_device_count(): + assert system.Device.get_device_count() == system.get_num_devices() + + def test_device_architecture(): for device in system.Device.get_all_devices(): device_arch = device.architecture @@ -138,6 +143,34 @@ def test_device_pci_info(): assert isinstance(pci_info.device_id, int) assert 0x0000 <= pci_info.device_id <= 0xFFFF + assert isinstance(pci_info.subsystem_id, int) + assert 0x00000000 <= pci_info.subsystem_id <= 0xFFFFFFFF + + assert isinstance(pci_info.base_class, int) + assert 0x00 <= pci_info.base_class <= 0xFF + + assert isinstance(pci_info.sub_class, int) + assert 0x00 <= pci_info.sub_class <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_gpu_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_gpu_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_width(), int) + assert 0 <= pci_info.get_max_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_generation(), int) + assert 0 <= pci_info.get_current_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_width(), int) + assert 0 <= pci_info.get_current_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_pcie_throughput(system.PcieUtilCounter.PCIE_UTIL_TX_BYTES), int) + + assert isinstance(pci_info.get_pcie_replay_counter(), int) + def 
test_device_serial(): skip_reasons = set() @@ -304,3 +337,145 @@ def test_field_values(): field_values.validate() assert len(field_values) == 1 assert field_values[0].value <= old_value + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_all_devices_with_cpu_affinity(): + try: + for i in range(multiprocessing.cpu_count()): + for device in system.Device.get_all_devices_with_cpu_affinity(i): + affinity = device.cpu_affinity + assert isinstance(affinity, list) + assert {i} == set(affinity) + except system.NotSupportedError: + pytest.skip("Getting devices with CPU affinity not supported") + + +def test_index(): + for i, device in enumerate(system.Device.get_all_devices()): + index = device.index + assert isinstance(index, int) + assert index == i + + +def test_module_id(): + for device in system.Device.get_all_devices(): + module_id = device.module_id + assert isinstance(module_id, int) + assert module_id >= 0 + + +def test_addressing_mode(): + for device in system.Device.get_all_devices(): + try: + addressing_mode = device.addressing_mode + except system.NotSupportedError: + pytest.skip(f"Device addressing mode not supported by device '{device.name}'") + continue + assert isinstance(addressing_mode, system.AddressingMode) + + +def test_display_mode(): + for device in system.Device.get_all_devices(): + display_mode = device.display_mode + assert isinstance(display_mode, bool) + + display_active = device.display_active + assert isinstance(display_active, bool) + + +def test_repair_status(): + for device in system.Device.get_all_devices(): + repair_status = device.repair_status + assert isinstance(repair_status, system.RepairStatus) + + assert isinstance(repair_status.channel_repair_pending, bool) + assert isinstance(repair_status.tpc_repair_pending, bool) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def 
test_get_topology_common_ancestor(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + ancestor = system.get_topology_common_ancestor(devices[0], devices[1]) + assert isinstance(ancestor, system.GpuTopologyLevel) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_p2p_status(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + status = system.get_p2p_status(devices[0], devices[1], system.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ) + assert isinstance(status, system.GpuP2PStatus) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_nearest_gpus(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. 
+ + for device in system.Device.get_all_devices(): + for near_device in device.get_topology_nearest_gpus(system.GpuTopologyLevel.TOPOLOGY_SINGLE): + assert isinstance(near_device, system.Device) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_minor_number(): + for device in system.Device.get_all_devices(): + minor_number = device.minor_number + assert isinstance(minor_number, int) + assert minor_number >= 0 + + +def test_board_part_number(): + for device in system.Device.get_all_devices(): + try: + board_part_number = device.board_part_number + except system.NotSupportedError: + pytest.skip(f"Device board part number not supported by device '{device.name}'") + continue + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 + + +def test_get_inforom_version(): + for device in system.Device.get_all_devices(): + inforom = device.inforom + + inforom_image_version = inforom.image_version + assert isinstance(inforom_image_version, str) + assert len(inforom_image_version) > 0 + + inforom_version = inforom.get_version(system.InforomObject.INFOROM_OEM) + assert isinstance(inforom_version, str) + assert len(inforom_version) > 0 + + checksum = inforom.configuration_checksum + assert isinstance(checksum, int) + + # TODO: This is untested locally. 
+ try: + timestamp, duration_us = inforom.bbx_flush_time + except system.NotSupportedError: + pass + else: + assert isinstance(timestamp, int) + assert timestamp > 0 + assert isinstance(duration_us, int) + assert duration_us > 0 + + inforom.validate() From 421ebda6ac33c7814bd8a917a69b30546b29abae Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:14:09 -0500 Subject: [PATCH 02/11] Fix line wrapping --- cuda_core/cuda/core/system/_inforom.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi index 1b2e9325c0..f71c92b559 100644 --- a/cuda_core/cuda/core/system/_inforom.pxi +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -83,7 +83,8 @@ cdef class InforomInfo: @property def bbx_flush_time(self) -> int: """ - Retrieves the timestamp and duration of the last flush of the BBX (bloackbox) InfoROM object during the current run. + Retrieves the timestamp and duration of the last flush of the BBX + (bloackbox) InfoROM object during the current run. For all products with an InfoROM. 
From 16e71e0e1b2a740dc029674197600cf030c14b64 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:18:58 -0500 Subject: [PATCH 03/11] Hide handle as an implementation detail --- cuda_core/cuda/core/system/_device.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 7d647f61aa..237fea584e 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -547,15 +547,14 @@ cdef class Device: index: int | None = None, uuid: bytes | str | None = None, pci_bus_id: bytes | str | None = None, - handle: int | None = None ): - args = [index, uuid, pci_bus_id, handle] - arg_count = sum(x is not None for x in args) + args = [index, uuid, pci_bus_id] + cdef int arg_count = sum(arg is not None for arg in args) if arg_count > 1: - raise ValueError("Handle requires only one of either device `index`, `uuid`, `pci_bus_id` or `handle`.") + raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.") if arg_count == 0: - raise ValueError("Handle requires either a device `index`, `uuid`, `pci_bus_id` or `handle`.") + raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.") initialize() @@ -569,8 +568,6 @@ cdef class Device: if isinstance(pci_bus_id, bytes): pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) - elif handle is not None: - self._handle = handle @classmethod def get_device_count(cls) -> int: @@ -615,7 +612,9 @@ cdef class Device: An iterator over available devices. 
""" for handle in nvml.system_get_topology_gpu_set(cpu_index): - yield cls(handle=handle) + device = Device.__new__() + device._handle = handle + return device @property def architecture(self) -> DeviceArchitecture: From aafd1e981a3baf3947c351987ef2f04b25a13518 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:23:39 -0500 Subject: [PATCH 04/11] Update cuda_core/cuda/core/system/_device.pyx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_core/cuda/core/system/_device.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 237fea584e..6348aa5f25 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -783,7 +783,7 @@ cdef class Device: memory (``malloc``, ``mmap``) is addressable from the device (GPU), via Address Translation Services. This means that there is (effectively) a single set of page tables, and the CPU and GPU both use them. - - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HHM or ATS + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS is active. 
""" return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) From adad2502a01e55476036ba0b8fa4f0fd5f6f0db9 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:28:31 -0500 Subject: [PATCH 05/11] Address comments from Copilot --- cuda_core/cuda/core/system/_device.pyx | 17 +++++------------ cuda_core/cuda/core/system/_inforom.pxi | 11 +++++++++-- cuda_core/tests/system/test_system_device.py | 19 ++++++++----------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 6348aa5f25..73edbd4be4 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -207,7 +207,7 @@ cdef class PciInfo: def get_max_pcie_link_generation(self) -> int: """ - Retrieves the maximum PCIe link generation possible with this device and system. + Retrieve the maximum PCIe link generation possible with this device and system. For Fermi™ or newer fully supported devices. @@ -219,7 +219,7 @@ cdef class PciInfo: def get_gpu_max_pcie_link_generation(self) -> int: """ - Retrieves the maximum PCIe link generation supported by this GPU device. + Retrieve the maximum PCIe link generation supported by this GPU device. For Fermi™ or newer fully supported devices. """ @@ -227,7 +227,7 @@ cdef class PciInfo: def get_max_pcie_link_width(self) -> int: """ - Retrieves the maximum PCIe link width possible with this device and system. + Retrieve the maximum PCIe link width possible with this device and system. For Fermi™ or newer fully supported devices. @@ -239,7 +239,7 @@ cdef class PciInfo: def get_current_pcie_link_generation(self) -> int: """ - Retrieves the current PCIe link generation. + Retrieve the current PCIe link generation. For Fermi™ or newer fully supported devices. """ @@ -247,7 +247,7 @@ cdef class PciInfo: def get_current_pcie_link_width(self) -> int: """ - Retreives the current PCIe link width. 
+ Retrieve the current PCIe link width. For Fermi™ or newer fully supported devices. """ @@ -762,13 +762,6 @@ cdef class Device: """ return nvml.device_get_minor_number(self._handle) - @property - def board_part_number(self) -> str: - """ - Retrieves the the device board part number which is programmed into the board's InfoROM. - """ - return nvml.device_get_board_part_number(self._handle) - @property def addressing_mode(self) -> AddressingMode: """ diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi index f71c92b559..c82347ee18 100644 --- a/cuda_core/cuda/core/system/_inforom.pxi +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -81,10 +81,10 @@ cdef class InforomInfo: nvml.device_validate_inforom(self._device._handle) @property - def bbx_flush_time(self) -> int: + def bbx_flush_time(self) -> tuple[int, int]: """ Retrieves the timestamp and duration of the last flush of the BBX - (bloackbox) InfoROM object during the current run. + (blackbox) InfoROM object during the current run. For all products with an InfoROM. @@ -95,3 +95,10 @@ cdef class InforomInfo: - duration_us: The duration (in μs) of the last BBX flush """ return nvml.device_get_last_bbx_flush_time(self._device._handle) + + @property + def board_part_number(self) -> str: + """ + The device board part number which is programmed into the board's InfoROM. 
+ """ + return nvml.device_get_board_part_number(self._device._handle) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 093ce33e3a..61ba2a2296 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -441,17 +441,6 @@ def test_get_minor_number(): assert minor_number >= 0 -def test_board_part_number(): - for device in system.Device.get_all_devices(): - try: - board_part_number = device.board_part_number - except system.NotSupportedError: - pytest.skip(f"Device board part number not supported by device '{device.name}'") - continue - assert isinstance(board_part_number, str) - assert len(board_part_number) > 0 - - def test_get_inforom_version(): for device in system.Device.get_all_devices(): inforom = device.inforom @@ -478,4 +467,12 @@ def test_get_inforom_version(): assert isinstance(duration_us, int) assert duration_us > 0 + try: + board_part_number = inforom.board_part_number + except system.NotSupportedError: + pass + else: + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 + inforom.validate() From eb088208d025dfad85c5311285debfef7e59d032 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 09:23:50 -0500 Subject: [PATCH 06/11] Working on tests --- cuda_bindings/cuda/bindings/_nvml.pyx | 20 ++++++++++---------- cuda_core/tests/system/test_system_device.py | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index a14eb8571c..dbb87e8d0b 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -27104,8 +27104,8 @@ cpdef object system_get_topology_gpu_set(unsigned int cpuNumber): __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, NULL) check_status_size(__status__) if count[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", 
mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, deviceArray.data) check_status(__status__) @@ -27144,8 +27144,8 @@ cpdef object unit_get_devices(intptr_t unit): __status__ = nvmlUnitGetDevices(unit, deviceCount, NULL) check_status_size(__status__) if deviceCount[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlUnitGetDevices(unit, deviceCount, deviceArray.data) check_status(__status__) @@ -27172,8 +27172,8 @@ cpdef object device_get_topology_nearest_gpus(intptr_t device, unsigned int leve ) check_status_size(__status__) if count[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetTopologyNearestGpus( device, @@ -27837,9 +27837,9 @@ cpdef object device_get_gpu_instances(intptr_t device, unsigned int profile_id): check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + 
view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetGpuInstances(device, profile_id, gpuInstances.data, count) check_status(__status__) @@ -27863,9 +27863,9 @@ cpdef object gpu_instance_get_compute_instances(intptr_t gpu_instance, unsigned check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlGpuInstanceGetComputeInstances(gpu_instance, profile_id, computeInstances.data, count) check_status(__status__) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 61ba2a2296..d76ffa3631 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -247,6 +247,7 @@ def test_device_attributes(): except system.NotSupportedError: skip_reasons.append(f"Device attributes not supported on '{device.name}'") continue + print("Attribute type:", type(attributes)) assert isinstance(attributes, system.DeviceAttributes) assert isinstance(attributes.multiprocessor_count, int) @@ -459,7 +460,7 @@ def test_get_inforom_version(): # TODO: This is untested locally. 
try: timestamp, duration_us = inforom.bbx_flush_time - except system.NotSupportedError: + except (system.NotSupportedError, system.NotReadyError): pass else: assert isinstance(timestamp, int) From 51496f74cc66fff2b1a7001f07bc7d5159b2f48e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 13:22:18 -0500 Subject: [PATCH 07/11] Fix tests --- cuda_core/cuda/core/system/_device.pyx | 3 ++- cuda_core/tests/system/test_system_device.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 73edbd4be4..0e3247cbc6 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -612,7 +612,7 @@ cdef class Device: An iterator over available devices. """ for handle in nvml.system_get_topology_gpu_set(cpu_index): - device = Device.__new__() + device = Device.__new__(Device) device._handle = handle return device @@ -841,6 +841,7 @@ cdef class Device: for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): yield Device(handle=handle) + @property def attributes(self) -> DeviceAttributes: """ Get various device attributes. 
diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index d76ffa3631..6df72455d8 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -247,7 +247,6 @@ def test_device_attributes(): except system.NotSupportedError: skip_reasons.append(f"Device attributes not supported on '{device.name}'") continue - print("Attribute type:", type(attributes)) assert isinstance(attributes, system.DeviceAttributes) assert isinstance(attributes.multiprocessor_count, int) From 47a83ce756a96cd343a7cb53748e83790c049b0e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 13:57:50 -0500 Subject: [PATCH 08/11] Fix creating new device --- cuda_core/cuda/core/system/_device.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 0e3247cbc6..6df92b4ba8 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -611,6 +611,7 @@ cdef class Device: Iterator of Device An iterator over available devices. 
""" + cdef Device device for handle in nvml.system_get_topology_gpu_set(cpu_index): device = Device.__new__(Device) device._handle = handle From 3cfdabba42d14f1bb78d8109160a7492f94779ca Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 14:31:32 -0500 Subject: [PATCH 09/11] Fix iterator --- cuda_core/cuda/core/system/_device.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 6df92b4ba8..3434a2ab32 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -615,7 +615,7 @@ cdef class Device: for handle in nvml.system_get_topology_gpu_set(cpu_index): device = Device.__new__(Device) device._handle = handle - return device + yield device @property def architecture(self) -> DeviceArchitecture: From dd3118a953d9cf282e2fd64d544e6856b506990e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 15:07:58 -0500 Subject: [PATCH 10/11] Fix affinity test --- cuda_core/tests/system/test_system_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 6df72455d8..2c6788ff45 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -346,7 +346,7 @@ def test_get_all_devices_with_cpu_affinity(): for device in system.Device.get_all_devices_with_cpu_affinity(i): affinity = device.cpu_affinity assert isinstance(affinity, list) - assert {i} == set(affinity) + assert i in affinity except system.NotSupportedError: pytest.skip("Getting devices with CPU affinity not supported") From 229366074a45ff6415e834dae013ec2911d407b3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 18:19:15 -0500 Subject: [PATCH 11/11] Fix nearest GPUs --- cuda_core/cuda/core/system/_device.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 3434a2ab32..856c840530 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -839,8 +839,11 @@ cdef class Device: Iterable of :class:`Device` The nearest devices at the given topology level. """ + cdef Device device for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): - yield Device(handle=handle) + device = Device.__new__(Device) + device._handle = handle + yield device @property def attributes(self) -> DeviceAttributes: