From d54034fad1e114c4d3201b9089bc806a07c125d6 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 9 Dec 2025 21:08:30 -0500 Subject: [PATCH 1/2] Add new NVML APIs in CTK 13.1 --- .../cuda/bindings/_internal/_nvml.pxd | 5 +- .../cuda/bindings/_internal/_nvml_linux.pyx | 65 +- .../cuda/bindings/_internal/_nvml_windows.pyx | 53 +- cuda_bindings/cuda/bindings/_nvml.pxd | 9 +- cuda_bindings/cuda/bindings/_nvml.pyx | 780 +++++++++++++++++- cuda_bindings/cuda/bindings/cy_nvml.pxd | 56 +- cuda_bindings/cuda/bindings/cy_nvml.pyx | 14 +- 7 files changed, 972 insertions(+), 10 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/_nvml.pxd b/cuda_bindings/cuda/bindings/_internal/_nvml.pxd index c394c4910e..298a1a72f9 100644 --- a/cuda_bindings/cuda/bindings/_internal/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_internal/_nvml.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. 
from ..cy_nvml cimport * @@ -358,3 +358,6 @@ cdef nvmlReturn_t _nvmlDeviceGetNvLinkInfo(nvmlDevice_t device, nvmlNvLinkInfo_t cdef nvmlReturn_t _nvmlDeviceReadWritePRM_v1(nvmlDevice_t device, nvmlPRMTLV_v1_t* buffer) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil cdef nvmlReturn_t _nvmlDeviceGetGpuInstanceProfileInfoByIdV(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstanceProfileInfo_v2_t* info) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil cdef nvmlReturn_t _nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t device, nvmlEccSramUniqueUncorrectedErrorCounts_t* errorCounts) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t _nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t _nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil diff --git a/cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx b/cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx index 1622cc82aa..9031f6f7fe 100644 --- a/cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uintptr_t @@ -407,6 +407,9 @@ cdef void* __nvmlDeviceGetNvLinkInfo = NULL cdef void* __nvmlDeviceReadWritePRM_v1 = NULL cdef void* __nvmlDeviceGetGpuInstanceProfileInfoByIdV = NULL cdef void* __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = NULL +cdef void* __nvmlDeviceGetUnrepairableMemoryFlag_v1 = NULL +cdef void* __nvmlDeviceReadPRMCounters_v1 = NULL +cdef void* __nvmlDeviceSetRusdSettings_v1 = NULL cdef void* load_library() except* with gil: @@ -2852,6 +2855,27 @@ cdef int _init_nvml() except -1 nogil: handle = load_library() __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = dlsym(handle, 'nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts') + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + __nvmlDeviceGetUnrepairableMemoryFlag_v1 = dlsym(RTLD_DEFAULT, 'nvmlDeviceGetUnrepairableMemoryFlag_v1') + if __nvmlDeviceGetUnrepairableMemoryFlag_v1 == NULL: + if handle == NULL: + handle = load_library() + __nvmlDeviceGetUnrepairableMemoryFlag_v1 = dlsym(handle, 'nvmlDeviceGetUnrepairableMemoryFlag_v1') + + global __nvmlDeviceReadPRMCounters_v1 + __nvmlDeviceReadPRMCounters_v1 = dlsym(RTLD_DEFAULT, 'nvmlDeviceReadPRMCounters_v1') + if __nvmlDeviceReadPRMCounters_v1 == NULL: + if handle == NULL: + handle = load_library() + __nvmlDeviceReadPRMCounters_v1 = dlsym(handle, 'nvmlDeviceReadPRMCounters_v1') + + global __nvmlDeviceSetRusdSettings_v1 + __nvmlDeviceSetRusdSettings_v1 = dlsym(RTLD_DEFAULT, 'nvmlDeviceSetRusdSettings_v1') + if __nvmlDeviceSetRusdSettings_v1 == NULL: + if handle == NULL: + handle = load_library() + __nvmlDeviceSetRusdSettings_v1 = dlsym(handle, 'nvmlDeviceSetRusdSettings_v1') + __py_nvml_init = True return 0 @@ -3915,6 +3939,15 @@ cpdef dict _inspect_function_pointers(): global __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts data["__nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts"] = __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + 
data["__nvmlDeviceGetUnrepairableMemoryFlag_v1"] = __nvmlDeviceGetUnrepairableMemoryFlag_v1 + + global __nvmlDeviceReadPRMCounters_v1 + data["__nvmlDeviceReadPRMCounters_v1"] = __nvmlDeviceReadPRMCounters_v1 + + global __nvmlDeviceSetRusdSettings_v1 + data["__nvmlDeviceSetRusdSettings_v1"] = __nvmlDeviceSetRusdSettings_v1 + func_ptrs = data return data @@ -7398,3 +7431,33 @@ cdef nvmlReturn_t _nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t raise FunctionNotFoundError("function nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts is not found") return (__nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts)( device, errorCounts) + + +cdef nvmlReturn_t _nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + _check_or_init_nvml() + if __nvmlDeviceGetUnrepairableMemoryFlag_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceGetUnrepairableMemoryFlag_v1 is not found") + return (__nvmlDeviceGetUnrepairableMemoryFlag_v1)( + device, unrepairableMemoryStatus) + + +cdef nvmlReturn_t _nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceReadPRMCounters_v1 + _check_or_init_nvml() + if __nvmlDeviceReadPRMCounters_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceReadPRMCounters_v1 is not found") + return (__nvmlDeviceReadPRMCounters_v1)( + device, counterList) + + +cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceSetRusdSettings_v1 + _check_or_init_nvml() + if __nvmlDeviceSetRusdSettings_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceSetRusdSettings_v1 is not found") + return 
(__nvmlDeviceSetRusdSettings_v1)( + device, settings) diff --git a/cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx b/cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx index 0b260d8b87..d8a9be4c48 100644 --- a/cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -424,6 +424,9 @@ cdef void* __nvmlDeviceGetNvLinkInfo = NULL cdef void* __nvmlDeviceReadWritePRM_v1 = NULL cdef void* __nvmlDeviceGetGpuInstanceProfileInfoByIdV = NULL cdef void* __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = NULL +cdef void* __nvmlDeviceGetUnrepairableMemoryFlag_v1 = NULL +cdef void* __nvmlDeviceReadPRMCounters_v1 = NULL +cdef void* __nvmlDeviceSetRusdSettings_v1 = NULL cdef uintptr_t load_library() except* with gil: @@ -1506,6 +1509,15 @@ cdef int _init_nvml() except -1 nogil: global __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = GetProcAddress(handle, 'nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts') + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + __nvmlDeviceGetUnrepairableMemoryFlag_v1 = GetProcAddress(handle, 'nvmlDeviceGetUnrepairableMemoryFlag_v1') + + global __nvmlDeviceReadPRMCounters_v1 + __nvmlDeviceReadPRMCounters_v1 = GetProcAddress(handle, 'nvmlDeviceReadPRMCounters_v1') + + global __nvmlDeviceSetRusdSettings_v1 + __nvmlDeviceSetRusdSettings_v1 = GetProcAddress(handle, 'nvmlDeviceSetRusdSettings_v1') + __py_nvml_init = True return 0 @@ -2569,6 +2581,15 @@ cpdef dict _inspect_function_pointers(): global __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts 
data["__nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts"] = __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + data["__nvmlDeviceGetUnrepairableMemoryFlag_v1"] = __nvmlDeviceGetUnrepairableMemoryFlag_v1 + + global __nvmlDeviceReadPRMCounters_v1 + data["__nvmlDeviceReadPRMCounters_v1"] = __nvmlDeviceReadPRMCounters_v1 + + global __nvmlDeviceSetRusdSettings_v1 + data["__nvmlDeviceSetRusdSettings_v1"] = __nvmlDeviceSetRusdSettings_v1 + func_ptrs = data return data @@ -6052,3 +6073,33 @@ cdef nvmlReturn_t _nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t raise FunctionNotFoundError("function nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts is not found") return (__nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts)( device, errorCounts) + + +cdef nvmlReturn_t _nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceGetUnrepairableMemoryFlag_v1 + _check_or_init_nvml() + if __nvmlDeviceGetUnrepairableMemoryFlag_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceGetUnrepairableMemoryFlag_v1 is not found") + return (__nvmlDeviceGetUnrepairableMemoryFlag_v1)( + device, unrepairableMemoryStatus) + + +cdef nvmlReturn_t _nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceReadPRMCounters_v1 + _check_or_init_nvml() + if __nvmlDeviceReadPRMCounters_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceReadPRMCounters_v1 is not found") + return (__nvmlDeviceReadPRMCounters_v1)( + device, counterList) + + +cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + global __nvmlDeviceSetRusdSettings_v1 + _check_or_init_nvml() + if 
__nvmlDeviceSetRusdSettings_v1 == NULL: + with gil: + raise FunctionNotFoundError("function nvmlDeviceSetRusdSettings_v1 is not found") + return (__nvmlDeviceSetRusdSettings_v1)( + device, settings) diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index dfdbd72f8b..da9fc07f6e 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -41,6 +41,7 @@ ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 ctypedef nvmlHostname_v1_t Hostname_v1 ctypedef nvmlNvLinkInfo_v1_t NvLinkInfo_v1 +ctypedef nvmlPRMCounterInput_v1_t PRMCounterInput_v1 ctypedef nvmlPowerValue_v2_t PowerValue_v2 ctypedef nvmlVgpuProcessUtilizationSample_t VgpuProcessUtilizationSample ctypedef nvmlGpuFabricInfo_t GpuFabricInfo @@ -55,6 +56,7 @@ ctypedef nvmlGpmMetric_t GpmMetric ctypedef nvmlWorkloadPowerProfileInfo_v1_t WorkloadPowerProfileInfo_v1 ctypedef nvmlWorkloadPowerProfileCurrentProfiles_v1_t WorkloadPowerProfileCurrentProfiles_v1 ctypedef nvmlWorkloadPowerProfileRequestedProfiles_v1_t WorkloadPowerProfileRequestedProfiles_v1 +ctypedef nvmlWorkloadPowerProfileUpdateProfiles_v1_t WorkloadPowerProfileUpdateProfiles_v1 ctypedef nvmlPRMTLV_v1_t PRMTLV_v1 ctypedef nvmlVgpuSchedulerSetState_t VgpuSchedulerSetState ctypedef nvmlGpmMetricsGet_t GpmMetricsGet @@ -123,6 +125,8 @@ ctypedef nvmlVgpuPgpuCompatibilityLimitCode_t _VgpuPgpuCompatibilityLimitCode ctypedef nvmlGpmMetricId_t _GpmMetricId ctypedef nvmlPowerProfileType_t _PowerProfileType ctypedef nvmlDeviceAddressingModeType_t _DeviceAddressingModeType +ctypedef nvmlPRMCounterId_t _PRMCounterId 
+ctypedef nvmlPowerProfileOperation_t _PowerProfileOperation ############################################################################### @@ -445,3 +449,6 @@ cpdef object device_get_pdi(intptr_t device) cpdef object device_get_nvlink_info(intptr_t device) cpdef device_read_write_prm_v1(intptr_t device, intptr_t buffer) cpdef object device_get_gpu_instance_profile_info_by_id_v(intptr_t device, unsigned int profile_id) +cpdef object device_get_unrepairable_memory_flag_v1(intptr_t device) +cpdef device_read_prm_counters_v1(intptr_t device, intptr_t counter_list) +cpdef device_set_rusd_settings_v1(intptr_t device, intptr_t settings) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 623ee68e74..fa151c0886 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. 
cimport cython # NOQA from cython cimport view @@ -754,6 +754,31 @@ class DeviceAddressingModeType(_IntEnum): DEVICE_ADDRESSING_MODE_HMM = NVML_DEVICE_ADDRESSING_MODE_HMM DEVICE_ADDRESSING_MODE_ATS = NVML_DEVICE_ADDRESSING_MODE_ATS +class PRMCounterId(_IntEnum): + """See `nvmlPRMCounterId_t`.""" + NONE = NVML_PRM_COUNTER_ID_NONE + PPCNT_PHYSICAL_LAYER_CTRS_LINK_DOWN_EVENTS = NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_LINK_DOWN_EVENTS + PPCNT_PHYSICAL_LAYER_CTRS_SUCCESSFUL_RECOVERY_EVENTS = NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_SUCCESSFUL_RECOVERY_EVENTS + PPCNT_RECOVERY_CTRS_TOTAL_SUCCESSFUL_RECOVERY_EVENTS = NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TOTAL_SUCCESSFUL_RECOVERY_EVENTS + PPCNT_RECOVERY_CTRS_TIME_SINCE_LAST_RECOVERY = NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_SINCE_LAST_RECOVERY + PPCNT_RECOVERY_CTRS_TIME_BETWEEN_LAST_TWO_RECOVERIES = NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_BETWEEN_LAST_TWO_RECOVERIES + PPCNT_PORTCOUNTERS_PORT_XMIT_WAIT = NVML_PRM_COUNTER_ID_PPCNT_PORTCOUNTERS_PORT_XMIT_WAIT + PPCNT_PLR_RCV_CODES = NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODES + PPCNT_PLR_RCV_CODE_ERR = NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODE_ERR + PPCNT_PLR_RCV_UNCORRECTABLE_CODE = NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_UNCORRECTABLE_CODE + PPCNT_PLR_XMIT_CODES = NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_CODES + PPCNT_PLR_XMIT_RETRY_CODES = NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_CODES + PPCNT_PLR_XMIT_RETRY_EVENTS = NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_EVENTS + PPCNT_PLR_SYNC_EVENTS = NVML_PRM_COUNTER_ID_PPCNT_PLR_SYNC_EVENTS + PPRM_OPER_RECOVERY = NVML_PRM_COUNTER_ID_PPRM_OPER_RECOVERY + +class PowerProfileOperation(_IntEnum): + """See `nvmlPowerProfileOperation_t`.""" + CLEAR = NVML_POWER_PROFILE_OPERATION_CLEAR + SET = NVML_POWER_PROFILE_OPERATION_SET + SET_AND_OVERWRITE = NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE + MAX = NVML_POWER_PROFILE_OPERATION_MAX + class AffinityScope(_IntEnum): NODE = 0 # Scope of NUMA node for affinity queries @@ -1128,12 
+1153,40 @@ class FI(_IntEnum): DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN = 271 # Throttling due to external power brake assertion trigger (reducing core clocks by a factor of 2 or more) in ns DEV_POWER_SYNC_BALANCING_FREQ = 272 # Accumulated frequency of the GPU to be used for averaging DEV_POWER_SYNC_BALANCING_AF = 273 # Accumulated activity factor of the GPU to be used for averaging - MAX = 274 # One greater than the largest field ID defined above - + DEV_EDPP_MULTIPLIER = 274 # EDPp multiplier expressed as a percentage + + PWR_SMOOTHING_PRIMARY_POWER_FLOOR = 275 # Current primary power floor value in Watts + PWR_SMOOTHING_SECONDARY_POWER_FLOOR = 276 # Current secondary power floor value in Watts + PWR_SMOOTHING_MIN_PRIMARY_FLOOR_ACT_OFFSET = 277 # Minimum primary floor activation offset value in Watts + PWR_SMOOTHING_MIN_PRIMARY_FLOOR_ACT_POINT = 278 # Minimum primary floor activation point value in Watts + PWR_SMOOTHING_WINDOW_MULTIPLIER = 279 # Window Multiplier value in ms + PWR_SMOOTHING_DELAYED_PWR_SMOOTHING_SUPPORTED = 280 # Support (0/Not Supported or 1/Supported) for delayed power smoothing + PWR_SMOOTHING_PROFILE_SECONDARY_POWER_FLOOR = 281 # Current secondary power floor value in Watts for a given profile + PWR_SMOOTHING_PROFILE_PRIMARY_FLOOR_ACT_WIN_MULT = 282 # Current primary floor activation window multiplier value for a given profile + PWR_SMOOTHING_PROFILE_PRIMARY_FLOOR_TAR_WIN_MULT = 283 # Current primary floor target window multiplier value for a given profile + PWR_SMOOTHING_PROFILE_PRIMARY_FLOOR_ACT_OFFSET = 284 # Current primary floor activation offset value in Watts for a given profile + PWR_SMOOTHING_ADMIN_OVERRIDE_SECONDARY_POWER_FLOOR = 285 # Current secondary power floor value in Watts for admin override + PWR_SMOOTHING_ADMIN_OVERRIDE_PRIMARY_FLOOR_ACT_WIN_MULT = 286 # Current primary floor activation window multiplier value for admin override + PWR_SMOOTHING_ADMIN_OVERRIDE_PRIMARY_FLOOR_TAR_WIN_MULT = 287 # Current primary floor 
target window multiplier value for admin override + PWR_SMOOTHING_ADMIN_OVERRIDE_PRIMARY_FLOOR_ACT_OFFSET = 288 # Current primary floor activation offset value in Watts for admin override + + MAX = 289 # One greater than the largest field ID defined above NVLINK_MAX_LINKS = 18 +class RUSD(_IntEnum): + POLL_NONE = 0x0 # Disable RUSD polling on all metric groups + POLL_CLOCK = 0x1 # Enable RUSD polling on clock group + POLL_PERF = 0x2 # Enable RUSD polling on performance group + POLL_MEMORY = 0x4 # Enable RUSD polling on memory group + POLL_POWER = 0x8 # Enable RUSD polling on power group + POLL_THERMAL = 0x10 # Enable RUSD polling on thermal group + POLL_PCI = 0x20 # Enable RUSD polling on pci group + POLL_FAN = 0x40 # Enable RUSD polling on fan group + POLL_PROC_UTIL = 0x80 # Enable RUSD polling on process utilization group + + ############################################################################### # Error handling ############################################################################### @@ -14992,6 +15045,258 @@ cdef class NvlinkFirmwareVersion: return obj +cdef _get_unrepairable_memory_status_v1_dtype_offsets(): + cdef nvmlUnrepairableMemoryStatus_v1_t pod = nvmlUnrepairableMemoryStatus_v1_t() + return _numpy.dtype({ + 'names': ['b_unrepairable_memory'], + 'formats': [_numpy.uint32], + 'offsets': [ + (&(pod.bUnrepairableMemory)) - (&pod), + ], + 'itemsize': sizeof(nvmlUnrepairableMemoryStatus_v1_t), + }) + +unrepairable_memory_status_v1_dtype = _get_unrepairable_memory_status_v1_dtype_offsets() + +cdef class UnrepairableMemoryStatus_v1: + """Empty-initialize an instance of `nvmlUnrepairableMemoryStatus_v1_t`. + + + .. 
seealso:: `nvmlUnrepairableMemoryStatus_v1_t` + """ + cdef: + nvmlUnrepairableMemoryStatus_v1_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(nvmlUnrepairableMemoryStatus_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating UnrepairableMemoryStatus_v1") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef nvmlUnrepairableMemoryStatus_v1_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.UnrepairableMemoryStatus_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef UnrepairableMemoryStatus_v1 other_ + if not isinstance(other, UnrepairableMemoryStatus_v1): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlUnrepairableMemoryStatus_v1_t)) == 0) + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(nvmlUnrepairableMemoryStatus_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating UnrepairableMemoryStatus_v1") + memcpy(self._ptr, val.ctypes.data, sizeof(nvmlUnrepairableMemoryStatus_v1_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def b_unrepairable_memory(self): + """int: Reference to `unsigned` int.""" + return self._ptr[0].bUnrepairableMemory + + @b_unrepairable_memory.setter + def b_unrepairable_memory(self, val): + if self._readonly: + raise ValueError("This UnrepairableMemoryStatus_v1 instance is read-only") + self._ptr[0].bUnrepairableMemory = val + + @staticmethod + def from_data(data): + """Create an 
UnrepairableMemoryStatus_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `unrepairable_memory_status_v1_dtype` holding the data. + """ + return __from_data(data, "unrepairable_memory_status_v1_dtype", unrepairable_memory_status_v1_dtype, UnrepairableMemoryStatus_v1) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an UnrepairableMemoryStatus_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef UnrepairableMemoryStatus_v1 obj = UnrepairableMemoryStatus_v1.__new__(UnrepairableMemoryStatus_v1) + if owner is None: + obj._ptr = malloc(sizeof(nvmlUnrepairableMemoryStatus_v1_t)) + if obj._ptr == NULL: + raise MemoryError("Error allocating UnrepairableMemoryStatus_v1") + memcpy((obj._ptr), ptr, sizeof(nvmlUnrepairableMemoryStatus_v1_t)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + +cdef _get_rusd_settings_v1_dtype_offsets(): + cdef nvmlRusdSettings_v1_t pod = nvmlRusdSettings_v1_t() + return _numpy.dtype({ + 'names': ['version', 'poll_mask'], + 'formats': [_numpy.uint32, _numpy.uint64], + 'offsets': [ + (&(pod.version)) - (&pod), + (&(pod.pollMask)) - (&pod), + ], + 'itemsize': sizeof(nvmlRusdSettings_v1_t), + }) + +rusd_settings_v1_dtype = _get_rusd_settings_v1_dtype_offsets() + +cdef class RusdSettings_v1: + """Empty-initialize an instance of `nvmlRusdSettings_v1_t`. + + + .. 
seealso:: `nvmlRusdSettings_v1_t` + """ + cdef: + nvmlRusdSettings_v1_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(nvmlRusdSettings_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating RusdSettings_v1") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef nvmlRusdSettings_v1_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.RusdSettings_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef RusdSettings_v1 other_ + if not isinstance(other, RusdSettings_v1): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlRusdSettings_v1_t)) == 0) + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(nvmlRusdSettings_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating RusdSettings_v1") + memcpy(self._ptr, val.ctypes.data, sizeof(nvmlRusdSettings_v1_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def version(self): + """int: """ + return self._ptr[0].version + + @version.setter + def version(self, val): + if self._readonly: + raise ValueError("This RusdSettings_v1 instance is read-only") + self._ptr[0].version = val + + @property + def poll_mask(self): + """int: Bitmask of polling data. 
0 value means the GPU's RUSD polling mask is cleared.""" + return self._ptr[0].pollMask + + @poll_mask.setter + def poll_mask(self, val): + if self._readonly: + raise ValueError("This RusdSettings_v1 instance is read-only") + self._ptr[0].pollMask = val + + @staticmethod + def from_data(data): + """Create an RusdSettings_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `rusd_settings_v1_dtype` holding the data. + """ + return __from_data(data, "rusd_settings_v1_dtype", rusd_settings_v1_dtype, RusdSettings_v1) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an RusdSettings_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. + """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef RusdSettings_v1 obj = RusdSettings_v1.__new__(RusdSettings_v1) + if owner is None: + obj._ptr = malloc(sizeof(nvmlRusdSettings_v1_t)) + if obj._ptr == NULL: + raise MemoryError("Error allocating RusdSettings_v1") + memcpy((obj._ptr), ptr, sizeof(nvmlRusdSettings_v1_t)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + cdef _get_excluded_device_info_dtype_offsets(): cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t() return _numpy.dtype({ @@ -15971,6 +16276,151 @@ cdef class FieldValue: return obj +cdef _get_prm_counter_value_v1_dtype_offsets(): + cdef nvmlPRMCounterValue_v1_t pod = nvmlPRMCounterValue_v1_t() + return _numpy.dtype({ + 'names': ['status', 'output_type', 'output_value'], + 'formats': [_numpy.int32, _numpy.int32, value_dtype], + 'offsets': [ + (&(pod.status)) - (&pod), + (&(pod.outputType)) - 
(&pod), + (&(pod.outputValue)) - (&pod), + ], + 'itemsize': sizeof(nvmlPRMCounterValue_v1_t), + }) + +prm_counter_value_v1_dtype = _get_prm_counter_value_v1_dtype_offsets() + +cdef class PRMCounterValue_v1: + """Empty-initialize an instance of `nvmlPRMCounterValue_v1_t`. + + + .. seealso:: `nvmlPRMCounterValue_v1_t` + """ + cdef: + nvmlPRMCounterValue_v1_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(nvmlPRMCounterValue_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating PRMCounterValue_v1") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef nvmlPRMCounterValue_v1_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.PRMCounterValue_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef PRMCounterValue_v1 other_ + if not isinstance(other, PRMCounterValue_v1): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlPRMCounterValue_v1_t)) == 0) + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(nvmlPRMCounterValue_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating PRMCounterValue_v1") + memcpy(self._ptr, val.ctypes.data, sizeof(nvmlPRMCounterValue_v1_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def output_value(self): + """Value: Output value.""" + return Value.from_ptr(&(self._ptr[0].outputValue), self._readonly, self) + + @output_value.setter + def output_value(self, val): + if self._readonly: + raise ValueError("This 
PRMCounterValue_v1 instance is read-only") + cdef Value val_ = val + memcpy(&(self._ptr[0].outputValue), (val_._get_ptr()), sizeof(nvmlValue_t) * 1) + + @property + def status(self): + """int: Status of the PRM counter read.""" + return (self._ptr[0].status) + + @status.setter + def status(self, val): + if self._readonly: + raise ValueError("This PRMCounterValue_v1 instance is read-only") + self._ptr[0].status = val + + @property + def output_type(self): + """int: Output value type.""" + return (self._ptr[0].outputType) + + @output_type.setter + def output_type(self, val): + if self._readonly: + raise ValueError("This PRMCounterValue_v1 instance is read-only") + self._ptr[0].outputType = val + + @staticmethod + def from_data(data): + """Create an PRMCounterValue_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `prm_counter_value_v1_dtype` holding the data. + """ + return __from_data(data, "prm_counter_value_v1_dtype", prm_counter_value_v1_dtype, PRMCounterValue_v1) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an PRMCounterValue_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef PRMCounterValue_v1 obj = PRMCounterValue_v1.__new__(PRMCounterValue_v1) + if owner is None: + obj._ptr = malloc(sizeof(nvmlPRMCounterValue_v1_t)) + if obj._ptr == NULL: + raise MemoryError("Error allocating PRMCounterValue_v1") + memcpy((obj._ptr), ptr, sizeof(nvmlPRMCounterValue_v1_t)) + obj._owner = None + obj._owned = True + else: + obj._ptr = ptr + obj._owner = owner + obj._owned = False + obj._readonly = readonly + return obj + + cdef _get_gpu_thermal_settings_dtype_offsets(): cdef nvmlGpuThermalSettings_t pod = nvmlGpuThermalSettings_t() return _numpy.dtype({ @@ -18389,6 +18839,152 @@ cdef class VgpuInstancesUtilizationInfo_v1: return obj +cdef _get_prm_counter_v1_dtype_offsets(): + cdef nvmlPRMCounter_v1_t pod = nvmlPRMCounter_v1_t() + return _numpy.dtype({ + 'names': ['counter_id', 'in_data', 'counter_value'], + 'formats': [_numpy.uint32, prm_counter_input_v1_dtype, prm_counter_value_v1_dtype], + 'offsets': [ + (&(pod.counterId)) - (&pod), + (&(pod.inData)) - (&pod), + (&(pod.counterValue)) - (&pod), + ], + 'itemsize': sizeof(nvmlPRMCounter_v1_t), + }) + +prm_counter_v1_dtype = _get_prm_counter_v1_dtype_offsets() + +cdef class PRMCounter_v1: + """Empty-initialize an instance of `nvmlPRMCounter_v1_t`. + + + .. 
seealso:: `nvmlPRMCounter_v1_t` + """ + cdef: + nvmlPRMCounter_v1_t *_ptr + object _owner + bint _owned + bint _readonly + + def __init__(self): + self._ptr = calloc(1, sizeof(nvmlPRMCounter_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating PRMCounter_v1") + self._owner = None + self._owned = True + self._readonly = False + + def __dealloc__(self): + cdef nvmlPRMCounter_v1_t *ptr + if self._owned and self._ptr != NULL: + ptr = self._ptr + self._ptr = NULL + free(ptr) + + def __repr__(self): + return f"<{__name__}.PRMCounter_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return (self._ptr) + + cdef intptr_t _get_ptr(self): + return (self._ptr) + + def __int__(self): + return (self._ptr) + + def __eq__(self, other): + cdef PRMCounter_v1 other_ + if not isinstance(other, PRMCounter_v1): + return False + other_ = other + return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlPRMCounter_v1_t)) == 0) + + def __setitem__(self, key, val): + if key == 0 and isinstance(val, _numpy.ndarray): + self._ptr = malloc(sizeof(nvmlPRMCounter_v1_t)) + if self._ptr == NULL: + raise MemoryError("Error allocating PRMCounter_v1") + memcpy(self._ptr, val.ctypes.data, sizeof(nvmlPRMCounter_v1_t)) + self._owner = None + self._owned = True + self._readonly = not val.flags.writeable + else: + setattr(self, key, val) + + @property + def in_data(self): + """PRMCounterInput_v1: PRM input values.""" + return PRMCounterInput_v1.from_ptr(&(self._ptr[0].inData), self._readonly, self) + + @in_data.setter + def in_data(self, val): + if self._readonly: + raise ValueError("This PRMCounter_v1 instance is read-only") + cdef PRMCounterInput_v1 val_ = val + memcpy(&(self._ptr[0].inData), (val_._get_ptr()), sizeof(nvmlPRMCounterInput_v1_t) * 1) + + @property + def counter_value(self): + """PRMCounterValue_v1: Counter value.""" + return PRMCounterValue_v1.from_ptr(&(self._ptr[0].counterValue), self._readonly, 
self) + + @counter_value.setter + def counter_value(self, val): + if self._readonly: + raise ValueError("This PRMCounter_v1 instance is read-only") + cdef PRMCounterValue_v1 val_ = val + memcpy(&(self._ptr[0].counterValue), (val_._get_ptr()), sizeof(nvmlPRMCounterValue_v1_t) * 1) + + @property + def counter_id(self): + """int: Counter ID, one of nvmlPRMCounterId_t.""" + return self._ptr[0].counterId + + @counter_id.setter + def counter_id(self, val): + if self._readonly: + raise ValueError("This PRMCounter_v1 instance is read-only") + self._ptr[0].counterId = val + + @staticmethod + def from_data(data): + """Create an PRMCounter_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a single-element array of dtype `prm_counter_v1_dtype` holding the data. + """ + return __from_data(data, "prm_counter_v1_dtype", prm_counter_v1_dtype, PRMCounter_v1) + + @staticmethod + def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): + """Create an PRMCounter_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + owner (object): The Python object that owns the pointer. If not provided, data will be copied. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef PRMCounter_v1 obj = PRMCounter_v1.__new__(PRMCounter_v1)
+        if owner is None:
+            obj._ptr = malloc(sizeof(nvmlPRMCounter_v1_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating PRMCounter_v1")
+            memcpy((obj._ptr), ptr, sizeof(nvmlPRMCounter_v1_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        return obj
+
+
 cdef _get_vgpu_scheduler_log_dtype_offsets():
     cdef nvmlVgpuSchedulerLog_t pod = nvmlVgpuSchedulerLog_t()
     return _numpy.dtype({
@@ -19544,6 +20140,174 @@ cdef class NvLinkInfo_v2:
         return obj
 
 
+cdef _get_prm_counter_list_v1_dtype_offsets():
+    """Build the NumPy structured dtype that mirrors `nvmlPRMCounterList_v1_t`.
+
+    Offsets and the item size are measured on an instance of the C struct.
+    """
+    cdef nvmlPRMCounterList_v1_t pod = nvmlPRMCounterList_v1_t()
+    return _numpy.dtype({
+        'names': ['num_counters', 'counters'],
+        # `counters` is a C pointer, hence the pointer-sized integer format.
+        'formats': [_numpy.uint32, _numpy.intp],
+        'offsets': [
+            (<intptr_t>&(pod.numCounters)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.counters)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlPRMCounterList_v1_t),
+    })
+
+prm_counter_list_v1_dtype = _get_prm_counter_list_v1_dtype_offsets()
+
+cdef class PRMCounterList_v1:
+    """Empty-initialize an instance of `nvmlPRMCounterList_v1_t`.
+
+    The wrapper either owns a heap copy of the struct (released in
+    ``__dealloc__``) or borrows memory that ``_owner`` keeps alive.
+
+    .. seealso:: `nvmlPRMCounterList_v1_t`
+    """
+    cdef:
+        nvmlPRMCounterList_v1_t *_ptr  # the wrapped struct
+        object _owner                  # keeps borrowed memory alive (None when owned)
+        bint _owned                    # True when `_ptr` must be freed by this object
+        bint _readonly                 # True makes the property setters raise
+        dict _refs                     # objects whose memory the struct points into
+
+    def __init__(self):
+        self._ptr = <nvmlPRMCounterList_v1_t *>calloc(1, sizeof(nvmlPRMCounterList_v1_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating PRMCounterList_v1")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+        self._refs = {}
+
+    def __dealloc__(self):
+        cdef nvmlPRMCounterList_v1_t *ptr
+        if self._owned and self._ptr != NULL:
+            # Clear the member first so the freed address is never left behind.
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.PRMCounterList_v1 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef PRMCounterList_v1 other_
+        # Byte-wise comparison: the `counters` member compares by address, not by content.
+        if not isinstance(other, PRMCounterList_v1):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPRMCounterList_v1_t)) == 0)
+
+    def __setitem__(self, key, val):
+        cdef nvmlPRMCounterList_v1_t *new_ptr
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            new_ptr = <nvmlPRMCounterList_v1_t *>malloc(sizeof(nvmlPRMCounterList_v1_t))
+            if new_ptr == NULL:
+                raise MemoryError("Error allocating PRMCounterList_v1")
+            memcpy(<void *>new_ptr, <void *><intptr_t>val.ctypes.data, sizeof(nvmlPRMCounterList_v1_t))
+            # Free the struct owned so far; overwriting `_ptr` directly leaked it.
+            if self._owned and self._ptr != NULL:
+                free(self._ptr)
+            self._ptr = new_ptr
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def counters(self):
+        """list of PRMCounter_v1: Views of the entries in the PRM counter array (empty list if unset)."""
+        cdef unsigned int count = self._ptr[0].numCounters
+        cdef unsigned int i
+        if self._ptr[0].counters == NULL or count == 0:
+            return []
+        # `PRMCounter_v1.from_ptr` wraps one struct and takes `readonly` as its
+        # second argument, so the element count must not be passed there.
+        # Build one view per element; passing an owner makes it borrow, not copy.
+        views = []
+        for i in range(count):
+            views.append(PRMCounter_v1.from_ptr(<intptr_t>&(self._ptr[0].counters[i]), self._readonly, self))
+        return views
+
+    @counters.setter
+    def counters(self, val):
+        cdef PRMCounter_v1 counter
+        if self._readonly:
+            raise ValueError("This PRMCounterList_v1 instance is read-only")
+        if isinstance(val, _numpy.ndarray):
+            # A contiguous array of `prm_counter_v1_dtype` records supplies N counters.
+            if val.dtype != prm_counter_v1_dtype:
+                raise ValueError("counters array must be of dtype prm_counter_v1_dtype")
+            if not (val.flags.c_contiguous and val.flags.writeable):
+                raise ValueError("counters array must be C-contiguous and writeable")
+            self._ptr[0].counters = <nvmlPRMCounter_v1_t *><intptr_t>val.ctypes.data
+            self._ptr[0].numCounters = val.size
+        else:
+            # A PRMCounter_v1 wraps a single struct, so it contributes exactly one counter.
+            counter = val
+            self._ptr[0].counters = <nvmlPRMCounter_v1_t *><intptr_t>(counter._get_ptr())
+            self._ptr[0].numCounters = 1
+        # Keep the backing object alive for as long as the struct points at it.
+        self._refs["counters"] = val
+
+    @staticmethod
+    def from_data(data):
+        """Create a PRMCounterList_v1 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a single-element array of dtype `prm_counter_list_v1_dtype` holding the data.
+        """
+        return __from_data(data, "prm_counter_list_v1_dtype", prm_counter_list_v1_dtype, PRMCounterList_v1)
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
+        """Create a PRMCounterList_v1 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+            owner (object): The Python object that owns the pointer. If not provided, data will be copied.
+
+        Raises:
+            ValueError: if `ptr` is 0.
+            MemoryError: if the private copy cannot be allocated.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef PRMCounterList_v1 obj = PRMCounterList_v1.__new__(PRMCounterList_v1)
+        if owner is None:
+            # Shallow copy: the copied `counters` member still points at the source's array.
+            obj._ptr = <nvmlPRMCounterList_v1_t *>malloc(sizeof(nvmlPRMCounterList_v1_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating PRMCounterList_v1")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlPRMCounterList_v1_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = <nvmlPRMCounterList_v1_t *>ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        obj._refs = {}
+        return obj
+
+
 cpdef init_v2():
     """Initialize NVML, but don't initialize any GPUs yet.
 
@@ -21308,7 +22033,7 @@ cpdef tuple device_get_gpu_operation_mode(intptr_t device):
 
 
 cpdef object device_get_memory_info_v2(intptr_t device):
-    """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes.
+    """Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. The reserved amount is supported on version 2 only.
 
     Args:
         device (intptr_t): The identifier of the target device.
@@ -25231,6 +25956,53 @@ cpdef object device_get_gpu_instance_profile_info_by_id_v(intptr_t device, unsig
     return info_py
 
 
+cpdef object device_get_unrepairable_memory_flag_v1(intptr_t device):
+    """Get the unrepairable memory flag for a given GPU.
+
+    Args:
+        device (intptr_t): The identifier of the target device.
+
+    Returns:
+        UnrepairableMemoryStatus_v1: New wrapper object whose underlying ``nvmlUnrepairableMemoryStatus_v1_t`` was passed to NVML to fill in.
+
+    .. seealso:: `nvmlDeviceGetUnrepairableMemoryFlag_v1`
+    """
+    cdef UnrepairableMemoryStatus_v1 unrepairable_memory_status_py = UnrepairableMemoryStatus_v1()
+    cdef nvmlUnrepairableMemoryStatus_v1_t *unrepairable_memory_status = <nvmlUnrepairableMemoryStatus_v1_t *><intptr_t>(unrepairable_memory_status_py._get_ptr())
+    with nogil:
+        __status__ = nvmlDeviceGetUnrepairableMemoryFlag_v1(<nvmlDevice_t>device, unrepairable_memory_status)
+    check_status(__status__)
+    return unrepairable_memory_status_py
+
+
+cpdef device_read_prm_counters_v1(intptr_t device, intptr_t counter_list):
+    """Read a list of GPU PRM Counters.
+
+    Args:
+        device (intptr_t): Identifier of target GPU device.
+        counter_list (intptr_t): Address of the ``nvmlPRMCounterList_v1_t`` (for example ``PRMCounterList_v1.ptr``) holding the input parameters; the retrieved counter values are written into it.
+
+    .. seealso:: `nvmlDeviceReadPRMCounters_v1`
+    """
+    with nogil:
+        __status__ = nvmlDeviceReadPRMCounters_v1(<nvmlDevice_t>device, <nvmlPRMCounterList_v1_t *>counter_list)
+    check_status(__status__)
+
+
+cpdef device_set_rusd_settings_v1(intptr_t device, intptr_t settings):
+    """Set Read-only user shared data (RUSD) settings for GPU. Requires root/admin permissions.
+
+    Args:
+        device (intptr_t): The identifier of the target device.
+        settings (intptr_t): Address of an ``nvmlRusdSettings_v1_t`` struct.
+
+    .. seealso:: `nvmlDeviceSetRusdSettings_v1`
+    """
+    with nogil:
+        __status__ = nvmlDeviceSetRusdSettings_v1(<nvmlDevice_t>device, <nvmlRusdSettings_v1_t *>settings)
+    check_status(__status__)
+
+
 cpdef object system_get_topology_gpu_set(unsigned int cpuNumber):
     """Retrieve the set of GPUs that have a CPU affinity with the given CPU number
 
diff --git a/cuda_bindings/cuda/bindings/cy_nvml.pxd b/cuda_bindings/cuda/bindings/cy_nvml.pxd
index cde4578df3..6b302c4cdc 100644
--- a/cuda_bindings/cuda/bindings/cy_nvml.pxd
+++ b/cuda_bindings/cuda/bindings/cy_nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly.
 
 from libc.stdint cimport int64_t
 
@@ -670,6 +670,29 @@ ctypedef enum nvmlDeviceAddressingModeType_t "nvmlDeviceAddressingModeType_t":
     NVML_DEVICE_ADDRESSING_MODE_HMM "NVML_DEVICE_ADDRESSING_MODE_HMM" = 1
     NVML_DEVICE_ADDRESSING_MODE_ATS "NVML_DEVICE_ADDRESSING_MODE_ATS" = 2
 
+ctypedef enum nvmlPRMCounterId_t "nvmlPRMCounterId_t":
+    NVML_PRM_COUNTER_ID_NONE "NVML_PRM_COUNTER_ID_NONE" = 0
+    NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_LINK_DOWN_EVENTS "NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_LINK_DOWN_EVENTS" = 1
+    NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_SUCCESSFUL_RECOVERY_EVENTS "NVML_PRM_COUNTER_ID_PPCNT_PHYSICAL_LAYER_CTRS_SUCCESSFUL_RECOVERY_EVENTS" = 2
+    NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TOTAL_SUCCESSFUL_RECOVERY_EVENTS "NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TOTAL_SUCCESSFUL_RECOVERY_EVENTS" = 101
+    NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_SINCE_LAST_RECOVERY "NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_SINCE_LAST_RECOVERY" = 102
+    NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_BETWEEN_LAST_TWO_RECOVERIES "NVML_PRM_COUNTER_ID_PPCNT_RECOVERY_CTRS_TIME_BETWEEN_LAST_TWO_RECOVERIES" = 103
+
NVML_PRM_COUNTER_ID_PPCNT_PORTCOUNTERS_PORT_XMIT_WAIT "NVML_PRM_COUNTER_ID_PPCNT_PORTCOUNTERS_PORT_XMIT_WAIT" = 201 + NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODES "NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODES" = 301 + NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODE_ERR "NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_CODE_ERR" = 302 + NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_UNCORRECTABLE_CODE "NVML_PRM_COUNTER_ID_PPCNT_PLR_RCV_UNCORRECTABLE_CODE" = 303 + NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_CODES "NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_CODES" = 304 + NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_CODES "NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_CODES" = 305 + NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_EVENTS "NVML_PRM_COUNTER_ID_PPCNT_PLR_XMIT_RETRY_EVENTS" = 306 + NVML_PRM_COUNTER_ID_PPCNT_PLR_SYNC_EVENTS "NVML_PRM_COUNTER_ID_PPCNT_PLR_SYNC_EVENTS" = 307 + NVML_PRM_COUNTER_ID_PPRM_OPER_RECOVERY "NVML_PRM_COUNTER_ID_PPRM_OPER_RECOVERY" = 1001 + +ctypedef enum nvmlPowerProfileOperation_t "nvmlPowerProfileOperation_t": + NVML_POWER_PROFILE_OPERATION_CLEAR "NVML_POWER_PROFILE_OPERATION_CLEAR" = 0 + NVML_POWER_PROFILE_OPERATION_SET "NVML_POWER_PROFILE_OPERATION_SET" = 1 + NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE "NVML_POWER_PROFILE_OPERATION_SET_AND_OVERWRITE" = 2 + NVML_POWER_PROFILE_OPERATION_MAX "NVML_POWER_PROFILE_OPERATION_MAX" = 3 + # types ctypedef struct nvmlPciInfoExt_v1_t 'nvmlPciInfoExt_v1_t': @@ -1278,6 +1301,16 @@ ctypedef union _anon_pod7 '_anon_pod7': unsigned char inData[496] unsigned char outData[496] +ctypedef struct nvmlUnrepairableMemoryStatus_v1_t 'nvmlUnrepairableMemoryStatus_v1_t': + unsigned int bUnrepairableMemory + +ctypedef struct nvmlRusdSettings_v1_t 'nvmlRusdSettings_v1_t': + unsigned int version + unsigned long long pollMask + +ctypedef struct nvmlPRMCounterInput_v1_t 'nvmlPRMCounterInput_v1_t': + unsigned int localPort + ctypedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t 'nvmlPciInfoExt_t' ctypedef nvmlCoolerInfo_v1_t nvmlCoolerInfo_t 'nvmlCoolerInfo_t' ctypedef nvmlDramEncryptionInfo_v1_t 
nvmlDramEncryptionInfo_t 'nvmlDramEncryptionInfo_t' @@ -1468,6 +1501,11 @@ ctypedef struct nvmlFieldValue_t 'nvmlFieldValue_t': nvmlReturn_t nvmlReturn nvmlValue_t value +ctypedef struct nvmlPRMCounterValue_v1_t 'nvmlPRMCounterValue_v1_t': + nvmlReturn_t status + nvmlValueType_t outputType + nvmlValue_t outputValue + ctypedef struct nvmlGpuThermalSettings_t 'nvmlGpuThermalSettings_t': unsigned int count _anon_pod0 sensor[3] @@ -1570,6 +1608,10 @@ ctypedef struct nvmlWorkloadPowerProfileRequestedProfiles_v1_t 'nvmlWorkloadPowe unsigned int version nvmlMask255_t requestedProfilesMask +ctypedef struct nvmlWorkloadPowerProfileUpdateProfiles_v1_t 'nvmlWorkloadPowerProfileUpdateProfiles_v1_t': + nvmlPowerProfileOperation_t operation + nvmlMask255_t updateProfilesMask + ctypedef struct nvmlEccSramUniqueUncorrectedErrorCounts_v1_t 'nvmlEccSramUniqueUncorrectedErrorCounts_v1_t': unsigned int version unsigned int entryCount @@ -1606,6 +1648,11 @@ ctypedef struct nvmlVgpuInstancesUtilizationInfo_v1_t 'nvmlVgpuInstancesUtilizat unsigned long long lastSeenTimeStamp nvmlVgpuInstanceUtilizationInfo_v1_t* vgpuUtilArray +ctypedef struct nvmlPRMCounter_v1_t 'nvmlPRMCounter_v1_t': + unsigned int counterId + nvmlPRMCounterInput_v1_t inData + nvmlPRMCounterValue_v1_t counterValue + ctypedef nvmlUUID_v1_t nvmlUUID_t 'nvmlUUID_t' ctypedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t 'nvmlProcessesUtilizationInfo_t' ctypedef struct nvmlVgpuSchedulerLog_t 'nvmlVgpuSchedulerLog_t': @@ -1673,6 +1720,10 @@ ctypedef struct nvmlNvLinkInfo_v2_t 'nvmlNvLinkInfo_v2_t': ctypedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t 'nvmlVgpuProcessesUtilizationInfo_t' ctypedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t 'nvmlVgpuInstancesUtilizationInfo_t' +ctypedef struct nvmlPRMCounterList_v1_t 'nvmlPRMCounterList_v1_t': + unsigned int numCounters + nvmlPRMCounter_v1_t* counters + ctypedef nvmlVgpuSchedulerStateInfo_v1_t 
nvmlVgpuSchedulerStateInfo_t 'nvmlVgpuSchedulerStateInfo_t' ctypedef nvmlVgpuSchedulerLogInfo_v1_t nvmlVgpuSchedulerLogInfo_t 'nvmlVgpuSchedulerLogInfo_t' ctypedef nvmlVgpuSchedulerState_v1_t nvmlVgpuSchedulerState_t 'nvmlVgpuSchedulerState_t' @@ -2036,3 +2087,6 @@ cdef nvmlReturn_t nvmlDeviceGetNvLinkInfo(nvmlDevice_t device, nvmlNvLinkInfo_t* cdef nvmlReturn_t nvmlDeviceReadWritePRM_v1(nvmlDevice_t device, nvmlPRMTLV_v1_t* buffer) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil cdef nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoByIdV(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstanceProfileInfo_v2_t* info) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil cdef nvmlReturn_t nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t device, nvmlEccSramUniqueUncorrectedErrorCounts_t* errorCounts) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil +cdef nvmlReturn_t nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil diff --git a/cuda_bindings/cuda/bindings/cy_nvml.pyx b/cuda_bindings/cuda/bindings/cy_nvml.pyx index aaf047ca0d..961b3a9208 100644 --- a/cuda_bindings/cuda/bindings/cy_nvml.pyx +++ b/cuda_bindings/cuda/bindings/cy_nvml.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.9.1 to 13.0.1. Do not modify it directly. +# This code was automatically generated across versions from 12.9.1 to 13.1.0. Do not modify it directly. 
from ._internal cimport _nvml as _nvml @@ -1397,3 +1397,15 @@ cdef nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoByIdV(nvmlDevice_t device, cdef nvmlReturn_t nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t device, nvmlEccSramUniqueUncorrectedErrorCounts_t* errorCounts) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: return _nvml._nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(device, errorCounts) + + +cdef nvmlReturn_t nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + return _nvml._nvmlDeviceGetUnrepairableMemoryFlag_v1(device, unrepairableMemoryStatus) + + +cdef nvmlReturn_t nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + return _nvml._nvmlDeviceReadPRMCounters_v1(device, counterList) + + +cdef nvmlReturn_t nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil: + return _nvml._nvmlDeviceSetRusdSettings_v1(device, settings) From 9cfde7644aba7a160872ccc66d4f8ffb11c6c867 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 10 Dec 2025 07:49:23 -0500 Subject: [PATCH 2/2] Small fixes --- cuda_bindings/cuda/bindings/_nvml.pxd | 1 - cuda_bindings/cuda/bindings/_nvml.pyx | 122 +++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index da9fc07f6e..1059303fe4 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -41,7 +41,6 @@ ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 ctypedef nvmlHostname_v1_t Hostname_v1 ctypedef nvmlNvLinkInfo_v1_t NvLinkInfo_v1 -ctypedef nvmlPRMCounterInput_v1_t PRMCounterInput_v1 ctypedef nvmlPowerValue_v2_t 
PowerValue_v2
 ctypedef nvmlVgpuProcessUtilizationSample_t VgpuProcessUtilizationSample
 ctypedef nvmlGpuFabricInfo_t GpuFabricInfo
diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx
index fa151c0886..396034acce 100644
--- a/cuda_bindings/cuda/bindings/_nvml.pyx
+++ b/cuda_bindings/cuda/bindings/_nvml.pyx
@@ -1176,7 +1176,7 @@ NVLINK_MAX_LINKS = 18
 
 
 class RUSD(_IntEnum):
-    POLL_NONE - 0x0 # Disable RUSD polling on all metric groups
+    POLL_NONE = 0x0 # Disable RUSD polling on all metric groups
     POLL_CLOCK = 0x1 # Enable RUSD polling on clock group
     POLL_PERF = 0x2 # Enable RUSD polling on performance group
     POLL_MEMORY = 0x4 # Enable RUSD polling on memory group
@@ -15297,6 +15297,143 @@ cdef class RusdSettings_v1:
         return obj
 
 
+cdef _get_prm_counter_input_v1_dtype_offsets():
+    """Build the NumPy structured dtype that mirrors `nvmlPRMCounterInput_v1_t`.
+
+    Offsets and the item size are measured on an instance of the C struct.
+    """
+    cdef nvmlPRMCounterInput_v1_t pod = nvmlPRMCounterInput_v1_t()
+    return _numpy.dtype({
+        'names': ['local_port'],
+        'formats': [_numpy.uint32],
+        'offsets': [
+            (<intptr_t>&(pod.localPort)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlPRMCounterInput_v1_t),
+    })
+
+prm_counter_input_v1_dtype = _get_prm_counter_input_v1_dtype_offsets()
+
+cdef class PRMCounterInput_v1:
+    """Empty-initialize an instance of `nvmlPRMCounterInput_v1_t`.
+
+    The wrapper either owns a heap copy of the struct (released in
+    ``__dealloc__``) or borrows memory that ``_owner`` keeps alive.
+
+    .. seealso:: `nvmlPRMCounterInput_v1_t`
+    """
+    cdef:
+        nvmlPRMCounterInput_v1_t *_ptr  # the wrapped struct
+        object _owner                   # keeps borrowed memory alive (None when owned)
+        bint _owned                     # True when `_ptr` must be freed by this object
+        bint _readonly                  # True makes the property setters raise
+
+    def __init__(self):
+        self._ptr = <nvmlPRMCounterInput_v1_t *>calloc(1, sizeof(nvmlPRMCounterInput_v1_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating PRMCounterInput_v1")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+
+    def __dealloc__(self):
+        cdef nvmlPRMCounterInput_v1_t *ptr
+        if self._owned and self._ptr != NULL:
+            # Clear the member first so the freed address is never left behind.
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.PRMCounterInput_v1 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef PRMCounterInput_v1 other_
+        # Equality is a byte-wise comparison of the two structs.
+        if not isinstance(other, PRMCounterInput_v1):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlPRMCounterInput_v1_t)) == 0)
+
+    def __setitem__(self, key, val):
+        cdef nvmlPRMCounterInput_v1_t *new_ptr
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            new_ptr = <nvmlPRMCounterInput_v1_t *>malloc(sizeof(nvmlPRMCounterInput_v1_t))
+            if new_ptr == NULL:
+                raise MemoryError("Error allocating PRMCounterInput_v1")
+            memcpy(<void *>new_ptr, <void *><intptr_t>val.ctypes.data, sizeof(nvmlPRMCounterInput_v1_t))
+            # Free the struct owned so far; overwriting `_ptr` directly leaked it.
+            if self._owned and self._ptr != NULL:
+                free(self._ptr)
+            self._ptr = new_ptr
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def local_port(self):
+        """int: Local port number."""
+        return self._ptr[0].localPort
+
+    @local_port.setter
+    def local_port(self, val):
+        if self._readonly:
+            raise ValueError("This PRMCounterInput_v1 instance is read-only")
+        self._ptr[0].localPort = val
+
+    @staticmethod
+    def from_data(data):
+        """Create a PRMCounterInput_v1 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a single-element array of dtype `prm_counter_input_v1_dtype` holding the data.
+        """
+        return __from_data(data, "prm_counter_input_v1_dtype", prm_counter_input_v1_dtype, PRMCounterInput_v1)
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
+        """Create a PRMCounterInput_v1 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+            owner (object): The Python object that owns the pointer. If not provided, data will be copied.
+
+        Raises:
+            ValueError: if `ptr` is 0.
+            MemoryError: if the private copy cannot be allocated.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef PRMCounterInput_v1 obj = PRMCounterInput_v1.__new__(PRMCounterInput_v1)
+        if owner is None:
+            obj._ptr = <nvmlPRMCounterInput_v1_t *>malloc(sizeof(nvmlPRMCounterInput_v1_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating PRMCounterInput_v1")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlPRMCounterInput_v1_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = <nvmlPRMCounterInput_v1_t *>ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        return obj
+
+
 cdef _get_excluded_device_info_dtype_offsets():
     cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t()
     return _numpy.dtype({