From 20c9c35f08e3ae9f0a4eaf212c5419fd94ffaad0 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdroettboom@nvidia.com>
Date: Thu, 16 Apr 2026 14:21:23 -0400
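Subject: [PATCH] cuda.bindings: Update generator changes (12.9.x branch)

Refresh the checked-in generated bindings. The NVML files are now stamped
with generator version 0.3.1.dev1568+g289771de9.d20260413 (previously
0.3.1.dev1364+ged01d643e).

NVML changes:

* Add internal loader entries and wrappers, on both Linux and Windows,
  for nvmlDeviceGetGraphicsRunningProcesses_v3,
  nvmlDeviceVgpuForceGspUnload, and the _v2 vGPU scheduler entry points:
  nvmlDeviceGetVgpuSchedulerState_v2,
  nvmlGpuInstanceGetVgpuSchedulerState_v2,
  nvmlDeviceGetVgpuSchedulerLog_v2,
  nvmlGpuInstanceGetVgpuSchedulerLog_v2,
  nvmlDeviceSetVgpuSchedulerState_v2 and
  nvmlGpuInstanceSetVgpuSchedulerState_v2.
* Rename the anonymous POD types in cynvml.pxd from _anon_podN to
  cuda_bindings_nvml__anon_podN.

The driver and runtime templates (driver.pxd.in, driver.pyx.in,
runtime.pxd.in, runtime.pyx.in) are updated as well; see the diffstat
below for the full list of touched files.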
---
 .../cuda/bindings/_internal/nvml.pxd          |   10 +-
 .../cuda/bindings/_internal/nvml_linux.pyx    |  170 +-
 .../cuda/bindings/_internal/nvml_windows.pyx  |  138 +-
 cuda_bindings/cuda/bindings/cynvml.pxd        |  109 +-
 cuda_bindings/cuda/bindings/cynvml.pyx        |   34 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |   50 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     |  523 ++--
 cuda_bindings/cuda/bindings/nvml.pxd          |   28 +-
 cuda_bindings/cuda/bindings/nvml.pyx          | 1726 +++++++++---
 cuda_bindings/cuda/bindings/runtime.pxd.in    |  112 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    | 2462 +++++++----------
 11 files changed, 3153 insertions(+), 2209 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvml.pxd b/cuda_bindings/cuda/bindings/_internal/nvml.pxd
index 40805378a8..c9a64e2053 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from ..cynvml cimport *
 
@@ -139,6 +139,7 @@ cdef nvmlReturn_t _nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverMo
 cdef nvmlReturn_t _nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsigned int length) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t* bridgeHierarchy) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t* plist) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int* onSameBoard) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
@@ -354,3 +355,10 @@ cdef nvmlReturn_t _nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t
 cdef nvmlReturn_t _nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlDeviceVgpuForceGspUnload(nvmlDevice_t device) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerLog_v2(nvmlDevice_t device, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerLog_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlDeviceSetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t _nvmlGpuInstanceSetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
index 28f0919423..43b8c18d04 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvml_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
@@ -188,6 +188,7 @@ cdef void* __nvmlDeviceGetDriverModel_v2 = NULL
 cdef void* __nvmlDeviceGetVbiosVersion = NULL
 cdef void* __nvmlDeviceGetBridgeChipInfo = NULL
 cdef void* __nvmlDeviceGetComputeRunningProcesses_v3 = NULL
+cdef void* __nvmlDeviceGetGraphicsRunningProcesses_v3 = NULL
 cdef void* __nvmlDeviceGetMPSComputeRunningProcesses_v3 = NULL
 cdef void* __nvmlDeviceGetRunningProcessDetailList = NULL
 cdef void* __nvmlDeviceOnSameBoard = NULL
@@ -403,6 +404,13 @@ cdef void* __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = NULL
 cdef void* __nvmlDeviceGetUnrepairableMemoryFlag_v1 = NULL
 cdef void* __nvmlDeviceReadPRMCounters_v1 = NULL
 cdef void* __nvmlDeviceSetRusdSettings_v1 = NULL
+cdef void* __nvmlDeviceVgpuForceGspUnload = NULL
+cdef void* __nvmlDeviceGetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlGpuInstanceGetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlDeviceGetVgpuSchedulerLog_v2 = NULL
+cdef void* __nvmlGpuInstanceGetVgpuSchedulerLog_v2 = NULL
+cdef void* __nvmlDeviceSetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlGpuInstanceSetVgpuSchedulerState_v2 = NULL
 
 
 cdef void* load_library() except* with gil:
@@ -1316,6 +1324,13 @@ cdef int _init_nvml() except -1 nogil:
                 handle = load_library()
             __nvmlDeviceGetComputeRunningProcesses_v3 = dlsym(handle, 'nvmlDeviceGetComputeRunningProcesses_v3')
 
+        global __nvmlDeviceGetGraphicsRunningProcesses_v3
+        __nvmlDeviceGetGraphicsRunningProcesses_v3 = dlsym(RTLD_DEFAULT, 'nvmlDeviceGetGraphicsRunningProcesses_v3')
+        if __nvmlDeviceGetGraphicsRunningProcesses_v3 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlDeviceGetGraphicsRunningProcesses_v3 = dlsym(handle, 'nvmlDeviceGetGraphicsRunningProcesses_v3')
+
         global __nvmlDeviceGetMPSComputeRunningProcesses_v3
         __nvmlDeviceGetMPSComputeRunningProcesses_v3 = dlsym(RTLD_DEFAULT, 'nvmlDeviceGetMPSComputeRunningProcesses_v3')
         if __nvmlDeviceGetMPSComputeRunningProcesses_v3 == NULL:
@@ -2821,6 +2836,55 @@ cdef int _init_nvml() except -1 nogil:
                 handle = load_library()
             __nvmlDeviceSetRusdSettings_v1 = dlsym(handle, 'nvmlDeviceSetRusdSettings_v1')
 
+        global __nvmlDeviceVgpuForceGspUnload
+        __nvmlDeviceVgpuForceGspUnload = dlsym(RTLD_DEFAULT, 'nvmlDeviceVgpuForceGspUnload')
+        if __nvmlDeviceVgpuForceGspUnload == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlDeviceVgpuForceGspUnload = dlsym(handle, 'nvmlDeviceVgpuForceGspUnload')
+
+        global __nvmlDeviceGetVgpuSchedulerState_v2
+        __nvmlDeviceGetVgpuSchedulerState_v2 = dlsym(RTLD_DEFAULT, 'nvmlDeviceGetVgpuSchedulerState_v2')
+        if __nvmlDeviceGetVgpuSchedulerState_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlDeviceGetVgpuSchedulerState_v2 = dlsym(handle, 'nvmlDeviceGetVgpuSchedulerState_v2')
+
+        global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+        __nvmlGpuInstanceGetVgpuSchedulerState_v2 = dlsym(RTLD_DEFAULT, 'nvmlGpuInstanceGetVgpuSchedulerState_v2')
+        if __nvmlGpuInstanceGetVgpuSchedulerState_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlGpuInstanceGetVgpuSchedulerState_v2 = dlsym(handle, 'nvmlGpuInstanceGetVgpuSchedulerState_v2')
+
+        global __nvmlDeviceGetVgpuSchedulerLog_v2
+        __nvmlDeviceGetVgpuSchedulerLog_v2 = dlsym(RTLD_DEFAULT, 'nvmlDeviceGetVgpuSchedulerLog_v2')
+        if __nvmlDeviceGetVgpuSchedulerLog_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlDeviceGetVgpuSchedulerLog_v2 = dlsym(handle, 'nvmlDeviceGetVgpuSchedulerLog_v2')
+
+        global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+        __nvmlGpuInstanceGetVgpuSchedulerLog_v2 = dlsym(RTLD_DEFAULT, 'nvmlGpuInstanceGetVgpuSchedulerLog_v2')
+        if __nvmlGpuInstanceGetVgpuSchedulerLog_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlGpuInstanceGetVgpuSchedulerLog_v2 = dlsym(handle, 'nvmlGpuInstanceGetVgpuSchedulerLog_v2')
+
+        global __nvmlDeviceSetVgpuSchedulerState_v2
+        __nvmlDeviceSetVgpuSchedulerState_v2 = dlsym(RTLD_DEFAULT, 'nvmlDeviceSetVgpuSchedulerState_v2')
+        if __nvmlDeviceSetVgpuSchedulerState_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlDeviceSetVgpuSchedulerState_v2 = dlsym(handle, 'nvmlDeviceSetVgpuSchedulerState_v2')
+
+        global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+        __nvmlGpuInstanceSetVgpuSchedulerState_v2 = dlsym(RTLD_DEFAULT, 'nvmlGpuInstanceSetVgpuSchedulerState_v2')
+        if __nvmlGpuInstanceSetVgpuSchedulerState_v2 == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvmlGpuInstanceSetVgpuSchedulerState_v2 = dlsym(handle, 'nvmlGpuInstanceSetVgpuSchedulerState_v2')
+
         __py_nvml_init = True
         return 0
 
@@ -3227,6 +3291,9 @@ cpdef dict _inspect_function_pointers():
     global __nvmlDeviceGetComputeRunningProcesses_v3
     data["__nvmlDeviceGetComputeRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetComputeRunningProcesses_v3
 
+    global __nvmlDeviceGetGraphicsRunningProcesses_v3
+    data["__nvmlDeviceGetGraphicsRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetGraphicsRunningProcesses_v3
+
     global __nvmlDeviceGetMPSComputeRunningProcesses_v3
     data["__nvmlDeviceGetMPSComputeRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetMPSComputeRunningProcesses_v3
 
@@ -3872,6 +3939,27 @@ cpdef dict _inspect_function_pointers():
     global __nvmlDeviceSetRusdSettings_v1
     data["__nvmlDeviceSetRusdSettings_v1"] = <intptr_t>__nvmlDeviceSetRusdSettings_v1
 
+    global __nvmlDeviceVgpuForceGspUnload
+    data["__nvmlDeviceVgpuForceGspUnload"] = <intptr_t>__nvmlDeviceVgpuForceGspUnload
+
+    global __nvmlDeviceGetVgpuSchedulerState_v2
+    data["__nvmlDeviceGetVgpuSchedulerState_v2"] = <intptr_t>__nvmlDeviceGetVgpuSchedulerState_v2
+
+    global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+    data["__nvmlGpuInstanceGetVgpuSchedulerState_v2"] = <intptr_t>__nvmlGpuInstanceGetVgpuSchedulerState_v2
+
+    global __nvmlDeviceGetVgpuSchedulerLog_v2
+    data["__nvmlDeviceGetVgpuSchedulerLog_v2"] = <intptr_t>__nvmlDeviceGetVgpuSchedulerLog_v2
+
+    global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+    data["__nvmlGpuInstanceGetVgpuSchedulerLog_v2"] = <intptr_t>__nvmlGpuInstanceGetVgpuSchedulerLog_v2
+
+    global __nvmlDeviceSetVgpuSchedulerState_v2
+    data["__nvmlDeviceSetVgpuSchedulerState_v2"] = <intptr_t>__nvmlDeviceSetVgpuSchedulerState_v2
+
+    global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+    data["__nvmlGpuInstanceSetVgpuSchedulerState_v2"] = <intptr_t>__nvmlGpuInstanceSetVgpuSchedulerState_v2
+
     func_ptrs = data
     return data
 
@@ -5167,6 +5255,16 @@ cdef nvmlReturn_t _nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device,
         device, infoCount, infos)
 
 
+cdef nvmlReturn_t _nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetGraphicsRunningProcesses_v3
+    _check_or_init_nvml()
+    if __nvmlDeviceGetGraphicsRunningProcesses_v3 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetGraphicsRunningProcesses_v3 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*) noexcept nogil>__nvmlDeviceGetGraphicsRunningProcesses_v3)(
+        device, infoCount, infos)
+
+
 cdef nvmlReturn_t _nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
     global __nvmlDeviceGetMPSComputeRunningProcesses_v3
     _check_or_init_nvml()
@@ -7315,3 +7413,73 @@ cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSet
             raise FunctionNotFoundError("function nvmlDeviceSetRusdSettings_v1 is not found")
     return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlRusdSettings_v1_t*) noexcept nogil>__nvmlDeviceSetRusdSettings_v1)(
         device, settings)
+
+
+cdef nvmlReturn_t _nvmlDeviceVgpuForceGspUnload(nvmlDevice_t device) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceVgpuForceGspUnload
+    _check_or_init_nvml()
+    if __nvmlDeviceVgpuForceGspUnload == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceVgpuForceGspUnload is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t) noexcept nogil>__nvmlDeviceVgpuForceGspUnload)(
+        device)
+
+
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceGetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerStateInfo_v2_t*) noexcept nogil>__nvmlDeviceGetVgpuSchedulerState_v2)(
+        device, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceGetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceGetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerStateInfo_v2_t*) noexcept nogil>__nvmlGpuInstanceGetVgpuSchedulerState_v2)(
+        gpuInstance, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerLog_v2(nvmlDevice_t device, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetVgpuSchedulerLog_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceGetVgpuSchedulerLog_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetVgpuSchedulerLog_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerLogInfo_v2_t*) noexcept nogil>__nvmlDeviceGetVgpuSchedulerLog_v2)(
+        device, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerLog_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceGetVgpuSchedulerLog_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceGetVgpuSchedulerLog_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerLogInfo_v2_t*) noexcept nogil>__nvmlGpuInstanceGetVgpuSchedulerLog_v2)(
+        gpuInstance, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t _nvmlDeviceSetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceSetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceSetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceSetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerState_v2_t*) noexcept nogil>__nvmlDeviceSetVgpuSchedulerState_v2)(
+        device, pSchedulerState)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceSetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceSetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceSetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerState_v2_t*) noexcept nogil>__nvmlGpuInstanceSetVgpuSchedulerState_v2)(
+        gpuInstance, pSchedulerState)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
index afbd0a8860..3601d6afcc 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvml_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -207,6 +207,7 @@ cdef void* __nvmlDeviceGetDriverModel_v2 = NULL
 cdef void* __nvmlDeviceGetVbiosVersion = NULL
 cdef void* __nvmlDeviceGetBridgeChipInfo = NULL
 cdef void* __nvmlDeviceGetComputeRunningProcesses_v3 = NULL
+cdef void* __nvmlDeviceGetGraphicsRunningProcesses_v3 = NULL
 cdef void* __nvmlDeviceGetMPSComputeRunningProcesses_v3 = NULL
 cdef void* __nvmlDeviceGetRunningProcessDetailList = NULL
 cdef void* __nvmlDeviceOnSameBoard = NULL
@@ -422,6 +423,13 @@ cdef void* __nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts = NULL
 cdef void* __nvmlDeviceGetUnrepairableMemoryFlag_v1 = NULL
 cdef void* __nvmlDeviceReadPRMCounters_v1 = NULL
 cdef void* __nvmlDeviceSetRusdSettings_v1 = NULL
+cdef void* __nvmlDeviceVgpuForceGspUnload = NULL
+cdef void* __nvmlDeviceGetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlGpuInstanceGetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlDeviceGetVgpuSchedulerLog_v2 = NULL
+cdef void* __nvmlGpuInstanceGetVgpuSchedulerLog_v2 = NULL
+cdef void* __nvmlDeviceSetVgpuSchedulerState_v2 = NULL
+cdef void* __nvmlGpuInstanceSetVgpuSchedulerState_v2 = NULL
 
 
 cdef uintptr_t load_library() except* with gil:
@@ -822,6 +830,9 @@ cdef int _init_nvml() except -1 nogil:
         global __nvmlDeviceGetComputeRunningProcesses_v3
         __nvmlDeviceGetComputeRunningProcesses_v3 = GetProcAddress(handle, 'nvmlDeviceGetComputeRunningProcesses_v3')
 
+        global __nvmlDeviceGetGraphicsRunningProcesses_v3
+        __nvmlDeviceGetGraphicsRunningProcesses_v3 = GetProcAddress(handle, 'nvmlDeviceGetGraphicsRunningProcesses_v3')
+
         global __nvmlDeviceGetMPSComputeRunningProcesses_v3
         __nvmlDeviceGetMPSComputeRunningProcesses_v3 = GetProcAddress(handle, 'nvmlDeviceGetMPSComputeRunningProcesses_v3')
 
@@ -1467,6 +1478,27 @@ cdef int _init_nvml() except -1 nogil:
         global __nvmlDeviceSetRusdSettings_v1
         __nvmlDeviceSetRusdSettings_v1 = GetProcAddress(handle, 'nvmlDeviceSetRusdSettings_v1')
 
+        global __nvmlDeviceVgpuForceGspUnload
+        __nvmlDeviceVgpuForceGspUnload = GetProcAddress(handle, 'nvmlDeviceVgpuForceGspUnload')
+
+        global __nvmlDeviceGetVgpuSchedulerState_v2
+        __nvmlDeviceGetVgpuSchedulerState_v2 = GetProcAddress(handle, 'nvmlDeviceGetVgpuSchedulerState_v2')
+
+        global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+        __nvmlGpuInstanceGetVgpuSchedulerState_v2 = GetProcAddress(handle, 'nvmlGpuInstanceGetVgpuSchedulerState_v2')
+
+        global __nvmlDeviceGetVgpuSchedulerLog_v2
+        __nvmlDeviceGetVgpuSchedulerLog_v2 = GetProcAddress(handle, 'nvmlDeviceGetVgpuSchedulerLog_v2')
+
+        global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+        __nvmlGpuInstanceGetVgpuSchedulerLog_v2 = GetProcAddress(handle, 'nvmlGpuInstanceGetVgpuSchedulerLog_v2')
+
+        global __nvmlDeviceSetVgpuSchedulerState_v2
+        __nvmlDeviceSetVgpuSchedulerState_v2 = GetProcAddress(handle, 'nvmlDeviceSetVgpuSchedulerState_v2')
+
+        global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+        __nvmlGpuInstanceSetVgpuSchedulerState_v2 = GetProcAddress(handle, 'nvmlGpuInstanceSetVgpuSchedulerState_v2')
+
         __py_nvml_init = True
         return 0
 
@@ -1873,6 +1905,9 @@ cpdef dict _inspect_function_pointers():
     global __nvmlDeviceGetComputeRunningProcesses_v3
     data["__nvmlDeviceGetComputeRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetComputeRunningProcesses_v3
 
+    global __nvmlDeviceGetGraphicsRunningProcesses_v3
+    data["__nvmlDeviceGetGraphicsRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetGraphicsRunningProcesses_v3
+
     global __nvmlDeviceGetMPSComputeRunningProcesses_v3
     data["__nvmlDeviceGetMPSComputeRunningProcesses_v3"] = <intptr_t>__nvmlDeviceGetMPSComputeRunningProcesses_v3
 
@@ -2518,6 +2553,27 @@ cpdef dict _inspect_function_pointers():
     global __nvmlDeviceSetRusdSettings_v1
     data["__nvmlDeviceSetRusdSettings_v1"] = <intptr_t>__nvmlDeviceSetRusdSettings_v1
 
+    global __nvmlDeviceVgpuForceGspUnload
+    data["__nvmlDeviceVgpuForceGspUnload"] = <intptr_t>__nvmlDeviceVgpuForceGspUnload
+
+    global __nvmlDeviceGetVgpuSchedulerState_v2
+    data["__nvmlDeviceGetVgpuSchedulerState_v2"] = <intptr_t>__nvmlDeviceGetVgpuSchedulerState_v2
+
+    global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+    data["__nvmlGpuInstanceGetVgpuSchedulerState_v2"] = <intptr_t>__nvmlGpuInstanceGetVgpuSchedulerState_v2
+
+    global __nvmlDeviceGetVgpuSchedulerLog_v2
+    data["__nvmlDeviceGetVgpuSchedulerLog_v2"] = <intptr_t>__nvmlDeviceGetVgpuSchedulerLog_v2
+
+    global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+    data["__nvmlGpuInstanceGetVgpuSchedulerLog_v2"] = <intptr_t>__nvmlGpuInstanceGetVgpuSchedulerLog_v2
+
+    global __nvmlDeviceSetVgpuSchedulerState_v2
+    data["__nvmlDeviceSetVgpuSchedulerState_v2"] = <intptr_t>__nvmlDeviceSetVgpuSchedulerState_v2
+
+    global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+    data["__nvmlGpuInstanceSetVgpuSchedulerState_v2"] = <intptr_t>__nvmlGpuInstanceSetVgpuSchedulerState_v2
+
     func_ptrs = data
     return data
 
@@ -3813,6 +3869,16 @@ cdef nvmlReturn_t _nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device,
         device, infoCount, infos)
 
 
+cdef nvmlReturn_t _nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetGraphicsRunningProcesses_v3
+    _check_or_init_nvml()
+    if __nvmlDeviceGetGraphicsRunningProcesses_v3 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetGraphicsRunningProcesses_v3 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*) noexcept nogil>__nvmlDeviceGetGraphicsRunningProcesses_v3)(
+        device, infoCount, infos)
+
+
 cdef nvmlReturn_t _nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
     global __nvmlDeviceGetMPSComputeRunningProcesses_v3
     _check_or_init_nvml()
@@ -5961,3 +6027,73 @@ cdef nvmlReturn_t _nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSet
             raise FunctionNotFoundError("function nvmlDeviceSetRusdSettings_v1 is not found")
     return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlRusdSettings_v1_t*) noexcept nogil>__nvmlDeviceSetRusdSettings_v1)(
         device, settings)
+
+
+cdef nvmlReturn_t _nvmlDeviceVgpuForceGspUnload(nvmlDevice_t device) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceVgpuForceGspUnload
+    _check_or_init_nvml()
+    if __nvmlDeviceVgpuForceGspUnload == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceVgpuForceGspUnload is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t) noexcept nogil>__nvmlDeviceVgpuForceGspUnload)(
+        device)
+
+
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceGetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerStateInfo_v2_t*) noexcept nogil>__nvmlDeviceGetVgpuSchedulerState_v2)(
+        device, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceGetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceGetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceGetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerStateInfo_v2_t*) noexcept nogil>__nvmlGpuInstanceGetVgpuSchedulerState_v2)(
+        gpuInstance, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t _nvmlDeviceGetVgpuSchedulerLog_v2(nvmlDevice_t device, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceGetVgpuSchedulerLog_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceGetVgpuSchedulerLog_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceGetVgpuSchedulerLog_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerLogInfo_v2_t*) noexcept nogil>__nvmlDeviceGetVgpuSchedulerLog_v2)(
+        device, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceGetVgpuSchedulerLog_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceGetVgpuSchedulerLog_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceGetVgpuSchedulerLog_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceGetVgpuSchedulerLog_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerLogInfo_v2_t*) noexcept nogil>__nvmlGpuInstanceGetVgpuSchedulerLog_v2)(
+        gpuInstance, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t _nvmlDeviceSetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlDeviceSetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlDeviceSetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlDeviceSetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlDevice_t, nvmlVgpuSchedulerState_v2_t*) noexcept nogil>__nvmlDeviceSetVgpuSchedulerState_v2)(
+        device, pSchedulerState)
+
+
+cdef nvmlReturn_t _nvmlGpuInstanceSetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    global __nvmlGpuInstanceSetVgpuSchedulerState_v2
+    _check_or_init_nvml()
+    if __nvmlGpuInstanceSetVgpuSchedulerState_v2 == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvmlGpuInstanceSetVgpuSchedulerState_v2 is not found")
+    return (<nvmlReturn_t (*)(nvmlGpuInstance_t, nvmlVgpuSchedulerState_v2_t*) noexcept nogil>__nvmlGpuInstanceSetVgpuSchedulerState_v2)(
+        gpuInstance, pSchedulerState)
diff --git a/cuda_bindings/cuda/bindings/cynvml.pxd b/cuda_bindings/cuda/bindings/cynvml.pxd
index a1bb81ffb5..1f59e6d522 100644
--- a/cuda_bindings/cuda/bindings/cynvml.pxd
+++ b/cuda_bindings/cuda/bindings/cynvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from libc.stdint cimport int64_t
 
@@ -817,12 +817,19 @@ ctypedef struct nvmlPlatformInfo_v2_t 'nvmlPlatformInfo_v2_t':
     unsigned char moduleId
 
 ctypedef unsigned int nvmlDeviceArchitecture_t 'nvmlDeviceArchitecture_t'
+
 ctypedef unsigned int nvmlBusType_t 'nvmlBusType_t'
+
 ctypedef unsigned int nvmlFanControlPolicy_t 'nvmlFanControlPolicy_t'
+
 ctypedef unsigned int nvmlPowerSource_t 'nvmlPowerSource_t'
+
 ctypedef unsigned char nvmlPowerScopeType_t 'nvmlPowerScopeType_t'
+
 ctypedef unsigned int nvmlVgpuTypeId_t 'nvmlVgpuTypeId_t'
+
 ctypedef unsigned int nvmlVgpuInstance_t 'nvmlVgpuInstance_t'
+
 ctypedef struct nvmlVgpuHeterogeneousMode_v1_t 'nvmlVgpuHeterogeneousMode_v1_t':
     unsigned int version
     unsigned int mode
@@ -862,11 +869,13 @@ ctypedef struct nvmlConfComputeGetKeyRotationThresholdInfo_v1_t 'nvmlConfCompute
     unsigned long long attackerAdvantage
 
 ctypedef unsigned char nvmlGpuFabricState_t 'nvmlGpuFabricState_t'
+
 ctypedef struct nvmlSystemDriverBranchInfo_v1_t 'nvmlSystemDriverBranchInfo_v1_t':
     unsigned int version
     char branch[80]
 
 ctypedef unsigned int nvmlAffinityScope_t 'nvmlAffinityScope_t'
+
 ctypedef struct nvmlTemperature_v1_t 'nvmlTemperature_v1_t':
     unsigned int version
     nvmlTemperatureSensors_t sensorType
@@ -915,12 +924,19 @@ ctypedef struct nvmlPdi_v1_t 'nvmlPdi_v1_t':
     unsigned long long value
 
 ctypedef void* nvmlDevice_t 'nvmlDevice_t'
+
 ctypedef void* nvmlGpuInstance_t 'nvmlGpuInstance_t'
+
 ctypedef void* nvmlUnit_t 'nvmlUnit_t'
+
 ctypedef void* nvmlEventSet_t 'nvmlEventSet_t'
+
 ctypedef void* nvmlSystemEventSet_t 'nvmlSystemEventSet_t'
+
 ctypedef void* nvmlComputeInstance_t 'nvmlComputeInstance_t'
+
 ctypedef void* nvmlGpmSample_t 'nvmlGpmSample_t'
+
 ctypedef struct nvmlPciInfo_t 'nvmlPciInfo_t':
     char busIdLegacy[16]
     unsigned int domain
@@ -1022,7 +1038,7 @@ ctypedef struct nvmlViolationTime_t 'nvmlViolationTime_t':
     unsigned long long referenceTime
     unsigned long long violationTime
 
-ctypedef struct _anon_pod0 '_anon_pod0':
+ctypedef struct cuda_bindings_nvml__anon_pod0:
     nvmlThermalController_t controller
     int defaultMinTemp
     int defaultMaxTemp
@@ -1065,7 +1081,7 @@ ctypedef struct nvmlPlatformInfo_v1_t 'nvmlPlatformInfo_v1_t':
     unsigned char peerType
     unsigned char moduleId
 
-ctypedef struct _anon_pod1 '_anon_pod1':
+ctypedef struct cuda_bindings_nvml__anon_pod1:
     unsigned int bIsPresent
     unsigned int percentage
     unsigned int incThreshold
@@ -1077,11 +1093,11 @@ ctypedef struct nvmlVgpuPlacementList_v1_t 'nvmlVgpuPlacementList_v1_t':
     unsigned int count
     unsigned int* placementIds
 
-ctypedef struct _anon_pod2 '_anon_pod2':
+ctypedef struct cuda_bindings_nvml__anon_pod2:
     unsigned int avgFactor
     unsigned int timeslice
 
-ctypedef struct _anon_pod3 '_anon_pod3':
+ctypedef struct cuda_bindings_nvml__anon_pod3:
     unsigned int timeslice
 
 ctypedef struct nvmlVgpuSchedulerLogEntry_t 'nvmlVgpuSchedulerLogEntry_t':
@@ -1092,11 +1108,11 @@ ctypedef struct nvmlVgpuSchedulerLogEntry_t 'nvmlVgpuSchedulerLogEntry_t':
     unsigned long long targetTimeSlice
     unsigned long long cumulativePreemptionTime
 
-ctypedef struct _anon_pod4 '_anon_pod4':
+ctypedef struct cuda_bindings_nvml__anon_pod4:
     unsigned int avgFactor
     unsigned int frequency
 
-ctypedef struct _anon_pod5 '_anon_pod5':
+ctypedef struct cuda_bindings_nvml__anon_pod5:
     unsigned int timeslice
 
 ctypedef struct nvmlVgpuSchedulerCapabilities_t 'nvmlVgpuSchedulerCapabilities_t':
@@ -1308,7 +1324,7 @@ ctypedef struct nvmlComputeInstanceProfileInfo_v3_t 'nvmlComputeInstanceProfileI
     char name[96]
     unsigned int capabilities
 
-ctypedef struct _anon_pod6 '_anon_pod6':
+ctypedef struct cuda_bindings_nvml__anon_pod6:
     char* shortName
     char* longName
     char* unit
@@ -1347,7 +1363,7 @@ ctypedef struct nvmlNvlinkFirmwareVersion_t 'nvmlNvlinkFirmwareVersion_t':
     unsigned int minor
     unsigned int subMinor
 
-ctypedef union _anon_pod7 '_anon_pod7':
+ctypedef union cuda_bindings_nvml__anon_pod7:
     unsigned char inData[496]
     unsigned char outData[496]
 
@@ -1383,15 +1399,25 @@ ctypedef struct nvmlVgpuSchedulerState_v2_t 'nvmlVgpuSchedulerState_v2_t':
     unsigned int frequency
 
 ctypedef nvmlPciInfoExt_v1_t nvmlPciInfoExt_t 'nvmlPciInfoExt_t'
+
 ctypedef nvmlCoolerInfo_v1_t nvmlCoolerInfo_t 'nvmlCoolerInfo_t'
+
 ctypedef nvmlDramEncryptionInfo_v1_t nvmlDramEncryptionInfo_t 'nvmlDramEncryptionInfo_t'
+
 ctypedef nvmlMarginTemperature_v1_t nvmlMarginTemperature_t 'nvmlMarginTemperature_t'
+
 ctypedef nvmlClockOffset_v1_t nvmlClockOffset_t 'nvmlClockOffset_t'
+
 ctypedef nvmlFanSpeedInfo_v1_t nvmlFanSpeedInfo_t 'nvmlFanSpeedInfo_t'
+
 ctypedef nvmlDevicePerfModes_v1_t nvmlDevicePerfModes_t 'nvmlDevicePerfModes_t'
+
 ctypedef nvmlDeviceCurrentClockFreqs_v1_t nvmlDeviceCurrentClockFreqs_t 'nvmlDeviceCurrentClockFreqs_t'
+
 ctypedef nvmlEccSramErrorStatus_v1_t nvmlEccSramErrorStatus_t 'nvmlEccSramErrorStatus_t'
+
 ctypedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t 'nvmlPlatformInfo_t'
+
 ctypedef struct nvmlPowerValue_v2_t 'nvmlPowerValue_v2_t':
     unsigned int version
     nvmlPowerScopeType_t powerScope
@@ -1466,13 +1492,21 @@ ctypedef struct nvmlFBCSessionInfo_t 'nvmlFBCSessionInfo_t':
     unsigned int averageLatency
 
 ctypedef nvmlVgpuHeterogeneousMode_v1_t nvmlVgpuHeterogeneousMode_t 'nvmlVgpuHeterogeneousMode_t'
+
 ctypedef nvmlVgpuPlacementId_v1_t nvmlVgpuPlacementId_t 'nvmlVgpuPlacementId_t'
+
 ctypedef nvmlVgpuPlacementList_v2_t nvmlVgpuPlacementList_t 'nvmlVgpuPlacementList_t'
+
 ctypedef nvmlVgpuTypeBar1Info_v1_t nvmlVgpuTypeBar1Info_t 'nvmlVgpuTypeBar1Info_t'
+
 ctypedef nvmlVgpuRuntimeState_v1_t nvmlVgpuRuntimeState_t 'nvmlVgpuRuntimeState_t'
+
 ctypedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t 'nvmlSystemConfComputeSettings_t'
+
 ctypedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t nvmlConfComputeSetKeyRotationThresholdInfo_t 'nvmlConfComputeSetKeyRotationThresholdInfo_t'
+
 ctypedef nvmlConfComputeGetKeyRotationThresholdInfo_v1_t nvmlConfComputeGetKeyRotationThresholdInfo_t 'nvmlConfComputeGetKeyRotationThresholdInfo_t'
+
 ctypedef struct nvmlGpuFabricInfo_t 'nvmlGpuFabricInfo_t':
     unsigned char clusterUuid[16]
     nvmlReturn_t status
@@ -1497,16 +1531,27 @@ ctypedef struct nvmlGpuFabricInfo_v3_t 'nvmlGpuFabricInfo_v3_t':
     unsigned char healthSummary
 
 ctypedef nvmlSystemDriverBranchInfo_v1_t nvmlSystemDriverBranchInfo_t 'nvmlSystemDriverBranchInfo_t'
+
 ctypedef nvmlTemperature_v1_t nvmlTemperature_t 'nvmlTemperature_t'
+
 ctypedef nvmlNvlinkSupportedBwModes_v1_t nvmlNvlinkSupportedBwModes_t 'nvmlNvlinkSupportedBwModes_t'
+
 ctypedef nvmlNvlinkGetBwMode_v1_t nvmlNvlinkGetBwMode_t 'nvmlNvlinkGetBwMode_t'
+
 ctypedef nvmlNvlinkSetBwMode_v1_t nvmlNvlinkSetBwMode_t 'nvmlNvlinkSetBwMode_t'
+
 ctypedef nvmlDeviceCapabilities_v1_t nvmlDeviceCapabilities_t 'nvmlDeviceCapabilities_t'
+
 ctypedef nvmlPowerSmoothingProfile_v1_t nvmlPowerSmoothingProfile_t 'nvmlPowerSmoothingProfile_t'
+
 ctypedef nvmlPowerSmoothingState_v1_t nvmlPowerSmoothingState_t 'nvmlPowerSmoothingState_t'
+
 ctypedef nvmlDeviceAddressingMode_v1_t nvmlDeviceAddressingMode_t 'nvmlDeviceAddressingMode_t'
+
 ctypedef nvmlRepairStatus_v1_t nvmlRepairStatus_t 'nvmlRepairStatus_t'
+
 ctypedef nvmlPdi_v1_t nvmlPdi_t 'nvmlPdi_t'
+
 ctypedef struct nvmlEventData_t 'nvmlEventData_t':
     nvmlDevice_t device
     unsigned long long eventType
@@ -1579,7 +1624,7 @@ ctypedef struct nvmlPRMCounterValue_v1_t 'nvmlPRMCounterValue_v1_t':
 
 ctypedef struct nvmlGpuThermalSettings_t 'nvmlGpuThermalSettings_t':
     unsigned int count
-    _anon_pod0 sensor[3]
+    cuda_bindings_nvml__anon_pod0 sensor[3]
 
 ctypedef struct nvmlUUID_v1_t 'nvmlUUID_v1_t':
     unsigned int version
@@ -1599,15 +1644,15 @@ ctypedef struct nvmlProcessesUtilizationInfo_v1_t 'nvmlProcessesUtilizationInfo_
 
 ctypedef struct nvmlGpuDynamicPstatesInfo_t 'nvmlGpuDynamicPstatesInfo_t':
     unsigned int flags
-    _anon_pod1 utilization[8]
+    cuda_bindings_nvml__anon_pod1 utilization[8]
 
 ctypedef union nvmlVgpuSchedulerParams_t 'nvmlVgpuSchedulerParams_t':
-    _anon_pod2 vgpuSchedDataWithARR
-    _anon_pod3 vgpuSchedData
+    cuda_bindings_nvml__anon_pod2 vgpuSchedDataWithARR
+    cuda_bindings_nvml__anon_pod3 vgpuSchedData
 
 ctypedef union nvmlVgpuSchedulerSetParams_t 'nvmlVgpuSchedulerSetParams_t':
-    _anon_pod4 vgpuSchedDataWithARR
-    _anon_pod5 vgpuSchedData
+    cuda_bindings_nvml__anon_pod4 vgpuSchedDataWithARR
+    cuda_bindings_nvml__anon_pod5 vgpuSchedData
 
 ctypedef struct nvmlVgpuLicenseInfo_t 'nvmlVgpuLicenseInfo_t':
     unsigned char isLicensed
@@ -1661,7 +1706,7 @@ ctypedef struct nvmlGpmMetric_t 'nvmlGpmMetric_t':
     unsigned int metricId
     nvmlReturn_t nvmlReturn
     double value
-    _anon_pod6 metricInfo
+    cuda_bindings_nvml__anon_pod6 metricInfo
 
 ctypedef struct nvmlWorkloadPowerProfileInfo_v1_t 'nvmlWorkloadPowerProfileInfo_v1_t':
     unsigned int version
@@ -1695,7 +1740,7 @@ ctypedef struct nvmlNvlinkFirmwareInfo_t 'nvmlNvlinkFirmwareInfo_t':
 ctypedef struct nvmlPRMTLV_v1_t 'nvmlPRMTLV_v1_t':
     unsigned dataSize
     unsigned status
-    _anon_pod7 _anon_pod_member0
+    cuda_bindings_nvml__anon_pod7 _anon_pod_member0
 
 ctypedef struct nvmlVgpuSchedulerLogInfo_v2_t 'nvmlVgpuSchedulerLogInfo_v2_t':
     unsigned int engineId
@@ -1706,8 +1751,11 @@ ctypedef struct nvmlVgpuSchedulerLogInfo_v2_t 'nvmlVgpuSchedulerLogInfo_v2_t':
     nvmlVgpuSchedulerLogEntry_v2_t logEntries[200]
 
 ctypedef nvmlVgpuTypeIdInfo_v1_t nvmlVgpuTypeIdInfo_t 'nvmlVgpuTypeIdInfo_t'
+
 ctypedef nvmlVgpuTypeMaxInstance_v1_t nvmlVgpuTypeMaxInstance_t 'nvmlVgpuTypeMaxInstance_t'
+
 ctypedef nvmlVgpuCreatablePlacementInfo_v1_t nvmlVgpuCreatablePlacementInfo_t 'nvmlVgpuCreatablePlacementInfo_t'
+
 ctypedef struct nvmlVgpuProcessesUtilizationInfo_v1_t 'nvmlVgpuProcessesUtilizationInfo_v1_t':
     unsigned int version
     unsigned int vgpuProcessCount
@@ -1715,11 +1763,17 @@ ctypedef struct nvmlVgpuProcessesUtilizationInfo_v1_t 'nvmlVgpuProcessesUtilizat
     nvmlVgpuProcessUtilizationInfo_v1_t* vgpuProcUtilArray
 
 ctypedef nvmlActiveVgpuInstanceInfo_v1_t nvmlActiveVgpuInstanceInfo_t 'nvmlActiveVgpuInstanceInfo_t'
+
 ctypedef nvmlGpuFabricInfo_v3_t nvmlGpuFabricInfoV_t 'nvmlGpuFabricInfoV_t'
+
 ctypedef nvmlSystemEventSetCreateRequest_v1_t nvmlSystemEventSetCreateRequest_t 'nvmlSystemEventSetCreateRequest_t'
+
 ctypedef nvmlSystemEventSetFreeRequest_v1_t nvmlSystemEventSetFreeRequest_t 'nvmlSystemEventSetFreeRequest_t'
+
 ctypedef nvmlSystemRegisterEventRequest_v1_t nvmlSystemRegisterEventRequest_t 'nvmlSystemRegisterEventRequest_t'
+
 ctypedef nvmlProcessDetailList_v1_t nvmlProcessDetailList_t 'nvmlProcessDetailList_t'
+
 ctypedef struct nvmlVgpuInstancesUtilizationInfo_v1_t 'nvmlVgpuInstancesUtilizationInfo_v1_t':
     unsigned int version
     nvmlValueType_t sampleValType
@@ -1733,7 +1787,9 @@ ctypedef struct nvmlPRMCounter_v1_t 'nvmlPRMCounter_v1_t':
     nvmlPRMCounterValue_v1_t counterValue
 
 ctypedef nvmlUUID_v1_t nvmlUUID_t 'nvmlUUID_t'
+
 ctypedef nvmlProcessesUtilizationInfo_v1_t nvmlProcessesUtilizationInfo_t 'nvmlProcessesUtilizationInfo_t'
+
 ctypedef struct nvmlVgpuSchedulerLog_t 'nvmlVgpuSchedulerLog_t':
     unsigned int engineId
     unsigned int schedulerPolicy
@@ -1781,6 +1837,7 @@ ctypedef struct nvmlGridLicensableFeatures_t 'nvmlGridLicensableFeatures_t':
     nvmlGridLicensableFeature_t gridLicensableFeatures[3]
 
 ctypedef nvmlSystemEventSetWaitRequest_v1_t nvmlSystemEventSetWaitRequest_t 'nvmlSystemEventSetWaitRequest_t'
+
 ctypedef struct nvmlGpmMetricsGet_t 'nvmlGpmMetricsGet_t':
     unsigned int version
     unsigned int numMetrics
@@ -1789,29 +1846,39 @@ ctypedef struct nvmlGpmMetricsGet_t 'nvmlGpmMetricsGet_t':
     nvmlGpmMetric_t metrics[333]
 
 ctypedef nvmlWorkloadPowerProfileInfo_v1_t nvmlWorkloadPowerProfileInfo_t 'nvmlWorkloadPowerProfileInfo_t'
+
 ctypedef nvmlWorkloadPowerProfileCurrentProfiles_v1_t nvmlWorkloadPowerProfileCurrentProfiles_t 'nvmlWorkloadPowerProfileCurrentProfiles_t'
+
 ctypedef nvmlWorkloadPowerProfileRequestedProfiles_v1_t nvmlWorkloadPowerProfileRequestedProfiles_t 'nvmlWorkloadPowerProfileRequestedProfiles_t'
+
 ctypedef nvmlEccSramUniqueUncorrectedErrorCounts_v1_t nvmlEccSramUniqueUncorrectedErrorCounts_t 'nvmlEccSramUniqueUncorrectedErrorCounts_t'
+
 ctypedef struct nvmlNvLinkInfo_v2_t 'nvmlNvLinkInfo_v2_t':
     unsigned int version
     unsigned int isNvleEnabled
     nvmlNvlinkFirmwareInfo_t firmwareInfo
 
 ctypedef nvmlVgpuProcessesUtilizationInfo_v1_t nvmlVgpuProcessesUtilizationInfo_t 'nvmlVgpuProcessesUtilizationInfo_t'
+
 ctypedef nvmlVgpuInstancesUtilizationInfo_v1_t nvmlVgpuInstancesUtilizationInfo_t 'nvmlVgpuInstancesUtilizationInfo_t'
+
 ctypedef struct nvmlPRMCounterList_v1_t 'nvmlPRMCounterList_v1_t':
     unsigned int numCounters
     nvmlPRMCounter_v1_t* counters
 
 ctypedef nvmlVgpuSchedulerStateInfo_v1_t nvmlVgpuSchedulerStateInfo_t 'nvmlVgpuSchedulerStateInfo_t'
+
 ctypedef nvmlVgpuSchedulerLogInfo_v1_t nvmlVgpuSchedulerLogInfo_t 'nvmlVgpuSchedulerLogInfo_t'
+
 ctypedef nvmlVgpuSchedulerState_v1_t nvmlVgpuSchedulerState_t 'nvmlVgpuSchedulerState_t'
+
 ctypedef struct nvmlWorkloadPowerProfileProfilesInfo_v1_t 'nvmlWorkloadPowerProfileProfilesInfo_v1_t':
     unsigned int version
     nvmlMask255_t perfProfilesMask
     nvmlWorkloadPowerProfileInfo_t perfProfile[255]
 
 ctypedef nvmlNvLinkInfo_v2_t nvmlNvLinkInfo_t 'nvmlNvLinkInfo_t'
+
 ctypedef nvmlWorkloadPowerProfileProfilesInfo_v1_t nvmlWorkloadPowerProfileProfilesInfo_t 'nvmlWorkloadPowerProfileProfilesInfo_t'
 
 
@@ -1947,6 +2014,7 @@ cdef nvmlReturn_t nvmlDeviceGetDriverModel_v2(nvmlDevice_t device, nvmlDriverMod
 cdef nvmlReturn_t nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsigned int length) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t* bridgeHierarchy) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceGetRunningProcessDetailList(nvmlDevice_t device, nvmlProcessDetailList_t* plist) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int* onSameBoard) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
@@ -2162,3 +2230,10 @@ cdef nvmlReturn_t nvmlDeviceGetSramUniqueUncorrectedEccErrorCounts(nvmlDevice_t
 cdef nvmlReturn_t nvmlDeviceGetUnrepairableMemoryFlag_v1(nvmlDevice_t device, nvmlUnrepairableMemoryStatus_v1_t* unrepairableMemoryStatus) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCounterList_v1_t* counterList) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
 cdef nvmlReturn_t nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlDeviceVgpuForceGspUnload(nvmlDevice_t device) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlDeviceGetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlGpuInstanceGetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog_v2(nvmlDevice_t device, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlGpuInstanceGetVgpuSchedulerLog_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlDeviceSetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
+cdef nvmlReturn_t nvmlGpuInstanceSetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/cynvml.pyx b/cuda_bindings/cuda/bindings/cynvml.pyx
index 1200442977..81e3d7728a 100644
--- a/cuda_bindings/cuda/bindings/cynvml.pyx
+++ b/cuda_bindings/cuda/bindings/cynvml.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from ._internal cimport nvml as _nvml
 
@@ -523,6 +523,10 @@ cdef nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, u
     return _nvml._nvmlDeviceGetComputeRunningProcesses_v3(device, infoCount, infos)
 
 
+cdef nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlDeviceGetGraphicsRunningProcesses_v3(device, infoCount, infos)
+
+
 cdef nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
     return _nvml._nvmlDeviceGetMPSComputeRunningProcesses_v3(device, infoCount, infos)
 
@@ -1381,3 +1385,31 @@ cdef nvmlReturn_t nvmlDeviceReadPRMCounters_v1(nvmlDevice_t device, nvmlPRMCount
 
 cdef nvmlReturn_t nvmlDeviceSetRusdSettings_v1(nvmlDevice_t device, nvmlRusdSettings_v1_t* settings) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
     return _nvml._nvmlDeviceSetRusdSettings_v1(device, settings)
+
+
+cdef nvmlReturn_t nvmlDeviceVgpuForceGspUnload(nvmlDevice_t device) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlDeviceVgpuForceGspUnload(device)
+
+
+cdef nvmlReturn_t nvmlDeviceGetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlDeviceGetVgpuSchedulerState_v2(device, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t nvmlGpuInstanceGetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerStateInfo_v2_t* pSchedulerStateInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlGpuInstanceGetVgpuSchedulerState_v2(gpuInstance, pSchedulerStateInfo)
+
+
+cdef nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog_v2(nvmlDevice_t device, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlDeviceGetVgpuSchedulerLog_v2(device, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t nvmlGpuInstanceGetVgpuSchedulerLog_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerLogInfo_v2_t* pSchedulerLogInfo) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlGpuInstanceGetVgpuSchedulerLog_v2(gpuInstance, pSchedulerLogInfo)
+
+
+cdef nvmlReturn_t nvmlDeviceSetVgpuSchedulerState_v2(nvmlDevice_t device, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlDeviceSetVgpuSchedulerState_v2(device, pSchedulerState)
+
+
+cdef nvmlReturn_t nvmlGpuInstanceSetVgpuSchedulerState_v2(nvmlGpuInstance_t gpuInstance, nvmlVgpuSchedulerState_v2_t* pSchedulerState) except?_NVMLRETURN_T_INTERNAL_LOADING_ERROR nogil:
+    return _nvml._nvmlGpuInstanceSetVgpuSchedulerState_v2(gpuInstance, pSchedulerState)
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index 86a2ac5760..a60ab2a44e 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 12.9.0, generator version 49a8141. Do not modify it directly.
+# This code was automatically generated with version 12.9.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
@@ -1826,8 +1826,8 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -3617,8 +3617,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
         indicates that while signaling the CUexternalSemaphore, no memory
@@ -3760,8 +3760,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
         that while waiting for the CUexternalSemaphore, no memory
@@ -6544,8 +6544,8 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -6741,8 +6741,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -6858,8 +6858,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -6975,8 +6975,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -7092,8 +7092,8 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -9025,8 +9025,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_SIGN
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
         indicates that while signaling the CUexternalSemaphore, no memory
@@ -9060,8 +9060,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS(CUDA_EXTERNAL_SEMAPHORE_SIGNAL_
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
         indicates that while signaling the CUexternalSemaphore, no memory
@@ -9095,8 +9095,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_WAIT_P
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
         that while waiting for the CUexternalSemaphore, no memory
@@ -9130,8 +9130,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS(CUDA_EXTERNAL_SEMAPHORE_WAIT_PARA
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
         that while waiting for the CUexternalSemaphore, no memory
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 783b57058f..b9a1623e8d 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 12.9.0, generator version 49a8141. Do not modify it directly.
+# This code was automatically generated with version 12.9.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -777,7 +777,7 @@ class CUstreamBatchMemOpType(_FastEnum):
 
 class CUstreamMemoryBarrier_flags(_FastEnum):
     """
-    Flags for :py:obj:`~.CUstreamBatchMemOpParams`::memoryBarrier
+    Flags for :py:obj:`~.CUstreamBatchMemOpParams.memoryBarrier`
     """
     {{if 'CU_STREAM_MEMORY_BARRIER_TYPE_SYS' in found_values}}
 
@@ -4015,16 +4015,15 @@ class CUlaunchAttributeID(_FastEnum):
         cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
         'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
-        'which can be passed to the various device-side update functions to update\n'
-        "the node's kernel parameters from within another kernel. For more\n"
-        'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n'
+        'can be passed to the various device-side update functions to update the\n'
+        "node's kernel parameters from within another kernel. For more information\n"
+        'on the types of device updates that can be made, as well as the relevant\n'
+        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -7761,16 +7760,15 @@ class CUkernelNodeAttrID(_FastEnum):
         cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
         'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
-        'which can be passed to the various device-side update functions to update\n'
-        "the node's kernel parameters from within another kernel. For more\n"
-        'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n'
+        'can be passed to the various device-side update functions to update the\n'
+        "node's kernel parameters from within another kernel. For more information\n"
+        'on the types of device updates that can be made, as well as the relevant\n'
+        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -7971,16 +7969,15 @@ class CUstreamAttrID(_FastEnum):
         cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
         'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
-        'which can be passed to the various device-side update functions to update\n'
-        "the node's kernel parameters from within another kernel. For more\n"
-        'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        ':py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which\n'
+        'can be passed to the various device-side update functions to update the\n'
+        "node's kernel parameters from within another kernel. For more information\n"
+        'on the types of device updates that can be made, as well as the relevant\n'
+        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -13152,8 +13149,8 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
     syncPolicy : CUsynchronizationPolicy
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY.
-        ::CUsynchronizationPolicy for work queued up in this stream
+        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
+        for work queued up in this stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
@@ -18856,8 +18853,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
         indicates that while signaling the CUexternalSemaphore, no memory
@@ -19236,8 +19233,8 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
-        a CUexternalSemaphore of type
+        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
+        CUexternalSemaphore of type
         CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
         CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
         that while waiting for the CUexternalSemaphore, no memory
@@ -26529,7 +26526,7 @@ def cuCtxCreate_v4(ctxCreateParams : Optional[CUctxCreateParams], unsigned int f
         pdev = int(CUdevice(dev))
     cydev = <cydriver.CUdevice>pdev
     cdef CUcontext pctx = CUcontext()
-    cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams is not None else NULL
+    cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = <cydriver.CUctxCreateParams*>ctxCreateParams._pvt_ptr if ctxCreateParams is not None else NULL
     with nogil:
         err = cydriver.cuCtxCreate_v4(<cydriver.CUcontext*>pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev)
     if err != cydriver.CUDA_SUCCESS:
@@ -31474,7 +31471,7 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
     --------
     :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`
     """
-    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = <cydriver.CUDA_MEMCPY2D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy2D(cypCopy_ptr)
     return (_CUresult(err),)
@@ -31601,7 +31598,7 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
     --------
     :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`
     """
-    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = <cydriver.CUDA_MEMCPY2D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy2DUnaligned(cypCopy_ptr)
     return (_CUresult(err),)
@@ -31731,7 +31728,7 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
     --------
     :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy3D`
     """
-    cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy3D(cypCopy_ptr)
     return (_CUresult(err),)
@@ -31761,7 +31758,7 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]):
     --------
     :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeer`
     """
-    cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D_PEER*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy3DPeer(cypCopy_ptr)
     return (_CUresult(err),)
@@ -32329,7 +32326,7 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
     else:
         phStream = int(CUstream(hStream))
     cyhStream = <cydriver.CUstream><void_ptr>phStream
-    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = <cydriver.CUDA_MEMCPY2D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy2DAsync(cypCopy_ptr, cyhStream)
     return (_CUresult(err),)
@@ -32469,7 +32466,7 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
     else:
         phStream = int(CUstream(hStream))
     cyhStream = <cydriver.CUstream><void_ptr>phStream
-    cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy3DAsync(cypCopy_ptr, cyhStream)
     return (_CUresult(err),)
@@ -32509,7 +32506,7 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream):
     else:
         phStream = int(CUstream(hStream))
     cyhStream = <cydriver.CUstream><void_ptr>phStream
-    cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D_PEER*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
         err = cydriver.cuMemcpy3DPeerAsync(cypCopy_ptr, cyhStream)
     return (_CUresult(err),)
@@ -32724,16 +32721,16 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
     the CUDA array. For CUDA array to CUDA array copies, the element size
     of the two CUDA arrays must match.
 
-    For a given operand, if :py:obj:`~.CUmemcpy3DOperand`::type is
-    specified as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_POINTER`, then
-    :py:obj:`~.CUmemcpy3DOperand`::op::ptr will be used. The
-    :py:obj:`~.CUmemcpy3DOperand`::op::ptr::ptr field must contain the
-    pointer where the copy should begin. The
-    :py:obj:`~.CUmemcpy3DOperand`::op::ptr::rowLength field specifies the
+    For a given operand, if :py:obj:`~.CUmemcpy3DOperand.type` is specified
+    as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_POINTER`, then
+    :py:obj:`~.CUmemcpy3DOperand.op.ptr` will be used. The
+    :py:obj:`~.CUmemcpy3DOperand.op.ptr.ptr` field must contain the pointer
+    where the copy should begin. The
+    :py:obj:`~.CUmemcpy3DOperand.op.ptr.rowLength` field specifies the
     length of each row in elements and must either be zero or be greater
     than or equal to the width of the copy specified in
     :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`::extent::width. The
-    :py:obj:`~.CUmemcpy3DOperand`::op::ptr::layerHeight field specifies the
+    :py:obj:`~.CUmemcpy3DOperand.op.ptr.layerHeight` field specifies the
     height of each layer and must either be zero or be greater than or
     equal to the height of the copy specified in
     :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`::extent::height. When either of
@@ -32743,15 +32740,15 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` is true or
     system-allocated pageable memory on devices where
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS` is true, the
-    :py:obj:`~.CUmemcpy3DOperand`::op::ptr::locHint field can be used to
-    hint the location of the operand.
+    :py:obj:`~.CUmemcpy3DOperand.op.ptr.locHint` field can be used to hint
+    the location of the operand.
 
     If an operand's type is specified as
     :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_ARRAY`, then
-    :py:obj:`~.CUmemcpy3DOperand`::op::array will be used. The
-    :py:obj:`~.CUmemcpy3DOperand`::op::array::array field specifies the
-    CUDA array and :py:obj:`~.CUmemcpy3DOperand`::op::array::offset
-    specifies the 3D offset into that array where the copy begins.
+    :py:obj:`~.CUmemcpy3DOperand.op.array` will be used. The
+    :py:obj:`~.CUmemcpy3DOperand.op.array.array` field specifies the CUDA
+    array and :py:obj:`~.CUmemcpy3DOperand.op.array.offset` specifies the
+    3D offset into that array where the copy begins.
 
     The :py:obj:`~.CUmemcpyAttributes.srcAccessOrder` indicates the source
     access ordering to be observed for copies associated with the
@@ -33487,7 +33484,7 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
     :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocArray`
     """
     cdef CUarray pHandle = CUarray()
-    cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
+    cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = <cydriver.CUDA_ARRAY_DESCRIPTOR*>pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
     with nogil:
         err = cydriver.cuArrayCreate(<cydriver.CUarray*>pHandle._pvt_ptr, cypAllocateArray_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -33987,7 +33984,7 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
     :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc3DArray`
     """
     cdef CUarray pHandle = CUarray()
-    cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
+    cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = <cydriver.CUDA_ARRAY3D_DESCRIPTOR*>pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
     with nogil:
         err = cydriver.cuArray3DCreate(<cydriver.CUarray*>pHandle._pvt_ptr, cypAllocateArray_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -34157,7 +34154,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
     :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaMallocMipmappedArray`
     """
     cdef CUmipmappedArray pHandle = CUmipmappedArray()
-    cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc is not None else NULL
+    cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = <cydriver.CUDA_ARRAY3D_DESCRIPTOR*>pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc is not None else NULL
     with nogil:
         err = cydriver.cuMipmappedArrayCreate(<cydriver.CUmipmappedArray*>pHandle._pvt_ptr, cypMipmappedArrayDesc_ptr, numMipmapLevels)
     if err != cydriver.CUDA_SUCCESS:
@@ -34401,7 +34398,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
     else:
         pstream = int(CUstream(stream))
     cystream = <cydriver.CUstream><void_ptr>pstream
-    cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray is not None else NULL
+    cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = <cydriver.CUmemDecompressParams*>paramsArray._pvt_ptr if paramsArray is not None else NULL
     cdef size_t errorIndex = 0
     with nogil:
         err = cydriver.cuMemBatchDecompressAsync(cyparamsArray_ptr, count, flags, &errorIndex, cystream)
@@ -34522,7 +34519,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     :py:obj:`~.cuMemGetAllocationGranularity` with the
     :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. To create a CPU
     allocation targeting a specific host NUMA node, applications must set
-    :py:obj:`~.CUmemAllocationProp`::CUmemLocation::type to
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
     :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id must specify the
     NUMA ID of the CPU. On systems where NUMA is not available
@@ -34552,7 +34549,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     /proc/devices users can execute the following command: `mknod
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
 
-    If :py:obj:`~.CUmemAllocationProp`::allocFlags::usage contains
+    If :py:obj:`~.CUmemAllocationProp.allocFlags.usage` contains
     :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL` flag then the memory
     allocation is intended only to be used as backing tile pool for sparse
     CUDA arrays and sparse CUDA mipmapped arrays. (see
@@ -34580,7 +34577,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle`
     """
     cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle()
-    cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL
+    cdef cydriver.CUmemAllocationProp* cyprop_ptr = <cydriver.CUmemAllocationProp*>prop._pvt_ptr if prop is not None else NULL
     with nogil:
         err = cydriver.cuMemCreate(<cydriver.CUmemGenericAllocationHandle*>handle._pvt_ptr, size, cyprop_ptr, flags)
     if err != cydriver.CUDA_SUCCESS:
@@ -34731,17 +34728,17 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     where :py:obj:`~.CUarrayMapInfo.resourceType` specifies the type of
     resource to be operated on. If :py:obj:`~.CUarrayMapInfo.resourceType`
     is set to :py:obj:`~.CUresourcetype`::CU_RESOURCE_TYPE_ARRAY then
-    :py:obj:`~.CUarrayMapInfo`::resource::array must be set to a valid
-    sparse CUDA array handle. The CUDA array must be either a 2D, 2D
-    layered or 3D CUDA array and must have been allocated using
-    :py:obj:`~.cuArrayCreate` or :py:obj:`~.cuArray3DCreate` with the flag
+    :py:obj:`~.CUarrayMapInfo.resource.array` must be set to a valid sparse
+    CUDA array handle. The CUDA array must be either a 2D, 2D layered or 3D
+    CUDA array and must have been allocated using :py:obj:`~.cuArrayCreate`
+    or :py:obj:`~.cuArray3DCreate` with the flag
     :py:obj:`~.CUDA_ARRAY3D_SPARSE` or
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`. For CUDA arrays obtained
     using :py:obj:`~.cuMipmappedArrayGetLevel`,
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned. If
     :py:obj:`~.CUarrayMapInfo.resourceType` is set to
     :py:obj:`~.CUresourcetype`::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY then
-    :py:obj:`~.CUarrayMapInfo`::resource::mipmap must be set to a valid
+    :py:obj:`~.CUarrayMapInfo.resource.mipmap` must be set to a valid
     sparse CUDA mipmapped array handle. The CUDA mipmapped array must be
     either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
     allocated using :py:obj:`~.cuMipmappedArrayCreate` with the flag
@@ -34765,26 +34762,25 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to
     :py:obj:`~.CUarraySparseSubresourceType`::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
-    then :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel struct must
+    then :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel` struct must
     contain valid array subregion offsets and extents. The
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetX,
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetY and
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetZ must
-    specify valid X, Y and Z offsets respectively. The
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentWidth,
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentHeight and
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentDepth must
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetX`,
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetY` and
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must specify
+    valid X, Y and Z offsets respectively. The
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentWidth`,
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentHeight` and
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must
     specify valid width, height and depth extents respectively. These
     offsets and extents must be aligned to the corresponding tile
     dimension. For CUDA mipmapped arrays
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::level must
-    specify a valid mip level index. Otherwise, must be zero. For layered
-    CUDA arrays and layered CUDA mipmapped arrays
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::layer must
-    specify a valid layer index. Otherwise, must be zero.
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::offsetZ must be
-    zero and
-    :py:obj:`~.CUarrayMapInfo`::subresource::sparseLevel::extentDepth must
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.level` must specify a
+    valid mip level index. Otherwise, must be zero. For layered CUDA arrays
+    and layered CUDA mipmapped arrays
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.layer` must specify a
+    valid layer index. Otherwise, must be zero.
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must be zero
+    and :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must
     be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped
     arrays. Tile extents can be obtained by calling
     :py:obj:`~.cuArrayGetSparseProperties` and
@@ -34792,23 +34788,23 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to
     :py:obj:`~.CUarraySparseSubresourceType`::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
-    then :py:obj:`~.CUarrayMapInfo`::subresource::miptail struct must
-    contain valid mip tail offset in
-    :py:obj:`~.CUarrayMapInfo`::subresource::miptail::offset and size in
-    :py:obj:`~.CUarrayMapInfo`::subresource::miptail::size. Both, mip tail
+    then :py:obj:`~.CUarrayMapInfo.subresource.miptail` struct must contain
+    valid mip tail offset in
+    :py:obj:`~.CUarrayMapInfo.subresource.miptail.offset` and size in
+    :py:obj:`~.CUarrayMapInfo.subresource.miptail.size`. Both, mip tail
     offset and mip tail size must be aligned to the tile size. For layered
     CUDA mipmapped arrays which don't have the flag
     :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL` set in
     :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES.flags` as returned by
     :py:obj:`~.cuMipmappedArrayGetSparseProperties`,
-    :py:obj:`~.CUarrayMapInfo`::subresource::miptail::layer must specify a
+    :py:obj:`~.CUarrayMapInfo.subresource.miptail.layer` must specify a
     valid layer index. Otherwise, must be zero.
 
-    If :py:obj:`~.CUarrayMapInfo`::resource::array or
-    :py:obj:`~.CUarrayMapInfo`::resource::mipmap was created with
+    If :py:obj:`~.CUarrayMapInfo.resource.array` or
+    :py:obj:`~.CUarrayMapInfo.resource.mipmap` was created with
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING` flag set the
     :py:obj:`~.CUarrayMapInfo.subresourceType` and the contents of
-    :py:obj:`~.CUarrayMapInfo`::subresource will be ignored.
+    :py:obj:`~.CUarrayMapInfo.subresource` will be ignored.
 
     :py:obj:`~.CUarrayMapInfo.memOperationType` specifies the type of
     operation. :py:obj:`~.CUmemOperationType` is defined as:
@@ -34818,7 +34814,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
     :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_MAP then the
     subresource will be mapped onto the tile pool memory specified by
-    :py:obj:`~.CUarrayMapInfo`::memHandle at offset
+    :py:obj:`~.CUarrayMapInfo.memHandle` at offset
     :py:obj:`~.CUarrayMapInfo.offset`. The tile pool allocation has to be
     created by specifying the :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL`
     flag when calling :py:obj:`~.cuMemCreate`. Also,
@@ -34827,7 +34823,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
     :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_UNMAP then an
-    unmapping operation is performed. :py:obj:`~.CUarrayMapInfo`::memHandle
+    unmapping operation is performed. :py:obj:`~.CUarrayMapInfo.memHandle`
     must be NULL.
 
     :py:obj:`~.CUarrayMapInfo.deviceBitMask` specifies the list of devices
@@ -34837,7 +34833,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
     :py:obj:`~.CUmemOperationType`::CU_MEM_OPERATION_TYPE_MAP, the device
     must also match the device associated with the tile pool memory
-    allocation as specified by :py:obj:`~.CUarrayMapInfo`::memHandle.
+    allocation as specified by :py:obj:`~.CUarrayMapInfo.memHandle`.
 
     :py:obj:`~.CUarrayMapInfo.flags` and
     :py:obj:`~.CUarrayMapInfo.reserved`[] are unused and must be set to
@@ -35043,7 +35039,7 @@ def cuMemGetAccess(location : Optional[CUmemLocation], ptr):
         pptr = int(CUdeviceptr(ptr))
     cyptr = <cydriver.CUdeviceptr><void_ptr>pptr
     cdef unsigned long long flags = 0
-    cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location is not None else NULL
+    cdef cydriver.CUmemLocation* cylocation_ptr = <cydriver.CUmemLocation*>location._pvt_ptr if location is not None else NULL
     with nogil:
         err = cydriver.cuMemGetAccess(&flags, cylocation_ptr, cyptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -35192,7 +35188,7 @@ def cuMemGetAllocationGranularity(prop : Optional[CUmemAllocationProp], option n
     :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap`
     """
     cdef size_t granularity = 0
-    cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL
+    cdef cydriver.CUmemAllocationProp* cyprop_ptr = <cydriver.CUmemAllocationProp*>prop._pvt_ptr if prop is not None else NULL
     cdef cydriver.CUmemAllocationGranularity_flags cyoption = int(option)
     with nogil:
         err = cydriver.cuMemGetAllocationGranularity(&granularity, cyprop_ptr, cyoption)
@@ -35682,7 +35678,7 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]):
         pmemPool = int(CUmemoryPool(memPool))
     cymemPool = <cydriver.CUmemoryPool><void_ptr>pmemPool
     cdef cydriver.CUmemAccess_flags flags
-    cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location is not None else NULL
+    cdef cydriver.CUmemLocation* cylocation_ptr = <cydriver.CUmemLocation*>location._pvt_ptr if location is not None else NULL
     with nogil:
         err = cydriver.cuMemPoolGetAccess(&flags, cymemPool, cylocation_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -35757,7 +35753,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
     """
     cdef CUmemoryPool pool = CUmemoryPool()
-    cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps is not None else NULL
+    cdef cydriver.CUmemPoolProps* cypoolProps_ptr = <cydriver.CUmemPoolProps*>poolProps._pvt_ptr if poolProps is not None else NULL
     with nogil:
         err = cydriver.cuMemPoolCreate(<cydriver.CUmemoryPool*>pool._pvt_ptr, cypoolProps_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -36065,7 +36061,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
         ppool = int(CUmemoryPool(pool))
     cypool = <cydriver.CUmemoryPool><void_ptr>ppool
     cdef CUdeviceptr ptr_out = CUdeviceptr()
-    cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = shareData._pvt_ptr if shareData is not None else NULL
+    cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = <cydriver.CUmemPoolPtrExportData*>shareData._pvt_ptr if shareData is not None else NULL
     with nogil:
         err = cydriver.cuMemPoolImportPointer(<cydriver.CUdeviceptr*>ptr_out._pvt_ptr, cypool, cyshareData_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -36126,7 +36122,7 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]):
     :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle`
     """
     cdef CUmemGenericAllocationHandle mcHandle = CUmemGenericAllocationHandle()
-    cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL
+    cdef cydriver.CUmulticastObjectProp* cyprop_ptr = <cydriver.CUmulticastObjectProp*>prop._pvt_ptr if prop is not None else NULL
     with nogil:
         err = cydriver.cuMulticastCreate(<cydriver.CUmemGenericAllocationHandle*>mcHandle._pvt_ptr, cyprop_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -36435,7 +36431,7 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not
     :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr`, :py:obj:`~.cuMulticastUnbind`
     """
     cdef size_t granularity = 0
-    cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL
+    cdef cydriver.CUmulticastObjectProp* cyprop_ptr = <cydriver.CUmulticastObjectProp*>prop._pvt_ptr if prop is not None else NULL
     cdef cydriver.CUmulticastGranularity_flags cyoption = int(option)
     with nogil:
         err = cydriver.cuMulticastGetGranularity(&granularity, cyprop_ptr, cyoption)
@@ -39346,7 +39342,7 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option
         phStream = int(CUstream(hStream))
     cyhStream = <cydriver.CUstream><void_ptr>phStream
     cdef cydriver.CUstreamAttrID cyattr = int(attr)
-    cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value is not None else NULL
+    cdef cydriver.CUstreamAttrValue* cyvalue_ptr = <cydriver.CUstreamAttrValue*>value._pvt_ptr if value is not None else NULL
     with nogil:
         err = cydriver.cuStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr)
     return (_CUresult(err),)
@@ -39832,89 +39828,84 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::fd must be a
-    valid file descriptor referencing a memory object. Ownership of the
-    file descriptor is transferred to the CUDA driver when the handle is
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a valid
+    file descriptor referencing a memory object. Ownership of the file
+    descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
     after it is imported results in undefined behavior.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32`, then exactly
-    one of
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must
+    one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must
     not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that
-    references a memory object. Ownership of this handle is not transferred
-    to CUDA after the import operation, so the application must release the
-    handle using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is
-    not NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that references a
+    memory object. Ownership of this handle is not transferred to CUDA
+    after the import operation, so the application must release the handle
+    using the appropriate system call. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
+    NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a memory object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle
-    must be non-NULL and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must
-    be NULL. The handle specified must be a globally shared KMT handle.
-    This handle does not hold a reference to the underlying object, and
-    thus will be invalid when all references to the memory object are
-    destroyed.
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
+    be non-NULL and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be
+    NULL. The handle specified must be a globally shared KMT handle. This
+    handle does not hold a reference to the underlying object, and thus
+    will be invalid when all references to the memory object are destroyed.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP`, then exactly one
-    of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle
-    and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D12Device::CreateSharedHandle when referring to a
-    ID3D12Heap object. This handle holds a reference to the underlying
-    object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is
-    not NULL, then it must point to a NULL-terminated array of UTF-16
+    of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Heap
+    object. This handle holds a reference to the underlying object. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
+    NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Heap object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE`, then exactly
-    one of
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must
+    one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must
     not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D12Device::CreateSharedHandle when referring to a
-    ID3D12Resource object. This handle holds a reference to the underlying
-    object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is
-    not NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Resource
+    object. This handle holds a reference to the underlying object. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
+    NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Resource object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle
-    must represent a valid shared NT handle that is returned by
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
+    represent a valid shared NT handle that is returned by
     IDXGIResource1::CreateSharedHandle when referring to a ID3D11Resource
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name is
-    not NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
+    NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D11Resource object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::handle
-    must represent a valid shared KMT handle that is returned by
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
+    represent a valid shared KMT handle that is returned by
     IDXGIResource::GetSharedHandle when referring to a ID3D11Resource
     object and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::win32::name must
-    be NULL.
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be
+    NULL.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::nvSciBufObject
-    must be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.nvSciBufObject` must
+    be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf
     object imported into CUDA is also mapped by other drivers, then the
     application must use :py:obj:`~.cuWaitExternalSemaphoresAsync` or
     :py:obj:`~.cuSignalExternalSemaphoresAsync` as appropriate barriers to
@@ -39957,7 +39948,7 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
     and Cache Control" chapter from Vulkan specification.
     """
     cdef CUexternalMemory extMem_out = CUexternalMemory()
-    cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc is not None else NULL
+    cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = <cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC*>memHandleDesc._pvt_ptr if memHandleDesc is not None else NULL
     with nogil:
         err = cydriver.cuImportExternalMemory(<cydriver.CUexternalMemory*>extMem_out._pvt_ptr, cymemHandleDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -40027,7 +40018,7 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_
         pextMem = int(CUexternalMemory(extMem))
     cyextMem = <cydriver.CUexternalMemory><void_ptr>pextMem
     cdef CUdeviceptr devPtr = CUdeviceptr()
-    cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc is not None else NULL
+    cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = <cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC*>bufferDesc._pvt_ptr if bufferDesc is not None else NULL
     with nogil:
         err = cydriver.cuExternalMemoryGetMappedBuffer(<cydriver.CUdeviceptr*>devPtr._pvt_ptr, cyextMem, cybufferDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -40100,7 +40091,7 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E
         pextMem = int(CUexternalMemory(extMem))
     cyextMem = <cydriver.CUexternalMemory><void_ptr>pextMem
     cdef CUmipmappedArray mipmap = CUmipmappedArray()
-    cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc is not None else NULL
+    cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = <cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC*>mipmapDesc._pvt_ptr if mipmapDesc is not None else NULL
     with nogil:
         err = cydriver.cuExternalMemoryGetMappedMipmappedArray(<cydriver.CUmipmappedArray*>mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -40169,7 +40160,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::fd must be a
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be a
     valid file descriptor referencing a synchronization object. Ownership
     of the file descriptor is transferred to the CUDA driver when the
     handle is imported successfully. Performing any operations on the file
@@ -40178,98 +40169,95 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32`, then
     exactly one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    is not NULL, then it must represent a valid shared NT handle that
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
+    not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
+    not NULL, then it must represent a valid shared NT handle that
     references a synchronization object. Ownership of this handle is not
     transferred to CUDA after the import operation, so the application must
     release the handle using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
     not NULL, then it must name a valid synchronization object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     must be non-NULL and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name
-    must be NULL. The handle specified must be a globally shared KMT
-    handle. This handle does not hold a reference to the underlying object,
-    and thus will be invalid when all references to the synchronization
-    object are destroyed.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
+    be NULL. The handle specified must be a globally shared KMT handle.
+    This handle does not hold a reference to the underlying object, and
+    thus will be invalid when all references to the synchronization object
+    are destroyed.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE`, then exactly
     one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    is not NULL, then it must represent a valid shared NT handle that is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
+    not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
+    not NULL, then it must represent a valid shared NT handle that is
     returned by ID3D12Device::CreateSharedHandle when referring to a
     ID3D12Fence object. This handle holds a reference to the underlying
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
     not NULL, then it must name a valid synchronization object that refers
     to a valid ID3D12Fence object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared NT handle that is returned by
     ID3D11Fence::CreateSharedHandle. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
     not NULL, then it must name a valid synchronization object that refers
     to a valid ID3D11Fence object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::nvSciSyncObj
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.nvSciSyncObj`
     represents a valid NvSciSyncObj.
 
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared NT handle that is returned by
     IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
     not NULL, then it must name a valid synchronization object that refers
     to a valid IDXGIKeyedMutex object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`,
     then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared KMT handle that is returned by
     IDXGIResource::GetSharedHandle when referring to a IDXGIKeyedMutex
     object and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name
-    must be NULL.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
+    be NULL.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`,
-    then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::fd must
-    be a valid file descriptor referencing a synchronization object.
-    Ownership of the file descriptor is transferred to the CUDA driver when
-    the handle is imported successfully. Performing any operations on the
-    file descriptor after it is imported results in undefined behavior.
+    then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be
+    a valid file descriptor referencing a synchronization object. Ownership
+    of the file descriptor is transferred to the CUDA driver when the
+    handle is imported successfully. Performing any operations on the file
+    descriptor after it is imported results in undefined behavior.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`,
     then exactly one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::handle
-    is not NULL, then it must represent a valid shared NT handle that
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
+    not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
+    not NULL, then it must represent a valid shared NT handle that
     references a synchronization object. Ownership of this handle is not
     transferred to CUDA after the import operation, so the application must
     release the handle using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`::handle::win32::name is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
     not NULL, then it must name a valid synchronization object.
 
     Parameters
@@ -40289,7 +40277,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
     :py:obj:`~.cuDestroyExternalSemaphore`, :py:obj:`~.cuSignalExternalSemaphoresAsync`, :py:obj:`~.cuWaitExternalSemaphoresAsync`
     """
     cdef CUexternalSemaphore extSem_out = CUexternalSemaphore()
-    cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc is not None else NULL
+    cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = <cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC*>semHandleDesc._pvt_ptr if semHandleDesc is not None else NULL
     with nogil:
         err = cydriver.cuImportExternalSemaphore(<cydriver.CUexternalSemaphore*>extSem_out._pvt_ptr, cysemHandleDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -40322,15 +40310,15 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`,
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`
     then the semaphore will be set to the value specified in
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::fence::value.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` this API sets
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
     to a value that can be used by subsequent waiters of the same NvSciSync
     object to order operations with those currently submitted in `stream`.
     Such an update will overwrite previous contents of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`.
     By default, signaling such an external semaphore object causes
     appropriate memory synchronization operations to be performed over all
     external memory objects that are imported as
@@ -40465,12 +40453,12 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`
     then waiting on the semaphore will wait until the value of the
     semaphore is greater than or equal to
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::fence::value.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` then, waiting
     on the semaphore will wait until the
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`::params::nvSciSync::fence
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
     is signaled by the signaler of the NvSciSyncObj that was associated
     with this semaphore object. By default, waiting on such an external
     semaphore object causes appropriate memory synchronization operations
@@ -40494,9 +40482,9 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`
     then the keyed mutex will be acquired when it is released with the key
     specified in
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::keyedmutex::key
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.key`
     or until the timeout specified by
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`::params::keyedmutex::timeoutMs
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.timeoutMs`
     has lapsed. The timeout interval can either be a finite value specified
     in milliseconds or an infinite value. In case an infinite value is
     specified the timeout never elapses. The windows INFINITE macro must be
@@ -41725,7 +41713,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     other than 0 or 1 is not allowed.
 
     On success, a handle will be returned via
-    :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode
+    :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode`
     which can be passed to the various device-side update functions to
     update the node's kernel parameters from within another kernel. For
     more information on the types of device updates that can be made, as
@@ -41827,7 +41815,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     else:
         pf = int(CUfunction(f))
     cyf = <cydriver.CUfunction><void_ptr>pf
-    cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config is not None else NULL
+    cdef cydriver.CUlaunchConfig* cyconfig_ptr = <cydriver.CUlaunchConfig*>config._pvt_ptr if config is not None else NULL
     cykernelParams = _HelperKernelParams(kernelParams)
     cdef void** cykernelParams_ptr = <void**><void_ptr>cykernelParams.ckernelParams
     with nogil:
@@ -42645,7 +42633,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     Notes
     -----
-    In certain cases where cubins are created with no ABI (i.e., using `ptxas` `None` `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
+    In certain cases where cubins are created with no ABI (i.e., using `ptxas` `--abi-compile` `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42936,7 +42924,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_KERNEL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddKernelNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -43027,7 +43015,7 @@ def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_KERNEL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphKernelNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -43114,7 +43102,7 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = <cydriver.CUDA_MEMCPY3D*>copyParams._pvt_ptr if copyParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddMemcpyNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cycopyParams_ptr, cyctx)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -43196,7 +43184,7 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]):
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = <cydriver.CUDA_MEMCPY3D*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphMemcpyNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -43273,7 +43261,7 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams is not None else NULL
+    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = <cydriver.CUDA_MEMSET_NODE_PARAMS*>memsetParams._pvt_ptr if memsetParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddMemsetNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cymemsetParams_ptr, cyctx)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -43355,7 +43343,7 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_MEMSET_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphMemsetNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -43422,7 +43410,7 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_HOST_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddHostNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -43504,7 +43492,7 @@ def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS]
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_HOST_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphHostNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -44097,7 +44085,7 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[tuple
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddExternalSemaphoresSignalNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -44186,7 +44174,7 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -44254,7 +44242,7 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[tuple[C
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddExternalSemaphoresWaitNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -44343,7 +44331,7 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -44414,7 +44402,7 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddBatchMemOpNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -44504,7 +44492,7 @@ def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_O
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphBatchMemOpNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -44575,7 +44563,7 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecBatchMemOpNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -44683,7 +44671,7 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[tuple[CUgraphNode] |
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_MEM_ALLOC_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddMemAllocNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -46350,7 +46338,7 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
         phGraph = int(CUgraph(hGraph))
     cyhGraph = <cydriver.CUgraph><void_ptr>phGraph
     cdef CUgraphExec phGraphExec = CUgraphExec()
-    cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams is not None else NULL
+    cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = <cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS*>instantiateParams._pvt_ptr if instantiateParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphInstantiateWithParams(<cydriver.CUgraphExec*>phGraphExec._pvt_ptr, cyhGraph, cyinstantiateParams_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -46476,7 +46464,7 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_KERNEL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecKernelNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -46551,7 +46539,7 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams is not None else NULL
+    cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = <cydriver.CUDA_MEMCPY3D*>copyParams._pvt_ptr if copyParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecMemcpyNodeSetParams(cyhGraphExec, cyhNode, cycopyParams_ptr, cyctx)
     return (_CUresult(err),)
@@ -46631,7 +46619,7 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams is not None else NULL
+    cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = <cydriver.CUDA_MEMSET_NODE_PARAMS*>memsetParams._pvt_ptr if memsetParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecMemsetNodeSetParams(cyhGraphExec, cyhNode, cymemsetParams_ptr, cyctx)
     return (_CUresult(err),)
@@ -46686,7 +46674,7 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_HOST_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecHostNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -46942,7 +46930,7 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -47002,7 +46990,7 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = <cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -47620,7 +47608,7 @@ def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
     cdef cydriver.CUkernelNodeAttrID cyattr = int(attr)
-    cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value is not None else NULL
+    cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = <cydriver.CUkernelNodeAttrValue*>value._pvt_ptr if value is not None else NULL
     with nogil:
         err = cydriver.cuGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr)
     return (_CUresult(err),)
@@ -47993,7 +47981,7 @@ def cuGraphAddNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUg
     elif len(dependencies) == 1:
         cydependencies = <cydriver.CUgraphNode*>(<CUgraphNode>dependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
-    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = <cydriver.CUgraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddNode(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -48089,7 +48077,7 @@ def cuGraphAddNode_v2(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[
         cydependencyData = (<CUgraphEdgeData>dependencyData[0])._pvt_ptr
     if numDependencies > <size_t>len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies))
     if numDependencies > <size_t>len(dependencyData): raise RuntimeError("List is too small: " + str(len(dependencyData)) + " < " + str(numDependencies))
-    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = <cydriver.CUgraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphAddNode_v2(<cydriver.CUgraphNode*>phGraphNode._pvt_ptr, cyhGraph, cydependencies, cydependencyData, numDependencies, cynodeParams_ptr)
     if len(dependencies) > 1 and cydependencies is not NULL:
@@ -48139,7 +48127,7 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]):
     else:
         phNode = int(CUgraphNode(hNode))
     cyhNode = <cydriver.CUgraphNode><void_ptr>phNode
-    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = <cydriver.CUgraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -48199,7 +48187,7 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod
     else:
         phGraphExec = int(CUgraphExec(hGraphExec))
     cyhGraphExec = <cydriver.CUgraphExec><void_ptr>phGraphExec
-    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = <cydriver.CUgraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cydriver.cuGraphExecNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_CUresult(err),)
@@ -48662,7 +48650,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
         pfunc = int(CUfunction(func))
     cyfunc = <cydriver.CUfunction><void_ptr>pfunc
     cdef int clusterSize = 0
-    cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config is not None else NULL
+    cdef cydriver.CUlaunchConfig* cyconfig_ptr = <cydriver.CUlaunchConfig*>config._pvt_ptr if config is not None else NULL
     with nogil:
         err = cydriver.cuOccupancyMaxPotentialClusterSize(&clusterSize, cyfunc, cyconfig_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -48722,7 +48710,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
         pfunc = int(CUfunction(func))
     cyfunc = <cydriver.CUfunction><void_ptr>pfunc
     cdef int numClusters = 0
-    cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config is not None else NULL
+    cdef cydriver.CUlaunchConfig* cyconfig_ptr = <cydriver.CUlaunchConfig*>config._pvt_ptr if config is not None else NULL
     with nogil:
         err = cydriver.cuOccupancyMaxActiveClusters(&numClusters, cyfunc, cyconfig_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -48986,7 +48974,7 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr,
     else:
         phTexRef = int(CUtexref(hTexRef))
     cyhTexRef = <cydriver.CUtexref><void_ptr>phTexRef
-    cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL
+    cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = <cydriver.CUDA_ARRAY_DESCRIPTOR*>desc._pvt_ptr if desc is not None else NULL
     with nogil:
         err = cydriver.cuTexRefSetAddress2D(cyhTexRef, cydesc_ptr, cydptr, Pitch)
     return (_CUresult(err),)
@@ -50159,23 +50147,23 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_ARRAY`,
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::array::hArray must be set to a
-    valid CUDA array handle.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid
+    CUDA array handle.
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`,
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::mipmap::hMipmappedArray must be
-    set to a valid CUDA mipmapped array handle.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.mipmap.hMipmappedArray` must be set
+    to a valid CUDA mipmapped array handle.
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_LINEAR`,
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::devPtr must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.devPtr` must be set to a valid
+    device pointer, that is aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`.
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::format and
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::numChannels describe the
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.format` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.numChannels` describe the
     format of each component and the number of components per array
-    element. :py:obj:`~.CUDA_RESOURCE_DESC`::res::linear::sizeInBytes
+    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.sizeInBytes`
     specifies the size of the array in bytes. The total number of elements
     in the linear address range cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`. The
@@ -50184,20 +50172,19 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_PITCH2D`,
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::devPtr must be set to a
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.devPtr` must be set to a
     valid device pointer, that is aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`.
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::format and
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::numChannels describe the
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.format` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.numChannels` describe the
     format of each component and the number of components per array
-    element. :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::width and
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::height specify the width
-    and height of the array in elements, and cannot exceed
+    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.width` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.height` specify the width and
+    height of the array in elements, and cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH` and
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`
-    respectively.
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::pitch2D::pitchInBytes specifies
-    the pitch between two rows in bytes and has to be aligned to
+    respectively. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.pitchInBytes`
+    specifies the pitch between two rows in bytes and has to be aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`. Pitch cannot
     exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`.
 
@@ -50357,9 +50344,9 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
     :py:obj:`~.cuTexObjectDestroy`, :py:obj:`~.cudaCreateTextureObject`
     """
     cdef CUtexObject pTexObject = CUtexObject()
-    cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc is not None else NULL
-    cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc is not None else NULL
-    cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc is not None else NULL
+    cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = <cydriver.CUDA_RESOURCE_DESC*>pResDesc._pvt_ptr if pResDesc is not None else NULL
+    cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = <cydriver.CUDA_TEXTURE_DESC*>pTexDesc._pvt_ptr if pTexDesc is not None else NULL
+    cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = <cydriver.CUDA_RESOURCE_VIEW_DESC*>pResViewDesc._pvt_ptr if pResViewDesc is not None else NULL
     with nogil:
         err = cydriver.cuTexObjectCreate(<cydriver.CUtexObject*>pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -50536,9 +50523,9 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
     describes the data to perform surface load/stores on.
     :py:obj:`~.CUDA_RESOURCE_DESC.resType` must be
     :py:obj:`~.CU_RESOURCE_TYPE_ARRAY` and
-    :py:obj:`~.CUDA_RESOURCE_DESC`::res::array::hArray must be set to a
-    valid CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be
-    set to zero.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid
+    CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be set to
+    zero.
 
     Surface objects are only supported on devices of compute capability 3.0
     or higher. Additionally, a surface object is an opaque value, and, as
@@ -50561,7 +50548,7 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
     :py:obj:`~.cuSurfObjectDestroy`, :py:obj:`~.cudaCreateSurfaceObject`
     """
     cdef CUsurfObject pSurfObject = CUsurfObject()
-    cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc is not None else NULL
+    cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = <cydriver.CUDA_RESOURCE_DESC*>pResDesc._pvt_ptr if pResDesc is not None else NULL
     with nogil:
         err = cydriver.cuSurfObjectCreate(<cydriver.CUsurfObject*>pSurfObject._pvt_ptr, cypResDesc_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -51653,7 +51640,7 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress):
     --------
     :py:obj:`~.cuTensorMapEncodeTiled`, :py:obj:`~.cuTensorMapEncodeIm2col`, :py:obj:`~.cuTensorMapEncodeIm2colWide`
     """
-    cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap is not None else NULL
+    cdef cydriver.CUtensorMap* cytensorMap_ptr = <cydriver.CUtensorMap*>tensorMap._pvt_ptr if tensorMap is not None else NULL
     cdef _HelperInputVoidPtrStruct cyglobalAddressHelper
     cdef void* cyglobalAddress = _helper_input_void_ptr(globalAddress, &cyglobalAddressHelper)
     with nogil:
@@ -52882,7 +52869,7 @@ def cuGetExportTable(pExportTableId : Optional[CUuuid]):
         None
     """
     cdef void_ptr ppExportTable = 0
-    cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId is not None else NULL
+    cdef cydriver.CUuuid* cypExportTableId_ptr = <cydriver.CUuuid*>pExportTableId._pvt_ptr if pExportTableId is not None else NULL
     with nogil:
         err = cydriver.cuGetExportTable(<const void**>&ppExportTable, cypExportTableId_ptr)
     if err != cydriver.CUDA_SUCCESS:
@@ -53271,8 +53258,8 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     result : list[:py:obj:`~.CUdevResource`]
-        Output array of `None` resources. Can be NULL to query the number
-        of groups.
+        Output array of `CUdevResource` resources. Can be NULL to query the
+        number of groups.
     nbGroups : unsigned int
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
@@ -53292,7 +53279,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
         if cyresult is NULL:
             raise MemoryError('Failed to allocate length x size memory: ' + str(nbGroups) + 'x' + str(sizeof(cydriver.CUdevResource)))
     cdef unsigned int cynbGroups = nbGroups
-    cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ is not None else NULL
+    cdef cydriver.CUdevResource* cyinput__ptr = <cydriver.CUdevResource*>input_._pvt_ptr if input_ is not None else NULL
     cdef CUdevResource remaining = CUdevResource()
     with nogil:
         err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, <cydriver.CUdevResource*>remaining._pvt_ptr, useFlags, minCount)
@@ -53927,7 +53914,7 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]):
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` :py:obj:`~.CUDA_ERROR_NOT_READY`
     """
-    cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args is not None else NULL
+    cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = <cydriver.CUcheckpointLockArgs*>args._pvt_ptr if args is not None else NULL
     with nogil:
         err = cydriver.cuCheckpointProcessLock(pid, cyargs_ptr)
     return (_CUresult(err),)
@@ -53958,7 +53945,7 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     """
-    cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args is not None else NULL
+    cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = <cydriver.CUcheckpointCheckpointArgs*>args._pvt_ptr if args is not None else NULL
     with nogil:
         err = cydriver.cuCheckpointProcessCheckpoint(pid, cyargs_ptr)
     return (_CUresult(err),)
@@ -53994,7 +53981,7 @@ def cuCheckpointProcessRestore(int pid, args : Optional[CUcheckpointRestoreArgs]
     --------
     :py:obj:`~.cuInit`
     """
-    cdef cydriver.CUcheckpointRestoreArgs* cyargs_ptr = args._pvt_ptr if args is not None else NULL
+    cdef cydriver.CUcheckpointRestoreArgs* cyargs_ptr = <cydriver.CUcheckpointRestoreArgs*>args._pvt_ptr if args is not None else NULL
     with nogil:
         err = cydriver.cuCheckpointProcessRestore(pid, cyargs_ptr)
     return (_CUresult(err),)
@@ -54023,7 +54010,7 @@ def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]):
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     """
-    cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args is not None else NULL
+    cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = <cydriver.CUcheckpointUnlockArgs*>args._pvt_ptr if args is not None else NULL
     with nogil:
         err = cydriver.cuCheckpointProcessUnlock(pid, cyargs_ptr)
     return (_CUresult(err),)
@@ -54642,7 +54629,7 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea
         cyconn = <cydriver.CUeglStreamConnection*><void_ptr>conn
     else:
         raise TypeError("Argument 'conn' is not instance of type (expected <class 'int, driver.CUeglStreamConnection'>, found " + str(type(conn)))
-    cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe is not None else NULL
+    cdef cydriver.CUeglFrame* cyeglframe_ptr = <cydriver.CUeglFrame*>eglframe._pvt_ptr if eglframe is not None else NULL
     with nogil:
         err = cydriver.cuEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream)
     return (_CUresult(err),)
diff --git a/cuda_bindings/cuda/bindings/nvml.pxd b/cuda_bindings/cuda/bindings/nvml.pxd
index 3dc3f58e5d..1822e272f3 100644
--- a/cuda_bindings/cuda/bindings/nvml.pxd
+++ b/cuda_bindings/cuda/bindings/nvml.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -52,9 +52,6 @@ ctypedef nvmlMask255_t Mask255
 ctypedef nvmlHostname_v1_t Hostname_v1
 ctypedef nvmlUnrepairableMemoryStatus_v1_t UnrepairableMemoryStatus_v1
 ctypedef nvmlRusdSettings_v1_t RusdSettings_v1
-ctypedef nvmlVgpuSchedulerStateInfo_v2_t VgpuSchedulerStateInfo_v2
-ctypedef nvmlVgpuSchedulerLogEntry_v2_t VgpuSchedulerLogEntry_v2
-ctypedef nvmlVgpuSchedulerState_v2_t VgpuSchedulerState_v2
 ctypedef nvmlPowerValue_v2_t PowerValue_v2
 ctypedef nvmlVgpuTypeMaxInstance_v1_t VgpuTypeMaxInstance_v1
 ctypedef nvmlVgpuProcessUtilizationSample_t VgpuProcessUtilizationSample
@@ -70,7 +67,6 @@ ctypedef nvmlWorkloadPowerProfileCurrentProfiles_v1_t WorkloadPowerProfileCurren
 ctypedef nvmlWorkloadPowerProfileRequestedProfiles_v1_t WorkloadPowerProfileRequestedProfiles_v1
 ctypedef nvmlWorkloadPowerProfileUpdateProfiles_v1_t WorkloadPowerProfileUpdateProfiles_v1
 ctypedef nvmlPRMTLV_v1_t PRMTLV_v1
-ctypedef nvmlVgpuSchedulerLogInfo_v2_t VgpuSchedulerLogInfo_v2
 ctypedef nvmlVgpuSchedulerSetState_t VgpuSchedulerSetState
 ctypedef nvmlGpmMetricsGet_t GpmMetricsGet
 ctypedef nvmlPRMCounterList_v1_t PRMCounterList_v1
@@ -158,7 +154,7 @@ cpdef int system_get_cuda_driver_version_v2() except 0
 cpdef str system_get_process_name(unsigned int pid)
 cpdef object system_get_hic_version()
 cpdef unsigned int unit_get_count() except? 0
-cpdef intptr_t unit_get_handle_by_index(unsigned int ind_ex) except? 0
+cpdef intptr_t unit_get_handle_by_index(unsigned int index) except? 0
 cpdef object unit_get_unit_info(intptr_t unit)
 cpdef object unit_get_led_state(intptr_t unit)
 cpdef object unit_get_psu_info(intptr_t unit)
@@ -166,7 +162,7 @@ cpdef unsigned int unit_get_temperature(intptr_t unit, unsigned int type) except
 cpdef object unit_get_fan_speed_info(intptr_t unit)
 cpdef unsigned int device_get_count_v2() except? 0
 cpdef object device_get_attributes_v2(intptr_t device)
-cpdef intptr_t device_get_handle_by_index_v2(unsigned int ind_ex) except? 0
+cpdef intptr_t device_get_handle_by_index_v2(unsigned int index) except? 0
 cpdef intptr_t device_get_handle_by_serial(serial) except? 0
 cpdef intptr_t device_get_handle_by_uuid(uuid) except? 0
 cpdef intptr_t device_get_handle_by_pci_bus_id_v2(pci_bus_id) except? 0
@@ -183,7 +179,7 @@ cpdef device_set_cpu_affinity(intptr_t device)
 cpdef device_clear_cpu_affinity(intptr_t device)
 cpdef unsigned int device_get_numa_node_id(intptr_t device) except? 0
 cpdef int device_get_topology_common_ancestor(intptr_t device1, intptr_t device2) except? -1
-cpdef int device_get_p2p_status(intptr_t device1, intptr_t device2, int p2p_ind_ex) except? -1
+cpdef int device_get_p2p_status(intptr_t device1, intptr_t device2, int p2p_index) except? -1
 cpdef str device_get_uuid(intptr_t device)
 cpdef unsigned int device_get_minor_number(intptr_t device) except? 0
 cpdef str device_get_board_part_number(intptr_t device)
@@ -220,7 +216,7 @@ cpdef unsigned int device_get_fan_control_policy_v2(intptr_t device, unsigned in
 cpdef unsigned int device_get_num_fans(intptr_t device) except? 0
 cpdef object device_get_cooler_info(intptr_t device)
 cpdef unsigned int device_get_temperature_threshold(intptr_t device, int threshold_type) except? 0
-cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex)
+cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_index)
 cpdef int device_get_performance_state(intptr_t device) except? -1
 cpdef unsigned long long device_get_current_clocks_event_reasons(intptr_t device) except? 0
 cpdef unsigned long long device_get_supported_clocks_event_reasons(intptr_t device) except? 0
@@ -261,6 +257,7 @@ cpdef tuple device_get_driver_model_v2(intptr_t device)
 cpdef str device_get_vbios_version(intptr_t device)
 cpdef object device_get_bridge_chip_info(intptr_t device)
 cpdef object device_get_compute_running_processes_v3(intptr_t device)
+cpdef object device_get_graphics_running_processes_v3(intptr_t device)
 cpdef object device_get_mps_compute_running_processes_v3(intptr_t device)
 cpdef int device_on_same_board(intptr_t device1, intptr_t device2) except? 0
 cpdef int device_get_api_restriction(intptr_t device, int api_type) except? -1
@@ -350,7 +347,7 @@ cpdef unsigned int vgpu_type_get_gpu_instance_profile_id(unsigned int vgpu_type_
 cpdef tuple vgpu_type_get_device_id(unsigned int vgpu_type_id)
 cpdef unsigned long long vgpu_type_get_framebuffer_size(unsigned int vgpu_type_id) except? 0
 cpdef unsigned int vgpu_type_get_num_display_heads(unsigned int vgpu_type_id) except? 0
-cpdef tuple vgpu_type_get_resolution(unsigned int vgpu_type_id, unsigned int display_ind_ex)
+cpdef tuple vgpu_type_get_resolution(unsigned int vgpu_type_id, unsigned int display_index)
 cpdef str vgpu_type_get_license(unsigned int vgpu_type_id)
 cpdef unsigned int vgpu_type_get_frame_rate_limit(unsigned int vgpu_type_id) except? 0
 cpdef unsigned int vgpu_type_get_max_instances(intptr_t device, unsigned int vgpu_type_id) except? 0
@@ -389,7 +386,7 @@ cpdef object vgpu_instance_get_accounting_stats(unsigned int vgpu_instance, unsi
 cpdef vgpu_instance_clear_accounting_pids(unsigned int vgpu_instance)
 cpdef object vgpu_instance_get_license_info_v2(unsigned int vgpu_instance)
 cpdef unsigned int get_excluded_device_count() except? 0
-cpdef object get_excluded_device_info_by_index(unsigned int ind_ex)
+cpdef object get_excluded_device_info_by_index(unsigned int index)
 cpdef int device_set_mig_mode(intptr_t device, unsigned int mode) except? -1
 cpdef tuple device_get_mig_mode(intptr_t device)
 cpdef object device_get_gpu_instance_possible_placements_v2(intptr_t device, unsigned int profile_id)
@@ -411,7 +408,7 @@ cpdef unsigned int device_is_mig_device_handle(intptr_t device) except? 0
 cpdef unsigned int device_get_gpu_instance_id(intptr_t device) except? 0
 cpdef unsigned int device_get_compute_instance_id(intptr_t device) except? 0
 cpdef unsigned int device_get_max_mig_device_count(intptr_t device) except? 0
-cpdef intptr_t device_get_mig_device_handle_by_index(intptr_t device, unsigned int ind_ex) except? 0
+cpdef intptr_t device_get_mig_device_handle_by_index(intptr_t device, unsigned int index) except? 0
 cpdef intptr_t device_get_device_handle_from_mig_device_handle(intptr_t mig_device) except? 0
 cpdef device_power_smoothing_activate_preset_profile(intptr_t device, intptr_t profile)
 cpdef device_power_smoothing_update_preset_profile_param(intptr_t device, intptr_t profile)
@@ -420,3 +417,10 @@ cpdef object device_get_addressing_mode(intptr_t device)
 cpdef object device_get_repair_status(intptr_t device)
 cpdef object device_get_power_mizer_mode_v1(intptr_t device)
 cpdef device_set_power_mizer_mode_v1(intptr_t device, intptr_t power_mizer_mode)
+cpdef device_vgpu_force_gsp_unload(intptr_t device)
+cpdef object device_get_vgpu_scheduler_state_v2(intptr_t device)
+cpdef object gpu_instance_get_vgpu_scheduler_state_v2(intptr_t gpu_instance)
+cpdef object device_get_vgpu_scheduler_log_v2(intptr_t device)
+cpdef object gpu_instance_get_vgpu_scheduler_log_v2(intptr_t gpu_instance)
+cpdef device_set_vgpu_scheduler_state_v2(intptr_t device, intptr_t p_scheduler_state)
+cpdef gpu_instance_set_vgpu_scheduler_state_v2(intptr_t gpu_instance, intptr_t p_scheduler_state)
diff --git a/cuda_bindings/cuda/bindings/nvml.pyx b/cuda_bindings/cuda/bindings/nvml.pyx
index 42c9fdcc87..83a8526ec7 100644
--- a/cuda_bindings/cuda/bindings/nvml.pyx
+++ b/cuda_bindings/cuda/bindings/nvml.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1364+ged01d643e. Do not modify it directly.
+# This code was automatically generated across versions from 12.9.1 to 13.2.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 
 cimport cython  # NOQA
 
@@ -528,7 +528,7 @@ class Return(_FastEnum):
     See `nvmlReturn_t`.
     """
     SUCCESS = (NVML_SUCCESS, 'The operation was successful.')
-    ERROR_UNINITIALIZED = (NVML_ERROR_UNINITIALIZED, 'NVML was not first initialized with nvmlInit()')
+    ERROR_UNINITIALIZED = (NVML_ERROR_UNINITIALIZED, 'NVML was not first initialized with `nvmlInit()`')
     ERROR_INVALID_ARGUMENT = (NVML_ERROR_INVALID_ARGUMENT, 'A supplied argument is invalid.')
     ERROR_NOT_SUPPORTED = (NVML_ERROR_NOT_SUPPORTED, 'The requested operation is not available on target device.')
     ERROR_NO_PERMISSION = (NVML_ERROR_NO_PERMISSION, 'The current user does not have permission for operation.')
@@ -759,7 +759,7 @@ class FBCSessionType(_FastEnum):
 class DetachGpuState(_FastEnum):
     """
     Is the GPU device to be removed from the kernel by
-    nvmlDeviceRemoveGpu()
+    `nvmlDeviceRemoveGpu()`
 
     See `nvmlDetachGpuState_t`.
     """
@@ -768,7 +768,7 @@ class DetachGpuState(_FastEnum):
 
 class PcieLinkState(_FastEnum):
     """
-    Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu()
+    Parent bridge PCIe link state requested by `nvmlDeviceRemoveGpu()`
 
     See `nvmlPcieLinkState_t`.
     """
@@ -2061,7 +2061,6 @@ cdef object _nvml_error_factory(int status):
     return NvmlError(status)
 
 
-
 @cython.profile(False)
 cpdef int check_status(int status) except 1 nogil:
     if status != 0:
@@ -3174,21 +3173,17 @@ process_info_dtype = _get_process_info_dtype_offsets()
 
 cdef class ProcessInfo:
     """Empty-initialize an array of `nvmlProcessInfo_t`.
-
     The resulting object is of length `size` and of dtype `process_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlProcessInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=process_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -3356,21 +3351,17 @@ process_detail_v1_dtype = _get_process_detail_v1_dtype_offsets()
 
 cdef class ProcessDetail_v1:
     """Empty-initialize an array of `nvmlProcessDetail_v1_t`.
-
     The resulting object is of length `size` and of dtype `process_detail_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlProcessDetail_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=process_detail_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -4083,21 +4074,17 @@ bridge_chip_info_dtype = _get_bridge_chip_info_dtype_offsets()
 
 cdef class BridgeChipInfo:
     """Empty-initialize an array of `nvmlBridgeChipInfo_t`.
-
     The resulting object is of length `size` and of dtype `bridge_chip_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlBridgeChipInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=bridge_chip_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -4237,7 +4224,6 @@ value_dtype = _numpy.dtype((
     }
     ))
 
-
 cdef class Value:
     """Empty-initialize an instance of `nvmlValue_t`.
 
@@ -4423,7 +4409,7 @@ cdef class Value:
 
 
 cdef _get__py_anon_pod0_dtype_offsets():
-    cdef _anon_pod0 pod = _anon_pod0()
+    cdef cuda_bindings_nvml__anon_pod0 pod = cuda_bindings_nvml__anon_pod0()
     return _numpy.dtype({
         'names': ['controller', 'default_min_temp', 'default_max_temp', 'current_temp', 'target'],
         'formats': [_numpy.int32, _numpy.int32, _numpy.int32, _numpy.int32, _numpy.int32],
@@ -4434,25 +4420,25 @@ cdef _get__py_anon_pod0_dtype_offsets():
             (<intptr_t>&(pod.currentTemp)) - (<intptr_t>&pod),
             (<intptr_t>&(pod.target)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod0),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod0),
     })
 
 _py_anon_pod0_dtype = _get__py_anon_pod0_dtype_offsets()
 
 cdef class _py_anon_pod0:
-    """Empty-initialize an instance of `_anon_pod0`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod0`.
 
 
-    .. seealso:: `_anon_pod0`
+    .. seealso:: `cuda_bindings_nvml__anon_pod0`
     """
     cdef:
-        _anon_pod0 *_ptr
+        cuda_bindings_nvml__anon_pod0 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod0 *>calloc(1, sizeof(_anon_pod0))
+        self._ptr = <cuda_bindings_nvml__anon_pod0 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod0))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod0")
         self._owner = None
@@ -4460,7 +4446,7 @@ cdef class _py_anon_pod0:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod0 *ptr
+        cdef cuda_bindings_nvml__anon_pod0 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -4485,20 +4471,20 @@ cdef class _py_anon_pod0:
         if not isinstance(other, _py_anon_pod0):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod0)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod0)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod0), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod0), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod0 *>malloc(sizeof(_anon_pod0))
+            self._ptr = <cuda_bindings_nvml__anon_pod0 *>malloc(sizeof(cuda_bindings_nvml__anon_pod0))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod0")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod0))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod0))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -4563,7 +4549,7 @@ cdef class _py_anon_pod0:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod0 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod0), _py_anon_pod0)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod0), _py_anon_pod0)
 
     @staticmethod
     def from_data(data):
@@ -4587,14 +4573,14 @@ cdef class _py_anon_pod0:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod0 obj = _py_anon_pod0.__new__(_py_anon_pod0)
         if owner is None:
-            obj._ptr = <_anon_pod0 *>malloc(sizeof(_anon_pod0))
+            obj._ptr = <cuda_bindings_nvml__anon_pod0 *>malloc(sizeof(cuda_bindings_nvml__anon_pod0))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod0")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod0))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod0))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod0 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod0 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -4604,7 +4590,7 @@ cdef class _py_anon_pod0:
 cdef _get_cooler_info_v1_dtype_offsets():
     cdef nvmlCoolerInfo_v1_t pod = nvmlCoolerInfo_v1_t()
     return _numpy.dtype({
-        'names': ['version', 'ind_ex', 'signal_type', 'target'],
+        'names': ['version', 'index', 'signal_type', 'target'],
         'formats': [_numpy.uint32, _numpy.uint32, _numpy.int32, _numpy.int32],
         'offsets': [
             (<intptr_t>&(pod.version)) - (<intptr_t>&pod),
@@ -4695,12 +4681,12 @@ cdef class CoolerInfo_v1:
         self._ptr[0].version = val
 
     @property
-    def ind_ex(self):
+    def index(self):
         """int: the cooler index"""
         return self._ptr[0].index
 
-    @ind_ex.setter
-    def ind_ex(self, val):
+    @index.setter
+    def index(self, val):
         if self._readonly:
             raise ValueError("This CoolerInfo_v1 instance is read-only")
         self._ptr[0].index = val
@@ -4784,21 +4770,17 @@ clk_mon_fault_info_dtype = _get_clk_mon_fault_info_dtype_offsets()
 
 cdef class ClkMonFaultInfo:
     """Empty-initialize an array of `nvmlClkMonFaultInfo_t`.
-
     The resulting object is of length `size` and of dtype `clk_mon_fault_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlClkMonFaultInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=clk_mon_fault_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -5136,21 +5118,17 @@ process_utilization_sample_dtype = _get_process_utilization_sample_dtype_offsets
 
 cdef class ProcessUtilizationSample:
     """Empty-initialize an array of `nvmlProcessUtilizationSample_t`.
-
     The resulting object is of length `size` and of dtype `process_utilization_sample_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlProcessUtilizationSample_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=process_utilization_sample_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -5343,21 +5321,17 @@ process_utilization_info_v1_dtype = _get_process_utilization_info_v1_dtype_offse
 
 cdef class ProcessUtilizationInfo_v1:
     """Empty-initialize an array of `nvmlProcessUtilizationInfo_v1_t`.
-
     The resulting object is of length `size` and of dtype `process_utilization_info_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlProcessUtilizationInfo_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=process_utilization_info_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -5828,7 +5802,7 @@ cdef class EccSramErrorStatus_v1:
 cdef _get_platform_info_v1_dtype_offsets():
     cdef nvmlPlatformInfo_v1_t pod = nvmlPlatformInfo_v1_t()
     return _numpy.dtype({
-        'names': ['version', 'ib_guid', 'rack_guid', 'chassis_physical_slot_number', 'compute_slot_ind_ex', 'node_ind_ex', 'peer_type', 'module_id'],
+        'names': ['version', 'ib_guid', 'rack_guid', 'chassis_physical_slot_number', 'compute_slot_index', 'node_index', 'peer_type', 'module_id'],
         'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8],
         'offsets': [
             (<intptr_t>&(pod.version)) - (<intptr_t>&pod),
@@ -5968,23 +5942,23 @@ cdef class PlatformInfo_v1:
         self._ptr[0].chassisPhysicalSlotNumber = val
 
     @property
-    def compute_slot_ind_ex(self):
+    def compute_slot_index(self):
         """int: The index within the compute slots in the rack containing this GPU (does not include switches)"""
         return self._ptr[0].computeSlotIndex
 
-    @compute_slot_ind_ex.setter
-    def compute_slot_ind_ex(self, val):
+    @compute_slot_index.setter
+    def compute_slot_index(self, val):
         if self._readonly:
             raise ValueError("This PlatformInfo_v1 instance is read-only")
         self._ptr[0].computeSlotIndex = val
 
     @property
-    def node_ind_ex(self):
+    def node_index(self):
         """int: Index of the node within the slot containing this GPU."""
         return self._ptr[0].nodeIndex
 
-    @node_ind_ex.setter
-    def node_ind_ex(self, val):
+    @node_index.setter
+    def node_index(self, val):
         if self._readonly:
             raise ValueError("This PlatformInfo_v1 instance is read-only")
         self._ptr[0].nodeIndex = val
@@ -6055,7 +6029,7 @@ cdef class PlatformInfo_v1:
 cdef _get_platform_info_v2_dtype_offsets():
     cdef nvmlPlatformInfo_v2_t pod = nvmlPlatformInfo_v2_t()
     return _numpy.dtype({
-        'names': ['version', 'ib_guid', 'chassis_serial_number', 'slot_number', 'tray_ind_ex', 'host_id', 'peer_type', 'module_id'],
+        'names': ['version', 'ib_guid', 'chassis_serial_number', 'slot_number', 'tray_index', 'host_id', 'peer_type', 'module_id'],
         'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8],
         'offsets': [
             (<intptr_t>&(pod.version)) - (<intptr_t>&pod),
@@ -6195,12 +6169,12 @@ cdef class PlatformInfo_v2:
         self._ptr[0].slotNumber = val
 
     @property
-    def tray_ind_ex(self):
+    def tray_index(self):
         """int: The tray index within the compute slots in the chassis containing this GPU (does not include switches)"""
         return self._ptr[0].trayIndex
 
-    @tray_ind_ex.setter
-    def tray_ind_ex(self, val):
+    @tray_index.setter
+    def tray_index(self, val):
         if self._readonly:
             raise ValueError("This PlatformInfo_v2 instance is read-only")
         self._ptr[0].trayIndex = val
@@ -6280,7 +6254,7 @@ cdef class PlatformInfo_v2:
 
 
 cdef _get__py_anon_pod1_dtype_offsets():
-    cdef _anon_pod1 pod = _anon_pod1()
+    cdef cuda_bindings_nvml__anon_pod1 pod = cuda_bindings_nvml__anon_pod1()
     return _numpy.dtype({
         'names': ['b_is_present', 'percentage', 'inc_threshold', 'dec_threshold'],
         'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32],
@@ -6290,25 +6264,25 @@ cdef _get__py_anon_pod1_dtype_offsets():
             (<intptr_t>&(pod.incThreshold)) - (<intptr_t>&pod),
             (<intptr_t>&(pod.decThreshold)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod1),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod1),
     })
 
 _py_anon_pod1_dtype = _get__py_anon_pod1_dtype_offsets()
 
 cdef class _py_anon_pod1:
-    """Empty-initialize an instance of `_anon_pod1`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod1`.
 
 
-    .. seealso:: `_anon_pod1`
+    .. seealso:: `cuda_bindings_nvml__anon_pod1`
     """
     cdef:
-        _anon_pod1 *_ptr
+        cuda_bindings_nvml__anon_pod1 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod1 *>calloc(1, sizeof(_anon_pod1))
+        self._ptr = <cuda_bindings_nvml__anon_pod1 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod1))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod1")
         self._owner = None
@@ -6316,7 +6290,7 @@ cdef class _py_anon_pod1:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod1 *ptr
+        cdef cuda_bindings_nvml__anon_pod1 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -6341,20 +6315,20 @@ cdef class _py_anon_pod1:
         if not isinstance(other, _py_anon_pod1):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod1)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod1)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod1), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod1), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod1 *>malloc(sizeof(_anon_pod1))
+            self._ptr = <cuda_bindings_nvml__anon_pod1 *>malloc(sizeof(cuda_bindings_nvml__anon_pod1))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod1")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod1))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod1))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -6408,7 +6382,7 @@ cdef class _py_anon_pod1:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod1 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod1), _py_anon_pod1)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod1), _py_anon_pod1)
 
     @staticmethod
     def from_data(data):
@@ -6432,14 +6406,14 @@ cdef class _py_anon_pod1:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod1 obj = _py_anon_pod1.__new__(_py_anon_pod1)
         if owner is None:
-            obj._ptr = <_anon_pod1 *>malloc(sizeof(_anon_pod1))
+            obj._ptr = <cuda_bindings_nvml__anon_pod1 *>malloc(sizeof(cuda_bindings_nvml__anon_pod1))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod1")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod1))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod1))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod1 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod1 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -6792,21 +6766,17 @@ vgpu_process_utilization_info_v1_dtype = _get_vgpu_process_utilization_info_v1_d
 
 cdef class VgpuProcessUtilizationInfo_v1:
     """Empty-initialize an array of `nvmlVgpuProcessUtilizationInfo_v1_t`.
-
     The resulting object is of length `size` and of dtype `vgpu_process_utilization_info_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlVgpuProcessUtilizationInfo_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=vgpu_process_utilization_info_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -7020,7 +6990,7 @@ cdef class VgpuProcessUtilizationInfo_v1:
 
 
 cdef _get__py_anon_pod2_dtype_offsets():
-    cdef _anon_pod2 pod = _anon_pod2()
+    cdef cuda_bindings_nvml__anon_pod2 pod = cuda_bindings_nvml__anon_pod2()
     return _numpy.dtype({
         'names': ['avg_factor', 'timeslice'],
         'formats': [_numpy.uint32, _numpy.uint32],
@@ -7028,25 +6998,25 @@ cdef _get__py_anon_pod2_dtype_offsets():
             (<intptr_t>&(pod.avgFactor)) - (<intptr_t>&pod),
             (<intptr_t>&(pod.timeslice)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod2),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod2),
     })
 
 _py_anon_pod2_dtype = _get__py_anon_pod2_dtype_offsets()
 
 cdef class _py_anon_pod2:
-    """Empty-initialize an instance of `_anon_pod2`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod2`.
 
 
-    .. seealso:: `_anon_pod2`
+    .. seealso:: `cuda_bindings_nvml__anon_pod2`
     """
     cdef:
-        _anon_pod2 *_ptr
+        cuda_bindings_nvml__anon_pod2 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod2 *>calloc(1, sizeof(_anon_pod2))
+        self._ptr = <cuda_bindings_nvml__anon_pod2 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod2))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod2")
         self._owner = None
@@ -7054,7 +7024,7 @@ cdef class _py_anon_pod2:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod2 *ptr
+        cdef cuda_bindings_nvml__anon_pod2 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -7079,20 +7049,20 @@ cdef class _py_anon_pod2:
         if not isinstance(other, _py_anon_pod2):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod2)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod2)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod2), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod2), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod2 *>malloc(sizeof(_anon_pod2))
+            self._ptr = <cuda_bindings_nvml__anon_pod2 *>malloc(sizeof(cuda_bindings_nvml__anon_pod2))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod2")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod2))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod2))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -7124,7 +7094,7 @@ cdef class _py_anon_pod2:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod2 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod2), _py_anon_pod2)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod2), _py_anon_pod2)
 
     @staticmethod
     def from_data(data):
@@ -7148,14 +7118,14 @@ cdef class _py_anon_pod2:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod2 obj = _py_anon_pod2.__new__(_py_anon_pod2)
         if owner is None:
-            obj._ptr = <_anon_pod2 *>malloc(sizeof(_anon_pod2))
+            obj._ptr = <cuda_bindings_nvml__anon_pod2 *>malloc(sizeof(cuda_bindings_nvml__anon_pod2))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod2")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod2))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod2))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod2 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod2 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -7163,32 +7133,32 @@ cdef class _py_anon_pod2:
 
 
 cdef _get__py_anon_pod3_dtype_offsets():
-    cdef _anon_pod3 pod = _anon_pod3()
+    cdef cuda_bindings_nvml__anon_pod3 pod = cuda_bindings_nvml__anon_pod3()
     return _numpy.dtype({
         'names': ['timeslice'],
         'formats': [_numpy.uint32],
         'offsets': [
             (<intptr_t>&(pod.timeslice)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod3),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod3),
     })
 
 _py_anon_pod3_dtype = _get__py_anon_pod3_dtype_offsets()
 
 cdef class _py_anon_pod3:
-    """Empty-initialize an instance of `_anon_pod3`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod3`.
 
 
-    .. seealso:: `_anon_pod3`
+    .. seealso:: `cuda_bindings_nvml__anon_pod3`
     """
     cdef:
-        _anon_pod3 *_ptr
+        cuda_bindings_nvml__anon_pod3 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod3 *>calloc(1, sizeof(_anon_pod3))
+        self._ptr = <cuda_bindings_nvml__anon_pod3 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod3))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod3")
         self._owner = None
@@ -7196,7 +7166,7 @@ cdef class _py_anon_pod3:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod3 *ptr
+        cdef cuda_bindings_nvml__anon_pod3 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -7221,20 +7191,20 @@ cdef class _py_anon_pod3:
         if not isinstance(other, _py_anon_pod3):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod3)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod3)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod3), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod3), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod3 *>malloc(sizeof(_anon_pod3))
+            self._ptr = <cuda_bindings_nvml__anon_pod3 *>malloc(sizeof(cuda_bindings_nvml__anon_pod3))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod3")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod3))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod3))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -7255,7 +7225,7 @@ cdef class _py_anon_pod3:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod3 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod3), _py_anon_pod3)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod3), _py_anon_pod3)
 
     @staticmethod
     def from_data(data):
@@ -7279,14 +7249,14 @@ cdef class _py_anon_pod3:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod3 obj = _py_anon_pod3.__new__(_py_anon_pod3)
         if owner is None:
-            obj._ptr = <_anon_pod3 *>malloc(sizeof(_anon_pod3))
+            obj._ptr = <cuda_bindings_nvml__anon_pod3 *>malloc(sizeof(cuda_bindings_nvml__anon_pod3))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod3")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod3))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod3))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod3 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod3 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -7313,21 +7283,17 @@ vgpu_scheduler_log_entry_dtype = _get_vgpu_scheduler_log_entry_dtype_offsets()
 
 cdef class VgpuSchedulerLogEntry:
     """Empty-initialize an array of `nvmlVgpuSchedulerLogEntry_t`.
-
     The resulting object is of length `size` and of dtype `vgpu_scheduler_log_entry_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlVgpuSchedulerLogEntry_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=vgpu_scheduler_log_entry_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -7499,7 +7465,7 @@ cdef class VgpuSchedulerLogEntry:
 
 
 cdef _get__py_anon_pod4_dtype_offsets():
-    cdef _anon_pod4 pod = _anon_pod4()
+    cdef cuda_bindings_nvml__anon_pod4 pod = cuda_bindings_nvml__anon_pod4()
     return _numpy.dtype({
         'names': ['avg_factor', 'frequency'],
         'formats': [_numpy.uint32, _numpy.uint32],
@@ -7507,25 +7473,25 @@ cdef _get__py_anon_pod4_dtype_offsets():
             (<intptr_t>&(pod.avgFactor)) - (<intptr_t>&pod),
             (<intptr_t>&(pod.frequency)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod4),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod4),
     })
 
 _py_anon_pod4_dtype = _get__py_anon_pod4_dtype_offsets()
 
 cdef class _py_anon_pod4:
-    """Empty-initialize an instance of `_anon_pod4`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod4`.
 
 
-    .. seealso:: `_anon_pod4`
+    .. seealso:: `cuda_bindings_nvml__anon_pod4`
     """
     cdef:
-        _anon_pod4 *_ptr
+        cuda_bindings_nvml__anon_pod4 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod4 *>calloc(1, sizeof(_anon_pod4))
+        self._ptr = <cuda_bindings_nvml__anon_pod4 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod4))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod4")
         self._owner = None
@@ -7533,7 +7499,7 @@ cdef class _py_anon_pod4:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod4 *ptr
+        cdef cuda_bindings_nvml__anon_pod4 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -7558,20 +7524,20 @@ cdef class _py_anon_pod4:
         if not isinstance(other, _py_anon_pod4):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod4)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod4)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod4), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod4), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod4 *>malloc(sizeof(_anon_pod4))
+            self._ptr = <cuda_bindings_nvml__anon_pod4 *>malloc(sizeof(cuda_bindings_nvml__anon_pod4))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod4")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod4))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod4))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -7603,7 +7569,7 @@ cdef class _py_anon_pod4:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod4 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod4), _py_anon_pod4)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod4), _py_anon_pod4)
 
     @staticmethod
     def from_data(data):
@@ -7627,14 +7593,14 @@ cdef class _py_anon_pod4:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod4 obj = _py_anon_pod4.__new__(_py_anon_pod4)
         if owner is None:
-            obj._ptr = <_anon_pod4 *>malloc(sizeof(_anon_pod4))
+            obj._ptr = <cuda_bindings_nvml__anon_pod4 *>malloc(sizeof(cuda_bindings_nvml__anon_pod4))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod4")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod4))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod4))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod4 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod4 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -7642,32 +7608,32 @@ cdef class _py_anon_pod4:
 
 
 cdef _get__py_anon_pod5_dtype_offsets():
-    cdef _anon_pod5 pod = _anon_pod5()
+    cdef cuda_bindings_nvml__anon_pod5 pod = cuda_bindings_nvml__anon_pod5()
     return _numpy.dtype({
         'names': ['timeslice'],
         'formats': [_numpy.uint32],
         'offsets': [
             (<intptr_t>&(pod.timeslice)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(_anon_pod5),
+        'itemsize': sizeof(cuda_bindings_nvml__anon_pod5),
     })
 
 _py_anon_pod5_dtype = _get__py_anon_pod5_dtype_offsets()
 
 cdef class _py_anon_pod5:
-    """Empty-initialize an instance of `_anon_pod5`.
+    """Empty-initialize an instance of `cuda_bindings_nvml__anon_pod5`.
 
 
-    .. seealso:: `_anon_pod5`
+    .. seealso:: `cuda_bindings_nvml__anon_pod5`
     """
     cdef:
-        _anon_pod5 *_ptr
+        cuda_bindings_nvml__anon_pod5 *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <_anon_pod5 *>calloc(1, sizeof(_anon_pod5))
+        self._ptr = <cuda_bindings_nvml__anon_pod5 *>calloc(1, sizeof(cuda_bindings_nvml__anon_pod5))
         if self._ptr == NULL:
             raise MemoryError("Error allocating _py_anon_pod5")
         self._owner = None
@@ -7675,7 +7641,7 @@ cdef class _py_anon_pod5:
         self._readonly = False
 
     def __dealloc__(self):
-        cdef _anon_pod5 *ptr
+        cdef cuda_bindings_nvml__anon_pod5 *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
@@ -7700,20 +7666,20 @@ cdef class _py_anon_pod5:
         if not isinstance(other, _py_anon_pod5):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(_anon_pod5)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(cuda_bindings_nvml__anon_pod5)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(_anon_pod5), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(cuda_bindings_nvml__anon_pod5), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <_anon_pod5 *>malloc(sizeof(_anon_pod5))
+            self._ptr = <cuda_bindings_nvml__anon_pod5 *>malloc(sizeof(cuda_bindings_nvml__anon_pod5))
             if self._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod5")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(_anon_pod5))
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(cuda_bindings_nvml__anon_pod5))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -7734,7 +7700,7 @@ cdef class _py_anon_pod5:
     @staticmethod
     def from_buffer(buffer):
         """Create an _py_anon_pod5 instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(_anon_pod5), _py_anon_pod5)
+        return __from_buffer(buffer, sizeof(cuda_bindings_nvml__anon_pod5), _py_anon_pod5)
 
     @staticmethod
     def from_data(data):
@@ -7758,14 +7724,14 @@ cdef class _py_anon_pod5:
             raise ValueError("ptr must not be null (0)")
         cdef _py_anon_pod5 obj = _py_anon_pod5.__new__(_py_anon_pod5)
         if owner is None:
-            obj._ptr = <_anon_pod5 *>malloc(sizeof(_anon_pod5))
+            obj._ptr = <cuda_bindings_nvml__anon_pod5 *>malloc(sizeof(cuda_bindings_nvml__anon_pod5))
             if obj._ptr == NULL:
                 raise MemoryError("Error allocating _py_anon_pod5")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(_anon_pod5))
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(cuda_bindings_nvml__anon_pod5))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <_anon_pod5 *>ptr
+            obj._ptr = <cuda_bindings_nvml__anon_pod5 *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -8425,6 +8391,7 @@ cdef class VgpuTypeIdInfo_v1:
         object _owner
         bint _owned
         bint _readonly
+        dict _refs
 
     def __init__(self):
         self._ptr = <nvmlVgpuTypeIdInfo_v1_t *>calloc(1, sizeof(nvmlVgpuTypeIdInfo_v1_t))
@@ -8433,6 +8400,7 @@ cdef class VgpuTypeIdInfo_v1:
         self._owner = None
         self._owned = True
         self._readonly = False
+        self._refs = {}
 
     def __dealloc__(self):
         cdef nvmlVgpuTypeIdInfo_v1_t *ptr
@@ -8548,6 +8516,7 @@ cdef class VgpuTypeIdInfo_v1:
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
+        obj._refs = {}
         return obj
 
 
@@ -8577,6 +8546,7 @@ cdef class ActiveVgpuInstanceInfo_v1:
         object _owner
         bint _owned
         bint _readonly
+        dict _refs
 
     def __init__(self):
         self._ptr = <nvmlActiveVgpuInstanceInfo_v1_t *>calloc(1, sizeof(nvmlActiveVgpuInstanceInfo_v1_t))
@@ -8585,6 +8555,7 @@ cdef class ActiveVgpuInstanceInfo_v1:
         self._owner = None
         self._owned = True
         self._readonly = False
+        self._refs = {}
 
     def __dealloc__(self):
         cdef nvmlActiveVgpuInstanceInfo_v1_t *ptr
@@ -8700,6 +8671,7 @@ cdef class ActiveVgpuInstanceInfo_v1:
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
+        obj._refs = {}
         return obj
 
 
@@ -8898,21 +8870,17 @@ hwbc_entry_dtype = _get_hwbc_entry_dtype_offsets()
 
 cdef class HwbcEntry:
     """Empty-initialize an array of `nvmlHwbcEntry_t`.
-
     The resulting object is of length `size` and of dtype `hwbc_entry_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlHwbcEntry_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=hwbc_entry_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -9554,21 +9522,17 @@ unit_fan_info_dtype = _get_unit_fan_info_dtype_offsets()
 
 cdef class UnitFanInfo:
     """Empty-initialize an array of `nvmlUnitFanInfo_t`.
-
     The resulting object is of length `size` and of dtype `unit_fan_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlUnitFanInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=unit_fan_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -9890,21 +9854,17 @@ system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets()
 
 cdef class SystemEventData_v1:
     """Empty-initialize an array of `nvmlSystemEventData_v1_t`.
-
     The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlSystemEventData_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=system_event_data_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -10245,21 +10205,17 @@ encoder_session_info_dtype = _get_encoder_session_info_dtype_offsets()
 
 cdef class EncoderSessionInfo:
     """Empty-initialize an array of `nvmlEncoderSessionInfo_t`.
-
     The resulting object is of length `size` and of dtype `encoder_session_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlEncoderSessionInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=encoder_session_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -10633,21 +10589,17 @@ fbc_session_info_dtype = _get_fbc_session_info_dtype_offsets()
 
 cdef class FBCSessionInfo:
     """Empty-initialize an array of `nvmlFBCSessionInfo_t`.
-
     The resulting object is of length `size` and of dtype `fbc_session_info_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlFBCSessionInfo_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=fbc_session_info_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -13072,21 +13024,17 @@ gpu_instance_placement_dtype = _get_gpu_instance_placement_dtype_offsets()
 
 cdef class GpuInstancePlacement:
     """Empty-initialize an array of `nvmlGpuInstancePlacement_t`.
-
     The resulting object is of length `size` and of dtype `gpu_instance_placement_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlGpuInstancePlacement_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=gpu_instance_placement_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -13508,21 +13456,17 @@ compute_instance_placement_dtype = _get_compute_instance_placement_dtype_offsets
 
 cdef class ComputeInstancePlacement:
     """Empty-initialize an array of `nvmlComputeInstancePlacement_t`.
-
     The resulting object is of length `size` and of dtype `compute_instance_placement_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlComputeInstancePlacement_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=compute_instance_placement_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -14645,21 +14589,17 @@ ecc_sram_unique_uncorrected_error_entry_v1_dtype = _get_ecc_sram_unique_uncorrec
 
 cdef class EccSramUniqueUncorrectedErrorEntry_v1:
     """Empty-initialize an array of `nvmlEccSramUniqueUncorrectedErrorEntry_v1_t`.
-
     The resulting object is of length `size` and of dtype `ecc_sram_unique_uncorrected_error_entry_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlEccSramUniqueUncorrectedErrorEntry_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=ecc_sram_unique_uncorrected_error_entry_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -15491,49 +15431,51 @@ cdef class PRMCounterInput_v1:
         return obj
 
 
-cdef _get_excluded_device_info_dtype_offsets():
-    cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t()
+cdef _get_vgpu_scheduler_state_info_v2_dtype_offsets():
+    cdef nvmlVgpuSchedulerStateInfo_v2_t pod = nvmlVgpuSchedulerStateInfo_v2_t()
     return _numpy.dtype({
-        'names': ['pci_info', 'uuid'],
-        'formats': [pci_info_dtype, (_numpy.int8, 80)],
+        'names': ['engine_id', 'scheduler_policy', 'avg_factor', 'timeslice'],
+        'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32],
         'offsets': [
-            (<intptr_t>&(pod.pciInfo)) - (<intptr_t>&pod),
-            (<intptr_t>&(pod.uuid)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.engineId)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.schedulerPolicy)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.avgFactor)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.timeslice)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(nvmlExcludedDeviceInfo_t),
+        'itemsize': sizeof(nvmlVgpuSchedulerStateInfo_v2_t),
     })
 
-excluded_device_info_dtype = _get_excluded_device_info_dtype_offsets()
+vgpu_scheduler_state_info_v2_dtype = _get_vgpu_scheduler_state_info_v2_dtype_offsets()
 
-cdef class ExcludedDeviceInfo:
-    """Empty-initialize an instance of `nvmlExcludedDeviceInfo_t`.
+cdef class VgpuSchedulerStateInfo_v2:
+    """Empty-initialize an instance of `nvmlVgpuSchedulerStateInfo_v2_t`.
 
 
-    .. seealso:: `nvmlExcludedDeviceInfo_t`
+    .. seealso:: `nvmlVgpuSchedulerStateInfo_v2_t`
     """
     cdef:
-        nvmlExcludedDeviceInfo_t *_ptr
+        nvmlVgpuSchedulerStateInfo_v2_t *_ptr
         object _owner
         bint _owned
         bint _readonly
 
     def __init__(self):
-        self._ptr = <nvmlExcludedDeviceInfo_t *>calloc(1, sizeof(nvmlExcludedDeviceInfo_t))
+        self._ptr = <nvmlVgpuSchedulerStateInfo_v2_t *>calloc(1, sizeof(nvmlVgpuSchedulerStateInfo_v2_t))
         if self._ptr == NULL:
-            raise MemoryError("Error allocating ExcludedDeviceInfo")
+            raise MemoryError("Error allocating VgpuSchedulerStateInfo_v2")
         self._owner = None
         self._owned = True
         self._readonly = False
 
     def __dealloc__(self):
-        cdef nvmlExcludedDeviceInfo_t *ptr
+        cdef nvmlVgpuSchedulerStateInfo_v2_t *ptr
         if self._owned and self._ptr != NULL:
             ptr = self._ptr
             self._ptr = NULL
             free(ptr)
 
     def __repr__(self):
-        return f"<{__name__}.ExcludedDeviceInfo object at {hex(id(self))}>"
+        return f"<{__name__}.VgpuSchedulerStateInfo_v2 object at {hex(id(self))}>"
 
     @property
     def ptr(self):
@@ -15547,24 +15489,24 @@ cdef class ExcludedDeviceInfo:
         return <intptr_t>(self._ptr)
 
     def __eq__(self, other):
-        cdef ExcludedDeviceInfo other_
-        if not isinstance(other, ExcludedDeviceInfo):
+        cdef VgpuSchedulerStateInfo_v2 other_
+        if not isinstance(other, VgpuSchedulerStateInfo_v2):
             return False
         other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlExcludedDeviceInfo_t)) == 0)
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerStateInfo_v2_t)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlExcludedDeviceInfo_t), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerStateInfo_v2_t), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <nvmlExcludedDeviceInfo_t *>malloc(sizeof(nvmlExcludedDeviceInfo_t))
+            self._ptr = <nvmlVgpuSchedulerStateInfo_v2_t *>malloc(sizeof(nvmlVgpuSchedulerStateInfo_v2_t))
             if self._ptr == NULL:
-                raise MemoryError("Error allocating ExcludedDeviceInfo")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlExcludedDeviceInfo_t))
+                raise MemoryError("Error allocating VgpuSchedulerStateInfo_v2")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlVgpuSchedulerStateInfo_v2_t))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -15572,49 +15514,66 @@ cdef class ExcludedDeviceInfo:
             setattr(self, key, val)
 
     @property
-    def pci_info(self):
-        """PciInfo: """
-        return PciInfo.from_ptr(<intptr_t>&(self._ptr[0].pciInfo), self._readonly, self)
+    def engine_id(self):
+        """int: IN: Engine whose software scheduler state info is fetched. One of NVML_VGPU_SCHEDULER_ENGINE_TYPE_*."""
+        return self._ptr[0].engineId
 
-    @pci_info.setter
-    def pci_info(self, val):
+    @engine_id.setter
+    def engine_id(self, val):
         if self._readonly:
-            raise ValueError("This ExcludedDeviceInfo instance is read-only")
-        cdef PciInfo val_ = val
-        memcpy(<void *>&(self._ptr[0].pciInfo), <void *>(val_._get_ptr()), sizeof(nvmlPciInfo_t) * 1)
+            raise ValueError("This VgpuSchedulerStateInfo_v2 instance is read-only")
+        self._ptr[0].engineId = val
 
     @property
-    def uuid(self):
-        """~_numpy.int8: (array of length 80)."""
-        return cpython.PyUnicode_FromString(self._ptr[0].uuid)
+    def scheduler_policy(self):
+        """int: OUT: Scheduler policy."""
+        return self._ptr[0].schedulerPolicy
 
-    @uuid.setter
-    def uuid(self, val):
+    @scheduler_policy.setter
+    def scheduler_policy(self, val):
         if self._readonly:
-            raise ValueError("This ExcludedDeviceInfo instance is read-only")
-        cdef bytes buf = val.encode()
-        if len(buf) >= 80:
-            raise ValueError("String too long for field uuid, max length is 79")
-        cdef char *ptr = buf
-        memcpy(<void *>(self._ptr[0].uuid), <void *>ptr, 80)
+            raise ValueError("This VgpuSchedulerStateInfo_v2 instance is read-only")
+        self._ptr[0].schedulerPolicy = val
+
+    @property
+    def avg_factor(self):
+        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode."""
+        return self._ptr[0].avgFactor
+
+    @avg_factor.setter
+    def avg_factor(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerStateInfo_v2 instance is read-only")
+        self._ptr[0].avgFactor = val
+
+    @property
+    def timeslice(self):
+        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise."""
+        return self._ptr[0].timeslice
+
+    @timeslice.setter
+    def timeslice(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerStateInfo_v2 instance is read-only")
+        self._ptr[0].timeslice = val
 
     @staticmethod
     def from_buffer(buffer):
-        """Create an ExcludedDeviceInfo instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(nvmlExcludedDeviceInfo_t), ExcludedDeviceInfo)
+        """Create a VgpuSchedulerStateInfo_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerStateInfo_v2_t), VgpuSchedulerStateInfo_v2)
 
     @staticmethod
     def from_data(data):
-        """Create an ExcludedDeviceInfo instance wrapping the given NumPy array.
+        """Create a VgpuSchedulerStateInfo_v2 instance wrapping the given NumPy array.
 
         Args:
-            data (_numpy.ndarray): a single-element array of dtype `excluded_device_info_dtype` holding the data.
+            data (_numpy.ndarray): a single-element array of dtype `vgpu_scheduler_state_info_v2_dtype` holding the data.
         """
-        return __from_data(data, "excluded_device_info_dtype", excluded_device_info_dtype, ExcludedDeviceInfo)
+        return __from_data(data, "vgpu_scheduler_state_info_v2_dtype", vgpu_scheduler_state_info_v2_dtype, VgpuSchedulerStateInfo_v2)
 
     @staticmethod
     def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
-        """Create an ExcludedDeviceInfo instance wrapping the given pointer.
+        """Create a VgpuSchedulerStateInfo_v2 instance wrapping the given pointer.
 
         Args:
             ptr (intptr_t): pointer address as Python :class:`int` to the data.
@@ -15623,118 +15582,646 @@ cdef class ExcludedDeviceInfo:
         """
         if ptr == 0:
             raise ValueError("ptr must not be null (0)")
-        cdef ExcludedDeviceInfo obj = ExcludedDeviceInfo.__new__(ExcludedDeviceInfo)
+        cdef VgpuSchedulerStateInfo_v2 obj = VgpuSchedulerStateInfo_v2.__new__(VgpuSchedulerStateInfo_v2)
         if owner is None:
-            obj._ptr = <nvmlExcludedDeviceInfo_t *>malloc(sizeof(nvmlExcludedDeviceInfo_t))
+            obj._ptr = <nvmlVgpuSchedulerStateInfo_v2_t *>malloc(sizeof(nvmlVgpuSchedulerStateInfo_v2_t))
             if obj._ptr == NULL:
-                raise MemoryError("Error allocating ExcludedDeviceInfo")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlExcludedDeviceInfo_t))
+                raise MemoryError("Error allocating VgpuSchedulerStateInfo_v2")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlVgpuSchedulerStateInfo_v2_t))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <nvmlExcludedDeviceInfo_t *>ptr
+            obj._ptr = <nvmlVgpuSchedulerStateInfo_v2_t *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
         return obj
 
 
-cdef _get_process_detail_list_v1_dtype_offsets():
-    cdef nvmlProcessDetailList_v1_t pod = nvmlProcessDetailList_v1_t()
+cdef _get_vgpu_scheduler_log_entry_v2_dtype_offsets():
+    cdef nvmlVgpuSchedulerLogEntry_v2_t pod = nvmlVgpuSchedulerLogEntry_v2_t()
     return _numpy.dtype({
-        'names': ['version', 'mode', 'num_proc_array_entries', 'proc_array'],
-        'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.intp],
+        'names': ['timestamp', 'time_run_total', 'time_run', 'sw_runlist_id', 'target_time_slice', 'cumulative_preemption_time', 'weight'],
+        'formats': [_numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint32],
         'offsets': [
-            (<intptr_t>&(pod.version)) - (<intptr_t>&pod),
-            (<intptr_t>&(pod.mode)) - (<intptr_t>&pod),
-            (<intptr_t>&(pod.numProcArrayEntries)) - (<intptr_t>&pod),
-            (<intptr_t>&(pod.procArray)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.timestamp)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.timeRunTotal)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.timeRun)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.swRunlistId)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.targetTimeSlice)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.cumulativePreemptionTime)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.weight)) - (<intptr_t>&pod),
         ],
-        'itemsize': sizeof(nvmlProcessDetailList_v1_t),
+        'itemsize': sizeof(nvmlVgpuSchedulerLogEntry_v2_t),
     })
 
-process_detail_list_v1_dtype = _get_process_detail_list_v1_dtype_offsets()
+vgpu_scheduler_log_entry_v2_dtype = _get_vgpu_scheduler_log_entry_v2_dtype_offsets()
 
-cdef class ProcessDetailList_v1:
-    """Empty-initialize an instance of `nvmlProcessDetailList_v1_t`.
+cdef class VgpuSchedulerLogEntry_v2:
+    """Empty-initialize an array of `nvmlVgpuSchedulerLogEntry_v2_t`.
+    The resulting object is of length `size` and of dtype `vgpu_scheduler_log_entry_v2_dtype`.
+    If default-constructed, the instance represents a single struct.
 
+    Args:
+        size (int): number of structs, default=1.
 
-    .. seealso:: `nvmlProcessDetailList_v1_t`
+    .. seealso:: `nvmlVgpuSchedulerLogEntry_v2_t`
     """
     cdef:
-        nvmlProcessDetailList_v1_t *_ptr
-        object _owner
-        bint _owned
-        bint _readonly
-        dict _refs
-
-    def __init__(self):
-        self._ptr = <nvmlProcessDetailList_v1_t *>calloc(1, sizeof(nvmlProcessDetailList_v1_t))
-        if self._ptr == NULL:
-            raise MemoryError("Error allocating ProcessDetailList_v1")
-        self._owner = None
-        self._owned = True
-        self._readonly = False
-        self._refs = {}
+        readonly object _data
 
-    def __dealloc__(self):
-        cdef nvmlProcessDetailList_v1_t *ptr
-        if self._owned and self._ptr != NULL:
-            ptr = self._ptr
-            self._ptr = NULL
-            free(ptr)
+    def __init__(self, size=1):
+        arr = _numpy.empty(size, dtype=vgpu_scheduler_log_entry_v2_dtype)
+        self._data = arr.view(_numpy.recarray)
+        assert self._data.itemsize == sizeof(nvmlVgpuSchedulerLogEntry_v2_t), \
+            f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlVgpuSchedulerLogEntry_v2_t) }"
 
     def __repr__(self):
-        return f"<{__name__}.ProcessDetailList_v1 object at {hex(id(self))}>"
+        if self._data.size > 1:
+            return f"<{__name__}.VgpuSchedulerLogEntry_v2_Array_{self._data.size} object at {hex(id(self))}>"
+        else:
+            return f"<{__name__}.VgpuSchedulerLogEntry_v2 object at {hex(id(self))}>"
 
     @property
     def ptr(self):
         """Get the pointer address to the data as Python :class:`int`."""
-        return <intptr_t>(self._ptr)
+        return self._data.ctypes.data
 
     cdef intptr_t _get_ptr(self):
-        return <intptr_t>(self._ptr)
+        return self._data.ctypes.data
 
     def __int__(self):
-        return <intptr_t>(self._ptr)
+        if self._data.size > 1:
+            raise TypeError("int() argument must be a bytes-like object of size 1. "
+                            "To get the pointer address of an array, use .ptr")
+        return self._data.ctypes.data
+
+    def __len__(self):
+        return self._data.size
 
     def __eq__(self, other):
-        cdef ProcessDetailList_v1 other_
-        if not isinstance(other, ProcessDetailList_v1):
+        cdef object self_data = self._data
+        if (not isinstance(other, VgpuSchedulerLogEntry_v2)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype:
             return False
-        other_ = other
-        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlProcessDetailList_v1_t)) == 0)
+        return bool((self_data == other._data).all())
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlProcessDetailList_v1_t), self._readonly)
+        cpython.PyObject_GetBuffer(self._data, buffer, flags)
 
     def __releasebuffer__(self, Py_buffer *buffer):
-        pass
-
-    def __setitem__(self, key, val):
-        if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <nvmlProcessDetailList_v1_t *>malloc(sizeof(nvmlProcessDetailList_v1_t))
-            if self._ptr == NULL:
-                raise MemoryError("Error allocating ProcessDetailList_v1")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlProcessDetailList_v1_t))
-            self._owner = None
-            self._owned = True
-            self._readonly = not val.flags.writeable
-        else:
-            setattr(self, key, val)
+        cpython.PyBuffer_Release(buffer)
 
     @property
-    def version(self):
-        """int: Struct version, MUST be nvmlProcessDetailList_v1."""
-        return self._ptr[0].version
-
-    @version.setter
-    def version(self, val):
-        if self._readonly:
-            raise ValueError("This ProcessDetailList_v1 instance is read-only")
-        self._ptr[0].version = val
+    def timestamp(self):
+        """Union[~_numpy.uint64, int]: OUT: Timestamp in ns when this software runlist was preempted."""
+        if self._data.size == 1:
+            return int(self._data.timestamp[0])
+        return self._data.timestamp
 
-    @property
+    @timestamp.setter
+    def timestamp(self, val):
+        self._data.timestamp = val
+
+    @property
+    def time_run_total(self):
+        """Union[~_numpy.uint64, int]: OUT: Total time in ns this software runlist has run."""
+        if self._data.size == 1:
+            return int(self._data.time_run_total[0])
+        return self._data.time_run_total
+
+    @time_run_total.setter
+    def time_run_total(self, val):
+        self._data.time_run_total = val
+
+    @property
+    def time_run(self):
+        """Union[~_numpy.uint64, int]: OUT: Time in ns this software runlist ran before preemption."""
+        if self._data.size == 1:
+            return int(self._data.time_run[0])
+        return self._data.time_run
+
+    @time_run.setter
+    def time_run(self, val):
+        self._data.time_run = val
+
+    @property
+    def sw_runlist_id(self):
+        """Union[~_numpy.uint32, int]: OUT: Software runlist Id."""
+        if self._data.size == 1:
+            return int(self._data.sw_runlist_id[0])
+        return self._data.sw_runlist_id
+
+    @sw_runlist_id.setter
+    def sw_runlist_id(self, val):
+        self._data.sw_runlist_id = val
+
+    @property
+    def target_time_slice(self):
+        """Union[~_numpy.uint64, int]: OUT: The actual timeslice after deduction."""
+        if self._data.size == 1:
+            return int(self._data.target_time_slice[0])
+        return self._data.target_time_slice
+
+    @target_time_slice.setter
+    def target_time_slice(self, val):
+        self._data.target_time_slice = val
+
+    @property
+    def cumulative_preemption_time(self):
+        """Union[~_numpy.uint64, int]: OUT: Preemption time in ns for this SW runlist."""
+        if self._data.size == 1:
+            return int(self._data.cumulative_preemption_time[0])
+        return self._data.cumulative_preemption_time
+
+    @cumulative_preemption_time.setter
+    def cumulative_preemption_time(self, val):
+        self._data.cumulative_preemption_time = val
+
+    @property
+    def weight(self):
+        """Union[~_numpy.uint32, int]: OUT: Current weight of this SW runlist."""
+        if self._data.size == 1:
+            return int(self._data.weight[0])
+        return self._data.weight
+
+    @weight.setter
+    def weight(self, val):
+        self._data.weight = val
+
+    def __getitem__(self, key):
+        cdef ssize_t key_
+        cdef ssize_t size
+        if isinstance(key, int):
+            key_ = key
+            size = self._data.size
+            if key_ >= size or key_ <= -(size+1):
+                raise IndexError("index is out of bounds")
+            if key_ < 0:
+                key_ += size
+            return VgpuSchedulerLogEntry_v2.from_data(self._data[key_:key_+1])
+        out = self._data[key]
+        if isinstance(out, _numpy.recarray) and out.dtype == vgpu_scheduler_log_entry_v2_dtype:
+            return VgpuSchedulerLogEntry_v2.from_data(out)
+        return out
+
+    def __setitem__(self, key, val):
+        self._data[key] = val
+
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerLogEntry_v2 instance with the memory from the given buffer."""
+        return VgpuSchedulerLogEntry_v2.from_data(_numpy.frombuffer(buffer, dtype=vgpu_scheduler_log_entry_v2_dtype))
+
+    @staticmethod
+    def from_data(data):
+        """Create a VgpuSchedulerLogEntry_v2 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a 1D array of dtype `vgpu_scheduler_log_entry_v2_dtype` holding the data.
+        """
+        cdef VgpuSchedulerLogEntry_v2 obj = VgpuSchedulerLogEntry_v2.__new__(VgpuSchedulerLogEntry_v2)
+        if not isinstance(data, _numpy.ndarray):
+            raise TypeError("data argument must be a NumPy ndarray")
+        if data.ndim != 1:
+            raise ValueError("data array must be 1D")
+        if data.dtype != vgpu_scheduler_log_entry_v2_dtype:
+            raise ValueError("data array must be of dtype vgpu_scheduler_log_entry_v2_dtype")
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False):
+        """Create a VgpuSchedulerLogEntry_v2 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data; the memory is viewed in place, not copied.
+            size (int): number of structs, default=1.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef VgpuSchedulerLogEntry_v2 obj = VgpuSchedulerLogEntry_v2.__new__(VgpuSchedulerLogEntry_v2)
+        cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE
+        cdef object buf = cpython.memoryview.PyMemoryView_FromMemory(
+            <char*>ptr, sizeof(nvmlVgpuSchedulerLogEntry_v2_t) * size, flag)
+        data = _numpy.ndarray(size, buffer=buf, dtype=vgpu_scheduler_log_entry_v2_dtype)
+        obj._data = data.view(_numpy.recarray)
+
+        return obj
+
+
+cdef _get_vgpu_scheduler_state_v2_dtype_offsets():
+    cdef nvmlVgpuSchedulerState_v2_t pod = nvmlVgpuSchedulerState_v2_t()
+    return _numpy.dtype({
+        'names': ['engine_id', 'scheduler_policy', 'avg_factor', 'frequency'],
+        'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32],
+        'offsets': [
+            (<intptr_t>&(pod.engineId)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.schedulerPolicy)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.avgFactor)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.frequency)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlVgpuSchedulerState_v2_t),
+    })
+
+vgpu_scheduler_state_v2_dtype = _get_vgpu_scheduler_state_v2_dtype_offsets()
+
+cdef class VgpuSchedulerState_v2:
+    """Empty-initialize an instance of `nvmlVgpuSchedulerState_v2_t`.
+
+
+    .. seealso:: `nvmlVgpuSchedulerState_v2_t`
+    """
+    cdef:
+        nvmlVgpuSchedulerState_v2_t *_ptr
+        object _owner
+        bint _owned
+        bint _readonly
+
+    def __init__(self):
+        self._ptr = <nvmlVgpuSchedulerState_v2_t *>calloc(1, sizeof(nvmlVgpuSchedulerState_v2_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating VgpuSchedulerState_v2")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+
+    def __dealloc__(self):
+        cdef nvmlVgpuSchedulerState_v2_t *ptr
+        if self._owned and self._ptr != NULL:
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.VgpuSchedulerState_v2 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef VgpuSchedulerState_v2 other_
+        if not isinstance(other, VgpuSchedulerState_v2):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerState_v2_t)) == 0)
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerState_v2_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+    def __setitem__(self, key, val):
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            self._ptr = <nvmlVgpuSchedulerState_v2_t *>malloc(sizeof(nvmlVgpuSchedulerState_v2_t))
+            if self._ptr == NULL:
+                raise MemoryError("Error allocating VgpuSchedulerState_v2")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlVgpuSchedulerState_v2_t))
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def engine_id(self):
+        """int: IN: One of NVML_VGPU_SCHEDULER_ENGINE_TYPE_*."""
+        return self._ptr[0].engineId
+
+    @engine_id.setter
+    def engine_id(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerState_v2 instance is read-only")
+        self._ptr[0].engineId = val
+
+    @property
+    def scheduler_policy(self):
+        """int: IN: Scheduler policy."""
+        return self._ptr[0].schedulerPolicy
+
+    @scheduler_policy.setter
+    def scheduler_policy(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerState_v2 instance is read-only")
+        self._ptr[0].schedulerPolicy = val
+
+    @property
+    def avg_factor(self):
+        """int: IN: Average factor in compensating the timeslice for Adaptive Round Robin mode. 0 or unspecified uses default."""
+        return self._ptr[0].avgFactor
+
+    @avg_factor.setter
+    def avg_factor(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerState_v2 instance is read-only")
+        self._ptr[0].avgFactor = val
+
+    @property
+    def frequency(self):
+        """int: IN: Frequency for Adaptive Round Robin mode. 0 or unspecified uses default."""
+        return self._ptr[0].frequency
+
+    @frequency.setter
+    def frequency(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerState_v2 instance is read-only")
+        self._ptr[0].frequency = val
+
+    @staticmethod
+    def from_buffer(buffer):
+        """Create a VgpuSchedulerState_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerState_v2_t), VgpuSchedulerState_v2)
+
+    @staticmethod
+    def from_data(data):
+        """Create a VgpuSchedulerState_v2 instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a single-element array of dtype `vgpu_scheduler_state_v2_dtype` holding the data.
+        """
+        return __from_data(data, "vgpu_scheduler_state_v2_dtype", vgpu_scheduler_state_v2_dtype, VgpuSchedulerState_v2)
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
+        """Create a VgpuSchedulerState_v2 instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+            owner (object): The Python object that owns the pointer. If not provided, the struct is copied into memory owned by the new instance.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef VgpuSchedulerState_v2 obj = VgpuSchedulerState_v2.__new__(VgpuSchedulerState_v2)
+        if owner is None:
+            obj._ptr = <nvmlVgpuSchedulerState_v2_t *>malloc(sizeof(nvmlVgpuSchedulerState_v2_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating VgpuSchedulerState_v2")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlVgpuSchedulerState_v2_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = <nvmlVgpuSchedulerState_v2_t *>ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        return obj
+
+
+cdef _get_excluded_device_info_dtype_offsets():
+    cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t()
+    return _numpy.dtype({
+        'names': ['pci_info', 'uuid'],
+        'formats': [pci_info_dtype, (_numpy.int8, 80)],
+        'offsets': [
+            (<intptr_t>&(pod.pciInfo)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.uuid)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlExcludedDeviceInfo_t),
+    })
+
+excluded_device_info_dtype = _get_excluded_device_info_dtype_offsets()
+
+cdef class ExcludedDeviceInfo:
+    """Empty-initialize an instance of `nvmlExcludedDeviceInfo_t`.
+
+
+    .. seealso:: `nvmlExcludedDeviceInfo_t`
+    """
+    cdef:
+        nvmlExcludedDeviceInfo_t *_ptr
+        object _owner
+        bint _owned
+        bint _readonly
+
+    def __init__(self):
+        self._ptr = <nvmlExcludedDeviceInfo_t *>calloc(1, sizeof(nvmlExcludedDeviceInfo_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating ExcludedDeviceInfo")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+
+    def __dealloc__(self):
+        cdef nvmlExcludedDeviceInfo_t *ptr
+        if self._owned and self._ptr != NULL:
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.ExcludedDeviceInfo object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef ExcludedDeviceInfo other_
+        if not isinstance(other, ExcludedDeviceInfo):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlExcludedDeviceInfo_t)) == 0)
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlExcludedDeviceInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+    def __setitem__(self, key, val):
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            self._ptr = <nvmlExcludedDeviceInfo_t *>malloc(sizeof(nvmlExcludedDeviceInfo_t))
+            if self._ptr == NULL:
+                raise MemoryError("Error allocating ExcludedDeviceInfo")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlExcludedDeviceInfo_t))
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def pci_info(self):
+        """PciInfo: The embedded `pciInfo` member, wrapped with this instance passed as the owner."""
+        return PciInfo.from_ptr(<intptr_t>&(self._ptr[0].pciInfo), self._readonly, self)
+
+    @pci_info.setter
+    def pci_info(self, val):
+        if self._readonly:
+            raise ValueError("This ExcludedDeviceInfo instance is read-only")
+        cdef PciInfo val_ = val
+        memcpy(<void *>&(self._ptr[0].pciInfo), <void *>(val_._get_ptr()), sizeof(nvmlPciInfo_t) * 1)
+
+    @property
+    def uuid(self):
+        """str: The `uuid` field (an 80-byte C character array) decoded up to its first NUL."""
+        return cpython.PyUnicode_FromString(self._ptr[0].uuid)
+
+    @uuid.setter
+    def uuid(self, val):
+        if self._readonly:
+            raise ValueError("This ExcludedDeviceInfo instance is read-only")
+        cdef bytes buf = val.encode()
+        if len(buf) >= 80:  # the 80-byte field must keep one byte for the terminating NUL
+            raise ValueError("String too long for field uuid, max length is 79")
+        buf = buf.ljust(80, b"\0")  # NUL-pad to the full field width so the 80-byte copy below never reads past the end of `buf`
+        memcpy(<void *>(self._ptr[0].uuid), <void *><char *>buf, 80)
+
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an ExcludedDeviceInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlExcludedDeviceInfo_t), ExcludedDeviceInfo)
+
+    @staticmethod
+    def from_data(data):
+        """Create an ExcludedDeviceInfo instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a single-element array of dtype `excluded_device_info_dtype` holding the data.
+        """
+        return __from_data(data, "excluded_device_info_dtype", excluded_device_info_dtype, ExcludedDeviceInfo)
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
+        """Create an ExcludedDeviceInfo instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            owner (object): The Python object that owns the pointer. If not provided, data will be copied.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef ExcludedDeviceInfo obj = ExcludedDeviceInfo.__new__(ExcludedDeviceInfo)
+        if owner is None:
+            obj._ptr = <nvmlExcludedDeviceInfo_t *>malloc(sizeof(nvmlExcludedDeviceInfo_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating ExcludedDeviceInfo")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlExcludedDeviceInfo_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = <nvmlExcludedDeviceInfo_t *>ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        return obj
+
+
+cdef _get_process_detail_list_v1_dtype_offsets():
+    cdef nvmlProcessDetailList_v1_t pod = nvmlProcessDetailList_v1_t()
+    return _numpy.dtype({
+        'names': ['version', 'mode', 'num_proc_array_entries', 'proc_array'],
+        'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.intp],
+        'offsets': [
+            (<intptr_t>&(pod.version)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.mode)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.numProcArrayEntries)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.procArray)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlProcessDetailList_v1_t),
+    })
+
+process_detail_list_v1_dtype = _get_process_detail_list_v1_dtype_offsets()
+
+cdef class ProcessDetailList_v1:
+    """Empty-initialize an instance of `nvmlProcessDetailList_v1_t`.
+
+
+    .. seealso:: `nvmlProcessDetailList_v1_t`
+    """
+    cdef:
+        nvmlProcessDetailList_v1_t *_ptr
+        object _owner
+        bint _owned
+        bint _readonly
+        dict _refs
+
+    def __init__(self):
+        self._ptr = <nvmlProcessDetailList_v1_t *>calloc(1, sizeof(nvmlProcessDetailList_v1_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating ProcessDetailList_v1")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+        self._refs = {}
+
+    def __dealloc__(self):
+        cdef nvmlProcessDetailList_v1_t *ptr
+        if self._owned and self._ptr != NULL:
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.ProcessDetailList_v1 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef ProcessDetailList_v1 other_
+        if not isinstance(other, ProcessDetailList_v1):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlProcessDetailList_v1_t)) == 0)
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlProcessDetailList_v1_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+    def __setitem__(self, key, val):
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            self._ptr = <nvmlProcessDetailList_v1_t *>malloc(sizeof(nvmlProcessDetailList_v1_t))
+            if self._ptr == NULL:
+                raise MemoryError("Error allocating ProcessDetailList_v1")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlProcessDetailList_v1_t))
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def version(self):
+        """int: Struct version, MUST be nvmlProcessDetailList_v1."""
+        return self._ptr[0].version
+
+    @version.setter
+    def version(self, val):
+        if self._readonly:
+            raise ValueError("This ProcessDetailList_v1 instance is read-only")
+        self._ptr[0].version = val
+
+    @property
     def mode(self):
         """int: Process mode(Compute/Graphics/MPSCompute)"""
         return self._ptr[0].mode
@@ -15957,21 +16444,17 @@ sample_dtype = _get_sample_dtype_offsets()
 
 cdef class Sample:
     """Empty-initialize an array of `nvmlSample_t`.
-
     The resulting object is of length `size` and of dtype `sample_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlSample_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=sample_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -16116,21 +16599,17 @@ vgpu_instance_utilization_sample_dtype = _get_vgpu_instance_utilization_sample_d
 
 cdef class VgpuInstanceUtilizationSample:
     """Empty-initialize an array of `nvmlVgpuInstanceUtilizationSample_t`.
-
     The resulting object is of length `size` and of dtype `vgpu_instance_utilization_sample_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlVgpuInstanceUtilizationSample_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=vgpu_instance_utilization_sample_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -16315,21 +16794,17 @@ vgpu_instance_utilization_info_v1_dtype = _get_vgpu_instance_utilization_info_v1
 
 cdef class VgpuInstanceUtilizationInfo_v1:
     """Empty-initialize an array of `nvmlVgpuInstanceUtilizationInfo_v1_t`.
-
     The resulting object is of length `size` and of dtype `vgpu_instance_utilization_info_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlVgpuInstanceUtilizationInfo_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=vgpu_instance_utilization_info_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -16531,21 +17006,17 @@ field_value_dtype = _get_field_value_dtype_offsets()
 
 cdef class FieldValue:
     """Empty-initialize an array of `nvmlFieldValue_t`.
-
     The resulting object is of length `size` and of dtype `field_value_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlFieldValue_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=field_value_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -16973,7 +17444,7 @@ cdef class GpuThermalSettings:
         cdef _py_anon_pod0 val_ = val
         if len(val) != 3:
             raise ValueError(f"Expected length { 3 } for field sensor, got {len(val)}")
-        memcpy(<void *>&(self._ptr[0].sensor), <void *>(val_._get_ptr()), sizeof(_anon_pod0) * 3)
+        memcpy(<void *>&(self._ptr[0].sensor), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod0) * 3)
 
     @property
     def count(self):
@@ -17433,7 +17904,7 @@ cdef class GpuDynamicPstatesInfo:
         cdef _py_anon_pod1 val_ = val
         if len(val) != 8:
             raise ValueError(f"Expected length { 8 } for field utilization, got {len(val)}")
-        memcpy(<void *>&(self._ptr[0].utilization), <void *>(val_._get_ptr()), sizeof(_anon_pod1) * 8)
+        memcpy(<void *>&(self._ptr[0].utilization), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod1) * 8)
 
     @property
     def flags_(self):
@@ -17659,7 +18130,6 @@ vgpu_scheduler_params_dtype = _numpy.dtype((
     }
     ))
 
-
 cdef class VgpuSchedulerParams:
     """Empty-initialize an instance of `nvmlVgpuSchedulerParams_t`.
 
@@ -17736,7 +18206,7 @@ cdef class VgpuSchedulerParams:
         if self._readonly:
             raise ValueError("This VgpuSchedulerParams instance is read-only")
         cdef _py_anon_pod2 val_ = val
-        memcpy(<void *>&(self._ptr[0].vgpuSchedDataWithARR), <void *>(val_._get_ptr()), sizeof(_anon_pod2) * 1)
+        memcpy(<void *>&(self._ptr[0].vgpuSchedDataWithARR), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod2) * 1)
 
     @property
     def vgpu_sched_data(self):
@@ -17748,7 +18218,7 @@ cdef class VgpuSchedulerParams:
         if self._readonly:
             raise ValueError("This VgpuSchedulerParams instance is read-only")
         cdef _py_anon_pod3 val_ = val
-        memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(_anon_pod3) * 1)
+        memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod3) * 1)
 
     @staticmethod
     def from_buffer(buffer):
@@ -17799,7 +18269,6 @@ vgpu_scheduler_set_params_dtype = _numpy.dtype((
     }
     ))
 
-
 cdef class VgpuSchedulerSetParams:
     """Empty-initialize an instance of `nvmlVgpuSchedulerSetParams_t`.
 
@@ -17876,7 +18345,7 @@ cdef class VgpuSchedulerSetParams:
         if self._readonly:
             raise ValueError("This VgpuSchedulerSetParams instance is read-only")
         cdef _py_anon_pod4 val_ = val
-        memcpy(<void *>&(self._ptr[0].vgpuSchedDataWithARR), <void *>(val_._get_ptr()), sizeof(_anon_pod4) * 1)
+        memcpy(<void *>&(self._ptr[0].vgpuSchedDataWithARR), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod4) * 1)
 
     @property
     def vgpu_sched_data(self):
@@ -17888,7 +18357,7 @@ cdef class VgpuSchedulerSetParams:
         if self._readonly:
             raise ValueError("This VgpuSchedulerSetParams instance is read-only")
         cdef _py_anon_pod5 val_ = val
-        memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(_anon_pod5) * 1)
+        memcpy(<void *>&(self._ptr[0].vgpuSchedData), <void *>(val_._get_ptr()), sizeof(cuda_bindings_nvml__anon_pod5) * 1)
 
     @staticmethod
     def from_buffer(buffer):
@@ -18107,21 +18576,17 @@ grid_licensable_feature_dtype = _get_grid_licensable_feature_dtype_offsets()
 
 cdef class GridLicensableFeature:
     """Empty-initialize an array of `nvmlGridLicensableFeature_t`.
-
     The resulting object is of length `size` and of dtype `grid_licensable_feature_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlGridLicensableFeature_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=grid_licensable_feature_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -19208,17 +19673,167 @@ cdef class NvlinkFirmwareInfo:
         return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlNvlinkFirmwareInfo_t)) == 0)
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
-        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkFirmwareInfo_t), self._readonly)
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlNvlinkFirmwareInfo_t), self._readonly)
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+    def __setitem__(self, key, val):
+        if key == 0 and isinstance(val, _numpy.ndarray):
+            self._ptr = <nvmlNvlinkFirmwareInfo_t *>malloc(sizeof(nvmlNvlinkFirmwareInfo_t))
+            if self._ptr == NULL:
+                raise MemoryError("Error allocating NvlinkFirmwareInfo")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlNvlinkFirmwareInfo_t))
+            self._owner = None
+            self._owned = True
+            self._readonly = not val.flags.writeable
+        else:
+            setattr(self, key, val)
+
+    @property
+    def firmware_version(self):
+        """NvlinkFirmwareVersion: OUT - NVLINK firmware version."""
+        return NvlinkFirmwareVersion.from_ptr(<intptr_t>&(self._ptr[0].firmwareVersion), 100, self._readonly)
+
+    @firmware_version.setter
+    def firmware_version(self, val):
+        if self._readonly:
+            raise ValueError("This NvlinkFirmwareInfo instance is read-only")
+        cdef NvlinkFirmwareVersion val_ = val
+        if len(val) != 100:
+            raise ValueError(f"Expected length { 100 } for field firmware_version, got {len(val)}")
+        memcpy(<void *>&(self._ptr[0].firmwareVersion), <void *>(val_._get_ptr()), sizeof(nvmlNvlinkFirmwareVersion_t) * 100)
+
+    @property
+    def num_valid_entries(self):
+        """int: OUT - Number of valid firmware entries."""
+        return self._ptr[0].numValidEntries
+
+    @num_valid_entries.setter
+    def num_valid_entries(self, val):
+        if self._readonly:
+            raise ValueError("This NvlinkFirmwareInfo instance is read-only")
+        self._ptr[0].numValidEntries = val
+
+    @staticmethod
+    def from_buffer(buffer):
+        """Create an NvlinkFirmwareInfo instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlNvlinkFirmwareInfo_t), NvlinkFirmwareInfo)
+
+    @staticmethod
+    def from_data(data):
+        """Create an NvlinkFirmwareInfo instance wrapping the given NumPy array.
+
+        Args:
+            data (_numpy.ndarray): a single-element array of dtype `nvlink_firmware_info_dtype` holding the data.
+        """
+        return __from_data(data, "nvlink_firmware_info_dtype", nvlink_firmware_info_dtype, NvlinkFirmwareInfo)
+
+    @staticmethod
+    def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
+        """Create an NvlinkFirmwareInfo instance wrapping the given pointer.
+
+        Args:
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
+            owner (object): The Python object that owns the pointer. If not provided, data will be copied.
+            readonly (bool): whether the data is read-only (to the user). default is `False`.
+        """
+        if ptr == 0:
+            raise ValueError("ptr must not be null (0)")
+        cdef NvlinkFirmwareInfo obj = NvlinkFirmwareInfo.__new__(NvlinkFirmwareInfo)
+        if owner is None:
+            obj._ptr = <nvmlNvlinkFirmwareInfo_t *>malloc(sizeof(nvmlNvlinkFirmwareInfo_t))
+            if obj._ptr == NULL:
+                raise MemoryError("Error allocating NvlinkFirmwareInfo")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlNvlinkFirmwareInfo_t))
+            obj._owner = None
+            obj._owned = True
+        else:
+            obj._ptr = <nvmlNvlinkFirmwareInfo_t *>ptr
+            obj._owner = owner
+            obj._owned = False
+        obj._readonly = readonly
+        return obj
+
+
+cdef _get_vgpu_scheduler_log_info_v2_dtype_offsets():
+    cdef nvmlVgpuSchedulerLogInfo_v2_t pod = nvmlVgpuSchedulerLogInfo_v2_t()
+    return _numpy.dtype({
+        'names': ['engine_id', 'scheduler_policy', 'avg_factor', 'timeslice', 'entries_count', 'log_entries'],
+        'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (vgpu_scheduler_log_entry_v2_dtype, 200)],
+        'offsets': [
+            (<intptr_t>&(pod.engineId)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.schedulerPolicy)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.avgFactor)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.timeslice)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.entriesCount)) - (<intptr_t>&pod),
+            (<intptr_t>&(pod.logEntries)) - (<intptr_t>&pod),
+        ],
+        'itemsize': sizeof(nvmlVgpuSchedulerLogInfo_v2_t),
+    })
+
+vgpu_scheduler_log_info_v2_dtype = _get_vgpu_scheduler_log_info_v2_dtype_offsets()
+
+cdef class VgpuSchedulerLogInfo_v2:
+    """Empty-initialize an instance of `nvmlVgpuSchedulerLogInfo_v2_t`.
+
+
+    .. seealso:: `nvmlVgpuSchedulerLogInfo_v2_t`
+    """
+    cdef:
+        nvmlVgpuSchedulerLogInfo_v2_t *_ptr
+        object _owner
+        bint _owned
+        bint _readonly
+
+    def __init__(self):
+        self._ptr = <nvmlVgpuSchedulerLogInfo_v2_t *>calloc(1, sizeof(nvmlVgpuSchedulerLogInfo_v2_t))
+        if self._ptr == NULL:
+            raise MemoryError("Error allocating VgpuSchedulerLogInfo_v2")
+        self._owner = None
+        self._owned = True
+        self._readonly = False
+
+    def __dealloc__(self):
+        cdef nvmlVgpuSchedulerLogInfo_v2_t *ptr
+        if self._owned and self._ptr != NULL:
+            ptr = self._ptr
+            self._ptr = NULL
+            free(ptr)
+
+    def __repr__(self):
+        return f"<{__name__}.VgpuSchedulerLogInfo_v2 object at {hex(id(self))}>"
+
+    @property
+    def ptr(self):
+        """Get the pointer address to the data as Python :class:`int`."""
+        return <intptr_t>(self._ptr)
+
+    cdef intptr_t _get_ptr(self):
+        return <intptr_t>(self._ptr)
+
+    def __int__(self):
+        return <intptr_t>(self._ptr)
+
+    def __eq__(self, other):
+        cdef VgpuSchedulerLogInfo_v2 other_
+        if not isinstance(other, VgpuSchedulerLogInfo_v2):
+            return False
+        other_ = other
+        return (memcmp(<void *><intptr_t>(self._ptr), <void *><intptr_t>(other_._ptr), sizeof(nvmlVgpuSchedulerLogInfo_v2_t)) == 0)
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        __getbuffer(self, buffer, <void *>self._ptr, sizeof(nvmlVgpuSchedulerLogInfo_v2_t), self._readonly)
 
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
     def __setitem__(self, key, val):
         if key == 0 and isinstance(val, _numpy.ndarray):
-            self._ptr = <nvmlNvlinkFirmwareInfo_t *>malloc(sizeof(nvmlNvlinkFirmwareInfo_t))
+            self._ptr = <nvmlVgpuSchedulerLogInfo_v2_t *>malloc(sizeof(nvmlVgpuSchedulerLogInfo_v2_t))
             if self._ptr == NULL:
-                raise MemoryError("Error allocating NvlinkFirmwareInfo")
-            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlNvlinkFirmwareInfo_t))
+                raise MemoryError("Error allocating VgpuSchedulerLogInfo_v2")
+            memcpy(<void*>self._ptr, <void*><intptr_t>val.ctypes.data, sizeof(nvmlVgpuSchedulerLogInfo_v2_t))
             self._owner = None
             self._owned = True
             self._readonly = not val.flags.writeable
@@ -19226,47 +19841,91 @@ cdef class NvlinkFirmwareInfo:
             setattr(self, key, val)
 
     @property
-    def firmware_version(self):
-        """NvlinkFirmwareVersion: OUT - NVLINK firmware version."""
-        return NvlinkFirmwareVersion.from_ptr(<intptr_t>&(self._ptr[0].firmwareVersion), 100, self._readonly)
+    def log_entries(self):
+        """VgpuSchedulerLogEntry_v2: OUT: Structure to store the state and logs of a software runlist."""
+        return VgpuSchedulerLogEntry_v2.from_ptr(<intptr_t>&(self._ptr[0].logEntries), 200, self._readonly)
 
-    @firmware_version.setter
-    def firmware_version(self, val):
+    @log_entries.setter
+    def log_entries(self, val):
         if self._readonly:
-            raise ValueError("This NvlinkFirmwareInfo instance is read-only")
-        cdef NvlinkFirmwareVersion val_ = val
-        if len(val) != 100:
-            raise ValueError(f"Expected length { 100 } for field firmware_version, got {len(val)}")
-        memcpy(<void *>&(self._ptr[0].firmwareVersion), <void *>(val_._get_ptr()), sizeof(nvmlNvlinkFirmwareVersion_t) * 100)
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        cdef VgpuSchedulerLogEntry_v2 val_ = val
+        if len(val) != 200:
+            raise ValueError(f"Expected length { 200 } for field log_entries, got {len(val)}")
+        memcpy(<void *>&(self._ptr[0].logEntries), <void *>(val_._get_ptr()), sizeof(nvmlVgpuSchedulerLogEntry_v2_t) * 200)
 
     @property
-    def num_valid_entries(self):
-        """int: OUT - Number of valid firmware entries."""
-        return self._ptr[0].numValidEntries
+    def engine_id(self):
+        """int: IN: Engine whose software runlist log entries are fetched. One of NVML_VGPU_SCHEDULER_ENGINE_TYPE_*."""
+        return self._ptr[0].engineId
 
-    @num_valid_entries.setter
-    def num_valid_entries(self, val):
+    @engine_id.setter
+    def engine_id(self, val):
         if self._readonly:
-            raise ValueError("This NvlinkFirmwareInfo instance is read-only")
-        self._ptr[0].numValidEntries = val
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        self._ptr[0].engineId = val
+
+    @property
+    def scheduler_policy(self):
+        """int: OUT: Scheduler policy."""
+        return self._ptr[0].schedulerPolicy
+
+    @scheduler_policy.setter
+    def scheduler_policy(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        self._ptr[0].schedulerPolicy = val
+
+    @property
+    def avg_factor(self):
+        """int: OUT: Average factor in compensating the timeslice for Adaptive Round Robin mode."""
+        return self._ptr[0].avgFactor
+
+    @avg_factor.setter
+    def avg_factor(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        self._ptr[0].avgFactor = val
+
+    @property
+    def timeslice(self):
+        """int: OUT: The timeslice in ns for each software run list as configured, or the default value otherwise."""
+        return self._ptr[0].timeslice
+
+    @timeslice.setter
+    def timeslice(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        self._ptr[0].timeslice = val
+
+    @property
+    def entries_count(self):
+        """int: OUT: Count of log entries fetched."""
+        return self._ptr[0].entriesCount
+
+    @entries_count.setter
+    def entries_count(self, val):
+        if self._readonly:
+            raise ValueError("This VgpuSchedulerLogInfo_v2 instance is read-only")
+        self._ptr[0].entriesCount = val
 
     @staticmethod
     def from_buffer(buffer):
-        """Create an NvlinkFirmwareInfo instance with the memory from the given buffer."""
-        return __from_buffer(buffer, sizeof(nvmlNvlinkFirmwareInfo_t), NvlinkFirmwareInfo)
+        """Create a VgpuSchedulerLogInfo_v2 instance with the memory from the given buffer."""
+        return __from_buffer(buffer, sizeof(nvmlVgpuSchedulerLogInfo_v2_t), VgpuSchedulerLogInfo_v2)
 
     @staticmethod
     def from_data(data):
-        """Create an NvlinkFirmwareInfo instance wrapping the given NumPy array.
+        """Create a VgpuSchedulerLogInfo_v2 instance wrapping the given NumPy array.
 
         Args:
-            data (_numpy.ndarray): a single-element array of dtype `nvlink_firmware_info_dtype` holding the data.
+            data (_numpy.ndarray): a single-element array of dtype `vgpu_scheduler_log_info_v2_dtype` holding the data.
         """
-        return __from_data(data, "nvlink_firmware_info_dtype", nvlink_firmware_info_dtype, NvlinkFirmwareInfo)
+        return __from_data(data, "vgpu_scheduler_log_info_v2_dtype", vgpu_scheduler_log_info_v2_dtype, VgpuSchedulerLogInfo_v2)
 
     @staticmethod
     def from_ptr(intptr_t ptr, bint readonly=False, object owner=None):
-        """Create an NvlinkFirmwareInfo instance wrapping the given pointer.
+        """Create a VgpuSchedulerLogInfo_v2 instance wrapping the given pointer.
 
         Args:
             ptr (intptr_t): pointer address as Python :class:`int` to the data.
@@ -19275,16 +19934,16 @@ cdef class NvlinkFirmwareInfo:
         """
         if ptr == 0:
             raise ValueError("ptr must not be null (0)")
-        cdef NvlinkFirmwareInfo obj = NvlinkFirmwareInfo.__new__(NvlinkFirmwareInfo)
+        cdef VgpuSchedulerLogInfo_v2 obj = VgpuSchedulerLogInfo_v2.__new__(VgpuSchedulerLogInfo_v2)
         if owner is None:
-            obj._ptr = <nvmlNvlinkFirmwareInfo_t *>malloc(sizeof(nvmlNvlinkFirmwareInfo_t))
+            obj._ptr = <nvmlVgpuSchedulerLogInfo_v2_t *>malloc(sizeof(nvmlVgpuSchedulerLogInfo_v2_t))
             if obj._ptr == NULL:
-                raise MemoryError("Error allocating NvlinkFirmwareInfo")
-            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlNvlinkFirmwareInfo_t))
+                raise MemoryError("Error allocating VgpuSchedulerLogInfo_v2")
+            memcpy(<void*>(obj._ptr), <void*>ptr, sizeof(nvmlVgpuSchedulerLogInfo_v2_t))
             obj._owner = None
             obj._owned = True
         else:
-            obj._ptr = <nvmlNvlinkFirmwareInfo_t *>ptr
+            obj._ptr = <nvmlVgpuSchedulerLogInfo_v2_t *>ptr
             obj._owner = owner
             obj._owned = False
         obj._readonly = readonly
@@ -19484,21 +20143,17 @@ prm_counter_v1_dtype = _get_prm_counter_v1_dtype_offsets()
 
 cdef class PRMCounter_v1:
     """Empty-initialize an array of `nvmlPRMCounter_v1_t`.
-
     The resulting object is of length `size` and of dtype `prm_counter_v1_dtype`.
     If default-constructed, the instance represents a single struct.
 
     Args:
         size (int): number of structs, default=1.
 
-
     .. seealso:: `nvmlPRMCounter_v1_t`
     """
     cdef:
         readonly object _data
 
-
-
     def __init__(self, size=1):
         arr = _numpy.empty(size, dtype=prm_counter_v1_dtype)
         self._data = arr.view(_numpy.recarray)
@@ -19542,7 +20197,7 @@ cdef class PRMCounter_v1:
 
     @property
     def counter_id(self):
-        """Union[~_numpy.uint32, int]: Counter ID, one of nvmlPRMCounterId_t."""
+        """Union[~_numpy.uint32, int]: Counter ID, one of `nvmlPRMCounterId_t`."""
         if self._data.size == 1:
             return int(self._data.counter_id[0])
         return self._data.counter_id
@@ -20856,7 +21511,6 @@ cdef class NvLinkInfo_v2:
         return obj
 
 
-
 cpdef init_v2():
     """Initialize NVML, but don't initialize any GPUs yet.
 
@@ -20868,7 +21522,7 @@ cpdef init_v2():
 
 
 cpdef init_with_flags(unsigned int flags):
-    """nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values modifying the behaviour of nvmlInit(). Other than the "flags" parameter it is completely similar to ``nvmlInit_v2``.
+    """nvmlInitWithFlags is a variant of ``nvmlInit()``, that allows passing a set of boolean values modifying the behaviour of ``nvmlInit()``. Other than the "flags" parameter it is completely similar to ``nvmlInit_v2``.
 
     Args:
         flags (unsigned int): behaviour modifier flags.
@@ -20906,6 +21560,9 @@ cpdef str error_string(int result):
 cpdef str system_get_driver_version():
     """Retrieves the version of the system's graphics driver.
 
+    Returns:
+        char: Reference in which to return the version identifier.
+
     .. seealso:: `nvmlSystemGetDriverVersion`
     """
     cdef unsigned int length = 80
@@ -20919,6 +21576,9 @@ cpdef str system_get_driver_version():
 cpdef str system_get_nvml_version():
     """Retrieves the version of the NVML library.
 
+    Returns:
+        char: Reference in which to return the version identifier.
+
     .. seealso:: `nvmlSystemGetNVMLVersion`
     """
     cdef unsigned int length = 80
@@ -20965,6 +21625,9 @@ cpdef str system_get_process_name(unsigned int pid):
     Args:
         pid (unsigned int): The identifier of the process.
 
+    Returns:
+        char: Reference in which to return the process name.
+
     .. seealso:: `nvmlSystemGetProcessName`
     """
     cdef unsigned int length = 1024
@@ -20978,6 +21641,9 @@ cpdef str system_get_process_name(unsigned int pid):
 cpdef object system_get_hic_version():
     """Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
 
+    Returns:
+        nvmlHwbcEntry_t: Array holding information about hwbc.
+
     .. seealso:: `nvmlSystemGetHicVersion`
     """
     cdef unsigned int[1] hwbc_count = [0]
@@ -21009,11 +21675,11 @@ cpdef unsigned int unit_get_count() except? 0:
     return unit_count
 
 
-cpdef intptr_t unit_get_handle_by_index(unsigned int ind_ex) except? 0:
-    """Acquire the handle for a particular unit, based on its ind_ex.
+cpdef intptr_t unit_get_handle_by_index(unsigned int index) except? 0:
+    """Acquire the handle for a particular unit, based on its index.
 
     Args:
-        ind_ex (unsigned int): The ind_ex of the target unit, >= 0 and < ``unitCount``.
+        index (unsigned int): The index of the target unit, >= 0 and < ``unitCount``.
 
     Returns:
         intptr_t: Reference in which to return the unit handle.
@@ -21022,7 +21688,7 @@ cpdef intptr_t unit_get_handle_by_index(unsigned int ind_ex) except? 0:
     """
     cdef Unit unit
     with nogil:
-        __status__ = nvmlUnitGetHandleByIndex(ind_ex, &unit)
+        __status__ = nvmlUnitGetHandleByIndex(index, &unit)
     check_status(__status__)
     return <intptr_t>unit
 
@@ -21156,11 +21822,11 @@ cpdef object device_get_attributes_v2(intptr_t device):
     return attributes_py
 
 
-cpdef intptr_t device_get_handle_by_index_v2(unsigned int ind_ex) except? 0:
-    """Acquire the handle for a particular device, based on its ind_ex.
+cpdef intptr_t device_get_handle_by_index_v2(unsigned int index) except? 0:
+    """Acquire the handle for a particular device, based on its index.
 
     Args:
-        ind_ex (unsigned int): The ind_ex of the target GPU, >= 0 and < ``accessibleDevices``.
+        index (unsigned int): The index of the target GPU, >= 0 and < ``accessibleDevices``.
 
     Returns:
         intptr_t: Reference in which to return the device handle.
@@ -21169,7 +21835,7 @@ cpdef intptr_t device_get_handle_by_index_v2(unsigned int ind_ex) except? 0:
     """
     cdef Device device
     with nogil:
-        __status__ = nvmlDeviceGetHandleByIndex_v2(ind_ex, &device)
+        __status__ = nvmlDeviceGetHandleByIndex_v2(index, &device)
     check_status(__status__)
     return <intptr_t>device
 
@@ -21246,6 +21912,9 @@ cpdef str device_get_name(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Reference in which to return the product name.
+
     .. seealso:: `nvmlDeviceGetName`
     """
     cdef unsigned int length = 96
@@ -21285,11 +21954,11 @@ cpdef unsigned int device_get_index(intptr_t device) except? 0:
 
     .. seealso:: `nvmlDeviceGetIndex`
     """
-    cdef unsigned int ind_ex
+    cdef unsigned int index
     with nogil:
-        __status__ = nvmlDeviceGetIndex(<Device>device, &ind_ex)
+        __status__ = nvmlDeviceGetIndex(<Device>device, &index)
     check_status(__status__)
-    return ind_ex
+    return index
 
 
 cpdef str device_get_serial(intptr_t device):
@@ -21298,6 +21967,9 @@ cpdef str device_get_serial(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Reference in which to return the board/module serial number.
+
     .. seealso:: `nvmlDeviceGetSerial`
     """
     cdef unsigned int length = 30
@@ -21350,8 +22022,11 @@ cpdef object device_get_memory_affinity(intptr_t device, unsigned int node_set_s
 
     Args:
         device (intptr_t): The identifier of the target device.
-        node_set_size (unsigned int): The size of the nodeSet array that is safe to access.
-        scope (unsigned int): Array reference in which to return a bitmask of NODEs, 64 NODEs per unsigned long on 64-bit machines, 32 on 32-bit machines.
+        node_set_size (unsigned int): The size of the node_set array that is safe to access.
+        scope (unsigned int): Scope that changes the default behavior.
+
+    Returns:
+        unsigned long: Array reference in which to return a bitmask of NODEs, 64 NODEs per unsigned long on 64-bit machines, 32 on 32-bit machines.
 
     .. seealso:: `nvmlDeviceGetMemoryAffinity`
     """
@@ -21370,8 +22045,11 @@ cpdef object device_get_cpu_affinity_within_scope(intptr_t device, unsigned int
 
     Args:
         device (intptr_t): The identifier of the target device.
-        cpu_set_size (unsigned int): The size of the cpuSet array that is safe to access.
-        scope (unsigned int): Array reference in which to return a bitmask of CPUs, 64 CPUs per unsigned long on 64-bit machines, 32 on 32-bit machines.
+        cpu_set_size (unsigned int): The size of the cpu_set array that is safe to access.
+        scope (unsigned int): Scope that changes the default behavior.
+
+    Returns:
+        unsigned long: Array reference in which to return a bitmask of CPUs, 64 CPUs per unsigned long on 64-bit machines, 32 on 32-bit machines.
 
     .. seealso:: `nvmlDeviceGetCpuAffinityWithinScope`
     """
@@ -21390,7 +22068,10 @@ cpdef object device_get_cpu_affinity(intptr_t device, unsigned int cpu_set_size)
 
     Args:
         device (intptr_t): The identifier of the target device.
-        cpu_set_size (unsigned int): The size of the cpuSet array that is safe to access.
+        cpu_set_size (unsigned int): The size of the cpu_set array that is safe to access.
+
+    Returns:
+        unsigned long: Array reference in which to return a bitmask of CPUs, 64 CPUs per unsigned long on 64-bit machines, 32 on 32-bit machines.
 
     .. seealso:: `nvmlDeviceGetCpuAffinity`
     """
@@ -21467,22 +22148,22 @@ cpdef int device_get_topology_common_ancestor(intptr_t device1, intptr_t device2
     return <int>path_info
 
 
-cpdef int device_get_p2p_status(intptr_t device1, intptr_t device2, int p2p_ind_ex) except? -1:
+cpdef int device_get_p2p_status(intptr_t device1, intptr_t device2, int p2p_index) except? -1:
     """Retrieve the status for a given p2p capability index between a given pair of GPU.
 
     Args:
         device1 (intptr_t): The first device.
         device2 (intptr_t): The second device.
-        p2p_ind_ex (GpuP2PCapsIndex): p2p Capability Index being looked for between ``device1`` and ``device2``.
+        p2p_index (GpuP2PCapsIndex): p2p Capability Index being looked for between ``device1`` and ``device2``.
 
     Returns:
-        int: Reference in which to return the status of the ``p2p_ind_ex`` between ``device1`` and ``device2``.
+        int: Reference in which to return the status of the ``p2p_index`` between ``device1`` and ``device2``.
 
     .. seealso:: `nvmlDeviceGetP2PStatus`
     """
     cdef _GpuP2PStatus p2p_status
     with nogil:
-        __status__ = nvmlDeviceGetP2PStatus(<Device>device1, <Device>device2, <_GpuP2PCapsIndex>p2p_ind_ex, &p2p_status)
+        __status__ = nvmlDeviceGetP2PStatus(<Device>device1, <Device>device2, <_GpuP2PCapsIndex>p2p_index, &p2p_status)
     check_status(__status__)
     return <int>p2p_status
 
@@ -21493,6 +22174,9 @@ cpdef str device_get_uuid(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Reference in which to return the GPU UUID.
+
     .. seealso:: `nvmlDeviceGetUUID`
     """
     cdef unsigned int length = 96
@@ -21527,6 +22211,9 @@ cpdef str device_get_board_part_number(intptr_t device):
     Args:
         device (intptr_t): Identifier of the target device.
 
+    Returns:
+        char: Reference to the buffer to return.
+
     .. seealso:: `nvmlDeviceGetBoardPartNumber`
     """
     cdef unsigned int length = 80
@@ -21544,6 +22231,9 @@ cpdef str device_get_inforom_version(intptr_t device, int object):
         device (intptr_t): The identifier of the target device.
         object (InforomObject): The target infoROM object.
 
+    Returns:
+        char: Reference in which to return the infoROM version.
+
     .. seealso:: `nvmlDeviceGetInforomVersion`
     """
     cdef unsigned int length = 16
@@ -21560,6 +22250,9 @@ cpdef str device_get_inforom_image_version(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Reference in which to return the infoROM image version.
+
     .. seealso:: `nvmlDeviceGetInforomImageVersion`
     """
     cdef unsigned int length = 16
@@ -21944,6 +22637,9 @@ cpdef object device_get_supported_memory_clocks(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        unsigned int: Reference in which to return the clock in MHz.
+
     .. seealso:: `nvmlDeviceGetSupportedMemoryClocks`
     """
     cdef unsigned int[1] count = [0]
@@ -21967,6 +22663,9 @@ cpdef object device_get_supported_graphics_clocks(intptr_t device, unsigned int
         device (intptr_t): The identifier of the target device.
         memory_clock_m_hz (unsigned int): Memory clock for which to return possible graphics clocks.
 
+    Returns:
+        unsigned int: Reference in which to return the clocks in MHz.
+
     .. seealso:: `nvmlDeviceGetSupportedGraphicsClocks`
     """
     cdef unsigned int[1] count = [0]
@@ -21984,7 +22683,7 @@ cpdef object device_get_supported_graphics_clocks(intptr_t device, unsigned int
 
 
 cpdef tuple device_get_auto_boosted_clocks_enabled(intptr_t device):
-    """Retrieve the current state of Auto Boosted clocks on a device and store it in ``isEnabled``.
+    """Retrieve the current state of Auto Boosted clocks on a device and store it in ``is_enabled``.
 
     Args:
         device (intptr_t): The identifier of the target device.
@@ -22159,12 +22858,12 @@ cpdef unsigned int device_get_temperature_threshold(intptr_t device, int thresho
     return temp
 
 
-cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex):
+cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_index):
     """Used to execute a list of thermal system instructions.
 
     Args:
         device (intptr_t): The identifier of the target device.
-        sensor_ind_ex (unsigned int): The index of the thermal sensor.
+        sensor_index (unsigned int): The index of the thermal sensor.
 
     Returns:
         nvmlGpuThermalSettings_t: Reference in which to return the thermal sensor information.
@@ -22174,7 +22873,7 @@ cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_in
     cdef GpuThermalSettings p_thermal_settings_py = GpuThermalSettings()
     cdef nvmlGpuThermalSettings_t *p_thermal_settings = <nvmlGpuThermalSettings_t *><intptr_t>(p_thermal_settings_py._get_ptr())
     with nogil:
-        __status__ = nvmlDeviceGetThermalSettings(<Device>device, sensor_ind_ex, p_thermal_settings)
+        __status__ = nvmlDeviceGetThermalSettings(<Device>device, sensor_index, p_thermal_settings)
     check_status(__status__)
     return p_thermal_settings_py
 
@@ -22605,7 +23304,7 @@ cpdef int device_get_default_ecc_mode(intptr_t device) except? -1:
 
 
 cpdef unsigned int device_get_board_id(intptr_t device) except? 0:
-    """Retrieves the device boardId from 0-N. Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with :func:`device_get_multi_gpu_board` to decide if they are on the same board as well. The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will always return those values but they will always be different from each other).
+    """Retrieves the device board_id from 0-N. Devices with the same board_id indicate GPUs connected to the same PLX. Use in conjunction with :func:`device_get_multi_gpu_board` to decide if they are on the same board as well. The board_id returned is a unique ID for the current configuration. Uniqueness and ordering across reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will always return those values but they will always be different from each other).
 
     Args:
         device (intptr_t): The identifier of the target device.
@@ -22623,7 +23322,7 @@ cpdef unsigned int device_get_board_id(intptr_t device) except? 0:
 
 
 cpdef unsigned int device_get_multi_gpu_board(intptr_t device) except? 0:
-    """Retrieves whether the device is on a Multi-GPU Board Devices that are on multi-GPU boards will set ``multiGpuBool`` to a non-zero value.
+    """Retrieves whether the device is on a Multi-GPU Board. Devices that are on multi-GPU boards will set ``multi_gpu_bool`` to a non-zero value.
 
     Args:
         device (intptr_t): The identifier of the target device.
@@ -22771,6 +23470,9 @@ cpdef object device_get_encoder_sessions(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        nvmlEncoderSessionInfo_t: Reference in which to return the session information.
+
     .. seealso:: `nvmlDeviceGetEncoderSessions`
     """
     cdef unsigned int[1] session_count = [0]
@@ -22878,6 +23580,9 @@ cpdef object device_get_fbc_sessions(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        nvmlFBCSessionInfo_t: Reference in which to return the session information.
+
     .. seealso:: `nvmlDeviceGetFBCSessions`
     """
     cdef unsigned int[1] session_count = [0]
@@ -22922,6 +23627,9 @@ cpdef str device_get_vbios_version(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Reference to which to return the VBIOS version.
+
     .. seealso:: `nvmlDeviceGetVbiosVersion`
     """
     cdef unsigned int length = 32
@@ -22957,6 +23665,9 @@ cpdef object device_get_compute_running_processes_v3(intptr_t device):
     Args:
         device (intptr_t): The device handle or MIG device handle.
 
+    Returns:
+        nvmlProcessInfo_t: Reference in which to return the process information.
+
     .. seealso:: `nvmlDeviceGetComputeRunningProcesses_v3`
     """
     cdef unsigned int[1] info_count = [0]
@@ -22973,12 +23684,40 @@ cpdef object device_get_compute_running_processes_v3(intptr_t device):
     return infos
 
 
+cpdef object device_get_graphics_running_processes_v3(intptr_t device):
+    """Get information about processes with a graphics context on a device.
+
+    Args:
+        device (intptr_t): The device handle or MIG device handle.
+
+    Returns:
+        nvmlProcessInfo_t: The process information, returned as a ``ProcessInfo`` object sized to the process count reported by the first NVML call.
+
+    .. seealso:: `nvmlDeviceGetGraphicsRunningProcesses_v3`
+    """
+    cdef unsigned int[1] info_count = [0]
+    with nogil:
+        __status__ = nvmlDeviceGetGraphicsRunningProcesses_v3(<Device>device, <unsigned int*>info_count, NULL)  # size query: no output buffer is passed
+    check_status_size(__status__)
+    cdef ProcessInfo infos = ProcessInfo(info_count[0])
+    cdef nvmlProcessInfo_t *infos_ptr = <nvmlProcessInfo_t *><intptr_t>(infos._get_ptr())
+    if info_count[0] == 0:
+        return infos  # count is 0, so the second NVML call is skipped
+    with nogil:
+        __status__ = nvmlDeviceGetGraphicsRunningProcesses_v3(<Device>device, <unsigned int*>info_count, infos_ptr)
+    check_status(__status__)
+    return infos
+
+
 cpdef object device_get_mps_compute_running_processes_v3(intptr_t device):
     """Get information about processes with a Multi-Process Service (MPS) compute context on a device.
 
     Args:
         device (intptr_t): The device handle or MIG device handle.
 
+    Returns:
+        nvmlProcessInfo_t: Reference in which to return the process information.
+
     .. seealso:: `nvmlDeviceGetMPSComputeRunningProcesses_v3`
     """
     cdef unsigned int[1] info_count = [0]
@@ -23448,6 +24187,9 @@ cpdef object device_get_accounting_pids(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        unsigned int: Reference in which to return list of process ids.
+
     .. seealso:: `nvmlDeviceGetAccountingPids`
     """
     cdef unsigned int[1] count = [0]
@@ -23489,6 +24231,9 @@ cpdef object device_get_retired_pages(intptr_t device, int cause):
         device (intptr_t): The identifier of the target device.
         cause (PageRetirementCause): Filter page addresses by cause of retirement.
 
+    Returns:
+        unsigned long long: Buffer to write the page addresses into.
+
     .. seealso:: `nvmlDeviceGetRetiredPages`
     """
     cdef unsigned int[1] page_count = [0]
@@ -23524,7 +24269,7 @@ cpdef int device_get_retired_pages_pending_status(intptr_t device) except? -1:
 
 
 cpdef tuple device_get_remapped_rows(intptr_t device):
-    """Get number of remapped rows. The number of rows reported will be based on the cause of the remapping. isPending indicates whether or not there are pending remappings. A reset will be required to actually remap the row. failureOccurred will be set if a row remapping ever failed in the past. A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that.
+    """Get number of remapped rows. The number of rows reported will be based on the cause of the remapping. is_pending indicates whether or not there are pending remappings. A reset will be required to actually remap the row. failure_occurred will be set if a row remapping ever failed in the past. A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that.
 
     Args:
         device (intptr_t): The identifier of the target device.
@@ -23610,7 +24355,10 @@ cpdef object device_get_process_utilization(intptr_t device, unsigned long long
 
     Args:
         device (intptr_t): The identifier of the target device.
-        last_seen_time_stamp (unsigned long long): Pointer to caller-supplied buffer in which guest process utilization samples are returned.
+        last_seen_time_stamp (unsigned long long): Return only samples with timestamp greater than last_seen_time_stamp.
+
+    Returns:
+        nvmlProcessUtilizationSample_t: Pointer to caller-supplied buffer in which guest process utilization samples are returned.
 
     .. seealso:: `nvmlDeviceGetProcessUtilization`
     """
@@ -24411,11 +25159,14 @@ cpdef unsigned int device_get_vgpu_capabilities(intptr_t device, int capability)
 
 
 cpdef str vgpu_type_get_class(unsigned int vgpu_type_id):
-    """Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). See ``nvmlConstants.NVML_DEVICE_NAME_BUFFER_SIZE``.
+    """Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). See nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
 
     Args:
         vgpu_type_id (unsigned int): Handle to vGPU type.
 
+    Returns:
+        char: Pointer to string array to return class in.
+
     .. seealso:: `nvmlVgpuTypeGetClass`
     """
     cdef unsigned int[1] size = [0]
@@ -24508,12 +25259,12 @@ cpdef unsigned int vgpu_type_get_num_display_heads(unsigned int vgpu_type_id) ex
     return num_display_heads
 
 
-cpdef tuple vgpu_type_get_resolution(unsigned int vgpu_type_id, unsigned int display_ind_ex):
+cpdef tuple vgpu_type_get_resolution(unsigned int vgpu_type_id, unsigned int display_index):
     """Retrieve vGPU display head's maximum supported resolution.
 
     Args:
         vgpu_type_id (unsigned int): Handle to vGPU type.
-        display_ind_ex (unsigned int): Zero-based index of display head.
+        display_index (unsigned int): Zero-based index of display head.
 
     Returns:
         A 2-tuple containing:
@@ -24526,7 +25277,7 @@ cpdef tuple vgpu_type_get_resolution(unsigned int vgpu_type_id, unsigned int dis
     cdef unsigned int xdim
     cdef unsigned int ydim
     with nogil:
-        __status__ = nvmlVgpuTypeGetResolution(<nvmlVgpuTypeId_t>vgpu_type_id, display_ind_ex, &xdim, &ydim)
+        __status__ = nvmlVgpuTypeGetResolution(<nvmlVgpuTypeId_t>vgpu_type_id, display_index, &xdim, &ydim)
     check_status(__status__)
     return (xdim, ydim)
 
@@ -24537,6 +25288,9 @@ cpdef str vgpu_type_get_license(unsigned int vgpu_type_id):
     Args:
         vgpu_type_id (unsigned int): Handle to vGPU type.
 
+    Returns:
+        char: Pointer to buffer to return license info.
+
     .. seealso:: `nvmlVgpuTypeGetLicense`
     """
     cdef unsigned int size = 128
@@ -24628,6 +25382,9 @@ cpdef str vgpu_instance_get_uuid(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        char: Pointer to caller-supplied buffer to hold vGPU UUID.
+
     .. seealso:: `nvmlVgpuInstanceGetUUID`
     """
     cdef unsigned int size = 80
@@ -24644,6 +25401,9 @@ cpdef str vgpu_instance_get_vm_driver_version(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        char: Caller-supplied buffer to return driver version string.
+
     .. seealso:: `nvmlVgpuInstanceGetVmDriverVersion`
     """
     cdef unsigned int length = 80
@@ -24697,7 +25457,7 @@ cpdef unsigned int vgpu_instance_get_type(unsigned int vgpu_instance) except? 0:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
     Returns:
-        unsigned int: Reference to return the vgpuTypeId.
+        unsigned int: Reference to return the vgpu_type_id.
 
     .. seealso:: `nvmlVgpuInstanceGetType`
     """
@@ -24806,6 +25566,9 @@ cpdef object vgpu_instance_get_encoder_sessions(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        nvmlEncoderSessionInfo_t: Reference to caller supplied array in which the list of session information is returned.
+
     .. seealso:: `nvmlVgpuInstanceGetEncoderSessions`
     """
     cdef unsigned int[1] session_count = [0]
@@ -24847,6 +25610,9 @@ cpdef object vgpu_instance_get_fbc_sessions(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        nvmlFBCSessionInfo_t: Reference in which to return the session information.
+
     .. seealso:: `nvmlVgpuInstanceGetFBCSessions`
     """
     cdef unsigned int[1] session_count = [0]
@@ -24887,6 +25653,9 @@ cpdef str vgpu_instance_get_gpu_pci_id(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        char: Caller-supplied buffer to return vGPU PCI Id string.
+
     .. seealso:: `nvmlVgpuInstanceGetGpuPciId`
     """
     cdef unsigned int[1] length = [0]
@@ -24904,7 +25673,7 @@ cpdef str vgpu_instance_get_gpu_pci_id(unsigned int vgpu_instance):
 
 
 cpdef unsigned int vgpu_type_get_capabilities(unsigned int vgpu_type_id, int capability) except? 0:
-    """Retrieve the requested capability for a given vGPU type. Refer to the ``nvmlVgpuCapability_t`` structure for the specific capabilities that can be queried. The return value in ``capResult`` should be treated as a boolean, with a non-zero value indicating that the capability is supported.
+    """Retrieve the requested capability for a given vGPU type. Refer to the ``nvmlVgpuCapability_t`` structure for the specific capabilities that can be queried. The return value in ``cap_result`` should be treated as a boolean, with a non-zero value indicating that the capability is supported.
 
     Args:
         vgpu_type_id (unsigned int): Handle to vGPU type.
@@ -24928,6 +25697,9 @@ cpdef str vgpu_instance_get_mdev_uuid(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): Identifier of the target vGPU instance.
 
+    Returns:
+        char: Pointer to caller-supplied buffer to hold MDEV UUID.
+
     .. seealso:: `nvmlVgpuInstanceGetMdevUUID`
     """
     cdef unsigned int size = 80
@@ -24960,7 +25732,7 @@ cpdef object gpu_instance_get_vgpu_scheduler_state(intptr_t gpu_instance):
         gpu_instance (intptr_t): The GPU instance handle.
 
     Returns:
-        nvmlVgpuSchedulerStateInfo_v1_t: Reference in which ``pSchedulerStateInfo`` is returned.
+        nvmlVgpuSchedulerStateInfo_v1_t: Reference in which ``p_scheduler_state_info`` is returned.
 
     .. seealso:: `nvmlGpuInstanceGetVgpuSchedulerState`
     """
@@ -24974,13 +25746,13 @@ cpdef object gpu_instance_get_vgpu_scheduler_state(intptr_t gpu_instance):
 
 
 cpdef object gpu_instance_get_vgpu_scheduler_log(intptr_t gpu_instance):
-    """Returns the vGPU scheduler logs for the given GPU instance. ``pSchedulerLogInfo`` points to a caller-allocated structure to contain the logs. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
+    """Returns the vGPU scheduler logs for the given GPU instance. ``p_scheduler_log_info`` points to a caller-allocated structure to contain the logs. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
 
     Args:
         gpu_instance (intptr_t): The GPU instance handle.
 
     Returns:
-        nvmlVgpuSchedulerLogInfo_v1_t: Reference in which ``pSchedulerLogInfo`` is written.
+        nvmlVgpuSchedulerLogInfo_v1_t: Reference in which ``p_scheduler_log_info`` is written.
 
     .. seealso:: `nvmlGpuInstanceGetVgpuSchedulerLog`
     """
@@ -24999,6 +25771,9 @@ cpdef str device_get_pgpu_metadata_string(intptr_t device):
     Args:
         device (intptr_t): The identifier of the target device.
 
+    Returns:
+        char: Pointer to caller-supplied buffer into which ``pgpu_metadata`` is written.
+
     .. seealso:: `nvmlDeviceGetPgpuMetadataString`
     """
     cdef unsigned int[1] buffer_size = [0]
@@ -25016,13 +25791,13 @@ cpdef str device_get_pgpu_metadata_string(intptr_t device):
 
 
 cpdef object device_get_vgpu_scheduler_log(intptr_t device):
-    """Returns the vGPU Software scheduler logs. ``pSchedulerLog`` points to a caller-allocated structure to contain the logs. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
+    """Returns the vGPU Software scheduler logs. ``p_scheduler_log`` points to a caller-allocated structure to contain the logs. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
 
     Args:
         device (intptr_t): The identifier of the target ``device``.
 
     Returns:
-        nvmlVgpuSchedulerLog_t: Reference in which ``pSchedulerLog`` is written.
+        nvmlVgpuSchedulerLog_t: Reference in which ``p_scheduler_log`` is written.
 
     .. seealso:: `nvmlDeviceGetVgpuSchedulerLog`
     """
@@ -25041,7 +25816,7 @@ cpdef object device_get_vgpu_scheduler_state(intptr_t device):
         device (intptr_t): The identifier of the target ``device``.
 
     Returns:
-        nvmlVgpuSchedulerGetState_t: Reference in which ``pSchedulerState`` is returned.
+        nvmlVgpuSchedulerGetState_t: Reference in which ``p_scheduler_state`` is returned.
 
     .. seealso:: `nvmlDeviceGetVgpuSchedulerState`
     """
@@ -25060,7 +25835,7 @@ cpdef object device_get_vgpu_scheduler_capabilities(intptr_t device):
         device (intptr_t): The identifier of the target ``device``.
 
     Returns:
-        nvmlVgpuSchedulerCapabilities_t: Reference in which ``pCapabilities`` is written.
+        nvmlVgpuSchedulerCapabilities_t: Reference in which ``p_capabilities`` is written.
 
     .. seealso:: `nvmlDeviceGetVgpuSchedulerCapabilities`
     """
@@ -25146,6 +25921,9 @@ cpdef object vgpu_instance_get_accounting_pids(unsigned int vgpu_instance):
     Args:
         vgpu_instance (unsigned int): The identifier of the target vGPU instance.
 
+    Returns:
+        unsigned int: Reference in which to return list of process ids.
+
     .. seealso:: `nvmlVgpuInstanceGetAccountingPids`
     """
     cdef unsigned int[1] count = [0]
@@ -25229,11 +26007,11 @@ cpdef unsigned int get_excluded_device_count() except? 0:
     return device_count
 
 
-cpdef object get_excluded_device_info_by_index(unsigned int ind_ex):
-    """Acquire the device information for an excluded GPU device, based on its ind_ex.
+cpdef object get_excluded_device_info_by_index(unsigned int index):
+    """Acquire the device information for an excluded GPU device, based on its index.
 
     Args:
-        ind_ex (unsigned int): The ind_ex of the target GPU, >= 0 and < ``deviceCount``.
+        index (unsigned int): The index of the target GPU, >= 0 and < ``deviceCount``.
 
     Returns:
         nvmlExcludedDeviceInfo_t: Reference in which to return the device information.
@@ -25243,7 +26021,7 @@ cpdef object get_excluded_device_info_by_index(unsigned int ind_ex):
     cdef ExcludedDeviceInfo info_py = ExcludedDeviceInfo()
     cdef nvmlExcludedDeviceInfo_t *info = <nvmlExcludedDeviceInfo_t *><intptr_t>(info_py._get_ptr())
     with nogil:
-        __status__ = nvmlGetExcludedDeviceInfoByIndex(ind_ex, info)
+        __status__ = nvmlGetExcludedDeviceInfoByIndex(index, info)
     check_status(__status__)
     return info_py
 
@@ -25256,7 +26034,7 @@ cpdef int device_set_mig_mode(intptr_t device, unsigned int mode) except? -1:
         mode (unsigned int): The mode to be set, ``NVML_DEVICE_MIG_DISABLE`` or ``NVML_DEVICE_MIG_ENABLE``.
 
     Returns:
-        int: The activationStatus status.
+        int: The activation_status status.
 
     .. seealso:: `nvmlDeviceSetMigMode`
     """
@@ -25296,6 +26074,9 @@ cpdef object device_get_gpu_instance_possible_placements_v2(intptr_t device, uns
         device (intptr_t): The identifier of the target device.
         profile_id (unsigned int): The GPU instance profile ID. See ``nvmlDeviceGetGpuInstanceProfileInfo``.
 
+    Returns:
+        nvmlGpuInstancePlacement_t: Returns placements allowed for the profile. Can be NULL to discover number of allowed placements for this profile. If non-NULL must be large enough to accommodate the placements supported by the profile.
+
     .. seealso:: `nvmlDeviceGetGpuInstancePossiblePlacements_v2`
     """
     cdef unsigned int[1] count = [0]
@@ -25469,6 +26250,9 @@ cpdef object gpu_instance_get_compute_instance_possible_placements(intptr_t gpu_
         gpu_instance (intptr_t): The identifier of the target GPU instance.
         profile_id (unsigned int): The compute instance profile ID. See ``nvmlGpuInstanceGetComputeInstanceProfileInfo``.
 
+    Returns:
+        nvmlComputeInstancePlacement_t: Returns placements allowed for the profile. Can be NULL to discover number of allowed placements for this profile. If non-NULL must be large enough to accommodate the placements supported by the profile.
+
     .. seealso:: `nvmlGpuInstanceGetComputeInstancePossiblePlacements`
     """
     cdef unsigned int[1] count = [0]
@@ -25647,12 +26431,12 @@ cpdef unsigned int device_get_max_mig_device_count(intptr_t device) except? 0:
     return count
 
 
-cpdef intptr_t device_get_mig_device_handle_by_index(intptr_t device, unsigned int ind_ex) except? 0:
-    """Get MIG device handle for the given ind_ex under its parent NVML device.
+cpdef intptr_t device_get_mig_device_handle_by_index(intptr_t device, unsigned int index) except? 0:
+    """Get MIG device handle for the given index under its parent NVML device.
 
     Args:
         device (intptr_t): Reference to the parent GPU device handle.
-        ind_ex (unsigned int): Index of the MIG device.
+        index (unsigned int): Index of the MIG device.
 
     Returns:
         intptr_t: Reference to the MIG device handle.
@@ -25661,7 +26445,7 @@ cpdef intptr_t device_get_mig_device_handle_by_index(intptr_t device, unsigned i
     """
     cdef Device mig_device
     with nogil:
-        __status__ = nvmlDeviceGetMigDeviceHandleByIndex(<Device>device, ind_ex, &mig_device)
+        __status__ = nvmlDeviceGetMigDeviceHandleByIndex(<Device>device, index, &mig_device)
     check_status(__status__)
     return <intptr_t>mig_device
 
@@ -25799,6 +26583,123 @@ cpdef device_set_power_mizer_mode_v1(intptr_t device, intptr_t power_mizer_mode)
     check_status(__status__)
 
 
+cpdef device_vgpu_force_gsp_unload(intptr_t device):
+    """Executes a forced GSP unload operation on a device. Nothing is returned; the NVML status is passed to ``check_status``.
+
+    Args:
+        device (intptr_t): The identifier of the target device.
+
+    .. seealso:: `nvmlDeviceVgpuForceGspUnload`
+    """
+    with nogil:
+        __status__ = nvmlDeviceVgpuForceGspUnload(<Device>device)
+    check_status(__status__)
+
+
+cpdef object device_get_vgpu_scheduler_state_v2(intptr_t device):
+    """Returns the vGPU scheduler state. The information returned in ``nvmlVgpuSchedulerStateInfo_v2_t`` is not relevant if the BEST EFFORT policy is set.
+
+    Args:
+        device (intptr_t): The identifier of the target ``device``.
+
+    Returns:
+        nvmlVgpuSchedulerStateInfo_v2_t: The scheduler state information, returned as a ``VgpuSchedulerStateInfo_v2`` object.
+
+    .. seealso:: `nvmlDeviceGetVgpuSchedulerState_v2`
+    """
+    cdef VgpuSchedulerStateInfo_v2 p_scheduler_state_info_py = VgpuSchedulerStateInfo_v2()
+    cdef nvmlVgpuSchedulerStateInfo_v2_t *p_scheduler_state_info = <nvmlVgpuSchedulerStateInfo_v2_t *><intptr_t>(p_scheduler_state_info_py._get_ptr())
+    with nogil:
+        __status__ = nvmlDeviceGetVgpuSchedulerState_v2(<Device>device, p_scheduler_state_info)
+    check_status(__status__)
+    return p_scheduler_state_info_py
+
+
+cpdef object gpu_instance_get_vgpu_scheduler_state_v2(intptr_t gpu_instance):
+    """Returns the vGPU scheduler state for the given GPU instance. The information returned in ``nvmlVgpuSchedulerStateInfo_v2_t`` is not relevant if the BEST EFFORT policy is set.
+
+    Args:
+        gpu_instance (intptr_t): The GPU instance handle.
+
+    Returns:
+        nvmlVgpuSchedulerStateInfo_v2_t: The scheduler state information, returned as a ``VgpuSchedulerStateInfo_v2`` object.
+
+    .. seealso:: `nvmlGpuInstanceGetVgpuSchedulerState_v2`
+    """
+    cdef VgpuSchedulerStateInfo_v2 p_scheduler_state_info_py = VgpuSchedulerStateInfo_v2()
+    cdef nvmlVgpuSchedulerStateInfo_v2_t *p_scheduler_state_info = <nvmlVgpuSchedulerStateInfo_v2_t *><intptr_t>(p_scheduler_state_info_py._get_ptr())
+    with nogil:
+        __status__ = nvmlGpuInstanceGetVgpuSchedulerState_v2(<GpuInstance>gpu_instance, p_scheduler_state_info)
+    check_status(__status__)
+    return p_scheduler_state_info_py
+
+
+cpdef object device_get_vgpu_scheduler_log_v2(intptr_t device):
+    """Returns the vGPU Software scheduler logs for the device. The logs are written into a ``VgpuSchedulerLogInfo_v2`` object allocated by this function. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
+
+    Args:
+        device (intptr_t): The identifier of the target ``device``.
+
+    Returns:
+        nvmlVgpuSchedulerLogInfo_v2_t: The scheduler logs, returned as a ``VgpuSchedulerLogInfo_v2`` object.
+
+    .. seealso:: `nvmlDeviceGetVgpuSchedulerLog_v2`
+    """
+    cdef VgpuSchedulerLogInfo_v2 p_scheduler_log_info_py = VgpuSchedulerLogInfo_v2()
+    cdef nvmlVgpuSchedulerLogInfo_v2_t *p_scheduler_log_info = <nvmlVgpuSchedulerLogInfo_v2_t *><intptr_t>(p_scheduler_log_info_py._get_ptr())
+    with nogil:
+        __status__ = nvmlDeviceGetVgpuSchedulerLog_v2(<Device>device, p_scheduler_log_info)
+    check_status(__status__)
+    return p_scheduler_log_info_py
+
+
+cpdef object gpu_instance_get_vgpu_scheduler_log_v2(intptr_t gpu_instance):
+    """Returns the vGPU scheduler logs for the given GPU instance. The logs are written into a ``VgpuSchedulerLogInfo_v2`` object allocated by this function. The number of elements returned will never exceed ``NVML_SCHEDULER_SW_MAX_LOG_ENTRIES``.
+
+    Args:
+        gpu_instance (intptr_t): The GPU instance handle.
+
+    Returns:
+        nvmlVgpuSchedulerLogInfo_v2_t: The scheduler logs, returned as a ``VgpuSchedulerLogInfo_v2`` object.
+
+    .. seealso:: `nvmlGpuInstanceGetVgpuSchedulerLog_v2`
+    """
+    cdef VgpuSchedulerLogInfo_v2 p_scheduler_log_info_py = VgpuSchedulerLogInfo_v2()
+    cdef nvmlVgpuSchedulerLogInfo_v2_t *p_scheduler_log_info = <nvmlVgpuSchedulerLogInfo_v2_t *><intptr_t>(p_scheduler_log_info_py._get_ptr())
+    with nogil:
+        __status__ = nvmlGpuInstanceGetVgpuSchedulerLog_v2(<GpuInstance>gpu_instance, p_scheduler_log_info)
+    check_status(__status__)
+    return p_scheduler_log_info_py
+
+
+cpdef device_set_vgpu_scheduler_state_v2(intptr_t device, intptr_t p_scheduler_state):
+    """Sets the vGPU scheduler state.
+
+    Args:
+        device (intptr_t): The identifier of the target ``device``.
+        p_scheduler_state (intptr_t): Address of the ``nvmlVgpuSchedulerState_v2_t`` structure holding the state to set, passed as an integer.
+
+    .. seealso:: `nvmlDeviceSetVgpuSchedulerState_v2`
+    """
+    with nogil:
+        __status__ = nvmlDeviceSetVgpuSchedulerState_v2(<Device>device, <nvmlVgpuSchedulerState_v2_t*>p_scheduler_state)
+    check_status(__status__)
+
+
+cpdef gpu_instance_set_vgpu_scheduler_state_v2(intptr_t gpu_instance, intptr_t p_scheduler_state):
+    """Set vGPU scheduler state for the given GPU instance.
+
+    Args:
+        gpu_instance (intptr_t): The GPU instance handle.
+        p_scheduler_state (intptr_t): Address of the caller-provided ``nvmlVgpuSchedulerState_v2_t`` structure, passed as an integer.
+
+    .. seealso:: `nvmlGpuInstanceSetVgpuSchedulerState_v2`
+    """
+    with nogil:
+        __status__ = nvmlGpuInstanceSetVgpuSchedulerState_v2(<GpuInstance>gpu_instance, <nvmlVgpuSchedulerState_v2_t*>p_scheduler_state)
+    check_status(__status__)
+
+
 cpdef object system_get_topology_gpu_set(unsigned int cpuNumber):
     """Retrieve the set of GPUs that have a CPU affinity with the given CPU number
 
@@ -27472,3 +28373,8 @@ cpdef str vgpu_type_get_name(unsigned int vgpu_type_id):
     check_status(__status__)
     return cpython.PyUnicode_FromStringAndSize(vgpu_type_name, size[0])
 
+
+# Clean up docstrings that don't parse as rst: wrap the NVML_GPU_VIRTUALIZATION_? pattern in an inline literal and replace the GPM_METRIC_DRAM_BW_UTIL text outright.
+device_get_virtualization_mode.__doc__ = device_get_virtualization_mode.__doc__.replace("NVML_GPU_VIRTUALIZATION_?", "``NVML_GPU_VIRTUALIZATION_?``")
+device_set_virtualization_mode.__doc__ = device_set_virtualization_mode.__doc__.replace("NVML_GPU_VIRTUALIZATION_?", "``NVML_GPU_VIRTUALIZATION_?``")
+GpmMetricId.GPM_METRIC_DRAM_BW_UTIL.__doc__ = "Percentage of DRAM bw used vs theoretical maximum. ``0.0 - 100.0 *\u200d/``."
\ No newline at end of file
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 154406c684..60bbfa11a3 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 12.9.0, generator version 49a8141. Do not modify it directly.
+# This code was automatically generated with version 12.9.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
@@ -447,7 +447,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  ::make_cudaPitchedPtr
+    CUDA Pitched memory pointer  make_cudaPitchedPtr
 
     Attributes
     ----------
@@ -483,7 +483,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  ::make_cudaExtent
+    CUDA extent  make_cudaExtent
 
     Attributes
     ----------
@@ -513,7 +513,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  ::make_cudaPos
+    CUDA 3D position  make_cudaPos
 
     Attributes
     ----------
@@ -3444,9 +3444,9 @@ cdef class cudaGraphEdgeData_st:
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from
-        ::cudaGraphDependencyType. (It is typed as char due to compiler-
-        specific layout of bitfields.) See ::cudaGraphDependencyType.
+        This should be populated with a value from cudaGraphDependencyType.
+        (It is typed as char due to compiler-specific layout of bitfields.)
+        See cudaGraphDependencyType.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -3644,8 +3644,8 @@ cdef class cudaLaunchMemSyncDomainMap_st:
     Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
     default, kernels are launched in domain 0. Kernel launched with
     cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
+    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
+    specific stream / graph node / kernel launch. See
     cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
     through cudaDevAttrMemSyncDomainCount.
 
@@ -3803,8 +3803,7 @@ cdef class anon_struct26:
 
 cdef class cudaLaunchAttributeValue:
     """
-    Launch attributes union; used as value field of
-    ::cudaLaunchAttribute
+    Launch attributes union; used as value field of cudaLaunchAttribute
 
     Attributes
     ----------
@@ -3824,7 +3823,7 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
     syncPolicy : cudaSynchronizationPolicy
         Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        ::cudaSynchronizationPolicy for work queued up in this stream.
+        cudaSynchronizationPolicy for work queued up in this stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct22
@@ -3852,9 +3851,10 @@ cdef class cudaLaunchAttributeValue:
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
         the following fields: - `cudaEvent_t` event - Event to fire when
         all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -3864,7 +3864,7 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
     memSyncDomainMap : cudaLaunchMemSyncDomainMap
         Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        ::cudaLaunchMemSyncDomainMap.
+        cudaLaunchMemSyncDomainMap.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
     memSyncDomain : cudaLaunchMemSyncDomain
@@ -3879,19 +3879,19 @@ cdef class cudaLaunchAttributeValue:
         with the following fields: - `x` - The X dimension of the preferred
         cluster, in blocks. Must be a divisor of the grid X dimension, and
         must be a multiple of the `x` field of
-        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        cudaLaunchAttributeValue::clusterDim.
+        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        of the preferred cluster, in blocks. Must be a divisor of the grid
+        Y dimension, and must be a multiple of the `y` field of
+        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the `z` field
+        of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct25
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
         with the following fields: - `cudaEvent_t` event - Event to fire
         when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -4349,9 +4349,9 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from
-        ::cudaGraphDependencyType. (It is typed as char due to compiler-
-        specific layout of bitfields.) See ::cudaGraphDependencyType.
+        This should be populated with a value from cudaGraphDependencyType.
+        (It is typed as char due to compiler-specific layout of bitfields.)
+        See cudaGraphDependencyType.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -4436,8 +4436,8 @@ cdef class cudaLaunchMemSyncDomainMap(cudaLaunchMemSyncDomainMap_st):
     Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
     default, kernels are launched in domain 0. Kernel launched with
     cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
+    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
+    specific stream / graph node / kernel launch. See
     cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
     through cudaDevAttrMemSyncDomainCount.
 
@@ -4512,8 +4512,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo):
 
 cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     """
-    Launch attributes union; used as value field of
-    ::cudaLaunchAttribute
+    Launch attributes union; used as value field of cudaLaunchAttribute
 
     Attributes
     ----------
@@ -4533,7 +4532,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
     syncPolicy : cudaSynchronizationPolicy
         Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        ::cudaSynchronizationPolicy for work queued up in this stream.
+        cudaSynchronizationPolicy for work queued up in this stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct22
@@ -4561,9 +4560,10 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
         the following fields: - `cudaEvent_t` event - Event to fire when
         all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        ::cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -4573,7 +4573,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
     memSyncDomainMap : cudaLaunchMemSyncDomainMap
         Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        ::cudaLaunchMemSyncDomainMap.
+        cudaLaunchMemSyncDomainMap.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
     memSyncDomain : cudaLaunchMemSyncDomain
@@ -4588,19 +4588,19 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
         with the following fields: - `x` - The X dimension of the preferred
         cluster, in blocks. Must be a divisor of the grid X dimension, and
         must be a multiple of the `x` field of
-        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        cudaLaunchAttributeValue::clusterDim.
+        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        of the preferred cluster, in blocks. Must be a divisor of the grid
+        Y dimension, and must be a multiple of the `y` field of
+        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the `z` field
+        of ::cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct25
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
         with the following fields: - `cudaEvent_t` event - Event to fire
         when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        flags, see ::cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -4629,8 +4629,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
 
 cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     """
-    Launch attributes union; used as value field of
-    ::cudaLaunchAttribute
+    Launch attributes union; used as value field of cudaLaunchAttribute
 
     Attributes
     ----------
@@ -4650,7 +4649,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
     syncPolicy : cudaSynchronizationPolicy
         Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        ::cudaSynchronizationPolicy for work queued up in this stream.
+        cudaSynchronizationPolicy for work queued up in this stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct22
@@ -4678,9 +4677,10 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
         the following fields: - `cudaEvent_t` event - Event to fire when
         all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        ::cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -4690,7 +4690,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
     memSyncDomainMap : cudaLaunchMemSyncDomainMap
         Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        ::cudaLaunchMemSyncDomainMap.
+        cudaLaunchMemSyncDomainMap.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
     memSyncDomain : cudaLaunchMemSyncDomain
@@ -4705,19 +4705,19 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
         with the following fields: - `x` - The X dimension of the preferred
         cluster, in blocks. Must be a divisor of the grid X dimension, and
         must be a multiple of the `x` field of
-        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        cudaLaunchAttributeValue::clusterDim.
+        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        of the preferred cluster, in blocks. Must be a divisor of the grid
+        Y dimension, and must be a multiple of the `y` field of
+        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the `z` field
+        of ::cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct25
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
         with the following fields: - `cudaEvent_t` event - Event to fire
         when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        flags, see ::cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 570bbffc24..17ed3cda69 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 12.9.0, generator version 49a8141. Do not modify it directly.
+# This code was automatically generated with version 12.9.0, generator version 0.3.1.dev1568+g289771de9.d20260413. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -1529,41 +1529,41 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -1576,11 +1576,11 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
-        'Event recorded through this launch attribute is guaranteed to only trigger\n'
-        'after all block in the associated kernel trigger the event. A block can\n'
-        'trigger the event programmatically in a future CUDA release. A trigger can\n'
-        "also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
+        'event. Event recorded through this launch attribute is guaranteed to only\n'
+        'trigger after all block in the associated kernel trigger the event. A block\n'
+        'can trigger the event programmatically in a future CUDA release. A trigger\n'
+        "can also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -1601,28 +1601,28 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -1656,7 +1656,7 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -1711,7 +1711,7 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -1737,7 +1737,9 @@ class cudaAsyncNotificationType(_FastEnum):
 {{if 'cudaDataType_t' in found_types}}
 
 class cudaDataType(_FastEnum):
-    """"""
+    """
+
+    """
     {{if 'CUDA_R_32F' in found_values}}
     CUDA_R_32F = cyruntime.cudaDataType_t.CUDA_R_32F{{endif}}
     {{if 'CUDA_R_64F' in found_values}}
@@ -1813,7 +1815,9 @@ class cudaDataType(_FastEnum):
 {{if 'libraryPropertyType_t' in found_types}}
 
 class libraryPropertyType(_FastEnum):
-    """"""
+    """
+
+    """
     {{if 'MAJOR_VERSION' in found_values}}
     MAJOR_VERSION = cyruntime.libraryPropertyType_t.MAJOR_VERSION{{endif}}
     {{if 'MINOR_VERSION' in found_values}}
@@ -5971,7 +5975,9 @@ class cudaTextureReadMode(_FastEnum):
 {{if 'cudaRoundMode' in found_types}}
 
 class cudaRoundMode(_FastEnum):
-    """"""
+    """
+
+    """
     {{if 'cudaRoundNearest' in found_values}}
     cudaRoundNearest = cyruntime.cudaRoundMode.cudaRoundNearest{{endif}}
     {{if 'cudaRoundZero' in found_values}}
@@ -6060,41 +6066,41 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -6107,11 +6113,11 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
-        'Event recorded through this launch attribute is guaranteed to only trigger\n'
-        'after all block in the associated kernel trigger the event. A block can\n'
-        'trigger the event programmatically in a future CUDA release. A trigger can\n'
-        "also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
+        'event. Event recorded through this launch attribute is guaranteed to only\n'
+        'trigger after all block in the associated kernel trigger the event. A block\n'
+        'can trigger the event programmatically in a future CUDA release. A trigger\n'
+        "can also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -6132,28 +6138,28 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -6187,7 +6193,7 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -6242,7 +6248,7 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -6269,41 +6275,41 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -6316,11 +6322,11 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
-        'Event recorded through this launch attribute is guaranteed to only trigger\n'
-        'after all block in the associated kernel trigger the event. A block can\n'
-        'trigger the event programmatically in a future CUDA release. A trigger can\n'
-        "also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
+        'event. Event recorded through this launch attribute is guaranteed to only\n'
+        'trigger after all block in the associated kernel trigger the event. A block\n'
+        'can trigger the event programmatically in a future CUDA release. A trigger\n'
+        "can also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -6341,28 +6347,28 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -6396,7 +6402,7 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -6451,7 +6457,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -7562,7 +7568,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  ::make_cudaPitchedPtr
+    CUDA Pitched memory pointer  make_cudaPitchedPtr
 
     Attributes
     ----------
@@ -7667,7 +7673,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  ::make_cudaExtent
+    CUDA extent  make_cudaExtent
 
     Attributes
     ----------
@@ -7754,7 +7760,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  ::make_cudaPos
+    CUDA 3D position  make_cudaPos
 
     Attributes
     ----------
@@ -16582,9 +16588,9 @@ cdef class cudaGraphEdgeData_st:
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from
-        ::cudaGraphDependencyType. (It is typed as char due to compiler-
-        specific layout of bitfields.) See ::cudaGraphDependencyType.
+        This should be populated with a value from cudaGraphDependencyType.
+        (It is typed as char due to compiler-specific layout of bitfields.)
+        See cudaGraphDependencyType.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -17198,8 +17204,8 @@ cdef class cudaLaunchMemSyncDomainMap_st:
     Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
     default, kernels are launched in domain 0. Kernel launched with
     cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with ::cudaLaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
+    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
+    specific stream / graph node / kernel launch. See
     cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
     through cudaDevAttrMemSyncDomainCount.
 
@@ -17679,8 +17685,7 @@ cdef class anon_struct26:
 
 cdef class cudaLaunchAttributeValue:
     """
-    Launch attributes union; used as value field of
-    ::cudaLaunchAttribute
+    Launch attributes union; used as value field of cudaLaunchAttribute
 
     Attributes
     ----------
@@ -17700,7 +17705,7 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
     syncPolicy : cudaSynchronizationPolicy
         Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        ::cudaSynchronizationPolicy for work queued up in this stream.
+        cudaSynchronizationPolicy for work queued up in this stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct22
@@ -17728,9 +17733,10 @@ cdef class cudaLaunchAttributeValue:
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
         the following fields: - `cudaEvent_t` event - Event to fire when
         all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - `int` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -17740,7 +17746,7 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
     memSyncDomainMap : cudaLaunchMemSyncDomainMap
         Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        ::cudaLaunchMemSyncDomainMap.
+        cudaLaunchMemSyncDomainMap.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
     memSyncDomain : cudaLaunchMemSyncDomain
@@ -17755,19 +17761,19 @@ cdef class cudaLaunchAttributeValue:
         with the following fields: - `x` - The X dimension of the preferred
         cluster, in blocks. Must be a divisor of the grid X dimension, and
         must be a multiple of the `x` field of
-        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        cudaLaunchAttributeValue::clusterDim.
+        cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        of the preferred cluster, in blocks. Must be a divisor of the grid
+        Y dimension, and must be a multiple of the `y` field of
+        cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the `z` field
+        of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct25
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
         with the following fields: - `cudaEvent_t` event - Event to fire
         when the last block launches.    - `int` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
+        flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
@@ -19462,33 +19468,9 @@ def cudaDeviceGetLimit(limit not None : cudaLimit):
 
 @cython.embedsignature(True)
 def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDesc], int device):
-    """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
-
-    Returns in `maxWidthInElements` the maximum number of elements
-    allocatable in a 1D linear texture for given format descriptor
-    `fmtDesc`.
-
-    Parameters
-    ----------
-    fmtDesc : :py:obj:`~.cudaChannelFormatDesc`
-        Texture format description.
-    None : int
-        None
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`
-    maxWidthInElements : int
-        Returns maximum number of texture elements allocatable for given
-        `fmtDesc`.
-
-    See Also
-    --------
-    :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth`
-    """
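+    """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size."""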
     cdef size_t maxWidthInElements = 0
-    cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc is not None else NULL
+    cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = <cyruntime.cudaChannelFormatDesc*>fmtDesc._pvt_ptr if fmtDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cyfmtDesc_ptr, device)
     if err != cyruntime.cudaSuccess:
@@ -19500,7 +19482,13 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes
 
 @cython.embedsignature(True)
 def cudaDeviceGetCacheConfig():
-    """ Returns the preferred cache configuration for the current device.
+    """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
+
+    Returns in `maxWidthInElements` the maximum number of elements
+    allocatable in a 1D linear texture for given format descriptor
+    `fmtDesc`.
+
+    Returns the preferred cache configuration for the current device.
 
     On devices where the L1 cache and shared memory use the same hardware
     resources, this returns through `pCacheConfig` the preferred cache
@@ -19530,12 +19518,16 @@ def cudaDeviceGetCacheConfig():
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`
         :py:obj:`~.cudaSuccess`
-    pCacheConfig : :py:obj:`~.cudaFuncCache`
-        Returned cache configuration
+    maxWidthInElements : :py:obj:`~.cudaFuncCache`
+        Returns maximum number of texture elements allocatable for given
+        `fmtDesc`.
 
     See Also
     --------
+    :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth`
+
     :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxGetCacheConfig`
     """
     cdef cyruntime.cudaFuncCache pCacheConfig
@@ -20007,38 +19999,7 @@ def cudaIpcCloseMemHandle(devPtr):
 
 @cython.embedsignature(True)
 def cudaDeviceFlushGPUDirectRDMAWrites(target not None : cudaFlushGPUDirectRDMAWritesTarget, scope not None : cudaFlushGPUDirectRDMAWritesScope):
-    """ Blocks until remote writes are visible to the specified scope.
-
-    Blocks until remote writes to the target context via mappings created
-    through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
-    https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
-    visible to the specified scope.
-
-    If the scope equals or lies within the scope indicated by
-    :py:obj:`~.cudaDevAttrGPUDirectRDMAWritesOrdering`, the call will be a
-    no-op and can be safely omitted for performance. This can be determined
-    by comparing the numerical values between the two enums, with smaller
-    scopes having smaller values.
-
-    Users may query support for this API via
-    :py:obj:`~.cudaDevAttrGPUDirectRDMAFlushWritesOptions`.
-
-    Parameters
-    ----------
-    target : :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget`
-        The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
-    scope : :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope`
-        The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`,
-
-    See Also
-    --------
-    :py:obj:`~.cuFlushGPUDirectRDMAWrites`
-    """
+    """ Blocks until remote writes are visible to the specified scope."""
     cdef cyruntime.cudaFlushGPUDirectRDMAWritesTarget cytarget = int(target)
     cdef cyruntime.cudaFlushGPUDirectRDMAWritesScope cyscope = int(scope)
     with nogil:
@@ -20062,7 +20023,23 @@ cdef void cudaAsyncNotificationCallbackWrapper(cyruntime.cudaAsyncNotificationIn
 
 @cython.embedsignature(True)
 def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
-    """ Registers a callback function to receive async notifications.
+    """ Blocks until remote writes are visible to the specified scope.
+
+    Blocks until remote writes to the target context via mappings created
+    through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
+    https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+    visible to the specified scope.
+
+    If the scope equals or lies within the scope indicated by
+    :py:obj:`~.cudaDevAttrGPUDirectRDMAWritesOrdering`, the call will be a
+    no-op and can be safely omitted for performance. This can be determined
+    by comparing the numerical values between the two enums, with smaller
+    scopes having smaller values.
+
+    Users may query support for this API via
+    :py:obj:`~.cudaDevAttrGPUDirectRDMAFlushWritesOptions`.
+
+    Registers a callback function to receive async notifications
 
     Registers `callbackFunc` to receive async notifications.
 
@@ -20084,23 +20061,25 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
 
     Parameters
     ----------
-    device : int
+    target : int
+        The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+    scope : :py:obj:`~.cudaAsyncCallback`
+        The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+    device : Any
         The device on which to register the callback
-    callbackFunc : :py:obj:`~.cudaAsyncCallback`
-        The function to register as a callback
-    userData : Any
-        A generic pointer to user data. This is passed into the callback
-        function.
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`,
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorNotSupported` :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotPermitted` :py:obj:`~.cudaErrorUnknown`
-    callback : :py:obj:`~.cudaAsyncCallbackHandle_t`
-        A handle representing the registered callback instance
+    callbackFunc : :py:obj:`~.cudaAsyncCallbackHandle_t`
+        The function to register as a callback
 
     See Also
     --------
+    :py:obj:`~.cuFlushGPUDirectRDMAWrites`
+
     :py:obj:`~.cudaDeviceUnregisterAsyncNotification`
     """
     cdef cyruntime.cudaAsyncCallback cycallbackFunc
@@ -21303,8 +21282,8 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
 
     - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if `device`
       is GA10X+. NvSciSyncAttrKey_GpuId is set to the same UUID that is
-      returned in `None` from :py:obj:`~.cudaDeviceGetProperties` for this
-      `device`.
+      returned in `cudaDeviceProp.uuid` from
+      :py:obj:`~.cudaDeviceGetProperties` for this `device`.
 
     :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeviceUninitialized`,
     :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`,
@@ -21423,7 +21402,7 @@ def cudaChooseDevice(prop : Optional[cudaDeviceProp]):
     :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaInitDevice`
     """
     cdef int device = 0
-    cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop is not None else NULL
+    cdef cyruntime.cudaDeviceProp* cyprop_ptr = <cyruntime.cudaDeviceProp*>prop._pvt_ptr if prop is not None else NULL
     with nogil:
         err = cyruntime.cudaChooseDevice(&device, cyprop_ptr)
     if err != cyruntime.cudaSuccess:
@@ -22164,7 +22143,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op
         phStream = int(cudaStream_t(hStream))
     cyhStream = <cyruntime.cudaStream_t><void_ptr>phStream
     cdef cyruntime.cudaStreamAttrID cyattr = int(attr)
-    cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value is not None else NULL
+    cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = <cyruntime.cudaStreamAttrValue*>value._pvt_ptr if value is not None else NULL
     with nogil:
         err = cyruntime.cudaStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr)
     return (_cudaError_t(err),)
@@ -23413,49 +23392,7 @@ def cudaEventRecord(event, stream):
 
 @cython.embedsignature(True)
 def cudaEventRecordWithFlags(event, stream, unsigned int flags):
-    """ Records an event.
-
-    Captures in `event` the contents of `stream` at the time of this call.
-    `event` and `stream` must be on the same CUDA context. Calls such as
-    :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will
-    then examine or wait for completion of the work that was captured. Uses
-    of `stream` after this call do not modify `event`. See note on default
-    stream behavior for what is captured in the default case.
-
-    :py:obj:`~.cudaEventRecordWithFlags()` can be called multiple times on
-    the same event and will overwrite the previously captured state. Other
-    APIs such as :py:obj:`~.cudaStreamWaitEvent()` use the most recently
-    captured state at the time of the API call, and are not affected by
-    later calls to :py:obj:`~.cudaEventRecordWithFlags()`. Before the first
-    call to :py:obj:`~.cudaEventRecordWithFlags()`, an event represents an
-    empty set of work, so for example :py:obj:`~.cudaEventQuery()` would
-    return :py:obj:`~.cudaSuccess`.
-
-    flags include:
-
-    - :py:obj:`~.cudaEventRecordDefault`: Default event creation flag.
-
-    - :py:obj:`~.cudaEventRecordExternal`: Event is captured in the graph
-      as an external event node when performing stream capture.
-
-    Parameters
-    ----------
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event to record
-    stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
-        Stream in which to record event
-    flags : unsigned int
-        Parameters for the operation(See above)
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
-
-    See Also
-    --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecord`,
-    """
+    """ Records an event."""
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
         pstream = 0
@@ -23481,7 +23418,32 @@ def cudaEventRecordWithFlags(event, stream, unsigned int flags):
 
 @cython.embedsignature(True)
 def cudaEventQuery(event):
-    """ Queries an event's status.
+    """ Records an event.
+
+    Captures in `event` the contents of `stream` at the time of this call.
+    `event` and `stream` must be on the same CUDA context. Calls such as
+    :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will
+    then examine or wait for completion of the work that was captured. Uses
+    of `stream` after this call do not modify `event`. See note on default
+    stream behavior for what is captured in the default case.
+
+    :py:obj:`~.cudaEventRecordWithFlags()` can be called multiple times on
+    the same event and will overwrite the previously captured state. Other
+    APIs such as :py:obj:`~.cudaStreamWaitEvent()` use the most recently
+    captured state at the time of the API call, and are not affected by
+    later calls to :py:obj:`~.cudaEventRecordWithFlags()`. Before the first
+    call to :py:obj:`~.cudaEventRecordWithFlags()`, an event represents an
+    empty set of work, so for example :py:obj:`~.cudaEventQuery()` would
+    return :py:obj:`~.cudaSuccess`.
+
+    flags include:
+
+    - :py:obj:`~.cudaEventRecordDefault`: Default event creation flag.
+
+    - :py:obj:`~.cudaEventRecordExternal`: Event is captured in the graph
+      as an external event node when performing stream capture.
+
+    Queries an event's status
 
     Queries the status of all work currently captured by `event`. See
     :py:obj:`~.cudaEventRecord()` for details on what is captured by an
@@ -23498,15 +23460,18 @@ def cudaEventQuery(event):
     Parameters
     ----------
     event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event to query
+        Event to record
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
+    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecord`,
+
     :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cuEventQuery`
     """
     cdef cyruntime.cudaEvent_t cyevent
@@ -23895,7 +23860,7 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
     and Cache Control" chapter from Vulkan specification.
     """
     cdef cudaExternalMemory_t extMem_out = cudaExternalMemory_t()
-    cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc is not None else NULL
+    cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = <cyruntime.cudaExternalMemoryHandleDesc*>memHandleDesc._pvt_ptr if memHandleDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaImportExternalMemory(<cyruntime.cudaExternalMemory_t*>extMem_out._pvt_ptr, cymemHandleDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -23963,7 +23928,7 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal
         pextMem = int(cudaExternalMemory_t(extMem))
     cyextMem = <cyruntime.cudaExternalMemory_t><void_ptr>pextMem
     cdef void_ptr devPtr = 0
-    cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc is not None else NULL
+    cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = <cyruntime.cudaExternalMemoryBufferDesc*>bufferDesc._pvt_ptr if bufferDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaExternalMemoryGetMappedBuffer(<void**>&devPtr, cyextMem, cybufferDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -24035,7 +24000,7 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda
         pextMem = int(cudaExternalMemory_t(extMem))
     cyextMem = <cyruntime.cudaExternalMemory_t><void_ptr>pextMem
     cdef cudaMipmappedArray_t mipmap = cudaMipmappedArray_t()
-    cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc is not None else NULL
+    cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = <cyruntime.cudaExternalMemoryMipmappedArrayDesc*>mipmapDesc._pvt_ptr if mipmapDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaExternalMemoryGetMappedMipmappedArray(<cyruntime.cudaMipmappedArray_t*>mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -24222,7 +24187,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
     :py:obj:`~.cudaDestroyExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
     """
     cdef cudaExternalSemaphore_t extSem_out = cudaExternalSemaphore_t()
-    cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = <cyruntime.cudaExternalSemaphoreHandleDesc*>semHandleDesc._pvt_ptr if semHandleDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaImportExternalSemaphore(<cyruntime.cudaExternalSemaphore_t*>extSem_out._pvt_ptr, cysemHandleDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -24549,8 +24514,8 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache):
     possible, but it is free to choose a different configuration if
     required to execute `func`.
 
-    `func` is a device function symbol and must be declared as a `None`
-    function. If the specified function does not exist, then
+    `func` is a device function symbol and must be declared as a
+    `__global__` function. If the specified function does not exist, then
     :py:obj:`~.cudaErrorInvalidDeviceFunction` is returned. For templated
     functions, pass the function symbol as follows:
     func_name<template_arg_0,...,template_arg_N>
@@ -24612,8 +24577,8 @@ def cudaFuncGetAttributes(func):
 
     This function obtains the attributes of a function specified via
     `func`. `func` is a device function symbol and must be declared as a
-    `None` function. The fetched attributes are placed in `attr`. If the
-    specified function does not exist, then it is assumed to be a
+    `__global__` function. The fetched attributes are placed in `attr`. If
+    the specified function does not exist, then it is assumed to be a
     :py:obj:`~.cudaKernel_t` and used as is. For templated functions, pass
     the function symbol as follows:
     func_name<template_arg_0,...,template_arg_N>
@@ -24658,11 +24623,11 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
     This function sets the attributes of a function specified via `func`.
     The parameter `func` must be a pointer to a function that executes on
     the device. The parameter specified by `func` must be declared as a
-    `None` function. The enumeration defined by `attr` is set to the value
-    defined by `value`. If the specified function does not exist, then it
-    is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. If the
-    specified attribute cannot be written, or if the value is incorrect,
-    then :py:obj:`~.cudaErrorInvalidValue` is returned.
+    `__global__` function. The enumeration defined by `attr` is set to the
+    value defined by `value`. If the specified function does not exist,
+    then it is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. If
+    the specified attribute cannot be written, or if the value is
+    incorrect, then :py:obj:`~.cudaErrorInvalidValue` is returned.
 
     Valid values for `attr` are:
 
@@ -25400,7 +25365,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
     :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuArrayCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
-    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL
+    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
     with nogil:
         err = cyruntime.cudaMallocArray(<cyruntime.cudaArray_t*>array._pvt_ptr, cydesc_ptr, width, height, flags)
     if err != cyruntime.cudaSuccess:
@@ -25940,7 +25905,7 @@ def cudaMalloc3D(extent not None : cudaExtent):
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaPitchedPtr, make_cudaExtent, :py:obj:`~.cuMemAllocPitch`
     """
     cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr()
     with nogil:
@@ -26062,10 +26027,10 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuArray3DCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
-    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL
+    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
     with nogil:
         err = cyruntime.cudaMalloc3DArray(<cyruntime.cudaArray_t*>array._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], flags)
     if err != cyruntime.cudaSuccess:
@@ -26188,10 +26153,10 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayCreate`
     """
     cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t()
-    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc is not None else NULL
+    cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
     with nogil:
         err = cyruntime.cudaMallocMipmappedArray(<cyruntime.cudaMipmappedArray_t*>mipmappedArray._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], numLevels, flags)
     if err != cyruntime.cudaSuccess:
@@ -26230,7 +26195,7 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayGetLevel`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayGetLevel`
     """
     cdef cyruntime.cudaMipmappedArray_const_t cymipmappedArray
     if mipmappedArray is None:
@@ -26324,9 +26289,9 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3D`
     """
-    cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p is not None else NULL
+    cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = <cyruntime.cudaMemcpy3DParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3D(cyp_ptr)
     return (_cudaError_t(err),)
@@ -26363,7 +26328,7 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]):
     --------
     :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeer`
     """
-    cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p is not None else NULL
+    cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = <cyruntime.cudaMemcpy3DPeerParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3DPeer(cyp_ptr)
     return (_cudaError_t(err),)
@@ -26458,7 +26423,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, ::::py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3DAsync`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -26468,7 +26433,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     else:
         pstream = int(cudaStream_t(stream))
     cystream = <cyruntime.cudaStream_t><void_ptr>pstream
-    cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p is not None else NULL
+    cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = <cyruntime.cudaMemcpy3DParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3DAsync(cyp_ptr, cystream)
     return (_cudaError_t(err),)
@@ -26508,7 +26473,7 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
     else:
         pstream = int(cudaStream_t(stream))
     cystream = <cyruntime.cudaStream_t><void_ptr>pstream
-    cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p is not None else NULL
+    cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = <cyruntime.cudaMemcpy3DPeerParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3DPeerAsync(cyp_ptr, cystream)
     return (_cudaError_t(err),)
@@ -26768,42 +26733,7 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
 
 @cython.embedsignature(True)
 def cudaArrayGetSparseProperties(array):
-    """ Returns the layout properties of a sparse CUDA array.
-
-    Returns the layout properties of a sparse CUDA array in
-    `sparseProperties`. If the CUDA array is not allocated with flag
-    :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
-    returned.
-
-    If the returned value in :py:obj:`~.cudaArraySparseProperties.flags`
-    contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then
-    :py:obj:`~.cudaArraySparseProperties.miptailSize` represents the total
-    size of the array. Otherwise, it will be zero. Also, the returned value
-    in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always
-    zero. Note that the `array` must have been allocated using
-    :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA
-    arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`,
-    :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead,
-    :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to
-    obtain the sparse properties of the entire CUDA mipmapped array to
-    which `array` belongs to.
-
-    Parameters
-    ----------
-    array : :py:obj:`~.cudaArray_t`
-        The CUDA array to get the sparse properties of
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
-    sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return the :py:obj:`~.cudaArraySparseProperties`
-
-    See Also
-    --------
-    :py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
-    """
+    """ Returns the layout properties of a sparse CUDA array. """
     cdef cyruntime.cudaArray_t cyarray
     if array is None:
         parray = 0
@@ -26824,42 +26754,7 @@ def cudaArrayGetSparseProperties(array):
 
 @cython.embedsignature(True)
 def cudaMipmappedArrayGetSparseProperties(mipmap):
-    """ Returns the layout properties of a sparse CUDA mipmapped array.
-
-    Returns the sparse array layout properties in `sparseProperties`. If
-    the CUDA mipmapped array is not allocated with flag
-    :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
-    returned.
-
-    For non-layered CUDA mipmapped arrays,
-    :py:obj:`~.cudaArraySparseProperties.miptailSize` returns the size of
-    the mip tail region. The mip tail region includes all mip levels whose
-    width, height or depth is less than that of the tile. For layered CUDA
-    mipmapped arrays, if :py:obj:`~.cudaArraySparseProperties.flags`
-    contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then
-    :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies the size of
-    the mip tail of all layers combined. Otherwise,
-    :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies mip tail
-    size per layer. The returned value of
-    :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is valid only
-    if :py:obj:`~.cudaArraySparseProperties.miptailSize` is non-zero.
-
-    Parameters
-    ----------
-    mipmap : :py:obj:`~.cudaMipmappedArray_t`
-        The CUDA mipmapped array to get the sparse properties of
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
-    sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return :py:obj:`~.cudaArraySparseProperties`
-
-    See Also
-    --------
-    :py:obj:`~.cudaArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
-    """
+    """ Returns the layout properties of a sparse CUDA mipmapped array. """
     cdef cyruntime.cudaMipmappedArray_t cymipmap
     if mipmap is None:
         pmipmap = 0
@@ -26880,7 +26775,47 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
 
 @cython.embedsignature(True)
 def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
-    """ Copies data between host and device.
+    """ Returns the layout properties of a sparse CUDA array.
+
+    Returns the layout properties of a sparse CUDA array in
+    `sparseProperties`. If the CUDA array is not allocated with flag
+    :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
+    returned.
+
+    If the returned value in :py:obj:`~.cudaArraySparseProperties.flags`
+    contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then
+    :py:obj:`~.cudaArraySparseProperties.miptailSize` represents the total
+    size of the array. Otherwise, it will be zero. Also, the returned value
+    in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always
+    zero. Note that the `array` must have been allocated using
+    :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA
+    arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`,
+    :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead,
+    :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to
+    obtain the sparse properties of the entire CUDA mipmapped array to
+    which `array` belongs to.
+
+    Returns the layout properties of a sparse CUDA mipmapped array
+
+    Returns the sparse array layout properties in `sparseProperties`. If
+    the CUDA mipmapped array is not allocated with flag
+    :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
+    returned.
+
+    For non-layered CUDA mipmapped arrays,
+    :py:obj:`~.cudaArraySparseProperties.miptailSize` returns the size of
+    the mip tail region. The mip tail region includes all mip levels whose
+    width, height or depth is less than that of the tile. For layered CUDA
+    mipmapped arrays, if :py:obj:`~.cudaArraySparseProperties.flags`
+    contains :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`, then
+    :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies the size of
+    the mip tail of all layers combined. Otherwise,
+    :py:obj:`~.cudaArraySparseProperties.miptailSize` specifies mip tail
+    size per layer. The returned value of
+    :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is valid only
+    if :py:obj:`~.cudaArraySparseProperties.miptailSize` is non-zero.
+
+    Copies data between host and device
 
     Copies `count` bytes from the memory area pointed to by `src` to the
     memory area pointed to by `dst`, where `kind` specifies the direction
@@ -26898,22 +26833,28 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
 
     Parameters
     ----------
-    dst : Any
-        Destination memory address
-    src : Any
-        Source memory address
-    count : size_t
-        Size in bytes to copy
-    kind : :py:obj:`~.cudaMemcpyKind`
-        Type of transfer
+    dst : Any
+        Destination memory address
+    src : Any
+        Source memory address
+    count : size_t
+        Size in bytes to copy
+    kind : :py:obj:`~.cudaMemcpyKind`
+        Type of transfer
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
+    :py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
+
+    :py:obj:`~.cudaArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
+
     :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy`
     """
     cdef _HelperInputVoidPtrStruct cydstHelper
@@ -27566,7 +27507,7 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
     CUDA array. For CUDA array to CUDA array copies, the element size of
     the two CUDA arrays must match.
 
-    For a given operand, if :py:obj:`~.cudaMemcpy3DOperand`::type is
+    For a given operand, if :py:obj:`~.cudaMemcpy3DOperand.type` is
     specified as :py:obj:`~.cudaMemcpyOperandTypePointer`, then
     :py:obj:`~.cudaMemcpy3DOperand`::op::ptr will be used. The
     :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::ptr field must contain the
@@ -28071,7 +28012,7 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
     """
     with nogil:
         err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0])
@@ -28248,7 +28189,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29824,7 +29765,7 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]):
         pmemPool = int(cudaMemPool_t(memPool))
     cymemPool = <cyruntime.cudaMemPool_t><void_ptr>pmemPool
     cdef cyruntime.cudaMemAccessFlags flags
-    cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location is not None else NULL
+    cdef cyruntime.cudaMemLocation* cylocation_ptr = <cyruntime.cudaMemLocation*>location._pvt_ptr if location is not None else NULL
     with nogil:
         err = cyruntime.cudaMemPoolGetAccess(&flags, cymemPool, cylocation_ptr)
     if err != cyruntime.cudaSuccess:
@@ -29900,7 +29841,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC.
     """
     cdef cudaMemPool_t memPool = cudaMemPool_t()
-    cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps is not None else NULL
+    cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = <cyruntime.cudaMemPoolProps*>poolProps._pvt_ptr if poolProps is not None else NULL
     with nogil:
         err = cyruntime.cudaMemPoolCreate(<cyruntime.cudaMemPool_t*>memPool._pvt_ptr, cypoolProps_ptr)
     if err != cyruntime.cudaSuccess:
@@ -30198,7 +30139,7 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport
         pmemPool = int(cudaMemPool_t(memPool))
     cymemPool = <cyruntime.cudaMemPool_t><void_ptr>pmemPool
     cdef void_ptr ptr = 0
-    cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = exportData._pvt_ptr if exportData is not None else NULL
+    cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = <cyruntime.cudaMemPoolPtrExportData*>exportData._pvt_ptr if exportData is not None else NULL
     with nogil:
         err = cyruntime.cudaMemPoolImportPointer(<void**>&ptr, cymemPool, cyexportData_ptr)
     if err != cyruntime.cudaSuccess:
@@ -31090,9 +31031,9 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
     :py:obj:`~.cudaDestroyTextureObject`, :py:obj:`~.cuTexObjectCreate`
     """
     cdef cudaTextureObject_t pTexObject = cudaTextureObject_t()
-    cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc is not None else NULL
-    cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc is not None else NULL
-    cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc is not None else NULL
+    cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = <cyruntime.cudaResourceDesc*>pResDesc._pvt_ptr if pResDesc is not None else NULL
+    cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = <cyruntime.cudaTextureDesc*>pTexDesc._pvt_ptr if pTexDesc is not None else NULL
+    cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = <cyruntime.cudaResourceViewDesc*>pResViewDesc._pvt_ptr if pResViewDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaCreateTextureObject(<cyruntime.cudaTextureObject_t*>pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -31293,7 +31234,7 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]):
     :py:obj:`~.cudaDestroySurfaceObject`, :py:obj:`~.cuSurfObjectCreate`
     """
     cdef cudaSurfaceObject_t pSurfObject = cudaSurfaceObject_t()
-    cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc is not None else NULL
+    cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = <cyruntime.cudaResourceDesc*>pResDesc._pvt_ptr if pResDesc is not None else NULL
     with nogil:
         err = cyruntime.cudaCreateSurfaceObject(<cyruntime.cudaSurfaceObject_t*>pSurfObject._pvt_ptr, cypResDesc_ptr)
     if err != cyruntime.cudaSuccess:
@@ -31587,7 +31528,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = <cyruntime.cudaKernelNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddKernelNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -31678,7 +31619,7 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara
     else:
         pnode = int(cudaGraphNode_t(node))
     cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = <cyruntime.cudaKernelNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphKernelNodeSetParams(cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -31811,7 +31752,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID,
         phNode = int(cudaGraphNode_t(hNode))
     cyhNode = <cyruntime.cudaGraphNode_t><void_ptr>phNode
     cdef cyruntime.cudaKernelNodeAttrID cyattr = int(attr)
-    cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value is not None else NULL
+    cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = <cyruntime.cudaKernelNodeAttrValue*>value._pvt_ptr if value is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr)
     return (_cudaError_t(err),)
@@ -31883,7 +31824,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams is not None else NULL
+    cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = <cyruntime.cudaMemcpy3DParms*>pCopyParams._pvt_ptr if pCopyParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddMemcpyNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypCopyParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -31897,61 +31838,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 
 @cython.embedsignature(True)
 def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dst, src, size_t count, kind not None : cudaMemcpyKind):
-    """ Creates a 1D memcpy node and adds it to a graph.
-
-    Creates a new 1D memcpy node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies`. It is
-    possible for `numDependencies` to be 0, in which case the node will be
-    placed at the root of the graph. `pDependencies` may not have any
-    duplicate entries. A handle to the new node will be returned in
-    `pGraphNode`.
-
-    When the graph is launched, the node will copy `count` bytes from the
-    memory area pointed to by `src` to the memory area pointed to by `dst`,
-    where `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
-    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
-    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
-    type of transfer is inferred from the pointer values. However,
-    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. Launching a memcpy node with dst and src
-    pointers that do not match the direction of the copy results in an
-    undefined behavior.
-
-    Memcpy nodes have some additional restrictions with regards to managed
-    memory, if the system contains at least one device which has a zero
-    value for the device attribute
-    :py:obj:`~.cudaDevAttrConcurrentManagedAccess`.
-
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    dst : Any
-        Destination memory address
-    src : Any
-        Source memory address
-    count : size_t
-        Size in bytes to copy
-    kind : :py:obj:`~.cudaMemcpyKind`
-        Type of transfer
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
+    """ Creates a 1D memcpy node and adds it to a graph. """
     pDependencies = [] if pDependencies is None else pDependencies
     if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
         raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
@@ -31994,24 +31881,111 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode
 
 @cython.embedsignature(True)
 def cudaGraphMemcpyNodeGetParams(node):
-    """ Returns a memcpy node's parameters.
+    """ Creates a memcpy node to copy to a symbol on the device and adds it to a graph.
+
+    Creates a new memcpy node to copy to `symbol` and adds it to `graph`
+    with `numDependencies` dependencies specified via `pDependencies`. It
+    is possible for `numDependencies` to be 0, in which case the node will
+    be placed at the root of the graph. `pDependencies` may not have any
+    duplicate entries. A handle to the new node will be returned in
+    `pGraphNode`.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `src` to the memory area pointed to by
+    `offset` bytes from the start of symbol `symbol`. The memory areas may
+    not overlap. `symbol` is a variable that resides in global or constant
+    memory space. `kind` can be either :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing.
+
+    Memcpy nodes have some additional restrictions with regards to managed
+    memory, if the system contains at least one device which has a zero
+    value for the device attribute
+    :py:obj:`~.cudaDevAttrConcurrentManagedAccess`.
+
+    Creates a memcpy node to copy from a symbol on the device and adds it
+    to a graph
+
+    Creates a new memcpy node to copy from `symbol` and adds it to `graph`
+    with `numDependencies` dependencies specified via `pDependencies`. It
+    is possible for `numDependencies` to be 0, in which case the node will
+    be placed at the root of the graph. `pDependencies` may not have any
+    duplicate entries. A handle to the new node will be returned in
+    `pGraphNode`.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `offset` bytes from the start of symbol
+    `symbol` to the memory area pointed to by `dst`. The memory areas may
+    not overlap. `symbol` is a variable that resides in global or constant
+    memory space. `kind` can be either :py:obj:`~.cudaMemcpyDeviceToHost`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing.
+
+    Memcpy nodes have some additional restrictions with regards to managed
+    memory, if the system contains at least one device which has a zero
+    value for the device attribute
+    :py:obj:`~.cudaDevAttrConcurrentManagedAccess`.
+
+    Creates a 1D memcpy node and adds it to a graph
+
+    Creates a new 1D memcpy node and adds it to `graph` with
+    `numDependencies` dependencies specified via `pDependencies`. It is
+    possible for `numDependencies` to be 0, in which case the node will be
+    placed at the root of the graph. `pDependencies` may not have any
+    duplicate entries. A handle to the new node will be returned in
+    `pGraphNode`.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `src` to the memory area pointed to by `dst`,
+    where `kind` specifies the direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing. Launching a memcpy node with dst and src
+    pointers that do not match the direction of the copy results in an
+    undefined behavior.
+
+    Memcpy nodes have some additional restrictions with regards to managed
+    memory, if the system contains at least one device which has a zero
+    value for the device attribute
+    :py:obj:`~.cudaDevAttrConcurrentManagedAccess`.
+
+    Returns a memcpy node's parameters
 
     Returns the parameters of memcpy node `node` in `pNodeParams`.
 
     Parameters
     ----------
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the parameters for
+    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
+        Node to get the parameters for
 
     Returns
     -------
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    pNodeParams : :py:obj:`~.cudaMemcpy3DParms`
-        Pointer to return the parameters
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+    pNodeParams : :py:obj:`~.cudaMemcpy3DParms`
+        Pointer to return the parameters
 
     See Also
     --------
+    :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
     :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`
     """
     cdef cyruntime.cudaGraphNode_t cynode
@@ -32062,7 +32036,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms]
     else:
         pnode = int(cudaGraphNode_t(node))
     cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = <cyruntime.cudaMemcpy3DParms*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphMemcpyNodeSetParams(cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -32072,46 +32046,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms]
 
 @cython.embedsignature(True)
 def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : cudaMemcpyKind):
-    """ Sets a memcpy node's parameters to perform a 1-dimensional copy.
-
-    Sets the parameters of memcpy node `node` to the copy described by the
-    provided parameters.
-
-    When the graph is launched, the node will copy `count` bytes from the
-    memory area pointed to by `src` to the memory area pointed to by `dst`,
-    where `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
-    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
-    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
-    type of transfer is inferred from the pointer values. However,
-    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. Launching a memcpy node with dst and src
-    pointers that do not match the direction of the copy results in an
-    undefined behavior.
-
-    Parameters
-    ----------
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to set the parameters for
-    dst : Any
-        Destination memory address
-    src : Any
-        Source memory address
-    count : size_t
-        Size in bytes to copy
-    kind : :py:obj:`~.cudaMemcpyKind`
-        Type of transfer
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`
-    """
+    """ Sets a memcpy node's parameters to perform a 1-dimensional copy. """
     cdef cyruntime.cudaGraphNode_t cynode
     if node is None:
         pnode = 0
@@ -32136,7 +32071,57 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None :
 
 @cython.embedsignature(True)
 def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pMemsetParams : Optional[cudaMemsetParams]):
-    """ Creates a memset node and adds it to a graph.
+    """ Sets a memcpy node's parameters to copy to a symbol on the device.
+
+    Sets the parameters of memcpy node `node` to the copy described by the
+    provided parameters.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `src` to the memory area pointed to by
+    `offset` bytes from the start of symbol `symbol`. The memory areas may
+    not overlap. `symbol` is a variable that resides in global or constant
+    memory space. `kind` can be either :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing.
+
+    Sets a memcpy node's parameters to copy from a symbol on the device
+
+    Sets the parameters of memcpy node `node` to the copy described by the
+    provided parameters.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `offset` bytes from the start of symbol
+    `symbol` to the memory area pointed to by `dst`. The memory areas may
+    not overlap. `symbol` is a variable that resides in global or constant
+    memory space. `kind` can be either :py:obj:`~.cudaMemcpyDeviceToHost`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing.
+
+    Sets a memcpy node's parameters to perform a 1-dimensional copy
+
+    Sets the parameters of memcpy node `node` to the copy described by the
+    provided parameters.
+
+    When the graph is launched, the node will copy `count` bytes from the
+    memory area pointed to by `src` to the memory area pointed to by `dst`,
+    where `kind` specifies the direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
+    Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
+    type of transfer is inferred from the pointer values. However,
+    :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
+    unified virtual addressing. Launching a memcpy node with dst and src
+    pointers that do not match the direction of the copy results in an
+    undefined behavior.
+
+    Creates a memset node and adds it to a graph
 
     Creates a new memset node and adds it to `graph` with `numDependencies`
     dependencies specified via `pDependencies`. It is possible for
@@ -32149,24 +32134,33 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 
     Parameters
     ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    pMemsetParams : :py:obj:`~.cudaMemsetParams`
-        Parameters for the memory set
+    symbol : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
+        Device symbol address
+    src : list[:py:obj:`~.cudaGraphNode_t`]
+        Source memory address
+    count : size_t
+        Size in bytes to copy
+    offset : :py:obj:`~.cudaMemsetParams`
+        Offset from start of symbol in bytes
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
+    node : :py:obj:`~.cudaGraphNode_t`
+        Node to set the parameters for
 
     See Also
     --------
+    :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`
+
+    :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`
+
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphMemcpyNodeGetParams`
+
     :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaGraphMemsetNodeGetParams`, :py:obj:`~.cudaGraphMemsetNodeSetParams`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemcpyNode`
     """
     pDependencies = [] if pDependencies is None else pDependencies
@@ -32192,7 +32186,7 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams is not None else NULL
+    cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = <cyruntime.cudaMemsetParams*>pMemsetParams._pvt_ptr if pMemsetParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddMemsetNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypMemsetParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -32274,7 +32268,7 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams])
     else:
         pnode = int(cudaGraphNode_t(node))
     cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = <cyruntime.cudaMemsetParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphMemsetNodeSetParams(cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -32341,7 +32335,7 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = <cyruntime.cudaHostNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddHostNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -32423,7 +32417,7 @@ def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams])
     else:
         pnode = int(cudaGraphNode_t(node))
     cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = <cyruntime.cudaHostNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphHostNodeSetParams(cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -32630,42 +32624,7 @@ def cudaGraphAddEmptyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
 
 @cython.embedsignature(True)
 def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event):
-    """ Creates an event record node and adds it to a graph.
-
-    Creates a new event record node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
-
-    Each launch of the graph will record `event` to capture execution of
-    the node's dependencies.
-
-    These nodes may not be used in loops or conditionals.
-
-    Parameters
-    ----------
-    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    dependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event for the node
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    phGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
+    """"""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -32710,26 +32669,7 @@ def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphN
 
 @cython.embedsignature(True)
 def cudaGraphEventRecordNodeGetEvent(node):
-    """ Returns the event associated with an event record node.
-
-    Returns the event of event record node `hNode` in `event_out`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the event for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    event_out : :py:obj:`~.cudaEvent_t`
-        Pointer to return the event
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cynode
     if node is None:
         pnode = 0
@@ -32750,26 +32690,7 @@ def cudaGraphEventRecordNodeGetEvent(node):
 
 @cython.embedsignature(True)
 def cudaGraphEventRecordNodeSetEvent(node, event):
-    """ Sets an event record node's event.
-
-    Sets the event of event record node `hNode` to `event`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to set the event for
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event to use
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
-    """
+    """"""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -32795,45 +32716,7 @@ def cudaGraphEventRecordNodeSetEvent(node, event):
 
 @cython.embedsignature(True)
 def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event):
-    """ Creates an event wait node and adds it to a graph.
-
-    Creates a new event wait node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
-
-    The graph node will wait for all work captured in `event`. See
-    :py:obj:`~.cuEventRecord()` for details on what is captured by an
-    event. The synchronization will be performed efficiently on the device
-    when applicable. `event` may be from a different context or device than
-    the launch stream.
-
-    These nodes may not be used in loops or conditionals.
-
-    Parameters
-    ----------
-    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    dependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event for the node
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    phGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
+    """"""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -32878,26 +32761,7 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNod
 
 @cython.embedsignature(True)
 def cudaGraphEventWaitNodeGetEvent(node):
-    """ Returns the event associated with an event wait node.
-
-    Returns the event of event wait node `hNode` in `event_out`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the event for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    event_out : :py:obj:`~.cudaEvent_t`
-        Pointer to return the event
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cynode
     if node is None:
         pnode = 0
@@ -32918,26 +32782,7 @@ def cudaGraphEventWaitNodeGetEvent(node):
 
 @cython.embedsignature(True)
 def cudaGraphEventWaitNodeSetEvent(node, event):
-    """ Sets an event wait node's event.
-
-    Sets the event of event wait node `hNode` to `event`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to set the event for
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Event to use
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
-    """
+    """"""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -32963,41 +32808,7 @@ def cudaGraphEventWaitNodeSetEvent(node, event):
 
 @cython.embedsignature(True)
 def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]):
-    """ Creates an external semaphore signal node and adds it to a graph.
-
-    Creates a new external semaphore signal node and adds it to `graph`
-    with `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `pGraphNode`.
-
-    Performs a signal operation on a set of externally allocated semaphore
-    objects when the node is launched. The operation(s) will occur after
-    all of the node's dependencies have completed.
-
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams`
-        Parameters for the node
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
+    """"""
     pDependencies = [] if pDependencies is None else pDependencies
     if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
         raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
@@ -33021,7 +32832,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tup
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreSignalNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddExternalSemaphoresSignalNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -33035,32 +32846,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tup
 
 @cython.embedsignature(True)
 def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode):
-    """ Returns an external semaphore signal node's parameters.
-
-    Returns the parameters of an external semaphore signal node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
-    the node is destroyed or its parameters are modified, and should not be
-    modified directly. Use
-    :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
-    the parameters of this node.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the parameters for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    params_out : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams`
-        Pointer to return the parameters
-
-    See Also
-    --------
-    :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -33081,27 +32867,7 @@ def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode):
 
 @cython.embedsignature(True)
 def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]):
-    """ Sets an external semaphore signal node's parameters.
-
-    Sets the parameters of an external semaphore signal node `hNode` to
-    `nodeParams`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to set the parameters for
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams`
-        Parameters to copy
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -33110,7 +32876,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[
     else:
         phNode = int(cudaGraphNode_t(hNode))
     cyhNode = <cyruntime.cudaGraphNode_t><void_ptr>phNode
-    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreSignalNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -33120,41 +32886,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[
 
 @cython.embedsignature(True)
 def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]):
-    """ Creates an external semaphore wait node and adds it to a graph.
-
-    Creates a new external semaphore wait node and adds it to `graph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `pGraphNode`.
-
-    Performs a wait operation on a set of externally allocated semaphore
-    objects when the node is launched. The node's dependencies will not be
-    launched until the wait operation has completed.
-
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams`
-        Parameters for the node
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
+    """"""
     pDependencies = [] if pDependencies is None else pDependencies
     if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
         raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
@@ -33178,7 +32910,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreWaitNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddExternalSemaphoresWaitNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -33192,32 +32924,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple
 
 @cython.embedsignature(True)
 def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode):
-    """ Returns an external semaphore wait node's parameters.
-
-    Returns the parameters of an external semaphore wait node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
-    the node is destroyed or its parameters are modified, and should not be
-    modified directly. Use
-    :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
-    the parameters of this node.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the parameters for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    params_out : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams`
-        Pointer to return the parameters
-
-    See Also
-    --------
-    :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -33238,27 +32945,7 @@ def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode):
 
 @cython.embedsignature(True)
 def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]):
-    """ Sets an external semaphore wait node's parameters.
-
-    Sets the parameters of an external semaphore wait node `hNode` to
-    `nodeParams`.
-
-    Parameters
-    ----------
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to set the parameters for
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams`
-        Parameters to copy
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -33267,7 +32954,7 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu
     else:
         phNode = int(cudaGraphNode_t(hNode))
     cyhNode = <cyruntime.cudaGraphNode_t><void_ptr>phNode
-    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreWaitNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -33277,7 +32964,269 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu
 
 @cython.embedsignature(True)
 def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaMemAllocNodeParams]):
-    """ Creates an allocation node and adds it to a graph.
+    """ Creates an allocation node and adds it to a graph."""
+    pDependencies = [] if pDependencies is None else pDependencies
+    if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
+        raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
+    cdef cyruntime.cudaGraph_t cygraph
+    if graph is None:
+        pgraph = 0
+    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
+        pgraph = int(graph)
+    else:
+        pgraph = int(cudaGraph_t(graph))
+    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
+    cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t()
+    cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL
+    if len(pDependencies) > 1:
+        cypDependencies = <cyruntime.cudaGraphNode_t*> calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t))
+        if cypDependencies is NULL:
+            raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t)))
+        else:
+            for idx in range(len(pDependencies)):
+                cypDependencies[idx] = <cyruntime.cudaGraphNode_t>(<cudaGraphNode_t>pDependencies[idx])._pvt_ptr[0]
+    elif len(pDependencies) == 1:
+        cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr  # single element: reuse the wrapper's own pointer, no allocation
+    if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))  # NOTE(review): raised after the calloc above with no free on this path
+    cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = <cyruntime.cudaMemAllocNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
+    with nogil:
+        err = cyruntime.cudaGraphAddMemAllocNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr)
+    if len(pDependencies) > 1 and cypDependencies is not NULL:  # only the calloc'd array is freed here
+        free(cypDependencies)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)  # no node object is returned on failure
+    return (_cudaError_t_SUCCESS, pGraphNode)
+{{endif}}
+
+{{if 'cudaGraphMemAllocNodeGetParams' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaGraphMemAllocNodeGetParams(node):
+    """ Returns a memory alloc node's parameters."""
+    cdef cyruntime.cudaGraphNode_t cynode
+    if node is None:
+        pnode = 0  # None is passed through as a zero handle
+    elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)):
+        pnode = int(node)
+    else:
+        pnode = int(cudaGraphNode_t(node))  # any other input is coerced through the cudaGraphNode_t constructor
+    cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
+    cdef cudaMemAllocNodeParams params_out = cudaMemAllocNodeParams()
+    with nogil:
+        err = cyruntime.cudaGraphMemAllocNodeGetParams(cynode, <cyruntime.cudaMemAllocNodeParams*>params_out._pvt_ptr)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)  # no params object is returned on failure
+    return (_cudaError_t_SUCCESS, params_out)
+{{endif}}
+
+{{if 'cudaGraphAddMemFreeNode' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dptr):
+    """ Creates a memory free node and adds it to a graph."""
+    pDependencies = [] if pDependencies is None else pDependencies
+    if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
+        raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
+    cdef cyruntime.cudaGraph_t cygraph
+    if graph is None:
+        pgraph = 0
+    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
+        pgraph = int(graph)
+    else:
+        pgraph = int(cudaGraph_t(graph))
+    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
+    cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t()
+    cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL
+    if len(pDependencies) > 1:
+        cypDependencies = <cyruntime.cudaGraphNode_t*> calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t))
+        if cypDependencies is NULL:
+            raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t)))
+        else:
+            for idx in range(len(pDependencies)):
+                cypDependencies[idx] = <cyruntime.cudaGraphNode_t>(<cudaGraphNode_t>pDependencies[idx])._pvt_ptr[0]
+    elif len(pDependencies) == 1:
+        cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr  # single element: reuse the wrapper's own pointer, no allocation
+    if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))  # NOTE(review): raised after the calloc above with no free on this path
+    cdef _HelperInputVoidPtrStruct cydptrHelper
+    cdef void* cydptr = _helper_input_void_ptr(dptr, &cydptrHelper)
+    with nogil:
+        err = cyruntime.cudaGraphAddMemFreeNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr)
+    if len(pDependencies) > 1 and cypDependencies is not NULL:  # only the calloc'd array is freed here
+        free(cypDependencies)
+    _helper_input_void_ptr_free(&cydptrHelper)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)  # no node object is returned on failure
+    return (_cudaError_t_SUCCESS, pGraphNode)
+{{endif}}
+
+{{if 'cudaGraphMemFreeNodeGetParams' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaGraphMemFreeNodeGetParams(node):
+    """ Returns a memory free node's parameters."""
+    cdef cyruntime.cudaGraphNode_t cynode
+    if node is None:
+        pnode = 0  # None is passed through as a zero handle
+    elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)):
+        pnode = int(node)
+    else:
+        pnode = int(cudaGraphNode_t(node))  # any other input is coerced through the cudaGraphNode_t constructor
+    cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
+    cdef void_ptr dptr_out = 0
+    cdef void* cydptr_out_ptr = <void*>&dptr_out  # address of the local, handed to the call below as its output argument
+    with nogil:
+        err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)  # no address is returned on failure
+    return (_cudaError_t_SUCCESS, dptr_out)
+{{endif}}
+
+{{if 'cudaDeviceGraphMemTrim' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaDeviceGraphMemTrim(int device):
+    """ Free unused memory that was cached on the specified device for use with graphs back to the OS."""
+    with nogil:
+        err = cyruntime.cudaDeviceGraphMemTrim(device)
+    return (_cudaError_t(err),)  # one-element tuple holding only the status
+{{endif}}
+
+{{if 'cudaDeviceGetGraphMemAttribute' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType):
+    """ Query asynchronous allocation attributes related to graphs."""
+    cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr)
+    cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, 0, is_getter=True)
+    cdef void* cyvalue_ptr = <void*><void_ptr>cyvalue.cptr  # the helper's .cptr is passed to the call as the value argument
+    with nogil:
+        err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)  # no value is returned on failure
+    return (_cudaError_t_SUCCESS, cyvalue.pyObj())
+{{endif}}
+
+{{if 'cudaDeviceSetGraphMemAttribute' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType, value):
+    """ Set asynchronous allocation attributes related to graphs."""
+    cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr)
+    cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, value, is_getter=False)
+    cdef void* cyvalue_ptr = <void*><void_ptr>cyvalue.cptr  # the helper's .cptr is passed to the call as the value argument
+    with nogil:
+        err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, cyvalue_ptr)
+    return (_cudaError_t(err),)  # one-element tuple holding only the status
+{{endif}}
+
+{{if 'cudaGraphClone' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaGraphClone(originalGraph):
+    """ Creates an event record node and adds it to a graph.
+
+    Creates a new event record node and adds it to `hGraph` with
+    `numDependencies` dependencies specified via `dependencies` and event
+    specified in `event`. It is possible for `numDependencies` to be 0, in
+    which case the node will be placed at the root of the graph.
+    `dependencies` may not have any duplicate entries. A handle to the new
+    node will be returned in `phGraphNode`.
+
+    Each launch of the graph will record `event` to capture execution of
+    the node's dependencies.
+
+    These nodes may not be used in loops or conditionals.
+
+    Returns the event associated with an event record node
+
+    Returns the event of event record node `hNode` in `event_out`.
+
+    Sets an event record node's event
+
+    Sets the event of event record node `hNode` to `event`.
+
+    Creates an event wait node and adds it to a graph
+
+    Creates a new event wait node and adds it to `hGraph` with
+    `numDependencies` dependencies specified via `dependencies` and event
+    specified in `event`. It is possible for `numDependencies` to be 0, in
+    which case the node will be placed at the root of the graph.
+    `dependencies` may not have any duplicate entries. A handle to the new
+    node will be returned in `phGraphNode`.
+
+    The graph node will wait for all work captured in `event`. See
+    :py:obj:`~.cuEventRecord()` for details on what is captured by an
+    event. The synchronization will be performed efficiently on the device
+    when applicable. `event` may be from a different context or device than
+    the launch stream.
+
+    These nodes may not be used in loops or conditionals.
+
+    Returns the event associated with an event wait node
+
+    Returns the event of event wait node `hNode` in `event_out`.
+
+    Sets an event wait node's event
+
+    Sets the event of event wait node `hNode` to `event`.
+
+    Creates an external semaphore signal node and adds it to a graph
+
+    Creates a new external semaphore signal node and adds it to `graph`
+    with `numDependencies` dependencies specified via `dependencies` and
+    arguments specified in `nodeParams`. It is possible for
+    `numDependencies` to be 0, in which case the node will be placed at the
+    root of the graph. `dependencies` may not have any duplicate entries. A
+    handle to the new node will be returned in `pGraphNode`.
+
+    Performs a signal operation on a set of externally allocated semaphore
+    objects when the node is launched. The operation(s) will occur after
+    all of the node's dependencies have completed.
+
+    Returns an external semaphore signal node's parameters
+
+    Returns the parameters of an external semaphore signal node `hNode` in
+    `params_out`. The `extSemArray` and `paramsArray` returned in
+    `params_out`, are owned by the node. This memory remains valid until
+    the node is destroyed or its parameters are modified, and should not be
+    modified directly. Use
+    :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
+    the parameters of this node.
+
+    Sets an external semaphore signal node's parameters
+
+    Sets the parameters of an external semaphore signal node `hNode` to
+    `nodeParams`.
+
+    Creates an external semaphore wait node and adds it to a graph
+
+    Creates a new external semaphore wait node and adds it to `graph` with
+    `numDependencies` dependencies specified via `dependencies` and
+    arguments specified in `nodeParams`. It is possible for
+    `numDependencies` to be 0, in which case the node will be placed at the
+    root of the graph. `dependencies` may not have any duplicate entries. A
+    handle to the new node will be returned in `pGraphNode`.
+
+    Performs a wait operation on a set of externally allocated semaphore
+    objects when the node is launched. The node's dependencies will not be
+    launched until the wait operation has completed.
+
+    Returns an external semaphore wait node's parameters
+
+    Returns the parameters of an external semaphore wait node `hNode` in
+    `params_out`. The `extSemArray` and `paramsArray` returned in
+    `params_out`, are owned by the node. This memory remains valid until
+    the node is destroyed or its parameters are modified, and should not be
+    modified directly. Use
+    :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
+    the parameters of this node.
+
+    Sets an external semaphore wait node's parameters
+
+    Sets the parameters of an external semaphore wait node `hNode` to
+    `nodeParams`.
+
+    Creates an allocation node and adds it to a graph
 
     Creates a new allocation node and adds it to `graph` with
     `numDependencies` dependencies specified via `pDependencies` and
@@ -33329,109 +33278,14 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode
 
     - The graph cannot be cloned.
 
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    nodeParams : :py:obj:`~.cudaMemAllocNodeParams`
-        Parameters for the node
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemAllocNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
-    pDependencies = [] if pDependencies is None else pDependencies
-    if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
-        raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
-    cdef cyruntime.cudaGraph_t cygraph
-    if graph is None:
-        pgraph = 0
-    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
-        pgraph = int(graph)
-    else:
-        pgraph = int(cudaGraph_t(graph))
-    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
-    cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t()
-    cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL
-    if len(pDependencies) > 1:
-        cypDependencies = <cyruntime.cudaGraphNode_t*> calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t))
-        if cypDependencies is NULL:
-            raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t)))
-        else:
-            for idx in range(len(pDependencies)):
-                cypDependencies[idx] = <cyruntime.cudaGraphNode_t>(<cudaGraphNode_t>pDependencies[idx])._pvt_ptr[0]
-    elif len(pDependencies) == 1:
-        cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
-    if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
-    with nogil:
-        err = cyruntime.cudaGraphAddMemAllocNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr)
-    if len(pDependencies) > 1 and cypDependencies is not NULL:
-        free(cypDependencies)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, pGraphNode)
-{{endif}}
-
-{{if 'cudaGraphMemAllocNodeGetParams' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaGraphMemAllocNodeGetParams(node):
-    """ Returns a memory alloc node's parameters.
+    Returns a memory alloc node's parameters
 
     Returns the parameters of a memory alloc node `hNode` in `params_out`.
     The `poolProps` and `accessDescs` returned in `params_out`, are owned
     by the node. This memory remains valid until the node is destroyed. The
     returned parameters must not be modified.
 
-    Parameters
-    ----------
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the parameters for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    params_out : :py:obj:`~.cudaMemAllocNodeParams`
-        Pointer to return the parameters
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`
-    """
-    cdef cyruntime.cudaGraphNode_t cynode
-    if node is None:
-        pnode = 0
-    elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)):
-        pnode = int(node)
-    else:
-        pnode = int(cudaGraphNode_t(node))
-    cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cudaMemAllocNodeParams params_out = cudaMemAllocNodeParams()
-    with nogil:
-        err = cyruntime.cudaGraphMemAllocNodeGetParams(cynode, <cyruntime.cudaMemAllocNodeParams*>params_out._pvt_ptr)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, params_out)
-{{endif}}
-
-{{if 'cudaGraphAddMemFreeNode' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dptr):
-    """ Creates a memory free node and adds it to a graph.
+    Creates a memory free node and adds it to a graph
 
     Creates a new memory free node and adds it to `graph` with
     `numDependencies` dependencies specified via `pDependencies` and
@@ -33461,138 +33315,18 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_
 
     - The graph cannot be cloned.
 
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to which to add the node
-    pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
-        Dependencies of the node
-    numDependencies : size_t
-        Number of dependencies
-    dptr : Any
-        Address of memory to free
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
-    pGraphNode : :py:obj:`~.cudaGraphNode_t`
-        Returns newly created node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
-    """
-    pDependencies = [] if pDependencies is None else pDependencies
-    if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies):
-        raise TypeError("Argument 'pDependencies' is not instance of type (expected tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or list[cyruntime.cudaGraphNode_t,driver.CUgraphNode]")
-    cdef cyruntime.cudaGraph_t cygraph
-    if graph is None:
-        pgraph = 0
-    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
-        pgraph = int(graph)
-    else:
-        pgraph = int(cudaGraph_t(graph))
-    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
-    cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t()
-    cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL
-    if len(pDependencies) > 1:
-        cypDependencies = <cyruntime.cudaGraphNode_t*> calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t))
-        if cypDependencies is NULL:
-            raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t)))
-        else:
-            for idx in range(len(pDependencies)):
-                cypDependencies[idx] = <cyruntime.cudaGraphNode_t>(<cudaGraphNode_t>pDependencies[idx])._pvt_ptr[0]
-    elif len(pDependencies) == 1:
-        cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
-    if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef _HelperInputVoidPtrStruct cydptrHelper
-    cdef void* cydptr = _helper_input_void_ptr(dptr, &cydptrHelper)
-    with nogil:
-        err = cyruntime.cudaGraphAddMemFreeNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr)
-    if len(pDependencies) > 1 and cypDependencies is not NULL:
-        free(cypDependencies)
-    _helper_input_void_ptr_free(&cydptrHelper)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, pGraphNode)
-{{endif}}
-
-{{if 'cudaGraphMemFreeNodeGetParams' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaGraphMemFreeNodeGetParams(node):
-    """ Returns a memory free node's parameters.
+    Returns a memory free node's parameters
 
     Returns the address of a memory free node `hNode` in `dptr_out`.
 
-    Parameters
-    ----------
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node to get the parameters for
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    dptr_out : Any
-        Pointer to return the device address
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`
-    """
-    cdef cyruntime.cudaGraphNode_t cynode
-    if node is None:
-        pnode = 0
-    elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)):
-        pnode = int(node)
-    else:
-        pnode = int(cudaGraphNode_t(node))
-    cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef void_ptr dptr_out = 0
-    cdef void* cydptr_out_ptr = <void*>&dptr_out
-    with nogil:
-        err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, dptr_out)
-{{endif}}
-
-{{if 'cudaDeviceGraphMemTrim' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaDeviceGraphMemTrim(int device):
-    """ Free unused memory that was cached on the specified device for use with graphs back to the OS.
+    Free unused memory that was cached on the specified device for use with
+    graphs back to the OS.
 
     Blocks which are not in use by a graph that is either currently
     executing or scheduled to execute are freed back to the operating
     system.
 
-    Parameters
-    ----------
-    device : int
-        The device for which cached memory should be freed.
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
-    """
-    with nogil:
-        err = cyruntime.cudaDeviceGraphMemTrim(device)
-    return (_cudaError_t(err),)
-{{endif}}
-
-{{if 'cudaDeviceGetGraphMemAttribute' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType):
-    """ Query asynchronous allocation attributes related to graphs.
+    Query asynchronous allocation attributes related to graphs
 
     Valid attributes are:
 
@@ -33611,39 +33345,7 @@ def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri
       memory, in bytes, currently allocated for use by the CUDA graphs
       asynchronous allocator.
 
-    Parameters
-    ----------
-    device : int
-        Specifies the scope of the query
-    attr : :py:obj:`~.cudaGraphMemAttributeType`
-        attribute to get
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
-    value : Any
-        retrieved value
-
-    See Also
-    --------
-    :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
-    """
-    cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr)
-    cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, 0, is_getter=True)
-    cdef void* cyvalue_ptr = <void*><void_ptr>cyvalue.cptr
-    with nogil:
-        err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, cyvalue.pyObj())
-{{endif}}
-
-{{if 'cudaDeviceSetGraphMemAttribute' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttributeType, value):
-    """ Set asynchronous allocation attributes related to graphs.
+    Set asynchronous allocation attributes related to graphs
 
     Valid attributes are:
 
@@ -33655,37 +33357,7 @@ def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri
       memory, in bytes, currently allocated for use by the CUDA graphs
       asynchronous allocator.
 
-    Parameters
-    ----------
-    device : int
-        Specifies the scope of the query
-    attr : :py:obj:`~.cudaGraphMemAttributeType`
-        attribute to get
-    value : Any
-        pointer to value to set
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
-
-    See Also
-    --------
-    :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
-    """
-    cdef cyruntime.cudaGraphMemAttributeType cyattr = int(attr)
-    cdef _HelperCUgraphMem_attribute cyvalue = _HelperCUgraphMem_attribute(attr, value, is_getter=False)
-    cdef void* cyvalue_ptr = <void*><void_ptr>cyvalue.cptr
-    with nogil:
-        err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, cyvalue_ptr)
-    return (_cudaError_t(err),)
-{{endif}}
-
-{{if 'cudaGraphClone' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaGraphClone(originalGraph):
-    """ Clones a graph.
+    Clones a graph
 
     This function creates a copy of `originalGraph` and returns it in
     `pGraphClone`. All parameters are copied into the cloned graph. The
@@ -33697,18 +33369,75 @@ def cudaGraphClone(originalGraph):
 
     Parameters
     ----------
-    originalGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to clone
+    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
+        Graph to which to add the node
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
-    pGraphClone : :py:obj:`~.cudaGraph_t`
-        Returns newly created cloned graph
+    phGraphNode : :py:obj:`~.cudaGraph_t`
+        Returns newly created node
 
     See Also
     --------
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
+
+    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
+
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
+
+    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`
+
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
+
+    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
+
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeGetParams`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
+
+    :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`
+
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemAllocNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`
+
+    :py:obj:`~.cudaGraphAddNode`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphDestroyNode`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`
+
+    :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaGraphMemFreeNodeGetParams`
+
+    :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
+
+    :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
+
+    :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGraphMemTrim`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
+
     :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphNodeFindInClone`
 
     Notes
@@ -34876,6 +34605,27 @@ def cudaGraphInstantiate(graph, unsigned long long flags):
 
 @cython.embedsignature(True)
 def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
+    """ Creates an executable graph from a graph."""
+    cdef cyruntime.cudaGraph_t cygraph
+    if graph is None:
+        pgraph = 0
+    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
+        pgraph = int(graph)
+    else:
+        pgraph = int(cudaGraph_t(graph))
+    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
+    cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t()
+    with nogil:
+        err = cyruntime.cudaGraphInstantiateWithFlags(<cyruntime.cudaGraphExec_t*>pGraphExec._pvt_ptr, cygraph, flags)
+    if err != cyruntime.cudaSuccess:
+        return (_cudaError_t(err), None)
+    return (_cudaError_t_SUCCESS, pGraphExec)
+{{endif}}
+
+{{if 'cudaGraphInstantiateWithParams' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraphInstantiateParams]):
     """ Creates an executable graph from a graph.
 
     Instantiates `graph` as an executable graph. The graph is validated for
@@ -34943,46 +34693,7 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
       - Both operands must be accessible from the current device, and the
         current device must match the device of other nodes in the graph.
 
-    Parameters
-    ----------
-    graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        Graph to instantiate
-    flags : unsigned long long
-        Flags to control instantiation. See
-        :py:obj:`~.CUgraphInstantiate_flags`.
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-    pGraphExec : :py:obj:`~.cudaGraphExec_t`
-        Returns instantiated graph
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy`
-    """
-    cdef cyruntime.cudaGraph_t cygraph
-    if graph is None:
-        pgraph = 0
-    elif isinstance(graph, (cudaGraph_t,driver.CUgraph)):
-        pgraph = int(graph)
-    else:
-        pgraph = int(cudaGraph_t(graph))
-    cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
-    cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t()
-    with nogil:
-        err = cyruntime.cudaGraphInstantiateWithFlags(<cyruntime.cudaGraphExec_t*>pGraphExec._pvt_ptr, cygraph, flags)
-    if err != cyruntime.cudaSuccess:
-        return (_cudaError_t(err), None)
-    return (_cudaError_t_SUCCESS, pGraphExec)
-{{endif}}
-
-{{if 'cudaGraphInstantiateWithParams' in found_functions}}
-
-@cython.embedsignature(True)
-def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraphInstantiateParams]):
-    """ Creates an executable graph from a graph.
+    Creates an executable graph from a graph
 
     Instantiates `graph` as an executable graph according to the
     `instantiateParams` structure. The graph is validated for any
@@ -35094,18 +34805,22 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
     ----------
     graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
         Graph to instantiate
-    instantiateParams : :py:obj:`~.cudaGraphInstantiateParams`
-        Instantiation parameters
+    instantiateParams : :py:obj:`~.cudaGraphInstantiateParams`
+        Instantiation parameters. See
+        :py:obj:`~.cudaGraphInstantiateParams`.
 
     Returns
     -------
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphExec : :py:obj:`~.cudaGraphExec_t`
         Returns instantiated graph
 
     See Also
     --------
+    :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy`
+
     :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphInstantiateWithFlags`, :py:obj:`~.cudaGraphExecDestroy`
     """
     cdef cyruntime.cudaGraph_t cygraph
@@ -35117,7 +34832,7 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
         pgraph = int(cudaGraph_t(graph))
     cygraph = <cyruntime.cudaGraph_t><void_ptr>pgraph
     cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t()
-    cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams is not None else NULL
+    cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = <cyruntime.cudaGraphInstantiateParams*>instantiateParams._pvt_ptr if instantiateParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphInstantiateWithParams(<cyruntime.cudaGraphExec_t*>pGraphExec._pvt_ptr, cygraph, cyinstantiateParams_ptr)
     if err != cyruntime.cudaSuccess:
@@ -35244,7 +34959,7 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = <cyruntime.cudaKernelNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecKernelNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35309,7 +35024,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = <cyruntime.cudaMemcpy3DParms*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecMemcpyNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35319,49 +35034,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 
 @cython.embedsignature(True)
 def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, kind not None : cudaMemcpyKind):
-    """ Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy.
-
-    Updates the work represented by `node` in `hGraphExec` as though `node`
-    had contained the given params at instantiation. `node` must remain in
-    the graph which was used to instantiate `hGraphExec`. Changed edges to
-    and from `node` are ignored.
-
-    `src` and `dst` must be allocated from the same contexts as the
-    original source and destination memory. The instantiation-time memory
-    operands must be 1-dimensional. Zero-length operations are not
-    supported.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
-
-    Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
-    mappings changed or the original memory operands are multidimensional.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Memcpy node from the graph which was used to instantiate graphExec
-    dst : Any
-        Destination memory address
-    src : Any
-        Source memory address
-    count : size_t
-        Size in bytes to copy
-    kind : :py:obj:`~.cudaMemcpyKind`
-        Type of transfer
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNode1D`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """ Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy."""
     cdef cyruntime.cudaGraphNode_t cynode
     if node is None:
         pnode = 0
@@ -35394,7 +35067,66 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count,
 
 @cython.embedsignature(True)
 def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cudaMemsetParams]):
-    """ Sets the parameters for a memset node in the given graphExec.
+    """ Sets the parameters for a memset node in the given graphExec.
+
+    Updates the work represented by `node` in `hGraphExec` as though `node`
+    had contained the given params at instantiation. `node` must remain in
+    the graph which was used to instantiate `hGraphExec`. Changed edges to
+    and from `node` are ignored.
+
+    `src` and `symbol` must be allocated from the same contexts as the
+    original source and destination memory. The instantiation-time memory
+    operands must be 1-dimensional. Zero-length operations are not
+    supported.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `node` is also not modified by this call.
+
+    Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
+    mappings changed or the original memory operands are multidimensional.
+
+    Sets the parameters for a memcpy node in the given graphExec to copy
+    from a symbol on the device
+
+    Updates the work represented by `node` in `hGraphExec` as though `node`
+    had contained the given params at instantiation. `node` must remain in
+    the graph which was used to instantiate `hGraphExec`. Changed edges to
+    and from `node` are ignored.
+
+    `symbol` and `dst` must be allocated from the same contexts as the
+    original source and destination memory. The instantiation-time memory
+    operands must be 1-dimensional. Zero-length operations are not
+    supported.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `node` is also not modified by this call.
+
+    Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
+    mappings changed or the original memory operands are multidimensional.
+
+    Sets the parameters for a memcpy node in the given graphExec to perform
+    a 1-dimensional copy
+
+    Updates the work represented by `node` in `hGraphExec` as though `node`
+    had contained the given params at instantiation. `node` must remain in
+    the graph which was used to instantiate `hGraphExec`. Changed edges to
+    and from `node` are ignored.
+
+    `src` and `dst` must be allocated from the same contexts as the
+    original source and destination memory. The instantiation-time memory
+    operands must be 1-dimensional. Zero-length operations are not
+    supported.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `node` is also not modified by this call.
+
+    Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
+    mappings changed or the original memory operands are multidimensional.
+
+    Sets the parameters for a memset node in the given graphExec.
 
     Updates the work represented by `node` in `hGraphExec` as though `node`
     had contained `pNodeParams` at instantiation. `node` must remain in the
@@ -35425,17 +35157,26 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
         The executable graph in which to set the specified node
     node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Memset node from the graph which was used to instantiate graphExec
-    pNodeParams : :py:obj:`~.cudaMemsetParams`
-        Updated Parameters to set
+        Memset node from the graph which was used to instantiate graphExec
+    pNodeParams : :py:obj:`~.cudaMemsetParams`
+        Updated Parameters to set
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
+    :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeToSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNodeFromSymbol`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParamsFromSymbol`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParamsToSymbol`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemcpyNode1D`, :py:obj:`~.cudaGraphMemcpyNodeSetParams`, :py:obj:`~.cudaGraphMemcpyNodeSetParams1D`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
     :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddMemsetNode`, :py:obj:`~.cudaGraphMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
     """
     cdef cyruntime.cudaGraphNode_t cynode
@@ -35454,7 +35195,7 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = <cyruntime.cudaMemsetParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecMemsetNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35509,7 +35250,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams is not None else NULL
+    cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = <cyruntime.cudaHostNodeParams*>pNodeParams._pvt_ptr if pNodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecHostNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35519,43 +35260,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda
 
 @cython.embedsignature(True)
 def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph):
-    """ Updates node parameters in the child graph node in the given graphExec.
-
-    Updates the work represented by `node` in `hGraphExec` as though the
-    nodes contained in `node's` graph had the parameters contained in
-    `childGraph's` nodes at instantiation. `node` must remain in the graph
-    which was used to instantiate `hGraphExec`. Changed edges to and from
-    `node` are ignored.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
-
-    The topology of `childGraph`, as well as the node insertion order, must
-    match that of the graph contained in `node`. See
-    :py:obj:`~.cudaGraphExecUpdate()` for a list of restrictions on what
-    can be updated in an instantiated graph. The update is recursive, so
-    child graph nodes contained within the top level child graph will also
-    be updated.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Host node from the graph which was used to instantiate graphExec
-    childGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        The graph supplying the updated parameters
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphChildGraphNodeGetGraph`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """ Updates node parameters in the child graph node in the given graphExec."""
     cdef cyruntime.cudaGraph_t cychildGraph
     if childGraph is None:
         pchildGraph = 0
@@ -35589,36 +35294,7 @@ def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph):
 
 @cython.embedsignature(True)
 def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
-    """ Sets the event for an event record node in the given graphExec.
-
-    Sets the event of an event record node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Event record node from the graph from which graphExec was
-        instantiated
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Updated event to use
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """ Sets the event for an event record node in the given graphExec."""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -35652,36 +35328,7 @@ def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
 
 @cython.embedsignature(True)
 def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
-    """ Sets the event for an event wait node in the given graphExec.
-
-    Sets the event of an event wait node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Event wait node from the graph from which graphExec was
-        instantiated
-    event : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
-        Updated event to use
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """ Sets the event for an event wait node in the given graphExec."""
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
         pevent = 0
@@ -35715,40 +35362,7 @@ def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
 
 @cython.embedsignature(True)
 def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]):
-    """ Sets the parameters for an external semaphore signal node in the given graphExec.
-
-    Sets the parameters of an external semaphore signal node in an
-    executable graph `hGraphExec`. The node is identified by the
-    corresponding node `hNode` in the non-executable graph, from which the
-    executable graph was instantiated.
-
-    `hNode` must not have been removed from the original graph.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
-
-    Changing `nodeParams->numExtSems` is not supported.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        semaphore signal node from the graph from which graphExec was
-        instantiated
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams`
-        Updated Parameters to set
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """ Sets the parameters for an external semaphore signal node in the given graphExec."""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -35765,7 +35379,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreSignalNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35775,40 +35389,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa
 
 @cython.embedsignature(True)
 def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]):
-    """ Sets the parameters for an external semaphore wait node in the given graphExec.
-
-    Sets the parameters of an external semaphore wait node in an executable
-    graph `hGraphExec`. The node is identified by the corresponding node
-    `hNode` in the non-executable graph, from which the executable graph
-    was instantiated.
-
-    `hNode` must not have been removed from the original graph.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
-
-    Changing `nodeParams->numExtSems` is not supported.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        semaphore wait node from the graph from which graphExec was
-        instantiated
-    nodeParams : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams`
-        Updated Parameters to set
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -35825,7 +35406,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara
     else:
         phGraphExec = int(cudaGraphExec_t(hGraphExec))
     cyhGraphExec = <cyruntime.cudaGraphExec_t><void_ptr>phGraphExec
-    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = <cyruntime.cudaExternalSemaphoreWaitNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -35835,44 +35416,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara
 
 @cython.embedsignature(True)
 def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
-    """ Enables or disables the specified node in the given graphExec.
-
-    Sets `hNode` to be either enabled or disabled. Disabled nodes are
-    functionally equivalent to empty nodes until they are reenabled.
-    Existing node parameters are not affected by disabling/enabling the
-    node.
-
-    The node is identified by the corresponding node `hNode` in the non-
-    executable graph, from which the executable graph was instantiated.
-
-    `hNode` must not have been removed from the original graph.
-
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node from the graph from which graphExec was instantiated
-    isEnabled : unsigned int
-        Node is enabled if != 0, otherwise the node is disabled
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeGetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch`
-
-    Notes
-    -----
-    Currently only kernel, memset and memcpy nodes are supported.
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -35898,37 +35442,7 @@ def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
 
 @cython.embedsignature(True)
 def cudaGraphNodeGetEnabled(hGraphExec, hNode):
-    """ Query whether a node in the given graphExec is enabled.
-
-    Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled.
-
-    The node is identified by the corresponding node `hNode` in the non-
-    executable graph, from which the executable graph was instantiated.
-
-    `hNode` must not have been removed from the original graph.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The executable graph in which to set the specified node
-    hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t`
-        Node from the graph from which graphExec was instantiated
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-    isEnabled : unsigned int
-        Location to return the enabled status of the node
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphNodeSetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch`
-
-    Notes
-    -----
-    Currently only kernel, memset and memcpy nodes are supported.
-    """
+    """"""
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
         phNode = 0
@@ -35957,7 +35471,106 @@ def cudaGraphNodeGetEnabled(hGraphExec, hNode):
 
 @cython.embedsignature(True)
 def cudaGraphExecUpdate(hGraphExec, hGraph):
-    """ Check whether an executable graph can be updated with a graph and perform the update if possible.
+    """ Updates node parameters in the child graph node in the given graphExec.
+
+    Updates the work represented by `node` in `hGraphExec` as though the
+    nodes contained in `node's` graph had the parameters contained in
+    `childGraph's` nodes at instantiation. `node` must remain in the graph
+    which was used to instantiate `hGraphExec`. Changed edges to and from
+    `node` are ignored.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `node` is also not modified by this call.
+
+    The topology of `childGraph`, as well as the node insertion order, must
+    match that of the graph contained in `node`. See
+    :py:obj:`~.cudaGraphExecUpdate()` for a list of restrictions on what
+    can be updated in an instantiated graph. The update is recursive, so
+    child graph nodes contained within the top level child graph will also
+    be updated.
+
+    Sets the event for an event record node in the given graphExec
+
+    Sets the event of an event record node in an executable graph
+    `hGraphExec`. The node is identified by the corresponding node `hNode`
+    in the non-executable graph, from which the executable graph was
+    instantiated.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `hNode` is also not modified by this call.
+
+    Sets the event for an event wait node in the given graphExec
+
+    Sets the event of an event wait node in an executable graph
+    `hGraphExec`. The node is identified by the corresponding node `hNode`
+    in the non-executable graph, from which the executable graph was
+    instantiated.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `hNode` is also not modified by this call.
+
+    Sets the parameters for an external semaphore signal node in the given
+    graphExec
+
+    Sets the parameters of an external semaphore signal node in an
+    executable graph `hGraphExec`. The node is identified by the
+    corresponding node `hNode` in the non-executable graph, from which the
+    executable graph was instantiated.
+
+    `hNode` must not have been removed from the original graph.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `hNode` is also not modified by this call.
+
+    Changing `nodeParams->numExtSems` is not supported.
+
+    Sets the parameters for an external semaphore wait node in the given
+    graphExec
+
+    Sets the parameters of an external semaphore wait node in an executable
+    graph `hGraphExec`. The node is identified by the corresponding node
+    `hNode` in the non-executable graph, from which the executable graph
+    was instantiated.
+
+    `hNode` must not have been removed from the original graph.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `hNode` is also not modified by this call.
+
+    Changing `nodeParams->numExtSems` is not supported.
+
+    Enables or disables the specified node in the given graphExec
+
+    Sets `hNode` to be either enabled or disabled. Disabled nodes are
+    functionally equivalent to empty nodes until they are reenabled.
+    Existing node parameters are not affected by disabling/enabling the
+    node.
+
+    The node is identified by the corresponding node `hNode` in the non-
+    executable graph, from which the executable graph was instantiated.
+
+    `hNode` must not have been removed from the original graph.
+
+    The modifications only affect future launches of `hGraphExec`. Already
+    enqueued or running launches of `hGraphExec` are not affected by this
+    call. `hNode` is also not modified by this call.
+
+    Query whether a node in the given graphExec is enabled
+
+    Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled.
+
+    The node is identified by the corresponding node `hNode` in the non-
+    executable graph, from which the executable graph was instantiated.
+
+    `hNode` must not have been removed from the original graph.
+
+    Check whether an executable graph can be updated with a graph and
+    perform the update if possible
 
     Updates the node parameters in the instantiated graph specified by
     `hGraphExec` with the node parameters in a topologically identical
@@ -36090,20 +35703,47 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
     Parameters
     ----------
     hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        The instantiated graph to be updated
-    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
-        The graph containing the updated parameters
+        The instantiated graph to be updated
+    hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
+        The graph containing the updated parameters
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorGraphExecUpdateFailure`,
-    resultInfo : :py:obj:`~.cudaGraphExecUpdateResultInfo`
-        the error info structure
+    resultInfo : :py:obj:`~.cudaGraphExecUpdateResultInfo`
+        the error info structure
 
     See Also
     --------
+    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphChildGraphNodeGetGraph`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventRecordNode`, :py:obj:`~.cudaGraphEventRecordNodeGetEvent`, :py:obj:`~.cudaGraphEventWaitNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddEventWaitNode`, :py:obj:`~.cudaGraphEventWaitNodeGetEvent`, :py:obj:`~.cudaGraphEventRecordNodeSetEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresSignalNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresWaitNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphExecNodeSetParams`, :py:obj:`~.cudaGraphAddExternalSemaphoresWaitNode`, :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync`, :py:obj:`~.cudaGraphExecKernelNodeSetParams`, :py:obj:`~.cudaGraphExecMemcpyNodeSetParams`, :py:obj:`~.cudaGraphExecMemsetNodeSetParams`, :py:obj:`~.cudaGraphExecHostNodeSetParams`, :py:obj:`~.cudaGraphExecChildGraphNodeSetParams`, :py:obj:`~.cudaGraphExecEventRecordNodeSetEvent`, :py:obj:`~.cudaGraphExecEventWaitNodeSetEvent`, :py:obj:`~.cudaGraphExecExternalSemaphoresSignalNodeSetParams`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate`
+
+    :py:obj:`~.cudaGraphNodeGetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch`
+
+    :py:obj:`~.cudaGraphNodeSetEnabled`, :py:obj:`~.cudaGraphExecUpdate`, :py:obj:`~.cudaGraphInstantiate` :py:obj:`~.cudaGraphLaunch`
+
     :py:obj:`~.cudaGraphInstantiate`
+
+    Notes
+    -----
+    Currently only kernel, memset and memcpy nodes are supported.
+
+    Currently only kernel, memset and memcpy nodes are supported.
     """
     cdef cyruntime.cudaGraph_t cyhGraph
     if hGraph is None:
@@ -36133,30 +35773,7 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
 
 @cython.embedsignature(True)
 def cudaGraphUpload(graphExec, stream):
-    """ Uploads an executable graph in a stream.
-
-    Uploads `hGraphExec` to the device in `hStream` without executing it.
-    Uploads of the same `hGraphExec` will be serialized. Each upload is
-    ordered behind both any previous work in `hStream` and any previous
-    launches of `hGraphExec`. Uses memory cached by `stream` to back the
-    allocations owned by `graphExec`.
-
-    Parameters
-    ----------
-    hGraphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        Executable graph to upload
-    hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
-        Stream in which to upload the graph
-
-    Returns
-    -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
-
-    See Also
-    --------
-    :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy`
-    """
+    """"""
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
         pstream = 0
@@ -36182,7 +35799,15 @@ def cudaGraphUpload(graphExec, stream):
 
 @cython.embedsignature(True)
 def cudaGraphLaunch(graphExec, stream):
-    """ Launches an executable graph in a stream.
+    """ Uploads an executable graph in a stream.
+
+    Uploads `hGraphExec` to the device in `hStream` without executing it.
+    Uploads of the same `hGraphExec` will be serialized. Each upload is
+    ordered behind both any previous work in `hStream` and any previous
+    launches of `hGraphExec`. Uses memory cached by `stream` to back the
+    allocations owned by `graphExec`.
+
+    Launches an executable graph in a stream
 
     Executes `graphExec` in `stream`. Only one instance of `graphExec` may
     be executing at a time. Each launch is ordered behind both any previous
@@ -36197,18 +35822,21 @@ def cudaGraphLaunch(graphExec, stream):
 
     Parameters
     ----------
-    graphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
-        Executable graph to launch
-    stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
-        Stream in which to launch the graph
+    graphExec : :py:obj:`~.CUgraphExec` or :py:obj:`~.cudaGraphExec_t`
+        Executable graph to launch
+    stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+        Stream in which to launch the graph
 
     Returns
     -------
     cudaError_t
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
+    :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphLaunch`, :py:obj:`~.cudaGraphExecDestroy`
+
     :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphUpload`, :py:obj:`~.cudaGraphExecDestroy`
     """
     cdef cyruntime.cudaStream_t cystream
@@ -36669,7 +36297,7 @@ def cudaGraphAddNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | li
     elif len(pDependencies) == 1:
         cypDependencies = <cyruntime.cudaGraphNode_t*>(<cudaGraphNode_t>pDependencies[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = <cyruntime.cudaGraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddNode(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -36765,7 +36393,7 @@ def cudaGraphAddNode_v2(graph, pDependencies : Optional[tuple[cudaGraphNode_t] |
         cydependencyData = (<cudaGraphEdgeData>dependencyData[0])._pvt_ptr
     if numDependencies > <size_t>len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies))
     if numDependencies > <size_t>len(dependencyData): raise RuntimeError("List is too small: " + str(len(dependencyData)) + " < " + str(numDependencies))
-    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = <cyruntime.cudaGraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphAddNode_v2(<cyruntime.cudaGraphNode_t*>pGraphNode._pvt_ptr, cygraph, cypDependencies, cydependencyData, numDependencies, cynodeParams_ptr)
     if len(pDependencies) > 1 and cypDependencies is not NULL:
@@ -36815,7 +36443,7 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]):
     else:
         pnode = int(cudaGraphNode_t(node))
     cynode = <cyruntime.cudaGraphNode_t><void_ptr>pnode
-    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = <cyruntime.cudaGraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphNodeSetParams(cynode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -36875,7 +36503,7 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN
     else:
         pgraphExec = int(cudaGraphExec_t(graphExec))
     cygraphExec = <cyruntime.cudaGraphExec_t><void_ptr>pgraphExec
-    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams is not None else NULL
+    cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = <cyruntime.cudaGraphNodeParams*>nodeParams._pvt_ptr if nodeParams is not None else NULL
     with nogil:
         err = cyruntime.cudaGraphExecNodeSetParams(cygraphExec, cynode, cynodeParams_ptr)
     return (_cudaError_t(err),)
@@ -37770,7 +37398,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
 def cudaGetExportTable(pExportTableId : Optional[cudaUUID_t]):
     """"""
     cdef void_ptr ppExportTable = 0
-    cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId is not None else NULL
+    cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = <cyruntime.cudaUUID_t*>pExportTableId._pvt_ptr if pExportTableId is not None else NULL
     with nogil:
         err = cyruntime.cudaGetExportTable(<const void**>&ppExportTable, cypExportTableId_ptr)
     if err != cyruntime.cudaSuccess:
@@ -38475,7 +38103,7 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS
         cyconn = <cyruntime.cudaEglStreamConnection*><void_ptr>conn
     else:
         raise TypeError("Argument 'conn' is not instance of type (expected <class 'int, runtime.cudaEglStreamConnection'>, found " + str(type(conn)))
-    cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe is not None else NULL
+    cdef cyruntime.cudaEglFrame* cyeglframe_ptr = <cyruntime.cudaEglFrame*>eglframe._pvt_ptr if eglframe is not None else NULL
     with nogil:
         err = cyruntime.cudaEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream)
     return (_cudaError_t(err),)