From 78b89cb6e77eea22014a9490484dd8e7f680bcec Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba
Date: Tue, 20 May 2025 15:27:04 -0700
Subject: [PATCH 01/65] Propagate generated path finder changes

---
 cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index f5e8ead80f..3e8c63afb9 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -5,7 +5,6 @@
 {{if 'Windows' == platform.system()}}
 import os
 import win32api
-from pywintypes import error
 {{else}}
 cimport cuda.bindings._lib.dlfcn as dlfcn
 {{endif}}

From b423deb3d3b7b840da65463741ab1d81c56af383 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba
Date: Tue, 27 May 2025 14:40:53 -0700
Subject: [PATCH 02/65] Rebase to 13.0 RC14

---
 .../cuda/bindings/_bindings/cydriver.pxd.in   |  134 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   | 1112 +++---
 .../cuda/bindings/_bindings/cynvrtc.pxd.in    |   10 -
 .../cuda/bindings/_bindings/cynvrtc.pyx.in    |   62 -
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |  150 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |  302 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |  150 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |  206 +-
 cuda_bindings/cuda/bindings/_lib/utils.pyx.in |    3 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |  216 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |  190 +-
 cuda_bindings/cuda/bindings/cynvrtc.pxd.in    |   11 +-
 cuda_bindings/cuda/bindings/cynvrtc.pyx.in    |   12 -
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |  158 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |  200 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |  165 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |  187 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |  154 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     | 3164 ++++++---------
 cuda_bindings/cuda/bindings/nvrtc.pyx.in      |   66 +-
 cuda_bindings/cuda/bindings/runtime.pxd.in    |  338 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    | 3509 ++++++++---------
 cuda_bindings/docs/source/module/driver.rst   |  211 +-
 cuda_bindings/docs/source/module/nvrtc.rst    |    9 +-
 cuda_bindings/docs/source/module/runtime.rst  |  183 +-
 .../0_Introduction/clock_nvrtc_test.py        |    4 +-
 .../simpleCubemapTexture_test.py              |    5 +-
 .../examples/0_Introduction/simpleP2P_test.py |    5 +-
 .../0_Introduction/simpleZeroCopy_test.py     |    5 +-
 .../0_Introduction/systemWideAtomics_test.py  |    8 +-
 .../0_Introduction/vectorAddDrv_test.py       |    6 +-
 .../0_Introduction/vectorAddMMAP_test.py      |    6 +-
 .../streamOrderedAllocation_test.py           |    5 +-
 .../globalToShmemAsyncCopy_test.py            |    5 +-
 .../3_CUDA_Features/simpleCudaGraphs_test.py  |    5 +-
 .../conjugateGradientMultiBlockCG_test.py     |    5 +-
 cuda_bindings/examples/common/common.py       |    6 +-
 cuda_bindings/examples/common/helper_cuda.py  |    6 +-
 .../examples/common/helper_string.py          |    2 +-
 .../examples/extra/isoFDModelling_test.py     |    7 +-
 .../examples/extra/jit_program_test.py        |    7 +-
 .../examples/extra/numba_emm_plugin.py        |    6 +-
 cuda_bindings/tests/cython/test_ccuda.pyx     |    8 +-
 cuda_bindings/tests/cython/test_ccudart.pyx   |   12 +-
 .../cython/test_interoperability_cython.pyx   |   24 +-
 cuda_bindings/tests/test_cuda.py              |   64 +-
 cuda_bindings/tests/test_cudart.py            |  146 +-
 cuda_bindings/tests/test_interoperability.py  |   24 +-
 cuda_bindings/tests/test_kernelParams.py      |   18 +-
 cuda_bindings/tests/test_nvrtc.py             |    4 +-
 50 files changed, 5358 insertions(+), 5937 deletions(-)

diff --git
a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index a5aadc37b7..f4d2c10d61 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -39,11 +39,6 @@ cdef CUresult _cuDeviceGetCount(int* count) except ?CUDA_ERROR_NOT_FOUND nogil cdef CUresult _cuDeviceGetName(char* name, int length, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuDeviceGetUuid' in found_functions}} - -cdef CUresult _cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuDeviceGetUuid_v2' in found_functions}} cdef CUresult _cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil @@ -69,6 +64,11 @@ cdef CUresult _cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CU cdef CUresult _cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef CUresult _cuDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} cdef CUresult _cuDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, CUdevice dev, int flags) except ?CUDA_ERROR_NOT_FOUND nogil @@ -134,16 +134,6 @@ cdef CUresult _cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int cdef CUresult _cuDevicePrimaryCtxReset_v2(CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuCtxCreate_v2' in found_functions}} - -cdef CUresult _cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuCtxCreate_v3' in found_functions}} - -cdef CUresult _cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} cdef CUresult _cuCtxCreate_v4(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil @@ -179,6 +169,11 @@ cdef CUresult _cuCtxGetCurrent(CUcontext* pctx) except ?CUDA_ERROR_NOT_FOUND nog cdef CUresult _cuCtxGetDevice(CUdevice* device) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}} + +cdef CUresult _cuCtxGetDevice_v2(CUdevice* device, CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuCtxGetFlags' in found_functions}} cdef CUresult _cuCtxGetFlags(unsigned int* flags) except ?CUDA_ERROR_NOT_FOUND nogil @@ -199,6 +194,11 @@ cdef CUresult _cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) except ?CUDA cdef CUresult _cuCtxSynchronize() except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}} + +cdef CUresult _cuCtxSynchronize_v2(CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuCtxSetLimit' in found_functions}} cdef CUresult _cuCtxSetLimit(CUlimit limit, size_t value) except ?CUDA_ERROR_NOT_FOUND nogil @@ -669,14 +669,14 @@ cdef CUresult _cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D* pCopy, CUstream hStream) cdef CUresult _cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER* pCopy, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}} -cdef 
CUresult _cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemcpyBatchAsync_v2(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} -cdef CUresult _cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, size_t* failIdx, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemcpy3DBatchAsync_v2(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} @@ -929,6 +929,21 @@ cdef CUresult _cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* poolPro cdef CUresult _cuMemPoolDestroy(CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMemGetDefaultMemPool' in found_functions}} + +cdef CUresult _cuMemGetDefaultMemPool(CUmemoryPool* pool_out, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemGetMemPool' in found_functions}} + +cdef CUresult _cuMemGetMemPool(CUmemoryPool* pool, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemSetMemPool' in found_functions}} + +cdef CUresult _cuMemSetMemPool(CUmemLocation* location, CUmemAllocationType typename, CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuMemAllocFromPoolAsync' in found_functions}} cdef CUresult _cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil @@ -989,24 +1004,29 @@ cdef CUresult _cuMulticastGetGranularity(size_t* granularity, const CUmulticastO cdef CUresult _cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}} +{{if 'cuMemPrefetchAsync_v2' in found_functions}} -cdef CUresult _cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemPrefetchAsync_v2' in found_functions}} +{{if 'cuMemAdvise_v2' in found_functions}} -cdef CUresult _cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemAdvise' in found_functions}} +{{if 'cuMemPrefetchBatchAsync' in found_functions}} -cdef CUresult _cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} 
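# --- Editor's sketch, not patch content: how the generated .pyx side typically
# consumes a .pxd declaration like _cuMemPrefetchBatchAsync above. Each wrapper
# lazily resolves symbols via cuPythonInit(), checks the cached module-level
# slot, and casts it to the declared signature before forwarding the call. The
# body below is an assumption modeled on the wrapper shape used throughout this
# file; only the signature is taken verbatim from the declarations in this diff.
cdef CUresult _cuMemPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count,
                                       CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs,
                                       size_t numPrefetchLocs, unsigned long long flags,
                                       CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil:
    global __cuMemPrefetchBatchAsync
    cuPythonInit()  # fills every __cu* slot on first use
    if __cuMemPrefetchBatchAsync == NULL:
        with gil:
            raise RuntimeError('Function "cuMemPrefetchBatchAsync" not found')
    # Cast the opaque void* slot to the declared signature and forward the call.
    return (<CUresult (*)(CUdeviceptr*, size_t*, size_t, CUmemLocation*, size_t*,
                          size_t, unsigned long long, CUstream) noexcept nogil>
            __cuMemPrefetchBatchAsync)(dptrs, sizes, count, prefetchLocs,
                                       prefetchLocIdxs, numPrefetchLocs, flags, hStream)
# The ?CUDA_ERROR_NOT_FOUND sentinel lets the wrapper stay nogil while still
# signaling Python-level failure from the NULL-slot path.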
-{{if 'cuMemAdvise_v2' in found_functions}} +{{if 'cuMemDiscardBatchAsync' in found_functions}} -cdef CUresult _cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult _cuMemDiscardBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + +cdef CUresult _cuMemDiscardAndPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuMemRangeGetAttribute' in found_functions}} @@ -1104,21 +1124,11 @@ cdef CUresult _cuStreamEndCapture(CUstream hStream, CUgraph* phGraph) except ?CU cdef CUresult _cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureStatus) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - -cdef CUresult _cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} cdef CUresult _cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - -cdef CUresult _cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} cdef CUresult _cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil @@ -1189,11 +1199,6 @@ cdef CUresult _cuEventSynchronize(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND n cdef CUresult _cuEventDestroy_v2(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuEventElapsedTime' in found_functions}} - -cdef CUresult _cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} cdef CUresult _cuEventElapsedTime_v2(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil @@ -1604,51 +1609,26 @@ cdef CUresult _cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNo cdef CUresult _cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphGetEdges' in found_functions}} - -cdef CUresult _cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphGetEdges_v2' in found_functions}} cdef CUresult _cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphNodeGetDependencies' in found_functions}} - -cdef CUresult 
_cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} cdef CUresult _cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphNodeGetDependentNodes' in found_functions}} - -cdef CUresult _cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} cdef CUresult _cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphAddDependencies' in found_functions}} - -cdef CUresult _cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphAddDependencies_v2' in found_functions}} cdef CUresult _cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphRemoveDependencies' in found_functions}} - -cdef CUresult _cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphRemoveDependencies_v2' in found_functions}} cdef CUresult _cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil @@ -1799,11 +1779,6 @@ cdef CUresult _cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsig cdef CUresult _cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphAddNode' in found_functions}} - -cdef CUresult _cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} cdef CUresult _cuGraphAddNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2084,6 +2059,11 @@ cdef CUresult _cuCtxDisablePeerAccess(CUcontext peerContext) except ?CUDA_ERROR_ cdef CUresult _cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef CUresult _cuDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} cdef CUresult _cuGraphicsUnregisterResource(CUgraphicsResource resource) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2209,6 +2189,11 @@ cdef CUresult _cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx* phCtx) except ? 
cdef CUresult _cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuGreenCtxGetId' in found_functions}} + +cdef CUresult _cuGreenCtxGetId(CUgreenCtx greenCtx, unsigned long long* greenCtxId) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} cdef CUresult _cuLogsRegisterCallback(CUlogsCallback callbackFunc, void* userData, CUlogsCallbackHandle* callback_out) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2254,11 +2239,6 @@ cdef CUresult _cuCheckpointProcessLock(int pid, CUcheckpointLockArgs* args) exce cdef CUresult _cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}} - -cdef CUresult _cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} cdef CUresult _cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index 3e8c63afb9..5e49b0270c 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -20,12 +20,12 @@ cdef bint __cuPythonInit = False {{if 'cuDeviceGet' in found_functions}}cdef void *__cuDeviceGet = NULL{{endif}} {{if 'cuDeviceGetCount' in found_functions}}cdef void *__cuDeviceGetCount = NULL{{endif}} {{if 'cuDeviceGetName' in found_functions}}cdef void *__cuDeviceGetName = NULL{{endif}} -{{if 'cuDeviceGetUuid' in found_functions}}cdef void *__cuDeviceGetUuid = NULL{{endif}} {{if 'cuDeviceGetUuid_v2' in found_functions}}cdef void *__cuDeviceGetUuid_v2 = NULL{{endif}} {{if 'cuDeviceGetLuid' in found_functions}}cdef void *__cuDeviceGetLuid = NULL{{endif}} {{if 'cuDeviceTotalMem_v2' in found_functions}}cdef void *__cuDeviceTotalMem_v2 = NULL{{endif}} {{if 'cuDeviceGetTexture1DLinearMaxWidth' in found_functions}}cdef void *__cuDeviceGetTexture1DLinearMaxWidth = NULL{{endif}} {{if 'cuDeviceGetAttribute' in found_functions}}cdef void *__cuDeviceGetAttribute = NULL{{endif}} +{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}}cdef void *__cuDeviceGetHostAtomicCapabilities = NULL{{endif}} {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}}cdef void *__cuDeviceGetNvSciSyncAttributes = NULL{{endif}} {{if 'cuDeviceSetMemPool' in found_functions}}cdef void *__cuDeviceSetMemPool = NULL{{endif}} {{if 'cuDeviceGetMemPool' in found_functions}}cdef void *__cuDeviceGetMemPool = NULL{{endif}} @@ -39,8 +39,6 @@ cdef bint __cuPythonInit = False {{if 'cuDevicePrimaryCtxSetFlags_v2' in found_functions}}cdef void *__cuDevicePrimaryCtxSetFlags_v2 = NULL{{endif}} {{if 'cuDevicePrimaryCtxGetState' in found_functions}}cdef void *__cuDevicePrimaryCtxGetState = NULL{{endif}} {{if 'cuDevicePrimaryCtxReset_v2' in found_functions}}cdef void *__cuDevicePrimaryCtxReset_v2 = NULL{{endif}} -{{if 'cuCtxCreate_v2' in found_functions}}cdef void *__cuCtxCreate_v2 = NULL{{endif}} -{{if 'cuCtxCreate_v3' in found_functions}}cdef void *__cuCtxCreate_v3 = NULL{{endif}} {{if 'cuCtxCreate_v4' in found_functions}}cdef void *__cuCtxCreate_v4 = NULL{{endif}} {{if 'cuCtxDestroy_v2' in found_functions}}cdef void *__cuCtxDestroy_v2 = NULL{{endif}} {{if 'cuCtxPushCurrent_v2' in found_functions}}cdef void 
*__cuCtxPushCurrent_v2 = NULL{{endif}} @@ -48,10 +46,12 @@ cdef bint __cuPythonInit = False {{if 'cuCtxSetCurrent' in found_functions}}cdef void *__cuCtxSetCurrent = NULL{{endif}} {{if 'cuCtxGetCurrent' in found_functions}}cdef void *__cuCtxGetCurrent = NULL{{endif}} {{if 'cuCtxGetDevice' in found_functions}}cdef void *__cuCtxGetDevice = NULL{{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}}cdef void *__cuCtxGetDevice_v2 = NULL{{endif}} {{if 'cuCtxGetFlags' in found_functions}}cdef void *__cuCtxGetFlags = NULL{{endif}} {{if 'cuCtxSetFlags' in found_functions}}cdef void *__cuCtxSetFlags = NULL{{endif}} {{if 'cuCtxGetId' in found_functions}}cdef void *__cuCtxGetId = NULL{{endif}} {{if 'cuCtxSynchronize' in found_functions}}cdef void *__cuCtxSynchronize = NULL{{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}}cdef void *__cuCtxSynchronize_v2 = NULL{{endif}} {{if 'cuCtxSetLimit' in found_functions}}cdef void *__cuCtxSetLimit = NULL{{endif}} {{if 'cuCtxGetLimit' in found_functions}}cdef void *__cuCtxGetLimit = NULL{{endif}} {{if 'cuCtxGetCacheConfig' in found_functions}}cdef void *__cuCtxGetCacheConfig = NULL{{endif}} @@ -146,8 +146,8 @@ cdef bint __cuPythonInit = False {{if 'cuMemcpy2DAsync_v2' in found_functions}}cdef void *__cuMemcpy2DAsync_v2 = NULL{{endif}} {{if 'cuMemcpy3DAsync_v2' in found_functions}}cdef void *__cuMemcpy3DAsync_v2 = NULL{{endif}} {{if 'cuMemcpy3DPeerAsync' in found_functions}}cdef void *__cuMemcpy3DPeerAsync = NULL{{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}}cdef void *__cuMemcpyBatchAsync = NULL{{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}}cdef void *__cuMemcpy3DBatchAsync = NULL{{endif}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}}cdef void *__cuMemcpyBatchAsync_v2 = NULL{{endif}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}}cdef void *__cuMemcpy3DBatchAsync_v2 = NULL{{endif}} {{if 'cuMemsetD8_v2' in found_functions}}cdef void *__cuMemsetD8_v2 = NULL{{endif}} {{if 'cuMemsetD16_v2' in found_functions}}cdef void *__cuMemsetD16_v2 = NULL{{endif}} {{if 'cuMemsetD32_v2' in found_functions}}cdef void *__cuMemsetD32_v2 = NULL{{endif}} @@ -198,6 +198,9 @@ cdef bint __cuPythonInit = False {{if 'cuMemPoolGetAccess' in found_functions}}cdef void *__cuMemPoolGetAccess = NULL{{endif}} {{if 'cuMemPoolCreate' in found_functions}}cdef void *__cuMemPoolCreate = NULL{{endif}} {{if 'cuMemPoolDestroy' in found_functions}}cdef void *__cuMemPoolDestroy = NULL{{endif}} +{{if 'cuMemGetDefaultMemPool' in found_functions}}cdef void *__cuMemGetDefaultMemPool = NULL{{endif}} +{{if 'cuMemGetMemPool' in found_functions}}cdef void *__cuMemGetMemPool = NULL{{endif}} +{{if 'cuMemSetMemPool' in found_functions}}cdef void *__cuMemSetMemPool = NULL{{endif}} {{if 'cuMemAllocFromPoolAsync' in found_functions}}cdef void *__cuMemAllocFromPoolAsync = NULL{{endif}} {{if 'cuMemPoolExportToShareableHandle' in found_functions}}cdef void *__cuMemPoolExportToShareableHandle = NULL{{endif}} {{if 'cuMemPoolImportFromShareableHandle' in found_functions}}cdef void *__cuMemPoolImportFromShareableHandle = NULL{{endif}} @@ -210,10 +213,11 @@ cdef bint __cuPythonInit = False {{if 'cuMulticastUnbind' in found_functions}}cdef void *__cuMulticastUnbind = NULL{{endif}} {{if 'cuMulticastGetGranularity' in found_functions}}cdef void *__cuMulticastGetGranularity = NULL{{endif}} {{if 'cuPointerGetAttribute' in found_functions}}cdef void *__cuPointerGetAttribute = NULL{{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}}cdef void *__cuMemPrefetchAsync = NULL{{endif}} {{if 
'cuMemPrefetchAsync_v2' in found_functions}}cdef void *__cuMemPrefetchAsync_v2 = NULL{{endif}} -{{if 'cuMemAdvise' in found_functions}}cdef void *__cuMemAdvise = NULL{{endif}} {{if 'cuMemAdvise_v2' in found_functions}}cdef void *__cuMemAdvise_v2 = NULL{{endif}} +{{if 'cuMemPrefetchBatchAsync' in found_functions}}cdef void *__cuMemPrefetchBatchAsync = NULL{{endif}} +{{if 'cuMemDiscardBatchAsync' in found_functions}}cdef void *__cuMemDiscardBatchAsync = NULL{{endif}} +{{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}}cdef void *__cuMemDiscardAndPrefetchBatchAsync = NULL{{endif}} {{if 'cuMemRangeGetAttribute' in found_functions}}cdef void *__cuMemRangeGetAttribute = NULL{{endif}} {{if 'cuMemRangeGetAttributes' in found_functions}}cdef void *__cuMemRangeGetAttributes = NULL{{endif}} {{if 'cuPointerSetAttribute' in found_functions}}cdef void *__cuPointerSetAttribute = NULL{{endif}} @@ -233,9 +237,7 @@ cdef bint __cuPythonInit = False {{if 'cuThreadExchangeStreamCaptureMode' in found_functions}}cdef void *__cuThreadExchangeStreamCaptureMode = NULL{{endif}} {{if 'cuStreamEndCapture' in found_functions}}cdef void *__cuStreamEndCapture = NULL{{endif}} {{if 'cuStreamIsCapturing' in found_functions}}cdef void *__cuStreamIsCapturing = NULL{{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}}cdef void *__cuStreamGetCaptureInfo_v2 = NULL{{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}}cdef void *__cuStreamGetCaptureInfo_v3 = NULL{{endif}} -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}}cdef void *__cuStreamUpdateCaptureDependencies = NULL{{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}}cdef void *__cuStreamUpdateCaptureDependencies_v2 = NULL{{endif}} {{if 'cuStreamAttachMemAsync' in found_functions}}cdef void *__cuStreamAttachMemAsync = NULL{{endif}} {{if 'cuStreamQuery' in found_functions}}cdef void *__cuStreamQuery = NULL{{endif}} @@ -250,7 +252,6 @@ cdef bint __cuPythonInit = False {{if 'cuEventQuery' in found_functions}}cdef void *__cuEventQuery = NULL{{endif}} {{if 'cuEventSynchronize' in found_functions}}cdef void *__cuEventSynchronize = NULL{{endif}} {{if 'cuEventDestroy_v2' in found_functions}}cdef void *__cuEventDestroy_v2 = NULL{{endif}} -{{if 'cuEventElapsedTime' in found_functions}}cdef void *__cuEventElapsedTime = NULL{{endif}} {{if 'cuEventElapsedTime_v2' in found_functions}}cdef void *__cuEventElapsedTime_v2 = NULL{{endif}} {{if 'cuImportExternalMemory' in found_functions}}cdef void *__cuImportExternalMemory = NULL{{endif}} {{if 'cuExternalMemoryGetMappedBuffer' in found_functions}}cdef void *__cuExternalMemoryGetMappedBuffer = NULL{{endif}} @@ -333,15 +334,10 @@ cdef bint __cuPythonInit = False {{if 'cuGraphNodeGetType' in found_functions}}cdef void *__cuGraphNodeGetType = NULL{{endif}} {{if 'cuGraphGetNodes' in found_functions}}cdef void *__cuGraphGetNodes = NULL{{endif}} {{if 'cuGraphGetRootNodes' in found_functions}}cdef void *__cuGraphGetRootNodes = NULL{{endif}} -{{if 'cuGraphGetEdges' in found_functions}}cdef void *__cuGraphGetEdges = NULL{{endif}} {{if 'cuGraphGetEdges_v2' in found_functions}}cdef void *__cuGraphGetEdges_v2 = NULL{{endif}} -{{if 'cuGraphNodeGetDependencies' in found_functions}}cdef void *__cuGraphNodeGetDependencies = NULL{{endif}} {{if 'cuGraphNodeGetDependencies_v2' in found_functions}}cdef void *__cuGraphNodeGetDependencies_v2 = NULL{{endif}} -{{if 'cuGraphNodeGetDependentNodes' in found_functions}}cdef void *__cuGraphNodeGetDependentNodes = NULL{{endif}} {{if 
'cuGraphNodeGetDependentNodes_v2' in found_functions}}cdef void *__cuGraphNodeGetDependentNodes_v2 = NULL{{endif}} -{{if 'cuGraphAddDependencies' in found_functions}}cdef void *__cuGraphAddDependencies = NULL{{endif}} {{if 'cuGraphAddDependencies_v2' in found_functions}}cdef void *__cuGraphAddDependencies_v2 = NULL{{endif}} -{{if 'cuGraphRemoveDependencies' in found_functions}}cdef void *__cuGraphRemoveDependencies = NULL{{endif}} {{if 'cuGraphRemoveDependencies_v2' in found_functions}}cdef void *__cuGraphRemoveDependencies_v2 = NULL{{endif}} {{if 'cuGraphDestroyNode' in found_functions}}cdef void *__cuGraphDestroyNode = NULL{{endif}} {{if 'cuGraphInstantiateWithFlags' in found_functions}}cdef void *__cuGraphInstantiateWithFlags = NULL{{endif}} @@ -372,7 +368,6 @@ cdef bint __cuPythonInit = False {{if 'cuUserObjectRelease' in found_functions}}cdef void *__cuUserObjectRelease = NULL{{endif}} {{if 'cuGraphRetainUserObject' in found_functions}}cdef void *__cuGraphRetainUserObject = NULL{{endif}} {{if 'cuGraphReleaseUserObject' in found_functions}}cdef void *__cuGraphReleaseUserObject = NULL{{endif}} -{{if 'cuGraphAddNode' in found_functions}}cdef void *__cuGraphAddNode = NULL{{endif}} {{if 'cuGraphAddNode_v2' in found_functions}}cdef void *__cuGraphAddNode_v2 = NULL{{endif}} {{if 'cuGraphNodeSetParams' in found_functions}}cdef void *__cuGraphNodeSetParams = NULL{{endif}} {{if 'cuGraphExecNodeSetParams' in found_functions}}cdef void *__cuGraphExecNodeSetParams = NULL{{endif}} @@ -429,6 +424,7 @@ cdef bint __cuPythonInit = False {{if 'cuCtxEnablePeerAccess' in found_functions}}cdef void *__cuCtxEnablePeerAccess = NULL{{endif}} {{if 'cuCtxDisablePeerAccess' in found_functions}}cdef void *__cuCtxDisablePeerAccess = NULL{{endif}} {{if 'cuDeviceGetP2PAttribute' in found_functions}}cdef void *__cuDeviceGetP2PAttribute = NULL{{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}}cdef void *__cuDeviceGetP2PAtomicCapabilities = NULL{{endif}} {{if 'cuGraphicsUnregisterResource' in found_functions}}cdef void *__cuGraphicsUnregisterResource = NULL{{endif}} {{if 'cuGraphicsSubResourceGetMappedArray' in found_functions}}cdef void *__cuGraphicsSubResourceGetMappedArray = NULL{{endif}} {{if 'cuGraphicsResourceGetMappedMipmappedArray' in found_functions}}cdef void *__cuGraphicsResourceGetMappedMipmappedArray = NULL{{endif}} @@ -454,6 +450,7 @@ cdef bint __cuPythonInit = False {{if 'cuGreenCtxWaitEvent' in found_functions}}cdef void *__cuGreenCtxWaitEvent = NULL{{endif}} {{if 'cuStreamGetGreenCtx' in found_functions}}cdef void *__cuStreamGetGreenCtx = NULL{{endif}} {{if 'cuGreenCtxStreamCreate' in found_functions}}cdef void *__cuGreenCtxStreamCreate = NULL{{endif}} +{{if 'cuGreenCtxGetId' in found_functions}}cdef void *__cuGreenCtxGetId = NULL{{endif}} {{if 'cuLogsRegisterCallback' in found_functions}}cdef void *__cuLogsRegisterCallback = NULL{{endif}} {{if 'cuLogsUnregisterCallback' in found_functions}}cdef void *__cuLogsUnregisterCallback = NULL{{endif}} {{if 'cuLogsCurrent' in found_functions}}cdef void *__cuLogsCurrent = NULL{{endif}} @@ -463,7 +460,6 @@ cdef bint __cuPythonInit = False {{if 'cuCheckpointProcessGetState' in found_functions}}cdef void *__cuCheckpointProcessGetState = NULL{{endif}} {{if 'cuCheckpointProcessLock' in found_functions}}cdef void *__cuCheckpointProcessLock = NULL{{endif}} {{if 'cuCheckpointProcessCheckpoint' in found_functions}}cdef void *__cuCheckpointProcessCheckpoint = NULL{{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}}cdef void 
*__cuCheckpointProcessRestore = NULL{{endif}} {{if 'cuCheckpointProcessUnlock' in found_functions}}cdef void *__cuCheckpointProcessUnlock = NULL{{endif}} {{if 'cuProfilerStart' in found_functions}}cdef void *__cuProfilerStart = NULL{{endif}} {{if 'cuProfilerStop' in found_functions}}cdef void *__cuProfilerStop = NULL{{endif}} @@ -636,13 +632,13 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemcpy3DPeerAsync cuGetProcAddress('cuMemcpy3DPeerAsync', &__cuMemcpy3DPeerAsync, 7000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} - global __cuMemcpyBatchAsync - cuGetProcAddress('cuMemcpyBatchAsync', &__cuMemcpyBatchAsync, 12080, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} + global __cuMemcpyBatchAsync_v2 + cuGetProcAddress('cuMemcpyBatchAsync', &__cuMemcpyBatchAsync_v2, 13000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} - global __cuMemcpy3DBatchAsync - cuGetProcAddress('cuMemcpy3DBatchAsync', &__cuMemcpy3DBatchAsync, 12080, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} + global __cuMemcpy3DBatchAsync_v2 + cuGetProcAddress('cuMemcpy3DBatchAsync', &__cuMemcpy3DBatchAsync_v2, 13000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} global __cuMemsetD8_v2 @@ -712,14 +708,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemAllocFromPoolAsync cuGetProcAddress('cuMemAllocFromPoolAsync', &__cuMemAllocFromPoolAsync, 11020, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} - global __cuMemPrefetchAsync - cuGetProcAddress('cuMemPrefetchAsync', &__cuMemPrefetchAsync, 8000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) - {{endif}} {{if 'cuMemPrefetchAsync_v2' in found_functions}} global __cuMemPrefetchAsync_v2 cuGetProcAddress('cuMemPrefetchAsync', &__cuMemPrefetchAsync_v2, 12020, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} + global __cuMemPrefetchBatchAsync + cuGetProcAddress('cuMemPrefetchBatchAsync', &__cuMemPrefetchBatchAsync, 13000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + global __cuMemDiscardBatchAsync + cuGetProcAddress('cuMemDiscardBatchAsync', &__cuMemDiscardBatchAsync, 13000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + global __cuMemDiscardAndPrefetchBatchAsync + cuGetProcAddress('cuMemDiscardAndPrefetchBatchAsync', &__cuMemDiscardAndPrefetchBatchAsync, 13000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) + {{endif}} {{if 'cuStreamGetPriority' in found_functions}} global __cuStreamGetPriority cuGetProcAddress('cuStreamGetPriority', &__cuStreamGetPriority, 7000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) @@ -768,18 +772,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuStreamIsCapturing cuGetProcAddress('cuStreamIsCapturing', &__cuStreamIsCapturing, 10000, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - global __cuStreamGetCaptureInfo_v2 - cuGetProcAddress('cuStreamGetCaptureInfo', &__cuStreamGetCaptureInfo_v2, 11030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) - 
{{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} global __cuStreamGetCaptureInfo_v3 cuGetProcAddress('cuStreamGetCaptureInfo', &__cuStreamGetCaptureInfo_v3, 12030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - global __cuStreamUpdateCaptureDependencies - cuGetProcAddress('cuStreamUpdateCaptureDependencies', &__cuStreamUpdateCaptureDependencies, 11030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} global __cuStreamUpdateCaptureDependencies_v2 cuGetProcAddress('cuStreamUpdateCaptureDependencies', &__cuStreamUpdateCaptureDependencies_v2, 12030, CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM, NULL) @@ -979,13 +975,13 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemcpy3DPeerAsync cuGetProcAddress('cuMemcpy3DPeerAsync', &__cuMemcpy3DPeerAsync, 4000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} - global __cuMemcpyBatchAsync - cuGetProcAddress('cuMemcpyBatchAsync', &__cuMemcpyBatchAsync, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} + global __cuMemcpyBatchAsync_v2 + cuGetProcAddress('cuMemcpyBatchAsync', &__cuMemcpyBatchAsync_v2, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} - global __cuMemcpy3DBatchAsync - cuGetProcAddress('cuMemcpy3DBatchAsync', &__cuMemcpy3DBatchAsync, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} + global __cuMemcpy3DBatchAsync_v2 + cuGetProcAddress('cuMemcpy3DBatchAsync', &__cuMemcpy3DBatchAsync_v2, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} global __cuMemsetD8_v2 @@ -1055,14 +1051,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemAllocFromPoolAsync cuGetProcAddress('cuMemAllocFromPoolAsync', &__cuMemAllocFromPoolAsync, 11020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} - global __cuMemPrefetchAsync - cuGetProcAddress('cuMemPrefetchAsync', &__cuMemPrefetchAsync, 8000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuMemPrefetchAsync_v2' in found_functions}} global __cuMemPrefetchAsync_v2 cuGetProcAddress('cuMemPrefetchAsync', &__cuMemPrefetchAsync_v2, 12020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} + global __cuMemPrefetchBatchAsync + cuGetProcAddress('cuMemPrefetchBatchAsync', &__cuMemPrefetchBatchAsync, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + global __cuMemDiscardBatchAsync + cuGetProcAddress('cuMemDiscardBatchAsync', &__cuMemDiscardBatchAsync, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + global __cuMemDiscardAndPrefetchBatchAsync + cuGetProcAddress('cuMemDiscardAndPrefetchBatchAsync', &__cuMemDiscardAndPrefetchBatchAsync, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuStreamGetPriority' in found_functions}} global __cuStreamGetPriority cuGetProcAddress('cuStreamGetPriority', &__cuStreamGetPriority, 5050, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1111,18 +1115,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuStreamIsCapturing cuGetProcAddress('cuStreamIsCapturing', &__cuStreamIsCapturing, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 
'cuStreamGetCaptureInfo_v2' in found_functions}} - global __cuStreamGetCaptureInfo_v2 - cuGetProcAddress('cuStreamGetCaptureInfo', &__cuStreamGetCaptureInfo_v2, 11030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} global __cuStreamGetCaptureInfo_v3 cuGetProcAddress('cuStreamGetCaptureInfo', &__cuStreamGetCaptureInfo_v3, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - global __cuStreamUpdateCaptureDependencies - cuGetProcAddress('cuStreamUpdateCaptureDependencies', &__cuStreamUpdateCaptureDependencies, 11030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} global __cuStreamUpdateCaptureDependencies_v2 cuGetProcAddress('cuStreamUpdateCaptureDependencies', &__cuStreamUpdateCaptureDependencies_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1252,10 +1248,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetName cuGetProcAddress('cuDeviceGetName', &__cuDeviceGetName, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuDeviceGetUuid' in found_functions}} - global __cuDeviceGetUuid - cuGetProcAddress('cuDeviceGetUuid', &__cuDeviceGetUuid, 9020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuDeviceGetUuid_v2' in found_functions}} global __cuDeviceGetUuid_v2 cuGetProcAddress('cuDeviceGetUuid', &__cuDeviceGetUuid_v2, 11040, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1276,6 +1268,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetAttribute cuGetProcAddress('cuDeviceGetAttribute', &__cuDeviceGetAttribute, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + global __cuDeviceGetHostAtomicCapabilities + cuGetProcAddress('cuDeviceGetHostAtomicCapabilities', &__cuDeviceGetHostAtomicCapabilities, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} global __cuDeviceGetNvSciSyncAttributes cuGetProcAddress('cuDeviceGetNvSciSyncAttributes', &__cuDeviceGetNvSciSyncAttributes, 10020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1328,14 +1324,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuDevicePrimaryCtxReset_v2 cuGetProcAddress('cuDevicePrimaryCtxReset', &__cuDevicePrimaryCtxReset_v2, 11000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuCtxCreate_v2' in found_functions}} - global __cuCtxCreate_v2 - cuGetProcAddress('cuCtxCreate', &__cuCtxCreate_v2, 3020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} - {{if 'cuCtxCreate_v3' in found_functions}} - global __cuCtxCreate_v3 - cuGetProcAddress('cuCtxCreate', &__cuCtxCreate_v3, 11040, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuCtxCreate_v4' in found_functions}} global __cuCtxCreate_v4 cuGetProcAddress('cuCtxCreate', &__cuCtxCreate_v4, 12050, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1364,6 +1352,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuCtxGetDevice cuGetProcAddress('cuCtxGetDevice', &__cuCtxGetDevice, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuCtxGetDevice_v2' in found_functions}} + global __cuCtxGetDevice_v2 + cuGetProcAddress('cuCtxGetDevice', &__cuCtxGetDevice_v2, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuCtxGetFlags' in found_functions}} global __cuCtxGetFlags cuGetProcAddress('cuCtxGetFlags', &__cuCtxGetFlags, 7000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1380,6 +1372,10 @@ cdef int cuPythonInit() except -1 nogil: global 
__cuCtxSynchronize cuGetProcAddress('cuCtxSynchronize', &__cuCtxSynchronize, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuCtxSynchronize_v2' in found_functions}} + global __cuCtxSynchronize_v2 + cuGetProcAddress('cuCtxSynchronize', &__cuCtxSynchronize_v2, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuCtxSetLimit' in found_functions}} global __cuCtxSetLimit cuGetProcAddress('cuCtxSetLimit', &__cuCtxSetLimit, 3010, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1796,6 +1792,18 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemPoolDestroy cuGetProcAddress('cuMemPoolDestroy', &__cuMemPoolDestroy, 11020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuMemGetDefaultMemPool' in found_functions}} + global __cuMemGetDefaultMemPool + cuGetProcAddress('cuMemGetDefaultMemPool', &__cuMemGetDefaultMemPool, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} + {{if 'cuMemGetMemPool' in found_functions}} + global __cuMemGetMemPool + cuGetProcAddress('cuMemGetMemPool', &__cuMemGetMemPool, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} + {{if 'cuMemSetMemPool' in found_functions}} + global __cuMemSetMemPool + cuGetProcAddress('cuMemSetMemPool', &__cuMemSetMemPool, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuMemPoolExportToShareableHandle' in found_functions}} global __cuMemPoolExportToShareableHandle cuGetProcAddress('cuMemPoolExportToShareableHandle', &__cuMemPoolExportToShareableHandle, 11020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1840,10 +1848,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuPointerGetAttribute cuGetProcAddress('cuPointerGetAttribute', &__cuPointerGetAttribute, 4000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuMemAdvise' in found_functions}} - global __cuMemAdvise - cuGetProcAddress('cuMemAdvise', &__cuMemAdvise, 8000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuMemAdvise_v2' in found_functions}} global __cuMemAdvise_v2 cuGetProcAddress('cuMemAdvise', &__cuMemAdvise_v2, 12020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -1896,10 +1900,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuEventDestroy_v2 cuGetProcAddress('cuEventDestroy', &__cuEventDestroy_v2, 4000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuEventElapsedTime' in found_functions}} - global __cuEventElapsedTime - cuGetProcAddress('cuEventElapsedTime', &__cuEventElapsedTime, 2000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuEventElapsedTime_v2' in found_functions}} global __cuEventElapsedTime_v2 cuGetProcAddress('cuEventElapsedTime', &__cuEventElapsedTime_v2, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2184,42 +2184,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuGraphGetRootNodes cuGetProcAddress('cuGraphGetRootNodes', &__cuGraphGetRootNodes, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphGetEdges' in found_functions}} - global __cuGraphGetEdges - cuGetProcAddress('cuGraphGetEdges', &__cuGraphGetEdges, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphGetEdges_v2' in found_functions}} global __cuGraphGetEdges_v2 cuGetProcAddress('cuGraphGetEdges', &__cuGraphGetEdges_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphNodeGetDependencies' in found_functions}} - global __cuGraphNodeGetDependencies - cuGetProcAddress('cuGraphNodeGetDependencies', &__cuGraphNodeGetDependencies, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} global __cuGraphNodeGetDependencies_v2 
cuGetProcAddress('cuGraphNodeGetDependencies', &__cuGraphNodeGetDependencies_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphNodeGetDependentNodes' in found_functions}} - global __cuGraphNodeGetDependentNodes - cuGetProcAddress('cuGraphNodeGetDependentNodes', &__cuGraphNodeGetDependentNodes, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} global __cuGraphNodeGetDependentNodes_v2 cuGetProcAddress('cuGraphNodeGetDependentNodes', &__cuGraphNodeGetDependentNodes_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphAddDependencies' in found_functions}} - global __cuGraphAddDependencies - cuGetProcAddress('cuGraphAddDependencies', &__cuGraphAddDependencies, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphAddDependencies_v2' in found_functions}} global __cuGraphAddDependencies_v2 cuGetProcAddress('cuGraphAddDependencies', &__cuGraphAddDependencies_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphRemoveDependencies' in found_functions}} - global __cuGraphRemoveDependencies - cuGetProcAddress('cuGraphRemoveDependencies', &__cuGraphRemoveDependencies, 10000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphRemoveDependencies_v2' in found_functions}} global __cuGraphRemoveDependencies_v2 cuGetProcAddress('cuGraphRemoveDependencies', &__cuGraphRemoveDependencies_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2328,10 +2308,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuGraphReleaseUserObject cuGetProcAddress('cuGraphReleaseUserObject', &__cuGraphReleaseUserObject, 11030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuGraphAddNode' in found_functions}} - global __cuGraphAddNode - cuGetProcAddress('cuGraphAddNode', &__cuGraphAddNode, 12020, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuGraphAddNode_v2' in found_functions}} global __cuGraphAddNode_v2 cuGetProcAddress('cuGraphAddNode', &__cuGraphAddNode_v2, 12030, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2556,6 +2532,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetP2PAttribute cuGetProcAddress('cuDeviceGetP2PAttribute', &__cuDeviceGetP2PAttribute, 8000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + global __cuDeviceGetP2PAtomicCapabilities + cuGetProcAddress('cuDeviceGetP2PAtomicCapabilities', &__cuDeviceGetP2PAtomicCapabilities, 13000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuGraphicsUnregisterResource' in found_functions}} global __cuGraphicsUnregisterResource cuGetProcAddress('cuGraphicsUnregisterResource', &__cuGraphicsUnregisterResource, 3000, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2648,6 +2628,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuGreenCtxStreamCreate cuGetProcAddress('cuGreenCtxStreamCreate', &__cuGreenCtxStreamCreate, 12050, CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} + {{if 'cuGreenCtxGetId' in found_functions}} + global __cuGreenCtxGetId + cuGetProcAddress('cuGreenCtxGetId', &__cuGreenCtxGetId, 12090, CU_GET_PROC_ADDRESS_DEFAULT, NULL) + {{endif}} {{if 'cuLogsRegisterCallback' in found_functions}} global __cuLogsRegisterCallback cuGetProcAddress('cuLogsRegisterCallback', &__cuLogsRegisterCallback, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2684,10 +2668,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuCheckpointProcessCheckpoint cuGetProcAddress('cuCheckpointProcessCheckpoint', &__cuCheckpointProcessCheckpoint, 12080, 
CU_GET_PROC_ADDRESS_DEFAULT, NULL) {{endif}} - {{if 'cuCheckpointProcessRestore' in found_functions}} - global __cuCheckpointProcessRestore - cuGetProcAddress('cuCheckpointProcessRestore', &__cuCheckpointProcessRestore, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) - {{endif}} {{if 'cuCheckpointProcessUnlock' in found_functions}} global __cuCheckpointProcessUnlock cuGetProcAddress('cuCheckpointProcessUnlock', &__cuCheckpointProcessUnlock, 12080, CU_GET_PROC_ADDRESS_DEFAULT, NULL) @@ -2952,17 +2932,17 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} try: - global __cuMemcpyBatchAsync - __cuMemcpyBatchAsync = win32api.GetProcAddress(handle, 'cuMemcpyBatchAsync_ptsz') + global __cuMemcpyBatchAsync_v2 + __cuMemcpyBatchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemcpyBatchAsync_v2_ptsz') except: pass {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} try: - global __cuMemcpy3DBatchAsync - __cuMemcpy3DBatchAsync = win32api.GetProcAddress(handle, 'cuMemcpy3DBatchAsync_ptsz') + global __cuMemcpy3DBatchAsync_v2 + __cuMemcpy3DBatchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemcpy3DBatchAsync_v2_ptsz') except: pass {{endif}} @@ -3085,17 +3065,31 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} + {{if 'cuMemPrefetchAsync_v2' in found_functions}} try: - global __cuMemPrefetchAsync - __cuMemPrefetchAsync = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync_ptsz') + global __cuMemPrefetchAsync_v2 + __cuMemPrefetchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync_v2_ptsz') except: pass {{endif}} - {{if 'cuMemPrefetchAsync_v2' in found_functions}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} try: - global __cuMemPrefetchAsync_v2 - __cuMemPrefetchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync_v2_ptsz') + global __cuMemPrefetchBatchAsync + __cuMemPrefetchBatchAsync = win32api.GetProcAddress(handle, 'cuMemPrefetchBatchAsync_ptsz') + except: + pass + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + try: + global __cuMemDiscardBatchAsync + __cuMemDiscardBatchAsync = win32api.GetProcAddress(handle, 'cuMemDiscardBatchAsync_ptsz') + except: + pass + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + try: + global __cuMemDiscardAndPrefetchBatchAsync + __cuMemDiscardAndPrefetchBatchAsync = win32api.GetProcAddress(handle, 'cuMemDiscardAndPrefetchBatchAsync_ptsz') except: pass {{endif}} @@ -3183,13 +3177,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - try: - global __cuStreamGetCaptureInfo_v2 - __cuStreamGetCaptureInfo_v2 = win32api.GetProcAddress(handle, 'cuStreamGetCaptureInfo_v2_ptsz') - except: - pass - {{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} try: global __cuStreamGetCaptureInfo_v3 @@ -3197,13 +3184,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - try: - global __cuStreamUpdateCaptureDependencies - __cuStreamUpdateCaptureDependencies = win32api.GetProcAddress(handle, 'cuStreamUpdateCaptureDependencies_ptsz') - except: - pass - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} try: global __cuStreamUpdateCaptureDependencies_v2 @@ -3550,17 +3530,17 @@ cdef int cuPythonInit() 
except -1 nogil: except: pass {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} try: - global __cuMemcpyBatchAsync - __cuMemcpyBatchAsync = win32api.GetProcAddress(handle, 'cuMemcpyBatchAsync') + global __cuMemcpyBatchAsync_v2 + __cuMemcpyBatchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemcpyBatchAsync_v2') except: pass {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} try: - global __cuMemcpy3DBatchAsync - __cuMemcpy3DBatchAsync = win32api.GetProcAddress(handle, 'cuMemcpy3DBatchAsync') + global __cuMemcpy3DBatchAsync_v2 + __cuMemcpy3DBatchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemcpy3DBatchAsync_v2') except: pass {{endif}} @@ -3683,17 +3663,31 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} + {{if 'cuMemPrefetchAsync_v2' in found_functions}} try: - global __cuMemPrefetchAsync - __cuMemPrefetchAsync = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync') + global __cuMemPrefetchAsync_v2 + __cuMemPrefetchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync_v2') except: pass {{endif}} - {{if 'cuMemPrefetchAsync_v2' in found_functions}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} try: - global __cuMemPrefetchAsync_v2 - __cuMemPrefetchAsync_v2 = win32api.GetProcAddress(handle, 'cuMemPrefetchAsync_v2') + global __cuMemPrefetchBatchAsync + __cuMemPrefetchBatchAsync = win32api.GetProcAddress(handle, 'cuMemPrefetchBatchAsync') + except: + pass + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + try: + global __cuMemDiscardBatchAsync + __cuMemDiscardBatchAsync = win32api.GetProcAddress(handle, 'cuMemDiscardBatchAsync') + except: + pass + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + try: + global __cuMemDiscardAndPrefetchBatchAsync + __cuMemDiscardAndPrefetchBatchAsync = win32api.GetProcAddress(handle, 'cuMemDiscardAndPrefetchBatchAsync') except: pass {{endif}} @@ -3781,13 +3775,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - try: - global __cuStreamGetCaptureInfo_v2 - __cuStreamGetCaptureInfo_v2 = win32api.GetProcAddress(handle, 'cuStreamGetCaptureInfo_v2') - except: - pass - {{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} try: global __cuStreamGetCaptureInfo_v3 @@ -3795,13 +3782,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - try: - global __cuStreamUpdateCaptureDependencies - __cuStreamUpdateCaptureDependencies = win32api.GetProcAddress(handle, 'cuStreamUpdateCaptureDependencies') - except: - pass - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} try: global __cuStreamUpdateCaptureDependencies_v2 @@ -4027,13 +4007,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuDeviceGetUuid' in found_functions}} - try: - global __cuDeviceGetUuid - __cuDeviceGetUuid = win32api.GetProcAddress(handle, 'cuDeviceGetUuid') - except: - pass - {{endif}} {{if 'cuDeviceGetUuid_v2' in found_functions}} try: global __cuDeviceGetUuid_v2 @@ -4069,6 +4042,13 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + try: + global __cuDeviceGetHostAtomicCapabilities + __cuDeviceGetHostAtomicCapabilities = win32api.GetProcAddress(handle, 
'cuDeviceGetHostAtomicCapabilities') + except: + pass + {{endif}} {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} try: global __cuDeviceGetNvSciSyncAttributes @@ -4160,20 +4140,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuCtxCreate_v2' in found_functions}} - try: - global __cuCtxCreate_v2 - __cuCtxCreate_v2 = win32api.GetProcAddress(handle, 'cuCtxCreate_v2') - except: - pass - {{endif}} - {{if 'cuCtxCreate_v3' in found_functions}} - try: - global __cuCtxCreate_v3 - __cuCtxCreate_v3 = win32api.GetProcAddress(handle, 'cuCtxCreate_v3') - except: - pass - {{endif}} {{if 'cuCtxCreate_v4' in found_functions}} try: global __cuCtxCreate_v4 @@ -4223,6 +4189,13 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuCtxGetDevice_v2' in found_functions}} + try: + global __cuCtxGetDevice_v2 + __cuCtxGetDevice_v2 = win32api.GetProcAddress(handle, 'cuCtxGetDevice_v2') + except: + pass + {{endif}} {{if 'cuCtxGetFlags' in found_functions}} try: global __cuCtxGetFlags @@ -4251,6 +4224,13 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuCtxSynchronize_v2' in found_functions}} + try: + global __cuCtxSynchronize_v2 + __cuCtxSynchronize_v2 = win32api.GetProcAddress(handle, 'cuCtxSynchronize_v2') + except: + pass + {{endif}} {{if 'cuCtxSetLimit' in found_functions}} try: global __cuCtxSetLimit @@ -4979,6 +4959,27 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuMemGetDefaultMemPool' in found_functions}} + try: + global __cuMemGetDefaultMemPool + __cuMemGetDefaultMemPool = win32api.GetProcAddress(handle, 'cuMemGetDefaultMemPool') + except: + pass + {{endif}} + {{if 'cuMemGetMemPool' in found_functions}} + try: + global __cuMemGetMemPool + __cuMemGetMemPool = win32api.GetProcAddress(handle, 'cuMemGetMemPool') + except: + pass + {{endif}} + {{if 'cuMemSetMemPool' in found_functions}} + try: + global __cuMemSetMemPool + __cuMemSetMemPool = win32api.GetProcAddress(handle, 'cuMemSetMemPool') + except: + pass + {{endif}} {{if 'cuMemPoolExportToShareableHandle' in found_functions}} try: global __cuMemPoolExportToShareableHandle @@ -5056,13 +5057,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuMemAdvise' in found_functions}} - try: - global __cuMemAdvise - __cuMemAdvise = win32api.GetProcAddress(handle, 'cuMemAdvise') - except: - pass - {{endif}} {{if 'cuMemAdvise_v2' in found_functions}} try: global __cuMemAdvise_v2 @@ -5154,13 +5148,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuEventElapsedTime' in found_functions}} - try: - global __cuEventElapsedTime - __cuEventElapsedTime = win32api.GetProcAddress(handle, 'cuEventElapsedTime') - except: - pass - {{endif}} {{if 'cuEventElapsedTime_v2' in found_functions}} try: global __cuEventElapsedTime_v2 @@ -5658,13 +5645,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphGetEdges' in found_functions}} - try: - global __cuGraphGetEdges - __cuGraphGetEdges = win32api.GetProcAddress(handle, 'cuGraphGetEdges') - except: - pass - {{endif}} {{if 'cuGraphGetEdges_v2' in found_functions}} try: global __cuGraphGetEdges_v2 @@ -5672,13 +5652,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphNodeGetDependencies' in found_functions}} - try: - global __cuGraphNodeGetDependencies - __cuGraphNodeGetDependencies = win32api.GetProcAddress(handle, 'cuGraphNodeGetDependencies') - except: - pass - {{endif}} {{if 'cuGraphNodeGetDependencies_v2' in 
found_functions}} try: global __cuGraphNodeGetDependencies_v2 @@ -5686,13 +5659,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphNodeGetDependentNodes' in found_functions}} - try: - global __cuGraphNodeGetDependentNodes - __cuGraphNodeGetDependentNodes = win32api.GetProcAddress(handle, 'cuGraphNodeGetDependentNodes') - except: - pass - {{endif}} {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} try: global __cuGraphNodeGetDependentNodes_v2 @@ -5700,13 +5666,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphAddDependencies' in found_functions}} - try: - global __cuGraphAddDependencies - __cuGraphAddDependencies = win32api.GetProcAddress(handle, 'cuGraphAddDependencies') - except: - pass - {{endif}} {{if 'cuGraphAddDependencies_v2' in found_functions}} try: global __cuGraphAddDependencies_v2 @@ -5714,13 +5673,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphRemoveDependencies' in found_functions}} - try: - global __cuGraphRemoveDependencies - __cuGraphRemoveDependencies = win32api.GetProcAddress(handle, 'cuGraphRemoveDependencies') - except: - pass - {{endif}} {{if 'cuGraphRemoveDependencies_v2' in found_functions}} try: global __cuGraphRemoveDependencies_v2 @@ -5910,13 +5862,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuGraphAddNode' in found_functions}} - try: - global __cuGraphAddNode - __cuGraphAddNode = win32api.GetProcAddress(handle, 'cuGraphAddNode') - except: - pass - {{endif}} {{if 'cuGraphAddNode_v2' in found_functions}} try: global __cuGraphAddNode_v2 @@ -6309,6 +6254,13 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + try: + global __cuDeviceGetP2PAtomicCapabilities + __cuDeviceGetP2PAtomicCapabilities = win32api.GetProcAddress(handle, 'cuDeviceGetP2PAtomicCapabilities') + except: + pass + {{endif}} {{if 'cuGraphicsUnregisterResource' in found_functions}} try: global __cuGraphicsUnregisterResource @@ -6470,6 +6422,13 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} + {{if 'cuGreenCtxGetId' in found_functions}} + try: + global __cuGreenCtxGetId + __cuGreenCtxGetId = win32api.GetProcAddress(handle, 'cuGreenCtxGetId') + except: + pass + {{endif}} {{if 'cuLogsRegisterCallback' in found_functions}} try: global __cuLogsRegisterCallback @@ -6533,13 +6492,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'cuCheckpointProcessRestore' in found_functions}} - try: - global __cuCheckpointProcessRestore - __cuCheckpointProcessRestore = win32api.GetProcAddress(handle, 'cuCheckpointProcessRestore') - except: - pass - {{endif}} {{if 'cuCheckpointProcessUnlock' in found_functions}} try: global __cuCheckpointProcessUnlock @@ -6795,13 +6747,13 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemcpy3DPeerAsync __cuMemcpy3DPeerAsync = dlfcn.dlsym(handle, 'cuMemcpy3DPeerAsync_ptsz') {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} - global __cuMemcpyBatchAsync - __cuMemcpyBatchAsync = dlfcn.dlsym(handle, 'cuMemcpyBatchAsync_ptsz') + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} + global __cuMemcpyBatchAsync_v2 + __cuMemcpyBatchAsync_v2 = dlfcn.dlsym(handle, 'cuMemcpyBatchAsync_v2_ptsz') {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} - global __cuMemcpy3DBatchAsync - __cuMemcpy3DBatchAsync = dlfcn.dlsym(handle, 'cuMemcpy3DBatchAsync_ptsz') + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} + global 
__cuMemcpy3DBatchAsync_v2 + __cuMemcpy3DBatchAsync_v2 = dlfcn.dlsym(handle, 'cuMemcpy3DBatchAsync_v2_ptsz') {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} global __cuMemsetD8_v2 @@ -6871,14 +6823,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemAllocFromPoolAsync __cuMemAllocFromPoolAsync = dlfcn.dlsym(handle, 'cuMemAllocFromPoolAsync_ptsz') {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} - global __cuMemPrefetchAsync - __cuMemPrefetchAsync = dlfcn.dlsym(handle, 'cuMemPrefetchAsync_ptsz') - {{endif}} {{if 'cuMemPrefetchAsync_v2' in found_functions}} global __cuMemPrefetchAsync_v2 __cuMemPrefetchAsync_v2 = dlfcn.dlsym(handle, 'cuMemPrefetchAsync_v2_ptsz') {{endif}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} + global __cuMemPrefetchBatchAsync + __cuMemPrefetchBatchAsync = dlfcn.dlsym(handle, 'cuMemPrefetchBatchAsync_ptsz') + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + global __cuMemDiscardBatchAsync + __cuMemDiscardBatchAsync = dlfcn.dlsym(handle, 'cuMemDiscardBatchAsync_ptsz') + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + global __cuMemDiscardAndPrefetchBatchAsync + __cuMemDiscardAndPrefetchBatchAsync = dlfcn.dlsym(handle, 'cuMemDiscardAndPrefetchBatchAsync_ptsz') + {{endif}} {{if 'cuStreamGetPriority' in found_functions}} global __cuStreamGetPriority __cuStreamGetPriority = dlfcn.dlsym(handle, 'cuStreamGetPriority_ptsz') @@ -6927,18 +6887,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuStreamIsCapturing __cuStreamIsCapturing = dlfcn.dlsym(handle, 'cuStreamIsCapturing_ptsz') {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - global __cuStreamGetCaptureInfo_v2 - __cuStreamGetCaptureInfo_v2 = dlfcn.dlsym(handle, 'cuStreamGetCaptureInfo_v2_ptsz') - {{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} global __cuStreamGetCaptureInfo_v3 __cuStreamGetCaptureInfo_v3 = dlfcn.dlsym(handle, 'cuStreamGetCaptureInfo_v3_ptsz') {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - global __cuStreamUpdateCaptureDependencies - __cuStreamUpdateCaptureDependencies = dlfcn.dlsym(handle, 'cuStreamUpdateCaptureDependencies_ptsz') - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} global __cuStreamUpdateCaptureDependencies_v2 __cuStreamUpdateCaptureDependencies_v2 = dlfcn.dlsym(handle, 'cuStreamUpdateCaptureDependencies_v2_ptsz') @@ -7138,13 +7090,13 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemcpy3DPeerAsync __cuMemcpy3DPeerAsync = dlfcn.dlsym(handle, 'cuMemcpy3DPeerAsync') {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} - global __cuMemcpyBatchAsync - __cuMemcpyBatchAsync = dlfcn.dlsym(handle, 'cuMemcpyBatchAsync') + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} + global __cuMemcpyBatchAsync_v2 + __cuMemcpyBatchAsync_v2 = dlfcn.dlsym(handle, 'cuMemcpyBatchAsync_v2') {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} - global __cuMemcpy3DBatchAsync - __cuMemcpy3DBatchAsync = dlfcn.dlsym(handle, 'cuMemcpy3DBatchAsync') + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} + global __cuMemcpy3DBatchAsync_v2 + __cuMemcpy3DBatchAsync_v2 = dlfcn.dlsym(handle, 'cuMemcpy3DBatchAsync_v2') {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} global __cuMemsetD8_v2 @@ -7214,14 +7166,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemAllocFromPoolAsync __cuMemAllocFromPoolAsync = dlfcn.dlsym(handle, 'cuMemAllocFromPoolAsync') {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} - global 
__cuMemPrefetchAsync - __cuMemPrefetchAsync = dlfcn.dlsym(handle, 'cuMemPrefetchAsync') - {{endif}} {{if 'cuMemPrefetchAsync_v2' in found_functions}} global __cuMemPrefetchAsync_v2 __cuMemPrefetchAsync_v2 = dlfcn.dlsym(handle, 'cuMemPrefetchAsync_v2') {{endif}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} + global __cuMemPrefetchBatchAsync + __cuMemPrefetchBatchAsync = dlfcn.dlsym(handle, 'cuMemPrefetchBatchAsync') + {{endif}} + {{if 'cuMemDiscardBatchAsync' in found_functions}} + global __cuMemDiscardBatchAsync + __cuMemDiscardBatchAsync = dlfcn.dlsym(handle, 'cuMemDiscardBatchAsync') + {{endif}} + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + global __cuMemDiscardAndPrefetchBatchAsync + __cuMemDiscardAndPrefetchBatchAsync = dlfcn.dlsym(handle, 'cuMemDiscardAndPrefetchBatchAsync') + {{endif}} {{if 'cuStreamGetPriority' in found_functions}} global __cuStreamGetPriority __cuStreamGetPriority = dlfcn.dlsym(handle, 'cuStreamGetPriority') @@ -7270,18 +7230,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuStreamIsCapturing __cuStreamIsCapturing = dlfcn.dlsym(handle, 'cuStreamIsCapturing') {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - global __cuStreamGetCaptureInfo_v2 - __cuStreamGetCaptureInfo_v2 = dlfcn.dlsym(handle, 'cuStreamGetCaptureInfo_v2') - {{endif}} {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} global __cuStreamGetCaptureInfo_v3 __cuStreamGetCaptureInfo_v3 = dlfcn.dlsym(handle, 'cuStreamGetCaptureInfo_v3') {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - global __cuStreamUpdateCaptureDependencies - __cuStreamUpdateCaptureDependencies = dlfcn.dlsym(handle, 'cuStreamUpdateCaptureDependencies') - {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} global __cuStreamUpdateCaptureDependencies_v2 __cuStreamUpdateCaptureDependencies_v2 = dlfcn.dlsym(handle, 'cuStreamUpdateCaptureDependencies_v2') @@ -7411,10 +7363,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetName __cuDeviceGetName = dlfcn.dlsym(handle, 'cuDeviceGetName') {{endif}} - {{if 'cuDeviceGetUuid' in found_functions}} - global __cuDeviceGetUuid - __cuDeviceGetUuid = dlfcn.dlsym(handle, 'cuDeviceGetUuid') - {{endif}} {{if 'cuDeviceGetUuid_v2' in found_functions}} global __cuDeviceGetUuid_v2 __cuDeviceGetUuid_v2 = dlfcn.dlsym(handle, 'cuDeviceGetUuid_v2') @@ -7435,6 +7383,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetAttribute __cuDeviceGetAttribute = dlfcn.dlsym(handle, 'cuDeviceGetAttribute') {{endif}} + {{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + global __cuDeviceGetHostAtomicCapabilities + __cuDeviceGetHostAtomicCapabilities = dlfcn.dlsym(handle, 'cuDeviceGetHostAtomicCapabilities') + {{endif}} {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} global __cuDeviceGetNvSciSyncAttributes __cuDeviceGetNvSciSyncAttributes = dlfcn.dlsym(handle, 'cuDeviceGetNvSciSyncAttributes') @@ -7487,14 +7439,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuDevicePrimaryCtxReset_v2 __cuDevicePrimaryCtxReset_v2 = dlfcn.dlsym(handle, 'cuDevicePrimaryCtxReset_v2') {{endif}} - {{if 'cuCtxCreate_v2' in found_functions}} - global __cuCtxCreate_v2 - __cuCtxCreate_v2 = dlfcn.dlsym(handle, 'cuCtxCreate_v2') - {{endif}} - {{if 'cuCtxCreate_v3' in found_functions}} - global __cuCtxCreate_v3 - __cuCtxCreate_v3 = dlfcn.dlsym(handle, 'cuCtxCreate_v3') - {{endif}} {{if 'cuCtxCreate_v4' in found_functions}} global __cuCtxCreate_v4 __cuCtxCreate_v4 = dlfcn.dlsym(handle, 
'cuCtxCreate_v4') @@ -7523,6 +7467,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuCtxGetDevice __cuCtxGetDevice = dlfcn.dlsym(handle, 'cuCtxGetDevice') {{endif}} + {{if 'cuCtxGetDevice_v2' in found_functions}} + global __cuCtxGetDevice_v2 + __cuCtxGetDevice_v2 = dlfcn.dlsym(handle, 'cuCtxGetDevice_v2') + {{endif}} {{if 'cuCtxGetFlags' in found_functions}} global __cuCtxGetFlags __cuCtxGetFlags = dlfcn.dlsym(handle, 'cuCtxGetFlags') @@ -7539,6 +7487,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuCtxSynchronize __cuCtxSynchronize = dlfcn.dlsym(handle, 'cuCtxSynchronize') {{endif}} + {{if 'cuCtxSynchronize_v2' in found_functions}} + global __cuCtxSynchronize_v2 + __cuCtxSynchronize_v2 = dlfcn.dlsym(handle, 'cuCtxSynchronize_v2') + {{endif}} {{if 'cuCtxSetLimit' in found_functions}} global __cuCtxSetLimit __cuCtxSetLimit = dlfcn.dlsym(handle, 'cuCtxSetLimit') @@ -7955,6 +7907,18 @@ cdef int cuPythonInit() except -1 nogil: global __cuMemPoolDestroy __cuMemPoolDestroy = dlfcn.dlsym(handle, 'cuMemPoolDestroy') {{endif}} + {{if 'cuMemGetDefaultMemPool' in found_functions}} + global __cuMemGetDefaultMemPool + __cuMemGetDefaultMemPool = dlfcn.dlsym(handle, 'cuMemGetDefaultMemPool') + {{endif}} + {{if 'cuMemGetMemPool' in found_functions}} + global __cuMemGetMemPool + __cuMemGetMemPool = dlfcn.dlsym(handle, 'cuMemGetMemPool') + {{endif}} + {{if 'cuMemSetMemPool' in found_functions}} + global __cuMemSetMemPool + __cuMemSetMemPool = dlfcn.dlsym(handle, 'cuMemSetMemPool') + {{endif}} {{if 'cuMemPoolExportToShareableHandle' in found_functions}} global __cuMemPoolExportToShareableHandle __cuMemPoolExportToShareableHandle = dlfcn.dlsym(handle, 'cuMemPoolExportToShareableHandle') @@ -7999,10 +7963,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuPointerGetAttribute __cuPointerGetAttribute = dlfcn.dlsym(handle, 'cuPointerGetAttribute') {{endif}} - {{if 'cuMemAdvise' in found_functions}} - global __cuMemAdvise - __cuMemAdvise = dlfcn.dlsym(handle, 'cuMemAdvise') - {{endif}} {{if 'cuMemAdvise_v2' in found_functions}} global __cuMemAdvise_v2 __cuMemAdvise_v2 = dlfcn.dlsym(handle, 'cuMemAdvise_v2') @@ -8055,10 +8015,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuEventDestroy_v2 __cuEventDestroy_v2 = dlfcn.dlsym(handle, 'cuEventDestroy_v2') {{endif}} - {{if 'cuEventElapsedTime' in found_functions}} - global __cuEventElapsedTime - __cuEventElapsedTime = dlfcn.dlsym(handle, 'cuEventElapsedTime') - {{endif}} {{if 'cuEventElapsedTime_v2' in found_functions}} global __cuEventElapsedTime_v2 __cuEventElapsedTime_v2 = dlfcn.dlsym(handle, 'cuEventElapsedTime_v2') @@ -8343,42 +8299,22 @@ cdef int cuPythonInit() except -1 nogil: global __cuGraphGetRootNodes __cuGraphGetRootNodes = dlfcn.dlsym(handle, 'cuGraphGetRootNodes') {{endif}} - {{if 'cuGraphGetEdges' in found_functions}} - global __cuGraphGetEdges - __cuGraphGetEdges = dlfcn.dlsym(handle, 'cuGraphGetEdges') - {{endif}} {{if 'cuGraphGetEdges_v2' in found_functions}} global __cuGraphGetEdges_v2 __cuGraphGetEdges_v2 = dlfcn.dlsym(handle, 'cuGraphGetEdges_v2') {{endif}} - {{if 'cuGraphNodeGetDependencies' in found_functions}} - global __cuGraphNodeGetDependencies - __cuGraphNodeGetDependencies = dlfcn.dlsym(handle, 'cuGraphNodeGetDependencies') - {{endif}} {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} global __cuGraphNodeGetDependencies_v2 __cuGraphNodeGetDependencies_v2 = dlfcn.dlsym(handle, 'cuGraphNodeGetDependencies_v2') {{endif}} - {{if 'cuGraphNodeGetDependentNodes' in found_functions}} - global 
__cuGraphNodeGetDependentNodes - __cuGraphNodeGetDependentNodes = dlfcn.dlsym(handle, 'cuGraphNodeGetDependentNodes') - {{endif}} {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} global __cuGraphNodeGetDependentNodes_v2 __cuGraphNodeGetDependentNodes_v2 = dlfcn.dlsym(handle, 'cuGraphNodeGetDependentNodes_v2') {{endif}} - {{if 'cuGraphAddDependencies' in found_functions}} - global __cuGraphAddDependencies - __cuGraphAddDependencies = dlfcn.dlsym(handle, 'cuGraphAddDependencies') - {{endif}} {{if 'cuGraphAddDependencies_v2' in found_functions}} global __cuGraphAddDependencies_v2 __cuGraphAddDependencies_v2 = dlfcn.dlsym(handle, 'cuGraphAddDependencies_v2') {{endif}} - {{if 'cuGraphRemoveDependencies' in found_functions}} - global __cuGraphRemoveDependencies - __cuGraphRemoveDependencies = dlfcn.dlsym(handle, 'cuGraphRemoveDependencies') - {{endif}} {{if 'cuGraphRemoveDependencies_v2' in found_functions}} global __cuGraphRemoveDependencies_v2 __cuGraphRemoveDependencies_v2 = dlfcn.dlsym(handle, 'cuGraphRemoveDependencies_v2') @@ -8487,10 +8423,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuGraphReleaseUserObject __cuGraphReleaseUserObject = dlfcn.dlsym(handle, 'cuGraphReleaseUserObject') {{endif}} - {{if 'cuGraphAddNode' in found_functions}} - global __cuGraphAddNode - __cuGraphAddNode = dlfcn.dlsym(handle, 'cuGraphAddNode') - {{endif}} {{if 'cuGraphAddNode_v2' in found_functions}} global __cuGraphAddNode_v2 __cuGraphAddNode_v2 = dlfcn.dlsym(handle, 'cuGraphAddNode_v2') @@ -8715,6 +8647,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuDeviceGetP2PAttribute __cuDeviceGetP2PAttribute = dlfcn.dlsym(handle, 'cuDeviceGetP2PAttribute') {{endif}} + {{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + global __cuDeviceGetP2PAtomicCapabilities + __cuDeviceGetP2PAtomicCapabilities = dlfcn.dlsym(handle, 'cuDeviceGetP2PAtomicCapabilities') + {{endif}} {{if 'cuGraphicsUnregisterResource' in found_functions}} global __cuGraphicsUnregisterResource __cuGraphicsUnregisterResource = dlfcn.dlsym(handle, 'cuGraphicsUnregisterResource') @@ -8807,6 +8743,10 @@ cdef int cuPythonInit() except -1 nogil: global __cuGreenCtxStreamCreate __cuGreenCtxStreamCreate = dlfcn.dlsym(handle, 'cuGreenCtxStreamCreate') {{endif}} + {{if 'cuGreenCtxGetId' in found_functions}} + global __cuGreenCtxGetId + __cuGreenCtxGetId = dlfcn.dlsym(handle, 'cuGreenCtxGetId') + {{endif}} {{if 'cuLogsRegisterCallback' in found_functions}} global __cuLogsRegisterCallback __cuLogsRegisterCallback = dlfcn.dlsym(handle, 'cuLogsRegisterCallback') @@ -8843,10 +8783,6 @@ cdef int cuPythonInit() except -1 nogil: global __cuCheckpointProcessCheckpoint __cuCheckpointProcessCheckpoint = dlfcn.dlsym(handle, 'cuCheckpointProcessCheckpoint') {{endif}} - {{if 'cuCheckpointProcessRestore' in found_functions}} - global __cuCheckpointProcessRestore - __cuCheckpointProcessRestore = dlfcn.dlsym(handle, 'cuCheckpointProcessRestore') - {{endif}} {{if 'cuCheckpointProcessUnlock' in found_functions}} global __cuCheckpointProcessUnlock __cuCheckpointProcessUnlock = dlfcn.dlsym(handle, 'cuCheckpointProcessUnlock') @@ -9021,18 +8957,6 @@ cdef CUresult _cuDeviceGetName(char* name, int length, CUdevice dev) except ?CUD return err {{endif}} -{{if 'cuDeviceGetUuid' in found_functions}} - -cdef CUresult _cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuDeviceGetUuid - cuPythonInit() - if __cuDeviceGetUuid == NULL: - with gil: - raise RuntimeError('Function "cuDeviceGetUuid" not found') - err = ( 
__cuDeviceGetUuid)(uuid, dev) - return err -{{endif}} - {{if 'cuDeviceGetUuid_v2' in found_functions}} cdef CUresult _cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -9093,6 +9017,18 @@ cdef CUresult _cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice return err {{endif}} +{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef CUresult _cuDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuDeviceGetHostAtomicCapabilities + cuPythonInit() + if __cuDeviceGetHostAtomicCapabilities == NULL: + with gil: + raise RuntimeError('Function "cuDeviceGetHostAtomicCapabilities" not found') + err = ( __cuDeviceGetHostAtomicCapabilities)(capabilities, operations, count, dev) + return err +{{endif}} + {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} cdef CUresult _cuDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, CUdevice dev, int flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -9249,30 +9185,6 @@ cdef CUresult _cuDevicePrimaryCtxReset_v2(CUdevice dev) except ?CUDA_ERROR_NOT_F return err {{endif}} -{{if 'cuCtxCreate_v2' in found_functions}} - -cdef CUresult _cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuCtxCreate_v2 - cuPythonInit() - if __cuCtxCreate_v2 == NULL: - with gil: - raise RuntimeError('Function "cuCtxCreate_v2" not found') - err = ( __cuCtxCreate_v2)(pctx, flags, dev) - return err -{{endif}} - -{{if 'cuCtxCreate_v3' in found_functions}} - -cdef CUresult _cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuCtxCreate_v3 - cuPythonInit() - if __cuCtxCreate_v3 == NULL: - with gil: - raise RuntimeError('Function "cuCtxCreate_v3" not found') - err = ( __cuCtxCreate_v3)(pctx, paramsArray, numParams, flags, dev) - return err -{{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} cdef CUresult _cuCtxCreate_v4(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -9357,6 +9269,18 @@ cdef CUresult _cuCtxGetDevice(CUdevice* device) except ?CUDA_ERROR_NOT_FOUND nog return err {{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}} + +cdef CUresult _cuCtxGetDevice_v2(CUdevice* device, CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuCtxGetDevice_v2 + cuPythonInit() + if __cuCtxGetDevice_v2 == NULL: + with gil: + raise RuntimeError('Function "cuCtxGetDevice_v2" not found') + err = ( __cuCtxGetDevice_v2)(device, ctx) + return err +{{endif}} + {{if 'cuCtxGetFlags' in found_functions}} cdef CUresult _cuCtxGetFlags(unsigned int* flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -9405,6 +9329,18 @@ cdef CUresult _cuCtxSynchronize() except ?CUDA_ERROR_NOT_FOUND nogil: return err {{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}} + +cdef CUresult _cuCtxSynchronize_v2(CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuCtxSynchronize_v2 + cuPythonInit() + if __cuCtxSynchronize_v2 == NULL: + with gil: + raise RuntimeError('Function "cuCtxSynchronize_v2" not found') + err = ( __cuCtxSynchronize_v2)(ctx) + return err +{{endif}} + {{if 'cuCtxSetLimit' in found_functions}} cdef CUresult _cuCtxSetLimit(CUlimit limit, size_t value) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -10533,27 +10469,27 @@ cdef CUresult _cuMemcpy3DPeerAsync(const 
CUDA_MEMCPY3D_PEER* pCopy, CUstream hSt return err {{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}} -cdef CUresult _cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuMemcpyBatchAsync +cdef CUresult _cuMemcpyBatchAsync_v2(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemcpyBatchAsync_v2 cuPythonInit() - if __cuMemcpyBatchAsync == NULL: + if __cuMemcpyBatchAsync_v2 == NULL: with gil: - raise RuntimeError('Function "cuMemcpyBatchAsync" not found') - err = ( __cuMemcpyBatchAsync)(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, hStream) + raise RuntimeError('Function "cuMemcpyBatchAsync_v2" not found') + err = ( __cuMemcpyBatchAsync_v2)(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, hStream) return err {{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} -cdef CUresult _cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, size_t* failIdx, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuMemcpy3DBatchAsync +cdef CUresult _cuMemcpy3DBatchAsync_v2(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemcpy3DBatchAsync_v2 cuPythonInit() - if __cuMemcpy3DBatchAsync == NULL: + if __cuMemcpy3DBatchAsync_v2 == NULL: with gil: - raise RuntimeError('Function "cuMemcpy3DBatchAsync" not found') - err = ( __cuMemcpy3DBatchAsync)(numOps, opList, failIdx, flags, hStream) + raise RuntimeError('Function "cuMemcpy3DBatchAsync_v2" not found') + err = ( __cuMemcpy3DBatchAsync_v2)(numOps, opList, flags, hStream) return err {{endif}} @@ -11157,6 +11093,42 @@ cdef CUresult _cuMemPoolDestroy(CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND return err {{endif}} +{{if 'cuMemGetDefaultMemPool' in found_functions}} + +cdef CUresult _cuMemGetDefaultMemPool(CUmemoryPool* pool_out, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemGetDefaultMemPool + cuPythonInit() + if __cuMemGetDefaultMemPool == NULL: + with gil: + raise RuntimeError('Function "cuMemGetDefaultMemPool" not found') + err = ( __cuMemGetDefaultMemPool)(pool_out, location, typename) + return err +{{endif}} + +{{if 'cuMemGetMemPool' in found_functions}} + +cdef CUresult _cuMemGetMemPool(CUmemoryPool* pool, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemGetMemPool + cuPythonInit() + if __cuMemGetMemPool == NULL: + with gil: + raise RuntimeError('Function "cuMemGetMemPool" not found') + err = ( __cuMemGetMemPool)(pool, location, typename) + return err +{{endif}} + +{{if 'cuMemSetMemPool' in found_functions}} + +cdef CUresult _cuMemSetMemPool(CUmemLocation* location, CUmemAllocationType typename, CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemSetMemPool + cuPythonInit() + if __cuMemSetMemPool == NULL: + with gil: + raise RuntimeError('Function "cuMemSetMemPool" not found') + err = ( __cuMemSetMemPool)(location, typename, pool) + return err +{{endif}} + {{if 'cuMemAllocFromPoolAsync' in found_functions}} cdef CUresult 
_cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11301,18 +11273,6 @@ cdef CUresult _cuPointerGetAttribute(void* data, CUpointer_attribute attribute, return err {{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}} - -cdef CUresult _cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuMemPrefetchAsync - cuPythonInit() - if __cuMemPrefetchAsync == NULL: - with gil: - raise RuntimeError('Function "cuMemPrefetchAsync" not found') - err = ( __cuMemPrefetchAsync)(devPtr, count, dstDevice, hStream) - return err -{{endif}} - {{if 'cuMemPrefetchAsync_v2' in found_functions}} cdef CUresult _cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11325,18 +11285,6 @@ cdef CUresult _cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLoca return err {{endif}} -{{if 'cuMemAdvise' in found_functions}} - -cdef CUresult _cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuMemAdvise - cuPythonInit() - if __cuMemAdvise == NULL: - with gil: - raise RuntimeError('Function "cuMemAdvise" not found') - err = ( __cuMemAdvise)(devPtr, count, advice, device) - return err -{{endif}} - {{if 'cuMemAdvise_v2' in found_functions}} cdef CUresult _cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11349,6 +11297,42 @@ cdef CUresult _cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise adv return err {{endif}} +{{if 'cuMemPrefetchBatchAsync' in found_functions}} + +cdef CUresult _cuMemPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemPrefetchBatchAsync + cuPythonInit() + if __cuMemPrefetchBatchAsync == NULL: + with gil: + raise RuntimeError('Function "cuMemPrefetchBatchAsync" not found') + err = ( __cuMemPrefetchBatchAsync)(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, hStream) + return err +{{endif}} + +{{if 'cuMemDiscardBatchAsync' in found_functions}} + +cdef CUresult _cuMemDiscardBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemDiscardBatchAsync + cuPythonInit() + if __cuMemDiscardBatchAsync == NULL: + with gil: + raise RuntimeError('Function "cuMemDiscardBatchAsync" not found') + err = ( __cuMemDiscardBatchAsync)(dptrs, sizes, count, flags, hStream) + return err +{{endif}} + +{{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + +cdef CUresult _cuMemDiscardAndPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuMemDiscardAndPrefetchBatchAsync + cuPythonInit() + if __cuMemDiscardAndPrefetchBatchAsync == NULL: + with gil: + raise RuntimeError('Function "cuMemDiscardAndPrefetchBatchAsync" not found') + err = ( __cuMemDiscardAndPrefetchBatchAsync)(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, hStream) + return err 
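The three batch entry points wired up above (`cuMemPrefetchBatchAsync`, `cuMemDiscardBatchAsync`, `cuMemDiscardAndPrefetchBatchAsync`) follow the trampoline pattern used throughout this file: `cuPythonInit()` resolves the symbol once, a NULL pointer raises `RuntimeError` under the GIL, and the call is forwarded through the cached pointer with the GIL released. A minimal caller-side sketch, assuming the public wrappers in `cuda.bindings.driver` mirror these C names and the usual `(err, ...)` tuple convention — the wrapper signatures themselves are not shown in this patch:

    from cuda.bindings import driver

    def prefetch_batch(dptrs, sizes, loc, stream):
        # Apply a single CUmemLocation to every range: prefetchLocIdxs = [0]
        # maps all `count` entries to prefetchLocs[0].
        err, = driver.cuMemPrefetchBatchAsync(
            dptrs, sizes, len(dptrs), [loc], [0], 1, 0, stream)
        if err != driver.CUresult.CUDA_SUCCESS:
            raise RuntimeError(f"cuMemPrefetchBatchAsync failed: {err}")

Note that, unlike the `cuMemcpyBatchAsync` to `cuMemcpyBatchAsync_v2` change earlier in this diff, these batch prefetch/discard entry points take no `failIdx` out-parameter, so the wrapper is assumed to return only the status tuple.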
+{{endif}} + {{if 'cuMemRangeGetAttribute' in found_functions}} cdef CUresult _cuMemRangeGetAttribute(void* data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11577,18 +11561,6 @@ cdef CUresult _cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* capt return err {{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - -cdef CUresult _cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuStreamGetCaptureInfo_v2 - cuPythonInit() - if __cuStreamGetCaptureInfo_v2 == NULL: - with gil: - raise RuntimeError('Function "cuStreamGetCaptureInfo_v2" not found') - err = ( __cuStreamGetCaptureInfo_v2)(hStream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) - return err -{{endif}} - {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} cdef CUresult _cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11601,18 +11573,6 @@ cdef CUresult _cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus return err {{endif}} -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - -cdef CUresult _cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuStreamUpdateCaptureDependencies - cuPythonInit() - if __cuStreamUpdateCaptureDependencies == NULL: - with gil: - raise RuntimeError('Function "cuStreamUpdateCaptureDependencies" not found') - err = ( __cuStreamUpdateCaptureDependencies)(hStream, dependencies, numDependencies, flags) - return err -{{endif}} - {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} cdef CUresult _cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -11781,18 +11741,6 @@ cdef CUresult _cuEventDestroy_v2(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND no return err {{endif}} -{{if 'cuEventElapsedTime' in found_functions}} - -cdef CUresult _cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuEventElapsedTime - cuPythonInit() - if __cuEventElapsedTime == NULL: - with gil: - raise RuntimeError('Function "cuEventElapsedTime" not found') - err = ( __cuEventElapsedTime)(pMilliseconds, hStart, hEnd) - return err -{{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} cdef CUresult _cuEventElapsedTime_v2(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12777,18 +12725,6 @@ cdef CUresult _cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_ return err {{endif}} -{{if 'cuGraphGetEdges' in found_functions}} - -cdef CUresult _cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphGetEdges - cuPythonInit() - if __cuGraphGetEdges == NULL: - with gil: - raise RuntimeError('Function "cuGraphGetEdges" not found') - err = ( __cuGraphGetEdges)(hGraph, from_, to, numEdges) - return err 
-{{endif}} - {{if 'cuGraphGetEdges_v2' in found_functions}} cdef CUresult _cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12801,18 +12737,6 @@ cdef CUresult _cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode* from_, CUgraphNod return err {{endif}} -{{if 'cuGraphNodeGetDependencies' in found_functions}} - -cdef CUresult _cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphNodeGetDependencies - cuPythonInit() - if __cuGraphNodeGetDependencies == NULL: - with gil: - raise RuntimeError('Function "cuGraphNodeGetDependencies" not found') - err = ( __cuGraphNodeGetDependencies)(hNode, dependencies, numDependencies) - return err -{{endif}} - {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} cdef CUresult _cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12825,18 +12749,6 @@ cdef CUresult _cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode* dep return err {{endif}} -{{if 'cuGraphNodeGetDependentNodes' in found_functions}} - -cdef CUresult _cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphNodeGetDependentNodes - cuPythonInit() - if __cuGraphNodeGetDependentNodes == NULL: - with gil: - raise RuntimeError('Function "cuGraphNodeGetDependentNodes" not found') - err = ( __cuGraphNodeGetDependentNodes)(hNode, dependentNodes, numDependentNodes) - return err -{{endif}} - {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} cdef CUresult _cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12849,18 +12761,6 @@ cdef CUresult _cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode* d return err {{endif}} -{{if 'cuGraphAddDependencies' in found_functions}} - -cdef CUresult _cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphAddDependencies - cuPythonInit() - if __cuGraphAddDependencies == NULL: - with gil: - raise RuntimeError('Function "cuGraphAddDependencies" not found') - err = ( __cuGraphAddDependencies)(hGraph, from_, to, numDependencies) - return err -{{endif}} - {{if 'cuGraphAddDependencies_v2' in found_functions}} cdef CUresult _cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -12873,18 +12773,6 @@ cdef CUresult _cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode* from return err {{endif}} -{{if 'cuGraphRemoveDependencies' in found_functions}} - -cdef CUresult _cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphRemoveDependencies - cuPythonInit() - if __cuGraphRemoveDependencies == NULL: - with gil: - raise RuntimeError('Function "cuGraphRemoveDependencies" not found') - err = ( __cuGraphRemoveDependencies)(hGraph, from_, to, numDependencies) - return err -{{endif}} - {{if 'cuGraphRemoveDependencies_v2' in found_functions}} cdef CUresult 
_cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -13245,18 +13133,6 @@ cdef CUresult _cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsi return err {{endif}} -{{if 'cuGraphAddNode' in found_functions}} - -cdef CUresult _cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuGraphAddNode - cuPythonInit() - if __cuGraphAddNode == NULL: - with gil: - raise RuntimeError('Function "cuGraphAddNode" not found') - err = ( __cuGraphAddNode)(phGraphNode, hGraph, dependencies, numDependencies, nodeParams) - return err -{{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} cdef CUresult _cuGraphAddNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -13929,6 +13805,18 @@ cdef CUresult _cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, return err {{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef CUresult _cuDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuDeviceGetP2PAtomicCapabilities + cuPythonInit() + if __cuDeviceGetP2PAtomicCapabilities == NULL: + with gil: + raise RuntimeError('Function "cuDeviceGetP2PAtomicCapabilities" not found') + err = ( __cuDeviceGetP2PAtomicCapabilities)(capabilities, operations, count, srcDevice, dstDevice) + return err +{{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} cdef CUresult _cuGraphicsUnregisterResource(CUgraphicsResource resource) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -14229,6 +14117,18 @@ cdef CUresult _cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, u return err {{endif}} +{{if 'cuGreenCtxGetId' in found_functions}} + +cdef CUresult _cuGreenCtxGetId(CUgreenCtx greenCtx, unsigned long long* greenCtxId) except ?CUDA_ERROR_NOT_FOUND nogil: + global __cuGreenCtxGetId + cuPythonInit() + if __cuGreenCtxGetId == NULL: + with gil: + raise RuntimeError('Function "cuGreenCtxGetId" not found') + err = ( __cuGreenCtxGetId)(greenCtx, greenCtxId) + return err +{{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} cdef CUresult _cuLogsRegisterCallback(CUlogsCallback callbackFunc, void* userData, CUlogsCallbackHandle* callback_out) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -14337,18 +14237,6 @@ cdef CUresult _cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs return err {{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}} - -cdef CUresult _cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil: - global __cuCheckpointProcessRestore - cuPythonInit() - if __cuCheckpointProcessRestore == NULL: - with gil: - raise RuntimeError('Function "cuCheckpointProcessRestore" not found') - err = ( __cuCheckpointProcessRestore)(pid, args) - return err -{{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} cdef CUresult _cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -14672,13 +14560,6 @@ cpdef dict _inspect_function_pointers(): data["__cuDeviceGetName"] = 
0 {{endif}} - {{if 'cuDeviceGetUuid' in found_functions}} - global __cuDeviceGetUuid - data["__cuDeviceGetUuid"] = __cuDeviceGetUuid - {{else}} - data["__cuDeviceGetUuid"] = 0 - {{endif}} - {{if 'cuDeviceGetUuid_v2' in found_functions}} global __cuDeviceGetUuid_v2 data["__cuDeviceGetUuid_v2"] = __cuDeviceGetUuid_v2 @@ -14714,6 +14595,13 @@ cpdef dict _inspect_function_pointers(): data["__cuDeviceGetAttribute"] = 0 {{endif}} + {{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + global __cuDeviceGetHostAtomicCapabilities + data["__cuDeviceGetHostAtomicCapabilities"] = __cuDeviceGetHostAtomicCapabilities + {{else}} + data["__cuDeviceGetHostAtomicCapabilities"] = 0 + {{endif}} + {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} global __cuDeviceGetNvSciSyncAttributes data["__cuDeviceGetNvSciSyncAttributes"] = __cuDeviceGetNvSciSyncAttributes @@ -14805,20 +14693,6 @@ cpdef dict _inspect_function_pointers(): data["__cuDevicePrimaryCtxReset_v2"] = 0 {{endif}} - {{if 'cuCtxCreate_v2' in found_functions}} - global __cuCtxCreate_v2 - data["__cuCtxCreate_v2"] = __cuCtxCreate_v2 - {{else}} - data["__cuCtxCreate_v2"] = 0 - {{endif}} - - {{if 'cuCtxCreate_v3' in found_functions}} - global __cuCtxCreate_v3 - data["__cuCtxCreate_v3"] = __cuCtxCreate_v3 - {{else}} - data["__cuCtxCreate_v3"] = 0 - {{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} global __cuCtxCreate_v4 data["__cuCtxCreate_v4"] = __cuCtxCreate_v4 @@ -14868,6 +14742,13 @@ cpdef dict _inspect_function_pointers(): data["__cuCtxGetDevice"] = 0 {{endif}} + {{if 'cuCtxGetDevice_v2' in found_functions}} + global __cuCtxGetDevice_v2 + data["__cuCtxGetDevice_v2"] = __cuCtxGetDevice_v2 + {{else}} + data["__cuCtxGetDevice_v2"] = 0 + {{endif}} + {{if 'cuCtxGetFlags' in found_functions}} global __cuCtxGetFlags data["__cuCtxGetFlags"] = __cuCtxGetFlags @@ -14896,6 +14777,13 @@ cpdef dict _inspect_function_pointers(): data["__cuCtxSynchronize"] = 0 {{endif}} + {{if 'cuCtxSynchronize_v2' in found_functions}} + global __cuCtxSynchronize_v2 + data["__cuCtxSynchronize_v2"] = __cuCtxSynchronize_v2 + {{else}} + data["__cuCtxSynchronize_v2"] = 0 + {{endif}} + {{if 'cuCtxSetLimit' in found_functions}} global __cuCtxSetLimit data["__cuCtxSetLimit"] = __cuCtxSetLimit @@ -15554,18 +15442,18 @@ cpdef dict _inspect_function_pointers(): data["__cuMemcpy3DPeerAsync"] = 0 {{endif}} - {{if 'cuMemcpyBatchAsync' in found_functions}} - global __cuMemcpyBatchAsync - data["__cuMemcpyBatchAsync"] = __cuMemcpyBatchAsync + {{if 'cuMemcpyBatchAsync_v2' in found_functions}} + global __cuMemcpyBatchAsync_v2 + data["__cuMemcpyBatchAsync_v2"] = __cuMemcpyBatchAsync_v2 {{else}} - data["__cuMemcpyBatchAsync"] = 0 + data["__cuMemcpyBatchAsync_v2"] = 0 {{endif}} - {{if 'cuMemcpy3DBatchAsync' in found_functions}} - global __cuMemcpy3DBatchAsync - data["__cuMemcpy3DBatchAsync"] = __cuMemcpy3DBatchAsync + {{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} + global __cuMemcpy3DBatchAsync_v2 + data["__cuMemcpy3DBatchAsync_v2"] = __cuMemcpy3DBatchAsync_v2 {{else}} - data["__cuMemcpy3DBatchAsync"] = 0 + data["__cuMemcpy3DBatchAsync_v2"] = 0 {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} @@ -15918,6 +15806,27 @@ cpdef dict _inspect_function_pointers(): data["__cuMemPoolDestroy"] = 0 {{endif}} + {{if 'cuMemGetDefaultMemPool' in found_functions}} + global __cuMemGetDefaultMemPool + data["__cuMemGetDefaultMemPool"] = __cuMemGetDefaultMemPool + {{else}} + data["__cuMemGetDefaultMemPool"] = 0 + {{endif}} + + {{if 'cuMemGetMemPool' in found_functions}} + global 
__cuMemGetMemPool + data["__cuMemGetMemPool"] = __cuMemGetMemPool + {{else}} + data["__cuMemGetMemPool"] = 0 + {{endif}} + + {{if 'cuMemSetMemPool' in found_functions}} + global __cuMemSetMemPool + data["__cuMemSetMemPool"] = __cuMemSetMemPool + {{else}} + data["__cuMemSetMemPool"] = 0 + {{endif}} + {{if 'cuMemAllocFromPoolAsync' in found_functions}} global __cuMemAllocFromPoolAsync data["__cuMemAllocFromPoolAsync"] = __cuMemAllocFromPoolAsync @@ -16002,13 +15911,6 @@ cpdef dict _inspect_function_pointers(): data["__cuPointerGetAttribute"] = 0 {{endif}} - {{if 'cuMemPrefetchAsync' in found_functions}} - global __cuMemPrefetchAsync - data["__cuMemPrefetchAsync"] = __cuMemPrefetchAsync - {{else}} - data["__cuMemPrefetchAsync"] = 0 - {{endif}} - {{if 'cuMemPrefetchAsync_v2' in found_functions}} global __cuMemPrefetchAsync_v2 data["__cuMemPrefetchAsync_v2"] = __cuMemPrefetchAsync_v2 @@ -16016,13 +15918,6 @@ cpdef dict _inspect_function_pointers(): data["__cuMemPrefetchAsync_v2"] = 0 {{endif}} - {{if 'cuMemAdvise' in found_functions}} - global __cuMemAdvise - data["__cuMemAdvise"] = __cuMemAdvise - {{else}} - data["__cuMemAdvise"] = 0 - {{endif}} - {{if 'cuMemAdvise_v2' in found_functions}} global __cuMemAdvise_v2 data["__cuMemAdvise_v2"] = __cuMemAdvise_v2 @@ -16030,6 +15925,27 @@ cpdef dict _inspect_function_pointers(): data["__cuMemAdvise_v2"] = 0 {{endif}} + {{if 'cuMemPrefetchBatchAsync' in found_functions}} + global __cuMemPrefetchBatchAsync + data["__cuMemPrefetchBatchAsync"] = __cuMemPrefetchBatchAsync + {{else}} + data["__cuMemPrefetchBatchAsync"] = 0 + {{endif}} + + {{if 'cuMemDiscardBatchAsync' in found_functions}} + global __cuMemDiscardBatchAsync + data["__cuMemDiscardBatchAsync"] = __cuMemDiscardBatchAsync + {{else}} + data["__cuMemDiscardBatchAsync"] = 0 + {{endif}} + + {{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + global __cuMemDiscardAndPrefetchBatchAsync + data["__cuMemDiscardAndPrefetchBatchAsync"] = __cuMemDiscardAndPrefetchBatchAsync + {{else}} + data["__cuMemDiscardAndPrefetchBatchAsync"] = 0 + {{endif}} + {{if 'cuMemRangeGetAttribute' in found_functions}} global __cuMemRangeGetAttribute data["__cuMemRangeGetAttribute"] = __cuMemRangeGetAttribute @@ -16163,13 +16079,6 @@ cpdef dict _inspect_function_pointers(): data["__cuStreamIsCapturing"] = 0 {{endif}} - {{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - global __cuStreamGetCaptureInfo_v2 - data["__cuStreamGetCaptureInfo_v2"] = __cuStreamGetCaptureInfo_v2 - {{else}} - data["__cuStreamGetCaptureInfo_v2"] = 0 - {{endif}} - {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} global __cuStreamGetCaptureInfo_v3 data["__cuStreamGetCaptureInfo_v3"] = __cuStreamGetCaptureInfo_v3 @@ -16177,13 +16086,6 @@ cpdef dict _inspect_function_pointers(): data["__cuStreamGetCaptureInfo_v3"] = 0 {{endif}} - {{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - global __cuStreamUpdateCaptureDependencies - data["__cuStreamUpdateCaptureDependencies"] = __cuStreamUpdateCaptureDependencies - {{else}} - data["__cuStreamUpdateCaptureDependencies"] = 0 - {{endif}} - {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} global __cuStreamUpdateCaptureDependencies_v2 data["__cuStreamUpdateCaptureDependencies_v2"] = __cuStreamUpdateCaptureDependencies_v2 @@ -16282,13 +16184,6 @@ cpdef dict _inspect_function_pointers(): data["__cuEventDestroy_v2"] = 0 {{endif}} - {{if 'cuEventElapsedTime' in found_functions}} - global __cuEventElapsedTime - data["__cuEventElapsedTime"] = __cuEventElapsedTime - {{else}} - 
data["__cuEventElapsedTime"] = 0 - {{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} global __cuEventElapsedTime_v2 data["__cuEventElapsedTime_v2"] = __cuEventElapsedTime_v2 @@ -16863,13 +16758,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphGetRootNodes"] = 0 {{endif}} - {{if 'cuGraphGetEdges' in found_functions}} - global __cuGraphGetEdges - data["__cuGraphGetEdges"] = __cuGraphGetEdges - {{else}} - data["__cuGraphGetEdges"] = 0 - {{endif}} - {{if 'cuGraphGetEdges_v2' in found_functions}} global __cuGraphGetEdges_v2 data["__cuGraphGetEdges_v2"] = __cuGraphGetEdges_v2 @@ -16877,13 +16765,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphGetEdges_v2"] = 0 {{endif}} - {{if 'cuGraphNodeGetDependencies' in found_functions}} - global __cuGraphNodeGetDependencies - data["__cuGraphNodeGetDependencies"] = __cuGraphNodeGetDependencies - {{else}} - data["__cuGraphNodeGetDependencies"] = 0 - {{endif}} - {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} global __cuGraphNodeGetDependencies_v2 data["__cuGraphNodeGetDependencies_v2"] = __cuGraphNodeGetDependencies_v2 @@ -16891,13 +16772,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphNodeGetDependencies_v2"] = 0 {{endif}} - {{if 'cuGraphNodeGetDependentNodes' in found_functions}} - global __cuGraphNodeGetDependentNodes - data["__cuGraphNodeGetDependentNodes"] = __cuGraphNodeGetDependentNodes - {{else}} - data["__cuGraphNodeGetDependentNodes"] = 0 - {{endif}} - {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} global __cuGraphNodeGetDependentNodes_v2 data["__cuGraphNodeGetDependentNodes_v2"] = __cuGraphNodeGetDependentNodes_v2 @@ -16905,13 +16779,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphNodeGetDependentNodes_v2"] = 0 {{endif}} - {{if 'cuGraphAddDependencies' in found_functions}} - global __cuGraphAddDependencies - data["__cuGraphAddDependencies"] = __cuGraphAddDependencies - {{else}} - data["__cuGraphAddDependencies"] = 0 - {{endif}} - {{if 'cuGraphAddDependencies_v2' in found_functions}} global __cuGraphAddDependencies_v2 data["__cuGraphAddDependencies_v2"] = __cuGraphAddDependencies_v2 @@ -16919,13 +16786,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphAddDependencies_v2"] = 0 {{endif}} - {{if 'cuGraphRemoveDependencies' in found_functions}} - global __cuGraphRemoveDependencies - data["__cuGraphRemoveDependencies"] = __cuGraphRemoveDependencies - {{else}} - data["__cuGraphRemoveDependencies"] = 0 - {{endif}} - {{if 'cuGraphRemoveDependencies_v2' in found_functions}} global __cuGraphRemoveDependencies_v2 data["__cuGraphRemoveDependencies_v2"] = __cuGraphRemoveDependencies_v2 @@ -17136,13 +16996,6 @@ cpdef dict _inspect_function_pointers(): data["__cuGraphReleaseUserObject"] = 0 {{endif}} - {{if 'cuGraphAddNode' in found_functions}} - global __cuGraphAddNode - data["__cuGraphAddNode"] = __cuGraphAddNode - {{else}} - data["__cuGraphAddNode"] = 0 - {{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} global __cuGraphAddNode_v2 data["__cuGraphAddNode_v2"] = __cuGraphAddNode_v2 @@ -17535,6 +17388,13 @@ cpdef dict _inspect_function_pointers(): data["__cuDeviceGetP2PAttribute"] = 0 {{endif}} + {{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + global __cuDeviceGetP2PAtomicCapabilities + data["__cuDeviceGetP2PAtomicCapabilities"] = __cuDeviceGetP2PAtomicCapabilities + {{else}} + data["__cuDeviceGetP2PAtomicCapabilities"] = 0 + {{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} global __cuGraphicsUnregisterResource 
data["__cuGraphicsUnregisterResource"] = __cuGraphicsUnregisterResource @@ -17710,6 +17570,13 @@ cpdef dict _inspect_function_pointers(): data["__cuGreenCtxStreamCreate"] = 0 {{endif}} + {{if 'cuGreenCtxGetId' in found_functions}} + global __cuGreenCtxGetId + data["__cuGreenCtxGetId"] = __cuGreenCtxGetId + {{else}} + data["__cuGreenCtxGetId"] = 0 + {{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} global __cuLogsRegisterCallback data["__cuLogsRegisterCallback"] = __cuLogsRegisterCallback @@ -17773,13 +17640,6 @@ cpdef dict _inspect_function_pointers(): data["__cuCheckpointProcessCheckpoint"] = 0 {{endif}} - {{if 'cuCheckpointProcessRestore' in found_functions}} - global __cuCheckpointProcessRestore - data["__cuCheckpointProcessRestore"] = __cuCheckpointProcessRestore - {{else}} - data["__cuCheckpointProcessRestore"] = 0 - {{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} global __cuCheckpointProcessUnlock data["__cuCheckpointProcessUnlock"] = __cuCheckpointProcessUnlock diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index 058101c94e..87ad37ef4d 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -59,16 +59,6 @@ cdef nvrtcResult _nvrtcGetCUBINSize(nvrtcProgram prog, size_t* cubinSizeRet) exc cdef nvrtcResult _nvrtcGetCUBIN(nvrtcProgram prog, char* cubin) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}} - -cdef nvrtcResult _nvrtcGetNVVMSize(nvrtcProgram prog, size_t* nvvmSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil -{{endif}} - -{{if 'nvrtcGetNVVM' in found_functions}} - -cdef nvrtcResult _nvrtcGetNVVM(nvrtcProgram prog, char* nvvm) except ?NVRTC_ERROR_INVALID_INPUT nogil -{{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} cdef nvrtcResult _nvrtcGetLTOIRSize(nvrtcProgram prog, size_t* LTOIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index b6b896765d..0f6f452a80 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -24,8 +24,6 @@ cdef bint __cuPythonInit = False {{if 'nvrtcGetPTX' in found_functions}}cdef void *__nvrtcGetPTX = NULL{{endif}} {{if 'nvrtcGetCUBINSize' in found_functions}}cdef void *__nvrtcGetCUBINSize = NULL{{endif}} {{if 'nvrtcGetCUBIN' in found_functions}}cdef void *__nvrtcGetCUBIN = NULL{{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}}cdef void *__nvrtcGetNVVMSize = NULL{{endif}} -{{if 'nvrtcGetNVVM' in found_functions}}cdef void *__nvrtcGetNVVM = NULL{{endif}} {{if 'nvrtcGetLTOIRSize' in found_functions}}cdef void *__nvrtcGetLTOIRSize = NULL{{endif}} {{if 'nvrtcGetLTOIR' in found_functions}}cdef void *__nvrtcGetLTOIR = NULL{{endif}} {{if 'nvrtcGetOptiXIRSize' in found_functions}}cdef void *__nvrtcGetOptiXIRSize = NULL{{endif}} @@ -136,20 +134,6 @@ cdef int cuPythonInit() except -1 nogil: except: pass {{endif}} - {{if 'nvrtcGetNVVMSize' in found_functions}} - try: - global __nvrtcGetNVVMSize - __nvrtcGetNVVMSize = win32api.GetProcAddress(handle, 'nvrtcGetNVVMSize') - except: - pass - {{endif}} - {{if 'nvrtcGetNVVM' in found_functions}} - try: - global __nvrtcGetNVVM - __nvrtcGetNVVM = win32api.GetProcAddress(handle, 'nvrtcGetNVVM') - except: - pass - {{endif}} {{if 'nvrtcGetLTOIRSize' in found_functions}} try: global __nvrtcGetLTOIRSize @@ 
-287,14 +271,6 @@ cdef int cuPythonInit() except -1 nogil: global __nvrtcGetCUBIN __nvrtcGetCUBIN = dlfcn.dlsym(handle, 'nvrtcGetCUBIN') {{endif}} - {{if 'nvrtcGetNVVMSize' in found_functions}} - global __nvrtcGetNVVMSize - __nvrtcGetNVVMSize = dlfcn.dlsym(handle, 'nvrtcGetNVVMSize') - {{endif}} - {{if 'nvrtcGetNVVM' in found_functions}} - global __nvrtcGetNVVM - __nvrtcGetNVVM = dlfcn.dlsym(handle, 'nvrtcGetNVVM') - {{endif}} {{if 'nvrtcGetLTOIRSize' in found_functions}} global __nvrtcGetLTOIRSize __nvrtcGetLTOIRSize = dlfcn.dlsym(handle, 'nvrtcGetLTOIRSize') @@ -482,30 +458,6 @@ cdef nvrtcResult _nvrtcGetCUBIN(nvrtcProgram prog, char* cubin) except ?NVRTC_ER return err {{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}} - -cdef nvrtcResult _nvrtcGetNVVMSize(nvrtcProgram prog, size_t* nvvmSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil: - global __nvrtcGetNVVMSize - cuPythonInit() - if __nvrtcGetNVVMSize == NULL: - with gil: - raise RuntimeError('Function "nvrtcGetNVVMSize" not found') - err = ( __nvrtcGetNVVMSize)(prog, nvvmSizeRet) - return err -{{endif}} - -{{if 'nvrtcGetNVVM' in found_functions}} - -cdef nvrtcResult _nvrtcGetNVVM(nvrtcProgram prog, char* nvvm) except ?NVRTC_ERROR_INVALID_INPUT nogil: - global __nvrtcGetNVVM - cuPythonInit() - if __nvrtcGetNVVM == NULL: - with gil: - raise RuntimeError('Function "nvrtcGetNVVM" not found') - err = ( __nvrtcGetNVVM)(prog, nvvm) - return err -{{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} cdef nvrtcResult _nvrtcGetLTOIRSize(nvrtcProgram prog, size_t* LTOIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil: @@ -749,20 +701,6 @@ cpdef dict _inspect_function_pointers(): data["__nvrtcGetCUBIN"] = 0 {{endif}} - {{if 'nvrtcGetNVVMSize' in found_functions}} - global __nvrtcGetNVVMSize - data["__nvrtcGetNVVMSize"] = __nvrtcGetNVVMSize - {{else}} - data["__nvrtcGetNVVMSize"] = 0 - {{endif}} - - {{if 'nvrtcGetNVVM' in found_functions}} - global __nvrtcGetNVVM - data["__nvrtcGetNVVM"] = __nvrtcGetNVVM - {{else}} - data["__nvrtcGetNVVM"] = 0 - {{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} global __nvrtcGetLTOIRSize data["__nvrtcGetLTOIRSize"] = __nvrtcGetLTOIRSize diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in index ee6e702153..dad92bdac0 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in @@ -129,9 +129,9 @@ cdef const char* _cudaGetErrorString(cudaError_t error) except ?NULL nogil cdef cudaError_t _cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} -cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGetDeviceProperties(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} @@ -139,6 +139,11 @@ cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) e cdef cudaError_t _cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except 
?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t _cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil @@ -164,6 +169,11 @@ cdef cudaError_t _cudaDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, int cdef cudaError_t _cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t _cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil @@ -304,24 +314,14 @@ cdef cudaError_t _cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) cdef cudaError_t _cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureStatus* pCaptureStatus) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t _cudaStreamGetCaptureInfo_v2(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t _cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -364,11 +364,6 @@ cdef cudaError_t _cudaEventDestroy(cudaEvent_t event) except ?cudaErrorCallRequi cdef cudaError_t _cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t 
_cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t _cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil @@ -394,14 +389,14 @@ cdef cudaError_t _cudaDestroyExternalMemory(cudaExternalMemory_t extMem) except cdef cudaError_t _cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const cudaExternalSemaphoreHandleDesc* semHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -636,12 +631,12 @@ cdef cudaError_t _cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -691,22 +686,27 @@ cdef cudaError_t _cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cud {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except 
?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -789,6 +789,21 @@ cdef cudaError_t _cudaMemPoolCreate(cudaMemPool_t* memPool, const cudaMemPoolPro cdef cudaError_t _cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaMemGetDefaultMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t _cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil @@ -929,6 +944,31 @@ cdef cudaError_t _cudaDriverGetVersion(int* driverVersion) except ?cudaErrorCall cdef cudaError_t _cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + 
+cdef cudaError_t _cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t _cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t _cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil @@ -951,7 +991,7 @@ cdef cudaError_t _cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaK {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1156,52 +1196,27 @@ cdef cudaError_t _cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoo {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t _cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t 
_cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1346,12 +1361,7 @@ cdef cudaError_t _cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in index d1636bbbd4..d9a79cb4e4 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in @@ -245,13 +245,13 @@ cdef cudaError_t _cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNe return cudaGetDeviceCount(count) {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} -cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t 
_cudaGetDeviceProperties(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGetDeviceProperties_v2(prop, device) - return cudaGetDeviceProperties_v2(prop, device) + return ptds._cudaGetDeviceProperties(prop, device) + return cudaGetDeviceProperties(prop, device) {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} @@ -263,6 +263,15 @@ cdef cudaError_t _cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int de return cudaDeviceGetAttribute(value, attr, device) {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaDeviceGetHostAtomicCapabilities(capabilities, operations, count, device) + return cudaDeviceGetHostAtomicCapabilities(capabilities, operations, count, device) +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t _cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -308,6 +317,15 @@ cdef cudaError_t _cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, return cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice) {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaDeviceGetP2PAtomicCapabilities(capabilities, operations, count, srcDevice, dstDevice) + return cudaDeviceGetP2PAtomicCapabilities(capabilities, operations, count, srcDevice, dstDevice) +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t _cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -560,40 +578,22 @@ cdef cudaError_t _cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureSt return cudaStreamIsCapturing(stream, pCaptureStatus) {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t _cudaStreamGetCaptureInfo_v2(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaStreamGetCaptureInfo_v2(stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) - return cudaStreamGetCaptureInfo_v2(stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t _cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, 
unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaStreamGetCaptureInfo_v3(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) - return cudaStreamGetCaptureInfo_v3(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) + return ptds._cudaStreamGetCaptureInfo(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) + return cudaStreamGetCaptureInfo(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaStreamUpdateCaptureDependencies(stream, dependencies, numDependencies, flags) - return cudaStreamUpdateCaptureDependencies(stream, dependencies, numDependencies, flags) -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaStreamUpdateCaptureDependencies_v2(stream, dependencies, dependencyData, numDependencies, flags) - return cudaStreamUpdateCaptureDependencies_v2(stream, dependencies, dependencyData, numDependencies, flags) + return ptds._cudaStreamUpdateCaptureDependencies(stream, dependencies, dependencyData, numDependencies, flags) + return cudaStreamUpdateCaptureDependencies(stream, dependencies, dependencyData, numDependencies, flags) {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -668,15 +668,6 @@ cdef cudaError_t _cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t return cudaEventElapsedTime(ms, start, end) {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t _cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaEventElapsedTime_v2(ms, start, end) - return cudaEventElapsedTime_v2(ms, start, end) -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t _cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -722,22 +713,22 @@ cdef cudaError_t _cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_ou return cudaImportExternalSemaphore(extSem_out, semHandleDesc) {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaSignalExternalSemaphoresAsync_v2(const 
cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaSignalExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) - return cudaSignalExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) + return ptds._cudaSignalExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) + return cudaSignalExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaWaitExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) - return cudaWaitExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) + return ptds._cudaWaitExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) + return cudaWaitExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -1156,20 +1147,20 @@ cdef cudaError_t _cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, stream) - return cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, stream) + return ptds._cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, stream) + return cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, stream) {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return 
ptds._cudaMemcpy3DBatchAsync(numOps, opList, failIdx, flags, stream) - return cudaMemcpy3DBatchAsync(numOps, opList, failIdx, flags, stream) + return ptds._cudaMemcpy3DBatchAsync(numOps, opList, flags, stream) + return cudaMemcpy3DBatchAsync(numOps, opList, flags, stream) {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -1255,38 +1246,47 @@ cdef cudaError_t _cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cud {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaMemPrefetchAsync(devPtr, count, dstDevice, stream) - return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream) + return ptds._cudaMemPrefetchAsync(devPtr, count, location, flags, stream) + return cudaMemPrefetchAsync(devPtr, count, location, flags, stream) {{endif}} -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaMemPrefetchAsync_v2(devPtr, count, location, flags, stream) - return cudaMemPrefetchAsync_v2(devPtr, count, location, flags, stream) + return ptds._cudaMemPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) + return cudaMemPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaMemAdvise(devPtr, count, advice, device) - return cudaMemAdvise(devPtr, count, advice, device) + return ptds._cudaMemDiscardBatchAsync(dptrs, sizes, count, flags, stream) + return cudaMemDiscardBatchAsync(dptrs, sizes, count, flags, stream) {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return 
ptds._cudaMemAdvise_v2(devPtr, count, advice, location) - return cudaMemAdvise_v2(devPtr, count, advice, location) + return ptds._cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) + return cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaMemAdvise(devPtr, count, advice, location) + return cudaMemAdvise(devPtr, count, advice, location) {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -1433,6 +1433,33 @@ cdef cudaError_t _cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCal return cudaMemPoolDestroy(memPool) {{endif}} +{{if 'cudaMemGetDefaultMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaMemGetDefaultMemPool(memPool, location, typename) + return cudaMemGetDefaultMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaMemGetMemPool(memPool, location, typename) + return cudaMemGetMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaMemSetMemPool(location, typename, memPool) + return cudaMemSetMemPool(location, typename, memPool) +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t _cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1685,6 +1712,51 @@ cdef cudaError_t _cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCa return cudaRuntimeGetVersion(runtimeVersion) {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaLogsRegisterCallback(callbackFunc, userData, callback_out) + return cudaLogsRegisterCallback(callbackFunc, userData, callback_out) +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaLogsUnregisterCallback(callback) + return cudaLogsUnregisterCallback(callback) +{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t _cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except 
?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaLogsCurrent(iterator_out, flags) + return cudaLogsCurrent(iterator_out, flags) +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaLogsDumpToFile(iterator, pathToFile, flags) + return cudaLogsDumpToFile(iterator, pathToFile, flags) +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + cdef bint usePTDS = cudaPythonInit() + if usePTDS: + return ptds._cudaLogsDumpToMemory(iterator, buffer, size, flags) + return cudaLogsDumpToMemory(iterator, buffer, size, flags) +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t _cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1723,11 +1795,11 @@ cdef cudaError_t _cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaK {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphKernelNodeCopyAttributes(hSrc, hDst) - return cudaGraphKernelNodeCopyAttributes(hSrc, hDst) + return ptds._cudaGraphKernelNodeCopyAttributes(hDst, hSrc) + return cudaGraphKernelNodeCopyAttributes(hDst, hSrc) {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -2092,92 +2164,47 @@ cdef cudaError_t _cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoo {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaGraphGetEdges(graph, from_, to, numEdges) - return cudaGraphGetEdges(graph, from_, to, numEdges) -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t _cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphGetEdges_v2(graph, from_, to, edgeData, numEdges) - return cudaGraphGetEdges_v2(graph, from_, to, edgeData, numEdges) + return ptds._cudaGraphGetEdges(graph, from_, to, edgeData, numEdges) + return cudaGraphGetEdges(graph, from_, to, edgeData, numEdges) {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t 
_cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies) - return cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies) -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaGraphNodeGetDependencies_v2(node, pDependencies, edgeData, pNumDependencies) - return cudaGraphNodeGetDependencies_v2(node, pDependencies, edgeData, pNumDependencies) + return ptds._cudaGraphNodeGetDependencies(node, pDependencies, edgeData, pNumDependencies) + return cudaGraphNodeGetDependencies(node, pDependencies, edgeData, pNumDependencies) {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes) - return cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes) -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes) - return cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes) + return ptds._cudaGraphNodeGetDependentNodes(node, pDependentNodes, edgeData, pNumDependentNodes) + return cudaGraphNodeGetDependentNodes(node, pDependentNodes, edgeData, pNumDependentNodes) {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphAddDependencies(graph, from_, to, numDependencies) - return cudaGraphAddDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - 
return ptds._cudaGraphAddDependencies_v2(graph, from_, to, edgeData, numDependencies) - return cudaGraphAddDependencies_v2(graph, from_, to, edgeData, numDependencies) + return ptds._cudaGraphAddDependencies(graph, from_, to, edgeData, numDependencies) + return cudaGraphAddDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaGraphRemoveDependencies(graph, from_, to, numDependencies) - return cudaGraphRemoveDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphRemoveDependencies_v2(graph, from_, to, edgeData, numDependencies) - return cudaGraphRemoveDependencies_v2(graph, from_, to, edgeData, numDependencies) + return ptds._cudaGraphRemoveDependencies(graph, from_, to, edgeData, numDependencies) + return cudaGraphRemoveDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -2434,20 +2461,11 @@ cdef cudaError_t _cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: - cdef bint usePTDS = cudaPythonInit() - if usePTDS: - return ptds._cudaGraphAddNode(pGraphNode, graph, pDependencies, numDependencies, nodeParams) - return cudaGraphAddNode(pGraphNode, graph, pDependencies, numDependencies, nodeParams) -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: +cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: cdef bint usePTDS = cudaPythonInit() if usePTDS: - return ptds._cudaGraphAddNode_v2(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) - return cudaGraphAddNode_v2(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) + return ptds._cudaGraphAddNode(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) + return cudaGraphAddNode(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} 
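Note on the rebase pattern in the hunks above: CUDA 13.0 retires the versioned runtime entry points, so the generated wrappers now dispatch the unversioned names (cudaGetDeviceProperties, cudaStreamGetCaptureInfo, cudaMemPrefetchAsync, and so on), and signatures that previously took a bare device ordinal now take a cudaMemLocation. Below is a minimal illustrative sketch, not part of the patch, of how a caller adapts to the new cudaMemPrefetchAsync signature; it assumes the public cuda.bindings.runtime wrappers mirror the cdef signatures shown in these hunks, and the helper name and zero flags value are the author's assumptions (flags is a plain unsigned int here and is passed as 0).

from cuda.bindings import runtime

def prefetch_to_device(dptr, nbytes, device_id, stream):
    # The destination is now described by a cudaMemLocation rather than
    # the pre-13.0 bare dstDevice ordinal.
    loc = runtime.cudaMemLocation()
    loc.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice
    loc.id = device_id
    # Illustrative assumption: pass 0 for the trailing flags argument.
    err, = runtime.cudaMemPrefetchAsync(dptr, nbytes, loc, 0, stream)
    if err != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaMemPrefetchAsync failed: {err}")

The per-thread-default-stream dispatch visible throughout cyruntime.pyx.in (cudaPythonInit() followed by a ptds._cuda* call when PTDS is enabled) is unchanged by the rebase; only the underlying symbol names and signatures move, as the cyruntime_ptds diffs that follow show.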
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in index b4745b5c46..bcd21181de 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in @@ -134,9 +134,9 @@ cdef const char* _cudaGetErrorString(cudaError_t error) except ?NULL nogil cdef cudaError_t _cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} -cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGetDeviceProperties(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} @@ -144,6 +144,11 @@ cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) e cdef cudaError_t _cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t _cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil @@ -169,6 +174,11 @@ cdef cudaError_t _cudaDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, int cdef cudaError_t _cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t _cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil @@ -309,24 +319,14 @@ cdef cudaError_t _cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) cdef cudaError_t _cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureStatus* pCaptureStatus) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t _cudaStreamGetCaptureInfo_v2(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t _cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, 
cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -369,11 +369,6 @@ cdef cudaError_t _cudaEventDestroy(cudaEvent_t event) except ?cudaErrorCallRequi cdef cudaError_t _cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t _cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t _cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil @@ -399,14 +394,14 @@ cdef cudaError_t _cudaDestroyExternalMemory(cudaExternalMemory_t extMem) except cdef cudaError_t _cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const cudaExternalSemaphoreHandleDesc* semHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -641,12 +636,12 @@ cdef cudaError_t _cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpyBatchAsync(void** 
dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -696,22 +691,27 @@ cdef cudaError_t _cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cud {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -794,6 +794,21 @@ cdef cudaError_t _cudaMemPoolCreate(cudaMemPool_t* memPool, const cudaMemPoolPro cdef cudaError_t _cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaMemGetDefaultMemPool' in 
found_functions}} + +cdef cudaError_t _cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t _cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil @@ -934,6 +949,31 @@ cdef cudaError_t _cudaDriverGetVersion(int* driverVersion) except ?cudaErrorCall cdef cudaError_t _cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t _cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t _cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil @@ -956,7 +996,7 @@ cdef cudaError_t _cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaK {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1161,52 +1201,27 @@ cdef cudaError_t _cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoo {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t _cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, 
cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1351,12 +1366,7 @@ cdef cudaError_t _cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except 
?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in index 3ddab4e558..80c784c558 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in @@ -161,10 +161,10 @@ cdef cudaError_t _cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNe return cudaGetDeviceCount(count) {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} -cdef cudaError_t _cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGetDeviceProperties_v2(prop, device) +cdef cudaError_t _cudaGetDeviceProperties(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGetDeviceProperties(prop, device) {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} @@ -173,6 +173,12 @@ cdef cudaError_t _cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int de return cudaDeviceGetAttribute(value, attr, device) {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaDeviceGetHostAtomicCapabilities(capabilities, operations, count, device) +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t _cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -203,6 +209,12 @@ cdef cudaError_t _cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, return cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice) {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t _cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaDeviceGetP2PAtomicCapabilities(capabilities, operations, count, srcDevice, dstDevice) +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t _cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -371,28 +383,16 @@ cdef cudaError_t _cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureSt return cudaStreamIsCapturing(stream, pCaptureStatus) {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t _cudaStreamGetCaptureInfo_v2(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** 
dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaStreamGetCaptureInfo_v2_ptsz(stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t _cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaStreamGetCaptureInfo_v3(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) +cdef cudaError_t _cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaStreamGetCaptureInfo(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaStreamUpdateCaptureDependencies(stream, dependencies, numDependencies, flags) -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaStreamUpdateCaptureDependencies_v2(stream, dependencies, dependencyData, numDependencies, flags) +cdef cudaError_t _cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaStreamUpdateCaptureDependencies(stream, dependencies, dependencyData, numDependencies, flags) {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -443,12 +443,6 @@ cdef cudaError_t _cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t return cudaEventElapsedTime(ms, start, end) {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t _cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaEventElapsedTime_v2(ms, start, end) -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t _cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -479,16 +473,16 @@ cdef cudaError_t _cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_ou return cudaImportExternalSemaphore(extSem_out, semHandleDesc) {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* 
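cudaStreamGetCaptureInfo now resolves to the _v3-shaped signature, so an edge-data pointer comes back on every query. A sketch of the expected Python-level result, under the assumption that the generated wrapper returns the out-parameters in declaration order:

from cuda.bindings import runtime

err, stream = runtime.cudaStreamCreate()
assert err == runtime.cudaError_t.cudaSuccess
(err, status, capture_id, graph, deps, edge_data,
 num_deps) = runtime.cudaStreamGetCaptureInfo(stream)
assert err == runtime.cudaError_t.cudaSuccess
# Outside an active capture: status None and zero dependencies
assert status == runtime.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone
assert num_deps == 0
err, = runtime.cudaStreamDestroy(stream)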
paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaSignalExternalSemaphoresAsync_v2_ptsz(extSemArray, paramsArray, numExtSems, stream) +cdef cudaError_t _cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaSignalExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} -cdef cudaError_t _cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaWaitExternalSemaphoresAsync_v2_ptsz(extSemArray, paramsArray, numExtSems, stream) +cdef cudaError_t _cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaWaitExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -769,14 +763,14 @@ cdef cudaError_t _cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, stream) +cdef cudaError_t _cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, stream) {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemcpy3DBatchAsync(numOps, opList, failIdx, flags, stream) +cdef cudaError_t _cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemcpy3DBatchAsync(numOps, opList, flags, stream) {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -835,26 +829,32 @@ cdef cudaError_t _cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cud {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream) +cdef cudaError_t _cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemPrefetchAsync(devPtr, count, location, flags, stream) {{endif}} -{{if 'cudaMemPrefetchAsync_v2' in 
found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemPrefetchAsync_v2(devPtr, count, location, flags, stream) +cdef cudaError_t _cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemAdvise(devPtr, count, advice, device) +cdef cudaError_t _cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemDiscardBatchAsync(dptrs, sizes, count, flags, stream) {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t _cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaMemAdvise_v2(devPtr, count, advice, location) +cdef cudaError_t _cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t _cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemAdvise(devPtr, count, advice, location) {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -953,6 +953,24 @@ cdef cudaError_t _cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCal return cudaMemPoolDestroy(memPool) {{endif}} +{{if 'cudaMemGetDefaultMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemGetDefaultMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemGetMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t _cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaMemSetMemPool(location, typename, memPool) +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t _cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, 
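The new cudaMemGetDefaultMemPool / cudaMemGetMemPool / cudaMemSetMemPool generalize the old per-device pool accessors to an explicit (location, allocation type) pair. A sketch of the call shape, assuming the generated Python wrappers follow these cyruntime signatures ('typename' is the generator's rename of the C 'type' parameter):

from cuda.bindings import runtime

loc = runtime.cudaMemLocation()
loc.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice
loc.id = 0
alloc_type = runtime.cudaMemAllocationType.cudaMemAllocationTypePinned
err, pool = runtime.cudaMemGetDefaultMemPool(loc, alloc_type)
assert err == runtime.cudaError_t.cudaSuccess
# Make the default pool current again for this (location, type) pair
err, = runtime.cudaMemSetMemPool(loc, alloc_type, pool)
assert err == runtime.cudaError_t.cudaSuccess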
cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1121,6 +1139,36 @@ cdef cudaError_t _cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCa return cudaRuntimeGetVersion(runtimeVersion) {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaLogsRegisterCallback(callbackFunc, userData, callback_out) +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +cdef cudaError_t _cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaLogsUnregisterCallback(callback) +{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t _cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaLogsCurrent(iterator_out, flags) +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaLogsDumpToFile(iterator, pathToFile, flags) +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t _cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaLogsDumpToMemory(iterator, buffer, size, flags) +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t _cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1147,8 +1195,8 @@ cdef cudaError_t _cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaK {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphKernelNodeCopyAttributes(hSrc, hDst) +cdef cudaError_t _cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphKernelNodeCopyAttributes(hDst, hSrc) {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1393,62 +1441,32 @@ cdef cudaError_t _cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoo {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphGetEdges(graph, from_, to, numEdges) -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t _cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphGetEdges_v2(graph, from_, to, edgeData, numEdges) +cdef cudaError_t _cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphGetEdges(graph, from_, to, edgeData, numEdges) {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* 
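These cudaLogs* entry points mirror the driver's cuLogs* family on the runtime side. A sketch of dumping the current log buffer, assuming the usual out-parameter convention of the generated wrappers; the file name is a placeholder:

from cuda.bindings import runtime

err, it = runtime.cudaLogsCurrent(0)   # iterator at the current log tail
assert err == runtime.cudaError_t.cudaSuccess
err, = runtime.cudaLogsDumpToFile(it, b"cuda_log_dump.txt", 0)
assert err == runtime.cudaError_t.cudaSuccess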
pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies) -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphNodeGetDependencies_v2(node, pDependencies, edgeData, pNumDependencies) +cdef cudaError_t _cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphNodeGetDependencies(node, pDependencies, edgeData, pNumDependencies) {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes) -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t _cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes) +cdef cudaError_t _cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphNodeGetDependentNodes(node, pDependentNodes, edgeData, pNumDependentNodes) {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphAddDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphAddDependencies_v2(graph, from_, to, edgeData, numDependencies) +cdef cudaError_t _cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphAddDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphRemoveDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t _cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphRemoveDependencies_v2(graph, from_, to, 
edgeData, numDependencies) +cdef cudaError_t _cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphRemoveDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1621,14 +1639,8 @@ cdef cudaError_t _cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphAddNode(pGraphNode, graph, pDependencies, numDependencies, nodeParams) -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t _cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: - return cudaGraphAddNode_v2(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) +cdef cudaError_t _cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: + return cudaGraphAddNode(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pyx.in b/cuda_bindings/cuda/bindings/_lib/utils.pyx.in index ed85149b3b..3666f4822f 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pyx.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pyx.in @@ -381,7 +381,8 @@ cdef class HelperCUjit_option: {{if 'CU_JIT_TARGET_FROM_CUCONTEXT' in found_values}}cydriver.CUjit_option_enum.CU_JIT_TARGET_FROM_CUCONTEXT,{{endif}} {{if 'CU_JIT_REFERENCED_KERNEL_COUNT' in found_values}}cydriver.CUjit_option_enum.CU_JIT_REFERENCED_KERNEL_COUNT,{{endif}} {{if 'CU_JIT_REFERENCED_VARIABLE_COUNT' in found_values}}cydriver.CUjit_option_enum.CU_JIT_REFERENCED_VARIABLE_COUNT,{{endif}} - {{if 'CU_JIT_MIN_CTA_PER_SM' in found_values}}cydriver.CUjit_option_enum.CU_JIT_MIN_CTA_PER_SM,{{endif}}): + {{if 'CU_JIT_MIN_CTA_PER_SM' in found_values}}cydriver.CUjit_option_enum.CU_JIT_MIN_CTA_PER_SM,{{endif}} + {{if 'CU_JIT_SPLIT_COMPILE' in found_values}}cydriver.CUjit_option_enum.CU_JIT_SPLIT_COMPILE,{{endif}}): self._uint = init_value self._cptr = self._uint elif self._attr in ({{if 'CU_JIT_WALL_TIME' in found_values}}cydriver.CUjit_option_enum.CU_JIT_WALL_TIME,{{endif}}): diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index 7f0ffca84a..afc7379d83 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -557,7 +557,11 @@ cdef extern from "cuda.h": CU_DEVICE_ATTRIBUTE_HOST_NUMA_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 141 CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED = 142 CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143 - CU_DEVICE_ATTRIBUTE_MAX = 144 + CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED = 144 + CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 145 + 
CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = 146 + CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = 147 + CU_DEVICE_ATTRIBUTE_MAX = 148 ctypedef CUdevice_attribute_enum CUdevice_attribute @@ -717,7 +721,8 @@ cdef extern from "cuda.h": CU_JIT_MIN_CTA_PER_SM = 31 CU_JIT_MAX_THREADS_PER_BLOCK = 32 CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33 - CU_JIT_NUM_OPTIONS = 34 + CU_JIT_SPLIT_COMPILE = 34 + CU_JIT_NUM_OPTIONS = 35 ctypedef CUjit_option_enum CUjit_option @@ -743,17 +748,20 @@ cdef extern from "cuda.h": CU_TARGET_COMPUTE_100 = 100 CU_TARGET_COMPUTE_101 = 101 CU_TARGET_COMPUTE_103 = 103 + CU_TARGET_COMPUTE_110 = 110 CU_TARGET_COMPUTE_120 = 120 CU_TARGET_COMPUTE_121 = 121 CU_TARGET_COMPUTE_90A = 65626 CU_TARGET_COMPUTE_100A = 65636 CU_TARGET_COMPUTE_101A = 65637 CU_TARGET_COMPUTE_103A = 65639 + CU_TARGET_COMPUTE_110A = 65646 CU_TARGET_COMPUTE_120A = 65656 CU_TARGET_COMPUTE_121A = 65657 CU_TARGET_COMPUTE_100F = 131172 CU_TARGET_COMPUTE_101F = 131173 CU_TARGET_COMPUTE_103F = 131175 + CU_TARGET_COMPUTE_110F = 131182 CU_TARGET_COMPUTE_120F = 131192 CU_TARGET_COMPUTE_121F = 131193 @@ -1048,6 +1056,7 @@ cdef extern from "cuda.h": CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13 CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 + CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING = 16 ctypedef CUlaunchAttributeID_enum CUlaunchAttributeID @@ -1090,6 +1099,7 @@ cdef extern from "cuda.h": anon_struct4 preferredClusterDim anon_struct5 deviceUpdatableKernelNode unsigned int sharedMemCarveout + unsigned int nvlinkUtilCentricScheduling ctypedef CUlaunchAttributeValue_union CUlaunchAttributeValue @@ -1222,6 +1232,7 @@ cdef extern from "cuda.h": CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 CUDA_ERROR_STUB_LIBRARY = 34 + CUDA_ERROR_CALL_REQUIRES_NEWER_DRIVER = 36 CUDA_ERROR_DEVICE_UNAVAILABLE = 46 CUDA_ERROR_NO_DEVICE = 100 CUDA_ERROR_INVALID_DEVICE = 101 @@ -1251,6 +1262,7 @@ cdef extern from "cuda.h": CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224 CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225 CUDA_ERROR_CONTAINED = 226 + CUDA_ERROR_NVLINK_ENCRYPTION_FAILED = 227 CUDA_ERROR_INVALID_SOURCE = 300 CUDA_ERROR_FILE_NOT_FOUND = 301 CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 @@ -1321,9 +1333,39 @@ cdef extern from "cuda.h": CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 3 CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 4 CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 4 + CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED = 5 ctypedef CUdevice_P2PAttribute_enum CUdevice_P2PAttribute + cdef enum CUatomicOperation_enum: + CU_ATOMIC_OPERATION_INTEGER_ADD = 0 + CU_ATOMIC_OPERATION_INTEGER_MIN = 1 + CU_ATOMIC_OPERATION_INTEGER_MAX = 2 + CU_ATOMIC_OPERATION_INTEGER_INCREMENT = 3 + CU_ATOMIC_OPERATION_INTEGER_DECREMENT = 4 + CU_ATOMIC_OPERATION_AND = 5 + CU_ATOMIC_OPERATION_OR = 6 + CU_ATOMIC_OPERATION_XOR = 7 + CU_ATOMIC_OPERATION_EXCHANGE = 8 + CU_ATOMIC_OPERATION_CAS = 9 + CU_ATOMIC_OPERATION_FLOAT_ADD = 10 + CU_ATOMIC_OPERATION_FLOAT_MIN = 11 + CU_ATOMIC_OPERATION_FLOAT_MAX = 12 + CU_ATOMIC_OPERATION_MAX = 13 + + ctypedef CUatomicOperation_enum CUatomicOperation + + cdef enum CUatomicOperationCapability_enum: + CU_ATOMIC_CAPABILITY_SIGNED = 1 + CU_ATOMIC_CAPABILITY_UNSIGNED = 2 + CU_ATOMIC_CAPABILITY_REDUCTION = 4 + CU_ATOMIC_CAPABILITY_SCALAR_32 = 8 + CU_ATOMIC_CAPABILITY_SCALAR_64 = 16 + CU_ATOMIC_CAPABILITY_SCALAR_128 = 32 + CU_ATOMIC_CAPABILITY_VECTOR_32x4 = 64 + + ctypedef 
CUatomicOperationCapability_enum CUatomicOperationCapability + ctypedef void (*CUstreamCallback)(CUstream hStream, CUresult status, void* userData) ctypedef size_t (*CUoccupancyB2DSize)(int blockSize) @@ -1679,6 +1721,7 @@ cdef extern from "cuda.h": CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6 CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7 CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 + CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD = 9 ctypedef CUexternalMemoryHandleType_enum CUexternalMemoryHandleType @@ -1862,6 +1905,7 @@ cdef extern from "cuda.h": cdef enum CUmemLocationType_enum: CU_MEM_LOCATION_TYPE_INVALID = 0 + CU_MEM_LOCATION_TYPE_NONE = 0 CU_MEM_LOCATION_TYPE_DEVICE = 1 CU_MEM_LOCATION_TYPE_HOST = 2 CU_MEM_LOCATION_TYPE_HOST_NUMA = 3 @@ -1873,6 +1917,7 @@ cdef extern from "cuda.h": cdef enum CUmemAllocationType_enum: CU_MEM_ALLOCATION_TYPE_INVALID = 0 CU_MEM_ALLOCATION_TYPE_PINNED = 1 + CU_MEM_ALLOCATION_TYPE_MANAGED = 2 CU_MEM_ALLOCATION_TYPE_MAX = 2147483647 ctypedef CUmemAllocationType_enum CUmemAllocationType @@ -2310,10 +2355,11 @@ cdef extern from "cuda.h": ctypedef CUcheckpointCheckpointArgs_st CUcheckpointCheckpointArgs - cdef struct CUcheckpointRestoreArgs_st: - cuuint64_t reserved[8] + cdef struct CUcheckpointGpuPair_st: + CUuuid oldUuid + CUuuid newUuid - ctypedef CUcheckpointRestoreArgs_st CUcheckpointRestoreArgs + ctypedef CUcheckpointGpuPair_st CUcheckpointGpuPair cdef struct CUcheckpointUnlockArgs_st: cuuint64_t reserved[8] @@ -2391,6 +2437,8 @@ cdef extern from "cuda.h": cdef struct CUdevSmResource_st: unsigned int smCount + unsigned int minSmPartitionSize + unsigned int smCoscheduledAlignment ctypedef CUdevSmResource_st CUdevSmResource @@ -2663,14 +2711,9 @@ cdef CUresult cuDeviceGetCount(int* count) except ?CUDA_ERROR_NOT_FOUND nogil cdef CUresult cuDeviceGetName(char* name, int length, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuDeviceGetUuid' in found_functions}} - -cdef CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuDeviceGetUuid_v2' in found_functions}} -cdef CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuDeviceGetLuid' in found_functions}} @@ -2693,6 +2736,11 @@ cdef CUresult cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CUa cdef CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef CUresult cuDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} cdef CUresult cuDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, CUdevice dev, int flags) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2758,19 +2806,9 @@ cdef CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* cdef CUresult cuDevicePrimaryCtxReset(CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuCtxCreate_v2' in found_functions}} - -cdef CUresult cuCtxCreate(CUcontext* pctx, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuCtxCreate_v3' in found_functions}} - -cdef CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, 
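CUatomicOperation and CUatomicOperationCapability back the new cuDeviceGetHostAtomicCapabilities / cuDeviceGetP2PAtomicCapabilities queries: callers pass an array of operations and receive one capability bitmask per operation. A sketch under the assumption that the generated wrapper accepts a Python list and returns the mask array:

from cuda.bindings import driver

err, = driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
ops = [driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_ADD,
       driver.CUatomicOperation.CU_ATOMIC_OPERATION_CAS]
err, caps = driver.cuDeviceGetHostAtomicCapabilities(ops, len(ops), dev)
assert err == driver.CUresult.CUDA_SUCCESS
# caps[i] is a CUatomicOperationCapability bitmask for ops[i]
if caps[0] & driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_REDUCTION:
    print("host-visible atomic ADD supports reductions")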
unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} -cdef CUresult cuCtxCreate_v4(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuCtxCreate(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuCtxDestroy_v2' in found_functions}} @@ -2803,6 +2841,11 @@ cdef CUresult cuCtxGetCurrent(CUcontext* pctx) except ?CUDA_ERROR_NOT_FOUND nogi cdef CUresult cuCtxGetDevice(CUdevice* device) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}} + +cdef CUresult cuCtxGetDevice_v2(CUdevice* device, CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuCtxGetFlags' in found_functions}} cdef CUresult cuCtxGetFlags(unsigned int* flags) except ?CUDA_ERROR_NOT_FOUND nogil @@ -2823,6 +2866,11 @@ cdef CUresult cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) except ?CUDA_ cdef CUresult cuCtxSynchronize() except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}} + +cdef CUresult cuCtxSynchronize_v2(CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuCtxSetLimit' in found_functions}} cdef CUresult cuCtxSetLimit(CUlimit limit, size_t value) except ?CUDA_ERROR_NOT_FOUND nogil @@ -3293,14 +3341,14 @@ cdef CUresult cuMemcpy3DAsync(const CUDA_MEMCPY3D* pCopy, CUstream hStream) exce cdef CUresult cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER* pCopy, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}} -cdef CUresult cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} -cdef CUresult cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, size_t* failIdx, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} @@ -3553,6 +3601,21 @@ cdef CUresult cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* poolProp cdef CUresult cuMemPoolDestroy(CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuMemGetDefaultMemPool' in found_functions}} + +cdef CUresult cuMemGetDefaultMemPool(CUmemoryPool* pool_out, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemGetMemPool' in found_functions}} + +cdef CUresult cuMemGetMemPool(CUmemoryPool* pool, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemSetMemPool' in found_functions}} + +cdef CUresult cuMemSetMemPool(CUmemLocation* location, CUmemAllocationType typename, CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 
'cuMemAllocFromPoolAsync' in found_functions}} cdef CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil @@ -3613,24 +3676,29 @@ cdef CUresult cuMulticastGetGranularity(size_t* granularity, const CUmulticastOb cdef CUresult cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}} +{{if 'cuMemPrefetchAsync_v2' in found_functions}} -cdef CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemPrefetchAsync_v2' in found_functions}} +{{if 'cuMemAdvise_v2' in found_functions}} -cdef CUresult cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemAdvise' in found_functions}} +{{if 'cuMemPrefetchBatchAsync' in found_functions}} -cdef CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuMemAdvise_v2' in found_functions}} +{{if 'cuMemDiscardBatchAsync' in found_functions}} + +cdef CUresult cuMemDiscardBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + +{{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef CUresult cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuMemDiscardAndPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuMemRangeGetAttribute' in found_functions}} @@ -3728,24 +3796,14 @@ cdef CUresult cuStreamEndCapture(CUstream hStream, CUgraph* phGraph) except ?CUD cdef CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureStatus) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - -cdef CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} -cdef CUresult cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - -cdef CUresult 
cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} -cdef CUresult cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuStreamAttachMemAsync' in found_functions}} @@ -3813,14 +3871,9 @@ cdef CUresult cuEventSynchronize(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND no cdef CUresult cuEventDestroy(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuEventElapsedTime' in found_functions}} - -cdef CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} -cdef CUresult cuEventElapsedTime_v2(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuImportExternalMemory' in found_functions}} @@ -4228,54 +4281,29 @@ cdef CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNod cdef CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphGetEdges' in found_functions}} - -cdef CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphGetEdges_v2' in found_functions}} -cdef CUresult cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuGraphNodeGetDependencies' in found_functions}} - -cdef CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} -cdef CUresult cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuGraphNodeGetDependentNodes' in found_functions}} - -cdef CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} -cdef CUresult cuGraphNodeGetDependentNodes_v2(CUgraphNode 
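cuEventElapsedTime now binds to cuEventElapsedTime_v2; only the underlying symbol changes, the call pattern itself is unchanged. A minimal timing sketch using long-standing entry points (the primary context is used here to sidestep the reshaped cuCtxCreate):

from cuda.bindings import driver

err, = driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
err, ctx = driver.cuDevicePrimaryCtxRetain(dev)
err, = driver.cuCtxSetCurrent(ctx)
err, start = driver.cuEventCreate(0)
err, stop = driver.cuEventCreate(0)
err, = driver.cuEventRecord(start, 0)   # default stream
err, = driver.cuEventRecord(stop, 0)
err, = driver.cuEventSynchronize(stop)
err, ms = driver.cuEventElapsedTime(start, stop)
print(f"elapsed: {ms:.3f} ms")
err, = driver.cuDevicePrimaryCtxRelease(dev)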
hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuGraphAddDependencies' in found_functions}} - -cdef CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphAddDependencies_v2' in found_functions}} -cdef CUresult cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - -{{if 'cuGraphRemoveDependencies' in found_functions}} - -cdef CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphRemoveDependencies_v2' in found_functions}} -cdef CUresult cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphDestroyNode' in found_functions}} @@ -4423,14 +4451,9 @@ cdef CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsign cdef CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuGraphAddNode' in found_functions}} - -cdef CUresult cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} -cdef CUresult cuGraphAddNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil +cdef CUresult cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} {{if 'cuGraphNodeSetParams' in found_functions}} @@ -4708,6 +4731,11 @@ cdef CUresult cuCtxDisablePeerAccess(CUcontext peerContext) except ?CUDA_ERROR_N cdef CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef CUresult cuDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} cdef CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource) except ?CUDA_ERROR_NOT_FOUND nogil @@ -4833,6 
+4861,11 @@ cdef CUresult cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx* phCtx) except ?C cdef CUresult cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} +{{if 'cuGreenCtxGetId' in found_functions}} + +cdef CUresult cuGreenCtxGetId(CUgreenCtx greenCtx, unsigned long long* greenCtxId) except ?CUDA_ERROR_NOT_FOUND nogil +{{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} cdef CUresult cuLogsRegisterCallback(CUlogsCallback callbackFunc, void* userData, CUlogsCallbackHandle* callback_out) except ?CUDA_ERROR_NOT_FOUND nogil @@ -4878,11 +4911,6 @@ cdef CUresult cuCheckpointProcessLock(int pid, CUcheckpointLockArgs* args) excep cdef CUresult cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}} - -cdef CUresult cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil -{{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} cdef CUresult cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil @@ -4993,7 +5021,7 @@ cdef CUresult cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaResou cdef CUresult cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} -cdef enum: CUDA_VERSION = 12090 +cdef enum: CUDA_VERSION = 13000 cdef enum: CU_IPC_HANDLE_SIZE = 64 diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in index 5b5752b390..8fced4ff22 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in @@ -46,15 +46,9 @@ cdef CUresult cuDeviceGetName(char* name, int length, CUdevice dev) except ?CUDA return cydriver._cuDeviceGetName(name, length, dev) {{endif}} -{{if 'cuDeviceGetUuid' in found_functions}} - -cdef CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuDeviceGetUuid(uuid, dev) -{{endif}} - {{if 'cuDeviceGetUuid_v2' in found_functions}} -cdef CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuDeviceGetUuid_v2(uuid, dev) {{endif}} @@ -82,6 +76,12 @@ cdef CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice return cydriver._cuDeviceGetAttribute(pi, attrib, dev) {{endif}} +{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef CUresult cuDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuDeviceGetHostAtomicCapabilities(capabilities, operations, count, dev) +{{endif}} + {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}} cdef CUresult cuDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, CUdevice dev, int flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -160,21 +160,9 @@ cdef CUresult cuDevicePrimaryCtxReset(CUdevice dev) except ?CUDA_ERROR_NOT_FOUND return cydriver._cuDevicePrimaryCtxReset_v2(dev) {{endif}} -{{if 'cuCtxCreate_v2' in found_functions}} - -cdef CUresult cuCtxCreate(CUcontext* pctx, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - return 
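With CUDA_VERSION bumped from 12090 to 13000, code that must straddle both bindings can gate on the reported versions at run time; cuDriverGetVersion and cudaRuntimeGetVersion are untouched by this patch:

from cuda.bindings import driver, runtime

err, drv_ver = driver.cuDriverGetVersion()
err2, rt_ver = runtime.cudaRuntimeGetVersion()
# Encoded as major*1000 + minor*10, so CUDA 13.0 reports 13000
if drv_ver >= 13000:
    print(f"driver {drv_ver}, runtime {rt_ver}: 13.0 signatures in effect")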
cydriver._cuCtxCreate_v2(pctx, flags, dev) -{{endif}} - -{{if 'cuCtxCreate_v3' in found_functions}} - -cdef CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuCtxCreate_v3(pctx, paramsArray, numParams, flags, dev) -{{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} -cdef CUresult cuCtxCreate_v4(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuCtxCreate(CUcontext* pctx, CUctxCreateParams* ctxCreateParams, unsigned int flags, CUdevice dev) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuCtxCreate_v4(pctx, ctxCreateParams, flags, dev) {{endif}} @@ -214,6 +202,12 @@ cdef CUresult cuCtxGetDevice(CUdevice* device) except ?CUDA_ERROR_NOT_FOUND nogi return cydriver._cuCtxGetDevice(device) {{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}} + +cdef CUresult cuCtxGetDevice_v2(CUdevice* device, CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuCtxGetDevice_v2(device, ctx) +{{endif}} + {{if 'cuCtxGetFlags' in found_functions}} cdef CUresult cuCtxGetFlags(unsigned int* flags) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -238,6 +232,12 @@ cdef CUresult cuCtxSynchronize() except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuCtxSynchronize() {{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}} + +cdef CUresult cuCtxSynchronize_v2(CUcontext ctx) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuCtxSynchronize_v2(ctx) +{{endif}} + {{if 'cuCtxSetLimit' in found_functions}} cdef CUresult cuCtxSetLimit(CUlimit limit, size_t value) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -802,16 +802,16 @@ cdef CUresult cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER* pCopy, CUstream hStr return cydriver._cuMemcpy3DPeerAsync(pCopy, hStream) {{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}} -cdef CUresult cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, hStream) +cdef CUresult cuMemcpyBatchAsync(CUdeviceptr* dsts, CUdeviceptr* srcs, size_t* sizes, size_t count, CUmemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemcpyBatchAsync_v2(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, hStream) {{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} -cdef CUresult cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, size_t* failIdx, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemcpy3DBatchAsync(numOps, opList, failIdx, flags, hStream) +cdef CUresult cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP* opList, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemcpy3DBatchAsync_v2(numOps, opList, flags, hStream) {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} @@ -1114,6 +1114,24 @@ cdef CUresult cuMemPoolDestroy(CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND n return cydriver._cuMemPoolDestroy(pool) {{endif}} +{{if 'cuMemGetDefaultMemPool' in found_functions}} + +cdef 
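cuCtxCreate now resolves to cuCtxCreate_v4, so the flags-only (_v2) and exec-affinity (_v3) spellings are gone and the first argument is a CUctxCreateParams pointer. A sketch assuming the Python wrapper tracks this signature and maps None to a NULL params pointer (driver defaults):

from cuda.bindings import driver

err, = driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
err, ctx = driver.cuCtxCreate(None, 0, dev)   # params, flags, device
assert err == driver.CUresult.CUDA_SUCCESS
err, = driver.cuCtxDestroy(ctx)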
CUresult cuMemGetDefaultMemPool(CUmemoryPool* pool_out, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemGetDefaultMemPool(pool_out, location, typename) +{{endif}} + +{{if 'cuMemGetMemPool' in found_functions}} + +cdef CUresult cuMemGetMemPool(CUmemoryPool* pool, CUmemLocation* location, CUmemAllocationType typename) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemGetMemPool(pool, location, typename) +{{endif}} + +{{if 'cuMemSetMemPool' in found_functions}} + +cdef CUresult cuMemSetMemPool(CUmemLocation* location, CUmemAllocationType typename, CUmemoryPool pool) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemSetMemPool(location, typename, pool) +{{endif}} + {{if 'cuMemAllocFromPoolAsync' in found_functions}} cdef CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -1186,28 +1204,34 @@ cdef CUresult cuPointerGetAttribute(void* data, CUpointer_attribute attribute, C return cydriver._cuPointerGetAttribute(data, attribute, ptr) {{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}} +{{if 'cuMemPrefetchAsync_v2' in found_functions}} -cdef CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemPrefetchAsync(devPtr, count, dstDevice, hStream) +cdef CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemPrefetchAsync_v2(devPtr, count, location, flags, hStream) {{endif}} -{{if 'cuMemPrefetchAsync_v2' in found_functions}} +{{if 'cuMemAdvise_v2' in found_functions}} -cdef CUresult cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemPrefetchAsync_v2(devPtr, count, location, flags, hStream) +cdef CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemAdvise_v2(devPtr, count, advice, location) {{endif}} -{{if 'cuMemAdvise' in found_functions}} +{{if 'cuMemPrefetchBatchAsync' in found_functions}} -cdef CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemAdvise(devPtr, count, advice, device) +cdef CUresult cuMemPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, hStream) {{endif}} -{{if 'cuMemAdvise_v2' in found_functions}} +{{if 'cuMemDiscardBatchAsync' in found_functions}} -cdef CUresult cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuMemAdvise_v2(devPtr, count, advice, location) +cdef CUresult cuMemDiscardBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemDiscardBatchAsync(dptrs, sizes, count, flags, hStream) +{{endif}} + +{{if 'cuMemDiscardAndPrefetchBatchAsync' 
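cuMemAdvise and cuMemPrefetchAsync now take a CUmemLocation (plus a flags word for the prefetch) instead of a bare device ordinal, matching the former _v2 behavior. A sketch on managed memory, assuming device 0 and that the Python wrappers mirror these cydriver signatures:

from cuda.bindings import driver

err, = driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
err, ctx = driver.cuDevicePrimaryCtxRetain(dev)
err, = driver.cuCtxSetCurrent(ctx)
nbytes = 1 << 20
err, dptr = driver.cuMemAllocManaged(
    nbytes, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL)
loc = driver.CUmemLocation()
loc.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
loc.id = 0                                    # device ordinal now lives here
err, = driver.cuMemAdvise(
    dptr, nbytes,
    driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, loc)
err, = driver.cuMemPrefetchAsync(dptr, nbytes, loc, 0, 0)  # flags=0, stream 0
err, = driver.cuCtxSynchronize()
err, = driver.cuMemFree(dptr)
err, = driver.cuDevicePrimaryCtxRelease(dev)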
in found_functions}} + +cdef CUresult cuMemDiscardAndPrefetchBatchAsync(CUdeviceptr* dptrs, size_t* sizes, size_t count, CUmemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, CUstream hStream) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuMemDiscardAndPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, hStream) {{endif}} {{if 'cuMemRangeGetAttribute' in found_functions}} @@ -1324,27 +1348,15 @@ cdef CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captu return cydriver._cuStreamIsCapturing(hStream, captureStatus) {{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}} - -cdef CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuStreamGetCaptureInfo_v2(hStream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) -{{endif}} - {{if 'cuStreamGetCaptureInfo_v3' in found_functions}} -cdef CUresult cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus* captureStatus_out, cuuint64_t* id_out, CUgraph* graph_out, const CUgraphNode** dependencies_out, const CUgraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuStreamGetCaptureInfo_v3(hStream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) {{endif}} -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - -cdef CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuStreamUpdateCaptureDependencies(hStream, dependencies, numDependencies, flags) -{{endif}} - {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} -cdef CUresult cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuStreamUpdateCaptureDependencies_v2(hStream, dependencies, dependencyData, numDependencies, flags) {{endif}} @@ -1426,15 +1438,9 @@ cdef CUresult cuEventDestroy(CUevent hEvent) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuEventDestroy_v2(hEvent) {{endif}} -{{if 'cuEventElapsedTime' in found_functions}} - -cdef CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuEventElapsedTime(pMilliseconds, hStart, hEnd) -{{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} -cdef CUresult cuEventElapsedTime_v2(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) except ?CUDA_ERROR_NOT_FOUND nogil: return 
cydriver._cuEventElapsedTime_v2(pMilliseconds, hStart, hEnd) {{endif}} @@ -1924,63 +1930,33 @@ cdef CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t return cydriver._cuGraphGetRootNodes(hGraph, rootNodes, numRootNodes) {{endif}} -{{if 'cuGraphGetEdges' in found_functions}} - -cdef CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphGetEdges(hGraph, from_, to, numEdges) -{{endif}} - {{if 'cuGraphGetEdges_v2' in found_functions}} -cdef CUresult cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from_, CUgraphNode* to, CUgraphEdgeData* edgeData, size_t* numEdges) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphGetEdges_v2(hGraph, from_, to, edgeData, numEdges) {{endif}} -{{if 'cuGraphNodeGetDependencies' in found_functions}} - -cdef CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphNodeGetDependencies(hNode, dependencies, numDependencies) -{{endif}} - {{if 'cuGraphNodeGetDependencies_v2' in found_functions}} -cdef CUresult cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, CUgraphEdgeData* edgeData, size_t* numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphNodeGetDependencies_v2(hNode, dependencies, edgeData, numDependencies) {{endif}} -{{if 'cuGraphNodeGetDependentNodes' in found_functions}} - -cdef CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphNodeGetDependentNodes(hNode, dependentNodes, numDependentNodes) -{{endif}} - {{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} -cdef CUresult cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, CUgraphEdgeData* edgeData, size_t* numDependentNodes) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphNodeGetDependentNodes_v2(hNode, dependentNodes, edgeData, numDependentNodes) {{endif}} -{{if 'cuGraphAddDependencies' in found_functions}} - -cdef CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphAddDependencies(hGraph, from_, to, numDependencies) -{{endif}} - {{if 'cuGraphAddDependencies_v2' in found_functions}} -cdef CUresult cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphAddDependencies_v2(hGraph, from_, to, edgeData, numDependencies) {{endif}} -{{if 'cuGraphRemoveDependencies' in 
found_functions}} - -cdef CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphRemoveDependencies(hGraph, from_, to, numDependencies) -{{endif}} - {{if 'cuGraphRemoveDependencies_v2' in found_functions}} -cdef CUresult cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from_, const CUgraphNode* to, const CUgraphEdgeData* edgeData, size_t numDependencies) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphRemoveDependencies_v2(hGraph, from_, to, edgeData, numDependencies) {{endif}} @@ -2158,15 +2134,9 @@ cdef CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsig return cydriver._cuGraphReleaseUserObject(graph, object, count) {{endif}} -{{if 'cuGraphAddNode' in found_functions}} - -cdef CUresult cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuGraphAddNode(phGraphNode, hGraph, dependencies, numDependencies, nodeParams) -{{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} -cdef CUresult cuGraphAddNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil: +cdef CUresult cuGraphAddNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, const CUgraphEdgeData* dependencyData, size_t numDependencies, CUgraphNodeParams* nodeParams) except ?CUDA_ERROR_NOT_FOUND nogil: return cydriver._cuGraphAddNode_v2(phGraphNode, hGraph, dependencies, dependencyData, numDependencies, nodeParams) {{endif}} @@ -2500,6 +2470,12 @@ cdef CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, return cydriver._cuDeviceGetP2PAttribute(value, attrib, srcDevice, dstDevice) {{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef CUresult cuDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const CUatomicOperation* operations, unsigned int count, CUdevice srcDevice, CUdevice dstDevice) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuDeviceGetP2PAtomicCapabilities(capabilities, operations, count, srcDevice, dstDevice) +{{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} cdef CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -2650,6 +2626,12 @@ cdef CUresult cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, un return cydriver._cuGreenCtxStreamCreate(phStream, greenCtx, flags, priority) {{endif}} +{{if 'cuGreenCtxGetId' in found_functions}} + +cdef CUresult cuGreenCtxGetId(CUgreenCtx greenCtx, unsigned long long* greenCtxId) except ?CUDA_ERROR_NOT_FOUND nogil: + return cydriver._cuGreenCtxGetId(greenCtx, greenCtxId) +{{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} cdef CUresult cuLogsRegisterCallback(CUlogsCallback callbackFunc, void* userData, CUlogsCallbackHandle* callback_out) except ?CUDA_ERROR_NOT_FOUND nogil: @@ -2704,12 +2686,6 @@ cdef CUresult cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs* return cydriver._cuCheckpointProcessCheckpoint(pid, 
args) {{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}} - -cdef CUresult cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil: - return cydriver._cuCheckpointProcessRestore(pid, args) -{{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} cdef CUresult cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs* args) except ?CUDA_ERROR_NOT_FOUND nogil: diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index a432f21558..c1b5dfd9c1 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -25,6 +25,7 @@ cdef extern from "nvrtc.h": NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED = 14 NVRTC_ERROR_PCH_CREATE = 15 NVRTC_ERROR_CANCELLED = 16 + NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED = 17 cdef struct _nvrtcProgram: pass @@ -85,16 +86,6 @@ cdef nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t* cubinSizeRet) exce cdef nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char* cubin) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}} - -cdef nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t* nvvmSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil -{{endif}} - -{{if 'nvrtcGetNVVM' in found_functions}} - -cdef nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char* nvvm) except ?NVRTC_ERROR_INVALID_INPUT nogil -{{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} cdef nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t* LTOIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index a19e081c3a..e53a96ebd2 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -70,18 +70,6 @@ cdef nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char* cubin) except ?NVRTC_ERR return cynvrtc._nvrtcGetCUBIN(prog, cubin) {{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}} - -cdef nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t* nvvmSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil: - return cynvrtc._nvrtcGetNVVMSize(prog, nvvmSizeRet) -{{endif}} - -{{if 'nvrtcGetNVVM' in found_functions}} - -cdef nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char* nvvm) except ?NVRTC_ERROR_INVALID_INPUT nogil: - return cynvrtc._nvrtcGetNVVM(prog, nvvm) -{{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} cdef nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t* LTOIRSizeRet) except ?NVRTC_ERROR_INVALID_INPUT nogil: diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index 83083ab6a7..751875cce0 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -176,12 +176,12 @@ cdef struct cudaEglPlaneDesc_st: ctypedef cudaEglPlaneDesc_st cudaEglPlaneDesc -cdef union anon_union11: +cdef union anon_union9: cudaArray_t pArray[3] cudaPitchedPtr pPitch[3] cdef struct cudaEglFrame_st: - anon_union11 frame + anon_union9 frame cudaEglPlaneDesc planeDesc[3] unsigned int planeCount cudaEglFrameType frameType @@ -329,7 +329,7 @@ cdef const char* cudaGetErrorString(cudaError_t error) except ?NULL nogil cdef cudaError_t cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} cdef cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int 
device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} @@ -339,6 +339,11 @@ cdef cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) excep cdef cudaError_t cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil @@ -364,6 +369,11 @@ cdef cudaError_t cudaDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, int d cdef cudaError_t cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil @@ -504,24 +514,14 @@ cdef cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) cdef cudaError_t cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureStatus* pCaptureStatus) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* 
dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -564,11 +564,6 @@ cdef cudaError_t cudaEventDestroy(cudaEvent_t event) except ?cudaErrorCallRequir cdef cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil @@ -594,12 +589,12 @@ cdef cudaError_t cudaDestroyExternalMemory(cudaExternalMemory_t extMem) except ? cdef cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const cudaExternalSemaphoreHandleDesc* semHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} cdef cudaError_t cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} cdef cudaError_t cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} @@ -836,12 +831,12 @@ cdef cudaError_t cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -891,22 +886,27 @@ cdef cudaError_t cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cuda {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil 
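Note: as the hunk above shows, CUDA 13.0 folds cudaMemPrefetchAsync_v2 into cudaMemPrefetchAsync, replacing the bare "int dstDevice" with a cudaMemLocation plus a flags word. A minimal Python-layer sketch of the new call shape, assuming devPtr, nbytes, and stream come from earlier cudaMallocManaged/cudaStreamCreate calls (those names are illustrative, not part of this patch):

    from cuda.bindings import runtime

    # Build the prefetch destination: device 0, expressed as a cudaMemLocation
    loc = runtime.cudaMemLocation()
    loc.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice
    loc.id = 0

    # New 13.0 signature: (devPtr, count, location, flags, stream)
    err, = runtime.cudaMemPrefetchAsync(devPtr, nbytes, loc, 0, stream)

The same cudaMemLocation-based addressing now also applies to cudaMemAdvise, per the hunk that follows.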
{{endif}} -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -989,6 +989,21 @@ cdef cudaError_t cudaMemPoolCreate(cudaMemPool_t* memPool, const cudaMemPoolProp cdef cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaMemGetDefaultMemPool' in found_functions}} + +cdef cudaError_t cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil @@ -1129,6 +1144,31 @@ cdef cudaError_t cudaDriverGetVersion(int* driverVersion) except ?cudaErrorCallR cdef cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +cdef cudaError_t 
cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil @@ -1151,7 +1191,7 @@ cdef cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaKe {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1356,52 +1396,27 @@ cdef cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoot {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, 
cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1546,12 +1561,7 @@ cdef cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil +cdef cudaError_t cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} @@ -1862,10 +1872,6 @@ cdef enum: cudaInvalidDeviceId = -2 cdef enum: cudaInitDeviceFlagsAreValid = 1 -cdef enum: cudaCooperativeLaunchMultiDeviceNoPreSync = 1 - -cdef enum: cudaCooperativeLaunchMultiDeviceNoPostSync = 2 - cdef enum: cudaArraySparsePropertiesSingleMipTail = 1 cdef enum: cudaMemPoolCreateUsageHwDecompress = 2 @@ -1916,6 +1922,8 @@ cdef enum: cudaKernelNodeAttributePreferredSharedMemoryCarveout = 14 cdef enum: cudaKernelNodeAttributeDeviceUpdatableKernelNode = 13 +cdef enum: cudaKernelNodeAttributeNvlinkUtilCentricScheduling = 16 + cdef enum: cudaSurfaceType1D = 1 cdef enum: cudaSurfaceType2D = 2 @@ -1944,8 +1952,8 @@ cdef enum: cudaTextureType2DLayered = 242 cdef enum: cudaTextureTypeCubemapLayered = 252 -cdef enum: 
CUDART_VERSION = 12090 +cdef enum: CUDART_VERSION = 13000 -cdef enum: __CUDART_API_VERSION = 12090 +cdef enum: __CUDART_API_VERSION = 13000 cdef enum: CUDA_EGL_MAX_PLANES = 3 \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index ae4ab76a1a..a4c4d9c198 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -156,10 +156,10 @@ cdef cudaError_t cudaGetDeviceCount(int* count) except ?cudaErrorCallRequiresNew return cyruntime._cudaGetDeviceCount(count) {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} cdef cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGetDeviceProperties_v2(prop, device) + return cyruntime._cudaGetDeviceProperties(prop, device) {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} @@ -168,6 +168,12 @@ cdef cudaError_t cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int dev return cyruntime._cudaDeviceGetAttribute(value, attr, device) {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +cdef cudaError_t cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaDeviceGetHostAtomicCapabilities(capabilities, operations, count, device) +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} cdef cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -198,6 +204,12 @@ cdef cudaError_t cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, i return cyruntime._cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice) {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +cdef cudaError_t cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaDeviceGetP2PAtomicCapabilities(capabilities, operations, count, srcDevice, dstDevice) +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} cdef cudaError_t cudaChooseDevice(int* device, const cudaDeviceProp* prop) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -366,28 +378,16 @@ cdef cudaError_t cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureSta return cyruntime._cudaStreamIsCapturing(stream, pCaptureStatus) {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - -cdef cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaStreamGetCaptureInfo_v2(stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out) -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} -cdef cudaError_t cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* 
numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaStreamGetCaptureInfo_v3(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) +cdef cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaStreamGetCaptureInfo(stream, captureStatus_out, id_out, graph_out, dependencies_out, edgeData_out, numDependencies_out) {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} -cdef cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaStreamUpdateCaptureDependencies(stream, dependencies, numDependencies, flags) -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -cdef cudaError_t cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaStreamUpdateCaptureDependencies_v2(stream, dependencies, dependencyData, numDependencies, flags) +cdef cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaStreamUpdateCaptureDependencies(stream, dependencies, dependencyData, numDependencies, flags) {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -438,12 +438,6 @@ cdef cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t return cyruntime._cudaEventElapsedTime(ms, start, end) {{endif}} -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -cdef cudaError_t cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaEventElapsedTime_v2(ms, start, end) -{{endif}} - {{if 'cudaImportExternalMemory' in found_functions}} cdef cudaError_t cudaImportExternalMemory(cudaExternalMemory_t* extMem_out, const cudaExternalMemoryHandleDesc* memHandleDesc) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -474,16 +468,16 @@ cdef cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out return cyruntime._cudaImportExternalSemaphore(extSem_out, semHandleDesc) {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} cdef cudaError_t cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaSignalExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) + return cyruntime._cudaSignalExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} cdef cudaError_t cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t* 
extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaWaitExternalSemaphoresAsync_v2(extSemArray, paramsArray, numExtSems, stream) + return cyruntime._cudaWaitExternalSemaphoresAsync(extSemArray, paramsArray, numExtSems, stream) {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -764,14 +758,14 @@ cdef cudaError_t cudaMemcpyPeerAsync(void* dst, int dstDevice, const void* src, {{if 'cudaMemcpyBatchAsync' in found_functions}} -cdef cudaError_t cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, failIdx, stream) +cdef cudaError_t cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemcpyBatchAsync(dsts, srcs, sizes, count, attrs, attrsIdxs, numAttrs, stream) {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} -cdef cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemcpy3DBatchAsync(numOps, opList, failIdx, flags, stream) +cdef cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemcpy3DBatchAsync(numOps, opList, flags, stream) {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -830,26 +824,32 @@ cdef cudaError_t cudaMemset3DAsync(cudaPitchedPtr pitchedDevPtr, int value, cuda {{if 'cudaMemPrefetchAsync' in found_functions}} -cdef cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemPrefetchAsync(devPtr, count, dstDevice, stream) +cdef cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemPrefetchAsync(devPtr, count, location, flags, stream) {{endif}} -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} -cdef cudaError_t cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemPrefetchAsync_v2(devPtr, count, location, flags, stream) +cdef cudaError_t cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemDiscardBatchAsync' in found_functions}} -cdef cudaError_t cudaMemAdvise(const void* devPtr, size_t count, 
cudaMemoryAdvise advice, int device) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemAdvise(devPtr, count, advice, device) +cdef cudaError_t cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemDiscardBatchAsync(dptrs, sizes, count, flags, stream) {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} -cdef cudaError_t cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaMemAdvise_v2(devPtr, count, advice, location) +cdef cudaError_t cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemDiscardAndPrefetchBatchAsync(dptrs, sizes, count, prefetchLocs, prefetchLocIdxs, numPrefetchLocs, flags, stream) +{{endif}} + +{{if 'cudaMemAdvise' in found_functions}} + +cdef cudaError_t cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemAdvise(devPtr, count, advice, location) {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -948,6 +948,24 @@ cdef cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool) except ?cudaErrorCall return cyruntime._cudaMemPoolDestroy(memPool) {{endif}} +{{if 'cudaMemGetDefaultMemPool' in found_functions}} + +cdef cudaError_t cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemGetDefaultMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemGetMemPool' in found_functions}} + +cdef cudaError_t cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemGetMemPool(memPool, location, typename) +{{endif}} + +{{if 'cudaMemSetMemPool' in found_functions}} + +cdef cudaError_t cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaMemSetMemPool(location, typename, memPool) +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} cdef cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1116,6 +1134,36 @@ cdef cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) except ?cudaErrorCal return cyruntime._cudaRuntimeGetVersion(runtimeVersion) {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +cdef cudaError_t cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaLogsRegisterCallback(callbackFunc, userData, callback_out) +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +cdef cudaError_t cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaLogsUnregisterCallback(callback) 
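These cudaLogs* entries mirror the driver-level cuLogs* API that the cydriver changes earlier in this series already expose. A hedged usage sketch, assuming the Python wrapper accepts a plain callable and that the callback receives (userData, logLevel, message) as in the driver's CUlogsCallback; treat the callback shape and return tuples as assumptions inferred from the C signatures above, not a documented contract:

    from cuda.bindings import runtime

    def on_log(user_data, level, message):
        # Assumed callback shape; check the generated runtime module for the
        # exact cudaLogsCallback_t signature before relying on it.
        print(level, message)

    err, handle = runtime.cudaLogsRegisterCallback(on_log, None)
    # ... run work that emits runtime logs ...
    err, = runtime.cudaLogsUnregisterCallback(handle)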
+{{endif}} + +{{if 'cudaLogsCurrent' in found_functions}} + +cdef cudaError_t cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaLogsCurrent(iterator_out, flags) +{{endif}} + +{{if 'cudaLogsDumpToFile' in found_functions}} + +cdef cudaError_t cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaLogsDumpToFile(iterator, pathToFile, flags) +{{endif}} + +{{if 'cudaLogsDumpToMemory' in found_functions}} + +cdef cudaError_t cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaLogsDumpToMemory(iterator, buffer, size, flags) +{{endif}} + {{if 'cudaGraphCreate' in found_functions}} cdef cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil: @@ -1142,8 +1190,8 @@ cdef cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const cudaKe {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} -cdef cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphKernelNodeCopyAttributes(hSrc, hDst) +cdef cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphKernelNodeCopyAttributes(hDst, hSrc) {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1388,62 +1436,32 @@ cdef cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRoot {{if 'cudaGraphGetEdges' in found_functions}} -cdef cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphGetEdges(graph, from_, to, numEdges) -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -cdef cudaError_t cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphGetEdges_v2(graph, from_, to, edgeData, numEdges) +cdef cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphGetEdges(graph, from_, to, edgeData, numEdges) {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} -cdef cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies) -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphNodeGetDependencies_v2(node, pDependencies, edgeData, pNumDependencies) +cdef cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) except 
?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphNodeGetDependencies(node, pDependencies, edgeData, pNumDependencies) {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} -cdef cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes) -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -cdef cudaError_t cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes) +cdef cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphNodeGetDependentNodes(node, pDependentNodes, edgeData, pNumDependentNodes) {{endif}} {{if 'cudaGraphAddDependencies' in found_functions}} -cdef cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphAddDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphAddDependencies_v2(graph, from_, to, edgeData, numDependencies) +cdef cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphAddDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} -cdef cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphRemoveDependencies(graph, from_, to, numDependencies) -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -cdef cudaError_t cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphRemoveDependencies_v2(graph, from_, to, edgeData, numDependencies) +cdef cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphRemoveDependencies(graph, from_, to, edgeData, numDependencies) {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1616,14 +1634,8 @@ cdef cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t {{if 'cudaGraphAddNode' in found_functions}} -cdef cudaError_t 
cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphAddNode(pGraphNode, graph, pDependencies, numDependencies, nodeParams) -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -cdef cudaError_t cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: - return cyruntime._cudaGraphAddNode_v2(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) +cdef cudaError_t cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) except ?cudaErrorCallRequiresNewerDriver nogil: + return cyruntime._cudaGraphAddNode(pGraphNode, graph, pDependencies, dependencyData, numDependencies, nodeParams) {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index dd1513f489..ddd29a120c 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -129,15 +129,20 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaGetDeviceCount(int* count) nogil {{endif}} - {{if 'cudaGetDeviceProperties_v2' in found_functions}} + {{if 'cudaGetDeviceProperties' in found_functions}} - cudaError_t cudaGetDeviceProperties_v2(cudaDeviceProp* prop, int device) nogil + cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int device) nogil {{endif}} {{if 'cudaDeviceGetAttribute' in found_functions}} cudaError_t cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device) nogil + {{endif}} + {{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + + cudaError_t cudaDeviceGetHostAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int device) nogil + {{endif}} {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} @@ -163,6 +168,11 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaDeviceGetP2PAttribute(int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) nogil + {{endif}} + {{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + + cudaError_t cudaDeviceGetP2PAtomicCapabilities(unsigned int* capabilities, const cudaAtomicOperation* operations, unsigned int count, int srcDevice, int dstDevice) nogil + {{endif}} {{if 'cudaChooseDevice' in found_functions}} @@ -304,29 +314,14 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaStreamIsCapturing(cudaStream_t stream, cudaStreamCaptureStatus* pCaptureStatus) nogil {{endif}} - {{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} - - cudaError_t cudaStreamGetCaptureInfo_v2_ptsz(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) nogil - - {{endif}} - {{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} + {{if 'cudaStreamGetCaptureInfo' in found_functions}} - cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* 
graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) nogil - - {{endif}} - {{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} - - cudaError_t cudaStreamGetCaptureInfo_v3(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) nogil + cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, const cudaGraphEdgeData** edgeData_out, size_t* numDependencies_out) nogil {{endif}} {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} - cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) nogil - - {{endif}} - {{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - - cudaError_t cudaStreamUpdateCaptureDependencies_v2(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) nogil + cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, unsigned int flags) nogil {{endif}} {{if 'cudaEventCreate' in found_functions}} @@ -368,11 +363,6 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) nogil - {{endif}} - {{if 'cudaEventElapsedTime_v2' in found_functions}} - - cudaError_t cudaEventElapsedTime_v2(float* ms, cudaEvent_t start, cudaEvent_t end) nogil - {{endif}} {{if 'cudaImportExternalMemory' in found_functions}} @@ -399,24 +389,14 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const cudaExternalSemaphoreHandleDesc* semHandleDesc) nogil {{endif}} - {{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} + {{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} - cudaError_t cudaSignalExternalSemaphoresAsync_v2_ptsz(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil + cudaError_t cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil {{endif}} - {{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} + {{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} - cudaError_t cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil - - {{endif}} - {{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} - - cudaError_t cudaWaitExternalSemaphoresAsync_v2_ptsz(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil - - {{endif}} - {{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} - - cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil + cudaError_t cudaWaitExternalSemaphoresAsync(const 
cudaExternalSemaphore_t* extSemArray, const cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) nogil {{endif}} {{if 'cudaDestroyExternalSemaphore' in found_functions}} @@ -651,12 +631,12 @@ cdef extern from "cuda_runtime_api.h": {{endif}} {{if 'cudaMemcpyBatchAsync' in found_functions}} - cudaError_t cudaMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, size_t* failIdx, cudaStream_t stream) nogil + cudaError_t cudaMemcpyBatchAsync(const void** dsts, const void** srcs, const size_t* sizes, size_t count, cudaMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs, cudaStream_t stream) nogil {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} - cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, size_t* failIdx, unsigned long long flags, cudaStream_t stream) nogil + cudaError_t cudaMemcpy3DBatchAsync(size_t numOps, cudaMemcpy3DBatchOp* opList, unsigned long long flags, cudaStream_t stream) nogil {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -706,22 +686,27 @@ cdef extern from "cuda_runtime_api.h": {{endif}} {{if 'cudaMemPrefetchAsync' in found_functions}} - cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) nogil + cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) nogil {{endif}} - {{if 'cudaMemPrefetchAsync_v2' in found_functions}} + {{if 'cudaMemPrefetchBatchAsync' in found_functions}} - cudaError_t cudaMemPrefetchAsync_v2(const void* devPtr, size_t count, cudaMemLocation location, unsigned int flags, cudaStream_t stream) nogil + cudaError_t cudaMemPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) nogil {{endif}} - {{if 'cudaMemAdvise' in found_functions}} + {{if 'cudaMemDiscardBatchAsync' in found_functions}} + + cudaError_t cudaMemDiscardBatchAsync(void** dptrs, size_t* sizes, size_t count, unsigned long long flags, cudaStream_t stream) nogil + + {{endif}} + {{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} - cudaError_t cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, int device) nogil + cudaError_t cudaMemDiscardAndPrefetchBatchAsync(void** dptrs, size_t* sizes, size_t count, cudaMemLocation* prefetchLocs, size_t* prefetchLocIdxs, size_t numPrefetchLocs, unsigned long long flags, cudaStream_t stream) nogil {{endif}} - {{if 'cudaMemAdvise_v2' in found_functions}} + {{if 'cudaMemAdvise' in found_functions}} - cudaError_t cudaMemAdvise_v2(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) nogil + cudaError_t cudaMemAdvise(const void* devPtr, size_t count, cudaMemoryAdvise advice, cudaMemLocation location) nogil {{endif}} {{if 'cudaMemRangeGetAttribute' in found_functions}} @@ -803,6 +788,21 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool) nogil + {{endif}} + {{if 'cudaMemGetDefaultMemPool' in found_functions}} + + cudaError_t cudaMemGetDefaultMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) nogil + + {{endif}} + {{if 'cudaMemGetMemPool' in found_functions}} + + cudaError_t cudaMemGetMemPool(cudaMemPool_t* memPool, cudaMemLocation* location, cudaMemAllocationType typename) nogil + + 
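The new pool-query trio (cudaMemGetDefaultMemPool / cudaMemGetMemPool / cudaMemSetMemPool) keys pools off a cudaMemLocation and a cudaMemAllocationType rather than a device ordinal. A minimal sketch of querying the default device pool through the Python layer; the enum and struct names follow the existing cuda.bindings.runtime conventions, but the exact return tuple is an assumption read off the C signature above:

    from cuda.bindings import runtime

    loc = runtime.cudaMemLocation()
    loc.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice
    loc.id = 0

    # C signature: cudaMemGetDefaultMemPool(cudaMemPool_t*, cudaMemLocation*,
    # cudaMemAllocationType), so the wrapper is expected to return (err, pool).
    err, pool = runtime.cudaMemGetDefaultMemPool(
        loc, runtime.cudaMemAllocationType.cudaMemAllocationTypePinned)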
{{endif}} + {{if 'cudaMemSetMemPool' in found_functions}} + + cudaError_t cudaMemSetMemPool(cudaMemLocation* location, cudaMemAllocationType typename, cudaMemPool_t memPool) nogil + {{endif}} {{if 'cudaMallocFromPoolAsync' in found_functions}} @@ -943,6 +943,31 @@ cdef extern from "cuda_runtime_api.h": cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) nogil + {{endif}} + {{if 'cudaLogsRegisterCallback' in found_functions}} + + cudaError_t cudaLogsRegisterCallback(cudaLogsCallback_t callbackFunc, void* userData, cudaLogsCallbackHandle* callback_out) nogil + + {{endif}} + {{if 'cudaLogsUnregisterCallback' in found_functions}} + + cudaError_t cudaLogsUnregisterCallback(cudaLogsCallbackHandle callback) nogil + + {{endif}} + {{if 'cudaLogsCurrent' in found_functions}} + + cudaError_t cudaLogsCurrent(cudaLogIterator* iterator_out, unsigned int flags) nogil + + {{endif}} + {{if 'cudaLogsDumpToFile' in found_functions}} + + cudaError_t cudaLogsDumpToFile(cudaLogIterator* iterator, const char* pathToFile, unsigned int flags) nogil + + {{endif}} + {{if 'cudaLogsDumpToMemory' in found_functions}} + + cudaError_t cudaLogsDumpToMemory(cudaLogIterator* iterator, char* buffer, size_t* size, unsigned int flags) nogil + {{endif}} {{if 'cudaGraphCreate' in found_functions}} @@ -966,7 +991,7 @@ cdef extern from "cuda_runtime_api.h": {{endif}} {{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}} - cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) nogil + cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hDst, cudaGraphNode_t hSrc) nogil {{endif}} {{if 'cudaGraphKernelNodeGetAttribute' in found_functions}} @@ -1171,52 +1196,27 @@ cdef extern from "cuda_runtime_api.h": {{endif}} {{if 'cudaGraphGetEdges' in found_functions}} - cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, size_t* numEdges) nogil - - {{endif}} - {{if 'cudaGraphGetEdges_v2' in found_functions}} - - cudaError_t cudaGraphGetEdges_v2(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) nogil + cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from_, cudaGraphNode_t* to, cudaGraphEdgeData* edgeData, size_t* numEdges) nogil {{endif}} {{if 'cudaGraphNodeGetDependencies' in found_functions}} - cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) nogil - - {{endif}} - {{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - - cudaError_t cudaGraphNodeGetDependencies_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) nogil + cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, cudaGraphEdgeData* edgeData, size_t* pNumDependencies) nogil {{endif}} {{if 'cudaGraphNodeGetDependentNodes' in found_functions}} - cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) nogil - - {{endif}} - {{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - - cudaError_t cudaGraphNodeGetDependentNodes_v2(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) nogil + cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, cudaGraphEdgeData* edgeData, size_t* pNumDependentNodes) nogil {{endif}} {{if 'cudaGraphAddDependencies' in 
found_functions}} - cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) nogil - - {{endif}} - {{if 'cudaGraphAddDependencies_v2' in found_functions}} - - cudaError_t cudaGraphAddDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) nogil + cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) nogil {{endif}} {{if 'cudaGraphRemoveDependencies' in found_functions}} - cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, size_t numDependencies) nogil - - {{endif}} - {{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - - cudaError_t cudaGraphRemoveDependencies_v2(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) nogil + cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from_, const cudaGraphNode_t* to, const cudaGraphEdgeData* edgeData, size_t numDependencies) nogil {{endif}} {{if 'cudaGraphDestroyNode' in found_functions}} @@ -1361,12 +1361,7 @@ cdef extern from "cuda_runtime_api.h": {{endif}} {{if 'cudaGraphAddNode' in found_functions}} - cudaError_t cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraphNodeParams* nodeParams) nogil - - {{endif}} - {{if 'cudaGraphAddNode_v2' in found_functions}} - - cudaError_t cudaGraphAddNode_v2(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) nogil + cudaError_t cudaGraphAddNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, const cudaGraphEdgeData* dependencyData, size_t numDependencies, cudaGraphNodeParams* nodeParams) nogil {{endif}} {{if 'cudaGraphNodeSetParams' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in index 7ed8f65334..79544de150 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in @@ -86,6 +86,7 @@ cdef extern from "driver_types.h": cudaErrorUnsupportedExecAffinity = 224 cudaErrorUnsupportedDevSideSync = 225 cudaErrorContained = 226 + cudaErrorNvlinkEncryptionFailed = 227 cudaErrorInvalidSource = 300 cudaErrorFileNotFound = 301 cudaErrorSharedObjectSymbolNotFound = 302 @@ -283,15 +284,20 @@ cdef extern from "driver_types.h": size_t height size_t pitchInBytes + cdef struct anon_struct5: + int reserved[32] + cdef union anon_union0: anon_struct1 array anon_struct2 mipmap anon_struct3 linear anon_struct4 pitch2D + anon_struct5 reserved cdef struct cudaResourceDesc: cudaResourceType resType anon_union0 res + unsigned int flags cdef struct cudaResourceViewDesc: cudaResourceViewFormat format @@ -302,12 +308,14 @@ cdef extern from "driver_types.h": unsigned int lastMipmapLevel unsigned int firstLayer unsigned int lastLayer + unsigned int reserved[16] cdef struct cudaPointerAttributes: cudaMemoryType type int device void* devicePointer void* hostPointer + long reserved[8] cdef struct cudaFuncAttributes: size_t sharedSizeBytes @@ -376,19 +384,19 @@ cdef extern from 
"driver_types.h": size_t y size_t z - cdef struct anon_struct5: + cdef struct anon_struct6: void* ptr size_t rowLength size_t layerHeight cudaMemLocation locHint - cdef struct anon_struct6: + cdef struct anon_struct7: cudaArray_t array cudaOffset3D offset cdef union anon_union1: - anon_struct5 ptr - anon_struct6 array + anon_struct6 ptr + anon_struct7 array cdef struct cudaMemcpy3DOperand: cudaMemcpy3DOperandType type @@ -421,21 +429,16 @@ cdef extern from "driver_types.h": int maxThreadsPerBlock int maxThreadsDim[3] int maxGridSize[3] - int clockRate size_t totalConstMem int major int minor size_t textureAlignment size_t texturePitchAlignment - int deviceOverlap int multiProcessorCount - int kernelExecTimeoutEnabled int integrated int canMapHostMemory - int computeMode int maxTexture1D int maxTexture1DMipmap - int maxTexture1DLinear int maxTexture2D[2] int maxTexture2DMipmap[2] int maxTexture2DLinear[3] @@ -462,7 +465,6 @@ cdef extern from "driver_types.h": int tccDriver int asyncEngineCount int unifiedAddressing - int memoryClockRate int memoryBusWidth int l2CacheSize int persistingL2CacheMaxSize @@ -476,13 +478,11 @@ cdef extern from "driver_types.h": int isMultiGpuBoard int multiGpuBoardGroupID int hostNativeAtomicSupported - int singleToDoublePrecisionPerfRatio int pageableMemoryAccess int concurrentManagedAccess int computePreemptionSupported int canUseHostPointerForRegisteredMem int cooperativeLaunch - int cooperativeMultiDeviceLaunch size_t sharedMemPerBlockOptin int pageableMemoryAccessUsesHostPageTables int directManagedMemAccessFromHost @@ -502,7 +502,14 @@ cdef extern from "driver_types.h": int ipcEventSupported int clusterLaunch int unifiedFunctionPointers - int reserved[63] + int deviceNumaConfig + int deviceNumaId + int mpsEnabled + int hostNumaId + unsigned int gpuPciDeviceID + unsigned int gpuPciSubsystemID + int hostNumaMultinodeIpcSupported + int reserved[56] cdef struct cudaIpcEventHandle_st: char reserved[64] @@ -519,13 +526,13 @@ cdef extern from "driver_types.h": ctypedef cudaMemFabricHandle_st cudaMemFabricHandle_t - cdef struct anon_struct7: + cdef struct anon_struct8: void* handle const void* name cdef union anon_union2: int fd - anon_struct7 win32 + anon_struct8 win32 const void* nvSciBufObject cdef struct cudaExternalMemoryHandleDesc: @@ -533,11 +540,13 @@ cdef extern from "driver_types.h": anon_union2 handle unsigned long long size unsigned int flags + unsigned int reserved[16] cdef struct cudaExternalMemoryBufferDesc: unsigned long long offset unsigned long long size unsigned int flags + unsigned int reserved[16] cdef struct cudaExternalMemoryMipmappedArrayDesc: unsigned long long offset @@ -545,61 +554,63 @@ cdef extern from "driver_types.h": cudaExtent extent unsigned int flags unsigned int numLevels + unsigned int reserved[16] - cdef struct anon_struct8: + cdef struct anon_struct9: void* handle const void* name cdef union anon_union3: int fd - anon_struct8 win32 + anon_struct9 win32 const void* nvSciSyncObj cdef struct cudaExternalSemaphoreHandleDesc: cudaExternalSemaphoreHandleType type anon_union3 handle unsigned int flags + unsigned int reserved[16] - cdef struct anon_struct15: + cdef struct anon_struct10: unsigned long long value - cdef union anon_union6: + cdef union anon_union4: void* fence unsigned long long reserved - cdef struct anon_struct16: + cdef struct anon_struct11: unsigned long long key - cdef struct anon_struct17: - anon_struct15 fence - anon_union6 nvSciSync - anon_struct16 keyedMutex + cdef struct anon_struct12: + anon_struct10 fence + 
anon_union4 nvSciSync + anon_struct11 keyedMutex unsigned int reserved[12] cdef struct cudaExternalSemaphoreSignalParams: - anon_struct17 params + anon_struct12 params unsigned int flags unsigned int reserved[16] - cdef struct anon_struct18: + cdef struct anon_struct13: unsigned long long value - cdef union anon_union7: + cdef union anon_union5: void* fence unsigned long long reserved - cdef struct anon_struct19: + cdef struct anon_struct14: unsigned long long key unsigned int timeoutMs - cdef struct anon_struct20: - anon_struct18 fence - anon_union7 nvSciSync - anon_struct19 keyedMutex + cdef struct anon_struct15: + anon_struct13 fence + anon_union5 nvSciSync + anon_struct14 keyedMutex unsigned int reserved[10] cdef struct cudaExternalSemaphoreWaitParams: - anon_struct20 params + anon_struct15 params unsigned int flags unsigned int reserved[16] @@ -774,20 +785,20 @@ cdef extern from "driver_types.h": pass ctypedef CUgraphDeviceUpdatableNode_st* cudaGraphDeviceNode_t - cdef struct anon_struct21: + cdef struct anon_struct16: const void* pValue size_t offset size_t size - cdef union anon_union9: + cdef union anon_union7: dim3 gridDim - anon_struct21 param + anon_struct16 param unsigned int isEnabled cdef struct cudaGraphKernelNodeUpdate: cudaGraphDeviceNode_t node cudaGraphKernelNodeField field - anon_union9 updateData + anon_union7 updateData cdef enum cudaLaunchMemSyncDomain: cudaLaunchMemSyncDomainDefault = 0 @@ -815,27 +826,28 @@ cdef extern from "driver_types.h": cudaLaunchAttributeLaunchCompletionEvent = 12 cudaLaunchAttributeDeviceUpdatableKernelNode = 13 cudaLaunchAttributePreferredSharedMemoryCarveout = 14 + cudaLaunchAttributeNvlinkUtilCentricScheduling = 16 - cdef struct anon_struct22: + cdef struct anon_struct17: unsigned int x unsigned int y unsigned int z - cdef struct anon_struct23: + cdef struct anon_struct18: cudaEvent_t event int flags int triggerAtBlockStart - cdef struct anon_struct24: + cdef struct anon_struct19: unsigned int x unsigned int y unsigned int z - cdef struct anon_struct25: + cdef struct anon_struct20: cudaEvent_t event int flags - cdef struct anon_struct26: + cdef struct anon_struct21: int deviceUpdatable cudaGraphDeviceNode_t devNode @@ -844,17 +856,18 @@ cdef extern from "driver_types.h": cudaAccessPolicyWindow accessPolicyWindow int cooperative cudaSynchronizationPolicy syncPolicy - anon_struct22 clusterDim + anon_struct17 clusterDim cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference int programmaticStreamSerializationAllowed - anon_struct23 programmaticEvent + anon_struct18 programmaticEvent int priority cudaLaunchMemSyncDomainMap memSyncDomainMap cudaLaunchMemSyncDomain memSyncDomain - anon_struct24 preferredClusterDim - anon_struct25 launchCompletionEvent - anon_struct26 deviceUpdatableKernelNode + anon_struct19 preferredClusterDim + anon_struct20 launchCompletionEvent + anon_struct21 deviceUpdatableKernelNode unsigned int sharedMemCarveout + unsigned int nvlinkUtilCentricScheduling cdef struct cudaLaunchAttribute_st: cudaLaunchAttributeID id @@ -871,20 +884,32 @@ cdef extern from "driver_types.h": ctypedef cudaAsyncNotificationType_enum cudaAsyncNotificationType - cdef struct anon_struct27: + cdef struct anon_struct22: unsigned long long bytesOverBudget - cdef union anon_union10: - anon_struct27 overBudget + cdef union anon_union8: + anon_struct22 overBudget cdef struct cudaAsyncNotificationInfo: cudaAsyncNotificationType type - anon_union10 info + anon_union8 info ctypedef cudaAsyncNotificationInfo cudaAsyncNotificationInfo_t ctypedef void 
(*cudaAsyncCallback)(cudaAsyncNotificationInfo_t* , void* , cudaAsyncCallbackHandle_t ) + cdef enum CUDAlogLevel_enum: + cudaLogLevelError = 0 + cudaLogLevelWarning = 1 + + ctypedef CUDAlogLevel_enum cudaLogLevel + + cdef struct CUlogsCallbackEntry_st: + pass + ctypedef CUlogsCallbackEntry_st* cudaLogsCallbackHandle + + ctypedef unsigned int cudaLogIterator + cdef enum cudaChannelFormatKind: cudaChannelFormatKindSigned = 0 cudaChannelFormatKindUnsigned = 1 @@ -1202,7 +1227,7 @@ cdef extern from "driver_types.h": cudaDevAttrReserved93 = 93 cudaDevAttrReserved94 = 94 cudaDevAttrCooperativeLaunch = 95 - cudaDevAttrCooperativeMultiDeviceLaunch = 96 + cudaDevAttrReserved96 = 96 cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 cudaDevAttrCanFlushRemoteWrites = 98 cudaDevAttrHostRegisterSupported = 99 @@ -1215,7 +1240,6 @@ cdef extern from "driver_types.h": cudaDevAttrSparseCudaArraySupported = 112 cudaDevAttrHostRegisterReadOnlySupported = 113 cudaDevAttrTimelineSemaphoreInteropSupported = 114 - cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114 cudaDevAttrMemoryPoolsSupported = 115 cudaDevAttrGPUDirectRDMASupported = 116 cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117 @@ -1243,7 +1267,10 @@ cdef extern from "driver_types.h": cudaDevAttrReserved141 = 141 cudaDevAttrHostNumaMemoryPoolsSupported = 142 cudaDevAttrHostNumaMultinodeIpcSupported = 143 - cudaDevAttrMax = 144 + cudaDevAttrHostMemoryPoolsSupported = 144 + cudaDevAttrReserved145 = 145 + cudaDevAttrOnlyPartialHostNativeAtomicSupported = 147 + cudaDevAttrMax = 148 cdef enum cudaMemPoolAttr: cudaMemPoolReuseFollowEventDependencies = 1 @@ -1257,6 +1284,7 @@ cdef extern from "driver_types.h": cdef enum cudaMemLocationType: cudaMemLocationTypeInvalid = 0 + cudaMemLocationTypeNone = 0 cudaMemLocationTypeDevice = 1 cudaMemLocationTypeHost = 2 cudaMemLocationTypeHostNuma = 3 @@ -1270,6 +1298,7 @@ cdef extern from "driver_types.h": cdef enum cudaMemAllocationType: cudaMemAllocationTypeInvalid = 0 cudaMemAllocationTypePinned = 1 + cudaMemAllocationTypeManaged = 2 cudaMemAllocationTypeMax = 2147483647 cdef enum cudaMemAllocationHandleType: @@ -1306,6 +1335,31 @@ cdef extern from "driver_types.h": cudaDevP2PAttrAccessSupported = 2 cudaDevP2PAttrNativeAtomicSupported = 3 cudaDevP2PAttrCudaArrayAccessSupported = 4 + cudaDevP2PAttrOnlyPartialNativeAtomicSupported = 5 + + cdef enum cudaAtomicOperation: + cudaAtomicOperationIntegerAdd = 0 + cudaAtomicOperationIntegerMin = 1 + cudaAtomicOperationIntegerMax = 2 + cudaAtomicOperationIntegerIncrement = 3 + cudaAtomicOperationIntegerDecrement = 4 + cudaAtomicOperationAnd = 5 + cudaAtomicOperationOr = 6 + cudaAtomicOperationXOR = 7 + cudaAtomicOperationExchange = 8 + cudaAtomicOperationCAS = 9 + cudaAtomicOperationFloatAdd = 10 + cudaAtomicOperationFloatMin = 11 + cudaAtomicOperationFloatMax = 12 + + cdef enum cudaAtomicOperationCapability: + cudaAtomicCapabilitySigned = 1 + cudaAtomicCapabilityUnsigned = 2 + cudaAtomicCapabilityReduction = 4 + cudaAtomicCapabilityScalar32 = 8 + cudaAtomicCapabilityScalar64 = 16 + cudaAtomicCapabilityScalar128 = 32 + cudaAtomicCapabilityVector32x4 = 64 cdef enum cudaExternalMemoryHandleType: cudaExternalMemoryHandleTypeOpaqueFd = 1 @@ -1364,7 +1418,7 @@ cdef extern from "driver_types.h": cdef enum cudaCGScope: cudaCGScopeInvalid = 0 cudaCGScopeGrid = 1 - cudaCGScopeMultiGrid = 2 + cudaCGScopeReserved = 2 cdef enum cudaGraphConditionalHandleFlags: cudaGraphCondAssignDefault = 1 @@ -1531,6 +1585,27 @@ cdef extern from "library_types.h": ctypedef cudaDataType_t cudaDataType + cdef 
enum cudaEmulationStrategy_t: + CUDA_EMULATION_STRATEGY_DEFAULT = 0 + CUDA_EMULATION_STRATEGY_PERFORMANT = 1 + CUDA_EMULATION_STRATEGY_EAGER = 2 + + ctypedef cudaEmulationStrategy_t cudaEmulationStrategy + + cdef enum cudaEmulationMantissaControl_t: + CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC = 0 + CUDA_EMULATION_MANTISSA_CONTROL_FIXED = 1 + + ctypedef cudaEmulationMantissaControl_t cudaEmulationMantissaControl + + cdef enum cudaEmulationSpecialValuesSupport_t: + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE = 0 + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY = 1 + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN = 2 + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT = 65535 + + ctypedef cudaEmulationSpecialValuesSupport_t cudaEmulationSpecialValuesSupport + cdef enum libraryPropertyType_t: MAJOR_VERSION = 0 MINOR_VERSION = 1 @@ -1542,6 +1617,8 @@ cdef extern from "cuda_runtime_api.h": ctypedef void (*cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void* userData) + ctypedef void (*cudaLogsCallback_t)(void* data, cudaLogLevel logLevel, char* message, size_t length) + cdef extern from "device_types.h": cdef enum cudaRoundMode: diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in index 6e73074fbc..6709d5ea51 100644 --- a/cuda_bindings/cuda/bindings/driver.pxd.in +++ b/cuda_bindings/cuda/bindings/driver.pxd.in @@ -673,7 +673,7 @@ cdef class CUstreamMemOpWaitValueParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue.flags' in found_struct}} flags : unsigned int - + See CUstreamWaitValue_flags. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue.alias' in found_struct}} alias : CUdeviceptr @@ -723,7 +723,7 @@ cdef class CUstreamMemOpWriteValueParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue.flags' in found_struct}} flags : unsigned int - + See CUstreamWriteValue_flags. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue.alias' in found_struct}} alias : CUdeviceptr @@ -761,7 +761,7 @@ cdef class CUstreamMemOpFlushRemoteWritesParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.flags' in found_struct}} flags : unsigned int - + Must be 0. {{endif}} Methods @@ -803,23 +803,26 @@ cdef class CUstreamBatchMemOpParams_union: ---------- {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}} operation : CUstreamBatchMemOpType - + Operation. This is the first field of all the union elements and + acts as a TAG to determine which union member is valid. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}} waitValue : CUstreamMemOpWaitValueParams_st - + Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and + CU_STREAM_MEM_OP_WAIT_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}} writeValue : CUstreamMemOpWriteValueParams_st - + Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and + CU_STREAM_MEM_OP_WRITE_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}} flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st - + Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st - + Params for CU_STREAM_MEM_OP_BARRIER operations.
{{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : List[cuuint64_t] @@ -850,6 +853,9 @@ cdef class CUstreamBatchMemOpParams_union: cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st: """ + Batch memory operation node parameters. Used in the legacy + cuGraphAddBatchMemOpNode API. New code should use cuGraphAddNode(). + Attributes ---------- {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}} @@ -1901,6 +1907,10 @@ cdef class CUlaunchAttributeValue_union: Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -3085,8 +3095,8 @@ cdef class CUDA_RESOURCE_VIEW_DESC_st: cdef class CUtensorMap_st: """ - Tensor map descriptor. Requires compiler support for aligning to 64 - bytes. + Tensor map descriptor. Requires compiler support for aligning to + 128 bytes. Attributes ---------- @@ -5016,17 +5026,21 @@ cdef class CUcheckpointCheckpointArgs_st: cdef cydriver.CUcheckpointCheckpointArgs_st _pvt_val cdef cydriver.CUcheckpointCheckpointArgs_st* _pvt_ptr {{endif}} -{{if 'CUcheckpointRestoreArgs_st' in found_struct}} -cdef class CUcheckpointRestoreArgs_st: +{{if 'CUcheckpointGpuPair_st' in found_struct}} +cdef class CUcheckpointGpuPair_st: """ - CUDA checkpoint optional restore arguments + CUDA checkpoint GPU UUID pairs for device remapping during restore Attributes ---------- - {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}} - reserved : List[cuuint64_t] - Reserved for future use, must be zeroed + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} + oldUuid : CUuuid + UUID of the GPU that was checkpointed + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + newUuid : CUuuid + UUID of the GPU to restore onto {{endif}} Methods @@ -5034,8 +5048,14 @@ cdef class CUcheckpointRestoreArgs_st: getPtr() Get memory address of class instance """ - cdef cydriver.CUcheckpointRestoreArgs_st _pvt_val - cdef cydriver.CUcheckpointRestoreArgs_st* _pvt_ptr + cdef cydriver.CUcheckpointGpuPair_st _pvt_val + cdef cydriver.CUcheckpointGpuPair_st* _pvt_ptr + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} + cdef CUuuid _oldUuid + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + cdef CUuuid _newUuid + {{endif}} {{endif}} {{if 'CUcheckpointUnlockArgs_st' in found_struct}} @@ -5124,6 +5144,19 @@ cdef class CUdevSmResource_st: The amount of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. {{endif}} + {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} + minSmPartitionSize : unsigned int + The minimum number of streaming multiprocessors required to + partition this resource. This is an output parameter only, do not + write to this field. + {{endif}} + {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} + smCoscheduledAlignment : unsigned int + The number of streaming multiprocessors in this resource that are + guaranteed to be co-scheduled on the same GPU processing cluster. + smCount is a multiple of this value. This is an output parameter + only, do not write to this field. + {{endif}} Methods ------- @@ -5483,23 +5516,26 @@ cdef class CUstreamBatchMemOpParams_v1(CUstreamBatchMemOpParams_union): ---------- {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}} operation : CUstreamBatchMemOpType - + Operation.
This is the first field of all the union elements and + acts as a TAG to determine which union member is valid. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}} waitValue : CUstreamMemOpWaitValueParams_st - + Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and + CU_STREAM_MEM_OP_WAIT_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}} writeValue : CUstreamMemOpWriteValueParams_st - + Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and + CU_STREAM_MEM_OP_WRITE_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}} flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st - + Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st - + Params for CU_STREAM_MEM_OP_BARRIER operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : List[cuuint64_t] @@ -5523,23 +5559,26 @@ cdef class CUstreamBatchMemOpParams(CUstreamBatchMemOpParams_v1): ---------- {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}} operation : CUstreamBatchMemOpType - + Operation. This is the first field of all the union elements and + acts as a TAG to determine which union member is valid. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}} waitValue : CUstreamMemOpWaitValueParams_st - + Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and + CU_STREAM_MEM_OP_WAIT_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}} writeValue : CUstreamMemOpWriteValueParams_st - + Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and + CU_STREAM_MEM_OP_WRITE_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}} flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st - + Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st - + Params for CU_STREAM_MEM_OP_BARRIER operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : List[cuuint64_t] @@ -5557,6 +5596,9 @@ cdef class CUstreamBatchMemOpParams(CUstreamBatchMemOpParams_v1): cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st): """ + Batch memory operation node parameters. Used in the legacy + cuGraphAddBatchMemOpNode API. New code should use cuGraphAddNode(). + Attributes ---------- {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}} @@ -5587,6 +5629,9 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st) cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1): """ + Batch memory operation node parameters. Used in the legacy + cuGraphAddBatchMemOpNode API. New code should use cuGraphAddNode(). + Attributes ---------- {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}} @@ -6550,6 +6595,10 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union): Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
{{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -6747,6 +6796,10 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue): Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -6864,6 +6917,10 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1): Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -6981,6 +7038,10 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue): Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -7098,6 +7159,10 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1): Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -8489,8 +8554,8 @@ cdef class CUDA_RESOURCE_VIEW_DESC(CUDA_RESOURCE_VIEW_DESC_v1): cdef class CUtensorMap(CUtensorMap_st): """ - Tensor map descriptor. Requires compiler support for aligning to 64 - bytes. + Tensor map descriptor. Requires compiler support for aligning to + 128 bytes. Attributes ---------- @@ -10473,17 +10538,21 @@ cdef class CUcheckpointCheckpointArgs(CUcheckpointCheckpointArgs_st): """ pass {{endif}} -{{if 'CUcheckpointRestoreArgs' in found_types}} +{{if 'CUcheckpointGpuPair' in found_types}} -cdef class CUcheckpointRestoreArgs(CUcheckpointRestoreArgs_st): +cdef class CUcheckpointGpuPair(CUcheckpointGpuPair_st): """ - CUDA checkpoint optional restore arguments + CUDA checkpoint GPU UUID pairs for device remapping during restore Attributes ---------- - {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}} - reserved : List[cuuint64_t] - Reserved for future use, must be zeroed + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} + oldUuid : CUuuid + UUID of the GPU that was checkpointed + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + newUuid : CUuuid + UUID of the GPU to restore onto {{endif}} Methods @@ -10578,6 +10647,19 @@ cdef class CUdevSmResource(CUdevSmResource_st): The amount of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. {{endif}} + {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} + minSmPartitionSize : unsigned int + The minimum number of streaming multiprocessors required to + partition this resource. This is an output parameter only, do not + write to this field. + {{endif}} + {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} + smCoscheduledAlignment : unsigned int + The number of streaming multiprocessors in this resource that are + guaranteed to be co-scheduled on the same GPU processing cluster. + smCount is a multiple of this value. This is an output parameter + only, do not write to this field. 
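(A minimal sketch, not part of the patch, showing how the two new CUdevSmResource fields documented above would surface through the existing cuDeviceGetDevResource wrapper; everything except the two new field names is the bindings' pre-existing API.)

    from cuda.bindings import driver

    (err,) = driver.cuInit(0)
    err, dev = driver.cuDeviceGet(0)
    # Query the SM resource descriptor for this device.
    err, res = driver.cuDeviceGetDevResource(
        dev, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM)
    # smCount is a multiple of smCoscheduledAlignment, and any partition
    # must contain at least minSmPartitionSize SMs.
    print(res.sm.smCount, res.sm.minSmPartitionSize, res.sm.smCoscheduledAlignment)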
+ {{endif}} Methods ------- diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index a256ae453c..b3440ec891 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -36,6 +36,7 @@ ctypedef unsigned long long unsigned_ptr ctypedef unsigned long long unsigned_long_long_ptr ctypedef unsigned long long long_long_ptr ctypedef unsigned long long size_t_ptr +ctypedef unsigned long long long_ptr ctypedef unsigned long long float_ptr ctypedef unsigned long long double_ptr ctypedef unsigned long long void_ptr @@ -694,7 +695,7 @@ _dict_CUstreamBatchMemOpType = dict(((int(v), v) for k, v in CUstreamBatchMemOpT class CUstreamMemoryBarrier_flags(IntEnum): """ - Flags for :py:obj:`~.CUstreamBatchMemOpParams`::memoryBarrier + Flags for :py:obj:`~.CUstreamBatchMemOpParams.memoryBarrier` """ {{if 'CU_STREAM_MEMORY_BARRIER_TYPE_SYS' in found_values}} @@ -1397,7 +1398,7 @@ class CUdevice_attribute(IntEnum): CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID{{endif}} {{if 'CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED' in found_values}} - #: Link between the device and the host supports native atomic + #: Link between the device and the host supports all native atomic #: operations CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED{{endif}} {{if 'CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO' in found_values}} @@ -1679,6 +1680,27 @@ class CUdevice_attribute(IntEnum): #: Device supports HOST_NUMA location IPC between nodes in a multi-node #: system. CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED{{endif}} + {{if 'CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED' in found_values}} + + #: Device supports HOST location with the :py:obj:`~.cuMemAllocAsync` + #: and :py:obj:`~.cuMemPool` family of APIs + CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED{{endif}} + {{if 'CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED' in found_values}} + + #: Device supports HOST location with the virtual memory management + #: APIs like :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap` and related + #: APIs + CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED{{endif}} + {{if 'CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED' in found_values}} + + #: Device supports page-locked host memory buffer sharing with the dma_buf + #: mechanism.
+ CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED{{endif}} + {{if 'CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED' in found_values}} + + #: Link between the device and the host supports only some native + #: atomic operations + CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED{{endif}} {{if 'CU_DEVICE_ATTRIBUTE_MAX' in found_values}} CU_DEVICE_ATTRIBUTE_MAX = cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAX{{endif}} @@ -1850,7 +1872,15 @@ class CUfunction_attribute(IntEnum): #: The maximum size in bytes of dynamically-allocated shared memory #: that can be used by this function. If the user-specified dynamic #: shared memory size is larger than this value, the launch will fail. - #: See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute` + #: The default value of this attribute is + #: :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK` - + #: :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`, except when + #: :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES` is greater than + #: :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`, then + #: the default value of this attribute is 0. The value can be increased + #: to :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN` + #: - :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`. See + #: :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute` CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = cydriver.CUfunction_attribute_enum.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES{{endif}} {{if 'CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT' in found_values}} @@ -2390,6 +2420,16 @@ class CUjit_option(IntEnum): #: directives. (0: Disable, default; 1: Enable) Option type: int #: Applies to: compiler only CU_JIT_OVERRIDE_DIRECTIVE_VALUES = cydriver.CUjit_option_enum.CU_JIT_OVERRIDE_DIRECTIVE_VALUES{{endif}} + {{if 'CU_JIT_SPLIT_COMPILE' in found_values}} + + #: This option specifies the maximum number of concurrent threads to + #: use when running compiler optimizations. If the specified value is + #: 1, the option will be ignored. If the specified value is 0, the + #: number of threads will match the number of CPUs on the underlying + #: machine. Otherwise, if the option is N, then up to N threads will be + #: used. Option type: unsigned int + #: Applies to: compiler only + CU_JIT_SPLIT_COMPILE = cydriver.CUjit_option_enum.CU_JIT_SPLIT_COMPILE{{endif}} {{if 'CU_JIT_NUM_OPTIONS' in found_values}} CU_JIT_NUM_OPTIONS = cydriver.CUjit_option_enum.CU_JIT_NUM_OPTIONS{{endif}} @@ -2485,6 +2525,10 @@ class CUjit_target(IntEnum): #: Compute device class 10.3. CU_TARGET_COMPUTE_103 = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_103{{endif}} + {{if 'CU_TARGET_COMPUTE_110' in found_values}} + + #: Compute device class 11.0. + CU_TARGET_COMPUTE_110 = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_110{{endif}} {{if 'CU_TARGET_COMPUTE_120' in found_values}} #: Compute device class 12.0. @@ -2504,12 +2548,16 @@ class CUjit_target(IntEnum): CU_TARGET_COMPUTE_100A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100A{{endif}} {{if 'CU_TARGET_COMPUTE_101A' in found_values}} - #: Compute device class 10.3. with accelerated features. + #: Compute device class 11.0 with accelerated features. 
CU_TARGET_COMPUTE_101A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_101A{{endif}} {{if 'CU_TARGET_COMPUTE_103A' in found_values}} #: Compute device class 12.0. with accelerated features. CU_TARGET_COMPUTE_103A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_103A{{endif}} + {{if 'CU_TARGET_COMPUTE_110A' in found_values}} + + #: Compute device class 11.0 with accelerated features. + CU_TARGET_COMPUTE_110A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_110A{{endif}} {{if 'CU_TARGET_COMPUTE_120A' in found_values}} #: Compute device class 12.1. with accelerated features. @@ -2524,12 +2572,16 @@ class CUjit_target(IntEnum): CU_TARGET_COMPUTE_100F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100F{{endif}} {{if 'CU_TARGET_COMPUTE_101F' in found_values}} - #: Compute device class 10.3. with family features. + #: Compute device class 11.0 with family features. CU_TARGET_COMPUTE_101F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_101F{{endif}} {{if 'CU_TARGET_COMPUTE_103F' in found_values}} #: Compute device class 12.0. with family features. CU_TARGET_COMPUTE_103F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_103F{{endif}} + {{if 'CU_TARGET_COMPUTE_110F' in found_values}} + + #: Compute device class 11.0 with family features. + CU_TARGET_COMPUTE_110F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_110F{{endif}} {{if 'CU_TARGET_COMPUTE_120F' in found_values}} #: Compute device class 12.1. with family features. @@ -2875,7 +2927,8 @@ class CUgraphNodeType(IntEnum): CU_GRAPH_NODE_TYPE_MEM_FREE = cydriver.CUgraphNodeType_enum.CU_GRAPH_NODE_TYPE_MEM_FREE{{endif}} {{if 'CU_GRAPH_NODE_TYPE_BATCH_MEM_OP' in found_values}} - #: Batch MemOp Node + #: Batch MemOp Node. See :py:obj:`~.cuStreamBatchMemOp` and + #: :py:obj:`~.CUstreamBatchMemOpType` for what these nodes can do. CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = cydriver.CUgraphNodeType_enum.CU_GRAPH_NODE_TYPE_BATCH_MEM_OP{{endif}} {{if 'CU_GRAPH_NODE_TYPE_CONDITIONAL' in found_values}} @@ -3230,6 +3283,29 @@ class CUlaunchAttributeID(IntEnum): #: is only a hint, and the CUDA driver can choose a different #: configuration if required for the launch. CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT{{endif}} + {{if 'CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. + #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee that performance characteristics will be maintained across + #: driver versions, and a driver update could alter or regress previously + #: observed performance characteristics). It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are + #: 0 (disabled) and 1 (enabled).
+ CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING{{endif}} _dict_CUlaunchAttributeID = dict(((int(v), v) for k, v in CUlaunchAttributeID.__members__.items())) {{endif}} @@ -3430,6 +3506,12 @@ class CUresult(IntEnum): #: is a stub library. Applications that run with the stub rather than a #: real driver loaded will result in CUDA API returning this error. CUDA_ERROR_STUB_LIBRARY = cydriver.cudaError_enum.CUDA_ERROR_STUB_LIBRARY{{endif}} + {{if 'CUDA_ERROR_CALL_REQUIRES_NEWER_DRIVER' in found_values}} + + #: This indicates that the API call requires a newer CUDA driver than + #: the one currently installed. Users should install an updated NVIDIA + #: CUDA driver to allow the API call to succeed. + CUDA_ERROR_CALL_REQUIRES_NEWER_DRIVER = cydriver.cudaError_enum.CUDA_ERROR_CALL_REQUIRES_NEWER_DRIVER{{endif}} {{if 'CUDA_ERROR_DEVICE_UNAVAILABLE' in found_values}} #: This indicates that requested CUDA device is unavailable at the @@ -3583,6 +3665,11 @@ class CUresult(IntEnum): #: same error. To continue using CUDA, the process must be terminated #: and relaunched. CUDA_ERROR_CONTAINED = cydriver.cudaError_enum.CUDA_ERROR_CONTAINED{{endif}} + {{if 'CUDA_ERROR_NVLINK_ENCRYPTION_FAILED' in found_values}} + + #: This indicates that an NVLink encryption error was detected during + #: the execution. + CUDA_ERROR_NVLINK_ENCRYPTION_FAILED = cydriver.cudaError_enum.CUDA_ERROR_NVLINK_ENCRYPTION_FAILED{{endif}} {{if 'CUDA_ERROR_INVALID_SOURCE' in found_values}} #: This indicates that the device kernel source is invalid. This @@ -3980,7 +4067,7 @@ class CUdevice_P2PAttribute(IntEnum): CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = cydriver.CUdevice_P2PAttribute_enum.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED{{endif}} {{if 'CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED' in found_values}} - #: Atomic operation over the link supported + #: All CUDA-valid atomic operations over the link are supported CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = cydriver.CUdevice_P2PAttribute_enum.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED{{endif}} {{if 'CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED' in found_values}} @@ -3990,9 +4077,73 @@ class CUdevice_P2PAttribute(IntEnum): #: Accessing CUDA arrays over the link supported CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = cydriver.CUdevice_P2PAttribute_enum.CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED{{endif}} + {{if 'CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED' in found_values}} + + #: Only some CUDA-valid atomic operations over the link are supported.
+ CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED = cydriver.CUdevice_P2PAttribute_enum.CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED{{endif}} _dict_CUdevice_P2PAttribute = dict(((int(v), v) for k, v in CUdevice_P2PAttribute.__members__.items())) {{endif}} +{{if 'CUatomicOperation_enum' in found_types}} + +class CUatomicOperation(IntEnum): + """ + CUDA-valid Atomic Operations + """ + {{if 'CU_ATOMIC_OPERATION_INTEGER_ADD' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_ADD = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_ADD{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_MIN' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MIN{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_MAX' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_MAX{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_INCREMENT' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_INCREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_INCREMENT{{endif}} + {{if 'CU_ATOMIC_OPERATION_INTEGER_DECREMENT' in found_values}} + CU_ATOMIC_OPERATION_INTEGER_DECREMENT = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_INTEGER_DECREMENT{{endif}} + {{if 'CU_ATOMIC_OPERATION_AND' in found_values}} + CU_ATOMIC_OPERATION_AND = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_AND{{endif}} + {{if 'CU_ATOMIC_OPERATION_OR' in found_values}} + CU_ATOMIC_OPERATION_OR = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_OR{{endif}} + {{if 'CU_ATOMIC_OPERATION_XOR' in found_values}} + CU_ATOMIC_OPERATION_XOR = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_XOR{{endif}} + {{if 'CU_ATOMIC_OPERATION_EXCHANGE' in found_values}} + CU_ATOMIC_OPERATION_EXCHANGE = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_EXCHANGE{{endif}} + {{if 'CU_ATOMIC_OPERATION_CAS' in found_values}} + CU_ATOMIC_OPERATION_CAS = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_CAS{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_ADD' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_ADD = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_ADD{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_MIN' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_MIN = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MIN{{endif}} + {{if 'CU_ATOMIC_OPERATION_FLOAT_MAX' in found_values}} + CU_ATOMIC_OPERATION_FLOAT_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_FLOAT_MAX{{endif}} + {{if 'CU_ATOMIC_OPERATION_MAX' in found_values}} + CU_ATOMIC_OPERATION_MAX = cydriver.CUatomicOperation_enum.CU_ATOMIC_OPERATION_MAX{{endif}} + +_dict_CUatomicOperation = dict(((int(v), v) for k, v in CUatomicOperation.__members__.items())) +{{endif}} +{{if 'CUatomicOperationCapability_enum' in found_types}} + +class CUatomicOperationCapability(IntEnum): + """ + CUDA-valid Atomic Operation capabilities + """ + {{if 'CU_ATOMIC_CAPABILITY_SIGNED' in found_values}} + CU_ATOMIC_CAPABILITY_SIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SIGNED{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_UNSIGNED' in found_values}} + CU_ATOMIC_CAPABILITY_UNSIGNED = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_UNSIGNED{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_REDUCTION' in found_values}} + CU_ATOMIC_CAPABILITY_REDUCTION = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_REDUCTION{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_32' in found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_32 = 
cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_32{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_64' in found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_64 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_64{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_SCALAR_128' in found_values}} + CU_ATOMIC_CAPABILITY_SCALAR_128 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_SCALAR_128{{endif}} + {{if 'CU_ATOMIC_CAPABILITY_VECTOR_32x4' in found_values}} + CU_ATOMIC_CAPABILITY_VECTOR_32x4 = cydriver.CUatomicOperationCapability_enum.CU_ATOMIC_CAPABILITY_VECTOR_32x4{{endif}} + +_dict_CUatomicOperationCapability = dict(((int(v), v) for k, v in CUatomicOperationCapability.__members__.items())) +{{endif}} {{if 'CUresourceViewFormat_enum' in found_types}} class CUresourceViewFormat(IntEnum): @@ -4328,6 +4479,10 @@ class CUexternalMemoryHandleType(IntEnum): #: Handle is an NvSciBuf object CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = cydriver.CUexternalMemoryHandleType_enum.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF{{endif}} + {{if 'CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD' in found_values}} + + #: Handle is a dma_buf file descriptor + CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD = cydriver.CUexternalMemoryHandleType_enum.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD{{endif}} _dict_CUexternalMemoryHandleType = dict(((int(v), v) for k, v in CUexternalMemoryHandleType.__members__.items())) {{endif}} @@ -4445,6 +4600,11 @@ class CUmemLocationType(IntEnum): """ {{if 'CU_MEM_LOCATION_TYPE_INVALID' in found_values}} CU_MEM_LOCATION_TYPE_INVALID = cydriver.CUmemLocationType_enum.CU_MEM_LOCATION_TYPE_INVALID{{endif}} + {{if 'CU_MEM_LOCATION_TYPE_NONE' in found_values}} + + #: Location is unspecified. This is used when creating a managed memory + #: pool to indicate no preferred location for the pool + CU_MEM_LOCATION_TYPE_NONE = cydriver.CUmemLocationType_enum.CU_MEM_LOCATION_TYPE_NONE{{endif}} {{if 'CU_MEM_LOCATION_TYPE_DEVICE' in found_values}} #: Location is a device location, thus id is a device ordinal @@ -4479,6 +4639,10 @@ class CUmemAllocationType(IntEnum): #: This allocation type is 'pinned', i.e. cannot migrate from its #: current location while the application is actively using it CU_MEM_ALLOCATION_TYPE_PINNED = cydriver.CUmemAllocationType_enum.CU_MEM_ALLOCATION_TYPE_PINNED{{endif}} + {{if 'CU_MEM_ALLOCATION_TYPE_MANAGED' in found_values}} + + #: This allocation type is managed memory + CU_MEM_ALLOCATION_TYPE_MANAGED = cydriver.CUmemAllocationType_enum.CU_MEM_ALLOCATION_TYPE_MANAGED{{endif}} {{if 'CU_MEM_ALLOCATION_TYPE_MAX' in found_values}} CU_MEM_ALLOCATION_TYPE_MAX = cydriver.CUmemAllocationType_enum.CU_MEM_ALLOCATION_TYPE_MAX{{endif}} @@ -6276,6 +6440,29 @@ class CUkernelNodeAttrID(IntEnum): #: is only a hint, and the CUDA driver can choose a different #: configuration if required for the launch. CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT{{endif}} + {{if 'CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. 
+ #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee that performance characteristics will be maintained across + #: driver versions, and a driver update could alter or regress previously + #: observed performance characteristics). It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are + #: 0 (disabled) and 1 (enabled). + CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING{{endif}} _dict_CUlaunchAttributeID = dict(((int(v), v) for k, v in CUlaunchAttributeID.__members__.items())) {{endif}} @@ -6468,6 +6655,29 @@ class CUstreamAttrID(IntEnum): #: is only a hint, and the CUDA driver can choose a different #: configuration if required for the launch. CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT{{endif}} + {{if 'CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. + #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee that performance characteristics will be maintained across + #: driver versions, and a driver update could alter or regress previously + #: observed performance characteristics). It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are + #: 0 (disabled) and 1 (enabled). + CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING = cydriver.CUlaunchAttributeID_enum.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING{{endif}} _dict_CUlaunchAttributeID = dict(((int(v), v) for k, v in CUlaunchAttributeID.__members__.items())) {{endif}} @@ -7780,7 +7990,7 @@ cdef class CUstreamMemOpWaitValueParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue.flags' in found_struct}} flags : unsigned int - + See CUstreamWaitValue_flags. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue.alias' in found_struct}} alias : CUdeviceptr @@ -7970,7 +8180,7 @@ cdef class CUstreamMemOpWriteValueParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue.flags' in found_struct}} flags : unsigned int - + See CUstreamWriteValue_flags. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue.alias' in found_struct}} alias : CUdeviceptr @@ -8148,7 +8358,7 @@ cdef class CUstreamMemOpFlushRemoteWritesParams_st: {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.flags' in found_struct}} flags : unsigned int - + Must be 0.
{{endif}} Methods @@ -8278,23 +8488,26 @@ cdef class CUstreamBatchMemOpParams_union: ---------- {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}} operation : CUstreamBatchMemOpType - + Operation. This is the first field of all the union elements and + acts as a TAG to determine which union member is valid. {{endif}} {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}} waitValue : CUstreamMemOpWaitValueParams_st - + Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and + CU_STREAM_MEM_OP_WAIT_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}} writeValue : CUstreamMemOpWriteValueParams_st - + Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and + CU_STREAM_MEM_OP_WRITE_VALUE_64 operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}} flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st - + Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}} memoryBarrier : CUstreamMemOpMemoryBarrierParams_st - + Params for CU_STREAM_MEM_OP_BARRIER operations. {{endif}} {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}} pad : List[cuuint64_t] @@ -8427,6 +8640,9 @@ cdef class CUstreamBatchMemOpParams_union: cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st: """ + Batch memory operation node parameters. Used in the legacy + cuGraphAddBatchMemOpNode API. New code should use cuGraphAddNode(). + Attributes ---------- {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}} @@ -11538,6 +11754,10 @@ cdef class CUlaunchAttributeValue_union: Value of launch attribute CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + + {{endif}} Methods ------- @@ -11669,6 +11889,12 @@ cdef class CUlaunchAttributeValue_union: except ValueError: str_list += ['sharedMemCarveout : <ValueError>'] {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + try: + str_list += ['nvlinkUtilCentricScheduling : ' + str(self.nvlinkUtilCentricScheduling)] + except ValueError: + str_list += ['nvlinkUtilCentricScheduling : <ValueError>'] + {{endif}} return '\n'.join(str_list) else: return '' @@ -11809,6 +12035,14 @@ cdef class CUlaunchAttributeValue_union: def sharedMemCarveout(self, unsigned int sharedMemCarveout): self._pvt_ptr[0].sharedMemCarveout = sharedMemCarveout {{endif}} + {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}} + @property + def nvlinkUtilCentricScheduling(self): + return self._pvt_ptr[0].nvlinkUtilCentricScheduling + @nvlinkUtilCentricScheduling.setter + def nvlinkUtilCentricScheduling(self, unsigned int nvlinkUtilCentricScheduling): + self._pvt_ptr[0].nvlinkUtilCentricScheduling = nvlinkUtilCentricScheduling + {{endif}} {{endif}} {{if 'CUlaunchAttribute_st' in found_struct}} @@ -15812,8 +16046,8 @@ cdef class CUDA_RESOURCE_VIEW_DESC_st: cdef class CUtensorMap_st: """ - Tensor map descriptor. Requires compiler support for aligning to 64 - bytes. + Tensor map descriptor. Requires compiler support for aligning to + 128 bytes.
Attributes ---------- @@ -21592,17 +21826,21 @@ cdef class CUcheckpointCheckpointArgs_st: {{endif}} {{endif}} -{{if 'CUcheckpointRestoreArgs_st' in found_struct}} +{{if 'CUcheckpointGpuPair_st' in found_struct}} -cdef class CUcheckpointRestoreArgs_st: +cdef class CUcheckpointGpuPair_st: """ - CUDA checkpoint optional restore arguments + CUDA checkpoint GPU UUID pairs for device remapping during restore Attributes ---------- - {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}} - reserved : List[cuuint64_t] - Reserved for future use, must be zeroed + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} + oldUuid : CUuuid + UUID of the GPU that was checkpointed + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + newUuid : CUuuid + UUID of the GPU to restore onto {{endif}} Methods @@ -21614,9 +21852,15 @@ cdef class CUcheckpointRestoreArgs_st: if _ptr == 0: self._pvt_ptr = &self._pvt_val else: - self._pvt_ptr = _ptr + self._pvt_ptr = _ptr def __init__(self, void_ptr _ptr = 0): pass + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} + self._oldUuid = CUuuid(_ptr=&self._pvt_ptr[0].oldUuid) + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + self._newUuid = CUuuid(_ptr=&self._pvt_ptr[0].newUuid) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -21624,23 +21868,36 @@ cdef class CUcheckpointRestoreArgs_st: def __repr__(self): if self._pvt_ptr is not NULL: str_list = [] - {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}} + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} try: - str_list += ['reserved : ' + str(self.reserved)] + str_list += ['oldUuid :\n' + '\n'.join([' ' + line for line in str(self.oldUuid).splitlines()])] except ValueError: - str_list += ['reserved : '] + str_list += ['oldUuid : '] + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + try: + str_list += ['newUuid :\n' + '\n'.join([' ' + line for line in str(self.newUuid).splitlines()])] + except ValueError: + str_list += ['newUuid : '] {{endif}} return '\n'.join(str_list) else: return '' - {{if 'CUcheckpointRestoreArgs_st.reserved' in found_struct}} + {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}} @property - def reserved(self): - return [cuuint64_t(init_value=_reserved) for _reserved in self._pvt_ptr[0].reserved] - @reserved.setter - def reserved(self, reserved): - self._pvt_ptr[0].reserved = reserved - + def oldUuid(self): + return self._oldUuid + @oldUuid.setter + def oldUuid(self, oldUuid not None : CUuuid): + string.memcpy(&self._pvt_ptr[0].oldUuid, oldUuid.getPtr(), sizeof(self._pvt_ptr[0].oldUuid)) + {{endif}} + {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}} + @property + def newUuid(self): + return self._newUuid + @newUuid.setter + def newUuid(self, newUuid not None : CUuuid): + string.memcpy(&self._pvt_ptr[0].newUuid, newUuid.getPtr(), sizeof(self._pvt_ptr[0].newUuid)) {{endif}} {{endif}} {{if 'CUcheckpointUnlockArgs_st' in found_struct}} @@ -21877,6 +22134,19 @@ cdef class CUdevSmResource_st: The amount of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. {{endif}} + {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} + minSmPartitionSize : unsigned int + The minimum number of streaming multiprocessors required to + partition this resource. This is an output parameter only, do not + write to this field. 
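# Illustrative sketch (editor's addition, not generated bindings code): filling
# the new CUcheckpointGpuPair_st so a checkpointed process can be restored onto
# a different GPU. How the pairs are handed to the restore call is outside this
# hunk, so only struct population is shown; the class name follows this hunk.
from cuda.bindings import driver

err, old_uuid = driver.cuDeviceGetUuid(driver.CUdevice(0))
err, new_uuid = driver.cuDeviceGetUuid(driver.CUdevice(1))
pair = driver.CUcheckpointGpuPair_st()
pair.oldUuid = old_uuid  # UUID of the GPU that was checkpointed
pair.newUuid = new_uuid  # UUID of the GPU to restore onto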
+ {{endif}} + {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} + smCoscheduledAlignment : unsigned int + The number of streaming multiprocessors in this resource that are + guaranteed to be co-scheduled on the same GPU processing cluster. + smCount is a multiple of this value. This is an output parameter + only, do not write to this field. + {{endif}} Methods ------- @@ -21903,6 +22173,18 @@ cdef class CUdevSmResource_st: except ValueError: str_list += ['smCount : '] {{endif}} + {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} + try: + str_list += ['minSmPartitionSize : ' + str(self.minSmPartitionSize)] + except ValueError: + str_list += ['minSmPartitionSize : '] + {{endif}} + {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} + try: + str_list += ['smCoscheduledAlignment : ' + str(self.smCoscheduledAlignment)] + except ValueError: + str_list += ['smCoscheduledAlignment : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -21914,6 +22196,22 @@ cdef class CUdevSmResource_st: def smCount(self, unsigned int smCount): self._pvt_ptr[0].smCount = smCount {{endif}} + {{if 'CUdevSmResource_st.minSmPartitionSize' in found_struct}} + @property + def minSmPartitionSize(self): + return self._pvt_ptr[0].minSmPartitionSize + @minSmPartitionSize.setter + def minSmPartitionSize(self, unsigned int minSmPartitionSize): + self._pvt_ptr[0].minSmPartitionSize = minSmPartitionSize + {{endif}} + {{if 'CUdevSmResource_st.smCoscheduledAlignment' in found_struct}} + @property + def smCoscheduledAlignment(self): + return self._pvt_ptr[0].smCoscheduledAlignment + @smCoscheduledAlignment.setter + def smCoscheduledAlignment(self, unsigned int smCoscheduledAlignment): + self._pvt_ptr[0].smCoscheduledAlignment = smCoscheduledAlignment + {{endif}} {{endif}} {{if 'CUdevResource_st' in found_struct}} @@ -22832,6 +23130,12 @@ def cuGetErrorName(error not None : CUresult): def cuInit(unsigned int Flags): """ Initialize the CUDA driver API Initializes the driver API and must be called before any other function from the driver API in the current process. Currently, the `Flags` parameter must be 0. If :py:obj:`~.cuInit()` has not been called, any function from the driver API will return :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`. + Note: cuInit preloads various libraries needed for JIT compilation. To + opt-out of this behavior, set the environment variable + CUDA_FORCE_PRELOAD_LIBRARIES=0. CUDA will lazily load JIT libraries as + needed. To disable JIT entirely, set the environment variable + CUDA_DISABLE_JIT=1. + Parameters ---------- Flags : unsigned int @@ -22982,56 +23286,12 @@ def cuDeviceGetName(int length, dev): return (_dict_CUresult[err], pyname) {{endif}} -{{if 'cuDeviceGetUuid' in found_functions}} +{{if 'cuDeviceGetUuid_v2' in found_functions}} @cython.embedsignature(True) def cuDeviceGetUuid(dev): """ Return an UUID for the device. - Note there is a later version of this API, - :py:obj:`~.cuDeviceGetUuid_v2`. It will supplant this version in 12.0, - which is retained for minor version compatibility. - - Returns 16-octets identifying the device `dev` in the structure pointed - by the `uuid`. 
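# Illustrative sketch (editor's addition, not generated bindings code): the
# environment knobs named in the cuInit() note above only take effect if they
# are set before the driver is initialized in the process.
import os
os.environ["CUDA_FORCE_PRELOAD_LIBRARIES"] = "0"  # lazily load JIT libraries
# os.environ["CUDA_DISABLE_JIT"] = "1"            # or disable JIT entirely
from cuda.bindings import driver
(err,) = driver.cuInit(0)
assert err == driver.CUresult.CUDA_SUCCESS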
- - Parameters - ---------- - dev : :py:obj:`~.CUdevice` - Device to get identifier string for - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` - uuid : :py:obj:`~.CUuuid` - Returned UUID - - See Also - -------- - :py:obj:`~.cuDeviceGetUuid_v2` :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaGetDeviceProperties` - """ - cdef cydriver.CUdevice cydev - if dev is None: - pdev = 0 - elif isinstance(dev, (CUdevice,)): - pdev = int(dev) - else: - pdev = int(CUdevice(dev)) - cydev = pdev - cdef CUuuid uuid = CUuuid() - err = cydriver.cuDeviceGetUuid(uuid._pvt_ptr, cydev) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], uuid) -{{endif}} - -{{if 'cuDeviceGetUuid_v2' in found_functions}} - -@cython.embedsignature(True) -def cuDeviceGetUuid_v2(dev): - """ Return an UUID for the device (11.4+) - Returns 16-octets identifying the device `dev` in the structure pointed by the `uuid`. If the device is in MIG mode, returns its MIG UUID which uniquely identifies the subscribed MIG compute instance. @@ -23061,7 +23321,7 @@ def cuDeviceGetUuid_v2(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUuuid uuid = CUuuid() - err = cydriver.cuDeviceGetUuid_v2(uuid._pvt_ptr, cydev) + err = cydriver.cuDeviceGetUuid(uuid._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], uuid) @@ -23204,409 +23464,7 @@ def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev): """ Returns information about the device. Returns in `*pi` the integer value of the attribute `attrib` on device - `dev`. 
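# Illustrative sketch (editor's addition, not generated bindings code): with the
# consolidation above, cuDeviceGetUuid() now always returns the MIG-aware UUID
# (the former _v2 behavior) and no separate _v2 Python wrapper remains.
from cuda.bindings import driver
driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
err, uuid = driver.cuDeviceGetUuid(dev)
print(bytes(uuid.bytes).hex())  # 16-octet device identifier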
The supported attributes are: - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK`: Maximum number - of threads per block; - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X`: Maximum x-dimension - of a block - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y`: Maximum y-dimension - of a block - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z`: Maximum z-dimension - of a block - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X`: Maximum x-dimension - of a grid - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y`: Maximum y-dimension - of a grid - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z`: Maximum z-dimension - of a grid - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`: Maximum - amount of shared memory available to a thread block in bytes - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY`: Memory - available on device for constant variables in a CUDA C kernel in - bytes - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_WARP_SIZE`: Warp size in threads - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`: Maximum pitch in bytes - allowed by the memory copy functions that involve memory regions - allocated through :py:obj:`~.cuMemAllocPitch()` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH`: Maximum 1D - texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`: - Maximum width for a 1D texture bound to linear memory - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH`: - Maximum mipmapped 1D texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH`: Maximum 2D - texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT`: Maximum 2D - texture height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH`: - Maximum width for a 2D texture bound to linear memory - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`: - Maximum height for a 2D texture bound to linear memory - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`: - Maximum pitch in bytes for a 2D texture bound to linear memory - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH`: - Maximum mipmapped 2D texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT`: - Maximum mipmapped 2D texture height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH`: Maximum 3D - texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT`: Maximum 3D - texture height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH`: Maximum 3D - texture depth - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE`: - Alternate maximum 3D texture width, 0 if no alternate maximum 3D - texture size is supported - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE`: - Alternate maximum 3D texture height, 0 if no alternate maximum 3D - texture size is supported - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE`: - Alternate maximum 3D texture depth, 0 if no alternate maximum 3D - texture size is supported - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH`: Maximum - cubemap texture width or height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH`: - Maximum 1D layered texture width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS`: - Maximum layers in a 1D layered texture - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH`: - Maximum 2D layered texture width - - - 
:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT`: - Maximum 2D layered texture height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS`: - Maximum layers in a 2D layered texture - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH`: - Maximum cubemap layered texture width or height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS`: - Maximum layers in a cubemap layered texture - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH`: Maximum 1D - surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH`: Maximum 2D - surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT`: Maximum 2D - surface height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH`: Maximum 3D - surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT`: Maximum 3D - surface height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH`: Maximum 3D - surface depth - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH`: - Maximum 1D layered surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS`: - Maximum layers in a 1D layered surface - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH`: - Maximum 2D layered surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT`: - Maximum 2D layered surface height - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS`: - Maximum layers in a 2D layered surface - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH`: Maximum - cubemap surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH`: - Maximum cubemap layered surface width - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS`: - Maximum layers in a cubemap layered surface - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK`: Maximum - number of 32-bit registers available to a thread block - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CLOCK_RATE`: The typical clock - frequency in kilohertz - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`: Alignment - requirement; texture base addresses aligned to - :py:obj:`~.textureAlign` bytes do not need an offset applied to - texture fetches - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`: Pitch - alignment requirement for 2D texture references bound to pitched - memory - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP`: 1 if the device can - concurrently copy memory between host and device while executing a - kernel, or 0 if not - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`: Number of - multiprocessors on the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT`: 1 if there is a - run time limit for kernels executed on the device, or 0 if not - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_INTEGRATED`: 1 if the device is - integrated with the memory subsystem, or 0 if not - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY`: 1 if the device - can map host memory into the CUDA address space, or 0 if not - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE`: Compute mode that - device is currently in. Available modes are as follows: - - - :py:obj:`~.CU_COMPUTEMODE_DEFAULT`: Default mode - Device is not - restricted and can have multiple CUDA contexts present at a single - time. - - - :py:obj:`~.CU_COMPUTEMODE_PROHIBITED`: Compute-prohibited mode - - Device is prohibited from creating new CUDA contexts. 
- - - :py:obj:`~.CU_COMPUTEMODE_EXCLUSIVE_PROCESS`: Compute-exclusive- - process mode - Device can have only one context used by a single - process at a time. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS`: 1 if the device - supports executing multiple kernels within the same context - simultaneously, or 0 if not. It is not guaranteed that multiple - kernels will be resident on the device concurrently so this feature - should not be relied upon for correctness. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_ECC_ENABLED`: 1 if error correction is - enabled on the device, 0 if error correction is disabled or not - supported by the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID`: PCI bus identifier of the - device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID`: PCI device (also known - as slot) identifier of the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID`: PCI domain identifier - of the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_TCC_DRIVER`: 1 if the device is using - a TCC driver. TCC is only available on Tesla hardware running Windows - Vista or later - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE`: Peak memory clock - frequency in kilohertz - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH`: Global - memory bus width in bits - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE`: Size of L2 cache in - bytes. 0 if the device doesn't have L2 cache - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR`: - Maximum resident threads per multiprocessor - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`: 1 if the device - shares a unified address space with the host, or 0 if not - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR`: Major - compute capability version number - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR`: Minor - compute capability version number - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED`: 1 if - device supports caching globals in L1 cache, 0 if caching globals in - L1 cache is not supported by the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED`: 1 if device - supports caching locals in L1 cache, 0 if caching locals in L1 cache - is not supported by the device - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR`: - Maximum amount of shared memory available to a multiprocessor in - bytes; this amount is shared by all thread blocks simultaneously - resident on a multiprocessor - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR`: - Maximum number of 32-bit registers available to a multiprocessor; - this number is shared by all thread blocks simultaneously resident on - a multiprocessor - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`: 1 if device supports - allocating managed memory on this system, 0 if allocating managed - memory is not supported by the device on this system. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD`: 1 if device is on a - multi-GPU board, 0 if not. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID`: Unique - identifier for a group of devices associated with the same board. - Devices on the same multi-GPU board will share the same identifier. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED`: 1 if - Link between the device and the host supports native atomic - operations. 
- - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO`: - Ratio of single precision performance (in floating-point operations - per second) to double precision performance. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`: Device - supports coherently accessing pageable memory without calling - cudaHostRegister on it. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`: Device can - coherently access managed memory concurrently with the CPU. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED`: Device - supports Compute Preemption. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM`: - Device can access host registered memory at the same virtual address - as the CPU. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN`: - The maximum per block shared memory size supported on this device. - This is the maximum value that can be opted into when using the - :py:obj:`~.cuFuncSetAttribute()` or - :py:obj:`~.cuKernelSetAttribute()` call. For more details see - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`: - Device accesses pageable memory via the host's page tables. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST`: - The host can directly access managed memory on the device without - migration. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`: - Device supports virtual memory management APIs like - :py:obj:`~.cuMemAddressReserve`, :py:obj:`~.cuMemCreate`, - :py:obj:`~.cuMemMap` and related APIs - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED`: - Device supports exporting memory to a posix file descriptor with - :py:obj:`~.cuMemExportToShareableHandle`, if requested via - :py:obj:`~.cuMemCreate` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED`: - Device supports exporting memory to a Win32 NT handle with - :py:obj:`~.cuMemExportToShareableHandle`, if requested via - :py:obj:`~.cuMemCreate` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED`: - Device supports exporting memory to a Win32 KMT handle with - :py:obj:`~.cuMemExportToShareableHandle`, if requested via - :py:obj:`~.cuMemCreate` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR`: - Maximum number of thread blocks that can reside on a multiprocessor - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED`: Device - supports compressible memory allocation via :py:obj:`~.cuMemCreate` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE`: Maximum - L2 persisting lines capacity setting in bytes - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE`: - Maximum value of :py:obj:`~.CUaccessPolicyWindow.num_bytes` - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED`: - Device supports specifying the GPUDirect RDMA flag with - :py:obj:`~.cuMemCreate`. - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK`: - Amount of shared memory per block reserved by CUDA driver in bytes - - - :py:obj:`~.CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED`: Device - supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED`:
-      Device supports using the :py:obj:`~.cuMemHostRegister` flag
-      :py:obj:`~.CU_MEMHOSTERGISTER_READ_ONLY` to register memory that must
-      be mapped as read-only to the GPU
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED`: Device
-      supports using the :py:obj:`~.cuMemAllocAsync` and
-      :py:obj:`~.cuMemPool` family of APIs
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED`: Device
-      supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
-      https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS`:
-      The returned attribute shall be interpreted as a bitmask, where the
-      individual bits are described by the
-      :py:obj:`~.CUflushGPUDirectRDMAWritesOptions` enum
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING`:
-      GPUDirect RDMA writes to the device do not need to be flushed for
-      consumers within the scope indicated by the returned attribute. See
-      :py:obj:`~.CUGPUDirectRDMAWritesOrdering` for the numerical values
-      returned here.
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES`:
-      Bitmask of handle types supported with mempool based IPC
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED`:
-      Device supports deferred mapping CUDA arrays and CUDA mipmapped
-      arrays.
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_NUMA_CONFIG`: NUMA configuration of a
-      device: value is of type :py:obj:`~.CUdeviceNumaConfig` enum
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_NUMA_ID`: NUMA node ID of the GPU
-      memory
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED`: Device supports
-      switch multicast and reduction operations.
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID`: The combined
-      16-bit PCI device ID and 16-bit PCI vendor ID.
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID`: The combined
-      16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. ID.
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`:
-      Device supports HOST_NUMA location with the virtual memory management
-      APIs like :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap` and related
-      APIs
-
-    - :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED`:
-      Device supports HOST_NUMA location with the
-      :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs
+    `dev`.

     Parameters
     ----------
@@ -23642,6 +23500,76 @@ def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev):
     return (_dict_CUresult[err], pi)
 {{endif}}

+{{if 'cuDeviceGetHostAtomicCapabilities' in found_functions}}
+
+@cython.embedsignature(True)
+def cuDeviceGetHostAtomicCapabilities(operations : Optional[Tuple[CUatomicOperation] | List[CUatomicOperation]], unsigned int count, dev):
+    """ Queries details about atomic operations supported between the device and host.
+
+    Returns in `*capabilities` the details about requested atomic
+    `*operations` over the link between `dev` and the host. The
+    allocated size of `*operations` and `*capabilities` must be `count`.
+
+    For each :py:obj:`~.CUatomicOperation` in `*operations`, the
+    corresponding result in `*capabilities` will be a bitmask indicating
+    which of :py:obj:`~.CUatomicOperationCapability` the link supports
+    natively.
+
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `dev` is not valid.
+
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `*capabilities` or
+    `*operations` is NULL, if `count` is 0, or if any of `*operations` is
+    not valid.
+
+    Parameters
+    ----------
+    operations : List[:py:obj:`~.CUatomicOperation`]
+        Requested operations
+    count : unsigned int
+        Count of requested operations and size of capabilities
+    dev : :py:obj:`~.CUdevice`
+        Device handle
+
+    Returns
+    -------
+    CUresult
+        :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
+    capabilities : List[unsigned int]
+        Returned capability details of each requested operation
+
+    See Also
+    --------
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cudaDeviceGetHostAtomicCapabilities`
+    """
+    cdef cydriver.CUdevice cydev
+    if dev is None:
+        pdev = 0
+    elif isinstance(dev, (CUdevice,)):
+        pdev = int(dev)
+    else:
+        pdev = int(CUdevice(dev))
+    cydev = pdev
+    operations = [] if operations is None else operations
+    if not all(isinstance(_x, (CUatomicOperation)) for _x in operations):
+        raise TypeError("Argument 'operations' is not instance of type (expected Tuple[cydriver.CUatomicOperation] or List[cydriver.CUatomicOperation]")
+    cdef unsigned int* cycapabilities = NULL
+    pycapabilities = []
+    if count != 0:
+        cycapabilities = <unsigned int*>calloc(count, sizeof(unsigned int))
+        if cycapabilities is NULL:
+            raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int)))
+    cdef vector[cydriver.CUatomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)]
+    if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count))
+    err = cydriver.cuDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, cydev)
+    if CUresult(err) == CUresult(0):
+        pycapabilities = [cycapabilities[idx] for idx in range(count)]
+    if cycapabilities is not NULL:
+        free(cycapabilities)
+    if err != cydriver.CUDA_SUCCESS:
+        return (_dict_CUresult[err], None)
+    return (_dict_CUresult[err], pycapabilities)
+{{endif}}
+
 {{if 'cuDeviceGetNvSciSyncAttributes' in found_functions}}

 @cython.embedsignature(True)
@@ -24400,320 +24328,10 @@ def cuDevicePrimaryCtxReset(dev):
     return (_dict_CUresult[err],)
 {{endif}}

-{{if 'cuCtxCreate_v2' in found_functions}}
-
-@cython.embedsignature(True)
-def cuCtxCreate(unsigned int flags, dev):
-    """ Create a CUDA context.
-
-    Creates a new CUDA context and associates it with the calling thread.
-    The `flags` parameter is described below. The context is created with a
-    usage count of 1 and the caller of :py:obj:`~.cuCtxCreate()` must call
-    :py:obj:`~.cuCtxDestroy()` when done using the context. If a context is
-    already current to the thread, it is supplanted by the newly created
-    context and may be restored by a subsequent call to
-    :py:obj:`~.cuCtxPopCurrent()`.
-
-    The three LSBs of the `flags` parameter can be used to control how the
-    OS thread, which owns the CUDA context at the time of an API call,
-    interacts with the OS scheduler when waiting for results from the GPU.
-    Only one of the scheduling flags can be set when creating a context.
-
-    - :py:obj:`~.CU_CTX_SCHED_SPIN`: Instruct CUDA to actively spin when
-      waiting for results from the GPU. This can decrease latency when
-      waiting for the GPU, but may lower the performance of CPU threads if
-      they are performing work in parallel with the CUDA thread.
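# Illustrative sketch (editor's addition, not generated bindings code): querying
# host<->device atomic support with the new wrapper above. The CUatomicOperation
# member name below is an assumption; the enum itself is defined elsewhere in
# this patch.
from cuda.bindings import driver
driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
ops = [driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_ADD]  # assumed member name
err, caps = driver.cuDeviceGetHostAtomicCapabilities(ops, len(ops), dev)
if err == driver.CUresult.CUDA_SUCCESS:
    print(hex(caps[0]))  # bitmask of CUatomicOperationCapability flags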
- - - :py:obj:`~.CU_CTX_SCHED_YIELD`: Instruct CUDA to yield its thread - when waiting for results from the GPU. This can increase latency when - waiting for the GPU, but can increase the performance of CPU threads - performing work in parallel with the GPU. - - - :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`: Instruct CUDA to block the - CPU thread on a synchronization primitive when waiting for the GPU to - finish work. - - - :py:obj:`~.CU_CTX_BLOCKING_SYNC`: Instruct CUDA to block the CPU - thread on a synchronization primitive when waiting for the GPU to - finish work. Deprecated: This flag was deprecated as of CUDA 4.0 - and was replaced with :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`. - - - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the `flags` - parameter is zero, uses a heuristic based on the number of active - CUDA contexts in the process `C` and the number of logical processors - in the system `P`. If `C` > `P`, then CUDA will yield to other OS - threads when waiting for the GPU (:py:obj:`~.CU_CTX_SCHED_YIELD`), - otherwise CUDA will not yield while waiting for results and actively - spin on the processor (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally, - on Tegra devices, :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic - based on the power profile of the platform and may choose - :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` for low-powered devices. - - - :py:obj:`~.CU_CTX_MAP_HOST`: Instruct CUDA to support mapped pinned - allocations. This flag must be set in order to allocate pinned host - memory that is accessible to the GPU. - - - :py:obj:`~.CU_CTX_LMEM_RESIZE_TO_MAX`: Instruct CUDA to not reduce - local memory after resizing local memory for a kernel. This can - prevent thrashing by local memory allocations when launching many - kernels with high local memory usage at the cost of potentially - increased memory usage. Deprecated: This flag is deprecated and the - behavior enabled by this flag is now the default and cannot be - disabled. Instead, the per-thread stack size can be controlled with - :py:obj:`~.cuCtxSetLimit()`. - - - :py:obj:`~.CU_CTX_COREDUMP_ENABLE`: If GPU coredumps have not been - enabled globally with :py:obj:`~.cuCoredumpSetAttributeGlobal` or - environment variables, this flag can be set during context creation - to instruct CUDA to create a coredump if this context raises an - exception during execution. These environment variables are described - in the CUDA-GDB user guide under the "GPU core dump support" section. - The initial attributes will be taken from the global attributes at - the time of context creation. The other attributes that control - coredump output can be modified by calling - :py:obj:`~.cuCoredumpSetAttribute` from the created context after it - becomes current. - - - :py:obj:`~.CU_CTX_USER_COREDUMP_ENABLE`: If user-triggered GPU - coredumps have not been enabled globally with - :py:obj:`~.cuCoredumpSetAttributeGlobal` or environment variables, - this flag can be set during context creation to instruct CUDA to - create a coredump if data is written to a certain pipe that is - present in the OS space. These environment variables are described in - the CUDA-GDB user guide under the "GPU core dump support" section. It - is important to note that the pipe name `must` be set with - :py:obj:`~.cuCoredumpSetAttributeGlobal` before creating the context - if this flag is used. Setting this flag implies that - :py:obj:`~.CU_CTX_COREDUMP_ENABLE` is set. The initial attributes - will be taken from the global attributes at the time of context - creation. 
The other attributes that control coredump output can be - modified by calling :py:obj:`~.cuCoredumpSetAttribute` from the - created context after it becomes current. Setting this flag on any - context creation is equivalent to setting the - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` attribute to `true` - globally. - - - :py:obj:`~.CU_CTX_SYNC_MEMOPS`: Ensures that synchronous memory - operations initiated on this context will always synchronize. See - further documentation in the section titled "API Synchronization - behavior" to learn more about cases when synchronous memory - operations can exhibit asynchronous behavior. - - Context creation will fail with :py:obj:`~.CUDA_ERROR_UNKNOWN` if the - compute mode of the device is :py:obj:`~.CU_COMPUTEMODE_PROHIBITED`. - The function :py:obj:`~.cuDeviceGetAttribute()` can be used with - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE` to determine the compute - mode of the device. The `nvidia-smi` tool can be used to set the - compute mode for * devices. Documentation for `nvidia-smi` can be - obtained by passing a -h option to it. - - Parameters - ---------- - flags : unsigned int - Context creation flags - dev : :py:obj:`~.CUdevice` - Device to create context on - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN` - pctx : :py:obj:`~.CUcontext` - Returned context handle of the new context - - See Also - -------- - :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCoredumpSetAttributeGlobal`, :py:obj:`~.cuCoredumpSetAttribute`, :py:obj:`~.cuCtxSynchronize` - - Notes - ----- - In most cases it is recommended to use :py:obj:`~.cuDevicePrimaryCtxRetain`. - """ - cdef cydriver.CUdevice cydev - if dev is None: - pdev = 0 - elif isinstance(dev, (CUdevice,)): - pdev = int(dev) - else: - pdev = int(CUdevice(dev)) - cydev = pdev - cdef CUcontext pctx = CUcontext() - err = cydriver.cuCtxCreate(pctx._pvt_ptr, flags, cydev) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], pctx) -{{endif}} - -{{if 'cuCtxCreate_v3' in found_functions}} - -@cython.embedsignature(True) -def cuCtxCreate_v3(paramsArray : Optional[Tuple[CUexecAffinityParam] | List[CUexecAffinityParam]], int numParams, unsigned int flags, dev): - """ Create a CUDA context with execution affinity. - - Creates a new CUDA context with execution affinity and associates it - with the calling thread. The `paramsArray` and `flags` parameter are - described below. The context is created with a usage count of 1 and the - caller of :py:obj:`~.cuCtxCreate()` must call - :py:obj:`~.cuCtxDestroy()` when done using the context. If a context is - already current to the thread, it is supplanted by the newly created - context and may be restored by a subsequent call to - :py:obj:`~.cuCtxPopCurrent()`. - - The type and the amount of execution resource the context can use is - limited by `paramsArray` and `numParams`. The `paramsArray` is an array - of `CUexecAffinityParam` and the `numParams` describes the size of the - array. 
If two `CUexecAffinityParam` in the array have the same type, - the latter execution affinity parameter overrides the former execution - affinity parameter. The supported execution affinity types are: - - - :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT` limits the portion of SMs - that the context can use. The portion of SMs is specified as the - number of SMs via `CUexecAffinitySmCount`. This limit will be - internally rounded up to the next hardware-supported amount. Hence, - it is imperative to query the actual execution affinity of the - context via `cuCtxGetExecAffinity` after context creation. Currently, - this attribute is only supported under Volta+ MPS. - - The three LSBs of the `flags` parameter can be used to control how the - OS thread, which owns the CUDA context at the time of an API call, - interacts with the OS scheduler when waiting for results from the GPU. - Only one of the scheduling flags can be set when creating a context. - - - :py:obj:`~.CU_CTX_SCHED_SPIN`: Instruct CUDA to actively spin when - waiting for results from the GPU. This can decrease latency when - waiting for the GPU, but may lower the performance of CPU threads if - they are performing work in parallel with the CUDA thread. - - - :py:obj:`~.CU_CTX_SCHED_YIELD`: Instruct CUDA to yield its thread - when waiting for results from the GPU. This can increase latency when - waiting for the GPU, but can increase the performance of CPU threads - performing work in parallel with the GPU. - - - :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`: Instruct CUDA to block the - CPU thread on a synchronization primitive when waiting for the GPU to - finish work. - - - :py:obj:`~.CU_CTX_BLOCKING_SYNC`: Instruct CUDA to block the CPU - thread on a synchronization primitive when waiting for the GPU to - finish work. Deprecated: This flag was deprecated as of CUDA 4.0 - and was replaced with :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`. - - - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the `flags` - parameter is zero, uses a heuristic based on the number of active - CUDA contexts in the process `C` and the number of logical processors - in the system `P`. If `C` > `P`, then CUDA will yield to other OS - threads when waiting for the GPU (:py:obj:`~.CU_CTX_SCHED_YIELD`), - otherwise CUDA will not yield while waiting for results and actively - spin on the processor (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally, - on Tegra devices, :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic - based on the power profile of the platform and may choose - :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` for low-powered devices. - - - :py:obj:`~.CU_CTX_MAP_HOST`: Instruct CUDA to support mapped pinned - allocations. This flag must be set in order to allocate pinned host - memory that is accessible to the GPU. - - - :py:obj:`~.CU_CTX_LMEM_RESIZE_TO_MAX`: Instruct CUDA to not reduce - local memory after resizing local memory for a kernel. This can - prevent thrashing by local memory allocations when launching many - kernels with high local memory usage at the cost of potentially - increased memory usage. Deprecated: This flag is deprecated and the - behavior enabled by this flag is now the default and cannot be - disabled. Instead, the per-thread stack size can be controlled with - :py:obj:`~.cuCtxSetLimit()`. 
- - - :py:obj:`~.CU_CTX_COREDUMP_ENABLE`: If GPU coredumps have not been - enabled globally with :py:obj:`~.cuCoredumpSetAttributeGlobal` or - environment variables, this flag can be set during context creation - to instruct CUDA to create a coredump if this context raises an - exception during execution. These environment variables are described - in the CUDA-GDB user guide under the "GPU core dump support" section. - The initial attributes will be taken from the global attributes at - the time of context creation. The other attributes that control - coredump output can be modified by calling - :py:obj:`~.cuCoredumpSetAttribute` from the created context after it - becomes current. - - - :py:obj:`~.CU_CTX_USER_COREDUMP_ENABLE`: If user-triggered GPU - coredumps have not been enabled globally with - :py:obj:`~.cuCoredumpSetAttributeGlobal` or environment variables, - this flag can be set during context creation to instruct CUDA to - create a coredump if data is written to a certain pipe that is - present in the OS space. These environment variables are described in - the CUDA-GDB user guide under the "GPU core dump support" section. It - is important to note that the pipe name `must` be set with - :py:obj:`~.cuCoredumpSetAttributeGlobal` before creating the context - if this flag is used. Setting this flag implies that - :py:obj:`~.CU_CTX_COREDUMP_ENABLE` is set. The initial attributes - will be taken from the global attributes at the time of context - creation. The other attributes that control coredump output can be - modified by calling :py:obj:`~.cuCoredumpSetAttribute` from the - created context after it becomes current. Setting this flag on any - context creation is equivalent to setting the - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` attribute to `true` - globally. - - Context creation will fail with :py:obj:`~.CUDA_ERROR_UNKNOWN` if the - compute mode of the device is :py:obj:`~.CU_COMPUTEMODE_PROHIBITED`. - The function :py:obj:`~.cuDeviceGetAttribute()` can be used with - :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE` to determine the compute - mode of the device. The `nvidia-smi` tool can be used to set the - compute mode for * devices. Documentation for `nvidia-smi` can be - obtained by passing a -h option to it. 
- - Parameters - ---------- - paramsArray : List[:py:obj:`~.CUexecAffinityParam`] - Execution affinity parameters - numParams : int - Number of execution affinity parameters - flags : unsigned int - Context creation flags - dev : :py:obj:`~.CUdevice` - Device to create context on - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY`, :py:obj:`~.CUDA_ERROR_UNKNOWN` - pctx : :py:obj:`~.CUcontext` - Returned context handle of the new context - - See Also - -------- - :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCoredumpSetAttributeGlobal`, :py:obj:`~.cuCoredumpSetAttribute`, :py:obj:`~.CUexecAffinityParam` - """ - cdef cydriver.CUdevice cydev - if dev is None: - pdev = 0 - elif isinstance(dev, (CUdevice,)): - pdev = int(dev) - else: - pdev = int(CUdevice(dev)) - cydev = pdev - paramsArray = [] if paramsArray is None else paramsArray - if not all(isinstance(_x, (CUexecAffinityParam,)) for _x in paramsArray): - raise TypeError("Argument 'paramsArray' is not instance of type (expected Tuple[cydriver.CUexecAffinityParam,] or List[cydriver.CUexecAffinityParam,]") - cdef CUcontext pctx = CUcontext() - cdef cydriver.CUexecAffinityParam* cyparamsArray = NULL - if len(paramsArray) > 0: - cyparamsArray = calloc(len(paramsArray), sizeof(cydriver.CUexecAffinityParam)) - if cyparamsArray is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramsArray)) + 'x' + str(sizeof(cydriver.CUexecAffinityParam))) - for idx in range(len(paramsArray)): - string.memcpy(&cyparamsArray[idx], (paramsArray[idx])._pvt_ptr, sizeof(cydriver.CUexecAffinityParam)) - err = cydriver.cuCtxCreate_v3(pctx._pvt_ptr, (paramsArray[0])._pvt_ptr if len(paramsArray) == 1 else cyparamsArray, numParams, flags, cydev) - if cyparamsArray is not NULL: - free(cyparamsArray) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], pctx) -{{endif}} - {{if 'cuCtxCreate_v4' in found_functions}} @cython.embedsignature(True) -def cuCtxCreate_v4(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flags, dev): +def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flags, dev): """ Create a CUDA context. Creates a new CUDA context and associates it with the calling thread. 
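# Illustrative sketch (editor's addition, not generated bindings code):
# cuCtxCreate() now exposes the _v4 parameter list, taking an optional
# CUctxCreateParams ahead of flags and device. Passing None for the params
# reproduces plain context creation (assumption based on the NULL-params path
# in the wrapper that follows).
from cuda.bindings import driver
driver.cuInit(0)
err, dev = driver.cuDeviceGet(0)
err, ctx = driver.cuCtxCreate(None, 0, dev)
assert err == driver.CUresult.CUDA_SUCCESS
(err,) = driver.cuCtxDestroy(ctx)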
@@ -24886,7 +24504,7 @@ def cuCtxCreate_v4(ctxCreateParams : Optional[CUctxCreateParams], unsigned int f cydev = pdev cdef CUcontext pctx = CUcontext() cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL - err = cydriver.cuCtxCreate_v4(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) + err = cydriver.cuCtxCreate(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -25122,6 +24740,47 @@ def cuCtxGetDevice(): return (_dict_CUresult[err], device) {{endif}} +{{if 'cuCtxGetDevice_v2' in found_functions}} + +@cython.embedsignature(True) +def cuCtxGetDevice_v2(ctx): + """ Returns the device handle for the specified context. + + Returns in `*device` the handle of the specified context's device. If + the specified context is NULL, the API will return the current + context's device. + + Parameters + ---------- + ctx : :py:obj:`~.CUcontext` + Context for which to obtain the device + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + device : :py:obj:`~.CUdevice` + Returned device handle for the specified context + + See Also + -------- + :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent` + """ + cdef cydriver.CUcontext cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (CUcontext,)): + pctx = int(ctx) + else: + pctx = int(CUcontext(ctx)) + cyctx = pctx + cdef CUdevice device = CUdevice() + err = cydriver.cuCtxGetDevice_v2(device._pvt_ptr, cyctx) + if err != cydriver.CUDA_SUCCESS: + return (_dict_CUresult[err], None) + return (_dict_CUresult[err], device) +{{endif}} + {{if 'cuCtxGetFlags' in found_functions}} @cython.embedsignature(True) @@ -25245,6 +24904,50 @@ def cuCtxSynchronize(): return (_dict_CUresult[err],) {{endif}} +{{if 'cuCtxSynchronize_v2' in found_functions}} + +@cython.embedsignature(True) +def cuCtxSynchronize_v2(ctx): + """ Block for the specified context's tasks to complete. + + Blocks until the specified context has completed all preceding + requested tasks. If the specified context is the primary context, green + contexts that have been created will also be synchronized. The API + returns an error if one of the preceding tasks failed. + + If the context was created with the + :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` flag, the CPU thread will block + until the GPU context has finished its work. + + If the specified context is NULL, the API will operate on the current + context. 
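# Illustrative sketch (editor's addition, not generated bindings code): the new
# cuCtxGetDevice_v2 wrapper above queries the device of an explicit context;
# passing None returns the current context's device.
from cuda.bindings import driver
err, dev = driver.cuCtxGetDevice_v2(None)  # device of the current context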
+ + Parameters + ---------- + ctx : :py:obj:`~.CUcontext` + Context to synchronize + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + + See Also + -------- + :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuCtxFromGreenCtx`, :py:obj:`~.cudaDeviceSynchronize` + """ + cdef cydriver.CUcontext cyctx + if ctx is None: + pctx = 0 + elif isinstance(ctx, (CUcontext,)): + pctx = int(ctx) + else: + pctx = int(CUcontext(ctx)) + cyctx = pctx + err = cydriver.cuCtxSynchronize_v2(cyctx) + return (_dict_CUresult[err],) +{{endif}} + {{if 'cuCtxSetLimit' in found_functions}} @cython.embedsignature(True) @@ -30741,7 +30444,7 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream): return (_dict_CUresult[err],) {{endif}} -{{if 'cuMemcpyBatchAsync' in found_functions}} +{{if 'cuMemcpyBatchAsync_v2' in found_functions}} @cython.embedsignature(True) def cuMemcpyBatchAsync(dsts : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], srcs : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], sizes : Tuple[int] | List[int], size_t count, attrs : Optional[Tuple[CUmemcpyAttributes] | List[CUmemcpyAttributes]], attrsIdxs : Tuple[int] | List[int], size_t numAttrs, hStream): @@ -30816,10 +30519,6 @@ def cuMemcpyBatchAsync(dsts : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy. - If any error is encountered while parsing the batch, the index within - the batch where the error was encountered will be returned in - `failIdx`. - Parameters ---------- dsts : List[:py:obj:`~.CUdeviceptr`] @@ -30848,10 +30547,6 @@ def cuMemcpyBatchAsync(dsts : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], ------- CUresult :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - failIdx : int - Pointer to a location to return the index of the copy where a - failure was encountered. The value will be SIZE_MAX if the error - doesn't pertain to any specific copy. 
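# Illustrative sketch (editor's addition, not generated bindings code):
# synchronizing a specific context with the new cuCtxSynchronize_v2 wrapper
# documented above. None targets the current context, and synchronizing a
# primary context also waits on green contexts created from it.
from cuda.bindings import driver

def sync_context(ctx=None):
    (err,) = driver.cuCtxSynchronize_v2(ctx)
    return err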
""" cdef cydriver.CUstream cyhStream if hStream is None: @@ -30904,20 +30599,17 @@ def cuMemcpyBatchAsync(dsts : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], cdef vector[size_t] cyattrsIdxs = attrsIdxs if numAttrs > len(attrs): raise RuntimeError("List is too small: " + str(len(attrs)) + " < " + str(numAttrs)) if numAttrs > len(attrsIdxs): raise RuntimeError("List is too small: " + str(len(attrsIdxs)) + " < " + str(numAttrs)) - cdef size_t failIdx = 0 - err = cydriver.cuMemcpyBatchAsync((dsts[0])._pvt_ptr if len(dsts) == 1 else cydsts, (srcs[0])._pvt_ptr if len(srcs) == 1 else cysrcs, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, &failIdx, cyhStream) + err = cydriver.cuMemcpyBatchAsync((dsts[0])._pvt_ptr if len(dsts) == 1 else cydsts, (srcs[0])._pvt_ptr if len(srcs) == 1 else cysrcs, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cyhStream) if cydsts is not NULL: free(cydsts) if cysrcs is not NULL: free(cysrcs) if cyattrs is not NULL: free(cyattrs) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], failIdx) + return (_dict_CUresult[err],) {{endif}} -{{if 'cuMemcpy3DBatchAsync' in found_functions}} +{{if 'cuMemcpy3DBatchAsync_v2' in found_functions}} @cython.embedsignature(True) def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[CUDA_MEMCPY3D_BATCH_OP] | List[CUDA_MEMCPY3D_BATCH_OP]], unsigned long long flags, hStream): @@ -31003,10 +30695,6 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[CUDA_MEMCPY3D_BA work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy. - If any error is encountered while parsing the batch, the index within - the batch where the error was encountered will be returned in - `failIdx`. - Parameters ---------- numOps : size_t @@ -31023,10 +30711,6 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[CUDA_MEMCPY3D_BA ------- CUresult :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - failIdx : int - Pointer to a location to return the index of the copy where a - failure was encountered. The value will be SIZE_MAX if the error - doesn't pertain to any specific copy. 
""" cdef cydriver.CUstream cyhStream if hStream is None: @@ -31047,13 +30731,10 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[CUDA_MEMCPY3D_BA raise MemoryError('Failed to allocate length x size memory: ' + str(len(opList)) + 'x' + str(sizeof(cydriver.CUDA_MEMCPY3D_BATCH_OP))) for idx in range(len(opList)): string.memcpy(&cyopList[idx], (opList[idx])._pvt_ptr, sizeof(cydriver.CUDA_MEMCPY3D_BATCH_OP)) - cdef size_t failIdx = 0 - err = cydriver.cuMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, &failIdx, flags, cyhStream) + err = cydriver.cuMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, flags, cyhStream) if cyopList is not NULL: free(cyopList) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], failIdx) + return (_dict_CUresult[err],) {{endif}} {{if 'cuMemsetD8_v2' in found_functions}} @@ -32443,30 +32124,34 @@ def cuMipmappedArrayDestroy(hMipmappedArray): def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmemRangeHandleType, unsigned long long flags): """ Retrieve handle for an address range. - Get a handle of the specified type to an address range. The address - range must have been obtained by a prior call to either - :py:obj:`~.cuMemAlloc` or :py:obj:`~.cuMemAddressReserve`. If the - address range was obtained via :py:obj:`~.cuMemAddressReserve`, it must - also be fully mapped via :py:obj:`~.cuMemMap`. The address range must - have been obtained by a prior call to either :py:obj:`~.cuMemAllocHost` - or :py:obj:`~.cuMemHostAlloc` on Tegra. + Get a handle of the specified type to an address range. When requesting + CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, address + range obtained by a prior call to either :py:obj:`~.cuMemAlloc` or + :py:obj:`~.cuMemAddressReserve` is supported if the + :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED` device attribute + returns true. If the address range was obtained via + :py:obj:`~.cuMemAddressReserve`, it must also be fully mapped via + :py:obj:`~.cuMemMap`. Address range obtained by a prior call to either + :py:obj:`~.cuMemAllocHost` or :py:obj:`~.cuMemHostAlloc` is supported + if the :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED` + device attribute returns true. + + As of CUDA 13.0, querying support for address range obtained by calling + :py:obj:`~.cuMemAllocHost` or :py:obj:`~.cuMemHostAlloc` using the + :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED` device attribute is + deprecated. Users must ensure the `dptr` and `size` are aligned to the host page size. - When requesting - CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users are - expected to query for dma_buf support for the platform by using - :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED` device attribute - before calling this API. The `handle` will be interpreted as a pointer - to an integer to store the dma_buf file descriptor. Users must ensure - the entire address range is backed and mapped when the address range is - allocated by :py:obj:`~.cuMemAddressReserve`. All the physical - allocations backing the address range must be resident on the same - device and have identical allocation properties. Users are also - expected to retrieve a new handle every time the underlying physical - allocation(s) corresponding to a previously queried VA range are - changed. + The `handle` will be interpreted as a pointer to an integer to store + the dma_buf file descriptor. 
Users must ensure the entire address range
+    is backed and mapped when the address range is allocated by
+    :py:obj:`~.cuMemAddressReserve`. All the physical allocations backing
+    the address range must be resident on the same device and have
+    identical allocation properties. Users are also expected to retrieve a
+    new handle every time the underlying physical allocation(s)
+    corresponding to a previously queried VA range are changed.

     For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users
     may set flags to
@@ -32534,10 +32219,9 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
     :py:obj:`~.CUmemDecompressParams.dstActBytes`, must be capable of
     usage with the hardware decompress feature. That is, for each of said
     pointers, the pointer attribute
-    :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE` should give
-    a non-zero value. To ensure this, the memory backing the pointers
-    should have been allocated using one of the following CUDA memory
-    allocators:
+    :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE` should give a
+    non-zero value. To ensure this, the memory backing the pointers should
+    have been allocated using one of the following CUDA memory allocators:

     - :py:obj:`~.cuMemAlloc()`
@@ -32708,14 +32392,21 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     `size` of this allocation must be a multiple of the value given via
     :py:obj:`~.cuMemGetAllocationGranularity` with the
     :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. To create a CPU
-    allocation targeting a specific host NUMA node, applications must set
+    allocation that doesn't target any specific NUMA nodes, applications
+    must set :py:obj:`~.CUmemAllocationProp`::CUmemLocation::type to
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`.
+    :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id is ignored for HOST
+    allocations. HOST allocations are not IPC capable and
+    :py:obj:`~.CUmemAllocationProp.requestedHandleTypes` must be 0, any
+    other value will result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. To
+    create a CPU allocation targeting a specific host NUMA node,
+    applications must set
+    :py:obj:`~.CUmemAllocationProp`::CUmemLocation::type to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
     :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id must specify the
     NUMA ID of the CPU. On systems where NUMA is not available
     :py:obj:`~.CUmemAllocationProp`::CUmemLocation::id must be set to 0.
-    Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` or
-    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` as the
+    Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` as the
     :py:obj:`~.CUmemLocation.type` will result in
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.

@@ -33860,19 +33551,28 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     `poolProps` determines the properties of the pool such as the backing
     device and IPC capabilities.

-    To create a memory pool targeting a specific host NUMA node,
-    applications must set :py:obj:`~.CUmemPoolProps`::CUmemLocation::type
-    to :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
+    To create a memory pool for HOST memory not targeting a specific NUMA
+    node, applications must set
+    :py:obj:`~.CUmemPoolProps`::CUmemLocation::type to
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`.
+    :py:obj:`~.CUmemPoolProps`::CUmemLocation::id is ignored for such
+    pools.
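# Illustrative sketch (editor's addition, not generated bindings code) for the
# HOST-location rules spelled out above for cuMemCreate: the location id is
# ignored and requestedHandleTypes must stay 0 for HOST allocations.
from cuda.bindings import driver

prop = driver.CUmemAllocationProp()
prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
prop.location.id = 0  # ignored for HOST allocations
prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE  # must be 0
err, gran = driver.cuMemGetAllocationGranularity(
    prop, driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM)
err, handle = driver.cuMemCreate(gran, prop, 0)  # size must be a granularity multiple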
Pools created with the type
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` are not IPC capable and
+ :py:obj:`~.CUmemPoolProps.handleTypes` must be 0; any other value will
+ result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. To create a memory pool
+ targeting a specific host NUMA node, applications must set
+ :py:obj:`~.CUmemPoolProps`::CUmemLocation::type to
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
 :py:obj:`~.CUmemPoolProps`::CUmemLocation::id must specify the NUMA ID
 of the host memory node. Specifying
- :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` or
- :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` as the
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` as the
 :py:obj:`~.CUmemPoolProps`::CUmemLocation::type will result in
 :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. By default, the pool's memory
 will be accessible from the device it is allocated on. In the case of
- pools created with :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, their
- default accessibility will be from the host CPU. Applications can
- control the maximum size of the pool by specifying a non-zero value for
+ pools created with :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` or
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, their default accessibility will
+ be from the host CPU. Applications can control the maximum size of the
+ pool by specifying a non-zero value for
 :py:obj:`~.CUmemPoolProps.maxSize`. If set to 0, the maximum size of
 the pool will default to a system-dependent value.

@@ -33969,6 +33669,167 @@ def cuMemPoolDestroy(pool):
     return (_dict_CUresult[err],)
{{endif}}

+{{if 'cuMemGetDefaultMemPool' in found_functions}}
+
+@cython.embedsignature(True)
+def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None : CUmemAllocationType):
+ """ Returns the default memory pool for a given location and allocation type.
+
+ The memory location can be one of
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be
+ one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or
+ :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is
+ :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also
+ be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred
+ location for the managed memory pool. In all other cases, the call
+ returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
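+
+ A minimal usage sketch (assuming :py:obj:`~.cuInit` has succeeded and
+ a context is current on device 0; the device ordinal is illustrative):
+
+     loc = CUmemLocation()
+     loc.type = CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+     loc.id = 0  # device ordinal
+     err, pool = cuMemGetDefaultMemPool(
+         loc, CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED)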
+
+ Parameters
+ ----------
+ location : :py:obj:`~.CUmemLocation`
+ None
+ typename : :py:obj:`~.CUmemAllocationType`
+ None
+
+ Returns
+ -------
+ CUresult
+ :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
+ pool_out : :py:obj:`~.CUmemoryPool`
+ None
+
+ See Also
+ --------
+ :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, :py:obj:`~.cuMemPoolSetAccess`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`
+ """
+ cdef CUmemoryPool pool_out = CUmemoryPool()
+ cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL
+ cdef cydriver.CUmemAllocationType cytypename = typename.value
+ err = cydriver.cuMemGetDefaultMemPool(pool_out._pvt_ptr, cylocation_ptr, cytypename)
+ if err != cydriver.CUDA_SUCCESS:
+ return (_dict_CUresult[err], None)
+ return (_dict_CUresult[err], pool_out)
+{{endif}}
+
+{{if 'cuMemGetMemPool' in found_functions}}
+
+@cython.embedsignature(True)
+def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUmemAllocationType):
+ """ Gets the current memory pool for a memory location and allocation type.
+
+ The memory location can be one of
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be
+ one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or
+ :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is
+ :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also
+ be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred
+ location for the managed memory pool. In all other cases, the call
+ returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+
+ Returns the last pool provided to :py:obj:`~.cuMemSetMemPool` or
+ :py:obj:`~.cuDeviceSetMemPool` for this location and allocation type,
+ or the location's default memory pool if :py:obj:`~.cuMemSetMemPool` or
+ :py:obj:`~.cuDeviceSetMemPool` has never been called for that
+ allocation type and location. By default, a memory location's current
+ memory pool is its default memory pool, which can be obtained via
+ :py:obj:`~.cuMemGetDefaultMemPool`.
+
+ Parameters
+ ----------
+ location : :py:obj:`~.CUmemLocation`
+ None
+ typename : :py:obj:`~.CUmemAllocationType`
+ None
+
+ Returns
+ -------
+ CUresult
+ :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
+ pool : :py:obj:`~.CUmemoryPool`
+ None
+
+ See Also
+ --------
+ :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cuMemSetMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuMemPoolDestroy`, :py:obj:`~.cuMemAllocFromPoolAsync`
+
+ Notes
+ -----
+ Use :py:obj:`~.cuMemAllocFromPoolAsync` to specify asynchronous allocations from a device different than the one the stream runs on.
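+
+ A minimal sketch of reading back the current pool for a host location
+ (assuming driver initialization as above; `id` is ignored for
+ :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`):
+
+     loc = CUmemLocation()
+     loc.type = CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+     loc.id = 0
+     err, pool = cuMemGetMemPool(
+         loc, CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED)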
+ """ + cdef CUmemoryPool pool = CUmemoryPool() + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemAllocationType cytypename = typename.value + err = cydriver.cuMemGetMemPool(pool._pvt_ptr, cylocation_ptr, cytypename) + if err != cydriver.CUDA_SUCCESS: + return (_dict_CUresult[err], None) + return (_dict_CUresult[err], pool) +{{endif}} + +{{if 'cuMemSetMemPool' in found_functions}} + +@cython.embedsignature(True) +def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUmemAllocationType, pool): + """ Gets the current memory pool for a memory location and of a particular allocation type. + + The memory location can be of one of + :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, + :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` or + :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`. The allocation type can be + one of :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED` or + :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`. When the allocation type is + :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also + be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred + location for the managed memory pool. In all other cases, the call + returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + + Returns the last pool provided to :py:obj:`~.cuMemSetMemPool` or + :py:obj:`~.cuDeviceSetMemPool` for this location and allocation type or + the location's default memory pool if :py:obj:`~.cuMemSetMemPool` or + :py:obj:`~.cuDeviceSetMemPool` for that allocType and location has + never been called. By default the current mempool of a location is the + default mempool for a device. Otherwise the returned pool must have + been set with :py:obj:`~.cuDeviceSetMemPool`. + + Parameters + ---------- + location : :py:obj:`~.CUmemLocation` + None + typename : :py:obj:`~.CUmemAllocationType` + None + pool : :py:obj:`~.CUmemoryPool` or :py:obj:`~.cudaMemPool_t` + None + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + + See Also + -------- + :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` + """ + cdef cydriver.CUmemoryPool cypool + if pool is None: + ppool = 0 + elif isinstance(pool, (CUmemoryPool,)): + ppool = int(pool) + else: + ppool = int(CUmemoryPool(pool)) + cypool = ppool + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemAllocationType cytypename = typename.value + err = cydriver.cuMemSetMemPool(cylocation_ptr, cytypename, cypool) + return (_dict_CUresult[err],) +{{endif}} + {{if 'cuMemAllocFromPoolAsync' in found_functions}} @cython.embedsignature(True) @@ -34802,126 +34663,10 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr): return (_dict_CUresult[err], cydata.pyObj()) {{endif}} -{{if 'cuMemPrefetchAsync' in found_functions}} - -@cython.embedsignature(True) -def cuMemPrefetchAsync(devPtr, size_t count, dstDevice, hStream): - """ Prefetches memory to the specified destination device. - - Note there is a later version of this API, - :py:obj:`~.cuMemPrefetchAsync_v2`. It will supplant this version in - 13.0, which is retained for minor version compatibility. - - Prefetches memory to the specified destination device. `devPtr` is the - base device pointer of the memory to be prefetched and `dstDevice` is - the destination device. `count` specifies the number of bytes to copy. 
- `hStream` is the stream in which the operation is enqueued. The memory - range must refer to managed memory allocated via - :py:obj:`~.cuMemAllocManaged` or declared via managed variables or it - may also refer to system-allocated memory on systems with non-zero - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. - - Passing in CU_DEVICE_CPU for `dstDevice` will prefetch the data to host - memory. If `dstDevice` is a GPU, then the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` must be non- - zero. Additionally, `hStream` must be associated with a device that has - a non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. - - The start address and end address of the memory range will be rounded - down and rounded up respectively to be aligned to CPU page size before - the prefetch operation is enqueued in the stream. - - If no physical memory has been allocated for this region, then this - memory region will be populated and mapped on the destination device. - If there's insufficient memory to prefetch the desired region, the - Unified Memory driver may evict pages from other - :py:obj:`~.cuMemAllocManaged` allocations to host memory in order to - make room. Device memory allocated using :py:obj:`~.cuMemAlloc` or - :py:obj:`~.cuArrayCreate` will not be evicted. - - By default, any mappings to the previous location of the migrated pages - are removed and mappings for the new location are only setup on - `dstDevice`. The exact behavior however also depends on the settings - applied to this memory range via :py:obj:`~.cuMemAdvise` as described - below: - - If :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY` was set on any subset of - this memory range, then that subset will create a read-only copy of the - pages on `dstDevice`. - - If :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` was called on any - subset of this memory range, then the pages will be migrated to - `dstDevice` even if `dstDevice` is not the preferred location of any - pages in the memory range. - - If :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY` was called on any subset - of this memory range, then mappings to those pages from all the - appropriate processors are updated to refer to the new location if - establishing such a mapping is possible. Otherwise, those mappings are - cleared. - - Note that this API is not required for functionality and only serves to - improve performance by allowing the application to migrate data to a - suitable location before it is accessed. Memory accesses to this range - are always coherent and are allowed even when the data is actively - being migrated. - - Note that this function is asynchronous with respect to the host and - all work on other devices. 
- - Parameters - ---------- - devPtr : :py:obj:`~.CUdeviceptr` - Pointer to be prefetched - count : size_t - Size in bytes - dstDevice : :py:obj:`~.CUdevice` - Destination device to prefetch to - hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - Stream to enqueue prefetch operation - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` - - See Also - -------- - :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cuMemPrefetchAsync` :py:obj:`~.cudaMemPrefetchAsync_v2` - """ - cdef cydriver.CUstream cyhStream - if hStream is None: - phStream = 0 - elif isinstance(hStream, (CUstream,)): - phStream = int(hStream) - else: - phStream = int(CUstream(hStream)) - cyhStream = phStream - cdef cydriver.CUdevice cydstDevice - if dstDevice is None: - pdstDevice = 0 - elif isinstance(dstDevice, (CUdevice,)): - pdstDevice = int(dstDevice) - else: - pdstDevice = int(CUdevice(dstDevice)) - cydstDevice = pdstDevice - cdef cydriver.CUdeviceptr cydevPtr - if devPtr is None: - pdevPtr = 0 - elif isinstance(devPtr, (CUdeviceptr,)): - pdevPtr = int(devPtr) - else: - pdevPtr = int(CUdeviceptr(devPtr)) - cydevPtr = pdevPtr - err = cydriver.cuMemPrefetchAsync(cydevPtr, count, cydstDevice, cyhStream) - return (_dict_CUresult[err],) -{{endif}} - {{if 'cuMemPrefetchAsync_v2' in found_functions}} @cython.embedsignature(True) -def cuMemPrefetchAsync_v2(devPtr, size_t count, location not None : CUmemLocation, unsigned int flags, hStream): +def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, unsigned int flags, hStream): """ Prefetches memory to the specified destination location. Prefetches memory to the specified destination location. `devPtr` is @@ -35017,7 +34762,7 @@ def cuMemPrefetchAsync_v2(devPtr, size_t count, location not None : CUmemLocatio See Also -------- - :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cudaMemPrefetchAsync_v2` + :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cudaMemPrefetchAsync` """ cdef cydriver.CUstream cyhStream if hStream is None: @@ -35035,20 +34780,16 @@ def cuMemPrefetchAsync_v2(devPtr, size_t count, location not None : CUmemLocatio else: pdevPtr = int(CUdeviceptr(devPtr)) cydevPtr = pdevPtr - err = cydriver.cuMemPrefetchAsync_v2(cydevPtr, count, location._pvt_ptr[0], flags, cyhStream) + err = cydriver.cuMemPrefetchAsync(cydevPtr, count, location._pvt_ptr[0], flags, cyhStream) return (_dict_CUresult[err],) {{endif}} -{{if 'cuMemAdvise' in found_functions}} +{{if 'cuMemAdvise_v2' in found_functions}} @cython.embedsignature(True) -def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, device): +def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location not None : CUmemLocation): """ Advise about the usage of a given memory range. - Note there is a later version of this API, :py:obj:`~.cuMemAdvise_v2`. - It will supplant this version in 13.0, which is retained for minor - version compatibility. - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at `devPtr` with a size of `count` bytes. 
The start address and end address of the memory range will be rounded down @@ -35069,198 +34810,17 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, device): only copy of at least the accessed pages in that processor's memory. Additionally, if :py:obj:`~.cuMemPrefetchAsync` is called on this region, it will create a read-only copy of the data on the - destination processor. If any processor writes to this region, all - copies of the corresponding page will be invalidated except for the - one where the write occurred. The `device` argument is ignored for - this advice. Note that for a page to be read-duplicated, the - accessing processor must either be the CPU or a GPU that has a non- - zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Also, if a - context is created on a device that does not have the device - attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` - set, then read-duplication will not occur until all such contexts are - destroyed. If the memory region refers to valid system-allocated - pageable memory, then the accessing device must have a non-zero value - for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS` for a read- - only copy to be created on that device. Note however that if the - accessing device also has a non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`, - then setting this advice will not create a read-only copy when that - device accesses this memory region. - - - :py:obj:`~.CU_MEM_ADVISE_UNSET_READ_MOSTLY`: Undoes the effect of - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY` and also prevents the - Unified Memory driver from attempting heuristic read-duplication on - the memory range. Any read-duplicated copies of the data will be - collapsed into a single copy. The location for the collapsed copy - will be the preferred location if the page has a preferred location - and one of the read-duplicated copies was resident at that location. - Otherwise, the location chosen is arbitrary. - - - :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION`: This advice sets - the preferred location for the data to be the memory belonging to - `device`. Passing in CU_DEVICE_CPU for `device` sets the preferred - location as host memory. If `device` is a GPU, then it must have a - non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Setting - the preferred location does not cause data to migrate to that - location immediately. Instead, it guides the migration policy when a - fault occurs on that memory region. If the data is already in its - preferred location and the faulting processor can establish a mapping - without requiring the data to be migrated, then data migration will - be avoided. On the other hand, if the data is not in its preferred - location or if a direct mapping cannot be established, then it will - be migrated to the processor accessing it. It is important to note - that setting the preferred location does not prevent data prefetching - done using :py:obj:`~.cuMemPrefetchAsync`. Having a preferred - location can override the page thrash detection and resolution logic - in the Unified Memory driver. Normally, if a page is detected to be - constantly thrashing between for example host and device memory, the - page may eventually be pinned to host memory by the Unified Memory - driver. 
But if the preferred location is set as device memory, then - the page will continue to thrash indefinitely. If - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY` is also set on this memory - region or any subset of it, then the policies associated with that - advice will override the policies of this advice, unless read - accesses from `device` will not result in a read-only copy being - created on that device as outlined in description for the advice - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`. If the memory region - refers to valid system-allocated pageable memory, then `device` must - have a non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. - - - :py:obj:`~.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION`: Undoes the effect - of :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` and changes the - preferred location to none. - - - :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY`: This advice implies that - the data will be accessed by `device`. Passing in - :py:obj:`~.CU_DEVICE_CPU` for `device` will set the advice for the - CPU. If `device` is a GPU, then the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` must be - non-zero. This advice does not cause data migration and has no impact - on the location of the data per se. Instead, it causes the data to - always be mapped in the specified processor's page tables, as long as - the location of the data permits a mapping to be established. If the - data gets migrated for any reason, the mappings are updated - accordingly. This advice is recommended in scenarios where data - locality is not important, but avoiding faults is. Consider for - example a system containing multiple GPUs with peer-to-peer access - enabled, where the data located on one GPU is occasionally accessed - by peer GPUs. In such scenarios, migrating data over to the other - GPUs is not as important because the accesses are infrequent and the - overhead of migration may be too high. But preventing faults can - still help improve performance, and so having a mapping set up in - advance is useful. Note that on CPU access of this data, the data may - be migrated to host memory because the CPU typically cannot access - device memory directly. Any GPU that had the - :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY` flag set for this data will - now have its mapping updated to point to the page in host memory. If - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY` is also set on this memory - region or any subset of it, then the policies associated with that - advice will override the policies of this advice. Additionally, if - the preferred location of this memory region or any subset of it is - also `device`, then the policies associated with - :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` will override the - policies of this advice. If the memory region refers to valid system- - allocated pageable memory, then `device` must have a non-zero value - for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Additionally, - if `device` has a non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`, - then this call has no effect. - - - :py:obj:`~.CU_MEM_ADVISE_UNSET_ACCESSED_BY`: Undoes the effect of - :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY`. Any mappings to the data - from `device` may be removed at any time causing accesses to result - in non-fatal page faults. 
If the memory region refers to valid - system-allocated pageable memory, then `device` must have a non-zero - value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Additionally, - if `device` has a non-zero value for the device attribute - :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`, - then this call has no effect. - - Parameters - ---------- - devPtr : :py:obj:`~.CUdeviceptr` - Pointer to memory to set the advice for - count : size_t - Size in bytes of the memory range - advice : :py:obj:`~.CUmem_advise` - Advice to be applied for the specified memory range - device : :py:obj:`~.CUdevice` - Device to apply the advice for - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` - - See Also - -------- - :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cuMemAdvise_v2`, :py:obj:`~.cudaMemAdvise` - """ - cdef cydriver.CUdevice cydevice - if device is None: - pdevice = 0 - elif isinstance(device, (CUdevice,)): - pdevice = int(device) - else: - pdevice = int(CUdevice(device)) - cydevice = pdevice - cdef cydriver.CUdeviceptr cydevPtr - if devPtr is None: - pdevPtr = 0 - elif isinstance(devPtr, (CUdeviceptr,)): - pdevPtr = int(devPtr) - else: - pdevPtr = int(CUdeviceptr(devPtr)) - cydevPtr = pdevPtr - cdef cydriver.CUmem_advise cyadvice = advice.value - err = cydriver.cuMemAdvise(cydevPtr, count, cyadvice, cydevice) - return (_dict_CUresult[err],) -{{endif}} - -{{if 'cuMemAdvise_v2' in found_functions}} - -@cython.embedsignature(True) -def cuMemAdvise_v2(devPtr, size_t count, advice not None : CUmem_advise, location not None : CUmemLocation): - """ Advise about the usage of a given memory range. - - Advise the Unified Memory subsystem about the usage pattern for the - memory range starting at `devPtr` with a size of `count` bytes. The - start address and end address of the memory range will be rounded down - and rounded up respectively to be aligned to CPU page size before the - advice is applied. The memory range must refer to managed memory - allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed - variables. The memory range could also refer to system-allocated - pageable memory provided it represents a valid, host-accessible region - of memory and all additional constraints imposed by `advice` as - outlined below are also satisfied. Specifying an invalid system- - allocated pageable memory range results in an error being returned. - - The `advice` parameter can take the following values: - - - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`: This implies that the data - is mostly going to be read from and only occasionally written to. Any - read accesses from any processor to this region will create a read- - only copy of at least the accessed pages in that processor's memory. - Additionally, if :py:obj:`~.cuMemPrefetchAsync` or - :py:obj:`~.cuMemPrefetchAsync_v2` is called on this region, it will - create a read-only copy of the data on the destination processor. If - the target location for :py:obj:`~.cuMemPrefetchAsync_v2` is a host - NUMA node and a read-only copy already exists on another host NUMA - node, that copy will be migrated to the targeted host NUMA node. If - any processor writes to this region, all copies of the corresponding - page will be invalidated except for the one where the write occurred. 
- If the writing processor is the CPU and the preferred location of the - page is a host NUMA node, then the page will also be migrated to that - host NUMA node. The `location` argument is ignored for this advice. - Note that for a page to be read-duplicated, the accessing processor - must either be the CPU or a GPU that has a non-zero value for the - device attribute + destination processor. If the target location for + :py:obj:`~.cuMemPrefetchAsync` is a host NUMA node and a read-only + copy already exists on another host NUMA node, that copy will be + migrated to the targeted host NUMA node. If any processor writes to + this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor + is the CPU and the preferred location of the page is a host NUMA + node, then the page will also be migrated to that host NUMA node. The + `location` argument is ignored for this advice. Note that for a page + to be read-duplicated, the accessing processor must either be the CPU + or a GPU that has a non-zero value for the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Also, if a context is created on a device that does not have the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` @@ -35413,7 +34973,7 @@ def cuMemAdvise_v2(devPtr, size_t count, advice not None : CUmem_advise, locatio See Also -------- - :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cudaMemAdvise` + :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cudaMemAdvise` """ cdef cydriver.CUdeviceptr cydevPtr if devPtr is None: @@ -35424,7 +34984,327 @@ def cuMemAdvise_v2(devPtr, size_t count, advice not None : CUmem_advise, locatio pdevPtr = int(CUdeviceptr(devPtr)) cydevPtr = pdevPtr cdef cydriver.CUmem_advise cyadvice = advice.value - err = cydriver.cuMemAdvise_v2(cydevPtr, count, cyadvice, location._pvt_ptr[0]) + err = cydriver.cuMemAdvise(cydevPtr, count, cyadvice, location._pvt_ptr[0]) + return (_dict_CUresult[err],) +{{endif}} + +{{if 'cuMemPrefetchBatchAsync' in found_functions}} + +@cython.embedsignature(True) +def cuMemPrefetchBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], sizes : Tuple[int] | List[int], size_t count, prefetchLocs : Optional[Tuple[CUmemLocation] | List[CUmemLocation]], prefetchLocIdxs : Tuple[int] | List[int], size_t numPrefetchLocs, unsigned long long flags, hStream): + """ Performs a batch of memory prefetches asynchronously. + + Performs a batch of memory prefetches. The batch as a whole executes in + stream order but operations within a batch are not guaranteed to + execute in any specific order. All devices in the system must have a + non-zero value for the device attribute + :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` otherwise the + API will return an error. + + The semantics of the individual prefetch operations are as described in + :py:obj:`~.cuMemPrefetchAsync`. + + Performs memory prefetch on address ranges specified in `dptrs` and + `sizes`. Both arrays must be of the same length as specified by + `count`. 
Each memory range specified must refer to managed memory
+ allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
+ variables or it may also refer to system-allocated memory when all
+ devices have a non-zero value for
+ :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. The prefetch
+ location for every operation in the batch is specified in the
+ `prefetchLocs` array. Each entry in this array can apply to more than
+ one operation. This is done by specifying, in the `prefetchLocIdxs`
+ array, the index of the first prefetch operation that the corresponding
+ entry in the `prefetchLocs` array applies to. Both `prefetchLocs` and
+ `prefetchLocIdxs` must be of the same length as specified by
+ `numPrefetchLocs`. For example, if a batch has 10 prefetches listed in
+ dptrs/sizes, the first 4 of which are to be prefetched to one location
+ and the remaining 6 are to be prefetched to another, then
+ `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be {0, 4} and
+ `prefetchLocs` will contain the two locations. Note that the first
+ entry in `prefetchLocIdxs` must always be 0. Also, each entry must be
+ greater than the previous entry and the last entry should be less than
+ `count`. Furthermore, `numPrefetchLocs` must be less than or equal to
+ `count`. A sketch of this layout follows the parameter list below.
+
+ Parameters
+ ----------
+ dptrs : List[:py:obj:`~.CUdeviceptr`]
+ Array of pointers to be prefetched
+ sizes : List[int]
+ Array of sizes for memory prefetch operations.
+ count : size_t
+ Size of `dptrs` and `sizes` arrays.
+ prefetchLocs : List[:py:obj:`~.CUmemLocation`]
+ Array of locations to prefetch to.
+ prefetchLocIdxs : List[int]
+ Array of indices to specify which operations each entry in the
+ `prefetchLocs` array applies to. The location specified in
+ prefetchLocs[k] will be applied to prefetches starting from
+ prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
+ prefetchLocs[numPrefetchLocs - 1] will apply to prefetches starting
+ from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
+ numPrefetchLocs : size_t
+ Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+ flags : unsigned long long
+ Flags reserved for future use. Must be zero.
+ hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+ The stream to enqueue the operations in. Must not be the legacy
+ NULL stream.
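+
+ A minimal sketch of the index layout described above (assuming `a`,
+ `b` and `c` are managed allocations from :py:obj:`~.cuMemAllocManaged`
+ with sizes `sizeA`, `sizeB` and `sizeC`, and `stream` is a non-default
+ :py:obj:`~.CUstream`; all of these names are illustrative):
+
+     devLoc = CUmemLocation()
+     devLoc.type = CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+     devLoc.id = 0  # prefetch target: device 0
+     hostLoc = CUmemLocation()
+     hostLoc.type = CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+     hostLoc.id = 0  # ignored for CU_MEM_LOCATION_TYPE_HOST
+     # Ops 0-1 go to device 0, op 2 goes to the host, so
+     # prefetchLocIdxs = [0, 2] and numPrefetchLocs = 2.
+     err, = cuMemPrefetchBatchAsync([a, b, c], [sizeA, sizeB, sizeC], 3,
+                                    [devLoc, hostLoc], [0, 2], 2,
+                                    0, stream)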
+ + Returns + ------- + CUresult + + """ + cdef cydriver.CUstream cyhStream + if hStream is None: + phStream = 0 + elif isinstance(hStream, (CUstream,)): + phStream = int(hStream) + else: + phStream = int(CUstream(hStream)) + cyhStream = phStream + if not all(isinstance(_x, (int)) for _x in prefetchLocIdxs): + raise TypeError("Argument 'prefetchLocIdxs' is not instance of type (expected Tuple[int] or List[int]") + prefetchLocs = [] if prefetchLocs is None else prefetchLocs + if not all(isinstance(_x, (CUmemLocation,)) for _x in prefetchLocs): + raise TypeError("Argument 'prefetchLocs' is not instance of type (expected Tuple[cydriver.CUmemLocation,] or List[cydriver.CUmemLocation,]") + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): + raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") + cdef cydriver.CUdeviceptr* cydptrs = NULL + if len(dptrs) > 0: + cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) + if cydptrs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) + else: + for idx in range(len(dptrs)): + cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + cdef cydriver.CUmemLocation* cyprefetchLocs = NULL + if len(prefetchLocs) > 0: + cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cydriver.CUmemLocation)) + if cyprefetchLocs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cydriver.CUmemLocation))) + for idx in range(len(prefetchLocs)): + string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cydriver.CUmemLocation)) + cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs + if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) + if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) + err = cydriver.cuMemPrefetchBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) + if cydptrs is not NULL: + free(cydptrs) + if cyprefetchLocs is not NULL: + free(cyprefetchLocs) + return (_dict_CUresult[err],) +{{endif}} + +{{if 'cuMemDiscardBatchAsync' in found_functions}} + +@cython.embedsignature(True) +def cuMemDiscardBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], sizes : Tuple[int] | List[int], size_t count, unsigned long long flags, hStream): + """ Performs a batch of memory discards asynchronously. + + Performs a batch of memory discards. The batch as a whole executes in + stream order but operations within a batch are not guaranteed to + execute in any specific order. All devices in the system must have a + non-zero value for the device attribute + :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` otherwise the + API will return an error. 
+ + Discarding a memory range informs the driver that the contents of that + range are no longer useful. Discarding memory ranges allows the driver + to optimize certain data migrations and can also help reduce memory + pressure. This operation can be undone on any part of the range by + either writing to it or prefetching it via + :py:obj:`~.cuMemPrefetchAsync` or :py:obj:`~.cuMemPrefetchBatchAsync`. + Reading from a discarded range, without a subsequent write or prefetch + to that part of the range, will return an indeterminate value. Note + that any reads, writes or prefetches to any part of the memory range + that occur simultaneously with the discard operation result in + undefined behavior. + + Performs memory discard on address ranges specified in `dptrs` and + `sizes`. Both arrays must be of the same length as specified by + `count`. Each memory range specified must refer to managed memory + allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed + variables or it may also refer to system-allocated memory when all + devices have a non-zero value for + :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. + + Parameters + ---------- + dptrs : List[:py:obj:`~.CUdeviceptr`] + Array of pointers to be discarded + sizes : List[int] + Array of sizes for memory discard operations. + count : size_t + Size of `dptrs` and `sizes` arrays. + flags : unsigned long long + Flags reserved for future use. Must be zero. + hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` + The stream to enqueue the operations in. Must not be legacy NULL + stream. + + Returns + ------- + CUresult + + """ + cdef cydriver.CUstream cyhStream + if hStream is None: + phStream = 0 + elif isinstance(hStream, (CUstream,)): + phStream = int(hStream) + else: + phStream = int(CUstream(hStream)) + cyhStream = phStream + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): + raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") + cdef cydriver.CUdeviceptr* cydptrs = NULL + if len(dptrs) > 0: + cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) + if cydptrs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) + else: + for idx in range(len(dptrs)): + cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + err = cydriver.cuMemDiscardBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, flags, cyhStream) + if cydptrs is not NULL: + free(cydptrs) + return (_dict_CUresult[err],) +{{endif}} + +{{if 'cuMemDiscardAndPrefetchBatchAsync' in found_functions}} + +@cython.embedsignature(True) +def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]], sizes : Tuple[int] | List[int], size_t count, prefetchLocs : Optional[Tuple[CUmemLocation] | List[CUmemLocation]], prefetchLocIdxs : Tuple[int] | List[int], size_t numPrefetchLocs, unsigned long long flags, hStream): + """ Performs a batch of memory discards and prefetches asynchronously. 
+
+ Performs a batch of memory discards followed by prefetches. The batch
+ as a whole executes in stream order but operations within a batch are
+ not guaranteed to execute in any specific order. All devices in the
+ system must have a non-zero value for the device attribute
+ :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` otherwise the
+ API will return an error.
+
+ Calling :py:obj:`~.cuMemDiscardAndPrefetchBatchAsync` is semantically
+ equivalent to calling :py:obj:`~.cuMemDiscardBatchAsync` followed by
+ :py:obj:`~.cuMemPrefetchBatchAsync`, but is more optimal. For more
+ details on what discarding and prefetching imply, please refer to
+ :py:obj:`~.cuMemDiscardBatchAsync` and
+ :py:obj:`~.cuMemPrefetchBatchAsync` respectively. Note that any reads,
+ writes or prefetches to any part of the memory range that occur
+ simultaneously with this combined discard+prefetch operation result in
+ undefined behavior.
+
+ Performs memory discard and prefetch on address ranges specified in
+ `dptrs` and `sizes`. Both arrays must be of the same length as
+ specified by `count`. Each memory range specified must refer to managed
+ memory allocated via :py:obj:`~.cuMemAllocManaged` or declared via
+ managed variables or it may also refer to system-allocated memory when
+ all devices have a non-zero value for
+ :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Every operation
+ in the batch must be associated with a valid location to prefetch the
+ address range to, specified in the `prefetchLocs` array. Each entry in
+ this array can apply to more than one operation. This is done by
+ specifying, in the `prefetchLocIdxs` array, the index of the first
+ operation that the corresponding entry in the `prefetchLocs` array
+ applies to. Both `prefetchLocs` and `prefetchLocIdxs` must be of the
+ same length as specified by `numPrefetchLocs`. For example, if a batch
+ has 10 operations listed in dptrs/sizes, the first 6 of which are to be
+ prefetched to one location and the remaining 4 are to be prefetched to
+ another, then `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be
+ {0, 6} and `prefetchLocs` will contain the two locations. Note that the
+ first entry in `prefetchLocIdxs` must always be 0. Also, each entry
+ must be greater than the previous entry and the last entry should be
+ less than `count`. Furthermore, `numPrefetchLocs` must be less than or
+ equal to `count`. A sketch of this layout follows the parameter list
+ below.
+
+ Parameters
+ ----------
+ dptrs : List[:py:obj:`~.CUdeviceptr`]
+ Array of pointers to be discarded
+ sizes : List[int]
+ Array of sizes for memory discard operations.
+ count : size_t
+ Size of `dptrs` and `sizes` arrays.
+ prefetchLocs : List[:py:obj:`~.CUmemLocation`]
+ Array of locations to prefetch to.
+ prefetchLocIdxs : List[int]
+ Array of indices to specify which operations each entry in the
+ `prefetchLocs` array applies to. The location specified in
+ prefetchLocs[k] will be applied to operations starting from
+ prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
+ prefetchLocs[numPrefetchLocs - 1] will apply to operations starting
+ from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
+ numPrefetchLocs : size_t
+ Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+ flags : unsigned long long
+ Flags reserved for future use. Must be zero.
+ hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+ The stream to enqueue the operations in. Must not be the legacy
+ NULL stream.
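+
+ A minimal sketch, reusing the hypothetical managed ranges `a` and `b`
+ and the non-default `stream` from the
+ :py:obj:`~.cuMemPrefetchBatchAsync` example (both ranges are discarded
+ and then prefetched back to device 0):
+
+     devLoc = CUmemLocation()
+     devLoc.type = CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+     devLoc.id = 0
+     # A single location entry covering operations 0..1.
+     err, = cuMemDiscardAndPrefetchBatchAsync([a, b], [sizeA, sizeB], 2,
+                                              [devLoc], [0], 1,
+                                              0, stream)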
+ + Returns + ------- + CUresult + + """ + cdef cydriver.CUstream cyhStream + if hStream is None: + phStream = 0 + elif isinstance(hStream, (CUstream,)): + phStream = int(hStream) + else: + phStream = int(CUstream(hStream)) + cyhStream = phStream + if not all(isinstance(_x, (int)) for _x in prefetchLocIdxs): + raise TypeError("Argument 'prefetchLocIdxs' is not instance of type (expected Tuple[int] or List[int]") + prefetchLocs = [] if prefetchLocs is None else prefetchLocs + if not all(isinstance(_x, (CUmemLocation,)) for _x in prefetchLocs): + raise TypeError("Argument 'prefetchLocs' is not instance of type (expected Tuple[cydriver.CUmemLocation,] or List[cydriver.CUmemLocation,]") + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): + raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") + cdef cydriver.CUdeviceptr* cydptrs = NULL + if len(dptrs) > 0: + cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) + if cydptrs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) + else: + for idx in range(len(dptrs)): + cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + cdef cydriver.CUmemLocation* cyprefetchLocs = NULL + if len(prefetchLocs) > 0: + cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cydriver.CUmemLocation)) + if cyprefetchLocs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cydriver.CUmemLocation))) + for idx in range(len(prefetchLocs)): + string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cydriver.CUmemLocation)) + cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs + if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) + if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) + err = cydriver.cuMemDiscardAndPrefetchBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) + if cydptrs is not NULL: + free(cydptrs) + if cyprefetchLocs is not NULL: + free(cyprefetchLocs) return (_dict_CUresult[err],) {{endif}} @@ -36108,11 +35988,10 @@ def cuStreamGetCtx(hStream): Returns the CUDA context that the stream is associated with. - Note there is a later version of this API, - :py:obj:`~.cuStreamGetCtx_v2`. It will supplant this version in CUDA - 13.0. It is recommended to use :py:obj:`~.cuStreamGetCtx_v2` till then - as this version will return :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` for - streams created via the API :py:obj:`~.cuGreenCtxStreamCreate`. 
+ If the stream was created via the API + :py:obj:`~.cuGreenCtxStreamCreate`, the returned context is equivalent + to the one returned by :py:obj:`~.cuCtxFromGreenCtx()` on the green + context associated with the stream at creation time. The stream handle `hStream` can refer to any of the following: @@ -36148,7 +36027,7 @@ def cuStreamGetCtx(hStream): See Also -------- - :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice` :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cuStreamGetCtx_v2`, :py:obj:`~.cudaStreamCreateWithFlags` + :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice` :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags` """ cdef cydriver.CUstream cyhStream if hStream is None: @@ -36757,7 +36636,7 @@ def cuStreamIsCapturing(hStream): return (_dict_CUresult[err], CUstreamCaptureStatus(captureStatus)) {{endif}} -{{if 'cuStreamGetCaptureInfo_v2' in found_functions}} +{{if 'cuStreamGetCaptureInfo_v3' in found_functions}} @cython.embedsignature(True) def cuStreamGetCaptureInfo(hStream): @@ -36774,89 +36653,6 @@ def cuStreamGetCaptureInfo(hStream): - the call returns CUDA_SUCCESS - - the returned capture status is - :py:obj:`~.CU_STREAM_CAPTURE_STATUS_ACTIVE` - - Parameters - ---------- - hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - The stream to query - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT` - captureStatus_out : :py:obj:`~.CUstreamCaptureStatus` - Location to return the capture status of the stream; required - id_out : :py:obj:`~.cuuint64_t` - Optional location to return an id for the capture sequence, which - is unique over the lifetime of the process - graph_out : :py:obj:`~.CUgraph` - Optional location to return the graph being captured into. All - operations other than destroy and node removal are permitted on the - graph while the capture sequence is in progress. This API does not - transfer ownership of the graph, which is transferred or destroyed - at :py:obj:`~.cuStreamEndCapture`. Note that the graph handle may - be invalidated before end of capture for certain errors. Nodes that - are or become unreachable from the original stream at - :py:obj:`~.cuStreamEndCapture` due to direct actions on the graph - do not trigger :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNJOINED`. - dependencies_out : List[:py:obj:`~.CUgraphNode`] - Optional location to store a pointer to an array of nodes. The next - node to be captured in the stream will depend on this set of nodes, - absent operations such as event wait which modify this set. The - array pointer is valid until the next API call which operates on - the stream or until the capture is terminated. The node handles may - be copied out and are valid until they or the graph is destroyed. - The driver-owned array may also be passed directly to APIs that - operate on the graph (not the stream) without copying. - numDependencies_out : int - Optional location to store the size of the array returned in - dependencies_out. 
- - See Also - -------- - :py:obj:`~.cuStreamGetCaptureInfo_v3` :py:obj:`~.cuStreamBeginCapture`, :py:obj:`~.cuStreamIsCapturing`, :py:obj:`~.cuStreamUpdateCaptureDependencies` - """ - cdef cydriver.CUstream cyhStream - if hStream is None: - phStream = 0 - elif isinstance(hStream, (CUstream,)): - phStream = int(hStream) - else: - phStream = int(CUstream(hStream)) - cyhStream = phStream - cdef cydriver.CUstreamCaptureStatus captureStatus_out - cdef cuuint64_t id_out = cuuint64_t() - cdef CUgraph graph_out = CUgraph() - cdef const cydriver.CUgraphNode* cydependencies_out = NULL - pydependencies_out = [] - cdef size_t numDependencies_out = 0 - err = cydriver.cuStreamGetCaptureInfo(cyhStream, &captureStatus_out, id_out._pvt_ptr, graph_out._pvt_ptr, &cydependencies_out, &numDependencies_out) - if CUresult(err) == CUresult(0): - pydependencies_out = [CUgraphNode(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None, None, None, None, None) - return (_dict_CUresult[err], CUstreamCaptureStatus(captureStatus_out), id_out, graph_out, pydependencies_out, numDependencies_out) -{{endif}} - -{{if 'cuStreamGetCaptureInfo_v3' in found_functions}} - -@cython.embedsignature(True) -def cuStreamGetCaptureInfo_v3(hStream): - """ Query a stream's capture state (12.3+) - - Query stream state related to stream capture. - - If called on :py:obj:`~.CU_STREAM_LEGACY` (the "null stream") while a - stream not created with :py:obj:`~.CU_STREAM_NON_BLOCKING` is - capturing, returns :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT`. - - Valid data (other than capture status) is returned only if both of the - following are true: - - - the call returns CUDA_SUCCESS - - the returned capture status is :py:obj:`~.CU_STREAM_CAPTURE_STATUS_ACTIVE` @@ -36911,7 +36707,7 @@ def cuStreamGetCaptureInfo_v3(hStream): See Also -------- - :py:obj:`~.cuStreamGetCaptureInfo`, :py:obj:`~.cuStreamBeginCapture`, :py:obj:`~.cuStreamIsCapturing`, :py:obj:`~.cuStreamUpdateCaptureDependencies` + :py:obj:`~.cuStreamBeginCapture`, :py:obj:`~.cuStreamIsCapturing`, :py:obj:`~.cuStreamUpdateCaptureDependencies` """ cdef cydriver.CUstream cyhStream if hStream is None: @@ -36929,7 +36725,7 @@ def cuStreamGetCaptureInfo_v3(hStream): cdef const cydriver.CUgraphEdgeData* cyedgeData_out = NULL pyedgeData_out = [] cdef size_t numDependencies_out = 0 - err = cydriver.cuStreamGetCaptureInfo_v3(cyhStream, &captureStatus_out, id_out._pvt_ptr, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) + err = cydriver.cuStreamGetCaptureInfo(cyhStream, &captureStatus_out, id_out._pvt_ptr, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) if CUresult(err) == CUresult(0): pydependencies_out = [CUgraphNode(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] if CUresult(err) == CUresult(0): @@ -36939,84 +36735,11 @@ def cuStreamGetCaptureInfo_v3(hStream): return (_dict_CUresult[err], CUstreamCaptureStatus(captureStatus_out), id_out, graph_out, pydependencies_out, pyedgeData_out, numDependencies_out) {{endif}} -{{if 'cuStreamUpdateCaptureDependencies' in found_functions}} - -@cython.embedsignature(True) -def cuStreamUpdateCaptureDependencies(hStream, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], size_t numDependencies, unsigned int flags): - """ Update the set of dependencies in a capturing stream (11.3+) - - Modifies the dependency set of a capturing stream. 
The dependency set - is the set of nodes that the next captured node in the stream will - depend on. - - Valid flags are :py:obj:`~.CU_STREAM_ADD_CAPTURE_DEPENDENCIES` and - :py:obj:`~.CU_STREAM_SET_CAPTURE_DEPENDENCIES`. These control whether - the set passed to the API is added to the existing set or replaces it. - A flags value of 0 defaults to - :py:obj:`~.CU_STREAM_ADD_CAPTURE_DEPENDENCIES`. - - Nodes that are removed from the dependency set via this API do not - result in :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNJOINED` if they are - unreachable from the stream at :py:obj:`~.cuStreamEndCapture`. - - Returns :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the stream is not - capturing. - - This API is new in CUDA 11.3. Developers requiring compatibility across - minor versions to CUDA 11.0 should not use this API or provide a - fallback. - - Parameters - ---------- - hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - The stream to update - dependencies : List[:py:obj:`~.CUgraphNode`] - The set of dependencies to add - numDependencies : size_t - The size of the dependencies array - flags : unsigned int - See above - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` - - See Also - -------- - :py:obj:`~.cuStreamBeginCapture`, :py:obj:`~.cuStreamGetCaptureInfo`, - """ - dependencies = [] if dependencies is None else dependencies - if not all(isinstance(_x, (CUgraphNode,)) for _x in dependencies): - raise TypeError("Argument 'dependencies' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - cdef cydriver.CUstream cyhStream - if hStream is None: - phStream = 0 - elif isinstance(hStream, (CUstream,)): - phStream = int(hStream) - else: - phStream = int(CUstream(hStream)) - cyhStream = phStream - cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: - cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) - if cydependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(dependencies)): - cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] - if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuStreamUpdateCaptureDependencies(cyhStream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, flags) - if cydependencies is not NULL: - free(cydependencies) - return (_dict_CUresult[err],) -{{endif}} - {{if 'cuStreamUpdateCaptureDependencies_v2' in found_functions}} @cython.embedsignature(True) -def cuStreamUpdateCaptureDependencies_v2(hStream, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], dependencyData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies, unsigned int flags): - """ Update the set of dependencies in a capturing stream (12.3+) +def cuStreamUpdateCaptureDependencies(hStream, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], dependencyData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies, unsigned int flags): + """ Update the set of dependencies in a capturing stream. Modifies the dependency set of a capturing stream. 
The dependency set is the set of nodes that the next captured node in the stream will @@ -37086,7 +36809,7 @@ def cuStreamUpdateCaptureDependencies_v2(hStream, dependencies : Optional[Tuple[ raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuStreamUpdateCaptureDependencies_v2(cyhStream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) + err = cydriver.cuStreamUpdateCaptureDependencies(cyhStream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) if cydependencies is not NULL: free(cydependencies) if cydependencyData is not NULL: @@ -37760,82 +37483,10 @@ def cuEventDestroy(hEvent): return (_dict_CUresult[err],) {{endif}} -{{if 'cuEventElapsedTime' in found_functions}} - -@cython.embedsignature(True) -def cuEventElapsedTime(hStart, hEnd): - """ Computes the elapsed time between two events. - - Computes the elapsed time between two events (in milliseconds with a - resolution of around 0.5 microseconds). - - If either event was last recorded in a non-NULL stream, the resulting - time may be greater than expected (even if both used the same stream - handle). This happens because the :py:obj:`~.cuEventRecord()` operation - takes place asynchronously and there is no guarantee that the measured - latency is actually just between the two events. Any number of other - different stream operations could execute in between the two measured - events, thus altering the timing in a significant way. - - If :py:obj:`~.cuEventRecord()` has not been called on either event then - :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. If - :py:obj:`~.cuEventRecord()` has been called on both events but one or - both of them has not yet been completed (that is, - :py:obj:`~.cuEventQuery()` would return - :py:obj:`~.CUDA_ERROR_NOT_READY` on at least one of the events), - :py:obj:`~.CUDA_ERROR_NOT_READY` is returned. If either event was - created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag, then this - function will return :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`. - - Note there is a later version of this API, - :py:obj:`~.cuEventElapsedTime_v2`. It will supplant this version in - CUDA 13.0, which is retained for minor version compatibility. 
- - Parameters - ---------- - hStart : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Starting event - hEnd : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Ending event - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_READY`, :py:obj:`~.CUDA_ERROR_UNKNOWN` - pMilliseconds : float - Time between `hStart` and `hEnd` in ms - - See Also - -------- - :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cudaEventElapsedTime` - """ - cdef cydriver.CUevent cyhEnd - if hEnd is None: - phEnd = 0 - elif isinstance(hEnd, (CUevent,)): - phEnd = int(hEnd) - else: - phEnd = int(CUevent(hEnd)) - cyhEnd = phEnd - cdef cydriver.CUevent cyhStart - if hStart is None: - phStart = 0 - elif isinstance(hStart, (CUevent,)): - phStart = int(hStart) - else: - phStart = int(CUevent(hStart)) - cyhStart = phStart - cdef float pMilliseconds = 0 - err = cydriver.cuEventElapsedTime(&pMilliseconds, cyhStart, cyhEnd) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], pMilliseconds) -{{endif}} - {{if 'cuEventElapsedTime_v2' in found_functions}} @cython.embedsignature(True) -def cuEventElapsedTime_v2(hStart, hEnd): +def cuEventElapsedTime(hStart, hEnd): """ Computes the elapsed time between two events. Computes the elapsed time between two events (in milliseconds with a @@ -37898,7 +37549,7 @@ def cuEventElapsedTime_v2(hStart, hEnd): phStart = int(CUevent(hStart)) cyhStart = phStart cdef float pMilliseconds = 0 - err = cydriver.cuEventElapsedTime_v2(&pMilliseconds, cyhStart, cyhEnd) + err = cydriver.cuEventElapsedTime(&pMilliseconds, cyhStart, cyhEnd) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pMilliseconds) @@ -38018,6 +37669,16 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC` for memory synchronization. + If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is + :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD`, then + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`::handle::fd must be a + valid file descriptor referencing a dma_buf object and + :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.flags` must be zero. + Importing a dma_buf object is supported only on Tegra Jetson platform + starting with Thor series. Mapping an imported dma_buf object as CUDA + mipmapped array using + :py:obj:`~.cuExternalMemoryGetMappedMipmappedArray` is not supported. + The size of the memory object must be specified in :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.size`. @@ -38163,6 +37824,9 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.numLevels` must be equal to 1. + Mapping `extMem` imported from a handle of type + :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD`, is not supported. + The returned CUDA mipmapped array must be freed using :py:obj:`~.cuMipmappedArrayDestroy`. 
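With the elapsed-time query now resolving to the former `_v2` entry point, the call shape documented above is unchanged: both events must be recorded and complete, and the result comes back as a `(CUresult, float)` pair. A minimal timing sketch under those assumptions (an initialized context is assumed, and the work enqueued between the two records is illustrative):

    from cuda.bindings import driver

    err, start = driver.cuEventCreate(0)
    err, stop = driver.cuEventCreate(0)
    err, = driver.cuEventRecord(start, 0)        # record on the default stream
    # ... enqueue the work to be timed ...
    err, = driver.cuEventRecord(stop, 0)
    err, = driver.cuEventSynchronize(stop)       # both events must have completed
    err, ms = driver.cuEventElapsedTime(start, stop)  # elapsed time in milliseconds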
@@ -38454,7 +38118,21 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[Tuple[CUexternalSemap handle such situations, either by not using the same semaphore object with deterministic fence support enabled in different streams or by adding explicit dependency amongst such streams so that the semaphore - is signaled in order. + is signaled in order. An NvSciSyncFence associated with a semaphore object + of the type :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` can + be timestamp enabled. For this, the NvSciSyncAttrList used to create the + object must have the value of the NvSciSyncAttrKey_WaiterRequireTimestamps + key set to true. Timestamps are emitted asynchronously by the GPU, and + CUDA saves the GPU timestamp in the corresponding NvSciSyncFence at the + time of signal on the GPU. Users are expected to convert GPU clocks to CPU + clocks using appropriate scaling functions. Users are expected to wait + for the completion of the fence before extracting the timestamp using the + appropriate NvSciSync APIs. Users are expected to ensure that there is + only one outstanding timestamp-enabled fence per CUDA-NvSciSync object + at any point in time; failing to do so leads to undefined behavior. + Extracting the timestamp before the corresponding fence is signaled + could also lead to undefined behavior. Timestamps extracted via the appropriate + NvSciSync API are in microseconds. If the semaphore object is any one of the following types: :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`, @@ -38993,7 +38671,7 @@ def cuStreamBatchMemOp(stream, unsigned int count, paramArray : Optional[Tuple[C Notes ----- - Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA. CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. This ensures that the scheduler does not serialize them in an improper order. For more information, see the Stream Memory Operations section in the programming guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA. CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. This ensures that the scheduler does not serialize them in an improper order. """ paramArray = [] if paramArray is None else paramArray if not all(isinstance(_x, (CUstreamBatchMemOpParams,)) for _x in paramArray): raise TypeError("Argument 'paramArray' is not instance of type (expected Tuple[cydriver.CUstreamBatchMemOpParams,] or List[cydriver.CUstreamBatchMemOpParams,]") @@ -42384,7 +42062,7 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] Notes ----- - Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA. CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. This ensures that the scheduler does not serialize them in an improper order. For more information, see the Stream Memory Operations section in the programming guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + Warning: Improper use of this API may deadlock the application. Synchronization ordering established through this API is not visible to CUDA.
CUDA tasks that are (even indirectly) ordered by this API should also have that order expressed with CUDA-visible dependencies such as events. This ensures that the scheduler does not serialize them in an improper order. """ dependencies = [] if dependencies is None else dependencies if not all(isinstance(_x, (CUgraphNode,)) for _x in dependencies): @@ -43256,84 +42934,12 @@ def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0): return (_dict_CUresult[err], pyrootNodes, numRootNodes) {{endif}} -{{if 'cuGraphGetEdges' in found_functions}} +{{if 'cuGraphGetEdges_v2' in found_functions}} @cython.embedsignature(True) def cuGraphGetEdges(hGraph, size_t numEdges = 0): """ Returns a graph's dependency edges. - Returns a list of `hGraph's` dependency edges. Edges are returned via - corresponding indices in `from` and `to`; that is, the node in `to`[i] - has a dependency on the node in `from`[i]. `from` and `to` may both be - NULL, in which case this function only returns the number of edges in - `numEdges`. Otherwise, `numEdges` entries will be filled in. If - `numEdges` is higher than the actual number of edges, the remaining - entries in `from` and `to` will be set to NULL, and the number of edges - actually returned will be written to `numEdges`. - - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to get the edges from - numEdges : int - See description - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - from : List[:py:obj:`~.CUgraphNode`] - Location to return edge endpoints - to : List[:py:obj:`~.CUgraphNode`] - Location to return edge endpoints - numEdges : int - See description - - See Also - -------- - :py:obj:`~.cuGraphGetNodes`, :py:obj:`~.cuGraphGetRootNodes`, :py:obj:`~.cuGraphAddDependencies`, :py:obj:`~.cuGraphRemoveDependencies`, :py:obj:`~.cuGraphNodeGetDependencies`, :py:obj:`~.cuGraphNodeGetDependentNodes` - """ - cdef size_t _graph_length = numEdges - cdef cydriver.CUgraph cyhGraph - if hGraph is None: - phGraph = 0 - elif isinstance(hGraph, (CUgraph,)): - phGraph = int(hGraph) - else: - phGraph = int(CUgraph(hGraph)) - cyhGraph = phGraph - cdef cydriver.CUgraphNode* cyfrom_ = NULL - pyfrom_ = [] - if _graph_length != 0: - cyfrom_ = calloc(_graph_length, sizeof(cydriver.CUgraphNode)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - cdef cydriver.CUgraphNode* cyto = NULL - pyto = [] - if _graph_length != 0: - cyto = calloc(_graph_length, sizeof(cydriver.CUgraphNode)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - err = cydriver.cuGraphGetEdges(cyhGraph, cyfrom_, cyto, &numEdges) - if CUresult(err) == CUresult(0): - pyfrom_ = [CUgraphNode(init_value=cyfrom_[idx]) for idx in range(_graph_length)] - if cyfrom_ is not NULL: - free(cyfrom_) - if CUresult(err) == CUresult(0): - pyto = [CUgraphNode(init_value=cyto[idx]) for idx in range(_graph_length)] - if cyto is not NULL: - free(cyto) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None, None, None) - return (_dict_CUresult[err], pyfrom_, pyto, numEdges) -{{endif}} - -{{if 'cuGraphGetEdges_v2' in found_functions}} - -@cython.embedsignature(True) -def cuGraphGetEdges_v2(hGraph, size_t numEdges = 0): - """ Returns a graph's dependency edges (12.3+) 
- Returns a list of `hGraph's` dependency edges. Edges are returned via corresponding indices in `from`, `to` and `edgeData`; that is, the node in `to`[i] has a dependency on the node in `from`[i] with data @@ -43399,7 +43005,7 @@ def cuGraphGetEdges_v2(hGraph, size_t numEdges = 0): cyedgeData = calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphGetEdges_v2(cyhGraph, cyfrom_, cyto, cyedgeData, &numEdges) + err = cydriver.cuGraphGetEdges(cyhGraph, cyfrom_, cyto, cyedgeData, &numEdges) if CUresult(err) == CUresult(0): pyfrom_ = [CUgraphNode(init_value=cyfrom_[idx]) for idx in range(_graph_length)] if cyfrom_ is not NULL: @@ -43417,7 +43023,7 @@ def cuGraphGetEdges_v2(hGraph, size_t numEdges = 0): return (_dict_CUresult[err], pyfrom_, pyto, pyedgeData, numEdges) {{endif}} -{{if 'cuGraphNodeGetDependencies' in found_functions}} +{{if 'cuGraphNodeGetDependencies_v2' in found_functions}} @cython.embedsignature(True) def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0): @@ -43431,65 +43037,6 @@ def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0): NULL, and the number of nodes actually obtained will be returned in `numDependencies`. - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to query - numDependencies : int - See description - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - dependencies : List[:py:obj:`~.CUgraphNode`] - Pointer to return the dependencies - numDependencies : int - See description - - See Also - -------- - :py:obj:`~.cuGraphNodeGetDependentNodes`, :py:obj:`~.cuGraphGetNodes`, :py:obj:`~.cuGraphGetRootNodes`, :py:obj:`~.cuGraphGetEdges`, :py:obj:`~.cuGraphAddDependencies`, :py:obj:`~.cuGraphRemoveDependencies` - """ - cdef size_t _graph_length = numDependencies - cdef cydriver.CUgraphNode cyhNode - if hNode is None: - phNode = 0 - elif isinstance(hNode, (CUgraphNode,)): - phNode = int(hNode) - else: - phNode = int(CUgraphNode(hNode)) - cyhNode = phNode - cdef cydriver.CUgraphNode* cydependencies = NULL - pydependencies = [] - if _graph_length != 0: - cydependencies = calloc(_graph_length, sizeof(cydriver.CUgraphNode)) - if cydependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - err = cydriver.cuGraphNodeGetDependencies(cyhNode, cydependencies, &numDependencies) - if CUresult(err) == CUresult(0): - pydependencies = [CUgraphNode(init_value=cydependencies[idx]) for idx in range(_graph_length)] - if cydependencies is not NULL: - free(cydependencies) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None, None) - return (_dict_CUresult[err], pydependencies, numDependencies) -{{endif}} - -{{if 'cuGraphNodeGetDependencies_v2' in found_functions}} - -@cython.embedsignature(True) -def cuGraphNodeGetDependencies_v2(hNode, size_t numDependencies = 0): - """ Returns a node's dependencies (12.3+) - - Returns a list of `node's` dependencies. `dependencies` may be NULL, in - which case this function will return the number of dependencies in - `numDependencies`. Otherwise, `numDependencies` entries will be filled - in. 
If `numDependencies` is higher than the actual number of - dependencies, the remaining entries in `dependencies` will be set to - NULL, and the number of nodes actually obtained will be returned in - `numDependencies`. - Note that if an edge has non-zero (non-default) edge data and `edgeData` is NULL, this API will return :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If `edgeData` is non-NULL, then @@ -43538,7 +43085,7 @@ def cuGraphNodeGetDependencies_v2(hNode, size_t numDependencies = 0): cyedgeData = calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphNodeGetDependencies_v2(cyhNode, cydependencies, cyedgeData, &numDependencies) + err = cydriver.cuGraphNodeGetDependencies(cyhNode, cydependencies, cyedgeData, &numDependencies) if CUresult(err) == CUresult(0): pydependencies = [CUgraphNode(init_value=cydependencies[idx]) for idx in range(_graph_length)] if cydependencies is not NULL: @@ -43552,7 +43099,7 @@ def cuGraphNodeGetDependencies_v2(hNode, size_t numDependencies = 0): return (_dict_CUresult[err], pydependencies, pyedgeData, numDependencies) {{endif}} -{{if 'cuGraphNodeGetDependentNodes' in found_functions}} +{{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} @cython.embedsignature(True) def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0): @@ -43566,65 +43113,6 @@ def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0): will be set to NULL, and the number of nodes actually obtained will be returned in `numDependentNodes`. - Parameters - ---------- - hNode : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to query - numDependentNodes : int - See description - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - dependentNodes : List[:py:obj:`~.CUgraphNode`] - Pointer to return the dependent nodes - numDependentNodes : int - See description - - See Also - -------- - :py:obj:`~.cuGraphNodeGetDependencies`, :py:obj:`~.cuGraphGetNodes`, :py:obj:`~.cuGraphGetRootNodes`, :py:obj:`~.cuGraphGetEdges`, :py:obj:`~.cuGraphAddDependencies`, :py:obj:`~.cuGraphRemoveDependencies` - """ - cdef size_t _graph_length = numDependentNodes - cdef cydriver.CUgraphNode cyhNode - if hNode is None: - phNode = 0 - elif isinstance(hNode, (CUgraphNode,)): - phNode = int(hNode) - else: - phNode = int(CUgraphNode(hNode)) - cyhNode = phNode - cdef cydriver.CUgraphNode* cydependentNodes = NULL - pydependentNodes = [] - if _graph_length != 0: - cydependentNodes = calloc(_graph_length, sizeof(cydriver.CUgraphNode)) - if cydependentNodes is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - err = cydriver.cuGraphNodeGetDependentNodes(cyhNode, cydependentNodes, &numDependentNodes) - if CUresult(err) == CUresult(0): - pydependentNodes = [CUgraphNode(init_value=cydependentNodes[idx]) for idx in range(_graph_length)] - if cydependentNodes is not NULL: - free(cydependentNodes) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None, None) - return (_dict_CUresult[err], pydependentNodes, numDependentNodes) -{{endif}} - -{{if 'cuGraphNodeGetDependentNodes_v2' in found_functions}} - -@cython.embedsignature(True) -def cuGraphNodeGetDependentNodes_v2(hNode, size_t numDependentNodes = 0): 
- """ Returns a node's dependent nodes (12.3+) - - Returns a list of `node's` dependent nodes. `dependentNodes` may be - NULL, in which case this function will return the number of dependent - nodes in `numDependentNodes`. Otherwise, `numDependentNodes` entries - will be filled in. If `numDependentNodes` is higher than the actual - number of dependent nodes, the remaining entries in `dependentNodes` - will be set to NULL, and the number of nodes actually obtained will be - returned in `numDependentNodes`. - Note that if an edge has non-zero (non-default) edge data and `edgeData` is NULL, this API will return :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If `edgeData` is non-NULL, then @@ -43673,7 +43161,7 @@ def cuGraphNodeGetDependentNodes_v2(hNode, size_t numDependentNodes = 0): cyedgeData = calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphNodeGetDependentNodes_v2(cyhNode, cydependentNodes, cyedgeData, &numDependentNodes) + err = cydriver.cuGraphNodeGetDependentNodes(cyhNode, cydependentNodes, cyedgeData, &numDependentNodes) if CUresult(err) == CUresult(0): pydependentNodes = [CUgraphNode(init_value=cydependentNodes[idx]) for idx in range(_graph_length)] if cydependentNodes is not NULL: @@ -43687,82 +43175,11 @@ def cuGraphNodeGetDependentNodes_v2(hNode, size_t numDependentNodes = 0): return (_dict_CUresult[err], pydependentNodes, pyedgeData, numDependentNodes) {{endif}} -{{if 'cuGraphAddDependencies' in found_functions}} - -@cython.embedsignature(True) -def cuGraphAddDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], size_t numDependencies): - """ Adds dependency edges to a graph. - - The number of dependencies to be added is defined by `numDependencies` - Elements in `from` and `to` at corresponding indices define a - dependency. Each node in `from` and `to` must belong to `hGraph`. - - If `numDependencies` is 0, elements in `from` and `to` will be ignored. - Specifying an existing dependency will return an error. 
- - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which dependencies are added - from : List[:py:obj:`~.CUgraphNode`] - Array of nodes that provide the dependencies - to : List[:py:obj:`~.CUgraphNode`] - Array of dependent nodes - numDependencies : size_t - Number of dependencies to be added - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - - See Also - -------- - :py:obj:`~.cuGraphRemoveDependencies`, :py:obj:`~.cuGraphGetEdges`, :py:obj:`~.cuGraphNodeGetDependencies`, :py:obj:`~.cuGraphNodeGetDependentNodes` - """ - to = [] if to is None else to - if not all(isinstance(_x, (CUgraphNode,)) for _x in to): - raise TypeError("Argument 'to' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - from_ = [] if from_ is None else from_ - if not all(isinstance(_x, (CUgraphNode,)) for _x in from_): - raise TypeError("Argument 'from_' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - cdef cydriver.CUgraph cyhGraph - if hGraph is None: - phGraph = 0 - elif isinstance(hGraph, (CUgraph,)): - phGraph = int(hGraph) - else: - phGraph = int(CUgraph(hGraph)) - cyhGraph = phGraph - cdef cydriver.CUgraphNode* cyfrom_ = NULL - if len(from_) > 0: - cyfrom_ = calloc(len(from_), sizeof(cydriver.CUgraphNode)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(from_)): - cyfrom_[idx] = (from_[idx])._pvt_ptr[0] - cdef cydriver.CUgraphNode* cyto = NULL - if len(to) > 0: - cyto = calloc(len(to), sizeof(cydriver.CUgraphNode)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(to)): - cyto[idx] = (to[idx])._pvt_ptr[0] - err = cydriver.cuGraphAddDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, numDependencies) - if cyfrom_ is not NULL: - free(cyfrom_) - if cyto is not NULL: - free(cyto) - return (_dict_CUresult[err],) -{{endif}} - {{if 'cuGraphAddDependencies_v2' in found_functions}} @cython.embedsignature(True) -def cuGraphAddDependencies_v2(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], edgeData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies): - """ Adds dependency edges to a graph (12.3+) +def cuGraphAddDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], edgeData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies): + """ Adds dependency edges to a graph. 
The number of dependencies to be added is defined by `numDependencies` Elements in `from` and `to` at corresponding indices define a @@ -43834,7 +43251,7 @@ def cuGraphAddDependencies_v2(hGraph, from_ : Optional[Tuple[CUgraphNode] | List raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuGraphAddDependencies_v2(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) + err = cydriver.cuGraphAddDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) if cyfrom_ is not NULL: free(cyfrom_) if cyto is not NULL: @@ -43844,86 +43261,11 @@ def cuGraphAddDependencies_v2(hGraph, from_ : Optional[Tuple[CUgraphNode] | List return (_dict_CUresult[err],) {{endif}} -{{if 'cuGraphRemoveDependencies' in found_functions}} - -@cython.embedsignature(True) -def cuGraphRemoveDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], size_t numDependencies): - """ Removes dependency edges from a graph. - - The number of `dependencies` to be removed is defined by - `numDependencies`. Elements in `from` and `to` at corresponding indices - define a dependency. Each node in `from` and `to` must belong to - `hGraph`. - - If `numDependencies` is 0, elements in `from` and `to` will be ignored. - Specifying a non-existing dependency will return an error. - - Dependencies cannot be removed from graphs which contain allocation or - free nodes. Any attempt to do so will return an error. 
- - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph from which to remove dependencies - from : List[:py:obj:`~.CUgraphNode`] - Array of nodes that provide the dependencies - to : List[:py:obj:`~.CUgraphNode`] - Array of dependent nodes - numDependencies : size_t - Number of dependencies to be removed - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - - See Also - -------- - :py:obj:`~.cuGraphAddDependencies`, :py:obj:`~.cuGraphGetEdges`, :py:obj:`~.cuGraphNodeGetDependencies`, :py:obj:`~.cuGraphNodeGetDependentNodes` - """ - to = [] if to is None else to - if not all(isinstance(_x, (CUgraphNode,)) for _x in to): - raise TypeError("Argument 'to' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - from_ = [] if from_ is None else from_ - if not all(isinstance(_x, (CUgraphNode,)) for _x in from_): - raise TypeError("Argument 'from_' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - cdef cydriver.CUgraph cyhGraph - if hGraph is None: - phGraph = 0 - elif isinstance(hGraph, (CUgraph,)): - phGraph = int(hGraph) - else: - phGraph = int(CUgraph(hGraph)) - cyhGraph = phGraph - cdef cydriver.CUgraphNode* cyfrom_ = NULL - if len(from_) > 0: - cyfrom_ = calloc(len(from_), sizeof(cydriver.CUgraphNode)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(from_)): - cyfrom_[idx] = (from_[idx])._pvt_ptr[0] - cdef cydriver.CUgraphNode* cyto = NULL - if len(to) > 0: - cyto = calloc(len(to), sizeof(cydriver.CUgraphNode)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(to)): - cyto[idx] = (to[idx])._pvt_ptr[0] - err = cydriver.cuGraphRemoveDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, numDependencies) - if cyfrom_ is not NULL: - free(cyfrom_) - if cyto is not NULL: - free(cyto) - return (_dict_CUresult[err],) -{{endif}} - {{if 'cuGraphRemoveDependencies_v2' in found_functions}} @cython.embedsignature(True) -def cuGraphRemoveDependencies_v2(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], edgeData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies): - """ Removes dependency edges from a graph (12.3+) +def cuGraphRemoveDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], to : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], edgeData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies): + """ Removes dependency edges from a graph. The number of `dependencies` to be removed is defined by `numDependencies`. 
Elements in `from` and `to` at corresponding indices @@ -44001,7 +43343,7 @@ def cuGraphRemoveDependencies_v2(hGraph, from_ : Optional[Tuple[CUgraphNode] | L raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuGraphRemoveDependencies_v2(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) + err = cydriver.cuGraphRemoveDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) if cyfrom_ is not NULL: free(cyfrom_) if cyto is not NULL: @@ -45836,88 +45178,11 @@ def cuGraphReleaseUserObject(graph, object, unsigned int count): return (_dict_CUresult[err],) {{endif}} -{{if 'cuGraphAddNode' in found_functions}} - -@cython.embedsignature(True) -def cuGraphAddNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUgraphNodeParams]): - """ Adds a node of arbitrary type to a graph. - - Creates a new node in `hGraph` described by `nodeParams` with - `numDependencies` dependencies specified via `dependencies`. - `numDependencies` may be 0. `dependencies` may be null if - `numDependencies` is 0. `dependencies` may not have any duplicate - entries. - - `nodeParams` is a tagged union. The node type should be specified in - the `typename` field, and type-specific parameters in the corresponding - union member. All unused bytes - that is, `reserved0` and all bytes - past the utilized union member - must be set to zero. It is recommended - to use brace initialization or memset to ensure all bytes are - initialized. - - Note that for some node types, `nodeParams` may contain "out - parameters" which are modified during the call, such as - `nodeParams->alloc.dptr`. - - A handle to the new node will be returned in `phGraphNode`. 
- - Parameters - ---------- - hGraph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - dependencies : List[:py:obj:`~.CUgraphNode`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - nodeParams : :py:obj:`~.CUgraphNodeParams` - Specification of the node - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` - phGraphNode : :py:obj:`~.CUgraphNode` - Returns newly created node - - See Also - -------- - :py:obj:`~.cuGraphCreate`, :py:obj:`~.cuGraphNodeSetParams`, :py:obj:`~.cuGraphExecNodeSetParams` - """ - dependencies = [] if dependencies is None else dependencies - if not all(isinstance(_x, (CUgraphNode,)) for _x in dependencies): - raise TypeError("Argument 'dependencies' is not instance of type (expected Tuple[cydriver.CUgraphNode,] or List[cydriver.CUgraphNode,]") - cdef cydriver.CUgraph cyhGraph - if hGraph is None: - phGraph = 0 - elif isinstance(hGraph, (CUgraph,)): - phGraph = int(hGraph) - else: - phGraph = int(CUgraph(hGraph)) - cyhGraph = phGraph - cdef CUgraphNode phGraphNode = CUgraphNode() - cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: - cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) - if cydependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) - else: - for idx in range(len(dependencies)): - cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] - if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: - free(cydependencies) - if err != cydriver.CUDA_SUCCESS: - return (_dict_CUresult[err], None) - return (_dict_CUresult[err], phGraphNode) -{{endif}} - {{if 'cuGraphAddNode_v2' in found_functions}} @cython.embedsignature(True) -def cuGraphAddNode_v2(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], dependencyData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies, nodeParams : Optional[CUgraphNodeParams]): - """ Adds a node of arbitrary type to a graph (12.3+) +def cuGraphAddNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUgraphNode]], dependencyData : Optional[Tuple[CUgraphEdgeData] | List[CUgraphEdgeData]], size_t numDependencies, nodeParams : Optional[CUgraphNodeParams]): + """ Adds a node of arbitrary type to a graph. Creates a new node in `hGraph` described by `nodeParams` with `numDependencies` dependencies specified via `dependencies`. 
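Since the `_v2` graph-dependency variants now own the unsuffixed names, each of these calls carries the extra edge-data argument; passing `None` there is accepted by the binding and leaves the edges with default edge data. A small sketch of the add-then-inspect round trip, using the two-step size query described for `cuGraphGetEdges` above (the node handles `a` and `b` are assumed to come from earlier `cuGraphAdd*` calls):

    from cuda.bindings import driver

    err, graph = driver.cuGraphCreate(0)
    # ... create nodes a and b in graph ...
    err, = driver.cuGraphAddDependencies(graph, [a], [b], None, 1)   # edge a -> b
    err, _, _, _, n = driver.cuGraphGetEdges(graph, 0)               # count only
    err, frm, to, edata, n = driver.cuGraphGetEdges(graph, n)        # fetch edges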
@@ -45993,10 +45258,8 @@ def cuGraphAddNode_v2(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[ raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - if numDependencies > len(dependencyData): raise RuntimeError("List is too small: " + str(len(dependencyData)) + " < " + str(numDependencies)) cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddNode_v2(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) + err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) if cydependencies is not NULL: free(cydependencies) if cydependencyData is not NULL: @@ -49681,12 +48944,17 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice, - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED` P2P: 1 if P2P Access is enabled. - - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED`: 1 if - Atomic operations over the link are supported. + - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED`: 1 if all + CUDA-valid atomic operations over the link are supported. - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED`: 1 if cudaArray can be accessed over the link. + - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED`: + 1 if some CUDA-valid atomic operations over the link are supported. + Information about specific operations can be retrieved with + :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`. + Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `srcDevice` or `dstDevice` are not valid or if they represent the same device. @@ -49712,7 +48980,7 @@ See Also -------- - :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceGetP2PAttribute` + :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cudaDeviceGetP2PAttribute` """ cdef cydriver.CUdevice cydstDevice if dstDevice is None: pdstDevice = 0 elif isinstance(dstDevice, (CUdevice,)): pdstDevice = int(dstDevice) else: pdstDevice = int(CUdevice(dstDevice)) cydstDevice = pdstDevice @@ -49738,6 +49006,88 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice, return (_dict_CUresult[err], value) {{endif}} +{{if 'cuDeviceGetP2PAtomicCapabilities' in found_functions}} + +@cython.embedsignature(True) +def cuDeviceGetP2PAtomicCapabilities(operations : Optional[Tuple[CUatomicOperation] | List[CUatomicOperation]], unsigned int count, srcDevice, dstDevice): + """ Queries details about atomic operations supported between two devices. + + Returns in `*capabilities` the details about requested atomic + `*operations` over the link between `srcDevice` and `dstDevice`. + The allocated size of `*operations` and `*capabilities` must be + `count`.
+ + For each :py:obj:`~.CUatomicOperation` in `*operations`, the + corresponding result in `*capabilities` will be a bitmask indicating + which of :py:obj:`~.CUatomicOperationCapability` the link supports + natively. + + Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `srcDevice` or + `dstDevice` are not valid or if they represent the same device. + + Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `*capabilities` or + `*operations` is NULL, if `count` is 0, or if any of `*operations` is + not valid. + + Parameters + ---------- + operations : List[:py:obj:`~.CUatomicOperation`] + Requested operations + count : unsigned int + Count of requested operations and size of capabilities + srcDevice : :py:obj:`~.CUdevice` + The source device of the target link + dstDevice : :py:obj:`~.CUdevice` + The destination device of the target link + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + capabilities : List[unsigned int] + Returned capability details of each requested operation + + See Also + -------- + :py:obj:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities` + """ + cdef cydriver.CUdevice cydstDevice + if dstDevice is None: + pdstDevice = 0 + elif isinstance(dstDevice, (CUdevice,)): + pdstDevice = int(dstDevice) + else: + pdstDevice = int(CUdevice(dstDevice)) + cydstDevice = pdstDevice + cdef cydriver.CUdevice cysrcDevice + if srcDevice is None: + psrcDevice = 0 + elif isinstance(srcDevice, (CUdevice,)): + psrcDevice = int(srcDevice) + else: + psrcDevice = int(CUdevice(srcDevice)) + cysrcDevice = psrcDevice + operations = [] if operations is None else operations + if not all(isinstance(_x, (CUatomicOperation)) for _x in operations): + raise TypeError("Argument 'operations' is not instance of type (expected Tuple[cydriver.CUatomicOperation] or List[cydriver.CUatomicOperation]") + cdef unsigned int* cycapabilities = NULL + pycapabilities = [] + if count != 0: + cycapabilities = calloc(count, sizeof(unsigned int)) + if cycapabilities is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) + cdef vector[cydriver.CUatomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] + if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) + err = cydriver.cuDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, cysrcDevice, cydstDevice) + if CUresult(err) == CUresult(0): + pycapabilities = [cycapabilities[idx] for idx in range(count)] + if cycapabilities is not NULL: + free(cycapabilities) + if err != cydriver.CUDA_SUCCESS: + return (_dict_CUresult[err], None) + return (_dict_CUresult[err], pycapabilities) +{{endif}} + {{if 'cuGraphicsUnregisterResource' in found_functions}} @cython.embedsignature(True) @@ -50190,7 +49540,7 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags): See Also -------- - :py:obj:`~.cudaGetDriverEntryPoint` + :py:obj:`~.cudaGetDriverEntryPointByVersion` """ cdef cydriver.cuuint64_t cyflags if flags is None: @@ -50763,7 +50113,7 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags): See Also -------- - :py:obj:`~.cuGreenCtxDestroy`, :py:obj:`~.cuCtxFromGreenCtx`, :py:obj:`~.cuCtxSetCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuDevResourceGenerateDesc`, :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxCreate_v3` + 
:py:obj:`~.cuGreenCtxDestroy`, :py:obj:`~.cuCtxFromGreenCtx`, :py:obj:`~.cuCtxSetCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuDevResourceGenerateDesc`, :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuCtxCreate` """ cdef cydriver.CUdevice cydev if dev is None: @@ -51033,7 +50383,13 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created than purely dividing the total SM count by the `minCount` due to cluster requirements or - alignment and granularity requirements for the minCount. + alignment and granularity requirements for the minCount. These + requirements can be queried with :py:obj:`~.cuDeviceGetDevResource`, + :py:obj:`~.cuCtxGetDevResource`, and + :py:obj:`~.cuGreenCtxGetDevResource` for + :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, using the `minSmPartitionSize` and + `smCoscheduledAlignment` fields to determine minimum partition size and + alignment granularity, respectively. The `remainder` set does not have the same functional or performance guarantees as the groups in `result`. Its use should be carefully @@ -51336,7 +50692,7 @@ def cuStreamGetGreenCtx(hStream): See Also -------- - :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetCtx_v2`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags` + :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetCtx`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags` """ cdef cydriver.CUstream cyhStream if hStream is None: @@ -51427,6 +50783,49 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority): return (_dict_CUresult[err], phStream) {{endif}} +{{if 'cuGreenCtxGetId' in found_functions}} + +@cython.embedsignature(True) +def cuGreenCtxGetId(greenCtx): + """ Returns the unique Id associated with the green context supplied. + + Returns in `greenCtxId` the unique Id which is associated with a given + green context. The Id is unique for the life of the program for this + instance of CUDA. If green context is supplied as NULL and the current + context is set to a green context, the Id of the current green context + is returned. 
+ + Parameters + ---------- + greenCtx : :py:obj:`~.CUgreenCtx` + Green context for which to obtain the Id + + Returns + ------- + CUresult + :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + greenCtxId : unsigned long long + Pointer to store the Id of the green context + + See Also + -------- + :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuGreenCtxDestroy`, :py:obj:`~.cuCtxGetId` + """ + cdef cydriver.CUgreenCtx cygreenCtx + if greenCtx is None: + pgreenCtx = 0 + elif isinstance(greenCtx, (CUgreenCtx,)): + pgreenCtx = int(greenCtx) + else: + pgreenCtx = int(CUgreenCtx(greenCtx)) + cygreenCtx = pgreenCtx + cdef unsigned long long greenCtxId = 0 + err = cydriver.cuGreenCtxGetId(cygreenCtx, &greenCtxId) + if err != cydriver.CUDA_SUCCESS: + return (_dict_CUresult[err], None) + return (_dict_CUresult[err], greenCtxId) +{{endif}} + {{if 'cuLogsRegisterCallback' in found_functions}} ctypedef struct cuLogsCallbackData_st: @@ -51759,41 +51158,6 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin return (_dict_CUresult[err],) {{endif}} -{{if 'cuCheckpointProcessRestore' in found_functions}} - -@cython.embedsignature(True) -def cuCheckpointProcessRestore(int pid, args : Optional[CUcheckpointRestoreArgs]): - """ Restore a CUDA process's GPU memory contents from its last checkpoint. - - Restores a CUDA process specified by `pid` from its last checkpoint. - Process must be in the CHECKPOINTED state to restore. - - Upon successful return the process will be in the LOCKED state. - - CUDA process restore requires persistence mode to be enabled or - :py:obj:`~.cuInit` to have been called before execution. 
- - Parameters - ---------- - pid : int - The process ID of the CUDA process - args : :py:obj:`~.CUcheckpointRestoreArgs` - Optional restore operation arguments - - Returns - ------- - CUresult - :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` - - See Also - -------- - :py:obj:`~.cuInit` - """ - cdef cydriver.CUcheckpointRestoreArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL - err = cydriver.cuCheckpointProcessRestore(pid, cyargs_ptr) - return (_dict_CUresult[err],) -{{endif}} - {{if 'cuCheckpointProcessUnlock' in found_functions}} @cython.embedsignature(True) @@ -53744,12 +53108,12 @@ def sizeof(objType): {{if 'CUcheckpointCheckpointArgs' in found_types}} if objType == CUcheckpointCheckpointArgs: return sizeof(cydriver.CUcheckpointCheckpointArgs){{endif}} - {{if 'CUcheckpointRestoreArgs_st' in found_struct}} - if objType == CUcheckpointRestoreArgs_st: - return sizeof(cydriver.CUcheckpointRestoreArgs_st){{endif}} - {{if 'CUcheckpointRestoreArgs' in found_types}} - if objType == CUcheckpointRestoreArgs: - return sizeof(cydriver.CUcheckpointRestoreArgs){{endif}} + {{if 'CUcheckpointGpuPair_st' in found_struct}} + if objType == CUcheckpointGpuPair_st: + return sizeof(cydriver.CUcheckpointGpuPair_st){{endif}} + {{if 'CUcheckpointGpuPair' in found_types}} + if objType == CUcheckpointGpuPair: + return sizeof(cydriver.CUcheckpointGpuPair){{endif}} {{if 'CUcheckpointUnlockArgs_st' in found_struct}} if objType == CUcheckpointUnlockArgs_st: return sizeof(cydriver.CUcheckpointUnlockArgs_st){{endif}} diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 9fcfff4739..8ba5424599 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -34,6 +34,7 @@ ctypedef unsigned long long unsigned_ptr ctypedef unsigned long long unsigned_long_long_ptr ctypedef unsigned long long long_long_ptr ctypedef unsigned long long size_t_ptr +ctypedef unsigned long long long_ptr ctypedef unsigned long long float_ptr ctypedef unsigned long long double_ptr ctypedef unsigned long long void_ptr @@ -80,6 +81,8 @@ class nvrtcResult(IntEnum): NVRTC_ERROR_PCH_CREATE = cynvrtc.nvrtcResult.NVRTC_ERROR_PCH_CREATE{{endif}} {{if 'NVRTC_ERROR_CANCELLED' in found_values}} NVRTC_ERROR_CANCELLED = cynvrtc.nvrtcResult.NVRTC_ERROR_CANCELLED{{endif}} + {{if 'NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED' in found_values}} + NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED = cynvrtc.nvrtcResult.NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED{{endif}} _dict_nvrtcResult = dict(((int(v), v) for k, v in nvrtcResult.__members__.items())) {{endif}} @@ -501,69 +504,6 @@ def nvrtcGetCUBIN(prog, char* cubin): return (_dict_nvrtcResult[err],) {{endif}} -{{if 'nvrtcGetNVVMSize' in found_functions}} - -@cython.embedsignature(True) -def nvrtcGetNVVMSize(prog): - """ DEPRECATION NOTICE: This function will be removed in a future release. Please use nvrtcGetLTOIRSize (and nvrtcGetLTOIR) instead. 
- - Parameters - ---------- - prog : :py:obj:`~.nvrtcProgram` - None - - Returns - ------- - nvrtcResult - - nvvmSizeRet : int - None - """ - cdef cynvrtc.nvrtcProgram cyprog - if prog is None: - pprog = 0 - elif isinstance(prog, (nvrtcProgram,)): - pprog = int(prog) - else: - pprog = int(nvrtcProgram(prog)) - cyprog = pprog - cdef size_t nvvmSizeRet = 0 - err = cynvrtc.nvrtcGetNVVMSize(cyprog, &nvvmSizeRet) - if err != cynvrtc.NVRTC_SUCCESS: - return (_dict_nvrtcResult[err], None) - return (_dict_nvrtcResult[err], nvvmSizeRet) -{{endif}} - -{{if 'nvrtcGetNVVM' in found_functions}} - -@cython.embedsignature(True) -def nvrtcGetNVVM(prog, char* nvvm): - """ DEPRECATION NOTICE: This function will be removed in a future release. Please use nvrtcGetLTOIR (and nvrtcGetLTOIRSize) instead. - - Parameters - ---------- - prog : :py:obj:`~.nvrtcProgram` - None - nvvm : bytes - None - - Returns - ------- - nvrtcResult - - """ - cdef cynvrtc.nvrtcProgram cyprog - if prog is None: - pprog = 0 - elif isinstance(prog, (nvrtcProgram,)): - pprog = int(prog) - else: - pprog = int(nvrtcProgram(prog)) - cyprog = pprog - err = cynvrtc.nvrtcGetNVVM(cyprog, nvvm) - return (_dict_nvrtcResult[err],) -{{endif}} - {{if 'nvrtcGetLTOIRSize' in found_functions}} @cython.embedsignature(True) diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in index 83987c8fbd..45a1505693 100644 --- a/cuda_bindings/cuda/bindings/runtime.pxd.in +++ b/cuda_bindings/cuda/bindings/runtime.pxd.in @@ -193,6 +193,21 @@ cdef class cudaAsyncCallbackHandle_t: cdef cyruntime.cudaAsyncCallbackHandle_t* _pvt_ptr {{endif}} +{{if 'cudaLogsCallbackHandle' in found_types}} + +cdef class cudaLogsCallbackHandle: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + cdef cyruntime.cudaLogsCallbackHandle _pvt_val + cdef cyruntime.cudaLogsCallbackHandle* _pvt_ptr +{{endif}} + {{if True}} cdef class EGLImageKHR: @@ -283,6 +298,21 @@ cdef class cudaStreamCallback_t: cdef cyruntime.cudaStreamCallback_t* _pvt_ptr {{endif}} +{{if 'cudaLogsCallback_t' in found_types}} + +cdef class cudaLogsCallback_t: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + cdef cyruntime.cudaLogsCallback_t _pvt_val + cdef cyruntime.cudaLogsCallback_t* _pvt_ptr +{{endif}} + {{if 'dim3' in found_struct}} cdef class dim3: @@ -1004,6 +1034,24 @@ cdef class anon_struct4: cdef cudaChannelFormatDesc _desc {{endif}} {{endif}} +{{if 'cudaResourceDesc.res.reserved' in found_struct}} + +cdef class anon_struct5: + """ + Attributes + ---------- + {{if 'cudaResourceDesc.res.reserved.reserved' in found_struct}} + reserved : List[int] + + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + cdef cyruntime.cudaResourceDesc* _pvt_ptr +{{endif}} {{if 'cudaResourceDesc.res' in found_struct}} cdef class anon_union0: @@ -1025,6 +1073,10 @@ cdef class anon_union0: {{if 'cudaResourceDesc.res.pitch2D' in found_struct}} pitch2D : anon_struct4 + {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + reserved : anon_struct5 + {{endif}} Methods @@ -1045,6 +1097,9 @@ cdef class anon_union0: {{if 'cudaResourceDesc.res.pitch2D' in found_struct}} cdef anon_struct4 _pitch2D {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + cdef anon_struct5 _reserved + {{endif}} {{endif}} {{if 'cudaResourceDesc' in found_struct}} @@ -1062,6 +1117,10 @@ cdef class cudaResourceDesc: res : anon_union0 {{endif}} + {{if 'cudaResourceDesc.flags' in 
found_struct}} + flags : unsigned int + Flags (must be zero) + {{endif}} Methods ------- @@ -1114,6 +1173,10 @@ cdef class cudaResourceViewDesc: lastLayer : unsigned int Last layer index {{endif}} + {{if 'cudaResourceViewDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -1158,6 +1221,10 @@ cdef class cudaPointerAttributes: unregistered memory is allocated so this field may contain invalid pointer if an invalid pointer has been passed to CUDA. {{endif}} + {{if 'cudaPointerAttributes.reserved' in found_struct}} + reserved : List[long] + Must be zero + {{endif}} Methods ------- @@ -1612,7 +1679,7 @@ cdef class cudaOffset3D: {{endif}} {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} -cdef class anon_struct5: +cdef class anon_struct6: """ Attributes ---------- @@ -1645,7 +1712,7 @@ cdef class anon_struct5: {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} -cdef class anon_struct6: +cdef class anon_struct7: """ Attributes ---------- @@ -1678,11 +1745,11 @@ cdef class anon_union1: Attributes ---------- {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} - ptr : anon_struct5 + ptr : anon_struct6 {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} - array : anon_struct6 + array : anon_struct7 {{endif}} @@ -1693,10 +1760,10 @@ cdef class anon_union1: """ cdef cyruntime.cudaMemcpy3DOperand* _pvt_ptr {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} - cdef anon_struct5 _ptr + cdef anon_struct6 _ptr {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} - cdef anon_struct6 _array + cdef anon_struct7 _array {{endif}} {{endif}} {{if 'cudaMemcpy3DOperand' in found_struct}} @@ -1849,10 +1916,6 @@ cdef class cudaDeviceProp: maxGridSize : List[int] Maximum size of each dimension of a grid {{endif}} - {{if 'cudaDeviceProp.clockRate' in found_struct}} - clockRate : int - Deprecated, Clock frequency in kilohertz - {{endif}} {{if 'cudaDeviceProp.totalConstMem' in found_struct}} totalConstMem : size_t Constant memory available on device in bytes @@ -1874,19 +1937,10 @@ cdef class cudaDeviceProp: Pitch alignment requirement for texture references bound to pitched memory {{endif}} - {{if 'cudaDeviceProp.deviceOverlap' in found_struct}} - deviceOverlap : int - Device can concurrently copy memory and execute a kernel. - Deprecated. Use instead asyncEngineCount. - {{endif}} {{if 'cudaDeviceProp.multiProcessorCount' in found_struct}} multiProcessorCount : int Number of multiprocessors on device {{endif}} - {{if 'cudaDeviceProp.kernelExecTimeoutEnabled' in found_struct}} - kernelExecTimeoutEnabled : int - Deprecated, Specified whether there is a run time limit on kernels - {{endif}} {{if 'cudaDeviceProp.integrated' in found_struct}} integrated : int Device is integrated as opposed to discrete @@ -1896,10 +1950,6 @@ cdef class cudaDeviceProp: Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer {{endif}} - {{if 'cudaDeviceProp.computeMode' in found_struct}} - computeMode : int - Deprecated, Compute mode (See cudaComputeMode) - {{endif}} {{if 'cudaDeviceProp.maxTexture1D' in found_struct}} maxTexture1D : int Maximum 1D texture size @@ -1908,11 +1958,6 @@ cdef class cudaDeviceProp: maxTexture1DMipmap : int Maximum 1D mipmapped texture size {{endif}} - {{if 'cudaDeviceProp.maxTexture1DLinear' in found_struct}} - maxTexture1DLinear : int - Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() - or cuDeviceGetTexture1DLinearMaxWidth() instead. 
- {{endif}} {{if 'cudaDeviceProp.maxTexture2D' in found_struct}} maxTexture2D : List[int] Maximum 2D texture dimensions @@ -2019,10 +2064,6 @@ cdef class cudaDeviceProp: unifiedAddressing : int Device shares a unified address space with the host {{endif}} - {{if 'cudaDeviceProp.memoryClockRate' in found_struct}} - memoryClockRate : int - Deprecated, Peak memory clock frequency in kilohertz - {{endif}} {{if 'cudaDeviceProp.memoryBusWidth' in found_struct}} memoryBusWidth : int Global memory bus width in bits @@ -2077,11 +2118,6 @@ cdef class cudaDeviceProp: Link between the device and the host supports native atomic operations {{endif}} - {{if 'cudaDeviceProp.singleToDoublePrecisionPerfRatio' in found_struct}} - singleToDoublePrecisionPerfRatio : int - Deprecated, Ratio of single precision performance (in floating- - point operations per second) to double precision performance - {{endif}} {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}} pageableMemoryAccess : int Device supports coherently accessing pageable memory without @@ -2106,10 +2142,6 @@ cdef class cudaDeviceProp: Device supports launching cooperative kernels via cudaLaunchCooperativeKernel {{endif}} - {{if 'cudaDeviceProp.cooperativeMultiDeviceLaunch' in found_struct}} - cooperativeMultiDeviceLaunch : int - Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. - {{endif}} {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}} sharedMemPerBlockOptin : size_t Per device maximum shared memory per block usable by special opt in @@ -2193,6 +2225,38 @@ cdef class cudaDeviceProp: unifiedFunctionPointers : int Indicates device supports unified pointers {{endif}} + {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}} + deviceNumaConfig : int + NUMA configuration of a device: value is of type + cudaDeviceNumaConfig enum + {{endif}} + {{if 'cudaDeviceProp.deviceNumaId' in found_struct}} + deviceNumaId : int + NUMA node ID of the GPU memory + {{endif}} + {{if 'cudaDeviceProp.mpsEnabled' in found_struct}} + mpsEnabled : int + Indicates if contexts created on this device will be shared via MPS + {{endif}} + {{if 'cudaDeviceProp.hostNumaId' in found_struct}} + hostNumaId : int + NUMA ID of the host node closest to the device or -1 when system + does not support NUMA + {{endif}} + {{if 'cudaDeviceProp.gpuPciDeviceID' in found_struct}} + gpuPciDeviceID : unsigned int + The combined 16-bit PCI device ID and 16-bit PCI vendor ID + {{endif}} + {{if 'cudaDeviceProp.gpuPciSubsystemID' in found_struct}} + gpuPciSubsystemID : unsigned int + The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem + vendor ID + {{endif}} + {{if 'cudaDeviceProp.hostNumaMultinodeIpcSupported' in found_struct}} + hostNumaMultinodeIpcSupported : int + 1 if the device supports HostNuma location IPC between nodes in a + multi-node system. 
+ {{endif}} {{if 'cudaDeviceProp.reserved' in found_struct}} reserved : List[int] Reserved for future use @@ -2272,7 +2336,7 @@ cdef class cudaMemFabricHandle_st: {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} -cdef class anon_struct7: +cdef class anon_struct8: """ Attributes ---------- @@ -2303,7 +2367,7 @@ cdef class anon_union2: {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} - win32 : anon_struct7 + win32 : anon_struct8 {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.nvSciBufObject' in found_struct}} @@ -2318,7 +2382,7 @@ cdef class anon_union2: """ cdef cyruntime.cudaExternalMemoryHandleDesc* _pvt_ptr {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} - cdef anon_struct7 _win32 + cdef anon_struct8 _win32 {{endif}} {{endif}} {{if 'cudaExternalMemoryHandleDesc' in found_struct}} @@ -2345,6 +2409,10 @@ cdef class cudaExternalMemoryHandleDesc: flags : unsigned int Flags must either be zero or cudaExternalMemoryDedicated {{endif}} + {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -2377,6 +2445,10 @@ cdef class cudaExternalMemoryBufferDesc: flags : unsigned int Flags reserved for future use. Must be zero. {{endif}} + {{if 'cudaExternalMemoryBufferDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -2416,6 +2488,10 @@ cdef class cudaExternalMemoryMipmappedArrayDesc: numLevels : unsigned int Total number of levels in the mipmap chain {{endif}} + {{if 'cudaExternalMemoryMipmappedArrayDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -2433,7 +2509,7 @@ cdef class cudaExternalMemoryMipmappedArrayDesc: {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} -cdef class anon_struct8: +cdef class anon_struct9: """ Attributes ---------- @@ -2464,7 +2540,7 @@ cdef class anon_union3: {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} - win32 : anon_struct8 + win32 : anon_struct9 {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.nvSciSyncObj' in found_struct}} @@ -2479,7 +2555,7 @@ cdef class anon_union3: """ cdef cyruntime.cudaExternalSemaphoreHandleDesc* _pvt_ptr {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} - cdef anon_struct8 _win32 + cdef anon_struct9 _win32 {{endif}} {{endif}} {{if 'cudaExternalSemaphoreHandleDesc' in found_struct}} @@ -2502,6 +2578,10 @@ cdef class cudaExternalSemaphoreHandleDesc: flags : unsigned int Flags reserved for the future. Must be zero. 
{{endif}} + {{if 'cudaExternalSemaphoreHandleDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -2516,7 +2596,7 @@ cdef class cudaExternalSemaphoreHandleDesc: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} -cdef class anon_struct15: +cdef class anon_struct10: """ Attributes ---------- @@ -2534,7 +2614,7 @@ cdef class anon_struct15: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} -cdef class anon_union6: +cdef class anon_union4: """ Attributes ---------- @@ -2556,7 +2636,7 @@ cdef class anon_union6: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} -cdef class anon_struct16: +cdef class anon_struct11: """ Attributes ---------- @@ -2574,20 +2654,20 @@ cdef class anon_struct16: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} -cdef class anon_struct17: +cdef class anon_struct12: """ Attributes ---------- {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} - fence : anon_struct15 + fence : anon_struct10 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} - nvSciSync : anon_union6 + nvSciSync : anon_union4 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} - keyedMutex : anon_struct16 + keyedMutex : anon_struct11 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.reserved' in found_struct}} @@ -2602,13 +2682,13 @@ cdef class anon_struct17: """ cdef cyruntime.cudaExternalSemaphoreSignalParams* _pvt_ptr {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} - cdef anon_struct15 _fence + cdef anon_struct10 _fence {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} - cdef anon_union6 _nvSciSync + cdef anon_union4 _nvSciSync {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} - cdef anon_struct16 _keyedMutex + cdef anon_struct11 _keyedMutex {{endif}} {{endif}} {{if 'cudaExternalSemaphoreSignalParams' in found_struct}} @@ -2620,7 +2700,7 @@ cdef class cudaExternalSemaphoreSignalParams: Attributes ---------- {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} - params : anon_struct17 + params : anon_struct12 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.flags' in found_struct}} @@ -2647,12 +2727,12 @@ cdef class cudaExternalSemaphoreSignalParams: cdef cyruntime.cudaExternalSemaphoreSignalParams _pvt_val cdef cyruntime.cudaExternalSemaphoreSignalParams* _pvt_ptr {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} - cdef anon_struct17 _params + cdef anon_struct12 _params {{endif}} {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.fence' in found_struct}} -cdef class anon_struct18: +cdef class anon_struct13: """ Attributes ---------- @@ -2670,7 +2750,7 @@ cdef class anon_struct18: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} -cdef class anon_union7: +cdef class anon_union5: """ Attributes ---------- @@ -2692,7 +2772,7 @@ cdef class anon_union7: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} -cdef class anon_struct19: +cdef class anon_struct14: """ Attributes ---------- @@ -2714,20 +2794,20 @@ cdef class anon_struct19: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} -cdef class anon_struct20: +cdef class anon_struct15: """ Attributes ---------- {{if 'cudaExternalSemaphoreWaitParams.params.fence' 
in found_struct}} - fence : anon_struct18 + fence : anon_struct13 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} - nvSciSync : anon_union7 + nvSciSync : anon_union5 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} - keyedMutex : anon_struct19 + keyedMutex : anon_struct14 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.reserved' in found_struct}} @@ -2742,13 +2822,13 @@ cdef class anon_struct20: """ cdef cyruntime.cudaExternalSemaphoreWaitParams* _pvt_ptr {{if 'cudaExternalSemaphoreWaitParams.params.fence' in found_struct}} - cdef anon_struct18 _fence + cdef anon_struct13 _fence {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} - cdef anon_union7 _nvSciSync + cdef anon_union5 _nvSciSync {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} - cdef anon_struct19 _keyedMutex + cdef anon_struct14 _keyedMutex {{endif}} {{endif}} {{if 'cudaExternalSemaphoreWaitParams' in found_struct}} @@ -2760,7 +2840,7 @@ cdef class cudaExternalSemaphoreWaitParams: Attributes ---------- {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} - params : anon_struct20 + params : anon_struct15 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.flags' in found_struct}} @@ -2787,7 +2867,7 @@ cdef class cudaExternalSemaphoreWaitParams: cdef cyruntime.cudaExternalSemaphoreWaitParams _pvt_val cdef cyruntime.cudaExternalSemaphoreWaitParams* _pvt_ptr {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} - cdef anon_struct20 _params + cdef anon_struct15 _params {{endif}} {{endif}} {{if 'cudalibraryHostUniversalFunctionAndDataTable' in found_struct}} @@ -3093,7 +3173,7 @@ cdef class cudaConditionalNodeParams: {{if 'cudaConditionalNodeParams.size' in found_struct}} size : unsigned int Size of graph output array. Allowed values are 1 for - cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeWhile, or any + cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeIf, or any value greater than zero for cudaGraphCondTypeSwitch. {{endif}} {{if 'cudaConditionalNodeParams.phGraph_out' in found_struct}} @@ -3459,7 +3539,7 @@ cdef class cudaGraphExecUpdateResultInfo_st: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} -cdef class anon_struct21: +cdef class anon_struct16: """ Attributes ---------- @@ -3485,7 +3565,7 @@ cdef class anon_struct21: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} -cdef class anon_union9: +cdef class anon_union7: """ Attributes ---------- @@ -3494,7 +3574,7 @@ cdef class anon_union9: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} - param : anon_struct21 + param : anon_struct16 {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.isEnabled' in found_struct}} @@ -3512,7 +3592,7 @@ cdef class anon_union9: cdef dim3 _gridDim {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} - cdef anon_struct21 _param + cdef anon_struct16 _param {{endif}} {{endif}} {{if 'cudaGraphKernelNodeUpdate' in found_struct}} @@ -3534,7 +3614,7 @@ cdef class cudaGraphKernelNodeUpdate: interpreted {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - updateData : anon_union9 + updateData : anon_union7 Update data to apply. 
Which field is used depends on field's value {{endif}} @@ -3549,7 +3629,7 @@ cdef class cudaGraphKernelNodeUpdate: cdef cudaGraphDeviceNode_t _node {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - cdef anon_union9 _updateData + cdef anon_union7 _updateData {{endif}} {{endif}} {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}} @@ -3585,7 +3665,7 @@ cdef class cudaLaunchMemSyncDomainMap_st: {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} -cdef class anon_struct22: +cdef class anon_struct17: """ Attributes ---------- @@ -3611,7 +3691,7 @@ cdef class anon_struct22: {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} -cdef class anon_struct23: +cdef class anon_struct18: """ Attributes ---------- @@ -3640,7 +3720,7 @@ cdef class anon_struct23: {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} -cdef class anon_struct24: +cdef class anon_struct19: """ Attributes ---------- @@ -3666,7 +3746,7 @@ cdef class anon_struct24: {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} -cdef class anon_struct25: +cdef class anon_struct20: """ Attributes ---------- @@ -3691,7 +3771,7 @@ cdef class anon_struct25: {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} -cdef class anon_struct26: +cdef class anon_struct21: """ Attributes ---------- @@ -3742,7 +3822,7 @@ cdef class cudaLaunchAttributeValue: ::cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - clusterDim : anon_struct22 + clusterDim : anon_struct17 Value of launch attribute cudaLaunchAttributeClusterDimension that represents the desired cluster dimensions for the kernel. Opaque type with the following fields: - `x` - The X dimension of the @@ -3763,7 +3843,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchAttributeProgrammaticStreamSerialization. {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - programmaticEvent : anon_struct23 + programmaticEvent : anon_struct18 Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see @@ -3787,7 +3867,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchMemSyncDomain. {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - preferredClusterDim : anon_struct24 + preferredClusterDim : anon_struct19 Value of launch attribute cudaLaunchAttributePreferredClusterDimension that represents the desired preferred cluster dimensions for the kernel. Opaque type @@ -3802,7 +3882,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - launchCompletionEvent : anon_struct25 + launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record @@ -3810,7 +3890,7 @@ cdef class cudaLaunchAttributeValue: cudaEventRecordExternal. 
{{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - deviceUpdatableKernelNode : anon_struct26 + deviceUpdatableKernelNode : anon_struct21 Value of launch attribute cudaLaunchAttributeDeviceUpdatableKernelNode with the following fields: - `int` deviceUpdatable - Whether or not the resulting @@ -3823,6 +3903,11 @@ cdef class cudaLaunchAttributeValue: Value of launch attribute cudaLaunchAttributePreferredSharedMemoryCarveout. {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + Value of launch attribute + cudaLaunchAttributeNvlinkUtilCentricScheduling. + {{endif}} Methods ------- @@ -3835,22 +3920,22 @@ cdef class cudaLaunchAttributeValue: cdef cudaAccessPolicyWindow _accessPolicyWindow {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - cdef anon_struct22 _clusterDim + cdef anon_struct17 _clusterDim {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - cdef anon_struct23 _programmaticEvent + cdef anon_struct18 _programmaticEvent {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} cdef cudaLaunchMemSyncDomainMap _memSyncDomainMap {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - cdef anon_struct24 _preferredClusterDim + cdef anon_struct19 _preferredClusterDim {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - cdef anon_struct25 _launchCompletionEvent + cdef anon_struct20 _launchCompletionEvent {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - cdef anon_struct26 _deviceUpdatableKernelNode + cdef anon_struct21 _deviceUpdatableKernelNode {{endif}} {{endif}} {{if 'cudaLaunchAttribute_st' in found_struct}} @@ -3883,7 +3968,7 @@ cdef class cudaLaunchAttribute_st: {{endif}} {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} -cdef class anon_struct27: +cdef class anon_struct22: """ Attributes ---------- @@ -3901,12 +3986,12 @@ cdef class anon_struct27: {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} -cdef class anon_union10: +cdef class anon_union8: """ Attributes ---------- {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} - overBudget : anon_struct27 + overBudget : anon_struct22 {{endif}} @@ -3917,7 +4002,7 @@ cdef class anon_union10: """ cdef cyruntime.cudaAsyncNotificationInfo* _pvt_ptr {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} - cdef anon_struct27 _overBudget + cdef anon_struct22 _overBudget {{endif}} {{endif}} {{if 'cudaAsyncNotificationInfo' in found_struct}} @@ -3933,7 +4018,7 @@ cdef class cudaAsyncNotificationInfo: The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union10 + info : anon_union8 Information about the notification. `typename` must be checked in order to interpret this field. 
{{endif}} @@ -3946,7 +4031,7 @@ cdef class cudaAsyncNotificationInfo: cdef cyruntime.cudaAsyncNotificationInfo* _val_ptr cdef cyruntime.cudaAsyncNotificationInfo* _pvt_ptr {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - cdef anon_union10 _info + cdef anon_union8 _info {{endif}} {{endif}} {{if 'cudaTextureDesc' in found_struct}} @@ -4069,7 +4154,7 @@ cdef class cudaEglPlaneDesc_st: {{endif}} {{if True}} -cdef class anon_union11: +cdef class anon_union9: """ Attributes ---------- @@ -4105,7 +4190,7 @@ cdef class cudaEglFrame_st: Attributes ---------- {{if True}} - frame : anon_union11 + frame : anon_union9 {{endif}} {{if True}} @@ -4133,7 +4218,7 @@ cdef class cudaEglFrame_st: cdef cyruntime.cudaEglFrame_st* _val_ptr cdef cyruntime.cudaEglFrame_st* _pvt_ptr {{if True}} - cdef anon_union11 _frame + cdef anon_union9 _frame {{endif}} {{endif}} {{if 'CUuuid' in found_types}} @@ -4411,7 +4496,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo): The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union10 + info : anon_union8 Information about the notification. `typename` must be checked in order to interpret this field. {{endif}} @@ -4451,7 +4536,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): ::cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - clusterDim : anon_struct22 + clusterDim : anon_struct17 Value of launch attribute cudaLaunchAttributeClusterDimension that represents the desired cluster dimensions for the kernel. Opaque type with the following fields: - `x` - The X dimension of the @@ -4472,7 +4557,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): cudaLaunchAttributeProgrammaticStreamSerialization. {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - programmaticEvent : anon_struct23 + programmaticEvent : anon_struct18 Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see @@ -4496,7 +4581,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): cudaLaunchMemSyncDomain. {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - preferredClusterDim : anon_struct24 + preferredClusterDim : anon_struct19 Value of launch attribute cudaLaunchAttributePreferredClusterDimension that represents the desired preferred cluster dimensions for the kernel. Opaque type @@ -4511,7 +4596,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - launchCompletionEvent : anon_struct25 + launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record @@ -4519,7 +4604,7 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): cudaEventRecordExternal. 
{{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - deviceUpdatableKernelNode : anon_struct26 + deviceUpdatableKernelNode : anon_struct21 Value of launch attribute cudaLaunchAttributeDeviceUpdatableKernelNode with the following fields: - `int` deviceUpdatable - Whether or not the resulting @@ -4532,6 +4617,11 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue): Value of launch attribute cudaLaunchAttributePreferredSharedMemoryCarveout. {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + Value of launch attribute + cudaLaunchAttributeNvlinkUtilCentricScheduling. + {{endif}} Methods ------- @@ -4568,7 +4658,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): ::cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - clusterDim : anon_struct22 + clusterDim : anon_struct17 Value of launch attribute cudaLaunchAttributeClusterDimension that represents the desired cluster dimensions for the kernel. Opaque type with the following fields: - `x` - The X dimension of the @@ -4589,7 +4679,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): cudaLaunchAttributeProgrammaticStreamSerialization. {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - programmaticEvent : anon_struct23 + programmaticEvent : anon_struct18 Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see @@ -4613,7 +4703,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): cudaLaunchMemSyncDomain. {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - preferredClusterDim : anon_struct24 + preferredClusterDim : anon_struct19 Value of launch attribute cudaLaunchAttributePreferredClusterDimension that represents the desired preferred cluster dimensions for the kernel. Opaque type @@ -4628,7 +4718,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): cudaLaunchAttributeValue::clusterDim. {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - launchCompletionEvent : anon_struct25 + launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record @@ -4636,7 +4726,7 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): cudaEventRecordExternal. {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - deviceUpdatableKernelNode : anon_struct26 + deviceUpdatableKernelNode : anon_struct21 Value of launch attribute cudaLaunchAttributeDeviceUpdatableKernelNode with the following fields: - `int` deviceUpdatable - Whether or not the resulting @@ -4649,6 +4739,11 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue): Value of launch attribute cudaLaunchAttributePreferredSharedMemoryCarveout. {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + Value of launch attribute + cudaLaunchAttributeNvlinkUtilCentricScheduling. 
+ {{endif}} Methods ------- @@ -4718,7 +4813,7 @@ cdef class cudaEglFrame(cudaEglFrame_st): Attributes ---------- {{if True}} - frame : anon_union11 + frame : anon_union9 {{endif}} {{if True}} @@ -4906,6 +5001,21 @@ cdef class cudaGraphConditionalHandle: cdef cyruntime.cudaGraphConditionalHandle* _pvt_ptr {{endif}} +{{if 'cudaLogIterator' in found_types}} + +cdef class cudaLogIterator: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + cdef cyruntime.cudaLogIterator _pvt_val + cdef cyruntime.cudaLogIterator* _pvt_ptr +{{endif}} + {{if 'cudaSurfaceObject_t' in found_types}} cdef class cudaSurfaceObject_t: diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 4e36cce1ef..5f0d9d06a2 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -36,6 +36,7 @@ ctypedef unsigned long long unsigned_ptr ctypedef unsigned long long unsigned_long_long_ptr ctypedef unsigned long long long_long_ptr ctypedef unsigned long long size_t_ptr +ctypedef unsigned long long long_ptr ctypedef unsigned long long float_ptr ctypedef unsigned long long double_ptr ctypedef unsigned long long void_ptr @@ -209,18 +210,6 @@ cudaInvalidDeviceId = cyruntime.cudaInvalidDeviceId #: call cudaInitDeviceFlagsAreValid = cyruntime.cudaInitDeviceFlagsAreValid -#: If set, each kernel launched as part of -#: :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` only waits for prior -#: work in the stream corresponding to that GPU to complete before the -#: kernel begins execution. -cudaCooperativeLaunchMultiDeviceNoPreSync = cyruntime.cudaCooperativeLaunchMultiDeviceNoPreSync - -#: If set, any subsequent work pushed in a stream that participated in a -#: call to :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` will only -#: wait for the kernel launched on the GPU corresponding to that stream to -#: complete before it begins execution. -cudaCooperativeLaunchMultiDeviceNoPostSync = cyruntime.cudaCooperativeLaunchMultiDeviceNoPostSync - #: Indicates that the layered sparse CUDA array or CUDA mipmapped array has #: a single mip tail region for all layers cudaArraySparsePropertiesSingleMipTail = cyruntime.cudaArraySparsePropertiesSingleMipTail @@ -306,6 +295,8 @@ cudaKernelNodeAttributePreferredSharedMemoryCarveout = cyruntime.cudaKernelNodeA cudaKernelNodeAttributeDeviceUpdatableKernelNode = cyruntime.cudaKernelNodeAttributeDeviceUpdatableKernelNode +cudaKernelNodeAttributeNvlinkUtilCentricScheduling = cyruntime.cudaKernelNodeAttributeNvlinkUtilCentricScheduling + cudaSurfaceType1D = cyruntime.cudaSurfaceType1D cudaSurfaceType2D = cyruntime.cudaSurfaceType2D @@ -780,6 +771,11 @@ class cudaError_t(IntEnum): #: same error. To continue using CUDA, the process must be terminated #: and relaunched. cudaErrorContained = cyruntime.cudaError.cudaErrorContained{{endif}} + {{if 'cudaErrorNvlinkEncryptionFailed' in found_values}} + + #: This indicates that an NVLink encryption error was detected during + #: the execution. + cudaErrorNvlinkEncryptionFailed = cyruntime.cudaError.cudaErrorNvlinkEncryptionFailed{{endif}} {{if 'cudaErrorInvalidSource' in found_values}} #: This indicates that the device kernel source is invalid. @@ -852,8 +848,8 @@ class cudaError_t(IntEnum): {{if 'cudaErrorLaunchTimeout' in found_values}} #: This indicates that the device kernel took too long to execute. 
This - #: can only occur if timeouts are enabled - see the device property - #: :py:obj:`~.kernelExecTimeoutEnabled` for more information. This + #: can only occur if timeouts are enabled - see the device attribute + #: :py:obj:`~.cudaDevAttrKernelExecTimeout` for more information. This #: leaves the process in an inconsistent state and any further CUDA #: work will return the same error. To continue using CUDA, the process #: must be terminated and relaunched. @@ -975,9 +971,8 @@ class cudaError_t(IntEnum): #: This error indicates that the number of blocks launched per grid for #: a kernel that was launched via either - #: :py:obj:`~.cudaLaunchCooperativeKernel` or - #: :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` exceeds the - #: maximum number of blocks as allowed by + #: :py:obj:`~.cudaLaunchCooperativeKernel` exceeds the maximum number + #: of blocks as allowed by #: :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` or #: :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` #: times the number of multiprocessors as specified by the device @@ -1442,6 +1437,29 @@ class cudaLaunchAttributeID(IntEnum): #: only a hint, and the driver can choose a different configuration if #: required for the launch. cudaLaunchAttributePreferredSharedMemoryCarveout = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout{{endif}} + {{if 'cudaLaunchAttributeNvlinkUtilCentricScheduling' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. + #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee the performance characteristics will be maintained between + #: driver versions or a driver update could alter or regress previously + #: observed perf characteristics.) It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are + #: 0 (disabled) and 1 (enabled). 
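The attribute is wired into the generated enum just below; a minimal sketch of opting a stream into the hint through these bindings (assuming the cuda-python convention that every call returns its error code first, and a driver new enough to recognize the attribute):

    from cuda.bindings import runtime

    err, stream = runtime.cudaStreamCreate()
    assert err == runtime.cudaError_t.cudaSuccess

    # Hint only: ask that kernels launched into this stream be scheduled
    # with NVLINK utilization in mind (0 = disabled, 1 = enabled).
    val = runtime.cudaStreamAttrValue()
    val.nvlinkUtilCentricScheduling = 1
    err, = runtime.cudaStreamSetAttribute(
        stream,
        runtime.cudaStreamAttrID.cudaLaunchAttributeNvlinkUtilCentricScheduling,
        val,
    )

Because the attribute is documented as a hint, cudaSuccess here does not imply the scheduler actually changed anything.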
+ cudaLaunchAttributeNvlinkUtilCentricScheduling = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling{{endif}} _dict_cudaLaunchAttributeID = dict(((int(v), v) for k, v in cudaLaunchAttributeID.__members__.items())) {{endif}} @@ -1458,6 +1476,19 @@ class cudaAsyncNotificationType(IntEnum): _dict_cudaAsyncNotificationType = dict(((int(v), v) for k, v in cudaAsyncNotificationType.__members__.items())) {{endif}} +{{if 'CUDAlogLevel_enum' in found_types}} + +class cudaLogLevel(IntEnum): + """ + + """ + {{if 'cudaLogLevelError' in found_values}} + cudaLogLevelError = cyruntime.CUDAlogLevel_enum.cudaLogLevelError{{endif}} + {{if 'cudaLogLevelWarning' in found_values}} + cudaLogLevelWarning = cyruntime.CUDAlogLevel_enum.cudaLogLevelWarning{{endif}} + +_dict_cudaLogLevel = dict(((int(v), v) for k, v in cudaLogLevel.__members__.items())) +{{endif}} {{if 'cudaDataType_t' in found_types}} class cudaDataType(IntEnum): @@ -1535,6 +1566,45 @@ class cudaDataType(IntEnum): _dict_cudaDataType = dict(((int(v), v) for k, v in cudaDataType.__members__.items())) {{endif}} +{{if 'cudaEmulationStrategy_t' in found_types}} + +class cudaEmulationStrategy(IntEnum): + """""" + {{if 'CUDA_EMULATION_STRATEGY_DEFAULT' in found_values}} + CUDA_EMULATION_STRATEGY_DEFAULT = cyruntime.cudaEmulationStrategy_t.CUDA_EMULATION_STRATEGY_DEFAULT{{endif}} + {{if 'CUDA_EMULATION_STRATEGY_PERFORMANT' in found_values}} + CUDA_EMULATION_STRATEGY_PERFORMANT = cyruntime.cudaEmulationStrategy_t.CUDA_EMULATION_STRATEGY_PERFORMANT{{endif}} + {{if 'CUDA_EMULATION_STRATEGY_EAGER' in found_values}} + CUDA_EMULATION_STRATEGY_EAGER = cyruntime.cudaEmulationStrategy_t.CUDA_EMULATION_STRATEGY_EAGER{{endif}} + +_dict_cudaEmulationStrategy = dict(((int(v), v) for k, v in cudaEmulationStrategy.__members__.items())) +{{endif}} +{{if 'cudaEmulationMantissaControl_t' in found_types}} + +class cudaEmulationMantissaControl(IntEnum): + """""" + {{if 'CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC' in found_values}} + CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC = cyruntime.cudaEmulationMantissaControl_t.CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC{{endif}} + {{if 'CUDA_EMULATION_MANTISSA_CONTROL_FIXED' in found_values}} + CUDA_EMULATION_MANTISSA_CONTROL_FIXED = cyruntime.cudaEmulationMantissaControl_t.CUDA_EMULATION_MANTISSA_CONTROL_FIXED{{endif}} + +_dict_cudaEmulationMantissaControl = dict(((int(v), v) for k, v in cudaEmulationMantissaControl.__members__.items())) +{{endif}} +{{if 'cudaEmulationSpecialValuesSupport_t' in found_types}} + +class cudaEmulationSpecialValuesSupport(IntEnum): + """""" + {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE' in found_values}} + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE{{endif}} + {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY' in found_values}} + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY{{endif}} + {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN' in found_values}} + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN{{endif}} + {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT' in found_values}} + CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT{{endif}} + +_dict_cudaEmulationSpecialValuesSupport = dict(((int(v), v) for k, v in 
cudaEmulationSpecialValuesSupport.__members__.items()))
+{{endif}}
 {{if 'libraryPropertyType_t' in found_types}}

 class libraryPropertyType(IntEnum):
@@ -3470,10 +3540,8 @@ class cudaDeviceAttr(IntEnum):
     #: Device supports launching cooperative kernels via
     #: :py:obj:`~.cudaLaunchCooperativeKernel`
     cudaDevAttrCooperativeLaunch = cyruntime.cudaDeviceAttr.cudaDevAttrCooperativeLaunch{{endif}}
-    {{if 'cudaDevAttrCooperativeMultiDeviceLaunch' in found_values}}
-
-    #: Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated.
-    cudaDevAttrCooperativeMultiDeviceLaunch = cyruntime.cudaDeviceAttr.cudaDevAttrCooperativeMultiDeviceLaunch{{endif}}
+    {{if 'cudaDevAttrReserved96' in found_values}}
+    cudaDevAttrReserved96 = cyruntime.cudaDeviceAttr.cudaDevAttrReserved96{{endif}}
     {{if 'cudaDevAttrMaxSharedMemoryPerBlockOptin' in found_values}}

     #: The maximum optin shared memory per block. This value may vary by
@@ -3527,11 +3595,6 @@ class cudaDeviceAttr(IntEnum):
     #: External timeline semaphore interop is supported on the device
     cudaDevAttrTimelineSemaphoreInteropSupported = cyruntime.cudaDeviceAttr.cudaDevAttrTimelineSemaphoreInteropSupported{{endif}}
-    {{if 'cudaDevAttrMaxTimelineSemaphoreInteropSupported' in found_values}}
-
-    #: Deprecated, External timeline semaphore interop is supported on the
-    #: device
-    cudaDevAttrMaxTimelineSemaphoreInteropSupported = cyruntime.cudaDeviceAttr.cudaDevAttrMaxTimelineSemaphoreInteropSupported{{endif}}
     {{if 'cudaDevAttrMemoryPoolsSupported' in found_values}}

     #: Device supports using the :py:obj:`~.cudaMallocAsync` and
@@ -3638,6 +3701,18 @@ class cudaDeviceAttr(IntEnum):
     #: Device supports HostNuma location IPC between nodes in a multi-node
     #: system.
     cudaDevAttrHostNumaMultinodeIpcSupported = cyruntime.cudaDeviceAttr.cudaDevAttrHostNumaMultinodeIpcSupported{{endif}}
+    {{if 'cudaDevAttrHostMemoryPoolsSupported' in found_values}}
+
+    #: Device supports HOST location with the :py:obj:`~.cuMemAllocAsync`
+    #: and :py:obj:`~.cuMemPool` family of APIs
+    cudaDevAttrHostMemoryPoolsSupported = cyruntime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported{{endif}}
+    {{if 'cudaDevAttrReserved145' in found_values}}
+    cudaDevAttrReserved145 = cyruntime.cudaDeviceAttr.cudaDevAttrReserved145{{endif}}
+    {{if 'cudaDevAttrOnlyPartialHostNativeAtomicSupported' in found_values}}
+
+    #: Link between the device and the host supports only some native
+    #: atomic operations
+    cudaDevAttrOnlyPartialHostNativeAtomicSupported = cyruntime.cudaDeviceAttr.cudaDevAttrOnlyPartialHostNativeAtomicSupported{{endif}}
     {{if 'cudaDevAttrMax' in found_values}}
     cudaDevAttrMax = cyruntime.cudaDeviceAttr.cudaDevAttrMax{{endif}}
@@ -3709,6 +3784,11 @@ class cudaMemLocationType(IntEnum):
     """
     {{if 'cudaMemLocationTypeInvalid' in found_values}}
     cudaMemLocationTypeInvalid = cyruntime.cudaMemLocationType.cudaMemLocationTypeInvalid{{endif}}
+    {{if 'cudaMemLocationTypeNone' in found_values}}
+
+    #: Location is unspecified. This is used when creating a managed memory
+    #: pool to indicate no preferred location for the pool
+    cudaMemLocationTypeNone = cyruntime.cudaMemLocationType.cudaMemLocationTypeNone{{endif}}
     {{if 'cudaMemLocationTypeDevice' in found_values}}

     #: Location is a device location, thus id is a device ordinal
     cudaMemLocationTypeDevice = cyruntime.cudaMemLocationType.cudaMemLocationTypeDevice{{endif}}
@@ -3763,6 +3843,10 @@ class cudaMemAllocationType(IntEnum):
     #: This allocation type is 'pinned', i.e.
cannot migrate from its #: current location while the application is actively using it cudaMemAllocationTypePinned = cyruntime.cudaMemAllocationType.cudaMemAllocationTypePinned{{endif}} + {{if 'cudaMemAllocationTypeManaged' in found_values}} + + #: This allocation type is managed memory + cudaMemAllocationTypeManaged = cyruntime.cudaMemAllocationType.cudaMemAllocationTypeManaged{{endif}} {{if 'cudaMemAllocationTypeMax' in found_values}} cudaMemAllocationTypeMax = cyruntime.cudaMemAllocationType.cudaMemAllocationTypeMax{{endif}} @@ -3930,9 +4014,71 @@ class cudaDeviceP2PAttr(IntEnum): #: Accessing CUDA arrays over the link supported cudaDevP2PAttrCudaArrayAccessSupported = cyruntime.cudaDeviceP2PAttr.cudaDevP2PAttrCudaArrayAccessSupported{{endif}} + {{if 'cudaDevP2PAttrOnlyPartialNativeAtomicSupported' in found_values}} + + #: Only some CUDA-valid atomic operations over the link are supported. + cudaDevP2PAttrOnlyPartialNativeAtomicSupported = cyruntime.cudaDeviceP2PAttr.cudaDevP2PAttrOnlyPartialNativeAtomicSupported{{endif}} _dict_cudaDeviceP2PAttr = dict(((int(v), v) for k, v in cudaDeviceP2PAttr.__members__.items())) {{endif}} +{{if 'cudaAtomicOperation' in found_types}} + +class cudaAtomicOperation(IntEnum): + """ + CUDA-valid Atomic Operations + """ + {{if 'cudaAtomicOperationIntegerAdd' in found_values}} + cudaAtomicOperationIntegerAdd = cyruntime.cudaAtomicOperation.cudaAtomicOperationIntegerAdd{{endif}} + {{if 'cudaAtomicOperationIntegerMin' in found_values}} + cudaAtomicOperationIntegerMin = cyruntime.cudaAtomicOperation.cudaAtomicOperationIntegerMin{{endif}} + {{if 'cudaAtomicOperationIntegerMax' in found_values}} + cudaAtomicOperationIntegerMax = cyruntime.cudaAtomicOperation.cudaAtomicOperationIntegerMax{{endif}} + {{if 'cudaAtomicOperationIntegerIncrement' in found_values}} + cudaAtomicOperationIntegerIncrement = cyruntime.cudaAtomicOperation.cudaAtomicOperationIntegerIncrement{{endif}} + {{if 'cudaAtomicOperationIntegerDecrement' in found_values}} + cudaAtomicOperationIntegerDecrement = cyruntime.cudaAtomicOperation.cudaAtomicOperationIntegerDecrement{{endif}} + {{if 'cudaAtomicOperationAnd' in found_values}} + cudaAtomicOperationAnd = cyruntime.cudaAtomicOperation.cudaAtomicOperationAnd{{endif}} + {{if 'cudaAtomicOperationOr' in found_values}} + cudaAtomicOperationOr = cyruntime.cudaAtomicOperation.cudaAtomicOperationOr{{endif}} + {{if 'cudaAtomicOperationXOR' in found_values}} + cudaAtomicOperationXOR = cyruntime.cudaAtomicOperation.cudaAtomicOperationXOR{{endif}} + {{if 'cudaAtomicOperationExchange' in found_values}} + cudaAtomicOperationExchange = cyruntime.cudaAtomicOperation.cudaAtomicOperationExchange{{endif}} + {{if 'cudaAtomicOperationCAS' in found_values}} + cudaAtomicOperationCAS = cyruntime.cudaAtomicOperation.cudaAtomicOperationCAS{{endif}} + {{if 'cudaAtomicOperationFloatAdd' in found_values}} + cudaAtomicOperationFloatAdd = cyruntime.cudaAtomicOperation.cudaAtomicOperationFloatAdd{{endif}} + {{if 'cudaAtomicOperationFloatMin' in found_values}} + cudaAtomicOperationFloatMin = cyruntime.cudaAtomicOperation.cudaAtomicOperationFloatMin{{endif}} + {{if 'cudaAtomicOperationFloatMax' in found_values}} + cudaAtomicOperationFloatMax = cyruntime.cudaAtomicOperation.cudaAtomicOperationFloatMax{{endif}} + +_dict_cudaAtomicOperation = dict(((int(v), v) for k, v in cudaAtomicOperation.__members__.items())) +{{endif}} +{{if 'cudaAtomicOperationCapability' in found_types}} + +class cudaAtomicOperationCapability(IntEnum): + """ + CUDA-valid Atomic Operation capabilities + """ + 
{{if 'cudaAtomicCapabilitySigned' in found_values}} + cudaAtomicCapabilitySigned = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilitySigned{{endif}} + {{if 'cudaAtomicCapabilityUnsigned' in found_values}} + cudaAtomicCapabilityUnsigned = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityUnsigned{{endif}} + {{if 'cudaAtomicCapabilityReduction' in found_values}} + cudaAtomicCapabilityReduction = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityReduction{{endif}} + {{if 'cudaAtomicCapabilityScalar32' in found_values}} + cudaAtomicCapabilityScalar32 = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar32{{endif}} + {{if 'cudaAtomicCapabilityScalar64' in found_values}} + cudaAtomicCapabilityScalar64 = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar64{{endif}} + {{if 'cudaAtomicCapabilityScalar128' in found_values}} + cudaAtomicCapabilityScalar128 = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar128{{endif}} + {{if 'cudaAtomicCapabilityVector32x4' in found_values}} + cudaAtomicCapabilityVector32x4 = cyruntime.cudaAtomicOperationCapability.cudaAtomicCapabilityVector32x4{{endif}} + +_dict_cudaAtomicOperationCapability = dict(((int(v), v) for k, v in cudaAtomicOperationCapability.__members__.items())) +{{endif}} {{if 'cudaExternalMemoryHandleType' in found_types}} class cudaExternalMemoryHandleType(IntEnum): @@ -4252,10 +4398,10 @@ class cudaCGScope(IntEnum): #: Scope represented by a grid_group cudaCGScopeGrid = cyruntime.cudaCGScope.cudaCGScopeGrid{{endif}} - {{if 'cudaCGScopeMultiGrid' in found_values}} + {{if 'cudaCGScopeReserved' in found_values}} - #: Scope represented by a multi_grid_group - cudaCGScopeMultiGrid = cyruntime.cudaCGScope.cudaCGScopeMultiGrid{{endif}} + #: Reserved + cudaCGScopeReserved = cyruntime.cudaCGScope.cudaCGScopeReserved{{endif}} _dict_cudaCGScope = dict(((int(v), v) for k, v in cudaCGScope.__members__.items())) {{endif}} @@ -4982,6 +5128,29 @@ class cudaStreamAttrID(IntEnum): #: only a hint, and the driver can choose a different configuration if #: required for the launch. cudaLaunchAttributePreferredSharedMemoryCarveout = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout{{endif}} + {{if 'cudaLaunchAttributeNvlinkUtilCentricScheduling' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. + #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee the performance characteristics will be maintained between + #: driver versions or a driver update could alter or regress previously + #: observed perf characteristics.) It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are + #: 0 (disabled) and 1 (enabled). 
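Stepping back to the cudaAtomicOperation and cudaAtomicOperationCapability enums added above: the capability values read like bit flags describing what an interconnect can do for a given atomic operation. A sketch of decoding such a mask follows; note that the runtime query name cudaDeviceGetHostAtomicCapabilities and its Python signature are assumptions here (only the enums themselves appear in this hunk), and the power-of-two flag layout is likewise presumed:

    from cuda.bindings import runtime

    ops = [runtime.cudaAtomicOperation.cudaAtomicOperationIntegerAdd]
    # Assumed to return one capability bitmask per queried operation
    # for device 0's link to the host.
    err, caps = runtime.cudaDeviceGetHostAtomicCapabilities(ops, len(ops), 0)

    mask = caps[0]
    if mask & runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar64:
        print("64-bit scalar integer adds are atomic over the host link")
    if mask & runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityReduction:
        print("the reduction (no-return) variant is also supported")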
+ cudaLaunchAttributeNvlinkUtilCentricScheduling = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling{{endif}} _dict_cudaLaunchAttributeID = dict(((int(v), v) for k, v in cudaLaunchAttributeID.__members__.items())) {{endif}} @@ -5173,6 +5342,29 @@ class cudaKernelNodeAttrID(IntEnum): #: only a hint, and the driver can choose a different configuration if #: required for the launch. cudaLaunchAttributePreferredSharedMemoryCarveout = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout{{endif}} + {{if 'cudaLaunchAttributeNvlinkUtilCentricScheduling' in found_values}} + + #: Valid for streams, graph nodes, launches. This attribute is a hint + #: to the CUDA runtime that the launch should attempt to make the + #: kernel maximize its NVLINK utilization. + #: + #: When possible to honor this hint, CUDA will assume each block in + #: the grid launch will carry out an even amount of NVLINK traffic, and + #: make a best-effort attempt to adjust the kernel launch based on that + #: assumption. + #: This attribute is a hint only. CUDA makes no functional or + #: performance guarantee. Its applicability can be affected by many + #: different factors, including driver version (i.e. CUDA doesn't + #: guarantee the performance characteristics will be maintained between + #: driver versions or a driver update could alter or regress previously + #: observed perf characteristics.) It also doesn't guarantee a + #: successful result, i.e. applying the attribute may not improve the + #: performance of either the targeted kernel or the encapsulating + #: application. + #: Valid values for + #: :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are + #: 0 (disabled) and 1 (enabled). + cudaLaunchAttributeNvlinkUtilCentricScheduling = cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling{{endif}} _dict_cudaLaunchAttributeID = dict(((int(v), v) for k, v in cudaLaunchAttributeID.__members__.items())) {{endif}} @@ -5517,6 +5709,35 @@ cdef class cudaAsyncCallbackHandle_t: return self._pvt_ptr {{endif}} +{{if 'cudaLogsCallbackHandle' in found_types}} + +cdef class cudaLogsCallbackHandle: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + self._pvt_ptr[0] = init_value + else: + self._pvt_ptr = _ptr + def __init__(self, *args, **kwargs): + pass + def __repr__(self): + return '' + def __index__(self): + return self.__int__() + def __int__(self): + return self._pvt_ptr[0] + def getPtr(self): + return self._pvt_ptr +{{endif}} + {{if True}} cdef class EGLImageKHR: @@ -5691,6 +5912,35 @@ cdef class cudaStreamCallback_t: return self._pvt_ptr {{endif}} +{{if 'cudaLogsCallback_t' in found_types}} + +cdef class cudaLogsCallback_t: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + def __cinit__(self, void_ptr init_value = 0, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + self._pvt_ptr[0] = init_value + else: + self._pvt_ptr = _ptr + def __init__(self, *args, **kwargs): + pass + def __repr__(self): + return '' + def __index__(self): + return self.__int__() + def __int__(self): + return self._pvt_ptr[0] + def getPtr(self): + return self._pvt_ptr +{{endif}} + {{if 'dim3' in found_struct}} cdef class dim3: @@ -7915,6 +8165,52 @@ cdef class anon_struct4: self._pvt_ptr[0].res.pitch2D.pitchInBytes = pitchInBytes {{endif}} {{endif}} 
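The hunks just below extend cudaResourceDesc with a reserved union member and a top-level flags field that must stay zero. A short sketch of filling the struct for a pitched 2D float buffer under the new layout (cudaCreateTextureObject would consume the result; error checks elided):

    from cuda.bindings import runtime

    width, height = 128, 64
    err, d_ptr, pitch = runtime.cudaMallocPitch(width * 4, height)

    # 32-bit single-channel float format.
    fmt = runtime.cudaChannelFormatDesc()
    fmt.x, fmt.y, fmt.z, fmt.w = 32, 0, 0, 0
    fmt.f = runtime.cudaChannelFormatKind.cudaChannelFormatKindFloat

    desc = runtime.cudaResourceDesc()
    desc.resType = runtime.cudaResourceType.cudaResourceTypePitch2D
    desc.res.pitch2D.devPtr = d_ptr
    desc.res.pitch2D.desc = fmt
    desc.res.pitch2D.width = width
    desc.res.pitch2D.height = height
    desc.res.pitch2D.pitchInBytes = pitch
    desc.flags = 0  # new field in this rebase; must be zero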
+{{if 'cudaResourceDesc.res.reserved' in found_struct}} + +cdef class anon_struct5: + """ + Attributes + ---------- + {{if 'cudaResourceDesc.res.reserved.reserved' in found_struct}} + reserved : List[int] + + {{endif}} + + Methods + ------- + getPtr() + Get memory address of class instance + """ + def __cinit__(self, void_ptr _ptr): + self._pvt_ptr = _ptr + + def __init__(self, void_ptr _ptr): + pass + def __dealloc__(self): + pass + def getPtr(self): + return &self._pvt_ptr[0].res.reserved + def __repr__(self): + if self._pvt_ptr is not NULL: + str_list = [] + {{if 'cudaResourceDesc.res.reserved.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} + return '\n'.join(str_list) + else: + return '' + {{if 'cudaResourceDesc.res.reserved.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].res.reserved.reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].res.reserved.reserved = reserved + {{endif}} +{{endif}} {{if 'cudaResourceDesc.res' in found_struct}} cdef class anon_union0: @@ -7936,6 +8232,10 @@ cdef class anon_union0: {{if 'cudaResourceDesc.res.pitch2D' in found_struct}} pitch2D : anon_struct4 + {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + reserved : anon_struct5 + {{endif}} Methods @@ -7960,6 +8260,9 @@ cdef class anon_union0: {{if 'cudaResourceDesc.res.pitch2D' in found_struct}} self._pitch2D = anon_struct4(_ptr=self._pvt_ptr) {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + self._reserved = anon_struct5(_ptr=self._pvt_ptr) + {{endif}} def __dealloc__(self): pass def getPtr(self): @@ -7991,6 +8294,12 @@ cdef class anon_union0: except ValueError: str_list += ['pitch2D : '] {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + try: + str_list += ['reserved :\n' + '\n'.join([' ' + line for line in str(self.reserved).splitlines()])] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -8026,6 +8335,14 @@ cdef class anon_union0: def pitch2D(self, pitch2D not None : anon_struct4): string.memcpy(&self._pvt_ptr[0].res.pitch2D, pitch2D.getPtr(), sizeof(self._pvt_ptr[0].res.pitch2D)) {{endif}} + {{if 'cudaResourceDesc.res.reserved' in found_struct}} + @property + def reserved(self): + return self._reserved + @reserved.setter + def reserved(self, reserved not None : anon_struct5): + string.memcpy(&self._pvt_ptr[0].res.reserved, reserved.getPtr(), sizeof(self._pvt_ptr[0].res.reserved)) + {{endif}} {{endif}} {{if 'cudaResourceDesc' in found_struct}} @@ -8043,6 +8360,10 @@ cdef class cudaResourceDesc: res : anon_union0 {{endif}} + {{if 'cudaResourceDesc.flags' in found_struct}} + flags : unsigned int + Flags (must be zero) + {{endif}} Methods ------- @@ -8080,6 +8401,12 @@ cdef class cudaResourceDesc: except ValueError: str_list += ['res : '] {{endif}} + {{if 'cudaResourceDesc.flags' in found_struct}} + try: + str_list += ['flags : ' + str(self.flags)] + except ValueError: + str_list += ['flags : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -8101,6 +8428,14 @@ cdef class cudaResourceDesc: def res(self, res not None : anon_union0): string.memcpy(&self._pvt_ptr[0].res, res.getPtr(), sizeof(self._pvt_ptr[0].res)) {{endif}} + {{if 'cudaResourceDesc.flags' in found_struct}} + @property + def flags(self): + return self._pvt_ptr[0].flags + @flags.setter + def flags(self, unsigned int flags): + self._pvt_ptr[0].flags = flags 
+ {{endif}} {{endif}} {{if 'cudaResourceViewDesc' in found_struct}} @@ -8142,6 +8477,10 @@ cdef class cudaResourceViewDesc: lastLayer : unsigned int Last layer index {{endif}} + {{if 'cudaResourceViewDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -8210,6 +8549,12 @@ cdef class cudaResourceViewDesc: except ValueError: str_list += ['lastLayer : '] {{endif}} + {{if 'cudaResourceViewDesc.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -8279,6 +8624,14 @@ cdef class cudaResourceViewDesc: def lastLayer(self, unsigned int lastLayer): self._pvt_ptr[0].lastLayer = lastLayer {{endif}} + {{if 'cudaResourceViewDesc.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaPointerAttributes' in found_struct}} @@ -8315,6 +8668,10 @@ cdef class cudaPointerAttributes: unregistered memory is allocated so this field may contain invalid pointer if an invalid pointer has been passed to CUDA. {{endif}} + {{if 'cudaPointerAttributes.reserved' in found_struct}} + reserved : List[long] + Must be zero + {{endif}} Methods ------- @@ -8359,6 +8716,12 @@ cdef class cudaPointerAttributes: except ValueError: str_list += ['hostPointer : '] {{endif}} + {{if 'cudaPointerAttributes.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -8398,6 +8761,14 @@ cdef class cudaPointerAttributes: _chostPointer = utils.HelperInputVoidPtr(hostPointer) self._pvt_ptr[0].hostPointer = _chostPointer.cptr {{endif}} + {{if 'cudaPointerAttributes.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaFuncAttributes' in found_struct}} @@ -9702,7 +10073,7 @@ cdef class cudaOffset3D: {{endif}} {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} -cdef class anon_struct5: +cdef class anon_struct6: """ Attributes ---------- @@ -9806,7 +10177,7 @@ cdef class anon_struct5: {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} -cdef class anon_struct6: +cdef class anon_struct7: """ Attributes ---------- @@ -9890,11 +10261,11 @@ cdef class anon_union1: Attributes ---------- {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} - ptr : anon_struct5 + ptr : anon_struct6 {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} - array : anon_struct6 + array : anon_struct7 {{endif}} @@ -9909,10 +10280,10 @@ cdef class anon_union1: def __init__(self, void_ptr _ptr): pass {{if 'cudaMemcpy3DOperand.op.ptr' in found_struct}} - self._ptr = anon_struct5(_ptr=self._pvt_ptr) + self._ptr = anon_struct6(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} - self._array = anon_struct6(_ptr=self._pvt_ptr) + self._array = anon_struct7(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -9941,16 +10312,16 @@ cdef class anon_union1: def ptr(self): return self._ptr @ptr.setter - def ptr(self, ptr not None : anon_struct5): - string.memcpy(&self._pvt_ptr[0].op.ptr, ptr.getPtr(), sizeof(self._pvt_ptr[0].op.ptr)) + def ptr(self, ptr not None : 
anon_struct6): + string.memcpy(&self._pvt_ptr[0].op.ptr, ptr.getPtr(), sizeof(self._pvt_ptr[0].op.ptr)) {{endif}} {{if 'cudaMemcpy3DOperand.op.array' in found_struct}} @property def array(self): return self._array @array.setter - def array(self, array not None : anon_struct6): - string.memcpy(&self._pvt_ptr[0].op.array, array.getPtr(), sizeof(self._pvt_ptr[0].op.array)) + def array(self, array not None : anon_struct7): + string.memcpy(&self._pvt_ptr[0].op.array, array.getPtr(), sizeof(self._pvt_ptr[0].op.array)) {{endif}} {{endif}} {{if 'cudaMemcpy3DOperand' in found_struct}} @@ -10263,10 +10634,6 @@ cdef class cudaDeviceProp: maxGridSize : List[int] Maximum size of each dimension of a grid {{endif}} - {{if 'cudaDeviceProp.clockRate' in found_struct}} - clockRate : int - Deprecated, Clock frequency in kilohertz - {{endif}} {{if 'cudaDeviceProp.totalConstMem' in found_struct}} totalConstMem : size_t Constant memory available on device in bytes @@ -10288,19 +10655,10 @@ cdef class cudaDeviceProp: Pitch alignment requirement for texture references bound to pitched memory {{endif}} - {{if 'cudaDeviceProp.deviceOverlap' in found_struct}} - deviceOverlap : int - Device can concurrently copy memory and execute a kernel. - Deprecated. Use instead asyncEngineCount. - {{endif}} {{if 'cudaDeviceProp.multiProcessorCount' in found_struct}} multiProcessorCount : int Number of multiprocessors on device {{endif}} - {{if 'cudaDeviceProp.kernelExecTimeoutEnabled' in found_struct}} - kernelExecTimeoutEnabled : int - Deprecated, Specified whether there is a run time limit on kernels - {{endif}} {{if 'cudaDeviceProp.integrated' in found_struct}} integrated : int Device is integrated as opposed to discrete @@ -10310,10 +10668,6 @@ cdef class cudaDeviceProp: Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer {{endif}} - {{if 'cudaDeviceProp.computeMode' in found_struct}} - computeMode : int - Deprecated, Compute mode (See cudaComputeMode) - {{endif}} {{if 'cudaDeviceProp.maxTexture1D' in found_struct}} maxTexture1D : int Maximum 1D texture size @@ -10322,11 +10676,6 @@ cdef class cudaDeviceProp: maxTexture1DMipmap : int Maximum 1D mipmapped texture size {{endif}} - {{if 'cudaDeviceProp.maxTexture1DLinear' in found_struct}} - maxTexture1DLinear : int - Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() - or cuDeviceGetTexture1DLinearMaxWidth() instead. 
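cudaPointerAttributes, a few hunks up, also picks up a reserved tail ("Must be zero"); querying it is unchanged. A minimal sketch, assuming a live CUDA context and the usual error-first return convention:

    from cuda.bindings import runtime

    err, d_ptr = runtime.cudaMalloc(256)
    err, attrs = runtime.cudaPointerGetAttributes(d_ptr)
    # type / device / devicePointer are the pre-existing fields; the new
    # reserved array is padding and should read back as zeros.
    print(attrs.type, attrs.device, attrs.devicePointer)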
- {{endif}} {{if 'cudaDeviceProp.maxTexture2D' in found_struct}} maxTexture2D : List[int] Maximum 2D texture dimensions @@ -10433,10 +10782,6 @@ cdef class cudaDeviceProp: unifiedAddressing : int Device shares a unified address space with the host {{endif}} - {{if 'cudaDeviceProp.memoryClockRate' in found_struct}} - memoryClockRate : int - Deprecated, Peak memory clock frequency in kilohertz - {{endif}} {{if 'cudaDeviceProp.memoryBusWidth' in found_struct}} memoryBusWidth : int Global memory bus width in bits @@ -10491,11 +10836,6 @@ cdef class cudaDeviceProp: Link between the device and the host supports native atomic operations {{endif}} - {{if 'cudaDeviceProp.singleToDoublePrecisionPerfRatio' in found_struct}} - singleToDoublePrecisionPerfRatio : int - Deprecated, Ratio of single precision performance (in floating- - point operations per second) to double precision performance - {{endif}} {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}} pageableMemoryAccess : int Device supports coherently accessing pageable memory without @@ -10520,10 +10860,6 @@ cdef class cudaDeviceProp: Device supports launching cooperative kernels via cudaLaunchCooperativeKernel {{endif}} - {{if 'cudaDeviceProp.cooperativeMultiDeviceLaunch' in found_struct}} - cooperativeMultiDeviceLaunch : int - Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. - {{endif}} {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}} sharedMemPerBlockOptin : size_t Per device maximum shared memory per block usable by special opt in @@ -10607,6 +10943,38 @@ cdef class cudaDeviceProp: unifiedFunctionPointers : int Indicates device supports unified pointers {{endif}} + {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}} + deviceNumaConfig : int + NUMA configuration of a device: value is of type + cudaDeviceNumaConfig enum + {{endif}} + {{if 'cudaDeviceProp.deviceNumaId' in found_struct}} + deviceNumaId : int + NUMA node ID of the GPU memory + {{endif}} + {{if 'cudaDeviceProp.mpsEnabled' in found_struct}} + mpsEnabled : int + Indicates if contexts created on this device will be shared via MPS + {{endif}} + {{if 'cudaDeviceProp.hostNumaId' in found_struct}} + hostNumaId : int + NUMA ID of the host node closest to the device or -1 when system + does not support NUMA + {{endif}} + {{if 'cudaDeviceProp.gpuPciDeviceID' in found_struct}} + gpuPciDeviceID : unsigned int + The combined 16-bit PCI device ID and 16-bit PCI vendor ID + {{endif}} + {{if 'cudaDeviceProp.gpuPciSubsystemID' in found_struct}} + gpuPciSubsystemID : unsigned int + The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem + vendor ID + {{endif}} + {{if 'cudaDeviceProp.hostNumaMultinodeIpcSupported' in found_struct}} + hostNumaMultinodeIpcSupported : int + 1 if the device supports HostNuma location IPC between nodes in a + multi-node system. 
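With the deprecated cudaDeviceProp members (clockRate, memoryClockRate, kernelExecTimeoutEnabled, computeMode, deviceOverlap, and friends) dropped by this rebase, code that read them should move to cudaDeviceGetAttribute; the surviving struct fields, plus the new NUMA and PCI ones where the toolkit defines them, still come from cudaGetDeviceProperties. A sketch under the error-first convention:

    from cuda.bindings import runtime

    err, prop = runtime.cudaGetDeviceProperties(0)
    print(prop.name, prop.multiProcessorCount, prop.hostNumaId)

    # Attribute queries replace the removed struct fields:
    err, clock_khz = runtime.cudaDeviceGetAttribute(
        runtime.cudaDeviceAttr.cudaDevAttrClockRate, 0)
    err, timeout_enabled = runtime.cudaDeviceGetAttribute(
        runtime.cudaDeviceAttr.cudaDevAttrKernelExecTimeout, 0)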
+ {{endif}} {{if 'cudaDeviceProp.reserved' in found_struct}} reserved : List[int] Reserved for future use @@ -10706,12 +11074,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['maxGridSize : '] {{endif}} - {{if 'cudaDeviceProp.clockRate' in found_struct}} - try: - str_list += ['clockRate : ' + str(self.clockRate)] - except ValueError: - str_list += ['clockRate : '] - {{endif}} {{if 'cudaDeviceProp.totalConstMem' in found_struct}} try: str_list += ['totalConstMem : ' + str(self.totalConstMem)] @@ -10742,24 +11104,12 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['texturePitchAlignment : '] {{endif}} - {{if 'cudaDeviceProp.deviceOverlap' in found_struct}} - try: - str_list += ['deviceOverlap : ' + str(self.deviceOverlap)] - except ValueError: - str_list += ['deviceOverlap : '] - {{endif}} {{if 'cudaDeviceProp.multiProcessorCount' in found_struct}} try: str_list += ['multiProcessorCount : ' + str(self.multiProcessorCount)] except ValueError: str_list += ['multiProcessorCount : '] {{endif}} - {{if 'cudaDeviceProp.kernelExecTimeoutEnabled' in found_struct}} - try: - str_list += ['kernelExecTimeoutEnabled : ' + str(self.kernelExecTimeoutEnabled)] - except ValueError: - str_list += ['kernelExecTimeoutEnabled : '] - {{endif}} {{if 'cudaDeviceProp.integrated' in found_struct}} try: str_list += ['integrated : ' + str(self.integrated)] @@ -10772,12 +11122,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['canMapHostMemory : '] {{endif}} - {{if 'cudaDeviceProp.computeMode' in found_struct}} - try: - str_list += ['computeMode : ' + str(self.computeMode)] - except ValueError: - str_list += ['computeMode : '] - {{endif}} {{if 'cudaDeviceProp.maxTexture1D' in found_struct}} try: str_list += ['maxTexture1D : ' + str(self.maxTexture1D)] @@ -10790,12 +11134,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['maxTexture1DMipmap : '] {{endif}} - {{if 'cudaDeviceProp.maxTexture1DLinear' in found_struct}} - try: - str_list += ['maxTexture1DLinear : ' + str(self.maxTexture1DLinear)] - except ValueError: - str_list += ['maxTexture1DLinear : '] - {{endif}} {{if 'cudaDeviceProp.maxTexture2D' in found_struct}} try: str_list += ['maxTexture2D : ' + str(self.maxTexture2D)] @@ -10952,12 +11290,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['unifiedAddressing : '] {{endif}} - {{if 'cudaDeviceProp.memoryClockRate' in found_struct}} - try: - str_list += ['memoryClockRate : ' + str(self.memoryClockRate)] - except ValueError: - str_list += ['memoryClockRate : '] - {{endif}} {{if 'cudaDeviceProp.memoryBusWidth' in found_struct}} try: str_list += ['memoryBusWidth : ' + str(self.memoryBusWidth)] @@ -11036,12 +11368,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['hostNativeAtomicSupported : '] {{endif}} - {{if 'cudaDeviceProp.singleToDoublePrecisionPerfRatio' in found_struct}} - try: - str_list += ['singleToDoublePrecisionPerfRatio : ' + str(self.singleToDoublePrecisionPerfRatio)] - except ValueError: - str_list += ['singleToDoublePrecisionPerfRatio : '] - {{endif}} {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}} try: str_list += ['pageableMemoryAccess : ' + str(self.pageableMemoryAccess)] @@ -11072,12 +11398,6 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['cooperativeLaunch : '] {{endif}} - {{if 'cudaDeviceProp.cooperativeMultiDeviceLaunch' in found_struct}} - try: - str_list += ['cooperativeMultiDeviceLaunch : ' + str(self.cooperativeMultiDeviceLaunch)] - except ValueError: - str_list += 
['cooperativeMultiDeviceLaunch : '] - {{endif}} {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}} try: str_list += ['sharedMemPerBlockOptin : ' + str(self.sharedMemPerBlockOptin)] @@ -11192,6 +11512,48 @@ cdef class cudaDeviceProp: except ValueError: str_list += ['unifiedFunctionPointers : '] {{endif}} + {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}} + try: + str_list += ['deviceNumaConfig : ' + str(self.deviceNumaConfig)] + except ValueError: + str_list += ['deviceNumaConfig : '] + {{endif}} + {{if 'cudaDeviceProp.deviceNumaId' in found_struct}} + try: + str_list += ['deviceNumaId : ' + str(self.deviceNumaId)] + except ValueError: + str_list += ['deviceNumaId : '] + {{endif}} + {{if 'cudaDeviceProp.mpsEnabled' in found_struct}} + try: + str_list += ['mpsEnabled : ' + str(self.mpsEnabled)] + except ValueError: + str_list += ['mpsEnabled : '] + {{endif}} + {{if 'cudaDeviceProp.hostNumaId' in found_struct}} + try: + str_list += ['hostNumaId : ' + str(self.hostNumaId)] + except ValueError: + str_list += ['hostNumaId : '] + {{endif}} + {{if 'cudaDeviceProp.gpuPciDeviceID' in found_struct}} + try: + str_list += ['gpuPciDeviceID : ' + str(self.gpuPciDeviceID)] + except ValueError: + str_list += ['gpuPciDeviceID : '] + {{endif}} + {{if 'cudaDeviceProp.gpuPciSubsystemID' in found_struct}} + try: + str_list += ['gpuPciSubsystemID : ' + str(self.gpuPciSubsystemID)] + except ValueError: + str_list += ['gpuPciSubsystemID : '] + {{endif}} + {{if 'cudaDeviceProp.hostNumaMultinodeIpcSupported' in found_struct}} + try: + str_list += ['hostNumaMultinodeIpcSupported : ' + str(self.hostNumaMultinodeIpcSupported)] + except ValueError: + str_list += ['hostNumaMultinodeIpcSupported : '] + {{endif}} {{if 'cudaDeviceProp.reserved' in found_struct}} try: str_list += ['reserved : ' + str(self.reserved)] @@ -11309,14 +11671,6 @@ cdef class cudaDeviceProp: def maxGridSize(self, maxGridSize): self._pvt_ptr[0].maxGridSize = maxGridSize {{endif}} - {{if 'cudaDeviceProp.clockRate' in found_struct}} - @property - def clockRate(self): - return self._pvt_ptr[0].clockRate - @clockRate.setter - def clockRate(self, int clockRate): - self._pvt_ptr[0].clockRate = clockRate - {{endif}} {{if 'cudaDeviceProp.totalConstMem' in found_struct}} @property def totalConstMem(self): @@ -11357,14 +11711,6 @@ cdef class cudaDeviceProp: def texturePitchAlignment(self, size_t texturePitchAlignment): self._pvt_ptr[0].texturePitchAlignment = texturePitchAlignment {{endif}} - {{if 'cudaDeviceProp.deviceOverlap' in found_struct}} - @property - def deviceOverlap(self): - return self._pvt_ptr[0].deviceOverlap - @deviceOverlap.setter - def deviceOverlap(self, int deviceOverlap): - self._pvt_ptr[0].deviceOverlap = deviceOverlap - {{endif}} {{if 'cudaDeviceProp.multiProcessorCount' in found_struct}} @property def multiProcessorCount(self): @@ -11373,14 +11719,6 @@ cdef class cudaDeviceProp: def multiProcessorCount(self, int multiProcessorCount): self._pvt_ptr[0].multiProcessorCount = multiProcessorCount {{endif}} - {{if 'cudaDeviceProp.kernelExecTimeoutEnabled' in found_struct}} - @property - def kernelExecTimeoutEnabled(self): - return self._pvt_ptr[0].kernelExecTimeoutEnabled - @kernelExecTimeoutEnabled.setter - def kernelExecTimeoutEnabled(self, int kernelExecTimeoutEnabled): - self._pvt_ptr[0].kernelExecTimeoutEnabled = kernelExecTimeoutEnabled - {{endif}} {{if 'cudaDeviceProp.integrated' in found_struct}} @property def integrated(self): @@ -11397,14 +11735,6 @@ cdef class cudaDeviceProp: def canMapHostMemory(self, int 
canMapHostMemory): self._pvt_ptr[0].canMapHostMemory = canMapHostMemory {{endif}} - {{if 'cudaDeviceProp.computeMode' in found_struct}} - @property - def computeMode(self): - return self._pvt_ptr[0].computeMode - @computeMode.setter - def computeMode(self, int computeMode): - self._pvt_ptr[0].computeMode = computeMode - {{endif}} {{if 'cudaDeviceProp.maxTexture1D' in found_struct}} @property def maxTexture1D(self): @@ -11421,14 +11751,6 @@ cdef class cudaDeviceProp: def maxTexture1DMipmap(self, int maxTexture1DMipmap): self._pvt_ptr[0].maxTexture1DMipmap = maxTexture1DMipmap {{endif}} - {{if 'cudaDeviceProp.maxTexture1DLinear' in found_struct}} - @property - def maxTexture1DLinear(self): - return self._pvt_ptr[0].maxTexture1DLinear - @maxTexture1DLinear.setter - def maxTexture1DLinear(self, int maxTexture1DLinear): - self._pvt_ptr[0].maxTexture1DLinear = maxTexture1DLinear - {{endif}} {{if 'cudaDeviceProp.maxTexture2D' in found_struct}} @property def maxTexture2D(self): @@ -11637,14 +11959,6 @@ cdef class cudaDeviceProp: def unifiedAddressing(self, int unifiedAddressing): self._pvt_ptr[0].unifiedAddressing = unifiedAddressing {{endif}} - {{if 'cudaDeviceProp.memoryClockRate' in found_struct}} - @property - def memoryClockRate(self): - return self._pvt_ptr[0].memoryClockRate - @memoryClockRate.setter - def memoryClockRate(self, int memoryClockRate): - self._pvt_ptr[0].memoryClockRate = memoryClockRate - {{endif}} {{if 'cudaDeviceProp.memoryBusWidth' in found_struct}} @property def memoryBusWidth(self): @@ -11749,14 +12063,6 @@ cdef class cudaDeviceProp: def hostNativeAtomicSupported(self, int hostNativeAtomicSupported): self._pvt_ptr[0].hostNativeAtomicSupported = hostNativeAtomicSupported {{endif}} - {{if 'cudaDeviceProp.singleToDoublePrecisionPerfRatio' in found_struct}} - @property - def singleToDoublePrecisionPerfRatio(self): - return self._pvt_ptr[0].singleToDoublePrecisionPerfRatio - @singleToDoublePrecisionPerfRatio.setter - def singleToDoublePrecisionPerfRatio(self, int singleToDoublePrecisionPerfRatio): - self._pvt_ptr[0].singleToDoublePrecisionPerfRatio = singleToDoublePrecisionPerfRatio - {{endif}} {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}} @property def pageableMemoryAccess(self): @@ -11797,14 +12103,6 @@ cdef class cudaDeviceProp: def cooperativeLaunch(self, int cooperativeLaunch): self._pvt_ptr[0].cooperativeLaunch = cooperativeLaunch {{endif}} - {{if 'cudaDeviceProp.cooperativeMultiDeviceLaunch' in found_struct}} - @property - def cooperativeMultiDeviceLaunch(self): - return self._pvt_ptr[0].cooperativeMultiDeviceLaunch - @cooperativeMultiDeviceLaunch.setter - def cooperativeMultiDeviceLaunch(self, int cooperativeMultiDeviceLaunch): - self._pvt_ptr[0].cooperativeMultiDeviceLaunch = cooperativeMultiDeviceLaunch - {{endif}} {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}} @property def sharedMemPerBlockOptin(self): @@ -11957,6 +12255,62 @@ cdef class cudaDeviceProp: def unifiedFunctionPointers(self, int unifiedFunctionPointers): self._pvt_ptr[0].unifiedFunctionPointers = unifiedFunctionPointers {{endif}} + {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}} + @property + def deviceNumaConfig(self): + return self._pvt_ptr[0].deviceNumaConfig + @deviceNumaConfig.setter + def deviceNumaConfig(self, int deviceNumaConfig): + self._pvt_ptr[0].deviceNumaConfig = deviceNumaConfig + {{endif}} + {{if 'cudaDeviceProp.deviceNumaId' in found_struct}} + @property + def deviceNumaId(self): + return self._pvt_ptr[0].deviceNumaId + @deviceNumaId.setter + def 
deviceNumaId(self, int deviceNumaId): + self._pvt_ptr[0].deviceNumaId = deviceNumaId + {{endif}} + {{if 'cudaDeviceProp.mpsEnabled' in found_struct}} + @property + def mpsEnabled(self): + return self._pvt_ptr[0].mpsEnabled + @mpsEnabled.setter + def mpsEnabled(self, int mpsEnabled): + self._pvt_ptr[0].mpsEnabled = mpsEnabled + {{endif}} + {{if 'cudaDeviceProp.hostNumaId' in found_struct}} + @property + def hostNumaId(self): + return self._pvt_ptr[0].hostNumaId + @hostNumaId.setter + def hostNumaId(self, int hostNumaId): + self._pvt_ptr[0].hostNumaId = hostNumaId + {{endif}} + {{if 'cudaDeviceProp.gpuPciDeviceID' in found_struct}} + @property + def gpuPciDeviceID(self): + return self._pvt_ptr[0].gpuPciDeviceID + @gpuPciDeviceID.setter + def gpuPciDeviceID(self, unsigned int gpuPciDeviceID): + self._pvt_ptr[0].gpuPciDeviceID = gpuPciDeviceID + {{endif}} + {{if 'cudaDeviceProp.gpuPciSubsystemID' in found_struct}} + @property + def gpuPciSubsystemID(self): + return self._pvt_ptr[0].gpuPciSubsystemID + @gpuPciSubsystemID.setter + def gpuPciSubsystemID(self, unsigned int gpuPciSubsystemID): + self._pvt_ptr[0].gpuPciSubsystemID = gpuPciSubsystemID + {{endif}} + {{if 'cudaDeviceProp.hostNumaMultinodeIpcSupported' in found_struct}} + @property + def hostNumaMultinodeIpcSupported(self): + return self._pvt_ptr[0].hostNumaMultinodeIpcSupported + @hostNumaMultinodeIpcSupported.setter + def hostNumaMultinodeIpcSupported(self, int hostNumaMultinodeIpcSupported): + self._pvt_ptr[0].hostNumaMultinodeIpcSupported = hostNumaMultinodeIpcSupported + {{endif}} {{if 'cudaDeviceProp.reserved' in found_struct}} @property def reserved(self): @@ -12149,7 +12503,7 @@ cdef class cudaMemFabricHandle_st: {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} -cdef class anon_struct7: +cdef class anon_struct8: """ Attributes ---------- @@ -12224,7 +12578,7 @@ cdef class anon_union2: {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} - win32 : anon_struct7 + win32 : anon_struct8 {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.nvSciBufObject' in found_struct}} @@ -12243,7 +12597,7 @@ cdef class anon_union2: def __init__(self, void_ptr _ptr): pass {{if 'cudaExternalMemoryHandleDesc.handle.win32' in found_struct}} - self._win32 = anon_struct7(_ptr=self._pvt_ptr) + self._win32 = anon_struct8(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -12286,8 +12640,8 @@ cdef class anon_union2: def win32(self): return self._win32 @win32.setter - def win32(self, win32 not None : anon_struct7): - string.memcpy(&self._pvt_ptr[0].handle.win32, win32.getPtr(), sizeof(self._pvt_ptr[0].handle.win32)) + def win32(self, win32 not None : anon_struct8): + string.memcpy(&self._pvt_ptr[0].handle.win32, win32.getPtr(), sizeof(self._pvt_ptr[0].handle.win32)) {{endif}} {{if 'cudaExternalMemoryHandleDesc.handle.nvSciBufObject' in found_struct}} @property @@ -12323,6 +12677,10 @@ cdef class cudaExternalMemoryHandleDesc: flags : unsigned int Flags must either be zero or cudaExternalMemoryDedicated {{endif}} + {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -12372,6 +12730,12 @@ cdef class cudaExternalMemoryHandleDesc: except ValueError: str_list += ['flags : '] {{endif}} + {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ 
-12409,6 +12773,14 @@ cdef class cudaExternalMemoryHandleDesc: def flags(self, unsigned int flags): self._pvt_ptr[0].flags = flags {{endif}} + {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaExternalMemoryBufferDesc' in found_struct}} @@ -12430,6 +12802,10 @@ cdef class cudaExternalMemoryBufferDesc: flags : unsigned int Flags reserved for future use. Must be zero. {{endif}} + {{if 'cudaExternalMemoryBufferDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -12468,6 +12844,12 @@ cdef class cudaExternalMemoryBufferDesc: except ValueError: str_list += ['flags : '] {{endif}} + {{if 'cudaExternalMemoryBufferDesc.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -12495,6 +12877,14 @@ cdef class cudaExternalMemoryBufferDesc: def flags(self, unsigned int flags): self._pvt_ptr[0].flags = flags {{endif}} + {{if 'cudaExternalMemoryBufferDesc.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaExternalMemoryMipmappedArrayDesc' in found_struct}} @@ -12526,6 +12916,10 @@ cdef class cudaExternalMemoryMipmappedArrayDesc: numLevels : unsigned int Total number of levels in the mipmap chain {{endif}} + {{if 'cudaExternalMemoryMipmappedArrayDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -12582,6 +12976,12 @@ cdef class cudaExternalMemoryMipmappedArrayDesc: except ValueError: str_list += ['numLevels : '] {{endif}} + {{if 'cudaExternalMemoryMipmappedArrayDesc.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -12625,10 +13025,18 @@ cdef class cudaExternalMemoryMipmappedArrayDesc: def numLevels(self, unsigned int numLevels): self._pvt_ptr[0].numLevels = numLevels {{endif}} + {{if 'cudaExternalMemoryMipmappedArrayDesc.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} -cdef class anon_struct8: +cdef class anon_struct9: """ Attributes ---------- @@ -12703,7 +13111,7 @@ cdef class anon_union3: {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} - win32 : anon_struct8 + win32 : anon_struct9 {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.nvSciSyncObj' in found_struct}} @@ -12722,7 +13130,7 @@ cdef class anon_union3: def __init__(self, void_ptr _ptr): pass {{if 'cudaExternalSemaphoreHandleDesc.handle.win32' in found_struct}} - self._win32 = anon_struct8(_ptr=self._pvt_ptr) + self._win32 = anon_struct9(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -12765,8 +13173,8 @@ cdef class anon_union3: def win32(self): return self._win32 @win32.setter - def win32(self, win32 not None : anon_struct8): - string.memcpy(&self._pvt_ptr[0].handle.win32, win32.getPtr(), 
sizeof(self._pvt_ptr[0].handle.win32)) + def win32(self, win32 not None : anon_struct9): + string.memcpy(&self._pvt_ptr[0].handle.win32, win32.getPtr(), sizeof(self._pvt_ptr[0].handle.win32)) {{endif}} {{if 'cudaExternalSemaphoreHandleDesc.handle.nvSciSyncObj' in found_struct}} @property @@ -12798,6 +13206,10 @@ cdef class cudaExternalSemaphoreHandleDesc: flags : unsigned int Flags reserved for the future. Must be zero. {{endif}} + {{if 'cudaExternalSemaphoreHandleDesc.reserved' in found_struct}} + reserved : List[unsigned int] + Must be zero + {{endif}} Methods ------- @@ -12841,6 +13253,12 @@ cdef class cudaExternalSemaphoreHandleDesc: except ValueError: str_list += ['flags : '] {{endif}} + {{if 'cudaExternalSemaphoreHandleDesc.reserved' in found_struct}} + try: + str_list += ['reserved : ' + str(self.reserved)] + except ValueError: + str_list += ['reserved : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -12870,10 +13288,18 @@ cdef class cudaExternalSemaphoreHandleDesc: def flags(self, unsigned int flags): self._pvt_ptr[0].flags = flags {{endif}} + {{if 'cudaExternalSemaphoreHandleDesc.reserved' in found_struct}} + @property + def reserved(self): + return self._pvt_ptr[0].reserved + @reserved.setter + def reserved(self, reserved): + self._pvt_ptr[0].reserved = reserved + {{endif}} {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} -cdef class anon_struct15: +cdef class anon_struct10: """ Attributes ---------- @@ -12919,7 +13345,7 @@ cdef class anon_struct15: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} -cdef class anon_union6: +cdef class anon_union4: """ Attributes ---------- @@ -12984,7 +13410,7 @@ cdef class anon_union6: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} -cdef class anon_struct16: +cdef class anon_struct11: """ Attributes ---------- @@ -13030,20 +13456,20 @@ cdef class anon_struct16: {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} -cdef class anon_struct17: +cdef class anon_struct12: """ Attributes ---------- {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} - fence : anon_struct15 + fence : anon_struct10 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} - nvSciSync : anon_union6 + nvSciSync : anon_union4 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} - keyedMutex : anon_struct16 + keyedMutex : anon_struct11 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.reserved' in found_struct}} @@ -13062,13 +13488,13 @@ cdef class anon_struct17: def __init__(self, void_ptr _ptr): pass {{if 'cudaExternalSemaphoreSignalParams.params.fence' in found_struct}} - self._fence = anon_struct15(_ptr=self._pvt_ptr) + self._fence = anon_struct10(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} - self._nvSciSync = anon_union6(_ptr=self._pvt_ptr) + self._nvSciSync = anon_union4(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} - self._keyedMutex = anon_struct16(_ptr=self._pvt_ptr) + self._keyedMutex = anon_struct11(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -13109,24 +13535,24 @@ cdef class anon_struct17: def fence(self): return self._fence @fence.setter - def fence(self, fence not None : anon_struct15): - string.memcpy(&self._pvt_ptr[0].params.fence, fence.getPtr(), sizeof(self._pvt_ptr[0].params.fence)) + def 
fence(self, fence not None : anon_struct10): + string.memcpy(&self._pvt_ptr[0].params.fence, fence.getPtr(), sizeof(self._pvt_ptr[0].params.fence)) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.nvSciSync' in found_struct}} @property def nvSciSync(self): return self._nvSciSync @nvSciSync.setter - def nvSciSync(self, nvSciSync not None : anon_union6): - string.memcpy(&self._pvt_ptr[0].params.nvSciSync, nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync)) + def nvSciSync(self, nvSciSync not None : anon_union4): + string.memcpy(&self._pvt_ptr[0].params.nvSciSync, nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync)) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.keyedMutex' in found_struct}} @property def keyedMutex(self): return self._keyedMutex @keyedMutex.setter - def keyedMutex(self, keyedMutex not None : anon_struct16): - string.memcpy(&self._pvt_ptr[0].params.keyedMutex, keyedMutex.getPtr(), sizeof(self._pvt_ptr[0].params.keyedMutex)) + def keyedMutex(self, keyedMutex not None : anon_struct11): + string.memcpy(&self._pvt_ptr[0].params.keyedMutex, keyedMutex.getPtr(), sizeof(self._pvt_ptr[0].params.keyedMutex)) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.params.reserved' in found_struct}} @property @@ -13146,7 +13572,7 @@ cdef class cudaExternalSemaphoreSignalParams: Attributes ---------- {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} - params : anon_struct17 + params : anon_struct12 {{endif}} {{if 'cudaExternalSemaphoreSignalParams.flags' in found_struct}} @@ -13178,7 +13604,7 @@ cdef class cudaExternalSemaphoreSignalParams: def __init__(self, void_ptr _ptr = 0): pass {{if 'cudaExternalSemaphoreSignalParams.params' in found_struct}} - self._params = anon_struct17(_ptr=self._pvt_ptr) + self._params = anon_struct12(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -13213,8 +13639,8 @@ cdef class cudaExternalSemaphoreSignalParams: def params(self): return self._params @params.setter - def params(self, params not None : anon_struct17): - string.memcpy(&self._pvt_ptr[0].params, params.getPtr(), sizeof(self._pvt_ptr[0].params)) + def params(self, params not None : anon_struct12): + string.memcpy(&self._pvt_ptr[0].params, params.getPtr(), sizeof(self._pvt_ptr[0].params)) {{endif}} {{if 'cudaExternalSemaphoreSignalParams.flags' in found_struct}} @property @@ -13235,7 +13661,7 @@ cdef class cudaExternalSemaphoreSignalParams: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.fence' in found_struct}} -cdef class anon_struct18: +cdef class anon_struct13: """ Attributes ---------- @@ -13281,7 +13707,7 @@ cdef class anon_struct18: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} -cdef class anon_union7: +cdef class anon_union5: """ Attributes ---------- @@ -13346,7 +13772,7 @@ cdef class anon_union7: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} -cdef class anon_struct19: +cdef class anon_struct14: """ Attributes ---------- @@ -13410,20 +13836,20 @@ cdef class anon_struct19: {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} -cdef class anon_struct20: +cdef class anon_struct15: """ Attributes ---------- {{if 'cudaExternalSemaphoreWaitParams.params.fence' in found_struct}} - fence : anon_struct18 + fence : anon_struct13 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} - nvSciSync : anon_union7 + nvSciSync : anon_union5 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in 
found_struct}} - keyedMutex : anon_struct19 + keyedMutex : anon_struct14 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.reserved' in found_struct}} @@ -13442,13 +13868,13 @@ cdef class anon_struct20: def __init__(self, void_ptr _ptr): pass {{if 'cudaExternalSemaphoreWaitParams.params.fence' in found_struct}} - self._fence = anon_struct18(_ptr=self._pvt_ptr) + self._fence = anon_struct13(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} - self._nvSciSync = anon_union7(_ptr=self._pvt_ptr) + self._nvSciSync = anon_union5(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} - self._keyedMutex = anon_struct19(_ptr=self._pvt_ptr) + self._keyedMutex = anon_struct14(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -13489,24 +13915,24 @@ cdef class anon_struct20: def fence(self): return self._fence @fence.setter - def fence(self, fence not None : anon_struct18): - string.memcpy(&self._pvt_ptr[0].params.fence, fence.getPtr(), sizeof(self._pvt_ptr[0].params.fence)) + def fence(self, fence not None : anon_struct13): + string.memcpy(&self._pvt_ptr[0].params.fence, fence.getPtr(), sizeof(self._pvt_ptr[0].params.fence)) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.nvSciSync' in found_struct}} @property def nvSciSync(self): return self._nvSciSync @nvSciSync.setter - def nvSciSync(self, nvSciSync not None : anon_union7): - string.memcpy(&self._pvt_ptr[0].params.nvSciSync, nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync)) + def nvSciSync(self, nvSciSync not None : anon_union5): + string.memcpy(&self._pvt_ptr[0].params.nvSciSync, nvSciSync.getPtr(), sizeof(self._pvt_ptr[0].params.nvSciSync)) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.keyedMutex' in found_struct}} @property def keyedMutex(self): return self._keyedMutex @keyedMutex.setter - def keyedMutex(self, keyedMutex not None : anon_struct19): - string.memcpy(&self._pvt_ptr[0].params.keyedMutex, keyedMutex.getPtr(), sizeof(self._pvt_ptr[0].params.keyedMutex)) + def keyedMutex(self, keyedMutex not None : anon_struct14): + string.memcpy(&self._pvt_ptr[0].params.keyedMutex, keyedMutex.getPtr(), sizeof(self._pvt_ptr[0].params.keyedMutex)) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.params.reserved' in found_struct}} @property @@ -13526,7 +13952,7 @@ cdef class cudaExternalSemaphoreWaitParams: Attributes ---------- {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} - params : anon_struct20 + params : anon_struct15 {{endif}} {{if 'cudaExternalSemaphoreWaitParams.flags' in found_struct}} @@ -13558,7 +13984,7 @@ cdef class cudaExternalSemaphoreWaitParams: def __init__(self, void_ptr _ptr = 0): pass {{if 'cudaExternalSemaphoreWaitParams.params' in found_struct}} - self._params = anon_struct20(_ptr=self._pvt_ptr) + self._params = anon_struct15(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -13593,8 +14019,8 @@ cdef class cudaExternalSemaphoreWaitParams: def params(self): return self._params @params.setter - def params(self, params not None : anon_struct20): - string.memcpy(&self._pvt_ptr[0].params, params.getPtr(), sizeof(self._pvt_ptr[0].params)) + def params(self, params not None : anon_struct15): + string.memcpy(&self._pvt_ptr[0].params, params.getPtr(), sizeof(self._pvt_ptr[0].params)) {{endif}} {{if 'cudaExternalSemaphoreWaitParams.flags' in found_struct}} @property @@ -14533,7 +14959,7 @@ cdef class cudaConditionalNodeParams: {{if 'cudaConditionalNodeParams.size' in 
found_struct}} size : unsigned int Size of graph output array. Allowed values are 1 for - cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeWhile, or any + cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeIf, or any value greater than zero for cudaGraphCondTypeSwitch. {{endif}} {{if 'cudaConditionalNodeParams.phGraph_out' in found_struct}} @@ -15591,7 +16017,7 @@ cdef class cudaGraphExecUpdateResultInfo_st: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} -cdef class anon_struct21: +cdef class anon_struct16: """ Attributes ---------- @@ -15674,7 +16100,7 @@ cdef class anon_struct21: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} -cdef class anon_union9: +cdef class anon_union7: """ Attributes ---------- @@ -15683,7 +16109,7 @@ cdef class anon_union9: {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} - param : anon_struct21 + param : anon_struct16 {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.isEnabled' in found_struct}} @@ -15705,7 +16131,7 @@ cdef class anon_union9: self._gridDim = dim3(_ptr=&self._pvt_ptr[0].updateData.gridDim) {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}} - self._param = anon_struct21(_ptr=self._pvt_ptr) + self._param = anon_struct16(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -15748,8 +16174,8 @@ cdef class anon_union9: def param(self): return self._param @param.setter - def param(self, param not None : anon_struct21): - string.memcpy(&self._pvt_ptr[0].updateData.param, param.getPtr(), sizeof(self._pvt_ptr[0].updateData.param)) + def param(self, param not None : anon_struct16): + string.memcpy(&self._pvt_ptr[0].updateData.param, param.getPtr(), sizeof(self._pvt_ptr[0].updateData.param)) {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData.isEnabled' in found_struct}} @property @@ -15779,7 +16205,7 @@ cdef class cudaGraphKernelNodeUpdate: interpreted {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - updateData : anon_union9 + updateData : anon_union7 Update data to apply. 
Which field is used depends on field's value {{endif}} @@ -15800,7 +16226,7 @@ cdef class cudaGraphKernelNodeUpdate: self._node = cudaGraphDeviceNode_t(_ptr=&self._pvt_ptr[0].node) {{endif}} {{if 'cudaGraphKernelNodeUpdate.updateData' in found_struct}} - self._updateData = anon_union9(_ptr=self._pvt_ptr) + self._updateData = anon_union7(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -15863,8 +16289,8 @@ cdef class cudaGraphKernelNodeUpdate: def updateData(self): return self._updateData @updateData.setter - def updateData(self, updateData not None : anon_union9): - string.memcpy(&self._pvt_ptr[0].updateData, updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData)) + def updateData(self, updateData not None : anon_union7): + string.memcpy(&self._pvt_ptr[0].updateData, updateData.getPtr(), sizeof(self._pvt_ptr[0].updateData)) {{endif}} {{endif}} {{if 'cudaLaunchMemSyncDomainMap_st' in found_struct}} @@ -15943,7 +16369,7 @@ cdef class cudaLaunchMemSyncDomainMap_st: {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} -cdef class anon_struct22: +cdef class anon_struct17: """ Attributes ---------- @@ -16025,7 +16451,7 @@ cdef class anon_struct22: {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} -cdef class anon_struct23: +cdef class anon_struct18: """ Attributes ---------- @@ -16119,7 +16545,7 @@ cdef class anon_struct23: {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} -cdef class anon_struct24: +cdef class anon_struct19: """ Attributes ---------- @@ -16201,7 +16627,7 @@ cdef class anon_struct24: {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} -cdef class anon_struct25: +cdef class anon_struct20: """ Attributes ---------- @@ -16277,7 +16703,7 @@ cdef class anon_struct25: {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} -cdef class anon_struct26: +cdef class anon_struct21: """ Attributes ---------- @@ -16379,7 +16805,7 @@ cdef class cudaLaunchAttributeValue: ::cudaSynchronizationPolicy for work queued up in this stream. {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - clusterDim : anon_struct22 + clusterDim : anon_struct17 Value of launch attribute cudaLaunchAttributeClusterDimension that represents the desired cluster dimensions for the kernel. Opaque type with the following fields: - `x` - The X dimension of the @@ -16400,7 +16826,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchAttributeProgrammaticStreamSerialization. {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - programmaticEvent : anon_struct23 + programmaticEvent : anon_struct18 Value of launch attribute cudaLaunchAttributeProgrammaticEvent with the following fields: - `cudaEvent_t` event - Event to fire when all blocks trigger it. - `int` flags; - Event record flags, see @@ -16424,7 +16850,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchMemSyncDomain. {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - preferredClusterDim : anon_struct24 + preferredClusterDim : anon_struct19 Value of launch attribute cudaLaunchAttributePreferredClusterDimension that represents the desired preferred cluster dimensions for the kernel. Opaque type @@ -16439,7 +16865,7 @@ cdef class cudaLaunchAttributeValue: cudaLaunchAttributeValue::clusterDim. 
{{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - launchCompletionEvent : anon_struct25 + launchCompletionEvent : anon_struct20 Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent with the following fields: - `cudaEvent_t` event - Event to fire when the last block launches. - `int` flags - Event record @@ -16447,7 +16873,7 @@ cdef class cudaLaunchAttributeValue: cudaEventRecordExternal. {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - deviceUpdatableKernelNode : anon_struct26 + deviceUpdatableKernelNode : anon_struct21 Value of launch attribute cudaLaunchAttributeDeviceUpdatableKernelNode with the following fields: - `int` deviceUpdatable - Whether or not the resulting @@ -16460,6 +16886,11 @@ cdef class cudaLaunchAttributeValue: Value of launch attribute cudaLaunchAttributePreferredSharedMemoryCarveout. {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + nvlinkUtilCentricScheduling : unsigned int + Value of launch attribute + cudaLaunchAttributeNvlinkUtilCentricScheduling. + {{endif}} Methods ------- @@ -16477,22 +16908,22 @@ cdef class cudaLaunchAttributeValue: self._accessPolicyWindow = cudaAccessPolicyWindow(_ptr=&self._pvt_ptr[0].accessPolicyWindow) {{endif}} {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}} - self._clusterDim = anon_struct22(_ptr=self._pvt_ptr) + self._clusterDim = anon_struct17(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}} - self._programmaticEvent = anon_struct23(_ptr=self._pvt_ptr) + self._programmaticEvent = anon_struct18(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}} self._memSyncDomainMap = cudaLaunchMemSyncDomainMap(_ptr=&self._pvt_ptr[0].memSyncDomainMap) {{endif}} {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}} - self._preferredClusterDim = anon_struct24(_ptr=self._pvt_ptr) + self._preferredClusterDim = anon_struct19(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} - self._launchCompletionEvent = anon_struct25(_ptr=self._pvt_ptr) + self._launchCompletionEvent = anon_struct20(_ptr=self._pvt_ptr) {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} - self._deviceUpdatableKernelNode = anon_struct26(_ptr=self._pvt_ptr) + self._deviceUpdatableKernelNode = anon_struct21(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -16591,6 +17022,12 @@ cdef class cudaLaunchAttributeValue: except ValueError: str_list += ['sharedMemCarveout : '] {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + try: + str_list += ['nvlinkUtilCentricScheduling : ' + str(self.nvlinkUtilCentricScheduling)] + except ValueError: + str_list += ['nvlinkUtilCentricScheduling : '] + {{endif}} return '\n'.join(str_list) else: return '' @@ -16644,8 +17081,8 @@ cdef class cudaLaunchAttributeValue: def clusterDim(self): return self._clusterDim @clusterDim.setter - def clusterDim(self, clusterDim not None : anon_struct22): - string.memcpy(&self._pvt_ptr[0].clusterDim, clusterDim.getPtr(), sizeof(self._pvt_ptr[0].clusterDim)) + def clusterDim(self, clusterDim not None : anon_struct17): + string.memcpy(&self._pvt_ptr[0].clusterDim, clusterDim.getPtr(), sizeof(self._pvt_ptr[0].clusterDim)) {{endif}} {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}} @property @@ -16670,8 +17107,8 @@ 
cdef class cudaLaunchAttributeValue: def programmaticEvent(self): return self._programmaticEvent @programmaticEvent.setter - def programmaticEvent(self, programmaticEvent not None : anon_struct23): - string.memcpy(&self._pvt_ptr[0].programmaticEvent, programmaticEvent.getPtr(), sizeof(self._pvt_ptr[0].programmaticEvent)) + def programmaticEvent(self, programmaticEvent not None : anon_struct18): + string.memcpy(&self._pvt_ptr[0].programmaticEvent, programmaticEvent.getPtr(), sizeof(self._pvt_ptr[0].programmaticEvent)) {{endif}} {{if 'cudaLaunchAttributeValue.priority' in found_struct}} @property @@ -16704,24 +17141,24 @@ cdef class cudaLaunchAttributeValue: def preferredClusterDim(self): return self._preferredClusterDim @preferredClusterDim.setter - def preferredClusterDim(self, preferredClusterDim not None : anon_struct24): - string.memcpy(&self._pvt_ptr[0].preferredClusterDim, preferredClusterDim.getPtr(), sizeof(self._pvt_ptr[0].preferredClusterDim)) + def preferredClusterDim(self, preferredClusterDim not None : anon_struct19): + string.memcpy(&self._pvt_ptr[0].preferredClusterDim, preferredClusterDim.getPtr(), sizeof(self._pvt_ptr[0].preferredClusterDim)) {{endif}} {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}} @property def launchCompletionEvent(self): return self._launchCompletionEvent @launchCompletionEvent.setter - def launchCompletionEvent(self, launchCompletionEvent not None : anon_struct25): - string.memcpy(&self._pvt_ptr[0].launchCompletionEvent, launchCompletionEvent.getPtr(), sizeof(self._pvt_ptr[0].launchCompletionEvent)) + def launchCompletionEvent(self, launchCompletionEvent not None : anon_struct20): + string.memcpy(&self._pvt_ptr[0].launchCompletionEvent, launchCompletionEvent.getPtr(), sizeof(self._pvt_ptr[0].launchCompletionEvent)) {{endif}} {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}} @property def deviceUpdatableKernelNode(self): return self._deviceUpdatableKernelNode @deviceUpdatableKernelNode.setter - def deviceUpdatableKernelNode(self, deviceUpdatableKernelNode not None : anon_struct26): - string.memcpy(&self._pvt_ptr[0].deviceUpdatableKernelNode, deviceUpdatableKernelNode.getPtr(), sizeof(self._pvt_ptr[0].deviceUpdatableKernelNode)) + def deviceUpdatableKernelNode(self, deviceUpdatableKernelNode not None : anon_struct21): + string.memcpy(&self._pvt_ptr[0].deviceUpdatableKernelNode, deviceUpdatableKernelNode.getPtr(), sizeof(self._pvt_ptr[0].deviceUpdatableKernelNode)) {{endif}} {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}} @property @@ -16731,6 +17168,14 @@ cdef class cudaLaunchAttributeValue: def sharedMemCarveout(self, unsigned int sharedMemCarveout): self._pvt_ptr[0].sharedMemCarveout = sharedMemCarveout {{endif}} + {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}} + @property + def nvlinkUtilCentricScheduling(self): + return self._pvt_ptr[0].nvlinkUtilCentricScheduling + @nvlinkUtilCentricScheduling.setter + def nvlinkUtilCentricScheduling(self, unsigned int nvlinkUtilCentricScheduling): + self._pvt_ptr[0].nvlinkUtilCentricScheduling = nvlinkUtilCentricScheduling + {{endif}} {{endif}} {{if 'cudaLaunchAttribute_st' in found_struct}} @@ -16807,7 +17252,7 @@ cdef class cudaLaunchAttribute_st: {{endif}} {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} -cdef class anon_struct27: +cdef class anon_struct22: """ Attributes ---------- @@ -16853,12 +17298,12 @@ cdef class anon_struct27: {{endif}} {{if 'cudaAsyncNotificationInfo.info' in 
found_struct}} -cdef class anon_union10: +cdef class anon_union8: """ Attributes ---------- {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} - overBudget : anon_struct27 + overBudget : anon_struct22 {{endif}} @@ -16873,7 +17318,7 @@ cdef class anon_union10: def __init__(self, void_ptr _ptr): pass {{if 'cudaAsyncNotificationInfo.info.overBudget' in found_struct}} - self._overBudget = anon_struct27(_ptr=self._pvt_ptr) + self._overBudget = anon_struct22(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): pass @@ -16896,8 +17341,8 @@ cdef class anon_union10: def overBudget(self): return self._overBudget @overBudget.setter - def overBudget(self, overBudget not None : anon_struct27): - string.memcpy(&self._pvt_ptr[0].info.overBudget, overBudget.getPtr(), sizeof(self._pvt_ptr[0].info.overBudget)) + def overBudget(self, overBudget not None : anon_struct22): + string.memcpy(&self._pvt_ptr[0].info.overBudget, overBudget.getPtr(), sizeof(self._pvt_ptr[0].info.overBudget)) {{endif}} {{endif}} {{if 'cudaAsyncNotificationInfo' in found_struct}} @@ -16913,7 +17358,7 @@ cdef class cudaAsyncNotificationInfo: The type of notification being sent {{endif}} {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - info : anon_union10 + info : anon_union8 Information about the notification. `typename` must be checked in order to interpret this field. {{endif}} @@ -16932,7 +17377,7 @@ cdef class cudaAsyncNotificationInfo: def __init__(self, void_ptr _ptr = 0): pass {{if 'cudaAsyncNotificationInfo.info' in found_struct}} - self._info = anon_union10(_ptr=self._pvt_ptr) + self._info = anon_union8(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -16972,8 +17417,8 @@ cdef class cudaAsyncNotificationInfo: def info(self): return self._info @info.setter - def info(self, info not None : anon_union10): - string.memcpy(&self._pvt_ptr[0].info, info.getPtr(), sizeof(self._pvt_ptr[0].info)) + def info(self, info not None : anon_union8): + string.memcpy(&self._pvt_ptr[0].info, info.getPtr(), sizeof(self._pvt_ptr[0].info)) {{endif}} {{endif}} {{if 'cudaTextureDesc' in found_struct}} @@ -17412,7 +17857,7 @@ cdef class cudaEglPlaneDesc_st: {{endif}} {{if True}} -cdef class anon_union11: +cdef class anon_union9: """ Attributes ---------- @@ -17502,7 +17947,7 @@ cdef class cudaEglFrame_st: Attributes ---------- {{if True}} - frame : anon_union11 + frame : anon_union9 {{endif}} {{if True}} @@ -17536,7 +17981,7 @@ cdef class cudaEglFrame_st: def __init__(self, void_ptr _ptr = 0): pass {{if True}} - self._frame = anon_union11(_ptr=self._pvt_ptr) + self._frame = anon_union9(_ptr=self._pvt_ptr) {{endif}} def __dealloc__(self): if self._val_ptr is not NULL: @@ -17584,8 +18029,8 @@ cdef class cudaEglFrame_st: def frame(self): return self._frame @frame.setter - def frame(self, frame not None : anon_union11): - string.memcpy(&self._pvt_ptr[0].frame, frame.getPtr(), sizeof(self._pvt_ptr[0].frame)) + def frame(self, frame not None : anon_union9): + string.memcpy(&self._pvt_ptr[0].frame, frame.getPtr(), sizeof(self._pvt_ptr[0].frame)) {{endif}} {{if True}} @property @@ -17661,6 +18106,34 @@ cdef class cudaGraphConditionalHandle: return self._pvt_ptr {{endif}} +{{if 'cudaLogIterator' in found_types}} + +cdef class cudaLogIterator: + """ + + Methods + ------- + getPtr() + Get memory address of class instance + + """ + def __cinit__(self, unsigned int init_value = 0, void_ptr _ptr = 0): + if _ptr == 0: + self._pvt_ptr = &self._pvt_val + else: + self._pvt_ptr = _ptr + if init_value: + 
self._pvt_ptr[0] = init_value + def __dealloc__(self): + pass + def __repr__(self): + return '' + def __int__(self): + return self._pvt_ptr[0] + def getPtr(self): + return self._pvt_ptr +{{endif}} + {{if 'cudaSurfaceObject_t' in found_types}} cdef class cudaSurfaceObject_t: @@ -19098,326 +19571,29 @@ def cudaGetDeviceCount(): return (_dict_cudaError_t[err], count) {{endif}} -{{if 'cudaGetDeviceProperties_v2' in found_functions}} +{{if 'cudaGetDeviceProperties' in found_functions}} @cython.embedsignature(True) def cudaGetDeviceProperties(int device): """ Returns information about the compute-device. - Returns in `*prop` the properties of device `dev`. The - :py:obj:`~.cudaDeviceProp` structure is defined as: - - **View CUDA Toolkit Documentation for a C++ code example** - - where: - - - :py:obj:`~.name[256]` is an ASCII string identifying the device. - - - :py:obj:`~.uuid` is a 16-byte unique identifier. - - - :py:obj:`~.totalGlobalMem` is the total amount of global memory - available on the device in bytes. - - - :py:obj:`~.sharedMemPerBlock` is the maximum amount of shared memory - available to a thread block in bytes. - - - :py:obj:`~.regsPerBlock` is the maximum number of 32-bit registers - available to a thread block. - - - :py:obj:`~.warpSize` is the warp size in threads. - - - :py:obj:`~.memPitch` is the maximum pitch in bytes allowed by the - memory copy functions that involve memory regions allocated through - :py:obj:`~.cudaMallocPitch()`. - - - :py:obj:`~.maxThreadsPerBlock` is the maximum number of threads per - block. - - - :py:obj:`~.maxThreadsDim[3]` contains the maximum size of each - dimension of a block. - - - :py:obj:`~.maxGridSize[3]` contains the maximum size of each - dimension of a grid. - - - :py:obj:`~.clockRate` is the clock frequency in kilohertz. - - - :py:obj:`~.totalConstMem` is the total amount of constant memory - available on the device in bytes. - - - :py:obj:`~.major`, :py:obj:`~.minor` are the major and minor revision - numbers defining the device's compute capability. - - - :py:obj:`~.textureAlignment` is the alignment requirement; texture - base addresses that are aligned to :py:obj:`~.textureAlignment` bytes - do not need an offset applied to texture fetches. - - - :py:obj:`~.texturePitchAlignment` is the pitch alignment requirement - for 2D texture references that are bound to pitched memory. - - - :py:obj:`~.deviceOverlap` is 1 if the device can concurrently copy - memory between host and device while executing a kernel, or 0 if not. - Deprecated, use instead asyncEngineCount. - - - :py:obj:`~.multiProcessorCount` is the number of multiprocessors on - the device. - - - :py:obj:`~.kernelExecTimeoutEnabled` is 1 if there is a run time - limit for kernels executed on the device, or 0 if not. - - - :py:obj:`~.integrated` is 1 if the device is an integrated - (motherboard) GPU and 0 if it is a discrete (card) component. - - - :py:obj:`~.canMapHostMemory` is 1 if the device can map host memory - into the CUDA address space for use with - :py:obj:`~.cudaHostAlloc()`/:py:obj:`~.cudaHostGetDevicePointer()`, - or 0 if not. - - - :py:obj:`~.computeMode` is the compute mode that the device is - currently in. Available modes are as follows: - - - cudaComputeModeDefault: Default mode - Device is not restricted and - multiple threads can use :py:obj:`~.cudaSetDevice()` with this - device. - - - cudaComputeModeProhibited: Compute-prohibited mode - No threads can - use :py:obj:`~.cudaSetDevice()` with this device. 
- - - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - - Many threads in one process will be able to use - :py:obj:`~.cudaSetDevice()` with this device. When an occupied - exclusive mode device is chosen with :py:obj:`~.cudaSetDevice`, all - subsequent non-device management runtime functions will return - :py:obj:`~.cudaErrorDevicesUnavailable`. - - - :py:obj:`~.maxTexture1D` is the maximum 1D texture size. - - - :py:obj:`~.maxTexture1DMipmap` is the maximum 1D mipmapped texture - texture size. - - - :py:obj:`~.maxTexture1DLinear` is the maximum 1D texture size for - textures bound to linear memory. - - - :py:obj:`~.maxTexture2D[2]` contains the maximum 2D texture - dimensions. - - - :py:obj:`~.maxTexture2DMipmap[2]` contains the maximum 2D mipmapped - texture dimensions. - - - :py:obj:`~.maxTexture2DLinear[3]` contains the maximum 2D texture - dimensions for 2D textures bound to pitch linear memory. - - - :py:obj:`~.maxTexture2DGather[2]` contains the maximum 2D texture - dimensions if texture gather operations have to be performed. - - - :py:obj:`~.maxTexture3D[3]` contains the maximum 3D texture - dimensions. - - - :py:obj:`~.maxTexture3DAlt[3]` contains the maximum alternate 3D - texture dimensions. - - - :py:obj:`~.maxTextureCubemap` is the maximum cubemap texture width or - height. - - - :py:obj:`~.maxTexture1DLayered[2]` contains the maximum 1D layered - texture dimensions. - - - :py:obj:`~.maxTexture2DLayered[3]` contains the maximum 2D layered - texture dimensions. - - - :py:obj:`~.maxTextureCubemapLayered[2]` contains the maximum cubemap - layered texture dimensions. - - - :py:obj:`~.maxSurface1D` is the maximum 1D surface size. - - - :py:obj:`~.maxSurface2D[2]` contains the maximum 2D surface - dimensions. - - - :py:obj:`~.maxSurface3D[3]` contains the maximum 3D surface - dimensions. - - - :py:obj:`~.maxSurface1DLayered[2]` contains the maximum 1D layered - surface dimensions. - - - :py:obj:`~.maxSurface2DLayered[3]` contains the maximum 2D layered - surface dimensions. - - - :py:obj:`~.maxSurfaceCubemap` is the maximum cubemap surface width or - height. - - - :py:obj:`~.maxSurfaceCubemapLayered[2]` contains the maximum cubemap - layered surface dimensions. - - - :py:obj:`~.surfaceAlignment` specifies the alignment requirements for - surfaces. - - - :py:obj:`~.concurrentKernels` is 1 if the device supports executing - multiple kernels within the same context simultaneously, or 0 if not. - It is not guaranteed that multiple kernels will be resident on the - device concurrently so this feature should not be relied upon for - correctness. - - - :py:obj:`~.ECCEnabled` is 1 if the device has ECC support turned on, - or 0 if not. - - - :py:obj:`~.pciBusID` is the PCI bus identifier of the device. - - - :py:obj:`~.pciDeviceID` is the PCI device (sometimes called slot) - identifier of the device. - - - :py:obj:`~.pciDomainID` is the PCI domain identifier of the device. - - - :py:obj:`~.tccDriver` is 1 if the device is using a TCC driver or 0 - if not. - - - :py:obj:`~.asyncEngineCount` is 1 when the device can concurrently - copy memory between host and device while executing a kernel. It is 2 - when the device can concurrently copy memory between host and device - in both directions and execute a kernel at the same time. It is 0 if - neither of these is supported. - - - :py:obj:`~.unifiedAddressing` is 1 if the device shares a unified - address space with the host and 0 otherwise. - - - :py:obj:`~.memoryClockRate` is the peak memory clock frequency in - kilohertz. 
- - - :py:obj:`~.memoryBusWidth` is the memory bus width in bits. - - - :py:obj:`~.l2CacheSize` is L2 cache size in bytes. - - - :py:obj:`~.persistingL2CacheMaxSize` is L2 cache's maximum persisting - lines size in bytes. - - - :py:obj:`~.maxThreadsPerMultiProcessor` is the number of maximum - resident threads per multiprocessor. - - - :py:obj:`~.streamPrioritiesSupported` is 1 if the device supports - stream priorities, or 0 if it is not supported. - - - :py:obj:`~.globalL1CacheSupported` is 1 if the device supports - caching of globals in L1 cache, or 0 if it is not supported. - - - :py:obj:`~.localL1CacheSupported` is 1 if the device supports caching - of locals in L1 cache, or 0 if it is not supported. - - - :py:obj:`~.sharedMemPerMultiprocessor` is the maximum amount of - shared memory available to a multiprocessor in bytes; this amount is - shared by all thread blocks simultaneously resident on a - multiprocessor. - - - :py:obj:`~.regsPerMultiprocessor` is the maximum number of 32-bit - registers available to a multiprocessor; this number is shared by all - thread blocks simultaneously resident on a multiprocessor. - - - :py:obj:`~.managedMemory` is 1 if the device supports allocating - managed memory on this system, or 0 if it is not supported. - - - :py:obj:`~.isMultiGpuBoard` is 1 if the device is on a multi-GPU - board (e.g. Gemini cards), and 0 if not; - - - :py:obj:`~.multiGpuBoardGroupID` is a unique identifier for a group - of devices associated with the same board. Devices on the same multi- - GPU board will share the same identifier. - - - :py:obj:`~.hostNativeAtomicSupported` is 1 if the link between the - device and the host supports native atomic operations, or 0 if it is - not supported. - - - :py:obj:`~.singleToDoublePrecisionPerfRatio` is the ratio of single - precision performance (in floating-point operations per second) to - double precision performance. - - - :py:obj:`~.pageableMemoryAccess` is 1 if the device supports - coherently accessing pageable memory without calling cudaHostRegister - on it, and 0 otherwise. - - - :py:obj:`~.concurrentManagedAccess` is 1 if the device can coherently - access managed memory concurrently with the CPU, and 0 otherwise. - - - :py:obj:`~.computePreemptionSupported` is 1 if the device supports - Compute Preemption, and 0 otherwise. - - - :py:obj:`~.canUseHostPointerForRegisteredMem` is 1 if the device can - access host registered memory at the same virtual address as the CPU, - and 0 otherwise. - - - :py:obj:`~.cooperativeLaunch` is 1 if the device supports launching - cooperative kernels via :py:obj:`~.cudaLaunchCooperativeKernel`, and - 0 otherwise. - - - :py:obj:`~.cooperativeMultiDeviceLaunch` is 1 if the device supports - launching cooperative kernels via - :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice`, and 0 otherwise. - - - :py:obj:`~.sharedMemPerBlockOptin` is the per device maximum shared - memory per block usable by special opt in - - - :py:obj:`~.pageableMemoryAccessUsesHostPageTables` is 1 if the device - accesses pageable memory via the host's page tables, and 0 otherwise. - - - :py:obj:`~.directManagedMemAccessFromHost` is 1 if the host can - directly access managed memory on the device without migration, and 0 - otherwise. - - - :py:obj:`~.maxBlocksPerMultiProcessor` is the maximum number of - thread blocks that can reside on a multiprocessor. - - - :py:obj:`~.accessPolicyMaxWindowSize` is the maximum value of - :py:obj:`~.cudaAccessPolicyWindow.num_bytes`. 
- - - :py:obj:`~.reservedSharedMemPerBlock` is the shared memory reserved - by CUDA driver per block in bytes - - - :py:obj:`~.hostRegisterSupported` is 1 if the device supports host - memory registration via :py:obj:`~.cudaHostRegister`, and 0 - otherwise. - - - :py:obj:`~.sparseCudaArraySupported` is 1 if the device supports - sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise - - - :py:obj:`~.hostRegisterReadOnlySupported` is 1 if the device supports - using the :py:obj:`~.cudaHostRegister` flag cudaHostRegisterReadOnly - to register memory that must be mapped as read-only to the GPU - - - :py:obj:`~.timelineSemaphoreInteropSupported` is 1 if external - timeline semaphore interop is supported on the device, 0 otherwise - - - :py:obj:`~.memoryPoolsSupported` is 1 if the device supports using - the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise - - - :py:obj:`~.gpuDirectRDMASupported` is 1 if the device supports - GPUDirect RDMA APIs, 0 otherwise - - - :py:obj:`~.gpuDirectRDMAFlushWritesOptions` is a bitmask to be - interpreted according to the - :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum - - - :py:obj:`~.gpuDirectRDMAWritesOrdering` See the - :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` enum for numerical values - - - :py:obj:`~.memoryPoolSupportedHandleTypes` is a bitmask of handle - types supported with mempool-based IPC - - - :py:obj:`~.deferredMappingCudaArraySupported` is 1 if the device - supports deferred mapping CUDA arrays and CUDA mipmapped arrays - - - :py:obj:`~.ipcEventSupported` is 1 if the device supports IPC Events, - and 0 otherwise - - - :py:obj:`~.unifiedFunctionPointers` is 1 if the device support - unified pointers, and 0 otherwise + Returns in `*prop` the properties of device `dev`. Parameters ---------- device : int - None + Device number to get properties for Returns ------- cudaError_t - + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice` prop : :py:obj:`~.cudaDeviceProp` - None + Properties for the specified device + + See Also + -------- + :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName` """ cdef cudaDeviceProp prop = cudaDeviceProp() err = cyruntime.cudaGetDeviceProperties(prop._pvt_ptr, device) @@ -19433,350 +19609,7 @@ def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device): """ Returns information about the device. Returns in `*value` the integer value of the attribute `attr` on device - `device`. 
The supported attributes are: - - - :py:obj:`~.cudaDevAttrMaxThreadsPerBlock`: Maximum number of threads - per block - - - :py:obj:`~.cudaDevAttrMaxBlockDimX`: Maximum x-dimension of a block - - - :py:obj:`~.cudaDevAttrMaxBlockDimY`: Maximum y-dimension of a block - - - :py:obj:`~.cudaDevAttrMaxBlockDimZ`: Maximum z-dimension of a block - - - :py:obj:`~.cudaDevAttrMaxGridDimX`: Maximum x-dimension of a grid - - - :py:obj:`~.cudaDevAttrMaxGridDimY`: Maximum y-dimension of a grid - - - :py:obj:`~.cudaDevAttrMaxGridDimZ`: Maximum z-dimension of a grid - - - :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlock`: Maximum amount of - shared memory available to a thread block in bytes - - - :py:obj:`~.cudaDevAttrTotalConstantMemory`: Memory available on - device for constant variables in a CUDA C kernel in bytes - - - :py:obj:`~.cudaDevAttrWarpSize`: Warp size in threads - - - :py:obj:`~.cudaDevAttrMaxPitch`: Maximum pitch in bytes allowed by - the memory copy functions that involve memory regions allocated - through :py:obj:`~.cudaMallocPitch()` - - - :py:obj:`~.cudaDevAttrMaxTexture1DWidth`: Maximum 1D texture width - - - :py:obj:`~.cudaDevAttrMaxTexture1DLinearWidth`: Maximum width for a - 1D texture bound to linear memory - - - :py:obj:`~.cudaDevAttrMaxTexture1DMipmappedWidth`: Maximum mipmapped - 1D texture width - - - :py:obj:`~.cudaDevAttrMaxTexture2DWidth`: Maximum 2D texture width - - - :py:obj:`~.cudaDevAttrMaxTexture2DHeight`: Maximum 2D texture height - - - :py:obj:`~.cudaDevAttrMaxTexture2DLinearWidth`: Maximum width for a - 2D texture bound to linear memory - - - :py:obj:`~.cudaDevAttrMaxTexture2DLinearHeight`: Maximum height for a - 2D texture bound to linear memory - - - :py:obj:`~.cudaDevAttrMaxTexture2DLinearPitch`: Maximum pitch in - bytes for a 2D texture bound to linear memory - - - :py:obj:`~.cudaDevAttrMaxTexture2DMipmappedWidth`: Maximum mipmapped - 2D texture width - - - :py:obj:`~.cudaDevAttrMaxTexture2DMipmappedHeight`: Maximum mipmapped - 2D texture height - - - :py:obj:`~.cudaDevAttrMaxTexture3DWidth`: Maximum 3D texture width - - - :py:obj:`~.cudaDevAttrMaxTexture3DHeight`: Maximum 3D texture height - - - :py:obj:`~.cudaDevAttrMaxTexture3DDepth`: Maximum 3D texture depth - - - :py:obj:`~.cudaDevAttrMaxTexture3DWidthAlt`: Alternate maximum 3D - texture width, 0 if no alternate maximum 3D texture size is supported - - - :py:obj:`~.cudaDevAttrMaxTexture3DHeightAlt`: Alternate maximum 3D - texture height, 0 if no alternate maximum 3D texture size is - supported - - - :py:obj:`~.cudaDevAttrMaxTexture3DDepthAlt`: Alternate maximum 3D - texture depth, 0 if no alternate maximum 3D texture size is supported - - - :py:obj:`~.cudaDevAttrMaxTextureCubemapWidth`: Maximum cubemap - texture width or height - - - :py:obj:`~.cudaDevAttrMaxTexture1DLayeredWidth`: Maximum 1D layered - texture width - - - :py:obj:`~.cudaDevAttrMaxTexture1DLayeredLayers`: Maximum layers in a - 1D layered texture - - - :py:obj:`~.cudaDevAttrMaxTexture2DLayeredWidth`: Maximum 2D layered - texture width - - - :py:obj:`~.cudaDevAttrMaxTexture2DLayeredHeight`: Maximum 2D layered - texture height - - - :py:obj:`~.cudaDevAttrMaxTexture2DLayeredLayers`: Maximum layers in a - 2D layered texture - - - :py:obj:`~.cudaDevAttrMaxTextureCubemapLayeredWidth`: Maximum cubemap - layered texture width or height - - - :py:obj:`~.cudaDevAttrMaxTextureCubemapLayeredLayers`: Maximum layers - in a cubemap layered texture - - - :py:obj:`~.cudaDevAttrMaxSurface1DWidth`: Maximum 1D surface width - - - 
:py:obj:`~.cudaDevAttrMaxSurface2DWidth`: Maximum 2D surface width - - - :py:obj:`~.cudaDevAttrMaxSurface2DHeight`: Maximum 2D surface height - - - :py:obj:`~.cudaDevAttrMaxSurface3DWidth`: Maximum 3D surface width - - - :py:obj:`~.cudaDevAttrMaxSurface3DHeight`: Maximum 3D surface height - - - :py:obj:`~.cudaDevAttrMaxSurface3DDepth`: Maximum 3D surface depth - - - :py:obj:`~.cudaDevAttrMaxSurface1DLayeredWidth`: Maximum 1D layered - surface width - - - :py:obj:`~.cudaDevAttrMaxSurface1DLayeredLayers`: Maximum layers in a - 1D layered surface - - - :py:obj:`~.cudaDevAttrMaxSurface2DLayeredWidth`: Maximum 2D layered - surface width - - - :py:obj:`~.cudaDevAttrMaxSurface2DLayeredHeight`: Maximum 2D layered - surface height - - - :py:obj:`~.cudaDevAttrMaxSurface2DLayeredLayers`: Maximum layers in a - 2D layered surface - - - :py:obj:`~.cudaDevAttrMaxSurfaceCubemapWidth`: Maximum cubemap - surface width - - - :py:obj:`~.cudaDevAttrMaxSurfaceCubemapLayeredWidth`: Maximum cubemap - layered surface width - - - :py:obj:`~.cudaDevAttrMaxSurfaceCubemapLayeredLayers`: Maximum layers - in a cubemap layered surface - - - :py:obj:`~.cudaDevAttrMaxRegistersPerBlock`: Maximum number of 32-bit - registers available to a thread block - - - :py:obj:`~.cudaDevAttrClockRate`: Peak clock frequency in kilohertz - - - :py:obj:`~.cudaDevAttrTextureAlignment`: Alignment requirement; - texture base addresses aligned to :py:obj:`~.textureAlign` bytes do - not need an offset applied to texture fetches - - - :py:obj:`~.cudaDevAttrTexturePitchAlignment`: Pitch alignment - requirement for 2D texture references bound to pitched memory - - - :py:obj:`~.cudaDevAttrGpuOverlap`: 1 if the device can concurrently - copy memory between host and device while executing a kernel, or 0 if - not - - - :py:obj:`~.cudaDevAttrMultiProcessorCount`: Number of multiprocessors - on the device - - - :py:obj:`~.cudaDevAttrKernelExecTimeout`: 1 if there is a run time - limit for kernels executed on the device, or 0 if not - - - :py:obj:`~.cudaDevAttrIntegrated`: 1 if the device is integrated with - the memory subsystem, or 0 if not - - - :py:obj:`~.cudaDevAttrCanMapHostMemory`: 1 if the device can map host - memory into the CUDA address space, or 0 if not - - - :py:obj:`~.cudaDevAttrComputeMode`: Compute mode is the compute mode - that the device is currently in. Available modes are as follows: - - - :py:obj:`~.cudaComputeModeDefault`: Default mode - Device is not - restricted and multiple threads can use :py:obj:`~.cudaSetDevice()` - with this device. - - - :py:obj:`~.cudaComputeModeProhibited`: Compute-prohibited mode - No - threads can use :py:obj:`~.cudaSetDevice()` with this device. - - - :py:obj:`~.cudaComputeModeExclusiveProcess`: Compute-exclusive- - process mode - Many threads in one process will be able to use - :py:obj:`~.cudaSetDevice()` with this device. - - - :py:obj:`~.cudaDevAttrConcurrentKernels`: 1 if the device supports - executing multiple kernels within the same context simultaneously, or - 0 if not. It is not guaranteed that multiple kernels will be resident - on the device concurrently so this feature should not be relied upon - for correctness. 
- - - :py:obj:`~.cudaDevAttrEccEnabled`: 1 if error correction is enabled - on the device, 0 if error correction is disabled or not supported by - the device - - - :py:obj:`~.cudaDevAttrPciBusId`: PCI bus identifier of the device - - - :py:obj:`~.cudaDevAttrPciDeviceId`: PCI device (also known as slot) - identifier of the device - - - :py:obj:`~.cudaDevAttrTccDriver`: 1 if the device is using a TCC - driver. TCC is only available on Tesla hardware running Windows Vista - or later. - - - :py:obj:`~.cudaDevAttrMemoryClockRate`: Peak memory clock frequency - in kilohertz - - - :py:obj:`~.cudaDevAttrGlobalMemoryBusWidth`: Global memory bus width - in bits - - - :py:obj:`~.cudaDevAttrL2CacheSize`: Size of L2 cache in bytes. 0 if - the device doesn't have L2 cache. - - - :py:obj:`~.cudaDevAttrMaxThreadsPerMultiProcessor`: Maximum resident - threads per multiprocessor - - - :py:obj:`~.cudaDevAttrUnifiedAddressing`: 1 if the device shares a - unified address space with the host, or 0 if not - - - :py:obj:`~.cudaDevAttrComputeCapabilityMajor`: Major compute - capability version number - - - :py:obj:`~.cudaDevAttrComputeCapabilityMinor`: Minor compute - capability version number - - - :py:obj:`~.cudaDevAttrStreamPrioritiesSupported`: 1 if the device - supports stream priorities, or 0 if not - - - :py:obj:`~.cudaDevAttrGlobalL1CacheSupported`: 1 if device supports - caching globals in L1 cache, 0 if not - - - :py:obj:`~.cudaDevAttrLocalL1CacheSupported`: 1 if device supports - caching locals in L1 cache, 0 if not - - - :py:obj:`~.cudaDevAttrMaxSharedMemoryPerMultiprocessor`: Maximum - amount of shared memory available to a multiprocessor in bytes; this - amount is shared by all thread blocks simultaneously resident on a - multiprocessor - - - :py:obj:`~.cudaDevAttrMaxRegistersPerMultiprocessor`: Maximum number - of 32-bit registers available to a multiprocessor; this number is - shared by all thread blocks simultaneously resident on a - multiprocessor - - - :py:obj:`~.cudaDevAttrManagedMemory`: 1 if device supports allocating - managed memory, 0 if not - - - :py:obj:`~.cudaDevAttrIsMultiGpuBoard`: 1 if device is on a multi-GPU - board, 0 if not - - - :py:obj:`~.cudaDevAttrMultiGpuBoardGroupID`: Unique identifier for a - group of devices on the same multi-GPU board - - - :py:obj:`~.cudaDevAttrHostNativeAtomicSupported`: 1 if the link - between the device and the host supports native atomic operations - - - :py:obj:`~.cudaDevAttrSingleToDoublePrecisionPerfRatio`: Ratio of - single precision performance (in floating-point operations per - second) to double precision performance - - - :py:obj:`~.cudaDevAttrPageableMemoryAccess`: 1 if the device supports - coherently accessing pageable memory without calling cudaHostRegister - on it, and 0 otherwise - - - :py:obj:`~.cudaDevAttrConcurrentManagedAccess`: 1 if the device can - coherently access managed memory concurrently with the CPU, and 0 - otherwise - - - :py:obj:`~.cudaDevAttrComputePreemptionSupported`: 1 if the device - supports Compute Preemption, 0 if not - - - :py:obj:`~.cudaDevAttrCanUseHostPointerForRegisteredMem`: 1 if the - device can access host registered memory at the same virtual address - as the CPU, and 0 otherwise - - - :py:obj:`~.cudaDevAttrCooperativeLaunch`: 1 if the device supports - launching cooperative kernels via - :py:obj:`~.cudaLaunchCooperativeKernel`, and 0 otherwise - - - :py:obj:`~.cudaDevAttrCooperativeMultiDeviceLaunch`: 1 if the device - supports launching cooperative kernels via - 
:py:obj:`~.cudaLaunchCooperativeKernelMultiDevice`, and 0 otherwise - - - :py:obj:`~.cudaDevAttrCanFlushRemoteWrites`: 1 if the device supports - flushing of outstanding remote writes, and 0 otherwise - - - :py:obj:`~.cudaDevAttrHostRegisterSupported`: 1 if the device - supports host memory registration via :py:obj:`~.cudaHostRegister`, - and 0 otherwise - - - :py:obj:`~.cudaDevAttrPageableMemoryAccessUsesHostPageTables`: 1 if - the device accesses pageable memory via the host's page tables, and 0 - otherwise - - - :py:obj:`~.cudaDevAttrDirectManagedMemAccessFromHost`: 1 if the host - can directly access managed memory on the device without migration, - and 0 otherwise - - - :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`: Maximum per - block shared memory size on the device. This value can be opted into - when using :py:obj:`~.cudaFuncSetAttribute` - - - :py:obj:`~.cudaDevAttrMaxBlocksPerMultiprocessor`: Maximum number of - thread blocks that can reside on a multiprocessor - - - :py:obj:`~.cudaDevAttrMaxPersistingL2CacheSize`: Maximum L2 - persisting lines capacity setting in bytes - - - :py:obj:`~.cudaDevAttrMaxAccessPolicyWindowSize`: Maximum value of - :py:obj:`~.cudaAccessPolicyWindow.num_bytes` - - - :py:obj:`~.cudaDevAttrReservedSharedMemoryPerBlock`: Shared memory - reserved by CUDA driver per block in bytes - - - :py:obj:`~.cudaDevAttrSparseCudaArraySupported`: 1 if the device - supports sparse CUDA arrays and sparse CUDA mipmapped arrays. - - - :py:obj:`~.cudaDevAttrHostRegisterReadOnlySupported`: Device supports - using the :py:obj:`~.cudaHostRegister` flag cudaHostRegisterReadOnly - to register memory that must be mapped as read-only to the GPU - - - :py:obj:`~.cudaDevAttrMemoryPoolsSupported`: 1 if the device supports - using the cudaMallocAsync and cudaMemPool family of APIs, and 0 - otherwise - - - :py:obj:`~.cudaDevAttrGPUDirectRDMASupported`: 1 if the device - supports GPUDirect RDMA APIs, and 0 otherwise - - - :py:obj:`~.cudaDevAttrGPUDirectRDMAFlushWritesOptions`: bitmask to be - interpreted according to the - :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum - - - :py:obj:`~.cudaDevAttrGPUDirectRDMAWritesOrdering`: see the - :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` enum for numerical values - - - :py:obj:`~.cudaDevAttrMemoryPoolSupportedHandleTypes`: Bitmask of - handle types supported with mempool based IPC - - - :py:obj:`~.cudaDevAttrDeferredMappingCudaArraySupported` : 1 if the - device supports deferred mapping CUDA arrays and CUDA mipmapped - arrays. - - - :py:obj:`~.cudaDevAttrIpcEventSupport`: 1 if the device supports IPC - Events. - - - :py:obj:`~.cudaDevAttrNumaConfig`: NUMA configuration of a device: - value is of type :py:obj:`~.cudaDeviceNumaConfig` enum - - - :py:obj:`~.cudaDevAttrNumaId`: NUMA node ID of the GPU memory - - - :py:obj:`~.cudaDevAttrGpuPciDeviceId`: The combined 16-bit PCI device - ID and 16-bit PCI vendor ID. - - - :py:obj:`~.cudaDevAttrGpuPciSubsystemId`: The combined 16-bit PCI - subsystem ID and 16-bit PCI vendor subsystem ID. + `device`. 
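As a quick orientation for the trimmed docstrings above, a minimal usage sketch of the two query wrappers (not part of the patch; assumes a built cuda.bindings package and a visible device 0, and the attribute chosen is only an example):

from cuda.bindings import runtime

# Query the full property struct, then a single attribute.
err, prop = runtime.cudaGetDeviceProperties(0)
assert err == runtime.cudaError_t.cudaSuccess
print(prop.name, prop.totalGlobalMem)
err, warp = runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrWarpSize, 0)
print("warp size:", warp)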
Parameters ---------- @@ -19804,6 +19637,68 @@ def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device): return (_dict_cudaError_t[err], value) {{endif}} +{{if 'cudaDeviceGetHostAtomicCapabilities' in found_functions}} + +@cython.embedsignature(True) +def cudaDeviceGetHostAtomicCapabilities(operations : Optional[Tuple[cudaAtomicOperation] | List[cudaAtomicOperation]], unsigned int count, int device): + """ Queries details about atomic operations supported between the device and host. + + Returns in `*capabilities` the details about requested atomic + `*operations` over the link between `device` and the host. The + allocated size of `*operations` and `*capabilities` must be `count`. + + For each :py:obj:`~.cudaAtomicOperation` in `*operations`, the + corresponding result in `*capabilities` will be a bitmask indicating + which of :py:obj:`~.cudaAtomicOperationCapability` the link supports + natively. + + Returns :py:obj:`~.cudaErrorInvalidDevice` if `device` is not valid. + + Returns :py:obj:`~.cudaErrorInvalidValue` if `*capabilities` or + `*operations` is NULL, if `count` is 0, or if any of `*operations` is + not valid. + + Parameters + ---------- + operations : List[:py:obj:`~.cudaAtomicOperation`] + Requested operations + count : unsigned int + Count of requested operations and size of capabilities + device : int + Device handle + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue` + capabilities : List[unsigned int] + Returned capability details of each requested operation + + See Also + -------- + :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cuDeviceGetHostAtomicCapabilities` + """ + operations = [] if operations is None else operations + if not all(isinstance(_x, (cudaAtomicOperation)) for _x in operations): + raise TypeError("Argument 'operations' is not instance of type (expected Tuple[cyruntime.cudaAtomicOperation] or List[cyruntime.cudaAtomicOperation]") + cdef unsigned int* cycapabilities = NULL + pycapabilities = [] + if count != 0: + cycapabilities = <unsigned int*>calloc(count, sizeof(unsigned int)) + if cycapabilities is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) + cdef vector[cyruntime.cudaAtomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] + if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) + err = cyruntime.cudaDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, device) + if cudaError_t(err) == cudaError_t(0): + pycapabilities = [cycapabilities[idx] for idx in range(count)] + if cycapabilities is not NULL: + free(cycapabilities) + if err != cyruntime.cudaSuccess: + return (_dict_cudaError_t[err], None) + return (_dict_cudaError_t[err], pycapabilities) +{{endif}} + {{if 'cudaDeviceGetDefaultMemPool' in found_functions}} @cython.embedsignature(True) @@ -20025,12 +19920,17 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice, - :py:obj:`~.cudaDevP2PAttrAccessSupported`: 1 if peer access is enabled. - - :py:obj:`~.cudaDevP2PAttrNativeAtomicSupported`: 1 if native atomic - operations over the link are supported. + - :py:obj:`~.cudaDevP2PAttrNativeAtomicSupported`: 1 if all native + atomic operations over the link are supported. - :py:obj:`~.cudaDevP2PAttrCudaArrayAccessSupported`: 1 if accessing CUDA arrays over the link is supported.
+ - :py:obj:`~.cudaDevP2PAttrOnlyPartialNativeAtomicSupported`: 1 if some + CUDA-valid atomic operations over the link are supported. Information + about specific operations can be retrieved with + :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`. + Returns :py:obj:`~.cudaErrorInvalidDevice` if `srcDevice` or `dstDevice` are not valid or if they represent the same device. @@ -20056,7 +19956,7 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice, See Also -------- - :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAttribute` + :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities` """ cdef int value = 0 cdef cyruntime.cudaDeviceP2PAttr cyattr = attr.value @@ -20066,6 +19966,72 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice, return (_dict_cudaError_t[err], value) {{endif}} +{{if 'cudaDeviceGetP2PAtomicCapabilities' in found_functions}} + +@cython.embedsignature(True) +def cudaDeviceGetP2PAtomicCapabilities(operations : Optional[Tuple[cudaAtomicOperation] | List[cudaAtomicOperation]], unsigned int count, int srcDevice, int dstDevice): + """ Queries details about atomic operations supported between two devices. + + Returns in `*capabilities` the details about requested atomic + `*operations` over the link between `srcDevice` and `dstDevice`. + The allocated size of `*operations` and `*capabilities` must be + `count`. + + For each :py:obj:`~.cudaAtomicOperation` in `*operations`, the + corresponding result in `*capabilities` will be a bitmask indicating + which of :py:obj:`~.cudaAtomicOperationCapability` the link supports + natively. + + Returns :py:obj:`~.cudaErrorInvalidDevice` if `srcDevice` or + `dstDevice` are not valid or if they represent the same device. + + Returns :py:obj:`~.cudaErrorInvalidValue` if `*capabilities` or + `*operations` is NULL, if `count` is 0, or if any of `*operations` is + not valid.
+ + Parameters + ---------- + operations : List[:py:obj:`~.cudaAtomicOperation`] + Requested operations + count : unsigned int + Count of requested operations and size of capabilities + srcDevice : int + The source device of the target link + dstDevice : int + The destination device of the target link + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue` + capabilities : List[unsigned int] + Returned capability details of each requested operation + + See Also + -------- + :py:obj:`~.cudaDeviceGetP2PAttribute`, :py:obj:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities` + """ + operations = [] if operations is None else operations + if not all(isinstance(_x, (cudaAtomicOperation)) for _x in operations): + raise TypeError("Argument 'operations' is not instance of type (expected Tuple[cyruntime.cudaAtomicOperation] or List[cyruntime.cudaAtomicOperation]") + cdef unsigned int* cycapabilities = NULL + pycapabilities = [] + if count != 0: + cycapabilities = <unsigned int*>calloc(count, sizeof(unsigned int)) + if cycapabilities is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) + cdef vector[cyruntime.cudaAtomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] + if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) + err = cyruntime.cudaDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, srcDevice, dstDevice) + if cudaError_t(err) == cudaError_t(0): + pycapabilities = [cycapabilities[idx] for idx in range(count)] + if cycapabilities is not NULL: + free(cycapabilities) + if err != cyruntime.cudaSuccess: + return (_dict_cudaError_t[err], None) + return (_dict_cudaError_t[err], pycapabilities) +{{endif}} + {{if 'cudaChooseDevice' in found_functions}} @cython.embedsignature(True) @@ -21563,7 +21529,7 @@ def cudaStreamIsCapturing(stream): return (_dict_cudaError_t[err], cudaStreamCaptureStatus(pCaptureStatus)) {{endif}} -{{if 'cudaStreamGetCaptureInfo_v2' in found_functions}} +{{if 'cudaStreamGetCaptureInfo' in found_functions}} @cython.embedsignature(True) def cudaStreamGetCaptureInfo(stream): @@ -21580,89 +21546,6 @@ def cudaStreamGetCaptureInfo(stream): - the call returns cudaSuccess - - the returned capture status is - :py:obj:`~.cudaStreamCaptureStatusActive` - - Parameters - ---------- - stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - The stream to query - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorStreamCaptureImplicit` - captureStatus_out : :py:obj:`~.cudaStreamCaptureStatus` - Location to return the capture status of the stream; required - id_out : unsigned long long - Optional location to return an id for the capture sequence, which - is unique over the lifetime of the process - graph_out : :py:obj:`~.cudaGraph_t` - Optional location to return the graph being captured into. All - operations other than destroy and node removal are permitted on the - graph while the capture sequence is in progress. This API does not - transfer ownership of the graph, which is transferred or destroyed - at :py:obj:`~.cudaStreamEndCapture`. Note that the graph handle may - be invalidated before end of capture for certain errors.
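Both new atomic-capability wrappers share one calling pattern; a hedged sketch pairing them with the new partial-atomics attribute (not part of the patch, and the cudaAtomicOperation member name below is a placeholder assumption since the enum members are defined elsewhere in this patch):

from cuda.bindings import runtime

ops = [runtime.cudaAtomicOperation.cudaAtomicOperationAdd]  # hypothetical member name
# Device <-> host link:
err, caps = runtime.cudaDeviceGetHostAtomicCapabilities(ops, len(ops), 0)
# Device <-> device link, gated on the new partial-support attribute:
attr = runtime.cudaDeviceP2PAttr.cudaDevP2PAttrOnlyPartialNativeAtomicSupported
err, partial = runtime.cudaDeviceGetP2PAttribute(attr, 0, 1)
if err == runtime.cudaError_t.cudaSuccess and partial:
    err, caps = runtime.cudaDeviceGetP2PAtomicCapabilities(ops, len(ops), 0, 1)
    # Each entry is a bitmask of cudaAtomicOperationCapability flags.
    print(f"p2p add-atomic capabilities: {caps[0]:#x}")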
Nodes that - are or become unreachable from the original stream at - :py:obj:`~.cudaStreamEndCapture` due to direct actions on the graph - do not trigger :py:obj:`~.cudaErrorStreamCaptureUnjoined`. - dependencies_out : List[:py:obj:`~.cudaGraphNode_t`] - Optional location to store a pointer to an array of nodes. The next - node to be captured in the stream will depend on this set of nodes, - absent operations such as event wait which modify this set. The - array pointer is valid until the next API call which operates on - the stream or until the capture is terminated. The node handles may - be copied out and are valid until they or the graph is destroyed. - The driver-owned array may also be passed directly to APIs that - operate on the graph (not the stream) without copying. - numDependencies_out : int - Optional location to store the size of the array returned in - dependencies_out. - - See Also - -------- - :py:obj:`~.cudaStreamGetCaptureInfo_v3`, :py:obj:`~.cudaStreamBeginCapture`, :py:obj:`~.cudaStreamIsCapturing`, :py:obj:`~.cudaStreamUpdateCaptureDependencies` - """ - cdef cyruntime.cudaStream_t cystream - if stream is None: - pstream = 0 - elif isinstance(stream, (cudaStream_t,driver.CUstream)): - pstream = int(stream) - else: - pstream = int(cudaStream_t(stream)) - cystream = pstream - cdef cyruntime.cudaStreamCaptureStatus captureStatus_out - cdef unsigned long long id_out = 0 - cdef cudaGraph_t graph_out = cudaGraph_t() - cdef const cyruntime.cudaGraphNode_t* cydependencies_out = NULL - pydependencies_out = [] - cdef size_t numDependencies_out = 0 - err = cyruntime.cudaStreamGetCaptureInfo(cystream, &captureStatus_out, &id_out, graph_out._pvt_ptr, &cydependencies_out, &numDependencies_out) - if cudaError_t(err) == cudaError_t(0): - pydependencies_out = [cudaGraphNode_t(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None, None, None, None, None) - return (_dict_cudaError_t[err], cudaStreamCaptureStatus(captureStatus_out), id_out, graph_out, pydependencies_out, numDependencies_out) -{{endif}} - -{{if 'cudaStreamGetCaptureInfo_v3' in found_functions}} - -@cython.embedsignature(True) -def cudaStreamGetCaptureInfo_v3(stream): - """ Query a stream's capture state (12.3+) - - Query stream state related to stream capture. - - If called on :py:obj:`~.cudaStreamLegacy` (the "null stream") while a - stream not created with :py:obj:`~.cudaStreamNonBlocking` is capturing, - returns :py:obj:`~.cudaErrorStreamCaptureImplicit`. 
- - Valid data (other than capture status) is returned only if both of the - following are true: - - - the call returns cudaSuccess - - the returned capture status is :py:obj:`~.cudaStreamCaptureStatusActive` @@ -21735,7 +21618,7 @@ def cudaStreamGetCaptureInfo_v3(stream): cdef const cyruntime.cudaGraphEdgeData* cyedgeData_out = NULL pyedgeData_out = [] cdef size_t numDependencies_out = 0 - err = cyruntime.cudaStreamGetCaptureInfo_v3(cystream, &captureStatus_out, &id_out, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) + err = cyruntime.cudaStreamGetCaptureInfo(cystream, &captureStatus_out, &id_out, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) if cudaError_t(err) == cudaError_t(0): pydependencies_out = [cudaGraphNode_t(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] if cudaError_t(err) == cudaError_t(0): @@ -21748,81 +21631,8 @@ def cudaStreamGetCaptureInfo_v3(stream): {{if 'cudaStreamUpdateCaptureDependencies' in found_functions}} @cython.embedsignature(True) -def cudaStreamUpdateCaptureDependencies(stream, dependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], size_t numDependencies, unsigned int flags): - """ Update the set of dependencies in a capturing stream (11.3+) - - Modifies the dependency set of a capturing stream. The dependency set - is the set of nodes that the next captured node in the stream will - depend on. - - Valid flags are :py:obj:`~.cudaStreamAddCaptureDependencies` and - :py:obj:`~.cudaStreamSetCaptureDependencies`. These control whether the - set passed to the API is added to the existing set or replaces it. A - flags value of 0 defaults to - :py:obj:`~.cudaStreamAddCaptureDependencies`. - - Nodes that are removed from the dependency set via this API do not - result in :py:obj:`~.cudaErrorStreamCaptureUnjoined` if they are - unreachable from the stream at :py:obj:`~.cudaStreamEndCapture`. - - Returns :py:obj:`~.cudaErrorIllegalState` if the stream is not - capturing. - - This API is new in CUDA 11.3. Developers requiring compatibility across - minor versions of the CUDA driver to 11.0 should not use this API or - provide a fallback. 
- - Parameters - ---------- - stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - The stream to update - dependencies : List[:py:obj:`~.cudaGraphNode_t`] - The set of dependencies to add - numDependencies : size_t - The size of the dependencies array - flags : unsigned int - See above - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorIllegalState` - - See Also - -------- - :py:obj:`~.cudaStreamBeginCapture`, :py:obj:`~.cudaStreamGetCaptureInfo`, - """ - dependencies = [] if dependencies is None else dependencies - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in dependencies): - raise TypeError("Argument 'dependencies' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - cdef cyruntime.cudaStream_t cystream - if stream is None: - pstream = 0 - elif isinstance(stream, (cudaStream_t,driver.CUstream)): - pstream = int(stream) - else: - pstream = int(cudaStream_t(stream)) - cystream = pstream - cdef cyruntime.cudaGraphNode_t* cydependencies = NULL - if len(dependencies) > 0: - cydependencies = <cyruntime.cudaGraphNode_t*>calloc(len(dependencies), sizeof(cyruntime.cudaGraphNode_t)) - if cydependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(dependencies)): - cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] - if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cyruntime.cudaStreamUpdateCaptureDependencies(cystream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, flags) - if cydependencies is not NULL: - free(cydependencies) - return (_dict_cudaError_t[err],) -{{endif}} - -{{if 'cudaStreamUpdateCaptureDependencies_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaStreamUpdateCaptureDependencies_v2(stream, dependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], dependencyData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies, unsigned int flags): - """ Update the set of dependencies in a capturing stream (12.3+) +def cudaStreamUpdateCaptureDependencies(stream, dependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], dependencyData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies, unsigned int flags): + """ Update the set of dependencies in a capturing stream. Modifies the dependency set of a capturing stream.
The dependency set is the set of nodes that the next captured node in the stream will @@ -21892,7 +21702,7 @@ def cudaStreamUpdateCaptureDependencies_v2(stream, dependencies : Optional[Tuple raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaStreamUpdateCaptureDependencies_v2(cystream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) + err = cyruntime.cudaStreamUpdateCaptureDependencies(cystream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) if cydependencies is not NULL: free(cydependencies) if cydependencyData is not NULL: @@ -22236,74 +22046,6 @@ def cudaEventDestroy(event): def cudaEventElapsedTime(start, end): """ Computes the elapsed time between events. - Computes the elapsed time between two events (in milliseconds with a - resolution of around 0.5 microseconds). - - If either event was last recorded in a non-NULL stream, the resulting - time may be greater than expected (even if both used the same stream - handle). This happens because the :py:obj:`~.cudaEventRecord()` - operation takes place asynchronously and there is no guarantee that the - measured latency is actually just between the two events. Any number of - other different stream operations could execute in between the two - measured events, thus altering the timing in a significant way. - - If :py:obj:`~.cudaEventRecord()` has not been called on either event, - then :py:obj:`~.cudaErrorInvalidResourceHandle` is returned. If - :py:obj:`~.cudaEventRecord()` has been called on both events but one or - both of them has not yet been completed (that is, - :py:obj:`~.cudaEventQuery()` would return :py:obj:`~.cudaErrorNotReady` - on at least one of the events), :py:obj:`~.cudaErrorNotReady` is - returned. If either event was created with the - :py:obj:`~.cudaEventDisableTiming` flag, then this function will return - :py:obj:`~.cudaErrorInvalidResourceHandle`. 
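The hunks above fold the _v3 capture query and the _v2 dependency updater into the unsuffixed names. A minimal sketch of the consolidated flow, assuming a CUDA 13 build of these bindings (not part of the patch):

from cuda.bindings import runtime

err, stream = runtime.cudaStreamCreate()
err, = runtime.cudaStreamBeginCapture(stream, runtime.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
# The unsuffixed query now also returns per-edge data (formerly the _v3 form).
err, status, cap_id, graph, deps, edges, n = runtime.cudaStreamGetCaptureInfo(stream)
# The unsuffixed updater now takes edge data as well (formerly the _v2 form);
# flags=0 defaults to cudaStreamAddCaptureDependencies.
err, = runtime.cudaStreamUpdateCaptureDependencies(stream, deps, edges, n, 0)
err, graph = runtime.cudaStreamEndCapture(stream)
err, = runtime.cudaStreamDestroy(stream)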
- - Parameters - ---------- - start : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Starting event - end : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t` - Ending event - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorUnknown` - ms : float - Time between `start` and `end` in ms - - See Also - -------- - :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventElapsedTime` - """ - cdef cyruntime.cudaEvent_t cyend - if end is None: - pend = 0 - elif isinstance(end, (cudaEvent_t,driver.CUevent)): - pend = int(end) - else: - pend = int(cudaEvent_t(end)) - cyend = pend - cdef cyruntime.cudaEvent_t cystart - if start is None: - pstart = 0 - elif isinstance(start, (cudaEvent_t,driver.CUevent)): - pstart = int(start) - else: - pstart = int(cudaEvent_t(start)) - cystart = pstart - cdef float ms = 0 - err = cyruntime.cudaEventElapsedTime(&ms, cystart, cyend) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None) - return (_dict_cudaError_t[err], ms) -{{endif}} - -{{if 'cudaEventElapsedTime_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaEventElapsedTime_v2(start, end): - """ Computes the elapsed time between events. - Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds). Note this API is not guaranteed to return the latest errors for pending work. As such this API is @@ -22364,7 +22106,7 @@ def cudaEventElapsedTime_v2(start, end): pstart = int(cudaEvent_t(start)) cystart = pstart cdef float ms = 0 - err = cyruntime.cudaEventElapsedTime_v2(&ms, cystart, cyend) + err = cyruntime.cudaEventElapsedTime(&ms, cystart, cyend) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ms) @@ -22846,7 +22588,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa return (_dict_cudaError_t[err], extSem_out) {{endif}} -{{if 'cudaSignalExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaSignalExternalSemaphoresAsync' in found_functions}} @cython.embedsignature(True) def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalSemaphore_t] | List[cudaExternalSemaphore_t]], paramsArray : Optional[Tuple[cudaExternalSemaphoreSignalParams] | List[cudaExternalSemaphoreSignalParams]], unsigned int numExtSems, stream): @@ -22915,6 +22657,22 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalS with deterministic fence support enabled in different streams or by adding explicit dependency amongst such streams so that the semaphore is signaled in order. + :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence + associated with semaphore object of the type + :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` can be timestamp + enabled. For this the NvSciSyncAttrList used to create the object must + have the value of NvSciSyncAttrKey_WaiterRequireTimestamps key set to + true. Timestamps are emitted asynchronously by the GPU and CUDA saves + the GPU timestamp in the corresponding NvSciSyncFence at the time of + signal on GPU. Users are expected to convert GPU clocks to CPU clocks + using appropriate scaling functions. 
Users are expected to wait for the + completion of the fence before extracting timestamp using appropriate + NvSciSync APIs. Users are expected to ensure that there is only one + outstanding timestamp enabled fence per CUDA-NvSciSync object at any + point of time, failing which leads to undefined behavior. Extracting + the timestamp before the corresponding fence is signaled could lead to + undefined behavior. Timestamp extracted via appropriate NvSciSync API + would be in microseconds. If the semaphore object is any one of the following types: :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`, @@ -22981,7 +22739,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalS return (_dict_cudaError_t[err],) {{endif}} -{{if 'cudaWaitExternalSemaphoresAsync_v2' in found_functions}} +{{if 'cudaWaitExternalSemaphoresAsync' in found_functions}} @cython.embedsignature(True) def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalSemaphore_t] | List[cudaExternalSemaphore_t]], paramsArray : Optional[Tuple[cudaExternalSemaphoreWaitParams] | List[cudaExternalSemaphoreWaitParams]], unsigned int numExtSems, stream): @@ -26027,10 +25785,6 @@ def cudaMemcpyBatchAsync(dsts : Optional[Tuple[Any] | List[Any]], srcs : Optiona Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy. - If any error is encountered while parsing the batch, the index within - the batch where the error was encountered will be returned in - `failIdx`. - Parameters ---------- dsts : List[Any] @@ -26059,10 +25813,6 @@ def cudaMemcpyBatchAsync(dsts : Optional[Tuple[Any] | List[Any]], srcs : Optiona ------- cudaError_t :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` - failIdx : int - Pointer to a location to return the index of the copy where a - failure was encountered. The value will be SIZE_MAX if the error - doesn't pertain to any specific copy. """ cdef cyruntime.cudaStream_t cystream if stream is None: @@ -26099,13 +25849,10 @@ def cudaMemcpyBatchAsync(dsts : Optional[Tuple[Any] | List[Any]], srcs : Optiona cdef vector[size_t] cyattrsIdxs = attrsIdxs if numAttrs > len(attrs): raise RuntimeError("List is too small: " + str(len(attrs)) + " < " + str(numAttrs)) if numAttrs > len(attrsIdxs): raise RuntimeError("List is too small: " + str(len(attrsIdxs)) + " < " + str(numAttrs)) - cdef size_t failIdx = 0 - err = cyruntime.cudaMemcpyBatchAsync(voidStarHelperdsts.cptr, voidStarHelpersrcs.cptr, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, &failIdx, cystream) + err = cyruntime.cudaMemcpyBatchAsync(voidStarHelperdsts.cptr, voidStarHelpersrcs.cptr, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cystream) if cyattrs is not NULL: free(cyattrs) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None) - return (_dict_cudaError_t[err], failIdx) + return (_dict_cudaError_t[err],) {{endif}} {{if 'cudaMemcpy3DBatchAsync' in found_functions}} @@ -26193,10 +25940,6 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[cudaMemcpy3DBa Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy. - If any error is encountered while parsing the batch, the index within - the batch where the error was encountered will be returned in - `failIdx`.
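With `failIdx` removed, the batched-copy wrappers above now return only the status tuple. A hedged usage sketch (not part of the patch; the cudaMemcpyAttributes field and enum names follow the 12.8+ runtime headers as I understand them, so treat them as assumptions):

from cuda.bindings import runtime

err, stream = runtime.cudaStreamCreate()
n = 1 << 20
err, src = runtime.cudaMalloc(n)
err, dst = runtime.cudaMalloc(n)
attr = runtime.cudaMemcpyAttributes()
attr.srcAccessOrder = runtime.cudaMemcpySrcAccessOrder.cudaMemcpySrcAccessOrderStream
# One attribute entry covers the whole batch (attrsIdxs[0] == 0);
# the call now returns just (err,) with no failure index.
err, = runtime.cudaMemcpyBatchAsync([dst], [src], [n], 1, [attr], [0], 1, stream)
err, = runtime.cudaStreamSynchronize(stream)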
- Parameters ---------- numOps : size_t @@ -26213,10 +25956,6 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[cudaMemcpy3DBa ------- cudaError_t :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue` - failIdx : int - Pointer to a location to return the index of the copy where a - failure was encountered. The value will be SIZE_MAX if the error - doesn't pertain to any specific copy. """ cdef cyruntime.cudaStream_t cystream if stream is None: @@ -26237,13 +25976,10 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[cudaMemcpy3DBa raise MemoryError('Failed to allocate length x size memory: ' + str(len(opList)) + 'x' + str(sizeof(cyruntime.cudaMemcpy3DBatchOp))) for idx in range(len(opList)): string.memcpy(&cyopList[idx], (opList[idx])._pvt_ptr, sizeof(cyruntime.cudaMemcpy3DBatchOp)) - cdef size_t failIdx = 0 - err = cyruntime.cudaMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, &failIdx, flags, cystream) + err = cyruntime.cudaMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, flags, cystream) if cyopList is not NULL: free(cyopList) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None) - return (_dict_cudaError_t[err], failIdx) + return (_dict_cudaError_t[err],) {{endif}} {{if 'cudaMemcpy2DAsync' in found_functions}} @@ -26824,107 +26560,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent {{if 'cudaMemPrefetchAsync' in found_functions}} @cython.embedsignature(True) -def cudaMemPrefetchAsync(devPtr, size_t count, int dstDevice, stream): - """ Prefetches memory to the specified destination device. - - Prefetches memory to the specified destination device. `devPtr` is the - base device pointer of the memory to be prefetched and `dstDevice` is - the destination device. `count` specifies the number of bytes to copy. - `stream` is the stream in which the operation is enqueued. The memory - range must refer to managed memory allocated via - :py:obj:`~.cudaMallocManaged` or declared via managed variables, or it - may also refer to system-allocated memory on systems with non-zero - cudaDevAttrPageableMemoryAccess. - - Passing in cudaCpuDeviceId for `dstDevice` will prefetch the data to - host memory. If `dstDevice` is a GPU, then the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess` must be non-zero. - Additionally, `stream` must be associated with a device that has a non- - zero value for the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. - - The start address and end address of the memory range will be rounded - down and rounded up respectively to be aligned to CPU page size before - the prefetch operation is enqueued in the stream. - - If no physical memory has been allocated for this region, then this - memory region will be populated and mapped on the destination device. - If there's insufficient memory to prefetch the desired region, the - Unified Memory driver may evict pages from other - :py:obj:`~.cudaMallocManaged` allocations to host memory in order to - make room. Device memory allocated using :py:obj:`~.cudaMalloc` or - :py:obj:`~.cudaMallocArray` will not be evicted. - - By default, any mappings to the previous location of the migrated pages - are removed and mappings for the new location are only setup on - `dstDevice`. 
The exact behavior however also depends on the settings - applied to this memory range via :py:obj:`~.cudaMemAdvise` as described - below: - - If :py:obj:`~.cudaMemAdviseSetReadMostly` was set on any subset of this - memory range, then that subset will create a read-only copy of the - pages on `dstDevice`. - - If :py:obj:`~.cudaMemAdviseSetPreferredLocation` was called on any - subset of this memory range, then the pages will be migrated to - `dstDevice` even if `dstDevice` is not the preferred location of any - pages in the memory range. - - If :py:obj:`~.cudaMemAdviseSetAccessedBy` was called on any subset of - this memory range, then mappings to those pages from all the - appropriate processors are updated to refer to the new location if - establishing such a mapping is possible. Otherwise, those mappings are - cleared. - - Note that this API is not required for functionality and only serves to - improve performance by allowing the application to migrate data to a - suitable location before it is accessed. Memory accesses to this range - are always coherent and are allowed even when the data is actively - being migrated. - - Note that this function is asynchronous with respect to the host and - all work on other devices. - - Parameters - ---------- - devPtr : Any - Pointer to be prefetched - count : size_t - Size in bytes - dstDevice : int - Destination device to prefetch to - stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` - Stream to enqueue prefetch operation - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice` - - See Also - -------- - :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cudaMemAdvise_v2` :py:obj:`~.cuMemPrefetchAsync` - """ - cdef cyruntime.cudaStream_t cystream - if stream is None: - pstream = 0 - elif isinstance(stream, (cudaStream_t,driver.CUstream)): - pstream = int(stream) - else: - pstream = int(cudaStream_t(stream)) - cystream = pstream - cydevPtr = utils.HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr - with nogil: - err = cyruntime.cudaMemPrefetchAsync(cydevPtr_ptr, count, dstDevice, cystream) - - return (_dict_cudaError_t[err],) -{{endif}} - -{{if 'cudaMemPrefetchAsync_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaMemPrefetchAsync_v2(devPtr, size_t count, location not None : cudaMemLocation, unsigned int flags, stream): +def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocation, unsigned int flags, stream): """ Prefetches memory to the specified destination location. Prefetches memory to the specified destination location. 
`devPtr` is @@ -27021,7 +26657,7 @@ def cudaMemPrefetchAsync_v2(devPtr, size_t count, location not None : cudaMemLoc See Also -------- - :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cudaMemAdvise_v2` :py:obj:`~.cuMemPrefetchAsync` + :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cuMemPrefetchAsync` """ cdef cyruntime.cudaStream_t cystream if stream is None: @@ -27034,182 +26670,305 @@ def cudaMemPrefetchAsync_v2(devPtr, size_t count, location not None : cudaMemLoc cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: - err = cyruntime.cudaMemPrefetchAsync_v2(cydevPtr_ptr, count, location._pvt_ptr[0], flags, cystream) + err = cyruntime.cudaMemPrefetchAsync(cydevPtr_ptr, count, location._pvt_ptr[0], flags, cystream) return (_dict_cudaError_t[err],) {{endif}} -{{if 'cudaMemAdvise' in found_functions}} +{{if 'cudaMemPrefetchBatchAsync' in found_functions}} @cython.embedsignature(True) -def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, int device): - """ Advise about the usage of a given memory range. +def cudaMemPrefetchBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]], sizes : Tuple[int] | List[int], size_t count, prefetchLocs : Optional[Tuple[cudaMemLocation] | List[cudaMemLocation]], prefetchLocIdxs : Tuple[int] | List[int], size_t numPrefetchLocs, unsigned long long flags, stream): + """ Performs a batch of memory prefetches asynchronously. - Advise the Unified Memory subsystem about the usage pattern for the - memory range starting at `devPtr` with a size of `count` bytes. The - start address and end address of the memory range will be rounded down - and rounded up respectively to be aligned to CPU page size before the - advice is applied. The memory range must refer to managed memory + Performs a batch of memory prefetches. The batch as a whole executes in + stream order but operations within a batch are not guaranteed to + execute in any specific order. All devices in the system must have a + non-zero value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess` otherwise the API will + return an error. + + The semantics of the individual prefetch operations are as described in + :py:obj:`~.cudaMemPrefetchAsync`. + + Performs memory prefetch on address ranges specified in `dptrs` and + `sizes`. Both arrays must be of the same length as specified by + `count`. Each memory range specified must refer to managed memory allocated via :py:obj:`~.cudaMallocManaged` or declared via managed - variables. The memory range could also refer to system-allocated - pageable memory provided it represents a valid, host-accessible region - of memory and all additional constraints imposed by `advice` as - outlined below are also satisfied. Specifying an invalid system- - allocated pageable memory range results in an error being returned. + variables or it may also refer to system-allocated memory when all + devices have a non-zero value for + :py:obj:`~.cudaDevAttrPageableMemoryAccess`. The prefetch location for + every operation in the batch is specified in the `prefetchLocs` array. + Each entry in this array can apply to more than one operation. 
This can + be done by specifying in the `prefetchLocIdxs` array, the index of the + first prefetch operation that the corresponding entry in the + `prefetchLocs` array applies to. Both `prefetchLocs` and + `prefetchLocIdxs` must be of the same length as specified by + `numPrefetchLocs`. For example, if a batch has 10 prefetches listed in + dptrs/sizes, the first 4 of which are to be prefetched to one location + and the remaining 6 are to be prefetched to another, then + `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be {0, 4} and + `prefetchLocs` will contain the two locations. Note the first entry in + `prefetchLocIdxs` must always be 0. Also, each entry must be greater + than the previous entry and the last entry should be less than `count`. + Furthermore, `numPrefetchLocs` must be lesser than or equal to `count`. + + Parameters + ---------- + dptrs : List[Any] + Array of pointers to be prefetched + sizes : List[int] + Array of sizes for memory prefetch operations. + count : size_t + Size of `dptrs` and `sizes` arrays. + prefetchLocs : List[:py:obj:`~.cudaMemLocation`] + Array of locations to prefetch to. + prefetchLocIdxs : List[int] + Array of indices to specify which operands each entry in the + `prefetchLocs` array applies to. The locations specified in + prefetchLocs[k] will be applied to copies starting from + prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also + prefetchLocs[numPrefetchLocs - 1] will apply to prefetches starting + from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1. + numPrefetchLocs : size_t + Size of `prefetchLocs` and `prefetchLocIdxs` arrays. + flags : unsigned long long + Flags reserved for future use. Must be zero. + hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` + The stream to enqueue the operations in. Must not be legacy NULL + stream. - The `advice` parameter can take the following values: + Returns + ------- + cudaError_t - - :py:obj:`~.cudaMemAdviseSetReadMostly`: This implies that the data is - mostly going to be read from and only occasionally written to. Any - read accesses from any processor to this region will create a read- - only copy of at least the accessed pages in that processor's memory. - Additionally, if :py:obj:`~.cudaMemPrefetchAsync` is called on this - region, it will create a read-only copy of the data on the - destination processor. If any processor writes to this region, all - copies of the corresponding page will be invalidated except for the - one where the write occurred. The `device` argument is ignored for - this advice. Note that for a page to be read-duplicated, the - accessing processor must either be the CPU or a GPU that has a non- - zero value for the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. Also, if a context is - created on a device that does not have the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess` set, then read- - duplication will not occur until all such contexts are destroyed. If - the memory region refers to valid system-allocated pageable memory, - then the accessing device must have a non-zero value for the device - attribute :py:obj:`~.cudaDevAttrPageableMemoryAccess` for a read-only - copy to be created on that device. Note however that if the accessing - device also has a non-zero value for the device attribute - :py:obj:`~.cudaDevAttrPageableMemoryAccessUsesHostPageTables`, then - setting this advice will not create a read-only copy when that device - accesses this memory region. 
+ """ + cdef cyruntime.cudaStream_t cystream + if stream is None: + pstream = 0 + elif isinstance(stream, (cudaStream_t,driver.CUstream)): + pstream = int(stream) + else: + pstream = int(cudaStream_t(stream)) + cystream = pstream + if not all(isinstance(_x, (int)) for _x in prefetchLocIdxs): + raise TypeError("Argument 'prefetchLocIdxs' is not instance of type (expected Tuple[int] or List[int]") + prefetchLocs = [] if prefetchLocs is None else prefetchLocs + if not all(isinstance(_x, (cudaMemLocation,)) for _x in prefetchLocs): + raise TypeError("Argument 'prefetchLocs' is not instance of type (expected Tuple[cyruntime.cudaMemLocation,] or List[cyruntime.cudaMemLocation,]") + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] + cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + cdef cyruntime.cudaMemLocation* cyprefetchLocs = NULL + if len(prefetchLocs) > 0: + cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cyruntime.cudaMemLocation)) + if cyprefetchLocs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cyruntime.cudaMemLocation))) + for idx in range(len(prefetchLocs)): + string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cyruntime.cudaMemLocation)) + cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs + if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) + if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) + err = cyruntime.cudaMemPrefetchBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) + if cyprefetchLocs is not NULL: + free(cyprefetchLocs) + return (_dict_cudaError_t[err],) +{{endif}} - - :py:obj:`~.cudaMemAdviceUnsetReadMostly`: Undoes the effect of - :py:obj:`~.cudaMemAdviceReadMostly` and also prevents the Unified - Memory driver from attempting heuristic read-duplication on the - memory range. Any read-duplicated copies of the data will be - collapsed into a single copy. The location for the collapsed copy - will be the preferred location if the page has a preferred location - and one of the read-duplicated copies was resident at that location. - Otherwise, the location chosen is arbitrary. +{{if 'cudaMemDiscardBatchAsync' in found_functions}} - - :py:obj:`~.cudaMemAdviseSetPreferredLocation`: This advice sets the - preferred location for the data to be the memory belonging to - `device`. Passing in cudaCpuDeviceId for `device` sets the preferred - location as host memory. If `device` is a GPU, then it must have a - non-zero value for the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. Setting the preferred - location does not cause data to migrate to that location immediately. - Instead, it guides the migration policy when a fault occurs on that - memory region. 
If the data is already in its preferred location and - the faulting processor can establish a mapping without requiring the - data to be migrated, then data migration will be avoided. On the - other hand, if the data is not in its preferred location or if a - direct mapping cannot be established, then it will be migrated to the - processor accessing it. It is important to note that setting the - preferred location does not prevent data prefetching done using - :py:obj:`~.cudaMemPrefetchAsync`. Having a preferred location can - override the page thrash detection and resolution logic in the - Unified Memory driver. Normally, if a page is detected to be - constantly thrashing between for example host and device memory, the - page may eventually be pinned to host memory by the Unified Memory - driver. But if the preferred location is set as device memory, then - the page will continue to thrash indefinitely. If - :py:obj:`~.cudaMemAdviseSetReadMostly` is also set on this memory - region or any subset of it, then the policies associated with that - advice will override the policies of this advice, unless read - accesses from `device` will not result in a read-only copy being - created on that device as outlined in description for the advice - :py:obj:`~.cudaMemAdviseSetReadMostly`. If the memory region refers - to valid system-allocated pageable memory, then `device` must have a - non-zero value for the device attribute - :py:obj:`~.cudaDevAttrPageableMemoryAccess`. +@cython.embedsignature(True) +def cudaMemDiscardBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]], sizes : Tuple[int] | List[int], size_t count, unsigned long long flags, stream): + """ Performs a batch of memory discards asynchronously. - - :py:obj:`~.cudaMemAdviseUnsetPreferredLocation`: Undoes the effect of - :py:obj:`~.cudaMemAdviseSetPreferredLocation` and changes the - preferred location to none. + Performs a batch of memory discards. The batch as a whole executes in + stream order but operations within a batch are not guaranteed to + execute in any specific order. All devices in the system must have a + non-zero value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess` otherwise the API will + return an error. - - :py:obj:`~.cudaMemAdviseSetAccessedBy`: This advice implies that the - data will be accessed by `device`. Passing in - :py:obj:`~.cudaCpuDeviceId` for `device` will set the advice for the - CPU. If `device` is a GPU, then the device attribute - :py:obj:`~.cudaDevAttrConcurrentManagedAccess` must be non-zero. This - advice does not cause data migration and has no impact on the - location of the data per se. Instead, it causes the data to always be - mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data - gets migrated for any reason, the mappings are updated accordingly. - This advice is recommended in scenarios where data locality is not - important, but avoiding faults is. Consider for example a system - containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by peer GPUs. In - such scenarios, migrating data over to the other GPUs is not as - important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help - improve performance, and so having a mapping set up in advance is - useful. 
Note that on CPU access of this data, the data may be - migrated to host memory because the CPU typically cannot access - device memory directly. Any GPU that had the - :py:obj:`~.cudaMemAdviceSetAccessedBy` flag set for this data will - now have its mapping updated to point to the page in host memory. If - :py:obj:`~.cudaMemAdviseSetReadMostly` is also set on this memory - region or any subset of it, then the policies associated with that - advice will override the policies of this advice. Additionally, if - the preferred location of this memory region or any subset of it is - also `device`, then the policies associated with - :py:obj:`~.cudaMemAdviseSetPreferredLocation` will override the - policies of this advice. If the memory region refers to valid system- - allocated pageable memory, then `device` must have a non-zero value - for the device attribute :py:obj:`~.cudaDevAttrPageableMemoryAccess`. - Additionally, if `device` has a non-zero value for the device - attribute - :py:obj:`~.cudaDevAttrPageableMemoryAccessUsesHostPageTables`, then - this call has no effect. + Discarding a memory range informs the driver that the contents of that + range are no longer useful. Discarding memory ranges allows the driver + to optimize certain data migrations and can also help reduce memory + pressure. This operation can be undone on any part of the range by + either writing to it or prefetching it via + :py:obj:`~.cudaMemPrefetchAsync` or + :py:obj:`~.cudaMemPrefetchBatchAsync`. Reading from a discarded range, + without a subsequent write or prefetch to that part of the range, will + return an indeterminate value. Note that any reads, writes or + prefetches to any part of the memory range that occur simultaneously + with the discard operation result in undefined behavior. - - :py:obj:`~.cudaMemAdviseUnsetAccessedBy`: Undoes the effect of - :py:obj:`~.cudaMemAdviseSetAccessedBy`. Any mappings to the data from - `device` may be removed at any time causing accesses to result in - non-fatal page faults. If the memory region refers to valid system- - allocated pageable memory, then `device` must have a non-zero value - for the device attribute :py:obj:`~.cudaDevAttrPageableMemoryAccess`. - Additionally, if `device` has a non-zero value for the device - attribute - :py:obj:`~.cudaDevAttrPageableMemoryAccessUsesHostPageTables`, then - this call has no effect. + Performs memory discard on address ranges specified in `dptrs` and + `sizes`. Both arrays must be of the same length as specified by + `count`. Each memory range specified must refer to managed memory + allocated via :py:obj:`~.cudaMallocManaged` or declared via managed + variables or it may also refer to system-allocated memory when all + devices have a non-zero value for + :py:obj:`~.cudaDevAttrPageableMemoryAccess`. Parameters ---------- - devPtr : Any - Pointer to memory to set the advice for + dptrs : List[Any] + Array of pointers to be discarded + sizes : List[int] + Array of sizes for memory discard operations. count : size_t - Size in bytes of the memory range - advice : :py:obj:`~.cudaMemoryAdvise` - Advice to be applied for the specified memory range - device : int - Device to apply the advice for + Size of `dptrs` and `sizes` arrays. + flags : unsigned long long + Flags reserved for future use. Must be zero. + hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t` + The stream to enqueue the operations in. Must not be legacy NULL + stream. 
Returns ------- cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice` - See Also - -------- - :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemAdvise` """ - cydevPtr = utils.HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr - cdef cyruntime.cudaMemoryAdvise cyadvice = advice.value - with nogil: - err = cyruntime.cudaMemAdvise(cydevPtr_ptr, count, cyadvice, device) + cdef cyruntime.cudaStream_t cystream + if stream is None: + pstream = 0 + elif isinstance(stream, (cudaStream_t,driver.CUstream)): + pstream = int(stream) + else: + pstream = int(cudaStream_t(stream)) + cystream = pstream + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] + cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + err = cyruntime.cudaMemDiscardBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, flags, cystream) + return (_dict_cudaError_t[err],) +{{endif}} + +{{if 'cudaMemDiscardAndPrefetchBatchAsync' in found_functions}} + +@cython.embedsignature(True) +def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]], sizes : Tuple[int] | List[int], size_t count, prefetchLocs : Optional[Tuple[cudaMemLocation] | List[cudaMemLocation]], prefetchLocIdxs : Tuple[int] | List[int], size_t numPrefetchLocs, unsigned long long flags, stream): + """ Performs a batch of memory discards and prefetches asynchronously. + + Performs a batch of memory discards followed by prefetches. The batch + as a whole executes in stream order but operations within a batch are + not guaranteed to execute in any specific order. All devices in the + system must have a non-zero value for the device attribute + :py:obj:`~.cudaDevAttrConcurrentManagedAccess` otherwise the API will + return an error. + + Calling :py:obj:`~.cudaMemDiscardAndPrefetchBatchAsync` is semantically + equivalent to calling :py:obj:`~.cudaMemDiscardBatchAsync` followed by + :py:obj:`~.cudaMemPrefetchBatchAsync`, but is more optimal. For more + details on what discarding and prefetching imply, please refer to + :py:obj:`~.cudaMemDiscardBatchAsync` and + :py:obj:`~.cudaMemPrefetchBatchAsync` respectively. Note that any + reads, writes or prefetches to any part of the memory range that occur + simultaneously with this combined discard+prefetch operation result in + undefined behavior. + + Performs memory discard and prefetch on address ranges specified in + `dptrs` and `sizes`. Both arrays must be of the same length as + specified by `count`. Each memory range specified must refer to managed + memory allocated via :py:obj:`~.cudaMallocManaged` or declared via + managed variables or it may also refer to system-allocated memory when + all devices have a non-zero value for + :py:obj:`~.cudaDevAttrPageableMemoryAccess`. Every operation in the + batch has to be associated with a valid location to prefetch the + address range to and specified in the `prefetchLocs` array. 
Each entry
+ in this array can apply to more than one operation. This can be done by
+ specifying, in the `prefetchLocIdxs` array, the index of the first
+ operation that the corresponding entry in the `prefetchLocs` array
+ applies to. Both `prefetchLocs` and `prefetchLocIdxs` must be of the
+ same length as specified by `numPrefetchLocs`. For example, if a batch
+ has 10 operations listed in dptrs/sizes, the first 6 of which are to be
+ prefetched to one location and the remaining 4 are to be prefetched to
+ another, then `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be
+ {0, 6} and `prefetchLocs` will contain the two sets of locations. Note
+ the first entry in `prefetchLocIdxs` must always be 0. Also, each entry
+ must be greater than the previous entry and the last entry should be
+ less than `count`. Furthermore, `numPrefetchLocs` must be less than
+ or equal to `count`.
+
+ Parameters
+ ----------
+ dptrs : List[Any]
+ Array of pointers to be discarded
+ sizes : List[int]
+ Array of sizes for memory discard operations.
+ count : size_t
+ Size of `dptrs` and `sizes` arrays.
+ prefetchLocs : List[:py:obj:`~.cudaMemLocation`]
+ Array of locations to prefetch to.
+ prefetchLocIdxs : List[int]
+ Array of indices to specify which operands each entry in the
+ `prefetchLocs` array applies to. The locations specified in
+ prefetchLocs[k] will be applied to operations starting from
+ prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
+ prefetchLocs[numPrefetchLocs - 1] will apply to copies starting
+ from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
+ numPrefetchLocs : size_t
+ Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+ flags : unsigned long long
+ Flags reserved for future use. Must be zero.
+ hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
+ The stream to enqueue the operations in. Must not be legacy NULL
+ stream.
+ Returns + ------- + cudaError_t + + """ + cdef cyruntime.cudaStream_t cystream + if stream is None: + pstream = 0 + elif isinstance(stream, (cudaStream_t,driver.CUstream)): + pstream = int(stream) + else: + pstream = int(cudaStream_t(stream)) + cystream = pstream + if not all(isinstance(_x, (int)) for _x in prefetchLocIdxs): + raise TypeError("Argument 'prefetchLocIdxs' is not instance of type (expected Tuple[int] or List[int]") + prefetchLocs = [] if prefetchLocs is None else prefetchLocs + if not all(isinstance(_x, (cudaMemLocation,)) for _x in prefetchLocs): + raise TypeError("Argument 'prefetchLocs' is not instance of type (expected Tuple[cyruntime.cudaMemLocation,] or List[cyruntime.cudaMemLocation,]") + if not all(isinstance(_x, (int)) for _x in sizes): + raise TypeError("Argument 'sizes' is not instance of type (expected Tuple[int] or List[int]") + dptrs = [] if dptrs is None else dptrs + pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] + cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef vector[size_t] cysizes = sizes + if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) + if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) + cdef cyruntime.cudaMemLocation* cyprefetchLocs = NULL + if len(prefetchLocs) > 0: + cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cyruntime.cudaMemLocation)) + if cyprefetchLocs is NULL: + raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cyruntime.cudaMemLocation))) + for idx in range(len(prefetchLocs)): + string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cyruntime.cudaMemLocation)) + cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs + if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) + if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) + err = cyruntime.cudaMemDiscardAndPrefetchBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) + if cyprefetchLocs is not NULL: + free(cyprefetchLocs) return (_dict_cudaError_t[err],) {{endif}} -{{if 'cudaMemAdvise_v2' in found_functions}} +{{if 'cudaMemAdvise' in found_functions}} @cython.embedsignature(True) -def cudaMemAdvise_v2(devPtr, size_t count, advice not None : cudaMemoryAdvise, location not None : cudaMemLocation): +def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, location not None : cudaMemLocation): """ Advise about the usage of a given memory range. Advise the Unified Memory subsystem about the usage pattern for the @@ -27231,9 +26990,9 @@ def cudaMemAdvise_v2(devPtr, size_t count, advice not None : cudaMemoryAdvise, l read accesses from any processor to this region will create a read- only copy of at least the accessed pages in that processor's memory. Additionally, if :py:obj:`~.cudaMemPrefetchAsync` or - :py:obj:`~.cudaMemPrefetchAsync_v2` is called on this region, it will + :py:obj:`~.cudaMemPrefetchAsync` is called on this region, it will create a read-only copy of the data on the destination processor. 
If
- the target location for :py:obj:`~.cudaMemPrefetchAsync_v2` is a host
+ the target location for :py:obj:`~.cudaMemPrefetchAsync` is a host
NUMA node and a read-only copy already exists on another host NUMA
node, that copy will be migrated to the targeted host NUMA node. If
any processor writes to this region, all copies of the corresponding
@@ -27393,13 +27152,13 @@ def cudaMemAdvise_v2(devPtr, size_t count, advice not None : cudaMemoryAdvise, l

See Also
--------
- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cuMemAdvise_v2`
+ :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`
"""
cydevPtr = utils.HelperInputVoidPtr(devPtr)
cdef void* cydevPtr_ptr = cydevPtr.cptr
cdef cyruntime.cudaMemoryAdvise cyadvice = advice.value
with nogil:
- err = cyruntime.cudaMemAdvise_v2(cydevPtr_ptr, count, cyadvice, location._pvt_ptr[0])
+ err = cyruntime.cudaMemAdvise(cydevPtr_ptr, count, cyadvice, location._pvt_ptr[0])

return (_dict_cudaError_t[err],)
{{endif}}

@@ -28387,20 +28146,28 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
`poolProps` determines the properties of the pool such as the backing
device and IPC capabilities.

- To create a memory pool targeting a specific host NUMA node,
- applications must set
+ To create a memory pool for host memory not targeting a specific NUMA
+ node, applications must set
+ :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
+ :py:obj:`~.cudaMemLocationTypeHost`.
+ :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id is ignored for such
+ pools. Pools created with the type :py:obj:`~.cudaMemLocationTypeHost`
+ are not IPC capable and :py:obj:`~.cudaMemPoolProps.handleTypes` must
+ be 0; any other values will result in
+ :py:obj:`~.cudaErrorInvalidValue`. To create a memory pool targeting a
+ specific host NUMA node, applications must set
:py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
:py:obj:`~.cudaMemLocationTypeHostNuma` and
:py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id must specify the
NUMA ID of the host memory node. Specifying
- :py:obj:`~.cudaMemLocationTypeHostNumaCurrent` or
- :py:obj:`~.cudaMemLocationTypeHost` as the
+ :py:obj:`~.cudaMemLocationTypeHostNumaCurrent` as the
:py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type will result in
:py:obj:`~.cudaErrorInvalidValue`. By default, the pool's memory will
be accessible from the device it is allocated on. In the case of pools
- created with :py:obj:`~.cudaMemLocationTypeHostNuma`, their default
- accessibility will be from the host CPU. Applications can control the
- maximum size of the pool by specifying a non-zero value for
+ created with :py:obj:`~.cudaMemLocationTypeHostNuma` or
+ :py:obj:`~.cudaMemLocationTypeHost`, their default accessibility will
+ be from the host CPU. Applications can control the maximum size of the
+ pool by specifying a non-zero value for
:py:obj:`~.cudaMemPoolProps.maxSize`. If set to 0, the maximum size of
the pool will default to a system dependent value.
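As a quick illustration of the host-pool rules in the hunk above, here is a minimal, non-authoritative usage sketch. It assumes the cuda.bindings.runtime module built from this series and a device that reports host memory pool support; the 1 MiB size and the stream usage are arbitrary choices, and error handling is reduced to checking cudaSuccess.

import cuda.bindings.runtime as rt

# Describe a host-located pool: for cudaMemLocationTypeHost the
# location id is ignored, and handleTypes must be 0 because such
# pools are not IPC capable (see the docstring above).
props = rt.cudaMemPoolProps()
props.allocType = rt.cudaMemAllocationType.cudaMemAllocationTypePinned
props.location.type = rt.cudaMemLocationType.cudaMemLocationTypeHost
props.handleTypes = rt.cudaMemAllocationHandleType.cudaMemHandleTypeNone
props.maxSize = 0  # 0 defers to a system-dependent maximum

err, pool = rt.cudaMemPoolCreate(props)
assert err == rt.cudaError_t.cudaSuccess

err, stream = rt.cudaStreamCreate()
err, dptr = rt.cudaMallocFromPoolAsync(1 << 20, pool, stream)  # 1 MiB

# ... use the allocation in stream order ...

(err,) = rt.cudaFreeAsync(dptr, stream)
(err,) = rt.cudaStreamSynchronize(stream)
(err,) = rt.cudaMemPoolDestroy(pool)

Because the pool is host-located, the allocation is accessible from the host CPU by default, matching the accessibility rules stated in the docstring.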
@@ -28497,6 +28264,169 @@ def cudaMemPoolDestroy(memPool):
return (_dict_cudaError_t[err],)
{{endif}}

+{{if 'cudaMemGetDefaultMemPool' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not None : cudaMemAllocationType):
+ """ Returns the default memory pool for a given location and allocation type.
+
+ The memory location can be one of
+ :py:obj:`~.cudaMemLocationTypeDevice`,
+ :py:obj:`~.cudaMemLocationTypeHost` or
+ :py:obj:`~.cudaMemLocationTypeHostNuma`. The allocation type can be one
+ of :py:obj:`~.cudaMemAllocationTypePinned` or
+ :py:obj:`~.cudaMemAllocationTypeManaged`. When the allocation type is
+ :py:obj:`~.cudaMemAllocationTypeManaged`, the location type can also be
+ :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location
+ for the managed memory pool. In all other cases, the call returns
+ :py:obj:`~.cudaErrorInvalidValue`.
+
+ Parameters
+ ----------
+ location : :py:obj:`~.cudaMemLocation`
+ None
+ typename : :py:obj:`~.cudaMemAllocationType`
+ None
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`,
+ memPool : :py:obj:`~.cudaMemPool_t`
+ None
+
+ See Also
+ --------
+ :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, :py:obj:`~.cuMemPoolSetAccess`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`
+ """
+ cdef cudaMemPool_t memPool = cudaMemPool_t()
+ cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL
+ cdef cyruntime.cudaMemAllocationType cytypename = typename.value
+ err = cyruntime.cudaMemGetDefaultMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename)
+ if err != cyruntime.cudaSuccess:
+ return (_dict_cudaError_t[err], None)
+ return (_dict_cudaError_t[err], memPool)
+{{endif}}
+
+{{if 'cudaMemGetMemPool' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None : cudaMemAllocationType):
+ """ Gets the current memory pool for a given memory location and allocation type.
+
+ The memory location can be one of
+ :py:obj:`~.cudaMemLocationTypeDevice`,
+ :py:obj:`~.cudaMemLocationTypeHost` or
+ :py:obj:`~.cudaMemLocationTypeHostNuma`. The allocation type can be one
+ of :py:obj:`~.cudaMemAllocationTypePinned` or
+ :py:obj:`~.cudaMemAllocationTypeManaged`. When the allocation type is
+ :py:obj:`~.cudaMemAllocationTypeManaged`, the location type can also be
+ :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location
+ for the managed memory pool. In all other cases, the call returns
+ :py:obj:`~.cudaErrorInvalidValue`.
+
+ Returns the last pool provided to :py:obj:`~.cudaMemSetMemPool` or
+ :py:obj:`~.cudaDeviceSetMemPool` for this location and allocation type
+ or the location's default memory pool if :py:obj:`~.cudaMemSetMemPool`
+ or :py:obj:`~.cudaDeviceSetMemPool` for that allocType and location has
+ never been called. By default, the current mempool of a location is the
+ default mempool for a device that can be obtained via
+ :py:obj:`~.cudaMemGetDefaultMemPool`. Otherwise the returned pool must have been set
+ with :py:obj:`~.cudaDeviceSetMemPool`.
+
+ Parameters
+ ----------
+ location : :py:obj:`~.cudaMemLocation`
+ None
+ typename : :py:obj:`~.cudaMemAllocationType`
+ None
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+ memPool : :py:obj:`~.cudaMemPool_t`
+ None
+
+ See Also
+ --------
+ :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool`
+ """
+ cdef cudaMemPool_t memPool = cudaMemPool_t()
+ cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL
+ cdef cyruntime.cudaMemAllocationType cytypename = typename.value
+ err = cyruntime.cudaMemGetMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename)
+ if err != cyruntime.cudaSuccess:
+ return (_dict_cudaError_t[err], None)
+ return (_dict_cudaError_t[err], memPool)
+{{endif}}
+
+{{if 'cudaMemSetMemPool' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None : cudaMemAllocationType, memPool):
+ """ Sets the current memory pool for a memory location and allocation type.
+
+ The memory location can be one of
+ :py:obj:`~.cudaMemLocationTypeDevice`,
+ :py:obj:`~.cudaMemLocationTypeHost` or
+ :py:obj:`~.cudaMemLocationTypeHostNuma`. The allocation type can be one
+ of :py:obj:`~.cudaMemAllocationTypePinned` or
+ :py:obj:`~.cudaMemAllocationTypeManaged`. When the allocation type is
+ :py:obj:`~.cudaMemAllocationTypeManaged`, the location type can also be
+ :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location
+ for the managed memory pool. In all other cases, the call returns
+ :py:obj:`~.cudaErrorInvalidValue`.
+
+ When a memory pool is set as the current memory pool, the location
+ parameter should be the same as the location of the pool. If the
+ location type or index does not match, the call returns
+ :py:obj:`~.cudaErrorInvalidValue`. The type of memory pool should also
+ match the parameter allocType. Otherwise the call returns
+ :py:obj:`~.cudaErrorInvalidValue`. By default, a memory location's
+ current memory pool is its default memory pool. If the location type is
+ :py:obj:`~.cudaMemLocationTypeDevice` and the allocation type is
+ :py:obj:`~.cudaMemAllocationTypePinned`, then this API is the
+ equivalent of calling :py:obj:`~.cudaDeviceSetMemPool` with the
+ location id as the device. For further details on the implications,
+ please refer to the documentation for :py:obj:`~.cudaDeviceSetMemPool`.
+
+ Parameters
+ ----------
+ location : :py:obj:`~.cudaMemLocation`
+ None
+ typename : :py:obj:`~.cudaMemAllocationType`
+ None
+ memPool : :py:obj:`~.CUmemoryPool` or :py:obj:`~.cudaMemPool_t`
+ None
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
+
+ See Also
+ --------
+ :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuMemPoolDestroy`, :py:obj:`~.cuMemAllocFromPoolAsync`
+
+ Notes
+ -----
+ Use :py:obj:`~.cudaMallocFromPoolAsync` to specify asynchronous allocations from a device different than the one the stream runs on.
+ """ + cdef cyruntime.cudaMemPool_t cymemPool + if memPool is None: + pmemPool = 0 + elif isinstance(memPool, (cudaMemPool_t,driver.CUmemoryPool)): + pmemPool = int(memPool) + else: + pmemPool = int(cudaMemPool_t(memPool)) + cymemPool = pmemPool + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemAllocationType cytypename = typename.value + err = cyruntime.cudaMemSetMemPool(cylocation_ptr, cytypename, cymemPool) + return (_dict_cudaError_t[err],) +{{endif}} + {{if 'cudaMallocFromPoolAsync' in found_functions}} @cython.embedsignature(True) @@ -29430,8 +29360,8 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op :py:obj:`~.cudaResourceDesc`::res::linear::sizeInBytes specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed - :py:obj:`~.cudaDeviceProp.maxTexture1DLinear`. The number of elements - is computed as (sizeInBytes / sizeof(desc)). + :py:obj:`~.cudaDeviceGetTexture1DLinearMaxWidth()`. The number of + elements is computed as (sizeInBytes / sizeof(desc)). If :py:obj:`~.cudaResourceDesc.resType` is set to :py:obj:`~.cudaResourceTypePitch2D`, @@ -29967,6 +29897,194 @@ def cudaRuntimeGetVersion(): return (_dict_cudaError_t[err], runtimeVersion) {{endif}} +{{if 'cudaLogsRegisterCallback' in found_functions}} + +@cython.embedsignature(True) +def cudaLogsRegisterCallback(callbackFunc, userData): + """ Register a callback function to receive error log messages. + + Parameters + ---------- + callbackFunc : :py:obj:`~.cudaLogsCallback_t` + The function to register as a callback + userData : Any + A generic pointer to user data. This is passed into the callback + function. + + Returns + ------- + cudaError_t + :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, + callback_out : :py:obj:`~.cudaLogsCallbackHandle` + Optional location to store the callback handle after it is + registered + """ + cdef cyruntime.cudaLogsCallback_t cycallbackFunc + if callbackFunc is None: + pcallbackFunc = 0 + elif isinstance(callbackFunc, (cudaLogsCallback_t,)): + pcallbackFunc = int(callbackFunc) + else: + pcallbackFunc = int(cudaLogsCallback_t(callbackFunc)) + cycallbackFunc = pcallbackFunc + cyuserData = utils.HelperInputVoidPtr(userData) + cdef void* cyuserData_ptr = cyuserData.cptr + cdef cudaLogsCallbackHandle callback_out = cudaLogsCallbackHandle() + with nogil: + err = cyruntime.cudaLogsRegisterCallback(cycallbackFunc, cyuserData_ptr, callback_out._pvt_ptr) + + if err != cyruntime.cudaSuccess: + return (_dict_cudaError_t[err], None) + return (_dict_cudaError_t[err], callback_out) +{{endif}} + +{{if 'cudaLogsUnregisterCallback' in found_functions}} + +@cython.embedsignature(True) +def cudaLogsUnregisterCallback(callback): + """ Unregister a log message callback. 
+
+ Parameters
+ ----------
+ callback : :py:obj:`~.cudaLogsCallbackHandle`
+ The callback instance to unregister from receiving log messages
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+ """
+ cdef cyruntime.cudaLogsCallbackHandle cycallback
+ if callback is None:
+ pcallback = 0
+ elif isinstance(callback, (cudaLogsCallbackHandle,)):
+ pcallback = int(callback)
+ else:
+ pcallback = int(cudaLogsCallbackHandle(callback))
+ cycallback = pcallback
+ err = cyruntime.cudaLogsUnregisterCallback(cycallback)
+ return (_dict_cudaError_t[err],)
+{{endif}}
+
+{{if 'cudaLogsCurrent' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaLogsCurrent(unsigned int flags):
+ """ Sets log iterator to point to the end of log buffer, where the next message would be written.
+
+ Parameters
+ ----------
+ flags : unsigned int
+ Reserved for future use, must be 0
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+ iterator_out : :py:obj:`~.cudaLogIterator`
+ Location to store an iterator to the current tail of the logs
+ """
+ cdef cudaLogIterator iterator_out = cudaLogIterator()
+ err = cyruntime.cudaLogsCurrent(iterator_out._pvt_ptr, flags)
+ if err != cyruntime.cudaSuccess:
+ return (_dict_cudaError_t[err], None)
+ return (_dict_cudaError_t[err], iterator_out)
+{{endif}}
+
+{{if 'cudaLogsDumpToFile' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaLogsDumpToFile(iterator : Optional[cudaLogIterator], char* pathToFile, unsigned int flags):
+ """ Dump accumulated driver logs into a file.
+
+ Logs generated by the driver are stored in an internal buffer and can
+ be copied out using this API. This API dumps all driver logs starting
+ from `iterator` into the `pathToFile` provided.
+
+ Parameters
+ ----------
+ iterator : :py:obj:`~.cudaLogIterator`
+ Optional auto-advancing iterator specifying the starting log to
+ read. NULL value dumps all logs.
+ pathToFile : bytes
+ Path to output file for dumping logs
+ flags : unsigned int
+ Reserved for future use, must be 0
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+ iterator : :py:obj:`~.cudaLogIterator`
+ Optional auto-advancing iterator specifying the starting log to
+ read. NULL value dumps all logs.
+
+ Notes
+ -----
+ `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+
+ The driver reserves limited memory for storing logs. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination output if the logs have been truncated. Call dump after each failed API to mitigate this risk.
+ """
+ err = cyruntime.cudaLogsDumpToFile(iterator._pvt_ptr if iterator != None else NULL, pathToFile, flags)
+ if err != cyruntime.cudaSuccess:
+ return (_dict_cudaError_t[err], None)
+ return (_dict_cudaError_t[err], iterator)
+{{endif}}
+
+{{if 'cudaLogsDumpToMemory' in found_functions}}
+
+@cython.embedsignature(True)
+def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, size_t size, unsigned int flags):
+ """ Dump accumulated driver logs into a buffer.
+
+ Logs generated by the driver are stored in an internal buffer and can
+ be copied out using this API. This API dumps driver logs from
+ `iterator` into `buffer` up to the size specified in `*size`. The
+ driver will always null terminate the buffer but there will not be a
+ null character between log entries, only a newline \n. The driver will
+ then return the actual number of bytes written in `*size`, excluding
+ the null terminator. If there are no messages to dump, `*size` will be
+ set to 0 and the function will return :py:obj:`~.cudaSuccess`. If the
+ provided `buffer` is not large enough to hold any messages, `*size`
+ will be set to 0 and the function will return
+ :py:obj:`~.cudaErrorInvalidValue`.
+
+ Parameters
+ ----------
+ iterator : :py:obj:`~.cudaLogIterator`
+ Optional auto-advancing iterator specifying the starting log to
+ read. NULL value dumps all logs.
+ buffer : bytes
+ Pointer to dump logs
+ size : int
+ See description
+ flags : unsigned int
+ Reserved for future use, must be 0
+
+ Returns
+ -------
+ cudaError_t
+ :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
+ iterator : :py:obj:`~.cudaLogIterator`
+ Optional auto-advancing iterator specifying the starting log to
+ read. NULL value dumps all logs.
+ size : int
+ See description
+
+ Notes
+ -----
+ `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+
+ The driver reserves limited memory for storing logs. The maximum size of the buffer is 25600 bytes. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination output if the logs have been truncated. Call dump after each failed API to mitigate this risk.
+
+ If the provided value in `*size` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in `buffer` and writes it out. The final message in `buffer` will always be the most recent log message as of when the API is called.
+ """
+ err = cyruntime.cudaLogsDumpToMemory(iterator._pvt_ptr if iterator != None else NULL, buffer, &size, flags)
+ if err != cyruntime.cudaSuccess:
+ return (_dict_cudaError_t[err], None, None)
+ return (_dict_cudaError_t[err], iterator, size)
+{{endif}}
+
{{if 'cudaGraphCreate' in found_functions}}

@cython.embedsignature(True)
@@ -30205,17 +30323,17 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara
{{if 'cudaGraphKernelNodeCopyAttributes' in found_functions}}

@cython.embedsignature(True)
-def cudaGraphKernelNodeCopyAttributes(hSrc, hDst):
+def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
""" Copies attributes from source node to destination node.

- Copies attributes from source node `src` to destination node `dst`.
+ Copies attributes from source node `hSrc` to destination node `hDst`.
Both node must have the same context.
Parameters ---------- - dst : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` + hDst : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` Destination node - src : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` + hSrc : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` Source node For list of attributes see :py:obj:`~.cudaKernelNodeAttrID` @@ -30228,14 +30346,6 @@ def cudaGraphKernelNodeCopyAttributes(hSrc, hDst): -------- :py:obj:`~.cudaAccessPolicyWindow` """ - cdef cyruntime.cudaGraphNode_t cyhDst - if hDst is None: - phDst = 0 - elif isinstance(hDst, (cudaGraphNode_t,driver.CUgraphNode)): - phDst = int(hDst) - else: - phDst = int(cudaGraphNode_t(hDst)) - cyhDst = phDst cdef cyruntime.cudaGraphNode_t cyhSrc if hSrc is None: phSrc = 0 @@ -30244,7 +30354,15 @@ def cudaGraphKernelNodeCopyAttributes(hSrc, hDst): else: phSrc = int(cudaGraphNode_t(hSrc)) cyhSrc = phSrc - err = cyruntime.cudaGraphKernelNodeCopyAttributes(cyhSrc, cyhDst) + cdef cyruntime.cudaGraphNode_t cyhDst + if hDst is None: + phDst = 0 + elif isinstance(hDst, (cudaGraphNode_t,driver.CUgraphNode)): + phDst = int(hDst) + else: + phDst = int(cudaGraphNode_t(hDst)) + cyhDst = phDst + err = cyruntime.cudaGraphKernelNodeCopyAttributes(cyhDst, cyhSrc) return (_dict_cudaError_t[err],) {{endif}} @@ -32400,78 +32518,6 @@ def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0): def cudaGraphGetEdges(graph, size_t numEdges = 0): """ Returns a graph's dependency edges. - Returns a list of `graph's` dependency edges. Edges are returned via - corresponding indices in `from` and `to`; that is, the node in `to`[i] - has a dependency on the node in `from`[i]. `from` and `to` may both be - NULL, in which case this function only returns the number of edges in - `numEdges`. Otherwise, `numEdges` entries will be filled in. If - `numEdges` is higher than the actual number of edges, the remaining - entries in `from` and `to` will be set to NULL, and the number of edges - actually returned will be written to `numEdges`. 
- - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to get the edges from - numEdges : int - See description - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - from : List[:py:obj:`~.cudaGraphNode_t`] - Location to return edge endpoints - to : List[:py:obj:`~.cudaGraphNode_t`] - Location to return edge endpoints - numEdges : int - See description - - See Also - -------- - :py:obj:`~.cudaGraphGetNodes`, :py:obj:`~.cudaGraphGetRootNodes`, :py:obj:`~.cudaGraphAddDependencies`, :py:obj:`~.cudaGraphRemoveDependencies`, :py:obj:`~.cudaGraphNodeGetDependencies`, :py:obj:`~.cudaGraphNodeGetDependentNodes` - """ - cdef size_t _graph_length = numEdges - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cyruntime.cudaGraphNode_t* cyfrom_ = NULL - pyfrom_ = [] - if _graph_length != 0: - cyfrom_ = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - cdef cyruntime.cudaGraphNode_t* cyto = NULL - pyto = [] - if _graph_length != 0: - cyto = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - err = cyruntime.cudaGraphGetEdges(cygraph, cyfrom_, cyto, &numEdges) - if cudaError_t(err) == cudaError_t(0): - pyfrom_ = [cudaGraphNode_t(init_value=cyfrom_[idx]) for idx in range(_graph_length)] - if cyfrom_ is not NULL: - free(cyfrom_) - if cudaError_t(err) == cudaError_t(0): - pyto = [cudaGraphNode_t(init_value=cyto[idx]) for idx in range(_graph_length)] - if cyto is not NULL: - free(cyto) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None, None, None) - return (_dict_cudaError_t[err], pyfrom_, pyto, numEdges) -{{endif}} - -{{if 'cudaGraphGetEdges_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphGetEdges_v2(graph, size_t numEdges = 0): - """ Returns a graph's dependency edges (12.3+) - Returns a list of `graph's` dependency edges. Edges are returned via corresponding indices in `from`, `to` and `edgeData`; that is, the node in `to`[i] has a dependency on the node in `from`[i] with data @@ -32537,7 +32583,7 @@ def cudaGraphGetEdges_v2(graph, size_t numEdges = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphGetEdges_v2(cygraph, cyfrom_, cyto, cyedgeData, &numEdges) + err = cyruntime.cudaGraphGetEdges(cygraph, cyfrom_, cyto, cyedgeData, &numEdges) if cudaError_t(err) == cudaError_t(0): pyfrom_ = [cudaGraphNode_t(init_value=cyfrom_[idx]) for idx in range(_graph_length)] if cyfrom_ is not NULL: @@ -32569,65 +32615,6 @@ def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0): NULL, and the number of nodes actually obtained will be returned in `pNumDependencies`. 
- Parameters - ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to query - pNumDependencies : int - See description - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pDependencies : List[:py:obj:`~.cudaGraphNode_t`] - Pointer to return the dependencies - pNumDependencies : int - See description - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetDependentNodes`, :py:obj:`~.cudaGraphGetNodes`, :py:obj:`~.cudaGraphGetRootNodes`, :py:obj:`~.cudaGraphGetEdges`, :py:obj:`~.cudaGraphAddDependencies`, :py:obj:`~.cudaGraphRemoveDependencies` - """ - cdef size_t _graph_length = pNumDependencies - cdef cyruntime.cudaGraphNode_t cynode - if node is None: - pnode = 0 - elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)): - pnode = int(node) - else: - pnode = int(cudaGraphNode_t(node)) - cynode = pnode - cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - pypDependencies = [] - if _graph_length != 0: - cypDependencies = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) - if cypDependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - err = cyruntime.cudaGraphNodeGetDependencies(cynode, cypDependencies, &pNumDependencies) - if cudaError_t(err) == cudaError_t(0): - pypDependencies = [cudaGraphNode_t(init_value=cypDependencies[idx]) for idx in range(_graph_length)] - if cypDependencies is not NULL: - free(cypDependencies) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None, None) - return (_dict_cudaError_t[err], pypDependencies, pNumDependencies) -{{endif}} - -{{if 'cudaGraphNodeGetDependencies_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphNodeGetDependencies_v2(node, size_t pNumDependencies = 0): - """ Returns a node's dependencies (12.3+) - - Returns a list of `node's` dependencies. `pDependencies` may be NULL, - in which case this function will return the number of dependencies in - `pNumDependencies`. Otherwise, `pNumDependencies` entries will be - filled in. If `pNumDependencies` is higher than the actual number of - dependencies, the remaining entries in `pDependencies` will be set to - NULL, and the number of nodes actually obtained will be returned in - `pNumDependencies`. - Note that if an edge has non-zero (non-default) edge data and `edgeData` is NULL, this API will return :py:obj:`~.cudaErrorLossyQuery`. If `edgeData` is non-NULL, then @@ -32676,7 +32663,7 @@ def cudaGraphNodeGetDependencies_v2(node, size_t pNumDependencies = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphNodeGetDependencies_v2(cynode, cypDependencies, cyedgeData, &pNumDependencies) + err = cyruntime.cudaGraphNodeGetDependencies(cynode, cypDependencies, cyedgeData, &pNumDependencies) if cudaError_t(err) == cudaError_t(0): pypDependencies = [cudaGraphNode_t(init_value=cypDependencies[idx]) for idx in range(_graph_length)] if cypDependencies is not NULL: @@ -32704,65 +32691,6 @@ def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0): will be set to NULL, and the number of nodes actually obtained will be returned in `pNumDependentNodes`. 
- Parameters - ---------- - node : :py:obj:`~.CUgraphNode` or :py:obj:`~.cudaGraphNode_t` - Node to query - pNumDependentNodes : int - See description - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - pDependentNodes : List[:py:obj:`~.cudaGraphNode_t`] - Pointer to return the dependent nodes - pNumDependentNodes : int - See description - - See Also - -------- - :py:obj:`~.cudaGraphNodeGetDependencies`, :py:obj:`~.cudaGraphGetNodes`, :py:obj:`~.cudaGraphGetRootNodes`, :py:obj:`~.cudaGraphGetEdges`, :py:obj:`~.cudaGraphAddDependencies`, :py:obj:`~.cudaGraphRemoveDependencies` - """ - cdef size_t _graph_length = pNumDependentNodes - cdef cyruntime.cudaGraphNode_t cynode - if node is None: - pnode = 0 - elif isinstance(node, (cudaGraphNode_t,driver.CUgraphNode)): - pnode = int(node) - else: - pnode = int(cudaGraphNode_t(node)) - cynode = pnode - cdef cyruntime.cudaGraphNode_t* cypDependentNodes = NULL - pypDependentNodes = [] - if _graph_length != 0: - cypDependentNodes = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) - if cypDependentNodes is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - err = cyruntime.cudaGraphNodeGetDependentNodes(cynode, cypDependentNodes, &pNumDependentNodes) - if cudaError_t(err) == cudaError_t(0): - pypDependentNodes = [cudaGraphNode_t(init_value=cypDependentNodes[idx]) for idx in range(_graph_length)] - if cypDependentNodes is not NULL: - free(cypDependentNodes) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None, None) - return (_dict_cudaError_t[err], pypDependentNodes, pNumDependentNodes) -{{endif}} - -{{if 'cudaGraphNodeGetDependentNodes_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphNodeGetDependentNodes_v2(node, size_t pNumDependentNodes = 0): - """ Returns a node's dependent nodes (12.3+) - - Returns a list of `node's` dependent nodes. `pDependentNodes` may be - NULL, in which case this function will return the number of dependent - nodes in `pNumDependentNodes`. Otherwise, `pNumDependentNodes` entries - will be filled in. If `pNumDependentNodes` is higher than the actual - number of dependent nodes, the remaining entries in `pDependentNodes` - will be set to NULL, and the number of nodes actually obtained will be - returned in `pNumDependentNodes`. - Note that if an edge has non-zero (non-default) edge data and `edgeData` is NULL, this API will return :py:obj:`~.cudaErrorLossyQuery`. 
If `edgeData` is non-NULL, then @@ -32811,7 +32739,7 @@ def cudaGraphNodeGetDependentNodes_v2(node, size_t pNumDependentNodes = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphNodeGetDependentNodes_v2(cynode, cypDependentNodes, cyedgeData, &pNumDependentNodes) + err = cyruntime.cudaGraphNodeGetDependentNodes(cynode, cypDependentNodes, cyedgeData, &pNumDependentNodes) if cudaError_t(err) == cudaError_t(0): pypDependentNodes = [cudaGraphNode_t(init_value=cypDependentNodes[idx]) for idx in range(_graph_length)] if cypDependentNodes is not NULL: @@ -32828,7 +32756,7 @@ def cudaGraphNodeGetDependentNodes_v2(node, size_t pNumDependentNodes = 0): {{if 'cudaGraphAddDependencies' in found_functions}} @cython.embedsignature(True) -def cudaGraphAddDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], size_t numDependencies): +def cudaGraphAddDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], edgeData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies): """ Adds dependency edges to a graph. The number of dependencies to be added is defined by `numDependencies` @@ -32838,79 +32766,6 @@ def cudaGraphAddDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | Li If `numDependencies` is 0, elements in `pFrom` and `pTo` will be ignored. Specifying an existing dependency will return an error. - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which dependencies are added - from : List[:py:obj:`~.cudaGraphNode_t`] - Array of nodes that provide the dependencies - to : List[:py:obj:`~.cudaGraphNode_t`] - Array of dependent nodes - numDependencies : size_t - Number of dependencies to be added - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphRemoveDependencies`, :py:obj:`~.cudaGraphGetEdges`, :py:obj:`~.cudaGraphNodeGetDependencies`, :py:obj:`~.cudaGraphNodeGetDependentNodes` - """ - to = [] if to is None else to - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in to): - raise TypeError("Argument 'to' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - from_ = [] if from_ is None else from_ - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in from_): - raise TypeError("Argument 'from_' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cyruntime.cudaGraphNode_t* cyfrom_ = NULL - if len(from_) > 0: - cyfrom_ = calloc(len(from_), sizeof(cyruntime.cudaGraphNode_t)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(from_)): - cyfrom_[idx] = (from_[idx])._pvt_ptr[0] - cdef 
cyruntime.cudaGraphNode_t* cyto = NULL - if len(to) > 0: - cyto = calloc(len(to), sizeof(cyruntime.cudaGraphNode_t)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(to)): - cyto[idx] = (to[idx])._pvt_ptr[0] - if numDependencies > len(from_): raise RuntimeError("List is too small: " + str(len(from_)) + " < " + str(numDependencies)) - if numDependencies > len(to): raise RuntimeError("List is too small: " + str(len(to)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphAddDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, numDependencies) - if cyfrom_ is not NULL: - free(cyfrom_) - if cyto is not NULL: - free(cyto) - return (_dict_cudaError_t[err],) -{{endif}} - -{{if 'cudaGraphAddDependencies_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphAddDependencies_v2(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], edgeData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies): - """ Adds dependency edges to a graph. (12.3+) - - The number of dependencies to be added is defined by `numDependencies` - Elements in `pFrom` and `pTo` at corresponding indices define a - dependency. Each node in `pFrom` and `pTo` must belong to `graph`. - - If `numDependencies` is 0, elements in `pFrom` and `pTo` will be - ignored. Specifying an existing dependency will return an error. - Parameters ---------- graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` @@ -32974,7 +32829,7 @@ def cudaGraphAddDependencies_v2(graph, from_ : Optional[Tuple[cudaGraphNode_t] | raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaGraphAddDependencies_v2(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) + err = cyruntime.cudaGraphAddDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) if cyfrom_ is not NULL: free(cyfrom_) if cyto is not NULL: @@ -32987,7 +32842,7 @@ def cudaGraphAddDependencies_v2(graph, from_ : Optional[Tuple[cudaGraphNode_t] | {{if 'cudaGraphRemoveDependencies' in found_functions}} @cython.embedsignature(True) -def cudaGraphRemoveDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], size_t numDependencies): +def cudaGraphRemoveDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], edgeData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies): """ Removes dependency edges from a graph. The number of `pDependencies` to be removed is defined by @@ -32995,80 +32850,6 @@ def cudaGraphRemoveDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | indices define a dependency. Each node in `pFrom` and `pTo` must belong to `graph`. 
- If `numDependencies` is 0, elements in `pFrom` and `pTo` will be - ignored. Specifying a non-existing dependency will return an error. - - Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph from which to remove dependencies - from : List[:py:obj:`~.cudaGraphNode_t`] - Array of nodes that provide the dependencies - to : List[:py:obj:`~.cudaGraphNode_t`] - Array of dependent nodes - numDependencies : size_t - Number of dependencies to be removed - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` - - See Also - -------- - :py:obj:`~.cudaGraphAddDependencies`, :py:obj:`~.cudaGraphGetEdges`, :py:obj:`~.cudaGraphNodeGetDependencies`, :py:obj:`~.cudaGraphNodeGetDependentNodes` - """ - to = [] if to is None else to - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in to): - raise TypeError("Argument 'to' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - from_ = [] if from_ is None else from_ - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in from_): - raise TypeError("Argument 'from_' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cyruntime.cudaGraphNode_t* cyfrom_ = NULL - if len(from_) > 0: - cyfrom_ = calloc(len(from_), sizeof(cyruntime.cudaGraphNode_t)) - if cyfrom_ is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(from_)): - cyfrom_[idx] = (from_[idx])._pvt_ptr[0] - cdef cyruntime.cudaGraphNode_t* cyto = NULL - if len(to) > 0: - cyto = calloc(len(to), sizeof(cyruntime.cudaGraphNode_t)) - if cyto is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(to)): - cyto[idx] = (to[idx])._pvt_ptr[0] - if numDependencies > len(from_): raise RuntimeError("List is too small: " + str(len(from_)) + " < " + str(numDependencies)) - if numDependencies > len(to): raise RuntimeError("List is too small: " + str(len(to)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphRemoveDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, numDependencies) - if cyfrom_ is not NULL: - free(cyfrom_) - if cyto is not NULL: - free(cyto) - return (_dict_cudaError_t[err],) -{{endif}} - -{{if 'cudaGraphRemoveDependencies_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphRemoveDependencies_v2(graph, from_ : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], to : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], edgeData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies): - """ Removes dependency edges from a graph. (12.3+) - - The number of `pDependencies` to be removed is defined by - `numDependencies`. Elements in `pFrom` and `pTo` at corresponding - indices define a dependency. Each node in `pFrom` and `pTo` must belong - to `graph`. - If `numDependencies` is 0, elements in `pFrom` and `pTo` will be ignored. 
Specifying an edge that does not exist in the graph, with data matching `edgeData`, results in an error. `edgeData` is nullable, which @@ -33137,7 +32918,7 @@ def cudaGraphRemoveDependencies_v2(graph, from_ : Optional[Tuple[cudaGraphNode_t raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaGraphRemoveDependencies_v2(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) + err = cyruntime.cudaGraphRemoveDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) if cyfrom_ is not NULL: free(cyfrom_) if cyto is not NULL: @@ -34989,7 +34770,7 @@ def cudaGraphReleaseUserObject(graph, object, unsigned int count): {{if 'cudaGraphAddNode' in found_functions}} @cython.embedsignature(True) -def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaGraphNodeParams]): +def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], dependencyData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies, nodeParams : Optional[cudaGraphNodeParams]): """ Adds a node of arbitrary type to a graph. Creates a new node in `graph` described by `nodeParams` with @@ -35011,83 +34792,6 @@ def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | Li A handle to the new node will be returned in `phGraphNode`. 
- Parameters - ---------- - graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` - Graph to which to add the node - pDependencies : List[:py:obj:`~.cudaGraphNode_t`] - Dependencies of the node - numDependencies : size_t - Number of dependencies - nodeParams : :py:obj:`~.cudaGraphNodeParams` - Specification of the node - - Returns - ------- - cudaError_t - :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorNotSupported` - pGraphNode : :py:obj:`~.cudaGraphNode_t` - Returns newly created node - - See Also - -------- - :py:obj:`~.cudaGraphCreate`, :py:obj:`~.cudaGraphNodeSetParams`, :py:obj:`~.cudaGraphExecNodeSetParams` - """ - pDependencies = [] if pDependencies is None else pDependencies - if not all(isinstance(_x, (cudaGraphNode_t,driver.CUgraphNode)) for _x in pDependencies): - raise TypeError("Argument 'pDependencies' is not instance of type (expected Tuple[cyruntime.cudaGraphNode_t,driver.CUgraphNode] or List[cyruntime.cudaGraphNode_t,driver.CUgraphNode]") - cdef cyruntime.cudaGraph_t cygraph - if graph is None: - pgraph = 0 - elif isinstance(graph, (cudaGraph_t,driver.CUgraph)): - pgraph = int(graph) - else: - pgraph = int(cudaGraph_t(graph)) - cygraph = pgraph - cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() - cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: - cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) - if cypDependencies is NULL: - raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - else: - for idx in range(len(pDependencies)): - cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] - if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cynodeParams_ptr) - if cypDependencies is not NULL: - free(cypDependencies) - if err != cyruntime.cudaSuccess: - return (_dict_cudaError_t[err], None) - return (_dict_cudaError_t[err], pGraphNode) -{{endif}} - -{{if 'cudaGraphAddNode_v2' in found_functions}} - -@cython.embedsignature(True) -def cudaGraphAddNode_v2(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | List[cudaGraphNode_t]], dependencyData : Optional[Tuple[cudaGraphEdgeData] | List[cudaGraphEdgeData]], size_t numDependencies, nodeParams : Optional[cudaGraphNodeParams]): - """ Adds a node of arbitrary type to a graph (12.3+) - - Creates a new node in `graph` described by `nodeParams` with - `numDependencies` dependencies specified via `pDependencies`. - `numDependencies` may be 0. `pDependencies` may be null if - `numDependencies` is 0. `pDependencies` may not have any duplicate - entries. - - `nodeParams` is a tagged union. The node type should be specified in - the `typename` field, and type-specific parameters in the corresponding - union member. All unused bytes - that is, `reserved0` and all bytes - past the utilized union member - must be set to zero. It is recommended - to use brace initialization or memset to ensure all bytes are - initialized. 
- - Note that for some node types, `nodeParams` may contain "out - parameters" which are modified during the call, such as - `nodeParams->alloc.dptr`. - - A handle to the new node will be returned in `phGraphNode`. - Parameters ---------- graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t` @@ -35143,10 +34847,8 @@ def cudaGraphAddNode_v2(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - if numDependencies > len(dependencyData): raise RuntimeError("List is too small: " + str(len(dependencyData)) + " < " + str(numDependencies)) cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddNode_v2(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) + err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) if cypDependencies is not NULL: free(cypDependencies) if cydependencyData is not NULL: @@ -35315,6 +35017,8 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags): """ Returns the requested driver API function pointer. + [Deprecated] + Returns in `**funcPtr` the address of the CUDA driver function for the requested flags. @@ -35397,6 +35101,10 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags): See Also -------- :py:obj:`~.cuGetProcAddress` + + Notes + ----- + This API is deprecated and :py:obj:`~.cudaGetDriverEntryPointByVersion` (with a hardcoded :py:obj:`~.cudaVersion`) should be used instead. 
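As a migration sketch for the deprecation note above (hedged: the by-version variant is assumed to return the same `(err, funcPtr, driverStatus)` shape as the deprecated call, with the CUDA version supplied explicitly, e.g. 12000 for CUDA 12.0):

    from cuda.bindings import runtime as cudart

    # Look up a driver symbol against a hardcoded cudaVersion instead of the
    # deprecated, version-implicit cudaGetDriverEntryPoint.
    err, funcPtr, driverStatus = cudart.cudaGetDriverEntryPointByVersion(
        b"cuDeviceGetCount",       # driver API symbol, without any version suffix
        12000,                     # hardcoded cudaVersion, per the note above
        cudart.cudaEnableDefault,  # same flag semantics as the deprecated API
    )
    assert err == cudart.cudaError_t.cudaSuccess
    assert driverStatus == cudart.cudaDriverEntryPointQueryResult.cudaDriverEntryPointSuccess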
""" cdef void_ptr funcPtr = 0 cdef cyruntime.cudaDriverEntryPointQueryResult driverStatus @@ -37768,6 +37476,12 @@ def sizeof(objType): {{if 'cudaAsyncCallback' in found_types}} if objType == cudaAsyncCallback: return sizeof(cyruntime.cudaAsyncCallback){{endif}} + {{if 'cudaLogsCallbackHandle' in found_types}} + if objType == cudaLogsCallbackHandle: + return sizeof(cyruntime.cudaLogsCallbackHandle){{endif}} + {{if 'cudaLogIterator' in found_types}} + if objType == cudaLogIterator: + return sizeof(cyruntime.cudaLogIterator){{endif}} {{if 'cudaSurfaceObject_t' in found_types}} if objType == cudaSurfaceObject_t: return sizeof(cyruntime.cudaSurfaceObject_t){{endif}} @@ -37780,6 +37494,9 @@ def sizeof(objType): {{if 'cudaStreamCallback_t' in found_types}} if objType == cudaStreamCallback_t: return sizeof(cyruntime.cudaStreamCallback_t){{endif}} + {{if 'cudaLogsCallback_t' in found_types}} + if objType == cudaLogsCallback_t: + return sizeof(cyruntime.cudaLogsCallback_t){{endif}} {{if True}} if objType == GLenum: return sizeof(cyruntime.GLenum){{endif}} diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index ca41b4fb75..d1ec48bb6e 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -84,7 +84,7 @@ Data types used by CUDA driver .. autoclass:: cuda.bindings.driver.CUgraphNodeParams_st .. autoclass:: cuda.bindings.driver.CUcheckpointLockArgs_st .. autoclass:: cuda.bindings.driver.CUcheckpointCheckpointArgs_st -.. autoclass:: cuda.bindings.driver.CUcheckpointRestoreArgs_st +.. autoclass:: cuda.bindings.driver.CUcheckpointGpuPair_st .. autoclass:: cuda.bindings.driver.CUcheckpointUnlockArgs_st .. autoclass:: cuda.bindings.driver.CUeglFrame_st .. autoclass:: cuda.bindings.driver.CUipcMem_flags @@ -1339,7 +1339,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED - Link between the device and the host supports native atomic operations + Link between the device and the host supports all native atomic operations .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO @@ -1690,6 +1690,30 @@ Data types used by CUDA driver Device supports HOST_NUMA location IPC between nodes in a multi-node system. + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED + + + Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs + + + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED + + + Device supports HOST location with the virtual memory management APIs like :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap` and related APIs + + + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_ALLOC_DMA_BUF_SUPPORTED + + + Device supports page-locked host memory buffer sharing with dma_buf mechanism. + + + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_ONLY_PARTIAL_HOST_NATIVE_ATOMIC_SUPPORTED + + + Link between the device and the host supports only some native atomic operations + + .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX .. autoclass:: cuda.bindings.driver.CUpointer_attribute @@ -1872,7 +1896,7 @@ Data types used by CUDA driver .. 
autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES - The maximum size in bytes of dynamically-allocated shared memory that can be used by this function. If the user-specified dynamic shared memory size is larger than this value, the launch will fail. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute` + The maximum size in bytes of dynamically-allocated shared memory that can be used by this function. If the user-specified dynamic shared memory size is larger than this value, the launch will fail. The default value of this attribute is :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK` - :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`, except when :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES` is greater than :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`, then the default value of this attribute is 0. The value can be increased to :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN` - :py:obj:`~.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute` .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT @@ -2488,6 +2512,14 @@ Data types used by CUDA driver Applies to: compiler only + .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_SPLIT_COMPILE + + + This option specifies the maximum number of concurrent threads to use when running compiler optimizations. If the specified value is 1, the option will be ignored. If the specified value is 0, the number of threads will match the number of CPUs on the underlying machine. Otherwise, if the option is N, then up to N threads will be used. Option type: unsigned int + + Applies to: compiler only + + .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_NUM_OPTIONS .. autoclass:: cuda.bindings.driver.CUjit_target @@ -2612,6 +2644,12 @@ Data types used by CUDA driver Compute device class 10.1. + .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_110 + + + Compute device class 11.0. + + .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_103 @@ -2645,6 +2683,12 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_101A + Compute device class 11.0 with accelerated features. + + + .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_110A + + Compute device class 10.3. with accelerated features. @@ -2675,6 +2719,12 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_101F + Compute device class 11.0 with family features. + + + .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_110F + + Compute device class 10.3. with family features. @@ -3045,7 +3095,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_BATCH_MEM_OP - Batch MemOp Node + Batch MemOp Node See :py:obj:`~.cuStreamBatchMemOp` and :py:obj:`~.CUstreamBatchMemOpType` for what these nodes can do. .. autoattribute:: cuda.bindings.driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL @@ -3285,6 +3335,20 @@ Data types used by CUDA driver Valid for launches. 
On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.CUlaunchAttributeValue.sharedMemCarveout` to a percentage between 0-100 signals the CUDA driver to set the shared memory carveout preference, in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`. This is only a hint, and the CUDA driver can choose a different configuration if required for the launch. + + .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING + + + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + + + + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (e.g., CUDA doesn't guarantee that performance characteristics will be maintained between driver versions, and a driver update could alter or regress previously observed performance characteristics). It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + + Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). + .. autoclass:: cuda.bindings.driver.CUstreamCaptureStatus .. autoattribute:: cuda.bindings.driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE @@ -3447,6 +3511,12 @@ Data types used by CUDA driver This indicates that the CUDA driver that the application has loaded is a stub library. Applications that run with the stub rather than a real driver loaded will result in CUDA API returning this error. + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_CALL_REQUIRES_NEWER_DRIVER + + + This indicates that the API call requires a newer CUDA driver than the one currently installed. Users should install an updated NVIDIA CUDA driver to allow the API call to succeed. + + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_DEVICE_UNAVAILABLE @@ -3621,6 +3691,12 @@ Data types used by CUDA driver This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_NVLINK_ENCRYPTION_FAILED + + + This indicates that an NVLink encryption error was detected during the execution. + + .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_INVALID_SOURCE @@ -4003,7 +4079,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUdevice_P2PAttribute.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED - Atomic operation over the link supported + All CUDA-valid atomic operations over the link are supported ..
autoattribute:: cuda.bindings.driver.CUdevice_P2PAttribute.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED @@ -4017,6 +4093,77 @@ Data types used by CUDA driver Accessing CUDA arrays over the link supported + + .. autoattribute:: cuda.bindings.driver.CUdevice_P2PAttribute.CU_DEVICE_P2P_ATTRIBUTE_ONLY_PARTIAL_NATIVE_ATOMIC_SUPPORTED + + + Only some CUDA-valid atomic operations over the link are supported. + +.. autoclass:: cuda.bindings.driver.CUatomicOperation + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_ADD + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MIN + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_MAX + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_INCREMENT + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_INTEGER_DECREMENT + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_AND + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_OR + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_XOR + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_EXCHANGE + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_CAS + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_ADD + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MIN + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_FLOAT_MAX + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperation.CU_ATOMIC_OPERATION_MAX + +.. autoclass:: cuda.bindings.driver.CUatomicOperationCapability + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SIGNED + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_UNSIGNED + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_REDUCTION + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_32 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_64 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_SCALAR_128 + + + .. autoattribute:: cuda.bindings.driver.CUatomicOperationCapability.CU_ATOMIC_CAPABILITY_VECTOR_32x4 + .. autoclass:: cuda.bindings.driver.CUresourceViewFormat .. autoattribute:: cuda.bindings.driver.CUresourceViewFormat.CU_RES_VIEW_FORMAT_NONE @@ -4404,6 +4551,12 @@ Data types used by CUDA driver Handle is an NvSciBuf object + + .. autoattribute:: cuda.bindings.driver.CUexternalMemoryHandleType.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD + + + Handle is a dma_buf file descriptor + .. autoclass:: cuda.bindings.driver.CUexternalSemaphoreHandleType .. autoattribute:: cuda.bindings.driver.CUexternalSemaphoreHandleType.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD @@ -4526,6 +4679,12 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVALID + .. autoattribute:: cuda.bindings.driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE + + + Location is unspecified. This is used when creating a managed memory pool to indicate no preferred location for the pool + + .. 
autoattribute:: cuda.bindings.driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE @@ -4563,6 +4722,12 @@ Data types used by CUDA driver This allocation type is 'pinned', i.e. cannot migrate from its current location while the application is actively using it + .. autoattribute:: cuda.bindings.driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED + + + This allocation type is managed memory + + .. autoattribute:: cuda.bindings.driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MAX .. autoclass:: cuda.bindings.driver.CUmemAllocationGranularity_flags @@ -5959,7 +6124,7 @@ Data types used by CUDA driver .. autoclass:: cuda.bindings.driver.CUgraphNodeParams .. autoclass:: cuda.bindings.driver.CUcheckpointLockArgs .. autoclass:: cuda.bindings.driver.CUcheckpointCheckpointArgs -.. autoclass:: cuda.bindings.driver.CUcheckpointRestoreArgs +.. autoclass:: cuda.bindings.driver.CUcheckpointGpuPair .. autoclass:: cuda.bindings.driver.CUcheckpointUnlockArgs .. autoclass:: cuda.bindings.driver.CUeglFrame_v1 .. autoclass:: cuda.bindings.driver.CUeglFrame @@ -6254,11 +6419,11 @@ This section describes the device management functions of the low-level CUDA dri .. autofunction:: cuda.bindings.driver.cuDeviceGetCount .. autofunction:: cuda.bindings.driver.cuDeviceGetName .. autofunction:: cuda.bindings.driver.cuDeviceGetUuid -.. autofunction:: cuda.bindings.driver.cuDeviceGetUuid_v2 .. autofunction:: cuda.bindings.driver.cuDeviceGetLuid .. autofunction:: cuda.bindings.driver.cuDeviceTotalMem .. autofunction:: cuda.bindings.driver.cuDeviceGetTexture1DLinearMaxWidth .. autofunction:: cuda.bindings.driver.cuDeviceGetAttribute +.. autofunction:: cuda.bindings.driver.cuDeviceGetHostAtomicCapabilities .. autofunction:: cuda.bindings.driver.cuDeviceGetNvSciSyncAttributes .. autofunction:: cuda.bindings.driver.cuDeviceSetMemPool .. autofunction:: cuda.bindings.driver.cuDeviceGetMemPool @@ -6291,18 +6456,18 @@ This section describes the context management functions of the low-level CUDA dr Please note that some functions are described in Primary Context Management section. .. autofunction:: cuda.bindings.driver.cuCtxCreate -.. autofunction:: cuda.bindings.driver.cuCtxCreate_v3 -.. autofunction:: cuda.bindings.driver.cuCtxCreate_v4 .. autofunction:: cuda.bindings.driver.cuCtxDestroy .. autofunction:: cuda.bindings.driver.cuCtxPushCurrent .. autofunction:: cuda.bindings.driver.cuCtxPopCurrent .. autofunction:: cuda.bindings.driver.cuCtxSetCurrent .. autofunction:: cuda.bindings.driver.cuCtxGetCurrent .. autofunction:: cuda.bindings.driver.cuCtxGetDevice +.. autofunction:: cuda.bindings.driver.cuCtxGetDevice_v2 .. autofunction:: cuda.bindings.driver.cuCtxGetFlags .. autofunction:: cuda.bindings.driver.cuCtxSetFlags .. autofunction:: cuda.bindings.driver.cuCtxGetId .. autofunction:: cuda.bindings.driver.cuCtxSynchronize +.. autofunction:: cuda.bindings.driver.cuCtxSynchronize_v2 .. autofunction:: cuda.bindings.driver.cuCtxSetLimit .. autofunction:: cuda.bindings.driver.cuCtxGetLimit .. autofunction:: cuda.bindings.driver.cuCtxGetCacheConfig @@ -6535,6 +6700,9 @@ Whether or not a device supports the integrated stream ordered memory allocator .. autofunction:: cuda.bindings.driver.cuMemPoolGetAccess .. autofunction:: cuda.bindings.driver.cuMemPoolCreate .. autofunction:: cuda.bindings.driver.cuMemPoolDestroy +.. autofunction:: cuda.bindings.driver.cuMemGetDefaultMemPool +.. autofunction:: cuda.bindings.driver.cuMemGetMemPool +.. autofunction:: cuda.bindings.driver.cuMemSetMemPool .. 
autofunction:: cuda.bindings.driver.cuMemAllocFromPoolAsync .. autofunction:: cuda.bindings.driver.cuMemPoolExportToShareableHandle .. autofunction:: cuda.bindings.driver.cuMemPoolImportFromShareableHandle @@ -6650,9 +6818,10 @@ This device address may be queried using cuMemHostGetDevicePointer() when a cont .. autofunction:: cuda.bindings.driver.cuPointerGetAttribute .. autofunction:: cuda.bindings.driver.cuMemPrefetchAsync -.. autofunction:: cuda.bindings.driver.cuMemPrefetchAsync_v2 .. autofunction:: cuda.bindings.driver.cuMemAdvise -.. autofunction:: cuda.bindings.driver.cuMemAdvise_v2 +.. autofunction:: cuda.bindings.driver.cuMemPrefetchBatchAsync +.. autofunction:: cuda.bindings.driver.cuMemDiscardBatchAsync +.. autofunction:: cuda.bindings.driver.cuMemDiscardAndPrefetchBatchAsync .. autofunction:: cuda.bindings.driver.cuMemRangeGetAttribute .. autofunction:: cuda.bindings.driver.cuMemRangeGetAttributes .. autofunction:: cuda.bindings.driver.cuPointerSetAttribute @@ -6679,9 +6848,7 @@ This section describes the stream management functions of the low-level CUDA dri .. autofunction:: cuda.bindings.driver.cuStreamEndCapture .. autofunction:: cuda.bindings.driver.cuStreamIsCapturing .. autofunction:: cuda.bindings.driver.cuStreamGetCaptureInfo -.. autofunction:: cuda.bindings.driver.cuStreamGetCaptureInfo_v3 .. autofunction:: cuda.bindings.driver.cuStreamUpdateCaptureDependencies -.. autofunction:: cuda.bindings.driver.cuStreamUpdateCaptureDependencies_v2 .. autofunction:: cuda.bindings.driver.cuStreamAttachMemAsync .. autofunction:: cuda.bindings.driver.cuStreamQuery .. autofunction:: cuda.bindings.driver.cuStreamSynchronize @@ -6702,7 +6869,6 @@ This section describes the event management functions of the low-level CUDA driv .. autofunction:: cuda.bindings.driver.cuEventSynchronize .. autofunction:: cuda.bindings.driver.cuEventDestroy .. autofunction:: cuda.bindings.driver.cuEventElapsedTime -.. autofunction:: cuda.bindings.driver.cuEventElapsedTime_v2 External Resource Interoperability ---------------------------------- @@ -6832,15 +6998,10 @@ This section describes the graph management functions of the low-level CUDA driv .. autofunction:: cuda.bindings.driver.cuGraphGetNodes .. autofunction:: cuda.bindings.driver.cuGraphGetRootNodes .. autofunction:: cuda.bindings.driver.cuGraphGetEdges -.. autofunction:: cuda.bindings.driver.cuGraphGetEdges_v2 .. autofunction:: cuda.bindings.driver.cuGraphNodeGetDependencies -.. autofunction:: cuda.bindings.driver.cuGraphNodeGetDependencies_v2 .. autofunction:: cuda.bindings.driver.cuGraphNodeGetDependentNodes -.. autofunction:: cuda.bindings.driver.cuGraphNodeGetDependentNodes_v2 .. autofunction:: cuda.bindings.driver.cuGraphAddDependencies -.. autofunction:: cuda.bindings.driver.cuGraphAddDependencies_v2 .. autofunction:: cuda.bindings.driver.cuGraphRemoveDependencies -.. autofunction:: cuda.bindings.driver.cuGraphRemoveDependencies_v2 .. autofunction:: cuda.bindings.driver.cuGraphDestroyNode .. autofunction:: cuda.bindings.driver.cuGraphInstantiate .. autofunction:: cuda.bindings.driver.cuGraphInstantiateWithParams @@ -6871,7 +7032,6 @@ This section describes the graph management functions of the low-level CUDA driv .. autofunction:: cuda.bindings.driver.cuGraphRetainUserObject .. autofunction:: cuda.bindings.driver.cuGraphReleaseUserObject .. autofunction:: cuda.bindings.driver.cuGraphAddNode -.. autofunction:: cuda.bindings.driver.cuGraphAddNode_v2 .. autofunction:: cuda.bindings.driver.cuGraphNodeSetParams .. 
autofunction:: cuda.bindings.driver.cuGraphExecNodeSetParams .. autofunction:: cuda.bindings.driver.cuGraphConditionalHandleCreate @@ -6928,6 +7088,7 @@ This section describes the direct peer context memory access functions of the lo .. autofunction:: cuda.bindings.driver.cuCtxEnablePeerAccess .. autofunction:: cuda.bindings.driver.cuCtxDisablePeerAccess .. autofunction:: cuda.bindings.driver.cuDeviceGetP2PAttribute +.. autofunction:: cuda.bindings.driver.cuDeviceGetP2PAtomicCapabilities Graphics Interoperability ------------------------- @@ -7054,9 +7215,13 @@ There are 4 main steps to using these new set of APIs. -For ``CU_DEV_RESOURCE_TYPE_SM``\ , the partitions created have minimum SM count requirements, often rounding up and aligning the minCount provided to cuDevSmResourceSplitByCount. The following is a guideline for each architecture and may be subject to change: +For ``CU_DEV_RESOURCE_TYPE_SM``\ , the partitions created have minimum SM count requirements, often rounding up and aligning the minCount provided to cuDevSmResourceSplitByCount. These requirements can be queried with cuDeviceGetDevResource from step (1) above to determine the minimum partition size (``sm.minSmPartitionSize``\ ) and alignment granularity (``sm.smCoscheduledAlignment``\ ). + + + +While it's recommended to use cuDeviceGetDevResource for accurate information, here is a guideline for each compute architecture: -- On Compute Architecture 6.X: The minimum count is 1 SM. +- On Compute Architecture 6.X: The minimum count is 2 SMs and must be a multiple of 2. @@ -7151,6 +7316,7 @@ Even if the green contexts have disjoint SM partitions, it is not guaranteed tha .. autofunction:: cuda.bindings.driver.cuGreenCtxWaitEvent .. autofunction:: cuda.bindings.driver.cuStreamGetGreenCtx .. autofunction:: cuda.bindings.driver.cuGreenCtxStreamCreate +.. autofunction:: cuda.bindings.driver.cuGreenCtxGetId .. autoattribute:: cuda.bindings.driver.RESOURCE_ABI_VERSION .. autoattribute:: cuda.bindings.driver.RESOURCE_ABI_EXTERNAL_BYTES .. autoattribute:: cuda.bindings.driver._CONCAT_INNER @@ -7202,7 +7368,6 @@ Checkpoint and restore capabilities are currently restricted to Linux. .. autofunction:: cuda.bindings.driver.cuCheckpointProcessGetState .. autofunction:: cuda.bindings.driver.cuCheckpointProcessLock .. autofunction:: cuda.bindings.driver.cuCheckpointProcessCheckpoint -.. autofunction:: cuda.bindings.driver.cuCheckpointProcessRestore .. autofunction:: cuda.bindings.driver.cuCheckpointProcessUnlock EGL Interoperability diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index e52afa4dc3..220246d854 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -61,6 +61,9 @@ NVRTC defines the following enumeration type and function for API call error han .. autoattribute:: cuda.bindings.nvrtc.nvrtcResult.NVRTC_ERROR_CANCELLED + + .. autoattribute:: cuda.bindings.nvrtc.nvrtcResult.NVRTC_ERROR_TIME_TRACE_FILE_WRITE_FAILED + .. autofunction:: cuda.bindings.nvrtc.nvrtcGetErrorString General Information Query @@ -85,8 +88,6 @@ NVRTC defines the following type and functions for actual compilation. .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPTX .. autofunction:: cuda.bindings.nvrtc.nvrtcGetCUBINSize .. autofunction:: cuda.bindings.nvrtc.nvrtcGetCUBIN -.. autofunction:: cuda.bindings.nvrtc.nvrtcGetNVVMSize -.. autofunction:: cuda.bindings.nvrtc.nvrtcGetNVVM .. autofunction:: cuda.bindings.nvrtc.nvrtcGetLTOIRSize .. 
autofunction:: cuda.bindings.nvrtc.nvrtcGetLTOIR .. autofunction:: cuda.bindings.nvrtc.nvrtcGetOptiXIRSize @@ -242,7 +243,7 @@ Enable device code optimization. When specified along with ``-G``\ , enables lim - ``--Ofast-compile={0|min|mid|max}``\ (``-Ofc``\ ) -Specify level to prefer device code compilation speed, where 'max' focuses only on the fastest compilation speed, 'mid' balances compile time and runtime, 'min' has a more minimal impact on both, and 0 (default) is normal compilation +Specify the fast-compile level for device code, which controls the tradeoff between compilation speed and runtime performance by disabling certain optimizations at varying levels. @@ -420,7 +421,7 @@ Disable the use of cache for both ptx and cubin code generation. - ``--frandom-seed``\ (``-frandom-seed``\ ) -The user specified random seed will be used to replace random numbers used in generating symbol names and variable names. The option can be used to generate deterministicly identical ptx and object files. If the input value is a valid number (decimal, octal, or hex), it will be used directly as the random seed. Otherwise, the CRC value of the passed string will be used instead. +The user specified random seed will be used to replace random numbers used in generating symbol names and variable names. The option can be used to generate deterministically identical ptx and object files. If the input value is a valid number (decimal, octal, or hex), it will be used directly as the random seed. Otherwise, the CRC value of the passed string will be used instead. diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index b7301c913a..653d6eba98 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -46,11 +46,13 @@ This section describes the device management functions of the CUDA runtime appli .. autofunction:: cuda.bindings.runtime.cudaGetDeviceCount .. autofunction:: cuda.bindings.runtime.cudaGetDeviceProperties .. autofunction:: cuda.bindings.runtime.cudaDeviceGetAttribute +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetHostAtomicCapabilities .. autofunction:: cuda.bindings.runtime.cudaDeviceGetDefaultMemPool .. autofunction:: cuda.bindings.runtime.cudaDeviceSetMemPool .. autofunction:: cuda.bindings.runtime.cudaDeviceGetMemPool .. autofunction:: cuda.bindings.runtime.cudaDeviceGetNvSciSyncAttributes .. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAttribute +.. autofunction:: cuda.bindings.runtime.cudaDeviceGetP2PAtomicCapabilities .. autofunction:: cuda.bindings.runtime.cudaChooseDevice .. autofunction:: cuda.bindings.runtime.cudaInitDevice .. autofunction:: cuda.bindings.runtime.cudaSetDevice @@ -97,9 +99,7 @@ This section describes the stream management functions of the CUDA runtime appli .. autofunction:: cuda.bindings.runtime.cudaStreamEndCapture .. autofunction:: cuda.bindings.runtime.cudaStreamIsCapturing .. autofunction:: cuda.bindings.runtime.cudaStreamGetCaptureInfo -.. autofunction:: cuda.bindings.runtime.cudaStreamGetCaptureInfo_v3 .. autofunction:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependencies -.. autofunction:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependencies_v2 Event Management ---------------- @@ -114,7 +114,6 @@ This section describes the event management functions of the CUDA runtime applic .. autofunction:: cuda.bindings.runtime.cudaEventSynchronize .. autofunction:: cuda.bindings.runtime.cudaEventDestroy .. 
autofunction:: cuda.bindings.runtime.cudaEventElapsedTime -.. autofunction:: cuda.bindings.runtime.cudaEventElapsedTime_v2 External Resource Interoperability ---------------------------------- @@ -219,9 +218,10 @@ Some functions have overloaded C++ API template versions documented separately i .. autofunction:: cuda.bindings.runtime.cudaMemset2DAsync .. autofunction:: cuda.bindings.runtime.cudaMemset3DAsync .. autofunction:: cuda.bindings.runtime.cudaMemPrefetchAsync -.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchAsync_v2 +.. autofunction:: cuda.bindings.runtime.cudaMemPrefetchBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemDiscardBatchAsync +.. autofunction:: cuda.bindings.runtime.cudaMemDiscardAndPrefetchBatchAsync .. autofunction:: cuda.bindings.runtime.cudaMemAdvise -.. autofunction:: cuda.bindings.runtime.cudaMemAdvise_v2 .. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttribute .. autofunction:: cuda.bindings.runtime.cudaMemRangeGetAttributes .. autofunction:: cuda.bindings.runtime.make_cudaPitchedPtr @@ -258,6 +258,9 @@ Whether or not a device supports the integrated stream ordered memory allocator .. autofunction:: cuda.bindings.runtime.cudaMemPoolGetAccess .. autofunction:: cuda.bindings.runtime.cudaMemPoolCreate .. autofunction:: cuda.bindings.runtime.cudaMemPoolDestroy +.. autofunction:: cuda.bindings.runtime.cudaMemGetDefaultMemPool +.. autofunction:: cuda.bindings.runtime.cudaMemGetMemPool +.. autofunction:: cuda.bindings.runtime.cudaMemSetMemPool .. autofunction:: cuda.bindings.runtime.cudaMallocFromPoolAsync .. autofunction:: cuda.bindings.runtime.cudaMemPoolExportToShareableHandle .. autofunction:: cuda.bindings.runtime.cudaMemPoolImportFromShareableHandle @@ -480,6 +483,18 @@ Version Management .. autofunction:: cuda.bindings.runtime.cudaRuntimeGetVersion .. autofunction:: cuda.bindings.runtime.getLocalRuntimeVersion +Error Log Management Functions +------------------------------ + +This section describes the error log management functions of the CUDA runtime application programming interface. The Error Log Management interface will operate on both the CUDA Driver and CUDA Runtime. + +.. autoclass:: cuda.bindings.runtime.cudaLogsCallback_t +.. autofunction:: cuda.bindings.runtime.cudaLogsRegisterCallback +.. autofunction:: cuda.bindings.runtime.cudaLogsUnregisterCallback +.. autofunction:: cuda.bindings.runtime.cudaLogsCurrent +.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToFile +.. autofunction:: cuda.bindings.runtime.cudaLogsDumpToMemory + Graph Management ---------------- @@ -531,15 +546,10 @@ This section describes the graph management functions of CUDA runtime applicatio .. autofunction:: cuda.bindings.runtime.cudaGraphGetNodes .. autofunction:: cuda.bindings.runtime.cudaGraphGetRootNodes .. autofunction:: cuda.bindings.runtime.cudaGraphGetEdges -.. autofunction:: cuda.bindings.runtime.cudaGraphGetEdges_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependencies_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependentNodes -.. autofunction:: cuda.bindings.runtime.cudaGraphNodeGetDependentNodes_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphAddDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphAddDependencies_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphRemoveDependencies -.. autofunction:: cuda.bindings.runtime.cudaGraphRemoveDependencies_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphDestroyNode .. 
autofunction:: cuda.bindings.runtime.cudaGraphInstantiate .. autofunction:: cuda.bindings.runtime.cudaGraphInstantiateWithFlags @@ -569,7 +579,6 @@ This section describes the graph management functions of CUDA runtime applicatio .. autofunction:: cuda.bindings.runtime.cudaGraphRetainUserObject .. autofunction:: cuda.bindings.runtime.cudaGraphReleaseUserObject .. autofunction:: cuda.bindings.runtime.cudaGraphAddNode -.. autofunction:: cuda.bindings.runtime.cudaGraphAddNode_v2 .. autofunction:: cuda.bindings.runtime.cudaGraphNodeSetParams .. autofunction:: cuda.bindings.runtime.cudaGraphExecNodeSetParams .. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate @@ -1984,6 +1993,12 @@ Data types used by CUDA Runtime This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNvlinkEncryptionFailed + + + This indicates that an NVLink encryption error was detected during the execution. + + .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSource @@ -2059,7 +2074,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchTimeout - This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device property :py:obj:`~.kernelExecTimeoutEnabled` for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. + This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute :py:obj:`~.cudaDevAttrKernelExecTimeout` for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorLaunchIncompatibleTexturing @@ -2155,7 +2170,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCooperativeLaunchTooLarge - This error indicates that the number of blocks launched per grid for a kernel that was launched via either :py:obj:`~.cudaLaunchCooperativeKernel` or :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` exceeds the maximum number of blocks as allowed by :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` or :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors as specified by the device attribute :py:obj:`~.cudaDevAttrMultiProcessorCount`. + This error indicates that the number of blocks launched per grid for a kernel that was launched via :py:obj:`~.cudaLaunchCooperativeKernel` exceeds the maximum number of blocks as allowed by :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` or :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` times the number of multiprocessors as specified by the device attribute :py:obj:`~.cudaDevAttrMultiProcessorCount`. ..
autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTensorMemoryLeak @@ -3894,10 +3909,7 @@ Data types used by CUDA Runtime Device supports launching cooperative kernels via :py:obj:`~.cudaLaunchCooperativeKernel` - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrCooperativeMultiDeviceLaunch - - - Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved96 .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlockOptin @@ -3972,12 +3984,6 @@ Data types used by CUDA Runtime External timeline semaphore interop is supported on the device - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTimelineSemaphoreInteropSupported - - - Deprecated, External timeline semaphore interop is supported on the device - - .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported @@ -4116,6 +4122,21 @@ Data types used by CUDA Runtime Device supports HostNuma location IPC between nodes in a multi-node system. + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported + + + Device supports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs + + + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved145 + + + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrOnlyPartialHostNativeAtomicSupported + + + Link between the device and the host supports only some native atomic operations + + .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMax .. autoclass:: cuda.bindings.runtime.cudaMemPoolAttr @@ -4172,6 +4193,12 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvalid + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeNone + + + Location is unspecified. This is used when creating a managed memory pool to indicate no preferred location for the pool + + .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeDevice @@ -4225,6 +4252,12 @@ Data types used by CUDA Runtime This allocation type is 'pinned', i.e. cannot migrate from its current location while the application is actively using it + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeManaged + + + This allocation type is managed memory + + .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationType.cudaMemAllocationTypeMax .. autoclass:: cuda.bindings.runtime.cudaMemAllocationHandleType @@ -4362,6 +4395,74 @@ Data types used by CUDA Runtime Accessing CUDA arrays over the link supported + + .. autoattribute:: cuda.bindings.runtime.cudaDeviceP2PAttr.cudaDevP2PAttrOnlyPartialNativeAtomicSupported + + + Only some CUDA-valid atomic operations over the link are supported. + +.. autoclass:: cuda.bindings.runtime.cudaAtomicOperation + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerAdd + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMin + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerMax + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerIncrement + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationIntegerDecrement + + + ..
autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationAnd + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationOr + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationXOR + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationExchange + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationCAS + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatAdd + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMin + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperation.cudaAtomicOperationFloatMax + +.. autoclass:: cuda.bindings.runtime.cudaAtomicOperationCapability + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilitySigned + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityUnsigned + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityReduction + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar32 + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar64 + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityScalar128 + + + .. autoattribute:: cuda.bindings.runtime.cudaAtomicOperationCapability.cudaAtomicCapabilityVector32x4 + .. autoclass:: cuda.bindings.runtime.cudaExternalMemoryHandleType .. autoattribute:: cuda.bindings.runtime.cudaExternalMemoryHandleType.cudaExternalMemoryHandleTypeOpaqueFd @@ -4699,10 +4800,10 @@ Data types used by CUDA Runtime Scope represented by a grid_group - .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeMultiGrid + .. autoattribute:: cuda.bindings.runtime.cudaCGScope.cudaCGScopeReserved - Scope represented by a multi_grid_group + Reserved .. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags @@ -5247,6 +5348,20 @@ Data types used by CUDA Runtime Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage between 0-100 sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch. + + .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling + + + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + + + + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (e.g., CUDA doesn't guarantee that performance characteristics will be maintained between driver versions, and a driver update could alter or regress previously observed performance characteristics).
It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + + Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). + .. autoclass:: cuda.bindings.runtime.cudaDeviceNumaConfig .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNone @@ -5267,6 +5382,13 @@ Data types used by CUDA Runtime Sent when the process has exceeded its device memory budget +.. autoclass:: cuda.bindings.runtime.cudaLogLevel + + .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelError + + + .. autoattribute:: cuda.bindings.runtime.cudaLogLevel.cudaLogLevelWarning + .. autoclass:: cuda.bindings.runtime.cudaSurfaceBoundaryMode .. autoattribute:: cuda.bindings.runtime.cudaSurfaceBoundaryMode.cudaBoundaryModeZero @@ -5387,6 +5509,8 @@ Data types used by CUDA Runtime .. autoclass:: cuda.bindings.runtime.cudaAsyncCallbackHandle_t .. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationInfo_t .. autoclass:: cuda.bindings.runtime.cudaAsyncCallback +.. autoclass:: cuda.bindings.runtime.cudaLogsCallbackHandle +.. autoclass:: cuda.bindings.runtime.cudaLogIterator .. autoclass:: cuda.bindings.runtime.cudaSurfaceObject_t .. autoclass:: cuda.bindings.runtime.cudaTextureObject_t .. autoattribute:: cuda.bindings.runtime.CUDA_EGL_MAX_PLANES @@ -5605,14 +5729,6 @@ Data types used by CUDA Runtime Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice call -.. autoattribute:: cuda.bindings.runtime.cudaCooperativeLaunchMultiDeviceNoPreSync - - If set, each kernel launched as part of :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` only waits for prior work in the stream corresponding to that GPU to complete before the kernel begins execution. - -.. autoattribute:: cuda.bindings.runtime.cudaCooperativeLaunchMultiDeviceNoPostSync - - If set, any subsequent work pushed in a stream that participated in a call to :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice` will only wait for the kernel launched on the GPU corresponding to that stream to complete before it begins execution. - .. autoattribute:: cuda.bindings.runtime.cudaArraySparsePropertiesSingleMipTail Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers @@ -5679,6 +5795,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeMemSyncDomain .. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributePreferredSharedMemoryCarveout .. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeDeviceUpdatableKernelNode +.. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttributeNvlinkUtilCentricScheduling .. autoattribute:: cuda.bindings.runtime.cudaKernelNodeAttrValue .. autoattribute:: cuda.bindings.runtime.cudaSurfaceType1D .. autoattribute:: cuda.bindings.runtime.cudaSurfaceType2D diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 2fa3ff0f66..5d7c5ab596 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -11,7 +11,7 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from cuda import cuda +from cuda.bindings import driver as cuda clock_nvrtc = """\ extern "C" __global__ void timedReduction(const float *hinput, float *output, clock_t *timer) diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index c3cf369a14..fa45892e56 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -13,7 +13,8 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart simpleCubemapTexture = """\ extern "C" diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index 5689db6107..0c667e036f 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -13,7 +13,8 @@ from common import common from common.helper_cuda import checkCudaErrors -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart simplep2p = """\ extern "C" diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index 4db0020292..3aa24bf3a3 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -16,7 +16,8 @@ from common.helper_cuda import checkCudaErrors from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart simpleZeroCopy = """\ extern "C" diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index 64ae4d390f..398be2959a 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -13,7 +13,8 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart systemWideAtomics = """\ #define LOOP_NUM 50 @@ -182,7 +183,8 @@ def main(): print("Unified Memory not supported on this device") return - if device_prop.computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: + computeMode = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id)) + if computeMode == cudart.cudaComputeMode.cudaComputeModeProhibited: # This sample requires being run with a default or process exclusive mode print("This sample requires a device in either default or process exclusive mode") return diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 81f589f0e5..1d422ffc12 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -13,7 +13,7 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV -from cuda import cuda +from cuda.bindings import driver as cuda vectorAddDrv = """\ /* Vector addition: C = A + B. @@ -44,7 +44,7 @@ def main(): checkCudaErrors(cuda.cuInit(0)) cuDevice = findCudaDeviceDRV() # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) + cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) uvaSupported = checkCudaErrors( cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 3230b50714..38ca6d7ffc 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -14,7 +14,7 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV -from cuda import cuda +from cuda.bindings import driver as cuda vectorAddMMAP = """\ /* Vector addition: C = A + B. 
@@ -239,7 +239,7 @@ def main(): backingDevices = getBackingDevices(cuDevice) # Create context - cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) + cuContext = checkCudaErrors(cuda.cuCtxCreate(None, 0, cuDevice)) kernelHelper = common.KernelHelper(vectorAddMMAP, int(cuDevice)) _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index 4cba3ab078..c856c4c455 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -16,7 +16,8 @@ from common.helper_cuda import checkCudaErrors, findCudaDevice from common.helper_string import checkCmdLineFlag -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart streamOrderedAllocation = """\ /* Add two vectors on the GPU */ diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index b973d01814..c8a3dab2bb 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -17,7 +17,8 @@ from common.helper_cuda import checkCudaErrors, findCudaDevice from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart blockSize = 16 diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index ee83436321..c0aff43cec 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -13,7 +13,8 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 4a6fafb768..6589ac074c 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -15,7 +15,8 @@ from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart conjugateGradientMultiBlockCG = """\ #line __LINE__ diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index ec55c1ac58..c2e9893ed2 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -10,7 +10,9 @@ import numpy as np from common.helper_cuda import checkCudaErrors -from cuda import cuda, cudart, nvrtc +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart +from cuda.bindings import nvrtc class KernelHelper: diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index 6cc4026dd0..2540d39c6c 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -7,7 +7,9 @@ # is strictly prohibited. from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt -from cuda import cuda, cudart, nvrtc +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart +from cuda.bindings import nvrtc def _cudaGetErrorEnum(error): diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index 7677047a32..c6465a8fb9 100644 --- a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 01e5f57144..b48908cd26 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -11,7 +11,8 @@ from common import common from common.helper_cuda import checkCudaErrors -from cuda import cuda, cudart +from cuda.bindings import driver as cuda +from cuda.bindings import runtime as cudart isoPropagator = """\ extern "C" @@ -243,7 +244,7 @@ def __init__(self, params, _dev): checkCudaErrors(cuda.cuInit(0)) self.cuDevice = checkCudaErrors(cuda.cuDeviceGet(_dev)) - self.context = checkCudaErrors(cuda.cuCtxCreate(0, self.cuDevice)) + self.context = checkCudaErrors(cuda.cuCtxCreate(None, 0, self.cuDevice)) self.waveOut = 0 self.waveIn = 0 self.streamCenter = checkCudaErrors(cuda.cuStreamCreate(0)) diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index 18835ec9d2..ebb0d8daf5 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -9,7 +9,8 @@ import numpy as np -from cuda import cuda, nvrtc +from cuda.bindings import driver as cuda +from cuda.bindings import nvrtc def ASSERT_DRV(err): @@ -45,7 +46,7 @@ def main(): ASSERT_DRV(err) # Ctx - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) # Create program diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index 45015ada42..d44d7ed7b3 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # # Please refer to the NVIDIA end user license agreement (EULA) associated # with this source code for terms and conditions that govern your use of @@ -54,7 +54,7 @@ from ctypes import c_size_t -from numba import cuda +from cuda.bindings import driver as cuda from numba.cuda import ( GetIpcHandleMixin, HostOnlyCUDAMemoryManager, @@ -62,7 +62,7 @@ MemoryPointer, ) -from cuda import cuda as cuda_driver +from cuda.bindings import driver as cuda_driver # Python functions for allocation, deallocation, and memory info via the NVIDIA # CUDA Python Driver API diff --git a/cuda_bindings/tests/cython/test_ccuda.pyx b/cuda_bindings/tests/cython/test_ccuda.pyx index 49990dbfc1..9a2327161b 100644 --- a/cuda_bindings/tests/cython/test_ccuda.pyx +++ b/cuda_bindings/tests/cython/test_ccuda.pyx @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. 
All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # distutils: language=c++ @@ -6,9 +6,7 @@ from libc.string cimport ( memset, memcmp ) -# TODO: update to new module once the old ones are removed, we use the -# tests to cover backward compatibility. -cimport cuda.ccuda as ccuda +cimport cuda.bindings.cydriver as ccuda def test_ccuda_memcpy(): # Init CUDA @@ -22,7 +20,7 @@ def test_ccuda_memcpy(): # Construct context cdef ccuda.CUcontext ctx - err = ccuda.cuCtxCreate(&ctx, 0, device) + err = ccuda.cuCtxCreate(&ctx, NULL, 0, device) assert(err == 0) # Allocate dev memory diff --git a/cuda_bindings/tests/cython/test_ccudart.pyx b/cuda_bindings/tests/cython/test_ccudart.pyx index b0267a0d47..94cdb29768 100644 --- a/cuda_bindings/tests/cython/test_ccudart.pyx +++ b/cuda_bindings/tests/cython/test_ccudart.pyx @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # distutils: language=c++ @@ -6,9 +6,7 @@ from libc.string cimport ( memset, memcmp ) -# TODO: update to new module once the old ones are removed, we use the -# tests to cover backward compatibility. -cimport cuda.ccudart as ccudart +cimport cuda.bindings.cyruntime as ccudart def test_ccudart_memcpy(): # Allocate dev memory @@ -38,9 +36,9 @@ def test_ccudart_memcpy(): err = ccudart.cudaFree(dptr) assert(err == ccudart.cudaSuccess) -from cuda.ccudart cimport dim3 -from cuda.ccudart cimport cudaMemAllocationHandleType -from cuda.ccudart cimport CUuuid, cudaUUID_t +from cuda.bindings.cyruntime cimport dim3 +from cuda.bindings.cyruntime cimport cudaMemAllocationHandleType +from cuda.bindings.cyruntime cimport CUuuid, cudaUUID_t cdef extern from *: """ diff --git a/cuda_bindings/tests/cython/test_interoperability_cython.pyx b/cuda_bindings/tests/cython/test_interoperability_cython.pyx index 38ea372ed8..47ed5c8f89 100644 --- a/cuda_bindings/tests/cython/test_interoperability_cython.pyx +++ b/cuda_bindings/tests/cython/test_interoperability_cython.pyx @@ -1,17 +1,15 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # distutils: language=c++ from libc.stdlib cimport calloc, free -import cuda.cuda as cuda -import cuda.cudart as cudart +import cuda.bindings.driver as cuda +import cuda.bindings.runtime as cudart import numpy as np import pytest -# TODO: update to new module once the old ones are removed, we use the -# tests to cover backward compatibility. 
-cimport cuda.ccuda as ccuda -cimport cuda.ccudart as ccudart +cimport cuda.bindings.cydriver as ccuda +cimport cuda.bindings.cyruntime as ccudart def supportsMemoryPool(): @@ -24,7 +22,7 @@ def test_interop_stream(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # DRV to RT @@ -52,7 +50,7 @@ def test_interop_event(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # DRV to RT @@ -80,7 +78,7 @@ def test_interop_graph(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # DRV to RT @@ -108,7 +106,7 @@ def test_interop_graphNode(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # DRV to RT @@ -146,7 +144,7 @@ def test_interop_memPool(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # DRV to RT @@ -175,7 +173,7 @@ def test_interop_graphExec(): assert(err_dr == cuda.CUresult.CUDA_SUCCESS) err_dr, device = cuda.cuDeviceGet(0) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert(err_dr == cuda.CUresult.CUDA_SUCCESS) cdef ccuda.CUgraph* graph_dr = <ccuda.CUgraph*>calloc(1, sizeof(ccuda.CUgraph)) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 3ad670311e..485c300e86 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved.
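# The import migration in these tests replaces the deprecated top-level
# aliases (cuda.cuda, cuda.cudart, cuda.nvrtc) with the cuda.bindings
# subpackage. A short sketch of the new spelling used across the series:

import cuda.bindings.driver as cuda     # was: import cuda.cuda as cuda
import cuda.bindings.runtime as cudart  # was: import cuda.cudart as cudart
from cuda.bindings import nvrtc         # was: from cuda import nvrtc

err, major, minor = nvrtc.nvrtcVersion()
err, version = cudart.cudaRuntimeGetVersion()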
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import platform @@ -9,8 +9,8 @@ import pytest from conftest import skipif_testing_with_compute_sanitizer -import cuda.cuda as cuda -import cuda.cudart as cudart +import cuda.bindings.driver as cuda +import cuda.bindings.runtime as cudart def driverVersionLessThan(target): @@ -49,7 +49,7 @@ def test_cuda_memcpy(): assert err == cuda.CUresult.CUDA_SUCCESS # Construct context - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS # Allocate dev memory @@ -92,7 +92,7 @@ def test_cuda_array(): err, arr = cuda.cuArrayCreate(desc) assert err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT or err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS # Description not filled @@ -121,7 +121,7 @@ def test_cuda_repr_primitive(): assert str(device) == "<CUdevice 0>" assert int(device) == 0 - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS assert str(ctx).startswith("<CUcontext 0x") assert int(ctx) > 0 @@ -187,7 +187,7 @@ def test_cuda_repr_pointer(): assert err == cuda.CUresult.CUDA_SUCCESS # Test 1: Classes representing pointers - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS assert str(ctx).startswith("<CUcontext 0x") assert int(ctx) > 0 @@ -214,7 +214,7 @@ def test_cuda_uuid_list_access(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, uuid = cuda.cuDeviceGetUuid(device) @@ -238,9 +238,9 @@ def test_cuda_uuid_list_access(): def test_cuda_cuModuleLoadDataEx(): (err,) = cuda.cuInit(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, dev = cuda.cuDeviceGet(0) + err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, dev) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS option_keys = [ @@ -330,7 +330,7 @@ def test_cuda_memPool_attr(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS poolProps = cuda.CUmemPoolProps() @@ -401,7 +401,7 @@ def test_cuda_pointer_attr(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value) assert err == cuda.CUresult.CUDA_SUCCESS @@ -458,23 +458,31 @@ def test_cuda_mem_range_attr(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS + size = 0x1000 + location_device = cuda.CUmemLocation() + location_device.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + location_device.id = int(device) + location_cpu = cuda.CUmemLocation() + location_cpu.type =
cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST + location_cpu.id = int(cuda.CU_DEVICE_CPU) + err, ptr = cuda.cuMemAllocManaged(size, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value) assert err == cuda.CUresult.CUDA_SUCCESS - (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, device) + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, location_device) assert err == cuda.CUresult.CUDA_SUCCESS - (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, cuda.CU_DEVICE_CPU) + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, location_cpu) assert err == cuda.CUresult.CUDA_SUCCESS - (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, cuda.CU_DEVICE_CPU) + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, location_cpu) assert err == cuda.CUresult.CUDA_SUCCESS err, concurrentSupported = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device ) assert err == cuda.CUresult.CUDA_SUCCESS if concurrentSupported: - (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, device) + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, location_device) assert err == cuda.CUresult.CUDA_SUCCESS expected_values_list = ([1, -1, [0, -1, -2], -2],) else: @@ -522,7 +530,7 @@ def test_cuda_graphMem_attr(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, stream = cuda.cuStreamCreate(0) @@ -587,7 +595,7 @@ def test_cuda_coredump_attr(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS attr_list = [None] * 6 @@ -628,7 +636,7 @@ def test_get_error_name_and_string(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) @@ -655,7 +663,7 @@ def test_device_get_name(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS p = subprocess.check_output( @@ -688,7 +696,7 @@ def test_profiler(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS (err,) = cuda.cuProfilerStart() assert err == cuda.CUresult.CUDA_SUCCESS @@ -777,7 +785,7 @@ def test_graph_poly(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, stream = cuda.cuStreamCreate(0) assert err == cuda.CUresult.CUDA_SUCCESS @@ -811,7 +819,7 @@ def 
test_graph_poly(): memsetParams.memset.height = 1 memsetParams.memset.dst = device memsetParams.memset.value = 1 - err, node = cuda.cuGraphAddNode(graph, None, 0, memsetParams) + err, node = cuda.cuGraphAddNode(graph, None, None, 0, memsetParams) assert err == cuda.CUresult.CUDA_SUCCESS nodes += [node] @@ -826,7 +834,7 @@ def test_graph_poly(): memcpyParams.memcpy.copyParams.WidthInBytes = size memcpyParams.memcpy.copyParams.Height = 1 memcpyParams.memcpy.copyParams.Depth = 1 - err, node = cuda.cuGraphAddNode(graph, None, 0, memcpyParams) + err, node = cuda.cuGraphAddNode(graph, None, None, 0, memcpyParams) assert err == cuda.CUresult.CUDA_SUCCESS nodes += [node] @@ -895,7 +903,7 @@ def test_cuDeviceGetDevResource(): err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS err, resource_in = cuda.cuDeviceGetDevResource(device, cuda.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM) - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, res, count, rem = cuda.cuDevSmResourceSplitByCount(0, resource_in, 0, 2) @@ -923,7 +931,7 @@ def test_conditional(): assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) assert err == cuda.CUresult.CUDA_SUCCESS - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS err, graph = cuda.cuGraphCreate(0) @@ -940,7 +948,7 @@ def test_conditional(): assert len(params.conditional.phGraph_out) == 1 assert int(params.conditional.phGraph_out[0]) == 0 - err, node = cuda.cuGraphAddNode(graph, None, 0, params) + err, node = cuda.cuGraphAddNode(graph, None, None, 0, params) assert err == cuda.CUresult.CUDA_SUCCESS assert len(params.conditional.phGraph_out) == 1 diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index be5f217b9a..38091fce64 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -1,4 +1,4 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
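# Two driver-API changes appear in the hunks above. cuMemAdvise now takes a
# CUmemLocation instead of a bare device ordinal, and cuGraphAddNode gained an
# argument between the dependency list and the count (None in these tests),
# which appears to carry optional per-dependency data. A minimal sketch of the
# new cuMemAdvise call, assuming a live context plus a managed allocation
# `ptr` of `size` bytes as in test_cuda_mem_range_attr:

location = cuda.CUmemLocation()
location.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
location.id = int(device)
(err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, location)
assert err == cuda.CUresult.CUDA_SUCCESS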
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import ctypes @@ -8,8 +8,8 @@ import pytest from conftest import skipif_testing_with_compute_sanitizer -import cuda.cuda as cuda -import cuda.cudart as cudart +import cuda.bindings.driver as cuda +import cuda.bindings.runtime as cudart def isSuccess(err): @@ -293,86 +293,98 @@ def test_cudart_cudaGetDeviceProperties(): err, prop = cudart.cudaGetDeviceProperties(0) assertSuccess(err) attrs = [ - "accessPolicyMaxWindowSize", - "asyncEngineCount", - "canMapHostMemory", - "canUseHostPointerForRegisteredMem", - "clockRate", - "computeMode", - "computePreemptionSupported", - "concurrentKernels", - "concurrentManagedAccess", - "cooperativeLaunch", - "cooperativeMultiDeviceLaunch", - "deviceOverlap", - "directManagedMemAccessFromHost", - "getPtr", - "globalL1CacheSupported", - "hostNativeAtomicSupported", - "integrated", - "isMultiGpuBoard", - "kernelExecTimeoutEnabled", - "l2CacheSize", - "localL1CacheSupported", + "name", + "uuid", "luid", "luidDeviceNodeMask", - "major", - "managedMemory", - "maxBlocksPerMultiProcessor", + "totalGlobalMem", + "sharedMemPerBlock", + "regsPerBlock", + "warpSize", + "memPitch", + "maxThreadsPerBlock", + "maxThreadsDim", "maxGridSize", - "maxSurface1D", - "maxSurface1DLayered", - "maxSurface2D", - "maxSurface2DLayered", - "maxSurface3D", - "maxSurfaceCubemap", - "maxSurfaceCubemapLayered", + "totalConstMem", + "major", + "minor", + "textureAlignment", + "texturePitchAlignment", + "multiProcessorCount", + "integrated", + "canMapHostMemory", "maxTexture1D", - "maxTexture1DLayered", - "maxTexture1DLinear", "maxTexture1DMipmap", "maxTexture2D", - "maxTexture2DGather", - "maxTexture2DLayered", - "maxTexture2DLinear", "maxTexture2DMipmap", + "maxTexture2DLinear", + "maxTexture2DGather", "maxTexture3D", "maxTexture3DAlt", "maxTextureCubemap", + "maxTexture1DLayered", + "maxTexture2DLayered", "maxTextureCubemapLayered", - "maxThreadsDim", - "maxThreadsPerBlock", - "maxThreadsPerMultiProcessor", - "memPitch", - "memoryBusWidth", - "memoryClockRate", - "minor", - "multiGpuBoardGroupID", - "multiProcessorCount", - "name", - "pageableMemoryAccess", - "pageableMemoryAccessUsesHostPageTables", + "maxSurface1D", + "maxSurface2D", + "maxSurface3D", + "maxSurface1DLayered", + "maxSurface2DLayered", + "maxSurfaceCubemap", + "maxSurfaceCubemapLayered", + "surfaceAlignment", + "concurrentKernels", + "ECCEnabled", "pciBusID", "pciDeviceID", "pciDomainID", + "tccDriver", + "asyncEngineCount", + "unifiedAddressing", + "memoryBusWidth", + "l2CacheSize", "persistingL2CacheMaxSize", - "regsPerBlock", + "maxThreadsPerMultiProcessor", + "streamPrioritiesSupported", + "globalL1CacheSupported", + "localL1CacheSupported", + "sharedMemPerMultiprocessor", "regsPerMultiprocessor", - "reservedSharedMemPerBlock", - "sharedMemPerBlock", + "managedMemory", + "isMultiGpuBoard", + "multiGpuBoardGroupID", + "hostNativeAtomicSupported", + "pageableMemoryAccess", + "concurrentManagedAccess", + "computePreemptionSupported", + "canUseHostPointerForRegisteredMem", + "cooperativeLaunch", "sharedMemPerBlockOptin", - "sharedMemPerMultiprocessor", - "singleToDoublePrecisionPerfRatio", - "streamPrioritiesSupported", - "surfaceAlignment", - "tccDriver", - "textureAlignment", - "texturePitchAlignment", - "totalConstMem", - "totalGlobalMem", - "unifiedAddressing", - "uuid", - "warpSize", + "pageableMemoryAccessUsesHostPageTables", + "directManagedMemAccessFromHost", + "maxBlocksPerMultiProcessor", + "accessPolicyMaxWindowSize", + 
"reservedSharedMemPerBlock", + "hostRegisterSupported", + "sparseCudaArraySupported", + "hostRegisterReadOnlySupported", + "timelineSemaphoreInteropSupported", + "memoryPoolsSupported", + "gpuDirectRDMASupported", + "gpuDirectRDMAFlushWritesOptions", + "gpuDirectRDMAWritesOrdering", + "memoryPoolSupportedHandleTypes", + "deferredMappingCudaArraySupported", + "ipcEventSupported", + "clusterLaunch", + "unifiedFunctionPointers", + "deviceNumaConfig", + "deviceNumaId", + "mpsEnabled", + "hostNumaId", + "gpuPciDeviceID", + "gpuPciSubsystemID", + "hostNumaMultinodeIpcSupported", ] for attr in attrs: assert hasattr(prop, attr) @@ -1363,7 +1375,7 @@ def test_cudart_conditional(): assert len(params.conditional.phGraph_out) == 1 assert int(params.conditional.phGraph_out[0]) == 0 - err, node = cudart.cudaGraphAddNode(graph, None, 0, params) + err, node = cudart.cudaGraphAddNode(graph, None, None, 0, params) assertSuccess(err) assert len(params.conditional.phGraph_out) == 1 diff --git a/cuda_bindings/tests/test_interoperability.py b/cuda_bindings/tests/test_interoperability.py index 9105273b65..d518db5edd 100644 --- a/cuda_bindings/tests/test_interoperability.py +++ b/cuda_bindings/tests/test_interoperability.py @@ -1,11 +1,11 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import numpy as np import pytest -import cuda.cuda as cuda -import cuda.cudart as cudart +import cuda.bindings.driver as cuda +import cuda.bindings.runtime as cudart def supportsMemoryPool(): @@ -18,7 +18,7 @@ def test_interop_stream(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT @@ -42,7 +42,7 @@ def test_interop_event(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT @@ -66,7 +66,7 @@ def test_interop_graph(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT @@ -90,7 +90,7 @@ def test_interop_graphNode(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, graph = cuda.cuGraphCreate(0) @@ -119,7 +119,7 @@ def test_interop_userObject(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS # cudaUserObject_t @@ -134,7 +134,7 @@ def test_interop_function(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == 
cuda.CUresult.CUDA_SUCCESS # cudaFunction_t @@ -150,7 +150,7 @@ def test_interop_memPool(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT @@ -174,7 +174,7 @@ def test_interop_graphExec(): assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, ctx = cuda.cuCtxCreate(0, device) + err_dr, ctx = cuda.cuCtxCreate(None, 0, device) assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, graph = cuda.cuGraphCreate(0) assert err_dr == cuda.CUresult.CUDA_SUCCESS @@ -209,7 +209,7 @@ def test_interop_deviceptr(): assert err == cuda.CUresult.CUDA_SUCCESS # Construct context - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) assert err == cuda.CUresult.CUDA_SUCCESS # Allocate dev memory diff --git a/cuda_bindings/tests/test_kernelParams.py b/cuda_bindings/tests/test_kernelParams.py index 8d17aba0fc..3551af11c1 100644 --- a/cuda_bindings/tests/test_kernelParams.py +++ b/cuda_bindings/tests/test_kernelParams.py @@ -1,11 +1,13 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import ctypes import numpy as np -from cuda import cuda, cudart, nvrtc +import cuda.bindings.driver as cuda +import cuda.bindings.runtime as cudart +import cuda.bindings.nvrtc as nvrtc def ASSERT_DRV(err): @@ -72,7 +74,7 @@ def test_kernelParams_empty(): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) kernelString = """\ @@ -147,7 +149,7 @@ def kernelParams_basic(use_ctypes_as_values): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) if use_ctypes_as_values: @@ -437,7 +439,7 @@ def test_kernelParams_types_cuda(): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) err, uvaSupported = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice @@ -567,7 +569,7 @@ def test_kernelParams_struct_custom(): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) err, uvaSupported = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice @@ -646,7 +648,7 @@ def kernelParams_buffer_protocol_ctypes_common(pass_by_address): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) ASSERT_DRV(err) err, uvaSupported = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice @@ -758,7 +760,7 @@ def test_kernelParams_buffer_protocol_numpy(): ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, context = cuda.cuCtxCreate(0, cuDevice) + err, context = cuda.cuCtxCreate(None, 0, cuDevice) 
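# The interoperability tests above depend on driver and runtime handle types
# staying interchangeable after the module split (their bodies are elided in
# these hunks). A hedged sketch of the pattern they exercise, assuming a
# current context created as shown:

err, stream = cuda.cuStreamCreate(0)              # driver-API CUstream
assert err == cuda.CUresult.CUDA_SUCCESS
(err_rt,) = cudart.cudaStreamSynchronize(stream)  # accepted as cudaStream_t
assert err_rt == cudart.cudaError_t.cudaSuccess
(err,) = cuda.cuStreamDestroy(stream)
assert err == cuda.CUresult.CUDA_SUCCESS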
ASSERT_DRV(err) err, uvaSupported = cuda.cuDeviceGetAttribute( cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice diff --git a/cuda_bindings/tests/test_nvrtc.py b/cuda_bindings/tests/test_nvrtc.py index 682e55f397..3160b212e5 100644 --- a/cuda_bindings/tests/test_nvrtc.py +++ b/cuda_bindings/tests/test_nvrtc.py @@ -1,9 +1,9 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda import nvrtc +from cuda.bindings import nvrtc def ASSERT_DRV(err): From 6413babbfcabec64a0943ad46e49387309af7fb7 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Tue, 27 May 2025 14:56:32 -0700 Subject: [PATCH 03/65] Update license headers on examples --- cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py | 8 ++------ .../examples/0_Introduction/simpleCubemapTexture_test.py | 8 ++------ cuda_bindings/examples/0_Introduction/simpleP2P_test.py | 8 ++------ .../examples/0_Introduction/simpleZeroCopy_test.py | 8 ++------ .../examples/0_Introduction/systemWideAtomics_test.py | 8 ++------ .../examples/0_Introduction/vectorAddDrv_test.py | 8 ++------ .../examples/0_Introduction/vectorAddMMAP_test.py | 8 ++------ .../streamOrderedAllocation_test.py | 8 ++------ .../3_CUDA_Features/globalToShmemAsyncCopy_test.py | 8 ++------ .../examples/3_CUDA_Features/simpleCudaGraphs_test.py | 8 ++------ .../conjugateGradientMultiBlockCG_test.py | 8 ++------ cuda_bindings/examples/common/common.py | 8 ++------ cuda_bindings/examples/common/helper_cuda.py | 8 ++------ cuda_bindings/examples/common/helper_string.py | 8 ++------ cuda_bindings/examples/extra/isoFDModelling_test.py | 8 ++------ cuda_bindings/examples/extra/jit_program_test.py | 8 ++------ cuda_bindings/examples/extra/numba_emm_plugin.py | 7 +------ 17 files changed, 33 insertions(+), 102 deletions(-) diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 5d7c5ab596..4d3e557a39 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import platform import numpy as np diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index fa45892e56..fae5cb6ad8 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import sys import time diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index 0c667e036f..0f83370288 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import platform import sys diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index 3aa24bf3a3..db045be677 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import platform diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index 398be2959a..8ce984826f 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import os import sys diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 1d422ffc12..05e580999a 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import sys diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 38ca6d7ffc..4679dde38c 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import platform diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index c856c4c455..7eb7a0b977 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import platform diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index c8a3dab2bb..df4443c075 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import platform diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index c0aff43cec..ecb8e84e66 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import random as rnd diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 6589ac074c..8c2a0bc340 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import math import platform diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index c2e9893ed2..cf5db2d563 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import os import numpy as np diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index 2540d39c6c..acdc31d2d1 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt from cuda.bindings import driver as cuda diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index c6465a8fb9..9f8e70a6c4 100644 --- a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import sys diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index b48908cd26..f0b149a32b 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import time import numpy as np diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index ebb0d8daf5..eccbd86a67 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -1,10 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + import ctypes import numpy as np diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index d44d7ed7b3..3ca7d594f4 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -1,10 +1,5 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE """Numba EMM Plugin using the CUDA Python Driver API. 
From 177338894a506c68686065bb5398a28fa7dda535 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 28 May 2025 11:57:18 -0700 Subject: [PATCH 04/65] Regenerate after merging upstream --- cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index 5e49b0270c..bc722795db 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -5,6 +5,7 @@ {{if 'Windows' == platform.system()}} import os import win32api +from pywintypes import error {{else}} cimport cuda.bindings._lib.dlfcn as dlfcn {{endif}} From af48e185227c0c8bc9fad40f9fe4de022fec2107 Mon Sep 17 00:00:00 2001 From: Vladislav Zhurba Date: Wed, 28 May 2025 14:28:49 -0700 Subject: [PATCH 05/65] Update benchmarks and run pre-commit --- cuda_bindings/benchmarks/conftest.py | 6 ++++-- cuda_bindings/benchmarks/test_launch_latency.py | 2 +- cuda_bindings/benchmarks/test_pointer_attributes.py | 2 +- cuda_bindings/examples/common/common.py | 2 +- cuda_bindings/examples/common/helper_cuda.py | 2 +- cuda_bindings/examples/extra/numba_emm_plugin.py | 2 +- cuda_bindings/tests/test_kernelParams.py | 2 +- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cuda_bindings/benchmarks/conftest.py b/cuda_bindings/benchmarks/conftest.py index 9715011663..2d75268e55 100644 --- a/cuda_bindings/benchmarks/conftest.py +++ b/cuda_bindings/benchmarks/conftest.py @@ -4,7 +4,9 @@ import numpy as np import pytest -from cuda import cuda, cudart, nvrtc +from cuda.bindings import driver as cuda +from cuda.bindings import nvrtc +from cuda.bindings import runtime as cudart def ASSERT_DRV(err): @@ -28,7 +30,7 @@ def init_cuda(): ASSERT_DRV(err) err, device = cuda.cuDeviceGet(0) ASSERT_DRV(err) - err, ctx = cuda.cuCtxCreate(0, device) + err, ctx = cuda.cuCtxCreate(None, 0, device) ASSERT_DRV(err) # create stream diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py index e79542c04a..1f01ab5714 100755 --- a/cuda_bindings/benchmarks/test_launch_latency.py +++ b/cuda_bindings/benchmarks/test_launch_latency.py @@ -7,7 +7,7 @@ from conftest import ASSERT_DRV from kernels import kernel_string -from cuda import cuda +from cuda.bindings import driver as cuda def launch(kernel, stream, args=(), arg_types=()): diff --git a/cuda_bindings/benchmarks/test_pointer_attributes.py b/cuda_bindings/benchmarks/test_pointer_attributes.py index ecaaa8db59..2d02cd907f 100644 --- a/cuda_bindings/benchmarks/test_pointer_attributes.py +++ b/cuda_bindings/benchmarks/test_pointer_attributes.py @@ -6,7 +6,7 @@ import pytest from conftest import ASSERT_DRV -from cuda import cuda +from cuda.bindings import driver as cuda random.seed(0) diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index cf5db2d563..34150d8f68 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -7,8 +7,8 @@ from common.helper_cuda import checkCudaErrors from cuda.bindings import driver as cuda -from cuda.bindings import runtime as cudart from cuda.bindings import nvrtc +from cuda.bindings import runtime as cudart class KernelHelper: diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index acdc31d2d1..d741eb54d9 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ 
b/cuda_bindings/examples/common/helper_cuda.py @@ -4,8 +4,8 @@ from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt from cuda.bindings import driver as cuda -from cuda.bindings import runtime as cudart from cuda.bindings import nvrtc +from cuda.bindings import runtime as cudart def _cudaGetErrorEnum(error): diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index 3ca7d594f4..dcbf541321 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -49,7 +49,6 @@ from ctypes import c_size_t -from cuda.bindings import driver as cuda from numba.cuda import ( GetIpcHandleMixin, HostOnlyCUDAMemoryManager, @@ -57,6 +56,7 @@ MemoryPointer, ) +from cuda.bindings import driver as cuda from cuda.bindings import driver as cuda_driver # Python functions for allocation, deallocation, and memory info via the NVIDIA diff --git a/cuda_bindings/tests/test_kernelParams.py b/cuda_bindings/tests/test_kernelParams.py index 3551af11c1..b39ce9c560 100644 --- a/cuda_bindings/tests/test_kernelParams.py +++ b/cuda_bindings/tests/test_kernelParams.py @@ -6,8 +6,8 @@ import numpy as np import cuda.bindings.driver as cuda -import cuda.bindings.runtime as cudart import cuda.bindings.nvrtc as nvrtc +import cuda.bindings.runtime as cudart def ASSERT_DRV(err): From 6509f3c14765a3d333a728a7671a942c3d8727a5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 29 May 2025 15:27:02 -0700 Subject: [PATCH 06/65] cython-gen output, NO manual changes --- cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in | 2 +- cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in | 2 +- cuda_bindings/cuda/bindings/cydriver.pxd.in | 2 +- cuda_bindings/cuda/bindings/cydriver.pyx.in | 2 +- cuda_bindings/cuda/bindings/cynvrtc.pxd.in | 2 +- cuda_bindings/cuda/bindings/cynvrtc.pyx.in | 2 +- cuda_bindings/cuda/bindings/cyruntime.pxd.in | 2 +- cuda_bindings/cuda/bindings/cyruntime.pyx.in | 2 +- cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in | 2 +- cuda_bindings/cuda/bindings/cyruntime_types.pxi.in | 2 +- cuda_bindings/cuda/bindings/driver.pxd.in | 2 +- cuda_bindings/cuda/bindings/driver.pyx.in | 2 +- cuda_bindings/cuda/bindings/nvrtc.pxd.in | 2 +- cuda_bindings/cuda/bindings/nvrtc.pyx.in | 2 +- cuda_bindings/cuda/bindings/runtime.pxd.in | 2 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index f4d2c10d61..b35680d5b7 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. 
from cuda.bindings.cydriver cimport * {{if 'cuGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index bc722795db..31a9e493e2 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. {{if 'Windows' == platform.system()}} import os import win32api diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index 87ad37ef4d..10d7d0a584 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from cuda.bindings.cynvrtc cimport * {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 0f6f452a80..9af9baad26 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. {{if 'Windows' == platform.system()}} import os import win32api diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in index dad92bdac0..2fcd438e2d 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. include "../cyruntime_types.pxi" {{if 'cudaDeviceReset' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in index d9a79cb4e4..5185e693d3 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. 
include "../cyruntime_functions.pxi" import os diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in index bcd21181de..cc25c75c82 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in index 80c784c558..86a8b0683c 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cdef extern from "": """ #define CUDA_API_PER_THREAD_DEFAULT_STREAM diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index afc7379d83..52664211ce 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in index 8fced4ff22..0e2cca4ae1 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings._bindings.cydriver as cydriver {{if 'cuGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index c1b5dfd9c1..f8408d607f 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. 
from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index e53a96ebd2..a6cbe8ec8e 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings._bindings.cynvrtc as cynvrtc {{if 'nvrtcGetErrorString' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index 751875cce0..90978e641c 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from libc.stdint cimport uint32_t, uint64_t diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index a4c4d9c198..dc047b2fb5 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings._bindings.cyruntime as cyruntime cimport cuda.bindings._lib.cyruntime.cyruntime as custom_cyruntime cimport cython diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index ddd29a120c..b466e849a8 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cdef extern from "cuda_runtime_api.h": {{if 'cudaDeviceReset' in found_functions}} diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in index 79544de150..6ed0a2be92 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. 
cdef extern from "vector_types.h": diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in index 6709d5ea51..bf583f1386 100644 --- a/cuda_bindings/cuda/bindings/driver.pxd.in +++ b/cuda_bindings/cuda/bindings/driver.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings.cydriver as cydriver cimport cuda.bindings._lib.utils as utils diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index b3440ec891..b4761bc413 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from typing import List, Tuple, Any, Optional from enum import IntEnum import cython diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd.in b/cuda_bindings/cuda/bindings/nvrtc.pxd.in index 07d2af3437..9802990413 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings.cynvrtc as cynvrtc cimport cuda.bindings._lib.utils as utils diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 8ba5424599..609b852883 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from typing import List, Tuple, Any, Optional from enum import IntEnum import cython diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in index 45a1505693..50fc13840f 100644 --- a/cuda_bindings/cuda/bindings/runtime.pxd.in +++ b/cuda_bindings/cuda/bindings/runtime.pxd.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. cimport cuda.bindings.cyruntime as cyruntime cimport cuda.bindings._lib.utils as utils cimport cuda.bindings.driver as driver diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 5f0d9d06a2..04a9564691 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -1,7 +1,7 @@ # Copyright 2021-2025 NVIDIA Corporation. 
All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from typing import List, Tuple, Any, Optional from enum import IntEnum import cython From 3d335535b2e645720be33951355da771c6ed9293 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 29 May 2025 11:09:39 -0700 Subject: [PATCH 07/65] Bump cuda/bindings/_version.py → 13.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cuda_bindings/cuda/bindings/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_bindings/cuda/bindings/_version.py b/cuda_bindings/cuda/bindings/_version.py index 645a0bf9c2..6c37445867 100644 --- a/cuda_bindings/cuda/bindings/_version.py +++ b/cuda_bindings/cuda/bindings/_version.py @@ -1,4 +1,4 @@ # Copyright 2024-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -__version__ = "12.9.0" +__version__ = "13.0.0" From e20923848a92df8dfa842b96794b28f4bccfec44 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 4 Jun 2025 15:31:27 -0700 Subject: [PATCH 08/65] `path_finder` and cybind updates for CTK 13.0 (#81) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update SUPPORTED_WINDOWS_DLLS: kitpicks/cuda-r13-0/13.0.0/013/local_installers/cuda_13.0.0_windows.exe * Update SUPPORTED_LINUX_SONAMES: kitpicks/cuda-r13-0/13.0.0/013/local_installers/cuda_13.0.0_580.31_linux.run * 013 → 014: SUPPORTED_LINUX_SONAMES unchanged * 013 → 014: SUPPORTED_WINDOWS_DLLS unchanged * cybind update with 13.0.0 headers (014) * Bump cuda/bindings/_version.py → 13.0.0 * test_nvjitlink.py: remove sm_60, add sm_100 * Updates from cybind after removing all 11.x headers (affects "automatically generated" comments only). * Add new toolshed/reformat_cuda_enums_as_py.py (reads cuda.h, driver_types.h headers directly). * Use new toolshed/reformat_cuda_enums_as_py.py to regenerate driver_cu_result_explanations.py, runtime_cuda_error_explanations.py * Use `driver.cuDeviceGetUuid()` instead of `driver.cuDeviceGetUuid_v2()` with CTK 13 (see the first sketch below). * Adjustments for locating nvvm directory in CTK 13 installations (see the second sketch below).
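A minimal sketch of the resulting UUID version gate (illustration only, not part of this patch; `device_uuid` is a hypothetical helper, the tuple-returning cuda.bindings.driver API is assumed, and error handling is omitted):

    from cuda.bindings import driver

    def device_uuid(dev):  # hypothetical helper, not in this patch
        # cuDeviceGetUuid_v2 is only the right entry point for drivers in
        # [11.4, 13.0); CTK 13 folds the v2 behavior into cuDeviceGetUuid.
        err, ver = driver.cuDriverGetVersion()
        if 11040 <= ver < 13000:
            err, uuid = driver.cuDeviceGetUuid_v2(dev)
        else:
            err, uuid = driver.cuDeviceGetUuid(dev)
        return uuid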
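Likewise, a sketch of the CUDA_HOME-based nvvm directory probing after this change (`candidate_nvvm_dirs` is a hypothetical name; the authoritative search order is in the find_nvidia_dynamic_library.py hunk below):

    import glob
    import os

    def candidate_nvvm_dirs(cuda_home):  # hypothetical helper, not in this patch
        patterns = (
            os.path.join(cuda_home, "nvvm", "bin", "*"),  # CTK 13 layout (Windows)
            os.path.join(cuda_home, "nvvm", "bin"),       # CTK 12 layout (Windows)
            os.path.join(cuda_home, "nvvm", "lib64"),     # CTK layout (Linux)
        )
        # Candidates in priority order; the caller takes the first hit.
        return [d for p in patterns for d in sorted(glob.glob(p)) if os.path.isdir(d)]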
--- .../cuda/bindings/_internal/nvjitlink.pxd | 2 +- .../bindings/_internal/nvjitlink_linux.pyx | 2 +- .../bindings/_internal/nvjitlink_windows.pyx | 2 +- .../cuda/bindings/_internal/nvvm.pxd | 2 +- .../cuda/bindings/_internal/nvvm_linux.pyx | 2 +- .../cuda/bindings/_internal/nvvm_windows.pyx | 2 +- .../find_nvidia_dynamic_library.py | 41 +- .../bindings/_path_finder/supported_libs.py | 58 +- cuda_bindings/cuda/bindings/cynvjitlink.pxd | 11 +- cuda_bindings/cuda/bindings/cynvjitlink.pyx | 2 +- cuda_bindings/cuda/bindings/cynvvm.pxd | 2 +- cuda_bindings/cuda/bindings/cynvvm.pyx | 2 +- cuda_bindings/cuda/bindings/nvjitlink.pxd | 2 +- cuda_bindings/cuda/bindings/nvjitlink.pyx | 11 +- cuda_bindings/cuda/bindings/nvvm.pxd | 2 +- cuda_bindings/cuda/bindings/nvvm.pyx | 2 +- cuda_bindings/tests/test_nvjitlink.py | 4 +- cuda_core/cuda/core/experimental/_device.py | 2 +- .../_utils/driver_cu_result_explanations.py | 406 +++++------ .../_utils/runtime_cuda_error_explanations.py | 643 ++++++++++-------- cuda_core/tests/test_device.py | 2 +- toolshed/reformat_cuda_enums_as_py.py | 111 +++ .../reformat_cuda_enums_from_web_as_py.py | 51 -- 23 files changed, 788 insertions(+), 576 deletions(-) create mode 100755 toolshed/reformat_cuda_enums_as_py.py delete mode 100755 toolshed/reformat_cuda_enums_from_web_as_py.py diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd index c3e5ce6073..5142e839a6 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from ..cynvjitlink cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index 36bdcb4f45..683bc0916a 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx index 0020fe4864..32c3fb3778 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. 
from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd index 1a8569488d..d6c278f73d 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from ..cynvvm cimport * diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx index 8759096a4c..81f55dadda 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from libc.stdint cimport intptr_t, uintptr_t diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx index 0a7eae320e..7b8a4d7d19 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. 
from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py index 9835b72d0e..a513c04b73 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py +++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py @@ -5,7 +5,7 @@ import glob import os -from cuda.bindings._path_finder.find_sub_dirs import find_sub_dirs_all_sitepackages +from cuda.bindings._path_finder.find_sub_dirs import find_sub_dirs, find_sub_dirs_all_sitepackages from cuda.bindings._path_finder.supported_libs import IS_WINDOWS, is_suppressed_dll_file @@ -44,11 +44,14 @@ def _find_dll_under_dir(dirpath, file_wild): def _find_dll_using_nvidia_bin_dirs(libname, lib_searched_for, error_messages, attachments): - nvidia_sub_dirs = ("nvidia", "*", "nvvm", "bin") if libname == "nvvm" else ("nvidia", "*", "bin") - for bin_dir in find_sub_dirs_all_sitepackages(nvidia_sub_dirs): - dll_name = _find_dll_under_dir(bin_dir, lib_searched_for) - if dll_name is not None: - return dll_name + nvidia_sub_dirs_list = [("nvidia", "*", "bin")] + if libname == "nvvm": + nvidia_sub_dirs_list.append(("nvidia", "*", "nvvm", "bin")) # Only for CTK 12 + for nvidia_sub_dirs in nvidia_sub_dirs_list: + for bin_dir in find_sub_dirs_all_sitepackages(nvidia_sub_dirs): + dll_name = _find_dll_under_dir(bin_dir, lib_searched_for) + if dll_name is not None: + return dll_name _no_such_file_in_sub_dirs(nvidia_sub_dirs, lib_searched_for, error_messages, attachments) return None @@ -65,19 +68,23 @@ def _find_lib_dir_using_cuda_home(libname): if cuda_home is None: return None if IS_WINDOWS: - subdirs = (os.path.join("nvvm", "bin"),) if libname == "nvvm" else ("bin",) + if libname == "nvvm": # noqa: SIM108 + subdirs_list = ( + ("nvvm", "bin", "*"), # CTK 13 + ("nvvm", "bin"), # CTK 12 + ) + else: + subdirs_list = (("bin",),) else: - subdirs = ( - (os.path.join("nvvm", "lib64"),) - if libname == "nvvm" - else ( - "lib64", # CTK - "lib", # Conda + if libname == "nvvm": # noqa: SIM108 + subdirs_list = (("nvvm", "lib64"),) + else: + subdirs_list = ( + ("lib64",), # CTK + ("lib",), # Conda ) - ) - for subdir in subdirs: - dirname = os.path.join(cuda_home, subdir) - if os.path.isdir(dirname): + for sub_dirs in subdirs_list: + for dirname in find_sub_dirs((cuda_home,), sub_dirs): return dirname return None diff --git a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py index 63bde282f5..cf9d5ae75b 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py +++ b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py @@ -112,6 +112,8 @@ # cuda_12.6.2_560.35.03_linux.run # cuda_12.8.0_570.86.10_linux.run # cuda_12.9.0_575.51.03_linux.run +# 014 +# TODO: Update from posted .run files before merging into public main. 
# Generated with toolshed/build_path_finder_sonames.py SUPPORTED_LINUX_SONAMES = { "cublas": ( @@ -125,14 +127,17 @@ "cudart": ( "libcudart.so.11.0", "libcudart.so.12", + "libcudart.so.13", ), "cufft": ( "libcufft.so.10", "libcufft.so.11", + "libcufft.so.12", ), "cufftw": ( "libcufftw.so.10", "libcufftw.so.11", + "libcufftw.so.12", ), "cufile": ("libcufile.so.0",), # "cufile_rdma": ("libcufile_rdma.so.1",), @@ -152,62 +157,81 @@ "nppc": ( "libnppc.so.11", "libnppc.so.12", + "libnppc.so.13", ), "nppial": ( "libnppial.so.11", "libnppial.so.12", + "libnppial.so.13", ), "nppicc": ( "libnppicc.so.11", "libnppicc.so.12", + "libnppicc.so.13", ), "nppidei": ( "libnppidei.so.11", "libnppidei.so.12", + "libnppidei.so.13", ), "nppif": ( "libnppif.so.11", "libnppif.so.12", + "libnppif.so.13", ), "nppig": ( "libnppig.so.11", "libnppig.so.12", + "libnppig.so.13", ), "nppim": ( "libnppim.so.11", "libnppim.so.12", + "libnppim.so.13", ), "nppist": ( "libnppist.so.11", "libnppist.so.12", + "libnppist.so.13", ), "nppisu": ( "libnppisu.so.11", "libnppisu.so.12", + "libnppisu.so.13", ), "nppitc": ( "libnppitc.so.11", "libnppitc.so.12", + "libnppitc.so.13", ), "npps": ( "libnpps.so.11", "libnpps.so.12", + "libnpps.so.13", + ), + "nvJitLink": ( + "libnvJitLink.so.12", + "libnvJitLink.so.13", ), - "nvJitLink": ("libnvJitLink.so.12",), "nvblas": ( "libnvblas.so.11", "libnvblas.so.12", ), - "nvfatbin": ("libnvfatbin.so.12",), + "nvfatbin": ( + "libnvfatbin.so.12", + "libnvfatbin.so.13", + ), "nvjpeg": ( "libnvjpeg.so.11", "libnvjpeg.so.12", + "libnvjpeg.so.13", ), "nvrtc": ( "libnvrtc.so.11.0", "libnvrtc.so.11.1", "libnvrtc.so.11.2", "libnvrtc.so.12", + "libnvrtc.so.13", ), "nvvm": ( "libnvvm.so.3", @@ -234,6 +258,8 @@ # cuda_12.6.2_560.94_windows.exe # cuda_12.8.1_572.61_windows.exe # cuda_12.9.0_576.02_windows.txt +# 014 +# TODO: Update from posted .run files before merging into public main. 
# Generated with toolshed/build_path_finder_dlls.py (WITH MANUAL EDITS) SUPPORTED_WINDOWS_DLLS = { "cublas": ( @@ -251,15 +277,18 @@ "cudart64_101.dll", "cudart64_110.dll", "cudart64_12.dll", + "cudart64_13.dll", "cudart64_65.dll", ), "cufft": ( "cufft64_10.dll", "cufft64_11.dll", + "cufft64_12.dll", ), "cufftw": ( "cufftw64_10.dll", "cufftw64_11.dll", + "cufftw64_12.dll", ), "curand": ("curand64_10.dll",), "cusolver": ( @@ -277,62 +306,83 @@ "nppc": ( "nppc64_11.dll", "nppc64_12.dll", + "nppc64_13.dll", ), "nppial": ( "nppial64_11.dll", "nppial64_12.dll", + "nppial64_13.dll", ), "nppicc": ( "nppicc64_11.dll", "nppicc64_12.dll", + "nppicc64_13.dll", ), "nppidei": ( "nppidei64_11.dll", "nppidei64_12.dll", + "nppidei64_13.dll", ), "nppif": ( "nppif64_11.dll", "nppif64_12.dll", + "nppif64_13.dll", ), "nppig": ( "nppig64_11.dll", "nppig64_12.dll", + "nppig64_13.dll", ), "nppim": ( "nppim64_11.dll", "nppim64_12.dll", + "nppim64_13.dll", ), "nppist": ( "nppist64_11.dll", "nppist64_12.dll", + "nppist64_13.dll", ), "nppisu": ( "nppisu64_11.dll", "nppisu64_12.dll", + "nppisu64_13.dll", ), "nppitc": ( "nppitc64_11.dll", "nppitc64_12.dll", + "nppitc64_13.dll", ), "npps": ( "npps64_11.dll", "npps64_12.dll", + "npps64_13.dll", + ), + "nvJitLink": ( + "nvJitLink_120_0.dll", + "nvJitLink_130_0.dll", ), - "nvJitLink": ("nvJitLink_120_0.dll",), "nvblas": ( "nvblas64_11.dll", "nvblas64_12.dll", ), - "nvfatbin": ("nvfatbin_120_0.dll",), + "nvfatbin": ( + "nvfatbin_120_0.dll", + "nvfatbin_130_0.dll", + ), "nvjpeg": ( "nvjpeg64_11.dll", "nvjpeg64_12.dll", + "nvjpeg64_13.dll", ), "nvrtc": ( "nvrtc64_110_0.dll", "nvrtc64_111_0.dll", "nvrtc64_112_0.dll", + "nvrtc64_120_0.alt.dll", "nvrtc64_120_0.dll", + "nvrtc64_130_0.alt.dll", + "nvrtc64_130_0.dll", ), "nvvm": ( "nvvm32.dll", diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd index b9a16eb9bf..9db667ee5d 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uint32_t @@ -23,6 +23,15 @@ ctypedef enum nvJitLinkResult "nvJitLinkResult": NVJITLINK_ERROR_THREADPOOL "NVJITLINK_ERROR_THREADPOOL" NVJITLINK_ERROR_UNRECOGNIZED_INPUT "NVJITLINK_ERROR_UNRECOGNIZED_INPUT" NVJITLINK_ERROR_FINALIZE "NVJITLINK_ERROR_FINALIZE" + NVJITLINK_ERROR_NULL_INPUT "NVJITLINK_ERROR_NULL_INPUT" + NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS "NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS" + NVJITLINK_ERROR_INCORRECT_INPUT_TYPE "NVJITLINK_ERROR_INCORRECT_INPUT_TYPE" + NVJITLINK_ERROR_ARCH_MISMATCH "NVJITLINK_ERROR_ARCH_MISMATCH" + NVJITLINK_ERROR_OUTDATED_LIBRARY "NVJITLINK_ERROR_OUTDATED_LIBRARY" + NVJITLINK_ERROR_MISSING_FATBIN "NVJITLINK_ERROR_MISSING_FATBIN" + NVJITLINK_ERROR_UNRECOGNIZED_ARCH "NVJITLINK_ERROR_UNRECOGNIZED_ARCH" + NVJITLINK_ERROR_UNSUPPORTED_ARCH "NVJITLINK_ERROR_UNSUPPORTED_ARCH" + NVJITLINK_ERROR_LTO_NOT_ENABLED "NVJITLINK_ERROR_LTO_NOT_ENABLED" _NVJITLINKRESULT_INTERNAL_LOADING_ERROR "_NVJITLINKRESULT_INTERNAL_LOADING_ERROR" = -42 ctypedef enum nvJitLinkInputType "nvJitLinkInputType": diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx index 4706eb13d4..d693e87e02 100644 --- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from ._internal cimport nvjitlink as _nvjitlink diff --git a/cuda_bindings/cuda/bindings/cynvvm.pxd b/cuda_bindings/cuda/bindings/cynvvm.pxd index cea1ce10fc..21078f7260 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pxd +++ b/cuda_bindings/cuda/bindings/cynvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. ############################################################################### diff --git a/cuda_bindings/cuda/bindings/cynvvm.pyx b/cuda_bindings/cuda/bindings/cynvvm.pyx index 5c1a818bb7..e53634002a 100644 --- a/cuda_bindings/cuda/bindings/cynvvm.pyx +++ b/cuda_bindings/cuda/bindings/cynvvm.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from ._internal cimport nvvm as _nvvm diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd index e97fa1541d..584c24ef5f 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. 
from libc.stdint cimport intptr_t, uint32_t diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx index f874138e09..31723ced76 100644 --- a/cuda_bindings/cuda/bindings/nvjitlink.pyx +++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 12.0.1 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. cimport cython # NOQA @@ -29,6 +29,15 @@ class Result(_IntEnum): ERROR_THREADPOOL = NVJITLINK_ERROR_THREADPOOL ERROR_UNRECOGNIZED_INPUT = NVJITLINK_ERROR_UNRECOGNIZED_INPUT ERROR_FINALIZE = NVJITLINK_ERROR_FINALIZE + ERROR_NULL_INPUT = NVJITLINK_ERROR_NULL_INPUT + ERROR_INCOMPATIBLE_OPTIONS = NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS + ERROR_INCORRECT_INPUT_TYPE = NVJITLINK_ERROR_INCORRECT_INPUT_TYPE + ERROR_ARCH_MISMATCH = NVJITLINK_ERROR_ARCH_MISMATCH + ERROR_OUTDATED_LIBRARY = NVJITLINK_ERROR_OUTDATED_LIBRARY + ERROR_MISSING_FATBIN = NVJITLINK_ERROR_MISSING_FATBIN + ERROR_UNRECOGNIZED_ARCH = NVJITLINK_ERROR_UNRECOGNIZED_ARCH + ERROR_UNSUPPORTED_ARCH = NVJITLINK_ERROR_UNSUPPORTED_ARCH + ERROR_LTO_NOT_ENABLED = NVJITLINK_ERROR_LTO_NOT_ENABLED class InputType(_IntEnum): """See `nvJitLinkInputType`.""" diff --git a/cuda_bindings/cuda/bindings/nvvm.pxd b/cuda_bindings/cuda/bindings/nvvm.pxd index f8564f86ad..1cfef86ab5 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pxd +++ b/cuda_bindings/cuda/bindings/nvvm.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx index 0a35150e59..1b7d6827a3 100644 --- a/cuda_bindings/cuda/bindings/nvvm.pyx +++ b/cuda_bindings/cuda/bindings/nvvm.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # -# This code was automatically generated across versions from 11.0.3 to 12.9.0. Do not modify it directly. +# This code was automatically generated across versions from 12.0.1 to 13.0.0. Do not modify it directly. 
cimport cython # NOQA diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py index 5c6ca98ea7..943b4111f3 100644 --- a/cuda_bindings/tests/test_nvjitlink.py +++ b/cuda_bindings/tests/test_nvjitlink.py @@ -6,8 +6,8 @@ from cuda.bindings import nvjitlink, nvrtc # Establish a handful of compatible architectures and PTX versions to test with -ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] -PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] +ARCHITECTURES = ["sm_75", "sm_80", "sm_90", "sm_100"] +PTX_VERSIONS = ["6.4", "7.0", "8.5", "8.8"] PTX_HEADER = """\ diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 55afb0ebab..de14ddbd32 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -1021,7 +1021,7 @@ def uuid(self) -> str: """ driver_ver = handle_return(driver.cuDriverGetVersion()) - if driver_ver >= 11040: + if 11040 <= driver_ver < 13000: uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id)) else: uuid = handle_return(driver.cuDeviceGetUuid(self._id)) diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py index 29772653dc..676a63965d 100644 --- a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py @@ -1,291 +1,314 @@ # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# To regenerate the dictionary below, navigate to: -# https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES -# (Chrome was used before, but probably it works with other browsers, too.) -# Search for: -# enum CUresult -# With the mouse, select the entire region with the enum definitions: -# CUDA_SUCCESS = 0 -# ... -# CUDA_ERROR_UNKNOWN = 999 -# This indicates that an unknown internal error has occurred. -# Paste into a file, e.g. raw.txt -# python ../../../../../toolshed/reformat_cuda_enums_from_web_as_py.py raw.txt > raw.py -# ruff format raw.py -# Copy raw.py into this file (discarding the `DATA = {`, `}` lines). +# To regenerate the dictionary below run: +# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/cuda.h +# Replace the dictionary below with the output. # Also update the CUDA Toolkit version number below. -# Done. -# CUDA Toolkit v12.9.0 +# ruff: noqa: E501 +# CUDA Toolkit v13.0.0 DRIVER_CU_RESULT_EXPLANATIONS = { 0: ( - "The API call returned with no errors. In the case of query calls, this also means that the operation" - " being queried is complete (see cuEventQuery() and cuStreamQuery())." + "The API call returned with no errors. In the case of query calls, this" + " also means that the operation being queried is complete (see" + " ::cuEventQuery() and ::cuStreamQuery())." ), 1: ( - "This indicates that one or more of the parameters passed to the API call is not within an acceptable" - " range of values." + "This indicates that one or more of the parameters passed to the API call" + " is not within an acceptable range of values." ), 2: ( - "The API call failed because it was unable to allocate enough memory or other resources to perform " - "the requested operation." + "The API call failed because it was unable to allocate enough memory or" + " other resources to perform the requested operation." 
), 3: ( - "This indicates that the CUDA driver has not been initialized with cuInit() or that initialization has failed." + "This indicates that the CUDA driver has not been initialized with" + " ::cuInit() or that initialization has failed." ), 4: "This indicates that the CUDA driver is in the process of shutting down.", 5: ( - "This indicates profiler is not initialized for this run. This can happen when the application is " - "running with external profiling tools like visual profiler." + "This indicates profiler is not initialized for this run. This can" + " happen when the application is running with external profiling tools" + " like visual profiler." ), 6: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to attempt to " - "enable/disable the profiling via cuProfilerStart or cuProfilerStop without initialization." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to attempt to enable/disable the profiling via ::cuProfilerStart or" + " ::cuProfilerStop without initialization." ), 7: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStart() " - "when profiling is already enabled." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to call cuProfilerStart() when profiling is already enabled." ), 8: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStop() " - "when profiling is already disabled." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to call cuProfilerStop() when profiling is already disabled." ), 34: ( - "This indicates that the CUDA driver that the application has loaded is a stub library. Applications " - "that run with the stub rather than a real driver loaded will result in CUDA API returning this " - "error." + "This indicates that the CUDA driver that the application has loaded is a" + " stub library. Applications that run with the stub rather than a real" + " driver loaded will result in CUDA API returning this error." + ), + 36: ( + "This indicates that the API call requires a newer CUDA driver than the one" + " currently installed. Users should install an updated NVIDIA CUDA driver" + " to allow the API call to succeed." ), 46: ( - "This indicates that requested CUDA device is unavailable at the current time. Devices are often " - "unavailable due to use of CU_COMPUTEMODE_EXCLUSIVE_PROCESS or CU_COMPUTEMODE_PROHIBITED." + "This indicates that requested CUDA device is unavailable at the current" + " time. Devices are often unavailable due to use of" + " ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED." ), - 100: "This indicates that no CUDA-capable devices were detected by the installed CUDA driver.", + 100: ("This indicates that no CUDA-capable devices were detected by the installed CUDA driver."), 101: ( - "This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA " - "device or that the action requested is invalid for the specified device." + "This indicates that the device ordinal supplied by the user does not" + " correspond to a valid CUDA device or that the action requested is" + " invalid for the specified device." ), 102: "This error indicates that the Grid license is not applied.", 200: ("This indicates that the device kernel image is invalid. This can also indicate an invalid CUDA module."), 201: ( - "This most frequently indicates that there is no context bound to the current thread. 
This can also " - "be returned if the context passed to an API call is not a valid handle (such as a context that has " - "had cuCtxDestroy() invoked on it). This can also be returned if a user mixes different API versions " - "(i.e. 3010 context with 3020 API calls). See cuCtxGetApiVersion() for more details. This can also be" - " returned if the green context passed to an API call was not converted to a CUcontext using " - "cuCtxFromGreenCtx API." + "This most frequently indicates that there is no context bound to the" + " current thread. This can also be returned if the context passed to an" + " API call is not a valid handle (such as a context that has had" + " ::cuCtxDestroy() invoked on it). This can also be returned if a user" + " mixes different API versions (i.e. 3010 context with 3020 API calls)." + " See ::cuCtxGetApiVersion() for more details." + " This can also be returned if the green context passed to an API call" + " was not converted to a ::CUcontext using ::cuCtxFromGreenCtx API." ), 202: ( - "This error return is deprecated as of CUDA 3.2. It is no longer an error to attempt to push the " - "active context via cuCtxPushCurrent(). This indicated that the context being supplied as a parameter" - " to the API call was already the active context." + "This indicated that the context being supplied as a parameter to the" + " API call was already the active context." + " This error return is deprecated as of CUDA 3.2. It is no longer an" + " error to attempt to push the active context via ::cuCtxPushCurrent()." ), 205: "This indicates that a map or register operation has failed.", 206: "This indicates that an unmap or unregister operation has failed.", - 207: "This indicates that the specified array is currently mapped and thus cannot be destroyed.", + 207: ("This indicates that the specified array is currently mapped and thus cannot be destroyed."), 208: "This indicates that the resource is already mapped.", 209: ( - "This indicates that there is no kernel image available that is suitable for the device. This can " - "occur when a user specifies code generation options for a particular CUDA source file that do not " - "include the corresponding device configuration." + "This indicates that there is no kernel image available that is suitable" + " for the device. This can occur when a user specifies code generation" + " options for a particular CUDA source file that do not include the" + " corresponding device configuration." ), 210: "This indicates that a resource has already been acquired.", 211: "This indicates that a resource is not mapped.", - 212: "This indicates that a mapped resource is not available for access as an array.", - 213: "This indicates that a mapped resource is not available for access as a pointer.", - 214: "This indicates that an uncorrectable ECC error was detected during execution.", - 215: "This indicates that the CUlimit passed to the API call is not supported by the active device.", + 212: ("This indicates that a mapped resource is not available for access as an array."), + 213: ("This indicates that a mapped resource is not available for access as a pointer."), + 214: ("This indicates that an uncorrectable ECC error was detected during execution."), + 215: ("This indicates that the ::CUlimit passed to the API call is not supported by the active device."), 216: ( - "This indicates that the CUcontext passed to the API call can only be bound to a single CPU thread at" - " a time but is already bound to a CPU thread." 
+ "This indicates that the ::CUcontext passed to the API call can" + " only be bound to a single CPU thread at a time but is already" + " bound to a CPU thread." ), - 217: "This indicates that peer access is not supported across the given devices.", + 217: ("This indicates that peer access is not supported across the given devices."), 218: "This indicates that a PTX JIT compilation failed.", 219: "This indicates an error with OpenGL or DirectX context.", - 220: "This indicates that an uncorrectable NVLink error was detected during the execution.", + 220: ("This indicates that an uncorrectable NVLink error was detected during the execution."), 221: "This indicates that the PTX JIT compiler library was not found.", 222: "This indicates that the provided PTX was compiled with an unsupported toolchain.", 223: "This indicates that the PTX JIT compilation was disabled.", - 224: ("This indicates that the CUexecAffinityType passed to the API call is not supported by the active device."), + 224: ("This indicates that the ::CUexecAffinityType passed to the API call is not supported by the active device."), 225: ( "This indicates that the code to be compiled by the PTX JIT contains unsupported call to cudaDeviceSynchronize." ), 226: ( - "This indicates that an exception occurred on the device that is now contained by the GPU's error " - "containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory " - "over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state " - "and any further CUDA work will return the same error. To continue using CUDA, the process must be " - "terminated and relaunched." + "This indicates that an exception occurred on the device that is now" + " contained by the GPU's error containment capability. Common causes are -" + " a. Certain types of invalid accesses of peer GPU memory over nvlink" + " b. Certain classes of hardware errors" + " This leaves the process in an inconsistent state and any further CUDA" + " work will return the same error. To continue using CUDA, the process must" + " be terminated and relaunched." ), + 227: ("This indicates that an NVLink encryption error was detected during the execution."), 300: ( - "This indicates that the device kernel source is invalid. This includes compilation/linker errors " - "encountered in device code or user error." + "This indicates that the device kernel source is invalid. This includes" + " compilation/linker errors encountered in device code or user error." ), 301: "This indicates that the file specified was not found.", 302: "This indicates that a link to a shared object failed to resolve.", 303: "This indicates that initialization of a shared object failed.", 304: "This indicates that an OS call failed.", 400: ( - "This indicates that a resource handle passed to the API call was not valid. Resource handles are " - "opaque types like CUstream and CUevent." + "This indicates that a resource handle passed to the API call was not" + " valid. Resource handles are opaque types like ::CUstream and ::CUevent." ), 401: ( - "This indicates that a resource required by the API call is not in a valid state to perform the " - "requested operation." + "This indicates that a resource required by the API call is not in a" + " valid state to perform the requested operation." ), 402: ( - "This indicates an attempt was made to introspect an object in a way that would discard semantically " - "important information. 
This is either due to the object using funtionality newer than the API " - "version used to introspect it or omission of optional return arguments." + "This indicates an attempt was made to introspect an object in a way that" + " would discard semantically important information. This is either due to" + " the object using funtionality newer than the API version used to" + " introspect it or omission of optional return arguments." ), 500: ( - "This indicates that a named symbol was not found. Examples of symbols are global/constant variable " - "names, driver function names, texture names, and surface names." + "This indicates that a named symbol was not found. Examples of symbols" + " are global/constant variable names, driver function names, texture names," + " and surface names." ), 600: ( - "This indicates that asynchronous operations issued previously have not completed yet. This result is" - " not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates " - "completion). Calls that may return this value include cuEventQuery() and cuStreamQuery()." + "This indicates that asynchronous operations issued previously have not" + " completed yet. This result is not actually an error, but must be indicated" + " differently than ::CUDA_SUCCESS (which indicates completion). Calls that" + " may return this value include ::cuEventQuery() and ::cuStreamQuery()." ), 700: ( - "While executing a kernel, the device encountered a load or store instruction on an invalid memory " - "address. This leaves the process in an inconsistent state and any further CUDA work will return the " - "same error. To continue using CUDA, the process must be terminated and relaunched." + "While executing a kernel, the device encountered a" + " load or store instruction on an invalid memory address." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 701: ( - "This indicates that a launch did not occur because it did not have appropriate resources. This error" - " usually indicates that the user has attempted to pass too many arguments to the device kernel, or " - "the kernel launch specifies too many threads for the kernel's register count. Passing arguments of " - "the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too " - "many arguments and can also result in this error." + "This indicates that a launch did not occur because it did not have" + " appropriate resources. This error usually indicates that the user has" + " attempted to pass too many arguments to the device kernel, or the" + " kernel launch specifies too many threads for the kernel's register" + " count. Passing arguments of the wrong size (i.e. a 64-bit pointer" + " when a 32-bit int is expected) is equivalent to passing too many" + " arguments and can also result in this error." ), 702: ( - "This indicates that the device kernel took too long to execute. This can only occur if timeouts are " - "enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. " - "This leaves the process in an inconsistent state and any further CUDA work will return the same " - "error. To continue using CUDA, the process must be terminated and relaunched." + "This indicates that the device kernel took too long to execute. 
This can" + " only occur if timeouts are enabled - see the device attribute" + " ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), - 703: "This error indicates a kernel launch that uses an incompatible texturing mode.", + 703: ("This error indicates a kernel launch that uses an incompatible texturing mode."), 704: ( - "This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a " - "context which has already had peer access to it enabled." + "This error indicates that a call to ::cuCtxEnablePeerAccess() is" + " trying to re-enable peer access to a context which has already" + " had peer access to it enabled." ), 705: ( - "This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not " - "been enabled yet via cuCtxEnablePeerAccess()." + "This error indicates that ::cuCtxDisablePeerAccess() is" + " trying to disable peer access which has not been enabled yet" + " via ::cuCtxEnablePeerAccess()." ), - 708: "This error indicates that the primary context for the specified device has already been initialized.", + 708: ("This error indicates that the primary context for the specified device has already been initialized."), 709: ( - "This error indicates that the context current to the calling thread has been destroyed using " - "cuCtxDestroy, or is a primary context which has not yet been initialized." + "This error indicates that the context current to the calling thread" + " has been destroyed using ::cuCtxDestroy, or is a primary context which" + " has not yet been initialized." ), 710: ( - "A device-side assert triggered during kernel execution. The context cannot be used anymore, and must" - " be destroyed. All existing device memory allocations from this context are invalid and must be " - "reconstructed if the program is to continue using CUDA." + "A device-side assert triggered during kernel execution. The context" + " cannot be used anymore, and must be destroyed. All existing device" + " memory allocations from this context are invalid and must be" + " reconstructed if the program is to continue using CUDA." ), 711: ( - "This error indicates that the hardware resources required to enable peer access have been exhausted " - "for one or more of the devices passed to cuCtxEnablePeerAccess()." + "This error indicates that the hardware resources required to enable" + " peer access have been exhausted for one or more of the devices" + " passed to ::cuCtxEnablePeerAccess()." ), - 712: ("This error indicates that the memory range passed to cuMemHostRegister() has already been registered."), + 712: ("This error indicates that the memory range passed to ::cuMemHostRegister() has already been registered."), 713: ( - "This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any " - "currently registered memory region." + "This error indicates that the pointer passed to ::cuMemHostUnregister()" + " does not correspond to any currently registered memory region." ), 714: ( - "While executing a kernel, the device encountered a stack error. This can be due to stack corruption " - "or exceeding the stack size limit. This leaves the process in an inconsistent state and any further " - "CUDA work will return the same error. 
To continue using CUDA, the process must be terminated and " - "relaunched." + "While executing a kernel, the device encountered a stack error." + " This can be due to stack corruption or exceeding the stack size limit." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 715: ( - "While executing a kernel, the device encountered an illegal instruction. This leaves the process in " - "an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, " - "the process must be terminated and relaunched." + "While executing a kernel, the device encountered an illegal instruction." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 716: ( - "While executing a kernel, the device encountered a load or store instruction on a memory address " - "which is not aligned. This leaves the process in an inconsistent state and any further CUDA work " - "will return the same error. To continue using CUDA, the process must be terminated and relaunched." + "While executing a kernel, the device encountered a load or store instruction" + " on a memory address which is not aligned." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 717: ( - "While executing a kernel, the device encountered an instruction which can only operate on memory " - "locations in certain address spaces (global, shared, or local), but was supplied a memory address " - "not belonging to an allowed address space. This leaves the process in an inconsistent state and any " - "further CUDA work will return the same error. To continue using CUDA, the process must be terminated" + "While executing a kernel, the device encountered an instruction" + " which can only operate on memory locations in certain address spaces" + " (global, shared, or local), but was supplied a memory address not" + " belonging to an allowed address space." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" " and relaunched." ), 718: ( - "While executing a kernel, the device program counter wrapped its address space. This leaves the " - "process in an inconsistent state and any further CUDA work will return the same error. To continue " - "using CUDA, the process must be terminated and relaunched." + "While executing a kernel, the device program counter wrapped its address space." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 719: ( - "An exception occurred on the device while executing a kernel. Common causes include dereferencing an" - " invalid device pointer and accessing out of bounds shared memory. Less common cases can be system " - "specific - more information about these cases can be found in the system specific user guide. This " - "leaves the process in an inconsistent state and any further CUDA work will return the same error. To" - " continue using CUDA, the process must be terminated and relaunched." 
+ "An exception occurred on the device while executing a kernel. Common" + " causes include dereferencing an invalid device pointer and accessing" + " out of bounds shared memory. Less common cases can be system specific - more" + " information about these cases can be found in the system specific user guide." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 720: ( - "This error indicates that the number of blocks launched per grid for a kernel that was launched via " - "either cuLaunchCooperativeKernel or cuLaunchCooperativeKernelMultiDevice exceeds the maximum number " - "of blocks as allowed by cuOccupancyMaxActiveBlocksPerMultiprocessor or " - "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors as " - "specified by the device attribute CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT." + "This error indicates that the number of blocks launched per grid for a kernel that was" + " launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice" + " exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor" + " or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors" + " as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT." ), 721: ( - "An exception occurred on the device while exiting a kernel using tensor memory: the tensor memory " - "was not completely deallocated. This leaves the process in an inconsistent state and any further " - "CUDA work will return the same error. To continue using CUDA, the process must be terminated and " - "relaunched." + "An exception occurred on the device while exiting a kernel using tensor memory: the" + " tensor memory was not completely deallocated. This leaves the process in an inconsistent" + " state and any further CUDA work will return the same error. To continue using CUDA, the" + " process must be terminated and relaunched." ), 800: "This error indicates that the attempted operation is not permitted.", - 801: "This error indicates that the attempted operation is not supported on the current system or device.", + 801: ("This error indicates that the attempted operation is not supported on the current system or device."), 802: ( - "This error indicates that the system is not yet ready to start any CUDA work. To continue using " - "CUDA, verify the system configuration is in a valid state and all required driver daemons are " - "actively running. More information about this error can be found in the system specific user guide." + "This error indicates that the system is not yet ready to start any CUDA" + " work. To continue using CUDA, verify the system configuration is in a" + " valid state and all required driver daemons are actively running." + " More information about this error can be found in the system specific" + " user guide." ), 803: ( - "This error indicates that there is a mismatch between the versions of the display driver and the " - "CUDA driver. Refer to the compatibility documentation for supported versions." + "This error indicates that there is a mismatch between the versions of" + " the display driver and the CUDA driver. Refer to the compatibility documentation" + " for supported versions." 
), 804: ( - "This error indicates that the system was upgraded to run with forward compatibility but the visible " - "hardware detected by CUDA does not support this configuration. Refer to the compatibility " - "documentation for the supported hardware matrix or ensure that only supported hardware is visible " - "during initialization via the CUDA_VISIBLE_DEVICES environment variable." - ), - 805: ("This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server."), - 806: ("This error indicates that the remote procedural call between the MPS server and the MPS client failed."), + "This error indicates that the system was upgraded to run with forward compatibility" + " but the visible hardware detected by CUDA does not support this configuration." + " Refer to the compatibility documentation for the supported hardware matrix or ensure" + " that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES" + " environment variable." + ), + 805: "This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.", + 806: "This error indicates that the remote procedural call between the MPS server and the MPS client failed.", 807: ( - "This error indicates that the MPS server is not ready to accept new MPS client requests. This error " - "can be returned when the MPS server is in the process of recovering from a fatal failure." + "This error indicates that the MPS server is not ready to accept new MPS client requests." + " This error can be returned when the MPS server is in the process of recovering from a fatal failure." ), 808: "This error indicates that the hardware resources required to create MPS client have been exhausted.", - 809: ( - "This error indicates the the hardware resources required to support device connections have been exhausted." - ), - 810: ( - "This error indicates that the MPS client has been terminated by the server. To continue using CUDA, " - "the process must be terminated and relaunched." - ), - 811: ( - "This error indicates that the module is using CUDA Dynamic Parallelism, but the current " - "configuration, like MPS, does not support it." - ), - 812: ( - "This error indicates that a module contains an unsupported interaction between different versions of" - " CUDA Dynamic Parallelism." - ), - 900: "This error indicates that the operation is not permitted when the stream is capturing.", + 809: "This error indicates the the hardware resources required to support device connections have been exhausted.", + 810: "This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.", + 811: "This error indicates that the module is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.", + 812: "This error indicates that a module contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.", + 900: ("This error indicates that the operation is not permitted when the stream is capturing."), 901: ( - "This error indicates that the current capture sequence on the stream has been invalidated due to a " - "previous error." + "This error indicates that the current capture sequence on the stream" + " has been invalidated due to a previous error." ), 902: ( "This error indicates that the operation would have resulted in a merge of two independent capture sequences." 
@@ -293,34 +316,37 @@ 903: "This error indicates that the capture was not initiated in this stream.", 904: ("This error indicates that the capture sequence contains a fork that was not joined to the primary stream."), 905: ( - "This error indicates that a dependency would have been created which crosses the capture sequence " - "boundary. Only implicit in-stream ordering dependencies are allowed to cross the boundary." + "This error indicates that a dependency would have been created which" + " crosses the capture sequence boundary. Only implicit in-stream ordering" + " dependencies are allowed to cross the boundary." ), 906: ("This error indicates a disallowed implicit dependency on a current capture sequence from cudaStreamLegacy."), 907: ( - "This error indicates that the operation is not permitted on an event which was last recorded in a " - "capturing stream." + "This error indicates that the operation is not permitted on an event which" + " was last recorded in a capturing stream." ), 908: ( - "A stream capture sequence not initiated with the CU_STREAM_CAPTURE_MODE_RELAXED argument to " - "cuStreamBeginCapture was passed to cuStreamEndCapture in a different thread." + "A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED" + " argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a" + " different thread." ), 909: "This error indicates that the timeout specified for the wait operation has lapsed.", 910: ( - "This error indicates that the graph update was not performed because it included changes which " - "violated constraints specific to instantiated graph update." + "This error indicates that the graph update was not performed because it included" + " changes which violated constraints specific to instantiated graph update." ), 911: ( - "This indicates that an async error has occurred in a device outside of CUDA. If CUDA was waiting for" - " an external device's signal before consuming shared data, the external device signaled an error " - "indicating that the data is not valid for consumption. This leaves the process in an inconsistent " - "state and any further CUDA work will return the same error. To continue using CUDA, the process must" - " be terminated and relaunched." + "This indicates that an async error has occurred in a device outside of CUDA." + " If CUDA was waiting for an external device's signal before consuming shared data," + " the external device signaled an error indicating that the data is not valid for" + " consumption. This leaves the process in an inconsistent state and any further CUDA" + " work will return the same error. To continue using CUDA, the process must be" + " terminated and relaunched." 
    ),
    912: "Indicates a kernel launch error due to cluster misconfiguration.",
-    913: "Indiciates a function handle is not loaded when calling an API that requires a loaded function.",
-    914: "This error indicates one or more resources passed in are not valid resource types for the operation.",
-    915: "This error indicates one or more resources are insufficient or non-applicable for the operation.",
-    916: "This error indicates that an error happened during the key rotation sequence.",
+    913: ("Indicates a function handle is not loaded when calling an API that requires a loaded function."),
+    914: ("This error indicates one or more resources passed in are not valid resource types for the operation."),
+    915: ("This error indicates one or more resources are insufficient or non-applicable for the operation."),
+    916: ("This error indicates that an error happened during the key rotation sequence."),
     999: "This indicates that an unknown internal error has occurred.",
 }
diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py
index 50191f94e4..2bde9b22c1 100644
--- a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py
+++ b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py
@@ -1,489 +1,540 @@
 # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
-# To regenerate the dictionary below, navigate to:
-# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES
-# (Chrome was used before, but probably it works with other browsers, too.)
-# Search for:
-#     enum cudaError
-# With the mouse, select the entire region with the enum definitions:
-#     cudaSuccess = 0
-#     ...
-#     cudaErrorApiFailureBase = 10000
-# Paste into a file, e.g. raw.txt
-# python ../../../../../toolshed/reformat_cuda_enums_from_web_as_py.py raw.txt > raw.py
-# ruff format raw.py
-# Copy raw.py into this file (discarding the `DATA = {`, `}` lines).
-# Apply this manual fix:
-# -    10000: "MISSING EXPLANATION",
-# +    10000: "Pseudo code.",
+# To regenerate the dictionary below, run:
+#     ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h
+# Replace the dictionary below with the output.
 # Also update the CUDA Toolkit version number below.
-# Done.
-# CUDA Toolkit v12.9.0
+# ruff: noqa: E501
+# CUDA Toolkit v13.0.0
 RUNTIME_CUDA_ERROR_EXPLANATIONS = {
     0: (
-        "The API call returned with no errors. In the case of query calls, this also means that the operation"
-        " being queried is complete (see cudaEventQuery() and cudaStreamQuery())."
+        "The API call returned with no errors. In the case of query calls, this"
+        " also means that the operation being queried is complete (see"
+        " ::cudaEventQuery() and ::cudaStreamQuery())."
     ),
     1: (
-        "This indicates that one or more of the parameters passed to the API call is not within an acceptable"
-        " range of values."
+        "This indicates that one or more of the parameters passed to the API call"
+        " is not within an acceptable range of values."
     ),
     2: (
-        "The API call failed because it was unable to allocate enough memory or other resources to perform "
-        "the requested operation."
+        "The API call failed because it was unable to allocate enough memory or"
+        " other resources to perform the requested operation."
), - 3: "The API call failed because the CUDA driver and runtime could not be initialized.", + 3: ("The API call failed because the CUDA driver and runtime could not be initialized."), 4: ( - "This indicates that a CUDA Runtime API call cannot be executed because it is being called during " - "process shut down, at a point in time after CUDA driver has been unloaded." + "This indicates that a CUDA Runtime API call cannot be executed because" + " it is being called during process shut down, at a point in time after" + " CUDA driver has been unloaded." ), 5: ( - "This indicates profiler is not initialized for this run. This can happen when the application is " - "running with external profiling tools like visual profiler." + "This indicates profiler is not initialized for this run. This can" + " happen when the application is running with external profiling tools" + " like visual profiler." ), 6: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to attempt to " - "enable/disable the profiling via cudaProfilerStart or cudaProfilerStop without initialization." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to attempt to enable/disable the profiling via ::cudaProfilerStart or" + " ::cudaProfilerStop without initialization." ), 7: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to call cudaProfilerStart()" - " when profiling is already enabled." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to call cudaProfilerStart() when profiling is already enabled." ), 8: ( - "This error return is deprecated as of CUDA 5.0. It is no longer an error to call cudaProfilerStop() " - "when profiling is already disabled." + "This error return is deprecated as of CUDA 5.0. It is no longer an error" + " to call cudaProfilerStop() when profiling is already disabled." ), 9: ( - "This indicates that a kernel launch is requesting resources that can never be satisfied by the " - "current device. Requesting more shared memory per block than the device supports will trigger this " - "error, as will requesting too many threads or blocks. See cudaDeviceProp for more device " - "limitations." + "This indicates that a kernel launch is requesting resources that can" + " never be satisfied by the current device. Requesting more shared memory" + " per block than the device supports will trigger this error, as will" + " requesting too many threads or blocks. See ::cudaDeviceProp for more" + " device limitations." ), 12: ( - "This indicates that one or more of the pitch-related parameters passed to the API call is not within" - " the acceptable range for pitch." + "This indicates that one or more of the pitch-related parameters passed" + " to the API call is not within the acceptable range for pitch." ), 13: ("This indicates that the symbol name/identifier passed to the API call is not a valid name or identifier."), 16: ( - "This error return is deprecated as of CUDA 10.1. This indicates that at least one host pointer " - "passed to the API call is not a valid host pointer." + "This indicates that at least one host pointer passed to the API call is" + " not a valid host pointer." + " This error return is deprecated as of CUDA 10.1." ), 17: ( - "This error return is deprecated as of CUDA 10.1. This indicates that at least one device pointer " - "passed to the API call is not a valid device pointer." + "This indicates that at least one device pointer passed to the API call is" + " not a valid device pointer." 
+ " This error return is deprecated as of CUDA 10.1." ), - 18: "This indicates that the texture passed to the API call is not a valid texture.", + 18: ("This indicates that the texture passed to the API call is not a valid texture."), 19: ( - "This indicates that the texture binding is not valid. This occurs if you call " - "cudaGetTextureAlignmentOffset() with an unbound texture." + "This indicates that the texture binding is not valid. This occurs if you" + " call ::cudaGetTextureAlignmentOffset() with an unbound texture." ), 20: ( - "This indicates that the channel descriptor passed to the API call is not valid. This occurs if the " - "format is not one of the formats specified by cudaChannelFormatKind, or if one of the dimensions is " - "invalid." + "This indicates that the channel descriptor passed to the API call is not" + " valid. This occurs if the format is not one of the formats specified by" + " ::cudaChannelFormatKind, or if one of the dimensions is invalid." ), 21: ( - "This indicates that the direction of the memcpy passed to the API call is not one of the types " - "specified by cudaMemcpyKind." + "This indicates that the direction of the memcpy passed to the API call is" + " not one of the types specified by ::cudaMemcpyKind." ), 22: ( - "This error return is deprecated as of CUDA 3.1. Variables in constant memory may now have their " - "address taken by the runtime via cudaGetSymbolAddress(). This indicated that the user has taken the " - "address of a constant variable, which was forbidden up until the CUDA 3.1 release." + "This indicated that the user has taken the address of a constant variable," + " which was forbidden up until the CUDA 3.1 release." + " This error return is deprecated as of CUDA 3.1. Variables in constant" + " memory may now have their address taken by the runtime via" + " ::cudaGetSymbolAddress()." ), 23: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. This indicated that a texture fetch was not able to be performed. This was previously used " - "for device emulation of texture operations." + "This indicated that a texture fetch was not able to be performed." + " This was previously used for device emulation of texture operations." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 24: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. This indicated that a texture was not bound for access. This was previously used for device" - " emulation of texture operations." + "This indicated that a texture was not bound for access." + " This was previously used for device emulation of texture operations." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 25: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. This indicated that a synchronization operation had failed. This was previously used for " - "some device emulation functions." + "This indicated that a synchronization operation had failed." + " This was previously used for some device emulation functions." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 26: ( - "This indicates that a non-float texture was being accessed with linear filtering. This is not " - "supported by CUDA." 
+ "This indicates that a non-float texture was being accessed with linear" + " filtering. This is not supported by CUDA." ), 27: ( - "This indicates that an attempt was made to read an unsupported data type as a normalized float. This" - " is not supported by CUDA." + "This indicates that an attempt was made to read an unsupported data type as a" + " normalized float. This is not supported by CUDA." ), 28: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. Mixing of device and device emulation code was not allowed." + "Mixing of device and device emulation code was not allowed." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 31: ( - "This error return is deprecated as of CUDA 4.1. This indicates that the API call is not yet " - "implemented. Production releases of CUDA will never return this error." + "This indicates that the API call is not yet implemented. Production" + " releases of CUDA will never return this error." + " This error return is deprecated as of CUDA 4.1." ), 32: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. This indicated that an emulated device pointer exceeded the 32-bit address range." + "This indicated that an emulated device pointer exceeded the 32-bit address" + " range." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 34: ( - "This indicates that the CUDA driver that the application has loaded is a stub library. Applications " - "that run with the stub rather than a real driver loaded will result in CUDA API returning this " - "error." + "This indicates that the CUDA driver that the application has loaded is a" + " stub library. Applications that run with the stub rather than a real" + " driver loaded will result in CUDA API returning this error." ), 35: ( - "This indicates that the installed NVIDIA CUDA driver is older than the CUDA runtime library. This is" - " not a supported configuration. Users should install an updated NVIDIA display driver to allow the " - "application to run." + "This indicates that the installed NVIDIA CUDA driver is older than the" + " CUDA runtime library. This is not a supported configuration. Users should" + " install an updated NVIDIA display driver to allow the application to run." ), 36: ( - "This indicates that the API call requires a newer CUDA driver than the one currently installed. " - "Users should install an updated NVIDIA CUDA driver to allow the API call to succeed." + "This indicates that the API call requires a newer CUDA driver than the one" + " currently installed. Users should install an updated NVIDIA CUDA driver" + " to allow the API call to succeed." ), - 37: "This indicates that the surface passed to the API call is not a valid surface.", + 37: ("This indicates that the surface passed to the API call is not a valid surface."), 43: ( - "This indicates that multiple global or constant variables (across separate CUDA source files in the " - "application) share the same string name." + "This indicates that multiple global or constant variables (across separate" + " CUDA source files in the application) share the same string name." ), 44: ( - "This indicates that multiple textures (across separate CUDA source files in the application) share " - "the same string name." 
+ "This indicates that multiple textures (across separate CUDA source" + " files in the application) share the same string name." ), 45: ( - "This indicates that multiple surfaces (across separate CUDA source files in the application) share " - "the same string name." + "This indicates that multiple surfaces (across separate CUDA source" + " files in the application) share the same string name." ), 46: ( - "This indicates that all CUDA devices are busy or unavailable at the current time. Devices are often " - "busy/unavailable due to use of cudaComputeModeProhibited, cudaComputeModeExclusiveProcess, or when " - "long running CUDA kernels have filled up the GPU and are blocking new work from starting. They can " - "also be unavailable due to memory constraints on a device that already has active CUDA work being " - "performed." + "This indicates that all CUDA devices are busy or unavailable at the current" + " time. Devices are often busy/unavailable due to use of" + " ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long" + " running CUDA kernels have filled up the GPU and are blocking new work" + " from starting. They can also be unavailable due to memory constraints" + " on a device that already has active CUDA work being performed." ), 49: ( - "This indicates that the current context is not compatible with this the CUDA Runtime. This can only " - "occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver " - "context using the driver API. The Driver context may be incompatible either because the Driver " - "context was created using an older version of the API, because the Runtime API call expects a " - "primary driver context and the Driver context is not primary, or because the Driver context has been" - ' destroyed. Please see Interactions with the CUDA Driver API" for more information.' + "This indicates that the current context is not compatible with this" + " the CUDA Runtime. This can only occur if you are using CUDA" + " Runtime/Driver interoperability and have created an existing Driver" + " context using the driver API. The Driver context may be incompatible" + " either because the Driver context was created using an older version" + " of the API, because the Runtime API call expects a primary driver" + " context and the Driver context is not primary, or because the Driver" + ' context has been destroyed. Please see CUDART_DRIVER "Interactions' + ' with the CUDA Driver API" for more information.' ), 52: ( - "The device function being invoked (usually via cudaLaunchKernel()) was not previously configured via" - " the cudaConfigureCall() function." + "The device function being invoked (usually via ::cudaLaunchKernel()) was not" + " previously configured via the ::cudaConfigureCall() function." ), 53: ( - "This error return is deprecated as of CUDA 3.1. Device emulation mode was removed with the CUDA 3.1 " - "release. This indicated that a previous kernel launch failed. This was previously used for device " - "emulation of kernel launches." + "This indicated that a previous kernel launch failed. This was previously" + " used for device emulation of kernel launches." + " This error return is deprecated as of CUDA 3.1. Device emulation mode was" + " removed with the CUDA 3.1 release." ), 65: ( - "This error indicates that a device runtime grid launch did not occur because the depth of the child " - "grid would exceed the maximum supported number of nested grid launches." 
+ "This error indicates that a device runtime grid launch did not occur" + " because the depth of the child grid would exceed the maximum supported" + " number of nested grid launches." ), 66: ( - "This error indicates that a grid launch did not occur because the kernel uses file-scoped textures " - "which are unsupported by the device runtime. Kernels launched via the device runtime only support " - "textures created with the Texture Object API's." + "This error indicates that a grid launch did not occur because the kernel" + " uses file-scoped textures which are unsupported by the device runtime." + " Kernels launched via the device runtime only support textures created with" + " the Texture Object API's." ), 67: ( - "This error indicates that a grid launch did not occur because the kernel uses file-scoped surfaces " - "which are unsupported by the device runtime. Kernels launched via the device runtime only support " - "surfaces created with the Surface Object API's." + "This error indicates that a grid launch did not occur because the kernel" + " uses file-scoped surfaces which are unsupported by the device runtime." + " Kernels launched via the device runtime only support surfaces created with" + " the Surface Object API's." ), 68: ( - "This error indicates that a call to cudaDeviceSynchronize made from the device runtime failed " - "because the call was made at grid depth greater than than either the default (2 levels of grids) or " - "user specified device limit cudaLimitDevRuntimeSyncDepth. To be able to synchronize on launched " - "grids at a greater depth successfully, the maximum nested depth at which cudaDeviceSynchronize will " - "be called must be specified with the cudaLimitDevRuntimeSyncDepth limit to the cudaDeviceSetLimit " - "api before the host-side launch of a kernel using the device runtime. Keep in mind that additional " - "levels of sync depth require the runtime to reserve large amounts of device memory that cannot be " - "used for user allocations. Note that cudaDeviceSynchronize made from device runtime is only " - "supported on devices of compute capability < 9.0." + "This error indicates that a call to ::cudaDeviceSynchronize made from" + " the device runtime failed because the call was made at grid depth greater" + " than than either the default (2 levels of grids) or user specified device" + " limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on" + " launched grids at a greater depth successfully, the maximum nested" + " depth at which ::cudaDeviceSynchronize will be called must be specified" + " with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit" + " api before the host-side launch of a kernel using the device runtime." + " Keep in mind that additional levels of sync depth require the runtime" + " to reserve large amounts of device memory that cannot be used for" + " user allocations. Note that ::cudaDeviceSynchronize made from device" + " runtime is only supported on devices of compute capability < 9.0." ), 69: ( - "This error indicates that a device runtime grid launch failed because the launch would exceed the " - "limit cudaLimitDevRuntimePendingLaunchCount. For this launch to proceed successfully, " - "cudaDeviceSetLimit must be called to set the cudaLimitDevRuntimePendingLaunchCount to be higher than" - " the upper bound of outstanding launches that can be issued to the device runtime. 
Keep in mind that" - " raising the limit of pending device runtime launches will require the runtime to reserve device " - "memory that cannot be used for user allocations." - ), - 98: "The requested device function does not exist or is not compiled for the proper device architecture.", - 100: "This indicates that no CUDA-capable devices were detected by the installed CUDA driver.", + "This error indicates that a device runtime grid launch failed because" + " the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount." + " For this launch to proceed successfully, ::cudaDeviceSetLimit must be" + " called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher" + " than the upper bound of outstanding launches that can be issued to the" + " device runtime. Keep in mind that raising the limit of pending device" + " runtime launches will require the runtime to reserve device memory that" + " cannot be used for user allocations." + ), + 98: ("The requested device function does not exist or is not compiled for the proper device architecture."), + 100: ("This indicates that no CUDA-capable devices were detected by the installed CUDA driver."), 101: ( - "This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA " - "device or that the action requested is invalid for the specified device." + "This indicates that the device ordinal supplied by the user does not" + " correspond to a valid CUDA device or that the action requested is" + " invalid for the specified device." ), 102: "This indicates that the device doesn't have a valid Grid License.", 103: ( - "By default, the CUDA runtime may perform a minimal set of self-tests, as well as CUDA driver tests, " - "to establish the validity of both. Introduced in CUDA 11.2, this error return indicates that at " - "least one of these tests has failed and the validity of either the runtime or the driver could not " - "be established." + "By default, the CUDA runtime may perform a minimal set of self-tests," + " as well as CUDA driver tests, to establish the validity of both." + " Introduced in CUDA 11.2, this error return indicates that at least one" + " of these tests has failed and the validity of either the runtime" + " or the driver could not be established." ), 127: "This indicates an internal startup failure in the CUDA runtime.", 200: "This indicates that the device kernel image is invalid.", 201: ( - "This most frequently indicates that there is no context bound to the current thread. This can also " - "be returned if the context passed to an API call is not a valid handle (such as a context that has " - "had cuCtxDestroy() invoked on it). This can also be returned if a user mixes different API versions " - "(i.e. 3010 context with 3020 API calls). See cuCtxGetApiVersion() for more details." + "This most frequently indicates that there is no context bound to the" + " current thread. This can also be returned if the context passed to an" + " API call is not a valid handle (such as a context that has had" + " ::cuCtxDestroy() invoked on it). This can also be returned if a user" + " mixes different API versions (i.e. 3010 context with 3020 API calls)." + " See ::cuCtxGetApiVersion() for more details." 
    ),
    205: "This indicates that the buffer object could not be mapped.",
     206: "This indicates that the buffer object could not be unmapped.",
-    207: "This indicates that the specified array is currently mapped and thus cannot be destroyed.",
+    207: ("This indicates that the specified array is currently mapped and thus cannot be destroyed."),
     208: "This indicates that the resource is already mapped.",
     209: (
-        "This indicates that there is no kernel image available that is suitable for the device. This can "
-        "occur when a user specifies code generation options for a particular CUDA source file that do not "
-        "include the corresponding device configuration."
+        "This indicates that there is no kernel image available that is suitable"
+        " for the device. This can occur when a user specifies code generation"
+        " options for a particular CUDA source file that do not include the"
+        " corresponding device configuration."
     ),
     210: "This indicates that a resource has already been acquired.",
     211: "This indicates that a resource is not mapped.",
-    212: "This indicates that a mapped resource is not available for access as an array.",
-    213: "This indicates that a mapped resource is not available for access as a pointer.",
-    214: "This indicates that an uncorrectable ECC error was detected during execution.",
-    215: "This indicates that the cudaLimit passed to the API call is not supported by the active device.",
+    212: ("This indicates that a mapped resource is not available for access as an array."),
+    213: ("This indicates that a mapped resource is not available for access as a pointer."),
+    214: ("This indicates that an uncorrectable ECC error was detected during execution."),
+    215: ("This indicates that the ::cudaLimit passed to the API call is not supported by the active device."),
     216: (
-        "This indicates that a call tried to access an exclusive-thread device that is already in use by a "
-        "different thread."
+        "This indicates that a call tried to access an exclusive-thread device that"
+        " is already in use by a different thread."
     ),
-    217: "This error indicates that P2P access is not supported across the given devices.",
+    217: ("This error indicates that P2P access is not supported across the given devices."),
     218: (
-        "A PTX compilation failed. The runtime may fall back to compiling PTX if an application does not "
-        "contain a suitable binary for the current device."
+        "A PTX compilation failed. The runtime may fall back to compiling PTX if"
+        " an application does not contain a suitable binary for the current device."
     ),
     219: "This indicates an error with the OpenGL or DirectX context.",
-    220: "This indicates that an uncorrectable NVLink error was detected during the execution.",
+    220: ("This indicates that an uncorrectable NVLink error was detected during the execution."),
     221: (
-        "This indicates that the PTX JIT compiler library was not found. The JIT Compiler library is used for"
-        " PTX compilation. The runtime may fall back to compiling PTX if an application does not contain a "
-        "suitable binary for the current device."
+        "This indicates that the PTX JIT compiler library was not found. The JIT Compiler"
+        " library is used for PTX compilation. The runtime may fall back to compiling PTX"
+        " if an application does not contain a suitable binary for the current device."
     ),
     222: (
-        "This indicates that the provided PTX was compiled with an unsupported toolchain. The most common "
-        "reason for this, is the PTX was generated by a compiler newer than what is supported by the CUDA "
-        "driver and PTX JIT compiler."
+        "This indicates that the provided PTX was compiled with an unsupported toolchain."
+        " The most common reason for this is that the PTX was generated by a compiler newer"
+        " than what is supported by the CUDA driver and PTX JIT compiler."
     ),
     223: (
-        "This indicates that the JIT compilation was disabled. The JIT compilation compiles PTX. The runtime "
-        "may fall back to compiling PTX if an application does not contain a suitable binary for the current "
-        "device."
+        "This indicates that the JIT compilation was disabled. The JIT compilation compiles"
+        " PTX. The runtime may fall back to compiling PTX if an application does not contain"
+        " a suitable binary for the current device."
     ),
     224: "This indicates that the provided execution affinity is not supported by the device.",
     225: (
         "This indicates that the code to be compiled by the PTX JIT contains unsupported call to cudaDeviceSynchronize."
     ),
     226: (
-        "This indicates that an exception occurred on the device that is now contained by the GPU's error "
-        "containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory "
-        "over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state "
-        "and any further CUDA work will return the same error. To continue using CUDA, the process must be "
-        "terminated and relaunched."
+        "This indicates that an exception occurred on the device that is now"
+        " contained by the GPU's error containment capability. Common causes are -"
+        " a. Certain types of invalid accesses of peer GPU memory over nvlink"
+        " b. Certain classes of hardware errors"
+        " This leaves the process in an inconsistent state and any further CUDA"
+        " work will return the same error. To continue using CUDA, the process must"
+        " be terminated and relaunched."
     ),
+    227: ("This indicates that an NVLink encryption error was detected during the execution."),
     300: "This indicates that the device kernel source is invalid.",
     301: "This indicates that the file specified was not found.",
     302: "This indicates that a link to a shared object failed to resolve.",
     303: "This indicates that initialization of a shared object failed.",
     304: "This error indicates that an OS call failed.",
     400: (
-        "This indicates that a resource handle passed to the API call was not valid. Resource handles are "
-        "opaque types like cudaStream_t and cudaEvent_t."
+        "This indicates that a resource handle passed to the API call was not"
+        " valid. Resource handles are opaque types like ::cudaStream_t and"
+        " ::cudaEvent_t."
     ),
     401: (
-        "This indicates that a resource required by the API call is not in a valid state to perform the "
-        "requested operation."
+        "This indicates that a resource required by the API call is not in a"
+        " valid state to perform the requested operation."
     ),
     402: (
-        "This indicates an attempt was made to introspect an object in a way that would discard semantically "
-        "important information. This is either due to the object using funtionality newer than the API "
-        "version used to introspect it or omission of optional return arguments."
+        "This indicates an attempt was made to introspect an object in a way that"
+        " would discard semantically important information. This is either due to"
+        " the object using functionality newer than the API version used to"
+        " introspect it or omission of optional return arguments."
), 500: ( - "This indicates that a named symbol was not found. Examples of symbols are global/constant variable " - "names, driver function names, texture names, and surface names." + "This indicates that a named symbol was not found. Examples of symbols" + " are global/constant variable names, driver function names, texture names," + " and surface names." ), 600: ( - "This indicates that asynchronous operations issued previously have not completed yet. This result is" - " not actually an error, but must be indicated differently than cudaSuccess (which indicates " - "completion). Calls that may return this value include cudaEventQuery() and cudaStreamQuery()." + "This indicates that asynchronous operations issued previously have not" + " completed yet. This result is not actually an error, but must be indicated" + " differently than ::cudaSuccess (which indicates completion). Calls that" + " may return this value include ::cudaEventQuery() and ::cudaStreamQuery()." ), 700: ( - "The device encountered a load or store instruction on an invalid memory address. This leaves the " - "process in an inconsistent state and any further CUDA work will return the same error. To continue " - "using CUDA, the process must be terminated and relaunched." + "The device encountered a load or store instruction on an invalid memory address." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 701: ( - "This indicates that a launch did not occur because it did not have appropriate resources. Although " - "this error is similar to cudaErrorInvalidConfiguration, this error usually indicates that the user " - "has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too " - "many threads for the kernel's register count." + "This indicates that a launch did not occur because it did not have" + " appropriate resources. Although this error is similar to" + " ::cudaErrorInvalidConfiguration, this error usually indicates that the" + " user has attempted to pass too many arguments to the device kernel, or the" + " kernel launch specifies too many threads for the kernel's register count." ), 702: ( - "This indicates that the device kernel took too long to execute. This can only occur if timeouts are " - "enabled - see the device property kernelExecTimeoutEnabled for more information. This leaves the " - "process in an inconsistent state and any further CUDA work will return the same error. To continue " - "using CUDA, the process must be terminated and relaunched." + "This indicates that the device kernel took too long to execute. This can" + " only occur if timeouts are enabled - see the device attribute" + ' ::cudaDeviceAttr::cudaDevAttrKernelExecTimeout "cudaDevAttrKernelExecTimeout"' + " for more information." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), - 703: "This error indicates a kernel launch that uses an incompatible texturing mode.", + 703: ("This error indicates a kernel launch that uses an incompatible texturing mode."), 704: ( - "This error indicates that a call to cudaDeviceEnablePeerAccess() is trying to re-enable peer " - "addressing on from a context which has already had peer addressing enabled." 
+ "This error indicates that a call to ::cudaDeviceEnablePeerAccess() is" + " trying to re-enable peer addressing on from a context which has already" + " had peer addressing enabled." ), 705: ( - "This error indicates that cudaDeviceDisablePeerAccess() is trying to disable peer addressing which " - "has not been enabled yet via cudaDeviceEnablePeerAccess()." + "This error indicates that ::cudaDeviceDisablePeerAccess() is trying to" + " disable peer addressing which has not been enabled yet via" + " ::cudaDeviceEnablePeerAccess()." ), 708: ( - "This indicates that the user has called cudaSetValidDevices(), cudaSetDeviceFlags(), " - "cudaD3D9SetDirect3DDevice(), cudaD3D10SetDirect3DDevice, cudaD3D11SetDirect3DDevice(), or " - "cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by calling non-device management " - "operations (allocating memory and launching kernels are examples of non-device management " - "operations). This error can also be returned if using runtime/driver interoperability and there is " - "an existing CUcontext active on the host thread." + "This indicates that the user has called ::cudaSetValidDevices()," + " ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice()," + " ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or" + " ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by" + " calling non-device management operations (allocating memory and" + " launching kernels are examples of non-device management operations)." + " This error can also be returned if using runtime/driver" + " interoperability and there is an existing ::CUcontext active on the" + " host thread." ), 709: ( - "This error indicates that the context current to the calling thread has been destroyed using " - "cuCtxDestroy, or is a primary context which has not yet been initialized." + "This error indicates that the context current to the calling thread" + " has been destroyed using ::cuCtxDestroy, or is a primary context which" + " has not yet been initialized." ), 710: ( - "An assert triggered in device code during kernel execution. The device cannot be used again. All " - "existing allocations are invalid. To continue using CUDA, the process must be terminated and " - "relaunched." + "An assert triggered in device code during kernel execution. The device" + " cannot be used again. All existing allocations are invalid. To continue" + " using CUDA, the process must be terminated and relaunched." ), 711: ( - "This error indicates that the hardware resources required to enable peer access have been exhausted " - "for one or more of the devices passed to cudaEnablePeerAccess()." + "This error indicates that the hardware resources required to enable" + " peer access have been exhausted for one or more of the devices" + " passed to ::cudaEnablePeerAccess()." ), - 712: "This error indicates that the memory range passed to cudaHostRegister() has already been registered.", + 712: ("This error indicates that the memory range passed to ::cudaHostRegister() has already been registered."), 713: ( - "This error indicates that the pointer passed to cudaHostUnregister() does not correspond to any " - "currently registered memory region." + "This error indicates that the pointer passed to ::cudaHostUnregister()" + " does not correspond to any currently registered memory region." ), 714: ( - "Device encountered an error in the call stack during kernel execution, possibly due to stack " - "corruption or exceeding the stack size limit. 
This leaves the process in an inconsistent state and " - "any further CUDA work will return the same error. To continue using CUDA, the process must be " - "terminated and relaunched." + "Device encountered an error in the call stack during kernel execution," + " possibly due to stack corruption or exceeding the stack size limit." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 715: ( - "The device encountered an illegal instruction during kernel execution This leaves the process in an " - "inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the" - " process must be terminated and relaunched." + "The device encountered an illegal instruction during kernel execution" + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 716: ( - "The device encountered a load or store instruction on a memory address which is not aligned. This " - "leaves the process in an inconsistent state and any further CUDA work will return the same error. To" - " continue using CUDA, the process must be terminated and relaunched." + "The device encountered a load or store instruction" + " on a memory address which is not aligned." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 717: ( - "While executing a kernel, the device encountered an instruction which can only operate on memory " - "locations in certain address spaces (global, shared, or local), but was supplied a memory address " - "not belonging to an allowed address space. This leaves the process in an inconsistent state and any " - "further CUDA work will return the same error. To continue using CUDA, the process must be terminated" + "While executing a kernel, the device encountered an instruction" + " which can only operate on memory locations in certain address spaces" + " (global, shared, or local), but was supplied a memory address not" + " belonging to an allowed address space." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" " and relaunched." ), 718: ( - "The device encountered an invalid program counter. This leaves the process in an inconsistent state " - "and any further CUDA work will return the same error. To continue using CUDA, the process must be " - "terminated and relaunched." + "The device encountered an invalid program counter." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 719: ( - "An exception occurred on the device while executing a kernel. Common causes include dereferencing an" - " invalid device pointer and accessing out of bounds shared memory. Less common cases can be system " - "specific - more information about these cases can be found in the system specific user guide. This " - "leaves the process in an inconsistent state and any further CUDA work will return the same error. To" - " continue using CUDA, the process must be terminated and relaunched." 
+ "An exception occurred on the device while executing a kernel. Common" + " causes include dereferencing an invalid device pointer and accessing" + " out of bounds shared memory. Less common cases can be system specific - more" + " information about these cases can be found in the system specific user guide." + " This leaves the process in an inconsistent state and any further CUDA work" + " will return the same error. To continue using CUDA, the process must be terminated" + " and relaunched." ), 720: ( - "This error indicates that the number of blocks launched per grid for a kernel that was launched via " - "either cudaLaunchCooperativeKernel or cudaLaunchCooperativeKernelMultiDevice exceeds the maximum " - "number of blocks as allowed by cudaOccupancyMaxActiveBlocksPerMultiprocessor or " - "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors as " - "specified by the device attribute cudaDevAttrMultiProcessorCount." + "This error indicates that the number of blocks launched per grid for a kernel that was" + " launched via either ::cudaLaunchCooperativeKernel" + " exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor" + " or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors" + " as specified by the device attribute ::cudaDevAttrMultiProcessorCount." ), 721: ( - "An exception occurred on the device while exiting a kernel using tensor memory: the tensor memory " - "was not completely deallocated. This leaves the process in an inconsistent state and any further " - "CUDA work will return the same error. To continue using CUDA, the process must be terminated and " - "relaunched." + "An exception occurred on the device while exiting a kernel using tensor memory: the" + " tensor memory was not completely deallocated. This leaves the process in an inconsistent" + " state and any further CUDA work will return the same error. To continue using CUDA, the" + " process must be terminated and relaunched." ), 800: "This error indicates the attempted operation is not permitted.", - 801: "This error indicates the attempted operation is not supported on the current system or device.", + 801: ("This error indicates the attempted operation is not supported on the current system or device."), 802: ( - "This error indicates that the system is not yet ready to start any CUDA work. To continue using " - "CUDA, verify the system configuration is in a valid state and all required driver daemons are " - "actively running. More information about this error can be found in the system specific user guide." + "This error indicates that the system is not yet ready to start any CUDA" + " work. To continue using CUDA, verify the system configuration is in a" + " valid state and all required driver daemons are actively running." + " More information about this error can be found in the system specific" + " user guide." ), 803: ( - "This error indicates that there is a mismatch between the versions of the display driver and the " - "CUDA driver. Refer to the compatibility documentation for supported versions." + "This error indicates that there is a mismatch between the versions of" + " the display driver and the CUDA driver. Refer to the compatibility documentation" + " for supported versions." ), 804: ( - "This error indicates that the system was upgraded to run with forward compatibility but the visible " - "hardware detected by CUDA does not support this configuration. 
Refer to the compatibility " - "documentation for the supported hardware matrix or ensure that only supported hardware is visible " - "during initialization via the CUDA_VISIBLE_DEVICES environment variable." - ), - 805: ("This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server."), - 806: ("This error indicates that the remote procedural call between the MPS server and the MPS client failed."), + "This error indicates that the system was upgraded to run with forward compatibility" + " but the visible hardware detected by CUDA does not support this configuration." + " Refer to the compatibility documentation for the supported hardware matrix or ensure" + " that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES" + " environment variable." + ), + 805: "This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.", + 806: "This error indicates that the remote procedural call between the MPS server and the MPS client failed.", 807: ( - "This error indicates that the MPS server is not ready to accept new MPS client requests. This error " - "can be returned when the MPS server is in the process of recovering from a fatal failure." + "This error indicates that the MPS server is not ready to accept new MPS client requests." + " This error can be returned when the MPS server is in the process of recovering from a fatal failure." ), 808: "This error indicates that the hardware resources required to create MPS client have been exhausted.", 809: "This error indicates the the hardware resources required to device connections have been exhausted.", - 810: ( - "This error indicates that the MPS client has been terminated by the server. To continue using CUDA, " - "the process must be terminated and relaunched." - ), - 811: ( - "This error indicates, that the program is using CUDA Dynamic Parallelism, but the current " - "configuration, like MPS, does not support it." - ), - 812: ( - "This error indicates, that the program contains an unsupported interaction between different " - "versions of CUDA Dynamic Parallelism." - ), + 810: "This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.", + 811: "This error indicates, that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.", + 812: "This error indicates, that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.", 900: "The operation is not permitted when the stream is capturing.", - 901: "The current capture sequence on the stream has been invalidated due to a previous error.", - 902: "The operation would have resulted in a merge of two independent capture sequences.", + 901: ("The current capture sequence on the stream has been invalidated due to a previous error."), + 902: ("The operation would have resulted in a merge of two independent capture sequences."), 903: "The capture was not initiated in this stream.", - 904: "The capture sequence contains a fork that was not joined to the primary stream.", + 904: ("The capture sequence contains a fork that was not joined to the primary stream."), 905: ( - "A dependency would have been created which crosses the capture sequence boundary. Only implicit in-" - "stream ordering dependencies are allowed to cross the boundary." 
+ "A dependency would have been created which crosses the capture sequence" + " boundary. Only implicit in-stream ordering dependencies are allowed to" + " cross the boundary." ), 906: ( - "The operation would have resulted in a disallowed implicit dependency on a current capture sequence " - "from cudaStreamLegacy." + "The operation would have resulted in a disallowed implicit dependency on" + " a current capture sequence from cudaStreamLegacy." ), - 907: "The operation is not permitted on an event which was last recorded in a capturing stream.", + 907: ("The operation is not permitted on an event which was last recorded in a capturing stream."), 908: ( - "A stream capture sequence not initiated with the cudaStreamCaptureModeRelaxed argument to " - "cudaStreamBeginCapture was passed to cudaStreamEndCapture in a different thread." + "A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed" + " argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a" + " different thread." ), 909: "This indicates that the wait operation has timed out.", 910: ( - "This error indicates that the graph update was not performed because it included changes which " - "violated constraints specific to instantiated graph update." + "This error indicates that the graph update was not performed because it included" + " changes which violated constraints specific to instantiated graph update." ), 911: ( - "This indicates that an async error has occurred in a device outside of CUDA. If CUDA was waiting for" - " an external device's signal before consuming shared data, the external device signaled an error " - "indicating that the data is not valid for consumption. This leaves the process in an inconsistent " - "state and any further CUDA work will return the same error. To continue using CUDA, the process must" - " be terminated and relaunched." - ), - 912: "This indicates that a kernel launch error has occurred due to cluster misconfiguration.", - 913: "Indiciates a function handle is not loaded when calling an API that requires a loaded function.", - 914: "This error indicates one or more resources passed in are not valid resource types for the operation.", - 915: "This error indicates one or more resources are insufficient or non-applicable for the operation.", + "This indicates that an async error has occurred in a device outside of CUDA." + " If CUDA was waiting for an external device's signal before consuming shared data," + " the external device signaled an error indicating that the data is not valid for" + " consumption. This leaves the process in an inconsistent state and any further CUDA" + " work will return the same error. To continue using CUDA, the process must be" + " terminated and relaunched." + ), + 912: ("This indicates that a kernel launch error has occurred due to cluster misconfiguration."), + 913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."), + 914: ("This error indicates one or more resources passed in are not valid resource types for the operation."), + 915: ("This error indicates one or more resources are insufficient or non-applicable for the operation."), 999: "This indicates that an unknown internal error has occurred.", - 10000: "Pseudo code.", + 10000: ( + "Any unhandled CUDA driver error is added to this value and returned via" + " the runtime. Production releases of CUDA should not return such errors." + " This error return is deprecated as of CUDA 4.1." 
+    ),
 }
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index 309d661f50..db168af580 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -77,7 +77,7 @@ def test_pci_bus_id():
 def test_uuid():
     device = Device()
     driver_ver = handle_return(driver.cuDriverGetVersion())
-    if driver_ver >= 11040:
+    if 11040 <= driver_ver < 13000:
         uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id))
     else:
         uuid = handle_return(driver.cuDeviceGetUuid(device.device_id))
diff --git a/toolshed/reformat_cuda_enums_as_py.py b/toolshed/reformat_cuda_enums_as_py.py
new file mode 100755
index 0000000000..0a048cde99
--- /dev/null
+++ b/toolshed/reformat_cuda_enums_as_py.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+
+def extract_enum_block(header_file_lines):
+    line_iter = iter(header_file_lines)
+    for line in line_iter:
+        if line == "typedef enum cudaError_enum {":
+            closing_line = "} CUresult;"
+            python_dict_name = "DRIVER_CU_RESULT_EXPLANATIONS"
+            break
+        if line == "enum __device_builtin__ cudaError":
+            line = next(line_iter)
+            assert line == "{", line
+            closing_line = "};"
+            python_dict_name = "RUNTIME_CUDA_ERROR_EXPLANATIONS"
+            break
+    else:
+        raise RuntimeError("Opening line not found.")
+    block = []
+    for line in line_iter:
+        if line == closing_line:
+            break
+        block.append(line)
+    else:
+        raise RuntimeError("Closing line not found.")
+    return python_dict_name, block
+
+
+def parse_enum_doc_and_value_pairs(enum_block):
+    entries = []
+    comment_lines = []
+    inside_comment = False
+
+    for line in enum_block:
+        stripped = line.strip()
+        if not stripped:
+            continue
+
+        if stripped.startswith("/**"):
+            inside_comment = True
+            comment = stripped[3:].lstrip()
+            if comment:
+                comment_lines = [comment]
+        elif inside_comment:
+            if stripped.endswith("*/"):
+                comment = stripped[:-2].strip()
+                if comment:
+                    comment_lines.append(comment)
+                inside_comment = False
+            else:
+                comment_lines.append(stripped.lstrip("*").strip())
+        elif stripped:
+            assert stripped.count(",") <= 1, line
+            stripped = stripped.replace(",", "")
+            flds = stripped.split(" = ")
+            assert len(flds) == 2, line
+            try:
+                val = int(flds[1].strip())
+            except Exception as e:
+                raise RuntimeError(f"Unexpected {line=!r}") from e
+            entries.append((int(val), comment_lines))
+            comment_lines = []
+
+    return entries
+
+
+def emit_python_dict(python_dict_name, entries):
+    print(f"{python_dict_name} = {{")
+    for val, lines in entries:
+        py_lines = []
+        continuation_space = ""
+        for line in lines:
+            if line == r"\deprecated":
+                continue
+            mod_line = line.replace("\\ref ", "")
+            assert "\\" not in mod_line, line
+            mod_line = mod_line.replace('"', '\\"')
+            py_lines.append(f'"{continuation_space}{mod_line}"')
+            continuation_space = " "
+        assert py_lines, lines
+        if len(py_lines) == 1:
+            print(f"    {val}: {py_lines[0]},")
+        else:
+            print(f"    {val}: (")
+            for py_line in py_lines:
+                print(f"        {py_line}")
+            print("    ),")
+    print("}")
+
+
+def run(args):
+    if len(args) != 1:
+        print(
+            "Usage: reformat_cuda_enums_as_py.py /path/to/cuda.h|driver_types.h",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    header_file_lines = open(args[0]).read().splitlines()
+    python_dict_name, enum_block = extract_enum_block(header_file_lines)
+    entries = parse_enum_doc_and_value_pairs(enum_block)
+    emit_python_dict(python_dict_name, entries)
+
+
+if __name__ == "__main__":
+    run(sys.argv[1:])
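+
+# A minimal usage sketch (illustrative only; the header path depends on the
+# local CUDA installation):
+#
+#   ./toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h > raw.py
+#   ruff format raw.py
+#
+# The script writes a single dict literal to stdout (RUNTIME_CUDA_ERROR_EXPLANATIONS
+# for driver_types.h, DRIVER_CU_RESULT_EXPLANATIONS for cuda.h), which then replaces
+# the dictionary in the corresponding *_cuda_error_explanations.py file.
diff --git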
diff --git a/toolshed/reformat_cuda_enums_from_web_as_py.py b/toolshed/reformat_cuda_enums_from_web_as_py.py
deleted file mode 100755
index 8ebce05611..0000000000
--- a/toolshed/reformat_cuda_enums_from_web_as_py.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import sys
-import textwrap
-
-
-def run(args):
-    assert len(args) == 1
-    num = None
-    buffer = []
-
-    print("DATA = {")
-
-    def flush_buffer():
-        txt = " ".join(buffer)
-        buffer.clear()
-        parts = textwrap.wrap(txt, width=100, drop_whitespace=False)
-        assert "".join(parts) == txt
-        print(f"{num}:")
-        if len(parts) > 1:
-            print("(")
-        for p in parts:
-            print(repr(p))
-        if len(parts) > 1:
-            print(")")
-        print(",")
-
-    for line in open(args[0]).read().splitlines():
-        line = line.strip()
-        if not line or line == "Deprecated":
-            continue
-        if " = " in line:
-            if buffer:
-                assert num is not None
-                flush_buffer()
-            kw, num = line.split(" = ", 1)
-        else:
-            buffer.append(line)
-
-    if num is not None and not buffer:
-        buffer = ["MISSING EXPLANATION"]
-    if buffer:
-        assert num is not None
-        flush_buffer()
-
-    print("}")  # DATA
-
-
-if __name__ == "__main__":
-    run(args=sys.argv[1:])

From fa3304560a0f303d2ed3e7155cea86553c971140 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve"
Date: Tue, 10 Jun 2025 10:22:09 -0700
Subject: [PATCH 09/65] Fixes from windows testing (#89)

* Add missing error handling (tests/test_nvjitlink.py)

* Add missing `const` in cudaMemcpyBatchAsync call (cuda/bindings/runtime.pyx.in)

* Add qa/13.0.0/01_linux.sh

* Remove qa/13.0.0/01_linux.sh after it was moved to a new upstream qa branch.

* Strictly correct casts for cudaMemcpyBatchAsync (generated by cython_gen).

* Pragmatic minimal fix for cudaMemcpyBatchAsync casts (works with Linux and Windows). (generated with cython-gen)
---
 cuda_bindings/cuda/bindings/runtime.pyx.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 04a9564691..026df7dcd0 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -25849,7 +25849,7 @@ def cudaMemcpyBatchAsync(dsts : Optional[Tuple[Any] | List[Any]], srcs : Optiona
     cdef vector[size_t] cyattrsIdxs = attrsIdxs
     if numAttrs > len(attrs): raise RuntimeError("List is too small: " + str(len(attrs)) + " < " + str(numAttrs))
     if numAttrs > len(attrsIdxs): raise RuntimeError("List is too small: " + str(len(attrsIdxs)) + " < " + str(numAttrs))
-    err = cyruntime.cudaMemcpyBatchAsync(voidStarHelperdsts.cptr, voidStarHelpersrcs.cptr, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cystream)
+    err = cyruntime.cudaMemcpyBatchAsync(voidStarHelperdsts.cptr, voidStarHelpersrcs.cptr, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cystream)
     if cyattrs is not NULL:
         free(cyattrs)
     return (_dict_cudaError_t[err],)

From 104abbdb4824dc2ed43565c8cafdf363aefc5f07 Mon Sep 17 00:00:00 2001
From: "Ralf W.
Grosse-Kunstleve" Date: Thu, 12 Jun 2025 10:36:05 -0700 Subject: [PATCH 10/65] print *prog pointers in nvrtcCreateProgram, nvrtcCompileProgram bindings --- .../cuda/bindings/_bindings/cynvrtc.pyx.in | 13 +++++++++++++ cuda_bindings/cuda/bindings/cynvrtc.pyx.in | 2 ++ cuda_bindings/cuda/bindings/nvrtc.pyx.in | 1 + 3 files changed, 16 insertions(+) diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 9af9baad26..9ad166b4c4 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -382,6 +382,14 @@ cdef nvrtcResult _nvrtcCreateProgram(nvrtcProgram* prog, const char* src, const if __nvrtcCreateProgram == NULL: with gil: raise RuntimeError('Function "nvrtcCreateProgram" not found') + with gil: + print(f"\nLOOOK _bindings/cynvrtc.nvrtcCreateProgram CALLS __nvrtcCreateProgram", flush=True) + print(f"LOOOK *prog={prog:x}", flush=True) + print(f"LOOOK {src=!r}", flush=True) + print(f"LOOOK {name=!r}", flush=True) + print(f"LOOOK {numHeaders=!r}", flush=True) + print(f"LOOOK *headers={headers:x}", flush=True) + print(f"LOOOK *includeNames={includeNames:x}", flush=True) err = ( __nvrtcCreateProgram)(prog, src, name, numHeaders, headers, includeNames) return err {{endif}} @@ -406,6 +414,11 @@ cdef nvrtcResult _nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const c if __nvrtcCompileProgram == NULL: with gil: raise RuntimeError('Function "nvrtcCompileProgram" not found') + with gil: + print(f"\nLOOOK _bindings/cynvrtc._nvrtcCompileProgram CALLS __nvrtcCompileProgram", flush=True) + print(f"LOOOK *prog={prog:x}", flush=True) + print(f"LOOOK {numOptions=!r}", flush=True) + print(f"LOOOK *options={options:x}", flush=True) err = ( __nvrtcCompileProgram)(prog, numOptions, options) return err {{endif}} diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index a6cbe8ec8e..4a267728e7 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -43,6 +43,8 @@ cdef nvrtcResult nvrtcDestroyProgram(nvrtcProgram* prog) except ?NVRTC_ERROR_INV {{if 'nvrtcCompileProgram' in found_functions}} cdef nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char** options) except ?NVRTC_ERROR_INVALID_INPUT nogil: + with gil: + print(f"\nLOOOK cynvrtc.nvrtcCompileProgram CALLS cynvrtc._nvrtcCompileProgram", flush=True) return cynvrtc._nvrtcCompileProgram(prog, numOptions, options) {{endif}} diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 609b852883..687fe7386d 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -350,6 +350,7 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[Tuple[bytes] | cyprog = pprog if numOptions > len(options): raise RuntimeError("List is too small: " + str(len(options)) + " < " + str(numOptions)) cdef vector[const char*] cyoptions = options + print(f"\nLOOOK nvrtcCompileProgram CALLS cynvrtc.nvrtcCompileProgram", flush=True) err = cynvrtc.nvrtcCompileProgram(cyprog, numOptions, cyoptions.data()) return (_dict_nvrtcResult[err],) {{endif}} From f64edebc2cae404443321b9258007515d8448b2a Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Thu, 12 Jun 2025 11:58:16 -0700 Subject: [PATCH 11/65] Remove stray `"nvrtc64_*_0.alt.dll"` entries in `SUPPORTED_WINDOWS_DLLS` --- cuda_bindings/cuda/bindings/_path_finder/supported_libs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py index cf9d5ae75b..305098dfaa 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py +++ b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py @@ -379,9 +379,7 @@ "nvrtc64_110_0.dll", "nvrtc64_111_0.dll", "nvrtc64_112_0.dll", - "nvrtc64_120_0.alt.dll", "nvrtc64_120_0.dll", - "nvrtc64_130_0.alt.dll", "nvrtc64_130_0.dll", ), "nvvm": ( From 50e41bbff8bf83939aa12318c4f99525011e8727 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 12 Jun 2025 11:58:41 -0700 Subject: [PATCH 12/65] Revert "print *prog pointers in nvrtcCreateProgram, nvrtcCompileProgram bindings" This reverts commit 104abbdb4824dc2ed43565c8cafdf363aefc5f07. --- .../cuda/bindings/_bindings/cynvrtc.pyx.in | 13 ------------- cuda_bindings/cuda/bindings/cynvrtc.pyx.in | 2 -- cuda_bindings/cuda/bindings/nvrtc.pyx.in | 1 - 3 files changed, 16 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 9ad166b4c4..9af9baad26 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -382,14 +382,6 @@ cdef nvrtcResult _nvrtcCreateProgram(nvrtcProgram* prog, const char* src, const if __nvrtcCreateProgram == NULL: with gil: raise RuntimeError('Function "nvrtcCreateProgram" not found') - with gil: - print(f"\nLOOOK _bindings/cynvrtc.nvrtcCreateProgram CALLS __nvrtcCreateProgram", flush=True) - print(f"LOOOK *prog={prog:x}", flush=True) - print(f"LOOOK {src=!r}", flush=True) - print(f"LOOOK {name=!r}", flush=True) - print(f"LOOOK {numHeaders=!r}", flush=True) - print(f"LOOOK *headers={headers:x}", flush=True) - print(f"LOOOK *includeNames={includeNames:x}", flush=True) err = ( __nvrtcCreateProgram)(prog, src, name, numHeaders, headers, includeNames) return err {{endif}} @@ -414,11 +406,6 @@ cdef nvrtcResult _nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const c if __nvrtcCompileProgram == NULL: with gil: raise RuntimeError('Function "nvrtcCompileProgram" not found') - with gil: - print(f"\nLOOOK _bindings/cynvrtc._nvrtcCompileProgram CALLS __nvrtcCompileProgram", flush=True) - print(f"LOOOK *prog={prog:x}", flush=True) - print(f"LOOOK {numOptions=!r}", flush=True) - print(f"LOOOK *options={options:x}", flush=True) err = ( __nvrtcCompileProgram)(prog, numOptions, options) return err {{endif}} diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in index 4a267728e7..a6cbe8ec8e 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx.in @@ -43,8 +43,6 @@ cdef nvrtcResult nvrtcDestroyProgram(nvrtcProgram* prog) except ?NVRTC_ERROR_INV {{if 'nvrtcCompileProgram' in found_functions}} cdef nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char** options) except ?NVRTC_ERROR_INVALID_INPUT nogil: - with gil: - print(f"\nLOOOK cynvrtc.nvrtcCompileProgram CALLS cynvrtc._nvrtcCompileProgram", flush=True) return cynvrtc._nvrtcCompileProgram(prog, numOptions, options) {{endif}} diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in 
b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 687fe7386d..609b852883 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -350,7 +350,6 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[Tuple[bytes] | cyprog = pprog if numOptions > len(options): raise RuntimeError("List is too small: " + str(len(options)) + " < " + str(numOptions)) cdef vector[const char*] cyoptions = options - print(f"\nLOOOK nvrtcCompileProgram CALLS cynvrtc.nvrtcCompileProgram", flush=True) err = cynvrtc.nvrtcCompileProgram(cyprog, numOptions, cyoptions.data()) return (_dict_nvrtcResult[err],) {{endif}} From eaf66f2965f912f6b64593bb419bf5de7c5fec35 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 13 Jun 2025 15:40:29 -0700 Subject: [PATCH 13/65] =?UTF-8?q?=5Ffind=5Flib=5Fdir=5Fusing=5Fcuda=5Fhome?= =?UTF-8?q?():=20Windows=20CTK=2013=20=E2=86=92=20bin\x64?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bindings/_path_finder/find_nvidia_dynamic_library.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py index a513c04b73..d34c1464d6 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py +++ b/cuda_bindings/cuda/bindings/_path_finder/find_nvidia_dynamic_library.py @@ -74,7 +74,10 @@ def _find_lib_dir_using_cuda_home(libname): ("nvvm", "bin"), # CTK 12 ) else: - subdirs_list = (("bin",),) + subdirs_list = ( + ("bin", "x64"), # CTK 13 + ("bin",), # CTK 12 + ) else: if libname == "nvvm": # noqa: SIM108 subdirs_list = (("nvvm", "lib64"),) From ebc9920816bfa3c9f97557dbf7ef3c8db4120f09 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 17 Jun 2025 10:57:28 -0400 Subject: [PATCH 14/65] getLocalRuntimeVersion(): Search for libcudart.so.13 --- cuda_bindings/cuda/bindings/cyruntime.pyx.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in index dc047b2fb5..7e57305f45 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in @@ -1896,16 +1896,16 @@ cdef cudaError_t getLocalRuntimeVersion(int* runtimeVersion) except ?cudaErrorCa raise NotImplementedError('"getLocalRuntimeVersion" is unsupported on Windows') {{else}} # Load - handle = dlfcn.dlopen('libcudart.so.12', dlfcn.RTLD_NOW) + handle = dlfcn.dlopen('libcudart.so.13', dlfcn.RTLD_NOW) if handle == NULL: with gil: - raise RuntimeError(f'Failed to dlopen libcudart.so.12') + raise RuntimeError(f'Failed to dlopen libcudart.so.13') __cudaRuntimeGetVersion = dlfcn.dlsym(handle, 'cudaRuntimeGetVersion') if __cudaRuntimeGetVersion == NULL: with gil: - raise RuntimeError(f'Function "cudaRuntimeGetVersion" not found in libcudart.so.12') + raise RuntimeError(f'Function "cudaRuntimeGetVersion" not found in libcudart.so.13') # Call cdef cudaError_t err = cudaSuccess From d760969336879b56c79004c91f309d26548db23e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 17 Jun 2025 10:57:55 -0400 Subject: [PATCH 15/65] SUPPORTED_LINUX_SONAMES: Add CTK 13 soname values --- cuda_bindings/cuda/bindings/_path_finder/supported_libs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py 
b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py index 305098dfaa..af2f129f7d 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py +++ b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py @@ -119,10 +119,12 @@ "cublas": ( "libcublas.so.11", "libcublas.so.12", + "libcublas.so.13", ), "cublasLt": ( "libcublasLt.so.11", "libcublasLt.so.12", + "libcublasLt.so.13", ), "cudart": ( "libcudart.so.11.0", @@ -145,10 +147,12 @@ "cusolver": ( "libcusolver.so.10", "libcusolver.so.11", + "libcusolver.so.12", ), "cusolverMg": ( "libcusolverMg.so.10", "libcusolverMg.so.11", + "libcusolverMg.so.12", ), "cusparse": ( "libcusparse.so.11", @@ -216,6 +220,7 @@ "nvblas": ( "libnvblas.so.11", "libnvblas.so.12", + "libnvblas.so.13", ), "nvfatbin": ( "libnvfatbin.so.12", From 7f8fa90ac215ad04551c679ffba35f9db1a41ae3 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 18 Jun 2025 08:02:22 -0700 Subject: [PATCH 16/65] Update path_finder/supported_libs.py from kitpicks 13.0.0/025 (#96) * Linux update from cuda_13.0.0_580.46_kitpicks025_linux.run: no-op b/o NVIDIA/cuda-python-private#95 * Windows update from cuda_13.0.0_kitpicks025_windows.exe --- .../cuda/bindings/_path_finder/supported_libs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py index af2f129f7d..8c0eb7dc70 100644 --- a/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py +++ b/cuda_bindings/cuda/bindings/_path_finder/supported_libs.py @@ -112,7 +112,7 @@ # cuda_12.6.2_560.35.03_linux.run # cuda_12.8.0_570.86.10_linux.run # cuda_12.9.0_575.51.03_linux.run -# 014 +# 025 # TODO: Update from posted .run files before merging into public main. # Generated with toolshed/build_path_finder_sonames.py SUPPORTED_LINUX_SONAMES = { @@ -263,17 +263,19 @@ # cuda_12.6.2_560.94_windows.exe # cuda_12.8.1_572.61_windows.exe # cuda_12.9.0_576.02_windows.txt -# 014 +# 025 # TODO: Update from posted .run files before merging into public main. # Generated with toolshed/build_path_finder_dlls.py (WITH MANUAL EDITS) SUPPORTED_WINDOWS_DLLS = { "cublas": ( "cublas64_11.dll", "cublas64_12.dll", + "cublas64_13.dll", ), "cublasLt": ( "cublasLt64_11.dll", "cublasLt64_12.dll", + "cublasLt64_13.dll", ), "cudart": ( "cudart32_110.dll", @@ -299,10 +301,12 @@ "cusolver": ( "cusolver64_10.dll", "cusolver64_11.dll", + "cusolver64_12.dll", ), "cusolverMg": ( "cusolverMg64_10.dll", "cusolverMg64_11.dll", + "cusolverMg64_12.dll", ), "cusparse": ( "cusparse64_11.dll", @@ -370,6 +374,7 @@ "nvblas": ( "nvblas64_11.dll", "nvblas64_12.dll", + "nvblas64_13.dll", ), "nvfatbin": ( "nvfatbin_120_0.dll", From 3bec7e3f8ab13310e710dc54bcbd123de4fed502 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 21 Jun 2025 11:48:40 -0700 Subject: [PATCH 17/65] This trivial change should have been included in PR #81, but was overlooked. Direct commit for simplicity. 
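For orientation on the libcudart changes above (PATCH 14 and the soname tables): the dlopen/dlsym sequence that getLocalRuntimeVersion performs is roughly equivalent to the following ctypes sketch. The fallback to the CTK 12 soname here is illustrative only; the binding itself now loads libcudart.so.13 exclusively.

import ctypes

def local_cudart_version():
    # Prefer the CTK 13 soname, as the updated binding does (Linux only).
    for soname in ("libcudart.so.13", "libcudart.so.12"):
        try:
            lib = ctypes.CDLL(soname)
        except OSError:
            continue
        version = ctypes.c_int(0)
        # cudaRuntimeGetVersion(int*) returns cudaSuccess (0) on success.
        if lib.cudaRuntimeGetVersion(ctypes.byref(version)) == 0:
            return version.value  # e.g. 13000 for CUDA 13.0
    raise RuntimeError("no usable libcudart found")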
--- cuda_core/tests/test_cuda_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index 5f94e545fe..713c6469dc 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -18,7 +18,7 @@ def test_driver_cu_result_explanations_health(): assert code in expl_dict known_codes.add(code) - if cuda_utils.get_binding_version() >= (12, 0): + if cuda_utils.get_binding_version() >= (13, 0): # Ensure expl_dict has no codes not known as a CUresult enum extra_expl = sorted(set(expl_dict.keys()) - known_codes) assert not extra_expl @@ -34,7 +34,7 @@ def test_runtime_cuda_error_explanations_health(): assert code in expl_dict known_codes.add(code) - if cuda_utils.get_binding_version() >= (12, 0): + if cuda_utils.get_binding_version() >= (13, 0): # Ensure expl_dict has no codes not known as a cudaError_t enum extra_expl = sorted(set(expl_dict.keys()) - known_codes) assert not extra_expl From 669fbc1be57dc52fcace1a12a82abad19cd42c45 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 23 Jun 2025 21:26:52 -0700 Subject: [PATCH 18/65] cuda_core forward compatibility changes (private development branch) (#94) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CCCL_INCLUDE_PATH fixes in test_event.py, test_launcher.py * Add new file (accidentally missing in a prior commit). * Fix pre-commit errors in new tests/helpers.py * 12→13 compatibility fixes in cuda/core/experimental/_graph.py * CTK 12 compatibility (tests/test_cuda_utils.py) * Make the cuda/core/experimental/_graph.py changes backwards compatible. * Do not try to hide `13` in cuda_core/tests/test_cuda_utils.py * More elegant handling of `CCCL_INCLUDE_PATHS` in cuda_core/tests/helpers.py * Remove stray empty line (cuda_core/tests/conftest.py). 
* Fix logic error computing CCCL_INCLUDE_PATHS in cuda_core/tests/helpers.py --- cuda_core/cuda/core/experimental/_graph.py | 26 ++++++++++++++-------- cuda_core/tests/conftest.py | 5 ++--- cuda_core/tests/helpers.py | 16 +++++++++++++ cuda_core/tests/test_event.py | 4 ++-- cuda_core/tests/test_launcher.py | 15 +++++-------- 5 files changed, 42 insertions(+), 24 deletions(-) create mode 100644 cuda_core/tests/helpers.py diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/experimental/_graph.py index 1acf38176f..894b41047a 100644 --- a/cuda_core/cuda/core/experimental/_graph.py +++ b/cuda_core/cuda/core/experimental/_graph.py @@ -476,7 +476,7 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi default_value = 0 flags = 0 - status, _, graph, _, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)) + status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)) if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot create a conditional handle when graph is not being built") @@ -486,20 +486,22 @@ def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditi def _cond_with_params(self, node_params) -> GraphBuilder: # Get current capture info to ensure we're in a valid state - status, _, graph, dependencies, num_dependencies = handle_return( + status, _, graph, *deps_info, num_dependencies = handle_return( driver.cuStreamGetCaptureInfo(self._mnff.stream.handle) ) if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot add conditional node when not actively capturing") # Add the conditional node to the graph - node = handle_return(driver.cuGraphAddNode(graph, dependencies, num_dependencies, node_params)) + deps_info_update = [ + [handle_return(driver.cuGraphAddNode(graph, *deps_info, num_dependencies, node_params))] + ] + [None] * (len(deps_info) - 1) # Update the stream's capture dependencies handle_return( driver.cuStreamUpdateCaptureDependencies( self._mnff.stream.handle, - [node], # dependencies + *deps_info_update, # dependencies, edgeData 1, # numDependencies driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES, ) @@ -677,17 +679,23 @@ def add_child(self, child_graph: GraphBuilder): raise ValueError("Parent graph is not being built.") stream_handle = self._mnff.stream.handle - _, _, graph_out, dependencies_out, num_dependencies_out = handle_return( + _, _, graph_out, *deps_info_out, num_dependencies_out = handle_return( driver.cuStreamGetCaptureInfo(stream_handle) ) - child_node = handle_return( - driver.cuGraphAddChildGraphNode(graph_out, dependencies_out, num_dependencies_out, child_graph._mnff.graph) - ) + deps_info_update = [ + [ + handle_return( + driver.cuGraphAddChildGraphNode( + graph_out, deps_info_out[0], num_dependencies_out, child_graph._mnff.graph + ) + ) + ] + ] + [None] * (len(deps_info_out) - 1) handle_return( driver.cuStreamUpdateCaptureDependencies( stream_handle, - [child_node], + *deps_info_update, # dependencies, edgeData 1, driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES, ) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 34c87e1852..77eab1a59a 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -1,7 +1,7 @@ # Copyright 2024 NVIDIA Corporation. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import os +import helpers try: from cuda.bindings import driver @@ -65,5 +65,4 @@ def pop_all_contexts(): return pop_all_contexts -# TODO: make the fixture more sophisticated using path finder -skipif_need_cuda_headers = pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need CUDA header") +skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header") diff --git a/cuda_core/tests/helpers.py b/cuda_core/tests/helpers.py new file mode 100644 index 0000000000..3cc6e48461 --- /dev/null +++ b/cuda_core/tests/helpers.py @@ -0,0 +1,16 @@ +# Copyright 2025 NVIDIA Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import os + +CUDA_PATH = os.environ.get("CUDA_PATH") +CUDA_INCLUDE_PATH = None +CCCL_INCLUDE_PATHS = None +if CUDA_PATH is not None: + path = os.path.join(CUDA_PATH, "include") + if os.path.isdir(path): + CUDA_INCLUDE_PATH = path + CCCL_INCLUDE_PATHS = (path,) + path = os.path.join(path, "cccl") + if os.path.isdir(path): + CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 9cee936d8d..ed721d2dfd 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -import pathlib import time +import helpers import numpy as np import pytest from conftest import skipif_need_cuda_headers @@ -144,7 +144,7 @@ def test_error_timing_incomplete(): program_options = ProgramOptions( std="c++17", arch=f"sm_{arch}", - include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")), + include_path=helpers.CCCL_INCLUDE_PATHS, ) prog = Program(code, code_type="c++", options=program_options) mod = prog.compile(target_type="cubin") diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 3a02065de8..d31be9f336 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -2,9 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import ctypes -import os -import pathlib +import helpers import numpy as np import pytest from conftest import skipif_need_cuda_headers @@ -94,7 +93,7 @@ def test_launch_invalid_values(init_cuda): (ctypes.c_float, "float", 3.14), (ctypes.c_double, "double", 2.718), ) -if os.environ.get("CUDA_PATH"): +if helpers.CCCL_INCLUDE_PATHS is not None: PARAMS += ( (np.float16, "half", 0.78), (np.complex64, "cuda::std::complex", 1 + 2j), @@ -128,8 +127,7 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): # Compile and force instantiation for this type arch = "".join(f"{i}" for i in dev.compute_capability) - if os.environ.get("CUDA_PATH"): - include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")) + if helpers.CCCL_INCLUDE_PATHS is not None: code = ( r""" #include @@ -137,9 +135,7 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value): """ + code ) - else: - include_path = None - pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path) + pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS) prog = Program(code, code_type="c++", options=pro_opts) ker_name = f"write_scalar<{cpp_type}>" mod = prog.compile("cubin", name_expressions=(ker_name,)) @@ -173,8 +169,7 @@ def test_cooperative_launch(): # Compile and force instantiation for this type arch = "".join(f"{i}" for i in dev.compute_capability) - include_path = 
str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")) - pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path) + pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=helpers.CCCL_INCLUDE_PATHS) prog = Program(code, code_type="c++", options=pro_opts) ker = prog.compile("cubin").get_kernel("test_grid_sync") From b6fd97fad0278cb395f45ab6137f4251508c3d46 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 26 Jun 2025 16:11:55 -0700 Subject: [PATCH 19/65] Fix `cuda_bindings` and `cuda_core` examples (#98) * Unmask globalToShmemAsyncCopy_test.py error: explicit pytest_skipif_cuda_include_not_found(), pytest_skipif_compute_capability_too_low() * Update cuda_bindings/examples/common/common.py for CTK 13 compatibility, to fix globalToShmemAsyncCopy_test.py * Update cuda_core/examples/thread_block_cluster.py for CTK 13 compatibility. --- .../globalToShmemAsyncCopy_test.py | 25 +------- cuda_bindings/examples/common/common.py | 63 +++++++++++++++---- cuda_core/examples/thread_block_cluster.py | 9 ++- 3 files changed, 60 insertions(+), 37 deletions(-) diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index df4443c075..8c94feb4a4 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -8,7 +8,6 @@ from enum import Enum import numpy as np -import pytest from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt @@ -1115,28 +1114,10 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): return -1 -def checkKernelCompiles(): - kernel_headers = """\ - #line __LINE__ - #if __CUDA_ARCH__ >= 700 - #include - #endif - #include - #include - #include - """ - try: - common.KernelHelper(kernel_headers, findCudaDevice()) - except: - # Filters out test from automation for two reasons - # 1. Headers are not found - # 2. 
Incompatible device - return False - return True - - -@pytest.mark.skipif(not checkKernelCompiles(), reason="Automation filter against incompatible kernel") def main(): + common.pytest_skipif_cuda_include_not_found() + common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) + print("[globalToShmemAsyncCopy] - Starting...") if platform.machine() == "qnx": diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 34150d8f68..635493e88a 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -11,15 +11,50 @@ from cuda.bindings import runtime as cudart +def get_cuda_home(): + cuda_home = os.getenv("CUDA_HOME") + if cuda_home is None: + cuda_home = os.getenv("CUDA_PATH") + return cuda_home + + +def pytest_skipif_cuda_include_not_found(): + import pytest + + cuda_home = get_cuda_home() + if cuda_home is None: + pytest.skip("CUDA_HOME/CUDA_PATH not set") + cuda_include = os.path.join(cuda_home, "include") + if not os.path.exists(cuda_include): + pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'") + + +def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): + import pytest + + cc_major = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + ) + cc_minor = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + ) + have_cc_major_minor = (cc_major, cc_minor) + if have_cc_major_minor < required_cc_major_minor: + pytest.skip(f"cudaDevAttrComputeCapability too low: {have_cc_major_minor=!r}, {required_cc_major_minor=!r}") + + class KernelHelper: def __init__(self, code, devID): prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) - CUDA_HOME = os.getenv("CUDA_HOME") - if CUDA_HOME is None: - CUDA_HOME = os.getenv("CUDA_PATH") - if CUDA_HOME is None: - raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set") - include_dirs = os.path.join(CUDA_HOME, "include") + + cuda_home = get_cuda_home() + assert cuda_home is not None + cuda_include = os.path.join(cuda_home, "include") + assert os.path.isdir(cuda_include) + include_dirs = [cuda_include] + cccl_include = os.path.join(cuda_include, "cccl") + if os.path.isdir(cccl_include): + include_dirs.insert(0, cccl_include) # Initialize CUDA checkCudaErrors(cudart.cudaFree(0)) @@ -35,14 +70,16 @@ def __init__(self, code, devID): prefix = "sm" if use_cubin else "compute" arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") + opts = [ + b"--fmad=true", + arch_arg, + b"--std=c++17", + b"-default-device", + ] + for inc_dir in include_dirs: + opts.append(f"--include-path={inc_dir}".encode()) + try: - opts = [ - b"--fmad=true", - arch_arg, - f"--include-path={include_dirs}".encode(), - b"--std=c++11", - b"-default-device", - ] checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) except RuntimeError as err: logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e761fbb47d..f8969f8974 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -12,7 +12,12 @@ if cuda_path is None: print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) sys.exit(0) -cuda_include_path = os.path.join(cuda_path, "include") +cuda_include = 
os.path.join(cuda_path, "include") +assert os.path.isdir(cuda_include) +include_path = [cuda_include] +cccl_include = os.path.join(cuda_include, "cccl") +if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) # print cluster info using a kernel code = r""" @@ -47,7 +52,7 @@ prog = Program( code, code_type="c++", - options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path), + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") From 6d9167d8d0631ff6fa8d4beb939f72e931879084 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 30 Jun 2025 15:20:56 -0700 Subject: [PATCH 20/65] Update driver_cu_result_explanations.py, runtime_cuda_error_explanations.py (#100) --- .../core/experimental/_utils/driver_cu_result_explanations.py | 1 - .../core/experimental/_utils/runtime_cuda_error_explanations.py | 1 - 2 files changed, 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py index 676a63965d..5f52cd86f1 100644 --- a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py @@ -125,7 +125,6 @@ " work will return the same error. To continue using CUDA, the process must" " be terminated and relaunched." ), - 227: ("This indicates that an NVLink encryption error was detected during the execution."), 300: ( "This indicates that the device kernel source is invalid. This includes" " compilation/linker errors encountered in device code or user error." diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py index 2bde9b22c1..09740e8a76 100644 --- a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py @@ -309,7 +309,6 @@ " work will return the same error. To continue using CUDA, the process must" " be terminated and relaunched." ), - 227: ("This indicates that an NVLink encryption error was detected during the execution."), 300: "This indicates that the device kernel source is invalid.", 301: "This indicates that the file specified was not found.", 302: "This indicates that a link to a shared object failed to resolve.", From c83ee83bfb239507411dff9c69c9e6c22b6a7191 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Tue, 8 Jul 2025 13:39:44 -0700 Subject: [PATCH 21/65] kitpicks/cuda-r13-0/13.0.0/033: CUDA_HOME=/usr/local/cuda python cython_gen.py --target-lib driver runtime nvrtc --out ../unreleased-13.0 (#107) --- cuda_bindings/cuda/bindings/cydriver.pxd.in | 4 - .../cuda/bindings/cyruntime_types.pxi.in | 15 ---- cuda_bindings/cuda/bindings/driver.pyx.in | 90 +++++++++---------- cuda_bindings/cuda/bindings/runtime.pyx.in | 35 +------- cuda_bindings/docs/source/module/driver.rst | 24 ----- cuda_bindings/docs/source/module/nvrtc.rst | 2 +- cuda_bindings/docs/source/module/runtime.rst | 6 -- 7 files changed, 45 insertions(+), 131 deletions(-) diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index 52664211ce..c98fb9980d 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -746,20 +746,17 @@ cdef extern from "cuda.h": CU_TARGET_COMPUTE_89 = 89 CU_TARGET_COMPUTE_90 = 90 CU_TARGET_COMPUTE_100 = 100 - CU_TARGET_COMPUTE_101 = 101 CU_TARGET_COMPUTE_103 = 103 CU_TARGET_COMPUTE_110 = 110 CU_TARGET_COMPUTE_120 = 120 CU_TARGET_COMPUTE_121 = 121 CU_TARGET_COMPUTE_90A = 65626 CU_TARGET_COMPUTE_100A = 65636 - CU_TARGET_COMPUTE_101A = 65637 CU_TARGET_COMPUTE_103A = 65639 CU_TARGET_COMPUTE_110A = 65646 CU_TARGET_COMPUTE_120A = 65656 CU_TARGET_COMPUTE_121A = 65657 CU_TARGET_COMPUTE_100F = 131172 - CU_TARGET_COMPUTE_101F = 131173 CU_TARGET_COMPUTE_103F = 131175 CU_TARGET_COMPUTE_110F = 131182 CU_TARGET_COMPUTE_120F = 131192 @@ -1262,7 +1259,6 @@ cdef extern from "cuda.h": CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224 CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225 CUDA_ERROR_CONTAINED = 226 - CUDA_ERROR_NVLINK_ENCRYPTION_FAILED = 227 CUDA_ERROR_INVALID_SOURCE = 300 CUDA_ERROR_FILE_NOT_FOUND = 301 CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in index 6ed0a2be92..05dc40b823 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in @@ -86,7 +86,6 @@ cdef extern from "driver_types.h": cudaErrorUnsupportedExecAffinity = 224 cudaErrorUnsupportedDevSideSync = 225 cudaErrorContained = 226 - cudaErrorNvlinkEncryptionFailed = 227 cudaErrorInvalidSource = 300 cudaErrorFileNotFound = 301 cudaErrorSharedObjectSymbolNotFound = 302 @@ -1592,20 +1591,6 @@ cdef extern from "library_types.h": ctypedef cudaEmulationStrategy_t cudaEmulationStrategy - cdef enum cudaEmulationMantissaControl_t: - CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC = 0 - CUDA_EMULATION_MANTISSA_CONTROL_FIXED = 1 - - ctypedef cudaEmulationMantissaControl_t cudaEmulationMantissaControl - - cdef enum cudaEmulationSpecialValuesSupport_t: - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE = 0 - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY = 1 - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN = 2 - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT = 65535 - - ctypedef cudaEmulationSpecialValuesSupport_t cudaEmulationSpecialValuesSupport - cdef enum libraryPropertyType_t: MAJOR_VERSION = 0 MINOR_VERSION = 1 diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index b4761bc413..60d76e1e86 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -2517,10 +2517,6 @@ class CUjit_target(IntEnum): #: Compute device class 10.0. 
CU_TARGET_COMPUTE_100 = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100{{endif}} - {{if 'CU_TARGET_COMPUTE_101' in found_values}} - - #: Compute device class 10.1. - CU_TARGET_COMPUTE_101 = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_101{{endif}} {{if 'CU_TARGET_COMPUTE_103' in found_values}} #: Compute device class 10.3. @@ -2544,12 +2540,8 @@ class CUjit_target(IntEnum): CU_TARGET_COMPUTE_90A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_90A{{endif}} {{if 'CU_TARGET_COMPUTE_100A' in found_values}} - #: Compute device class 10.1 with accelerated features. - CU_TARGET_COMPUTE_100A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100A{{endif}} - {{if 'CU_TARGET_COMPUTE_101A' in found_values}} - #: Compute device class 11.0 with accelerated features. - CU_TARGET_COMPUTE_101A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_101A{{endif}} + CU_TARGET_COMPUTE_100A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100A{{endif}} {{if 'CU_TARGET_COMPUTE_103A' in found_values}} #: Compute device class 12.0. with accelerated features. @@ -2568,12 +2560,8 @@ class CUjit_target(IntEnum): CU_TARGET_COMPUTE_121A = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_121A{{endif}} {{if 'CU_TARGET_COMPUTE_100F' in found_values}} - #: Compute device class 10.1 with family features. - CU_TARGET_COMPUTE_100F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100F{{endif}} - {{if 'CU_TARGET_COMPUTE_101F' in found_values}} - #: Compute device class 11.0 with family features. - CU_TARGET_COMPUTE_101F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_101F{{endif}} + CU_TARGET_COMPUTE_100F = cydriver.CUjit_target_enum.CU_TARGET_COMPUTE_100F{{endif}} {{if 'CU_TARGET_COMPUTE_103F' in found_values}} #: Compute device class 12.0. with family features. @@ -3665,11 +3653,6 @@ class CUresult(IntEnum): #: same error. To continue using CUDA, the process must be terminated #: and relaunched. CUDA_ERROR_CONTAINED = cydriver.cudaError_enum.CUDA_ERROR_CONTAINED{{endif}} - {{if 'CUDA_ERROR_NVLINK_ENCRYPTION_FAILED' in found_values}} - - #: This indicates that an NVLink encryption error was detected during - #: the execution. - CUDA_ERROR_NVLINK_ENCRYPTION_FAILED = cydriver.cudaError_enum.CUDA_ERROR_NVLINK_ENCRYPTION_FAILED{{endif}} {{if 'CUDA_ERROR_INVALID_SOURCE' in found_values}} #: This indicates that the device kernel source is invalid. This @@ -33717,7 +33700,7 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None @cython.embedsignature(True) def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUmemAllocationType): - """ Sets the current memory pool for a memory location and allocation type. + """ Gets the current memory pool for a memory location and of a particular allocation type. The memory location can be of one of :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, @@ -33728,19 +33711,15 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred location for the managed memory pool. In all other cases, the call - returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. + returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` - When a memory pool is set as the current memory pool, the location - parameter should be the same as the location of the pool. The location - and allocation type specified must match those of the pool otherwise - :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. 
By default, a memory - location's current memory pool is its default memory pool that can be - obtained via :py:obj:`~.cuMemGetDefaultMemPool`. If the location type - is :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` and the allocation type is - :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`, then this API is the - equivalent of calling :py:obj:`~.cuDeviceSetMemPool` with the location - id as the device. For further details on the implications, please refer - to the documentation for :py:obj:`~.cuDeviceSetMemPool`. + Returns the last pool provided to :py:obj:`~.cuMemSetMemPool` or + :py:obj:`~.cuDeviceSetMemPool` for this location and allocation type or + the location's default memory pool if :py:obj:`~.cuMemSetMemPool` or + :py:obj:`~.cuDeviceSetMemPool` for that allocType and location has + never been called. By default the current mempool of a location is the + default mempool for a device. Otherwise the returned pool must have + been set with :py:obj:`~.cuDeviceSetMemPool`. Parameters ---------- @@ -33758,11 +33737,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme See Also -------- - :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuMemPoolDestroy`, :py:obj:`~.cuMemAllocFromPoolAsync` - - Notes - ----- - Use :py:obj:`~.cuMemAllocFromPoolAsync` to specify asynchronous allocations from a device different than the one the stream runs on. + :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` """ cdef CUmemoryPool pool = CUmemoryPool() cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL @@ -33777,7 +33752,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme @cython.embedsignature(True) def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUmemAllocationType, pool): - """ Gets the current memory pool for a memory location and of a particular allocation type. + """ Sets the current memory pool for a memory location and allocation type. The memory location can be of one of :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, @@ -33788,15 +33763,19 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`, the location type can also be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred location for the managed memory pool. In all other cases, the call - returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` + returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. - Returns the last pool provided to :py:obj:`~.cuMemSetMemPool` or - :py:obj:`~.cuDeviceSetMemPool` for this location and allocation type or - the location's default memory pool if :py:obj:`~.cuMemSetMemPool` or - :py:obj:`~.cuDeviceSetMemPool` for that allocType and location has - never been called. By default the current mempool of a location is the - default mempool for a device. Otherwise the returned pool must have - been set with :py:obj:`~.cuDeviceSetMemPool`. + When a memory pool is set as the current memory pool, the location + parameter should be the same as the location of the pool. The location + and allocation type specified must match those of the pool otherwise + :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. By default, a memory + location's current memory pool is its default memory pool that can be + obtained via :py:obj:`~.cuMemGetDefaultMemPool`. 
If the location type + is :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` and the allocation type is + :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`, then this API is the + equivalent of calling :py:obj:`~.cuDeviceSetMemPool` with the location + id as the device. For further details on the implications, please refer + to the documentation for :py:obj:`~.cuDeviceSetMemPool`. Parameters ---------- @@ -33814,7 +33793,11 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme See Also -------- - :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` + :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuMemPoolDestroy`, :py:obj:`~.cuMemAllocFromPoolAsync` + + Notes + ----- + Use :py:obj:`~.cuMemAllocFromPoolAsync` to specify asynchronous allocations from a device different than the one the stream runs on. """ cdef cydriver.CUmemoryPool cypool if pool is None: @@ -49468,7 +49451,9 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags): 11.2 should be specified as 11020. For a requested driver symbol, if the specified CUDA version is greater than or equal to the CUDA version in which the driver symbol was introduced, this API will return the - function pointer to the corresponding versioned function. + function pointer to the corresponding versioned function. If the + specified CUDA version is greater than the driver version, the API will + return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. The pointer returned by the API should be cast to a function pointer matching the requested driver function's definition in the API header @@ -50147,7 +50132,14 @@ def cuGreenCtxDestroy(hCtx): Destroys the green context, releasing the primary context of the device that this green context was created for. Any resources provisioned for this green context (that were initially available via the resource - descriptor) are released as well. + descriptor) are released as well. The API does not destroy streams + created via :py:obj:`~.cuGreenCtxStreamCreate`, + :py:obj:`~.cuStreamCreate`, or :py:obj:`~.cuStreamCreateWithPriority`. + Once the green context is destroyed, any subsequent API calls involving + these streams (including :py:obj:`~.cuStreamDestroy`) will return + :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`. Users must explicitly + destroy all such streams before invoking :py:obj:`~.cuGreenCtxDestroy`. + Failure to do so will result in a memory leak. Parameters ---------- diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 026df7dcd0..cc4123493c 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -771,11 +771,6 @@ class cudaError_t(IntEnum): #: same error. To continue using CUDA, the process must be terminated #: and relaunched. cudaErrorContained = cyruntime.cudaError.cudaErrorContained{{endif}} - {{if 'cudaErrorNvlinkEncryptionFailed' in found_values}} - - #: This indicates that an NVLink encryption error was detected during - #: the execution. - cudaErrorNvlinkEncryptionFailed = cyruntime.cudaError.cudaErrorNvlinkEncryptionFailed{{endif}} {{if 'cudaErrorInvalidSource' in found_values}} #: This indicates that the device kernel source is invalid. 
@@ -1579,32 +1574,6 @@ class cudaEmulationStrategy(IntEnum): _dict_cudaEmulationStrategy = dict(((int(v), v) for k, v in cudaEmulationStrategy.__members__.items())) {{endif}} -{{if 'cudaEmulationMantissaControl_t' in found_types}} - -class cudaEmulationMantissaControl(IntEnum): - """""" - {{if 'CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC' in found_values}} - CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC = cyruntime.cudaEmulationMantissaControl_t.CUDA_EMULATION_MANTISSA_CONTROL_DYNAMIC{{endif}} - {{if 'CUDA_EMULATION_MANTISSA_CONTROL_FIXED' in found_values}} - CUDA_EMULATION_MANTISSA_CONTROL_FIXED = cyruntime.cudaEmulationMantissaControl_t.CUDA_EMULATION_MANTISSA_CONTROL_FIXED{{endif}} - -_dict_cudaEmulationMantissaControl = dict(((int(v), v) for k, v in cudaEmulationMantissaControl.__members__.items())) -{{endif}} -{{if 'cudaEmulationSpecialValuesSupport_t' in found_types}} - -class cudaEmulationSpecialValuesSupport(IntEnum): - """""" - {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE' in found_values}} - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NONE{{endif}} - {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY' in found_values}} - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_INFINITY{{endif}} - {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN' in found_values}} - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_NAN{{endif}} - {{if 'CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT' in found_values}} - CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT = cyruntime.cudaEmulationSpecialValuesSupport_t.CUDA_EMULATION_SPECIAL_VALUES_SUPPORT_DEFAULT{{endif}} - -_dict_cudaEmulationSpecialValuesSupport = dict(((int(v), v) for k, v in cudaEmulationSpecialValuesSupport.__members__.items())) -{{endif}} {{if 'libraryPropertyType_t' in found_types}} class libraryPropertyType(IntEnum): @@ -35127,7 +35096,9 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns 11.2 should be specified as 11020. For a requested driver symbol, if the specified CUDA version is greater than or equal to the CUDA version in which the driver symbol was introduced, this API will return the - function pointer to the corresponding versioned function. + function pointer to the corresponding versioned function. If the + specified CUDA version is greater than the driver version, the API will + return :py:obj:`~.cudaErrorInvalidValue`. The pointer returned by the API should be cast to a function pointer matching the requested driver function's definition in the API header diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index d1ec48bb6e..d2878e1c1d 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -2638,12 +2638,6 @@ Data types used by CUDA driver Compute device class 10.0. - .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_101 - - - Compute device class 10.1. - - .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_110 @@ -2677,12 +2671,6 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_100A - Compute device class 10.1 with accelerated features. - - - .. 
autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_101A - - Compute device class 11.0 with accelerated features. @@ -2713,12 +2701,6 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_100F - Compute device class 10.1 with family features. - - .. autoattribute:: cuda.bindings.driver.CUjit_target.CU_TARGET_COMPUTE_101F - - Compute device class 11.0 with family features. @@ -3691,12 +3673,6 @@ Data types used by CUDA driver This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_NVLINK_ENCRYPTION_FAILED - - - This indicates that an NVLink encryption error was detected during the execution. - - .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_INVALID_SOURCE
diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst
index 220246d854..0133efae50 100644
--- a/cuda_bindings/docs/source/module/nvrtc.rst
+++ b/cuda_bindings/docs/source/module/nvrtc.rst
@@ -111,7 +111,7 @@ NVRTC defines the following function related to PCH. Also see PCH related flags
 Supported Compile Options
 -------------------------
-NVRTC supports the compile options below. Option names with two preceding dashs (``--``\ ) are long option names and option names with one preceding dash (``-``\ ) are short option names. Short option names can be used instead of long option names. When a compile option takes an argument, an assignment operator (``=``\ ) is used to separate the compile option argument from the compile option name, e.g., ``"--gpu-architecture=compute_60"``\ . Alternatively, the compile option name and the argument can be specified in separate strings without an assignment operator, .e.g, ``"--gpu-architecture"``\ ``"compute_60"``\ . Single-character short option names, such as ``-D``\ , ``-U``\ , and ``-I``\ , do not require an assignment operator, and the compile option name and the argument can be present in the same string with or without spaces between them. For instance, ``"-D="``\ , ``"-D"``\ , and ``"-D "``\ are all supported.
+NVRTC supports the compile options below. Option names with two preceding dashes (``--``\ ) are long option names and option names with one preceding dash (``-``\ ) are short option names. Short option names can be used instead of long option names. When a compile option takes an argument, an assignment operator (``=``\ ) is used to separate the compile option argument from the compile option name, e.g., ``"--gpu-architecture=compute_100"``\ . Alternatively, the compile option name and the argument can be specified in separate strings without an assignment operator, e.g., ``"--gpu-architecture"``\ ``"compute_100"``\ . Single-character short option names, such as ``-D``\ , ``-U``\ , and ``-I``\ , do not require an assignment operator, and the compile option name and the argument can be present in the same string with or without spaces between them. For instance, ``"-D="``\ , ``"-D"``\ , and ``"-D "``\ are all supported.
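To see the option spellings from the paragraph above in use through the bindings, a compile call might look like this (a sketch; compute_100 assumes an NVRTC new enough for Blackwell targets, MY_MACRO is a made-up define, and error handling is elided):

from cuda.bindings import nvrtc

err, prog = nvrtc.nvrtcCreateProgram(b'extern "C" __global__ void k() {}', b"k.cu", 0, None, None)
assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS

opts = [
    b"--gpu-architecture=compute_100",  # long option name, '=' separator
    b"-DMY_MACRO=1",                    # single-character short option, no separator needed
]
err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS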
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index 653d6eba98..bc3321d511 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -1993,12 +1993,6 @@ Data types used by CUDA Runtime This indicates that an exception occurred on the device that is now contained by the GPU's error containment capability. Common causes are - a. Certain types of invalid accesses of peer GPU memory over nvlink b. Certain classes of hardware errors This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorNvlinkEncryptionFailed - - - This indicates that an NVLink encryption error was detected during the execution. - - .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidSource From 303110065b84d9fbb423028f75be28b80a9fa395 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 16 Jul 2025 09:28:11 -0700 Subject: [PATCH 22/65] Update cuda_pathfinder supported_nvidia_libs.py from kitpicks 13.0.0/036 (NO CHANGES compared to 025) --- .../cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py index 122fd2cea4..e381ef6636 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py @@ -103,7 +103,7 @@ # cuda_12.6.2_560.35.03_linux.run # cuda_12.8.0_570.86.10_linux.run # cuda_12.9.0_575.51.03_linux.run -# 025 +# 036 # TODO: Update from posted .run files before merging into public main. # Generated with toolshed/build_pathfinder_sonames.py SUPPORTED_LINUX_SONAMES = { @@ -254,8 +254,8 @@ # cuda_12.6.2_560.94_windows.exe # cuda_12.8.1_572.61_windows.exe # cuda_12.9.0_576.02_windows.txt -# 025 -# TODO: Update from posted .run files before merging into public main. +# 036 +# TODO: Update from posted .exe files before merging into public main. # Generated with toolshed/build_pathfinder_dlls.py (WITH MANUAL EDITS) SUPPORTED_WINDOWS_DLLS = { "cublas": ( From 16c86bd247d19f48afbfe5e3cad20599ca1601cd Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 16 Jul 2025 09:38:17 -0700 Subject: [PATCH 23/65] Update driver_cu_result_explanations.py, runtime_cuda_error_explanations.py from kitpicks 13.0.0/036 (NO CHANGES compared to 025) --- .../core/experimental/_utils/driver_cu_result_explanations.py | 2 ++ .../core/experimental/_utils/runtime_cuda_error_explanations.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py index c961e82ac5..86e9f87231 100644 --- a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py @@ -8,6 +8,8 @@ # ruff: noqa: E501 # CUDA Toolkit v13.0.0 +# 036 +# TODO: Update from posted .run files before merging into public main. DRIVER_CU_RESULT_EXPLANATIONS = { 0: ( "The API call returned with no errors. 
In the case of query calls, this" diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py index 126897f2b5..0195e4af45 100644 --- a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py @@ -8,6 +8,8 @@ # ruff: noqa: E501 # CUDA Toolkit v13.0.0 +# 036 +# TODO: Update from posted .run files before merging into public main. RUNTIME_CUDA_ERROR_EXPLANATIONS = { 0: ( "The API call returned with no errors. In the case of query calls, this" From bbb3585c81a3e80677efb8921fc72add1513ca44 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 17 Jul 2025 15:33:18 -0700 Subject: [PATCH 24/65] Update cuda_pathfinder supported_nvidia_libs.py EXPECTED_LIB_SYMBOLS for libnpp* The newly chosen symbols appear in all CTK 12.x releases and 13.0.0: https://gitlab-master.nvidia.com/rgrossekunst/rwgk_config_nvidia/-/blob/a1c2f29decd9b93fc7af9611bdc60565446b0cd3/bin/check_libnpp_symbols.sh --- .../_dynamic_libs/supported_nvidia_libs.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py index e381ef6636..3ea0ce8122 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py @@ -405,10 +405,10 @@ def is_suppressed_dll_file(path_basename: str) -> bool: return path_basename.startswith(("cudart32_", "nvvm32")) -# Based on nm output for Linux x86_64 /usr/local/cuda (12.8.1) +# Based on `nm -D --defined-only` output for Linux x86_64 distributions. EXPECTED_LIB_SYMBOLS = { "nvJitLink": ( - "__nvJitLinkCreate_12_0", # 12.0 through 12.8 (at least) + "__nvJitLinkCreate_12_0", # 12.0 through 12.9 "nvJitLinkVersion", # 12.3 and up ), "nvrtc": ("nvrtcVersion",), @@ -424,16 +424,16 @@ def is_suppressed_dll_file(path_basename: str) -> bool: "cusolverMg": ("cusolverMgCreate",), "cusparse": ("cusparseGetVersion",), "nppc": ("nppGetLibVersion",), - "nppial": ("nppiAdd_32f_C1R",), - "nppicc": ("nppiColorToGray_8u_C3C1R",), - "nppidei": ("nppiCopy_8u_C1R",), - "nppif": ("nppiFilterSobelHorizBorder_8u_C1R",), - "nppig": ("nppiResize_8u_C1R",), - "nppim": ("nppiErode_8u_C1R",), - "nppist": ("nppiMean_8u_C1R",), + "nppial": ("nppiAdd_32f_C1R_Ctx",), + "nppicc": ("nppiColorToGray_8u_C3C1R_Ctx",), + "nppidei": ("nppiCopy_8u_C1R_Ctx",), + "nppif": ("nppiFilterSobelHorizBorder_8u_C1R_Ctx",), + "nppig": ("nppiResize_8u_C1R_Ctx",), + "nppim": ("nppiErode_8u_C1R_Ctx",), + "nppist": ("nppiMean_8u_C1R_Ctx",), "nppisu": ("nppiFree",), - "nppitc": ("nppiThreshold_8u_C1R",), - "npps": ("nppsAdd_32f",), + "nppitc": ("nppiThreshold_8u_C1R_Ctx",), + "npps": ("nppsAdd_32f_Ctx",), "nvblas": ("dgemm",), "cufile": ("cuFileGetVersion",), # "cufile_rdma": ("rdma_buffer_reg",), From ca968c16b0fbdb4785604458da4416dbb6ef71ac Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Mon, 21 Jul 2025 15:37:33 -0700 Subject: [PATCH 25/65] cython-gen changes due to PR #101 (#115) --- cuda_bindings/cuda/bindings/driver.pyx.in | 1740 ++++++++++++------ cuda_bindings/cuda/bindings/nvrtc.pyx.in | 72 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 1064 +++++++---- cuda_bindings/docs/source/module/driver.rst | 2 +- cuda_bindings/docs/source/module/nvrtc.rst | 2 +- cuda_bindings/docs/source/module/runtime.rst | 2 +- 6 files changed, 1897 insertions(+), 985 deletions(-) diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 725e67ebcb..26cac65c09 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -23066,7 +23066,8 @@ def cuGetErrorString(error not None : CUresult): """ cdef cydriver.CUresult cyerror = error.value cdef const char* pStr = NULL - err = cydriver.cuGetErrorString(cyerror, &pStr) + with nogil: + err = cydriver.cuGetErrorString(cyerror, &pStr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pStr if pStr != NULL else None) @@ -23101,7 +23102,8 @@ def cuGetErrorName(error not None : CUresult): """ cdef cydriver.CUresult cyerror = error.value cdef const char* pStr = NULL - err = cydriver.cuGetErrorName(cyerror, &pStr) + with nogil: + err = cydriver.cuGetErrorName(cyerror, &pStr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pStr if pStr != NULL else None) @@ -23129,7 +23131,8 @@ def cuInit(unsigned int Flags): CUresult :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH`, :py:obj:`~.CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE` """ - err = cydriver.cuInit(Flags) + with nogil: + err = cydriver.cuInit(Flags) return (_dict_CUresult[err],) {{endif}} @@ -23158,7 +23161,8 @@ def cuDriverGetVersion(): :py:obj:`~.cudaDriverGetVersion`, :py:obj:`~.cudaRuntimeGetVersion` """ cdef int driverVersion = 0 - err = cydriver.cuDriverGetVersion(&driverVersion) + with nogil: + err = cydriver.cuDriverGetVersion(&driverVersion) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], driverVersion) @@ -23190,7 +23194,8 @@ def cuDeviceGet(int ordinal): :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport` """ cdef CUdevice device = CUdevice() - err = cydriver.cuDeviceGet(device._pvt_ptr, ordinal) + with nogil: + err = cydriver.cuDeviceGet(device._pvt_ptr, ordinal) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], device) @@ -23218,7 +23223,8 @@ def cuDeviceGetCount(): :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaGetDeviceCount` """ cdef int count = 0 - err = cydriver.cuDeviceGetCount(&count) + with nogil: + err = cydriver.cuDeviceGetCount(&count) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], count) @@ -23263,7 +23269,8 @@ def cuDeviceGetName(int length, dev): cydev = pdev pyname = b" " * length cdef char* name = pyname - err = cydriver.cuDeviceGetName(name, length, cydev) + with nogil: + 
err = cydriver.cuDeviceGetName(name, length, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pyname) @@ -23304,7 +23311,8 @@ def cuDeviceGetUuid(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUuuid uuid = CUuuid() - err = cydriver.cuDeviceGetUuid(uuid._pvt_ptr, cydev) + with nogil: + err = cydriver.cuDeviceGetUuid(uuid._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], uuid) @@ -23347,7 +23355,8 @@ def cuDeviceGetLuid(dev): cydev = pdev cdef char luid[8] cdef unsigned int deviceNodeMask = 0 - err = cydriver.cuDeviceGetLuid(luid, &deviceNodeMask, cydev) + with nogil: + err = cydriver.cuDeviceGetLuid(luid, &deviceNodeMask, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], luid, deviceNodeMask) @@ -23387,7 +23396,8 @@ def cuDeviceTotalMem(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef size_t numbytes = 0 - err = cydriver.cuDeviceTotalMem(&numbytes, cydev) + with nogil: + err = cydriver.cuDeviceTotalMem(&numbytes, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], numbytes) @@ -23434,7 +23444,8 @@ def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsign cydev = pdev cdef size_t maxWidthInElements = 0 cdef cydriver.CUarray_format cypformat = pformat.value - err = cydriver.cuDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cypformat, numChannels, cydev) + with nogil: + err = cydriver.cuDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cypformat, numChannels, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], maxWidthInElements) @@ -23477,7 +23488,8 @@ def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev): cydev = pdev cdef int pi = 0 cdef cydriver.CUdevice_attribute cyattrib = attrib.value - err = cydriver.cuDeviceGetAttribute(&pi, cyattrib, cydev) + with nogil: + err = cydriver.cuDeviceGetAttribute(&pi, cyattrib, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pi) @@ -23543,7 +23555,8 @@ def cuDeviceGetHostAtomicCapabilities(operations : Optional[Tuple[CUatomicOperat raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) cdef vector[cydriver.CUatomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) - err = cydriver.cuDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, cydev) + with nogil: + err = cydriver.cuDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, cydev) if CUresult(err) == CUresult(0): pycapabilities = [cycapabilities[idx] for idx in range(count)] if cycapabilities is not NULL: @@ -23646,7 +23659,8 @@ def cuDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, dev, int flags): cydev = pdev cynvSciSyncAttrList = utils.HelperInputVoidPtr(nvSciSyncAttrList) cdef void* cynvSciSyncAttrList_ptr = cynvSciSyncAttrList.cptr - err = cydriver.cuDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, cydev, flags) + with nogil: + err = cydriver.cuDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, cydev, flags) return (_dict_CUresult[err],) {{endif}} @@ -23697,7 +23711,8 @@ def cuDeviceSetMemPool(dev, pool): else: pdev = int(CUdevice(dev)) cydev = pdev - err = 
cydriver.cuDeviceSetMemPool(cydev, cypool) + with nogil: + err = cydriver.cuDeviceSetMemPool(cydev, cypool) return (_dict_CUresult[err],) {{endif}} @@ -23738,7 +23753,8 @@ def cuDeviceGetMemPool(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUmemoryPool pool = CUmemoryPool() - err = cydriver.cuDeviceGetMemPool(pool._pvt_ptr, cydev) + with nogil: + err = cydriver.cuDeviceGetMemPool(pool._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool) @@ -23778,7 +23794,8 @@ def cuDeviceGetDefaultMemPool(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUmemoryPool pool_out = CUmemoryPool() - err = cydriver.cuDeviceGetDefaultMemPool(pool_out._pvt_ptr, cydev) + with nogil: + err = cydriver.cuDeviceGetDefaultMemPool(pool_out._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool_out) @@ -23825,7 +23842,8 @@ def cuDeviceGetExecAffinitySupport(typename not None : CUexecAffinityType, dev): cydev = pdev cdef int pi = 0 cdef cydriver.CUexecAffinityType cytypename = typename.value - err = cydriver.cuDeviceGetExecAffinitySupport(&pi, cytypename, cydev) + with nogil: + err = cydriver.cuDeviceGetExecAffinitySupport(&pi, cytypename, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pi) @@ -23877,7 +23895,8 @@ def cuFlushGPUDirectRDMAWrites(target not None : CUflushGPUDirectRDMAWritesTarge """ cdef cydriver.CUflushGPUDirectRDMAWritesTarget cytarget = target.value cdef cydriver.CUflushGPUDirectRDMAWritesScope cyscope = scope.value - err = cydriver.cuFlushGPUDirectRDMAWrites(cytarget, cyscope) + with nogil: + err = cydriver.cuFlushGPUDirectRDMAWrites(cytarget, cyscope) return (_dict_CUresult[err],) {{endif}} @@ -23954,7 +23973,8 @@ def cuDeviceGetProperties(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUdevprop prop = CUdevprop() - err = cydriver.cuDeviceGetProperties(prop._pvt_ptr, cydev) + with nogil: + err = cydriver.cuDeviceGetProperties(prop._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], prop) @@ -24002,7 +24022,8 @@ def cuDeviceComputeCapability(dev): cydev = pdev cdef int major = 0 cdef int minor = 0 - err = cydriver.cuDeviceComputeCapability(&major, &minor, cydev) + with nogil: + err = cydriver.cuDeviceComputeCapability(&major, &minor, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], major, minor) @@ -24059,7 +24080,8 @@ def cuDevicePrimaryCtxRetain(dev): pdev = int(CUdevice(dev)) cydev = pdev cdef CUcontext pctx = CUcontext() - err = cydriver.cuDevicePrimaryCtxRetain(pctx._pvt_ptr, cydev) + with nogil: + err = cydriver.cuDevicePrimaryCtxRetain(pctx._pvt_ptr, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -24106,7 +24128,8 @@ def cuDevicePrimaryCtxRelease(dev): else: pdev = int(CUdevice(dev)) cydev = pdev - err = cydriver.cuDevicePrimaryCtxRelease(cydev) + with nogil: + err = cydriver.cuDevicePrimaryCtxRelease(cydev) return (_dict_CUresult[err],) {{endif}} @@ -24219,7 +24242,8 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags): else: pdev = int(CUdevice(dev)) cydev = pdev - err = cydriver.cuDevicePrimaryCtxSetFlags(cydev, flags) + with nogil: + err = cydriver.cuDevicePrimaryCtxSetFlags(cydev, flags) return (_dict_CUresult[err],) {{endif}} @@ -24261,7 +24285,8 @@ def cuDevicePrimaryCtxGetState(dev): cydev = pdev cdef 
unsigned int flags = 0 cdef int active = 0 - err = cydriver.cuDevicePrimaryCtxGetState(cydev, &flags, &active) + with nogil: + err = cydriver.cuDevicePrimaryCtxGetState(cydev, &flags, &active) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], flags, active) @@ -24307,7 +24332,8 @@ def cuDevicePrimaryCtxReset(dev): else: pdev = int(CUdevice(dev)) cydev = pdev - err = cydriver.cuDevicePrimaryCtxReset(cydev) + with nogil: + err = cydriver.cuDevicePrimaryCtxReset(cydev) return (_dict_CUresult[err],) {{endif}} @@ -24487,7 +24513,8 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag cydev = pdev cdef CUcontext pctx = CUcontext() cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL - err = cydriver.cuCtxCreate(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) + with nogil: + err = cydriver.cuCtxCreate(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -24551,7 +24578,8 @@ def cuCtxDestroy(ctx): else: pctx = int(CUcontext(ctx)) cyctx = pctx - err = cydriver.cuCtxDestroy(cyctx) + with nogil: + err = cydriver.cuCtxDestroy(cyctx) return (_dict_CUresult[err],) {{endif}} @@ -24591,7 +24619,8 @@ def cuCtxPushCurrent(ctx): else: pctx = int(CUcontext(ctx)) cyctx = pctx - err = cydriver.cuCtxPushCurrent(cyctx) + with nogil: + err = cydriver.cuCtxPushCurrent(cyctx) return (_dict_CUresult[err],) {{endif}} @@ -24621,7 +24650,8 @@ def cuCtxPopCurrent(): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize` """ cdef CUcontext pctx = CUcontext() - err = cydriver.cuCtxPopCurrent(pctx._pvt_ptr) + with nogil: + err = cydriver.cuCtxPopCurrent(pctx._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -24665,7 +24695,8 @@ def cuCtxSetCurrent(ctx): else: pctx = int(CUcontext(ctx)) cyctx = pctx - err = cydriver.cuCtxSetCurrent(cyctx) + with nogil: + err = cydriver.cuCtxSetCurrent(cyctx) return (_dict_CUresult[err],) {{endif}} @@ -24691,7 +24722,8 @@ def cuCtxGetCurrent(): :py:obj:`~.cuCtxSetCurrent`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cudaGetDevice` """ cdef CUcontext pctx = CUcontext() - err = cydriver.cuCtxGetCurrent(pctx._pvt_ptr) + with nogil: + err = cydriver.cuCtxGetCurrent(pctx._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -24717,7 +24749,8 @@ def cuCtxGetDevice(): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaGetDevice` """ cdef CUdevice device = CUdevice() - err = cydriver.cuCtxGetDevice(device._pvt_ptr) + with nogil: + err = cydriver.cuCtxGetDevice(device._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], device) @@ -24758,7 +24791,8 @@ def cuCtxGetDevice_v2(ctx): pctx = int(CUcontext(ctx)) cyctx = pctx 
cdef CUdevice device = CUdevice() - err = cydriver.cuCtxGetDevice_v2(device._pvt_ptr, cyctx) + with nogil: + err = cydriver.cuCtxGetDevice_v2(device._pvt_ptr, cyctx) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], device) @@ -24785,7 +24819,8 @@ def cuCtxGetFlags(): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxSetFlags`, :py:obj:`~.cudaGetDeviceFlags` """ cdef unsigned int flags = 0 - err = cydriver.cuCtxGetFlags(&flags) + with nogil: + err = cydriver.cuCtxGetFlags(&flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], flags) @@ -24814,7 +24849,8 @@ def cuCtxSetFlags(unsigned int flags): -------- :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cuDevicePrimaryCtxSetFlags`, """ - err = cydriver.cuCtxSetFlags(flags) + with nogil: + err = cydriver.cuCtxSetFlags(flags) return (_dict_CUresult[err],) {{endif}} @@ -24854,7 +24890,8 @@ def cuCtxGetId(ctx): pctx = int(CUcontext(ctx)) cyctx = pctx cdef unsigned long long ctxId = 0 - err = cydriver.cuCtxGetId(cyctx, &ctxId) + with nogil: + err = cydriver.cuCtxGetId(cyctx, &ctxId) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], ctxId) @@ -24883,7 +24920,8 @@ def cuCtxSynchronize(): -------- :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cudaDeviceSynchronize` """ - err = cydriver.cuCtxSynchronize() + with nogil: + err = cydriver.cuCtxSynchronize() return (_dict_CUresult[err],) {{endif}} @@ -24927,7 +24965,8 @@ def cuCtxSynchronize_v2(ctx): else: pctx = int(CUcontext(ctx)) cyctx = pctx - err = cydriver.cuCtxSynchronize_v2(cyctx) + with nogil: + err = cydriver.cuCtxSynchronize_v2(cyctx) return (_dict_CUresult[err],) {{endif}} @@ -25033,7 +25072,8 @@ def cuCtxSetLimit(limit not None : CUlimit, size_t value): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaDeviceSetLimit` """ cdef cydriver.CUlimit cylimit = limit.value - err = cydriver.cuCtxSetLimit(cylimit, value) + with nogil: + err = cydriver.cuCtxSetLimit(cylimit, value) return (_dict_CUresult[err],) {{endif}} @@ -25089,7 +25129,8 @@ def cuCtxGetLimit(limit not None : CUlimit): """ cdef size_t pvalue = 0 cdef cydriver.CUlimit cylimit = limit.value - err = cydriver.cuCtxGetLimit(&pvalue, cylimit) + with nogil: + err = cydriver.cuCtxGetLimit(&pvalue, cylimit) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pvalue) @@ -25136,7 +25177,8 @@ def cuCtxGetCacheConfig(): 
:py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig` """ cdef cydriver.CUfunc_cache pconfig - err = cydriver.cuCtxGetCacheConfig(&pconfig) + with nogil: + err = cydriver.cuCtxGetCacheConfig(&pconfig) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUfunc_cache(pconfig)) @@ -25195,7 +25237,8 @@ def cuCtxSetCacheConfig(config not None : CUfunc_cache): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cuKernelSetCacheConfig` """ cdef cydriver.CUfunc_cache cyconfig = config.value - err = cydriver.cuCtxSetCacheConfig(cyconfig) + with nogil: + err = cydriver.cuCtxSetCacheConfig(cyconfig) return (_dict_CUresult[err],) {{endif}} @@ -25240,7 +25283,8 @@ def cuCtxGetApiVersion(ctx): pctx = int(CUcontext(ctx)) cyctx = pctx cdef unsigned int version = 0 - err = cydriver.cuCtxGetApiVersion(cyctx, &version) + with nogil: + err = cydriver.cuCtxGetApiVersion(cyctx, &version) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], version) @@ -25285,7 +25329,8 @@ def cuCtxGetStreamPriorityRange(): """ cdef int leastPriority = 0 cdef int greatestPriority = 0 - err = cydriver.cuCtxGetStreamPriorityRange(&leastPriority, &greatestPriority) + with nogil: + err = cydriver.cuCtxGetStreamPriorityRange(&leastPriority, &greatestPriority) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], leastPriority, greatestPriority) @@ -25309,7 +25354,8 @@ def cuCtxResetPersistingL2Cache(): -------- :py:obj:`~.CUaccessPolicyWindow` """ - err = cydriver.cuCtxResetPersistingL2Cache() + with nogil: + err = cydriver.cuCtxResetPersistingL2Cache() return (_dict_CUresult[err],) {{endif}} @@ -25343,7 +25389,8 @@ def cuCtxGetExecAffinity(typename not None : CUexecAffinityType): """ cdef CUexecAffinityParam pExecAffinity = CUexecAffinityParam() cdef cydriver.CUexecAffinityType cytypename = typename.value - err = cydriver.cuCtxGetExecAffinity(pExecAffinity._pvt_ptr, cytypename) + with nogil: + err = cydriver.cuCtxGetExecAffinity(pExecAffinity._pvt_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pExecAffinity) @@ -25404,7 +25451,8 @@ def cuCtxRecordEvent(hCtx, hEvent): else: phCtx = int(CUcontext(hCtx)) cyhCtx = phCtx - err = cydriver.cuCtxRecordEvent(cyhCtx, cyhEvent) + with nogil: + err = cydriver.cuCtxRecordEvent(cyhCtx, cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -25462,7 +25510,8 @@ def cuCtxWaitEvent(hCtx, hEvent): else: phCtx = int(CUcontext(hCtx)) cyhCtx = phCtx - err = cydriver.cuCtxWaitEvent(cyhCtx, cyhEvent) + with nogil: + err = cydriver.cuCtxWaitEvent(cyhCtx, cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -25500,7 +25549,8 @@ def cuCtxAttach(unsigned int flags): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxDetach`, 
:py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize` """ cdef CUcontext pctx = CUcontext() - err = cydriver.cuCtxAttach(pctx._pvt_ptr, flags) + with nogil: + err = cydriver.cuCtxAttach(pctx._pvt_ptr, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -25543,7 +25593,8 @@ def cuCtxDetach(ctx): else: pctx = int(CUcontext(ctx)) cyctx = pctx - err = cydriver.cuCtxDetach(cyctx) + with nogil: + err = cydriver.cuCtxDetach(cyctx) return (_dict_CUresult[err],) {{endif}} @@ -25583,7 +25634,8 @@ def cuCtxGetSharedMemConfig(): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig` """ cdef cydriver.CUsharedconfig pConfig - err = cydriver.cuCtxGetSharedMemConfig(&pConfig) + with nogil: + err = cydriver.cuCtxGetSharedMemConfig(&pConfig) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUsharedconfig(pConfig)) @@ -25639,7 +25691,8 @@ def cuCtxSetSharedMemConfig(config not None : CUsharedconfig): :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceSetSharedMemConfig` """ cdef cydriver.CUsharedconfig cyconfig = config.value - err = cydriver.cuCtxSetSharedMemConfig(cyconfig) + with nogil: + err = cydriver.cuCtxSetSharedMemConfig(cyconfig) return (_dict_CUresult[err],) {{endif}} @@ -25675,7 +25728,8 @@ def cuModuleLoad(char* fname): :py:obj:`~.cuModuleGetFunction`, :py:obj:`~.cuModuleGetGlobal`, :py:obj:`~.cuModuleGetTexRef`, :py:obj:`~.cuModuleLoadData`, :py:obj:`~.cuModuleLoadDataEx`, :py:obj:`~.cuModuleLoadFatBinary`, :py:obj:`~.cuModuleUnload` """ cdef CUmodule module = CUmodule() - err = cydriver.cuModuleLoad(module._pvt_ptr, fname) + with nogil: + err = cydriver.cuModuleLoad(module._pvt_ptr, fname) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], module) @@ -25711,7 +25765,8 @@ def cuModuleLoadData(image): cdef CUmodule module = CUmodule() cyimage = utils.HelperInputVoidPtr(image) cdef void* cyimage_ptr = cyimage.cptr - err = cydriver.cuModuleLoadData(module._pvt_ptr, cyimage_ptr) + with nogil: + err = cydriver.cuModuleLoadData(module._pvt_ptr, cyimage_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], module) @@ -25762,7 +25817,9 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[Tuple[ cdef vector[cydriver.CUjit_option] cyoptions = [pyoptions.value for pyoptions in (options)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(options, optionValues)] cdef 
utils.InputVoidPtrPtrHelper voidStarHelperoptionValues = utils.InputVoidPtrPtrHelper(pylist) - err = cydriver.cuModuleLoadDataEx(module._pvt_ptr, cyimage_ptr, numOptions, cyoptions.data(), voidStarHelperoptionValues.cptr) + cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr + with nogil: + err = cydriver.cuModuleLoadDataEx(module._pvt_ptr, cyimage_ptr, numOptions, cyoptions.data(), cyoptionValues_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], module) @@ -25804,7 +25861,8 @@ def cuModuleLoadFatBinary(fatCubin): cdef CUmodule module = CUmodule() cyfatCubin = utils.HelperInputVoidPtr(fatCubin) cdef void* cyfatCubin_ptr = cyfatCubin.cptr - err = cydriver.cuModuleLoadFatBinary(module._pvt_ptr, cyfatCubin_ptr) + with nogil: + err = cydriver.cuModuleLoadFatBinary(module._pvt_ptr, cyfatCubin_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], module) @@ -25843,7 +25901,8 @@ def cuModuleUnload(hmod): else: phmod = int(CUmodule(hmod)) cyhmod = phmod - err = cydriver.cuModuleUnload(cyhmod) + with nogil: + err = cydriver.cuModuleUnload(cyhmod) return (_dict_CUresult[err],) {{endif}} @@ -25868,7 +25927,8 @@ def cuModuleGetLoadingMode(): :py:obj:`~.cuModuleLoad`, """ cdef cydriver.CUmoduleLoadingMode mode - err = cydriver.cuModuleGetLoadingMode(&mode) + with nogil: + err = cydriver.cuModuleGetLoadingMode(&mode) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUmoduleLoadingMode(mode)) @@ -25912,7 +25972,8 @@ def cuModuleGetFunction(hmod, char* name): phmod = int(CUmodule(hmod)) cyhmod = phmod cdef CUfunction hfunc = CUfunction() - err = cydriver.cuModuleGetFunction(hfunc._pvt_ptr, cyhmod, name) + with nogil: + err = cydriver.cuModuleGetFunction(hfunc._pvt_ptr, cyhmod, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], hfunc) @@ -25947,7 +26008,8 @@ def cuModuleGetFunctionCount(mod): pmod = int(CUmodule(mod)) cymod = pmod cdef unsigned int count = 0 - err = cydriver.cuModuleGetFunctionCount(&count, cymod) + with nogil: + err = cydriver.cuModuleGetFunctionCount(&count, cymod) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], count) @@ -26001,7 +26063,8 @@ def cuModuleEnumerateFunctions(unsigned int numFunctions, mod): cyfunctions = calloc(numFunctions, sizeof(cydriver.CUfunction)) if cyfunctions is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(numFunctions) + 'x' + str(sizeof(cydriver.CUfunction))) - err = cydriver.cuModuleEnumerateFunctions(cyfunctions, numFunctions, cymod) + with nogil: + err = cydriver.cuModuleEnumerateFunctions(cyfunctions, numFunctions, cymod) if CUresult(err) == CUresult(0): pyfunctions = [CUfunction(init_value=cyfunctions[idx]) for idx in range(numFunctions)] if cyfunctions is not NULL: @@ -26053,7 +26116,8 @@ def cuModuleGetGlobal(hmod, char* name): cyhmod = phmod cdef CUdeviceptr dptr = CUdeviceptr() cdef size_t numbytes = 0 - err = cydriver.cuModuleGetGlobal(dptr._pvt_ptr, &numbytes, cyhmod, name) + with nogil: + err = cydriver.cuModuleGetGlobal(dptr._pvt_ptr, &numbytes, cyhmod, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], dptr, numbytes) @@ -26121,8 +26185,10 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[Tuple[CUjit_option] cdef vector[cydriver.CUjit_option] cyoptions = [pyoptions.value for pyoptions in 
(options)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(options, optionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperoptionValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr cdef CUlinkState stateOut = CUlinkState() - err = cydriver.cuLinkCreate(numOptions, cyoptions.data(), voidStarHelperoptionValues.cptr, stateOut._pvt_ptr) + with nogil: + err = cydriver.cuLinkCreate(numOptions, cyoptions.data(), cyoptionValues_ptr, stateOut._pvt_ptr) stateOut._keepalive.append(voidStarHelperoptionValues) for option in pylist: stateOut._keepalive.append(option) @@ -26199,7 +26265,9 @@ def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size, cdef vector[cydriver.CUjit_option] cyoptions = [pyoptions.value for pyoptions in (options)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(options, optionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperoptionValues = utils.InputVoidPtrPtrHelper(pylist) - err = cydriver.cuLinkAddData(cystate, cytypename, cydata_ptr, size, name, numOptions, cyoptions.data(), voidStarHelperoptionValues.cptr) + cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr + with nogil: + err = cydriver.cuLinkAddData(cystate, cytypename, cydata_ptr, size, name, numOptions, cyoptions.data(), cyoptionValues_ptr) return (_dict_CUresult[err],) {{endif}} @@ -26267,7 +26335,9 @@ def cuLinkAddFile(state, typename not None : CUjitInputType, char* path, unsigne cdef vector[cydriver.CUjit_option] cyoptions = [pyoptions.value for pyoptions in (options)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(options, optionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperoptionValues = utils.InputVoidPtrPtrHelper(pylist) - err = cydriver.cuLinkAddFile(cystate, cytypename, path, numOptions, cyoptions.data(), voidStarHelperoptionValues.cptr) + cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr + with nogil: + err = cydriver.cuLinkAddFile(cystate, cytypename, path, numOptions, cyoptions.data(), cyoptionValues_ptr) return (_dict_CUresult[err],) {{endif}} @@ -26311,7 +26381,8 @@ def cuLinkComplete(state): cystate = pstate cdef void_ptr cubinOut = 0 cdef size_t sizeOut = 0 - err = cydriver.cuLinkComplete(cystate, &cubinOut, &sizeOut) + with nogil: + err = cydriver.cuLinkComplete(cystate, &cubinOut, &sizeOut) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], cubinOut, sizeOut) @@ -26345,7 +26416,8 @@ def cuLinkDestroy(state): else: pstate = int(CUlinkState(state)) cystate = pstate - err = cydriver.cuLinkDestroy(cystate) + with nogil: + err = cydriver.cuLinkDestroy(cystate) return (_dict_CUresult[err],) {{endif}} @@ -26391,7 +26463,8 @@ def cuModuleGetTexRef(hmod, char* name): phmod = int(CUmodule(hmod)) cyhmod = phmod cdef CUtexref pTexRef = CUtexref() - err = cydriver.cuModuleGetTexRef(pTexRef._pvt_ptr, cyhmod, name) + with nogil: + err = cydriver.cuModuleGetTexRef(pTexRef._pvt_ptr, cyhmod, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pTexRef) @@ -26437,7 +26510,8 @@ def cuModuleGetSurfRef(hmod, char* name): phmod = int(CUmodule(hmod)) cyhmod = phmod cdef CUsurfref pSurfRef = CUsurfref() - err = cydriver.cuModuleGetSurfRef(pSurfRef._pvt_ptr, cyhmod, name) + with nogil: + err = cydriver.cuModuleGetSurfRef(pSurfRef._pvt_ptr, 
cyhmod, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pSurfRef) @@ -26525,14 +26599,17 @@ def cuLibraryLoadData(code, jitOptions : Optional[Tuple[CUjit_option] | List[CUj cdef vector[cydriver.CUjit_option] cyjitOptions = [pyjitOptions.value for pyjitOptions in (jitOptions)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cyjitOptionsValues_ptr = voidStarHelperjitOptionsValues.cptr if numJitOptions > len(jitOptions): raise RuntimeError("List is too small: " + str(len(jitOptions)) + " < " + str(numJitOptions)) if numJitOptions > len(jitOptionsValues): raise RuntimeError("List is too small: " + str(len(jitOptionsValues)) + " < " + str(numJitOptions)) cdef vector[cydriver.CUlibraryOption] cylibraryOptions = [pylibraryOptions.value for pylibraryOptions in (libraryOptions)] pylist = [utils.HelperCUlibraryOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(libraryOptions, libraryOptionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperlibraryOptionValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cylibraryOptionValues_ptr = voidStarHelperlibraryOptionValues.cptr if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " < " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) - err = cydriver.cuLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), voidStarHelperjitOptionsValues.cptr, numJitOptions, cylibraryOptions.data(), voidStarHelperlibraryOptionValues.cptr, numLibraryOptions) + with nogil: + err = cydriver.cuLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], library) @@ -26619,14 +26696,17 @@ def cuLibraryLoadFromFile(char* fileName, jitOptions : Optional[Tuple[CUjit_opti cdef vector[cydriver.CUjit_option] cyjitOptions = [pyjitOptions.value for pyjitOptions in (jitOptions)] pylist = [utils.HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cyjitOptionsValues_ptr = voidStarHelperjitOptionsValues.cptr if numJitOptions > len(jitOptions): raise RuntimeError("List is too small: " + str(len(jitOptions)) + " < " + str(numJitOptions)) if numJitOptions > len(jitOptionsValues): raise RuntimeError("List is too small: " + str(len(jitOptionsValues)) + " < " + str(numJitOptions)) cdef vector[cydriver.CUlibraryOption] cylibraryOptions = [pylibraryOptions.value for pylibraryOptions in (libraryOptions)] pylist = [utils.HelperCUlibraryOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(libraryOptions, libraryOptionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperlibraryOptionValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cylibraryOptionValues_ptr = voidStarHelperlibraryOptionValues.cptr if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " 
< " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) - err = cydriver.cuLibraryLoadFromFile(library._pvt_ptr, fileName, cyjitOptions.data(), voidStarHelperjitOptionsValues.cptr, numJitOptions, cylibraryOptions.data(), voidStarHelperlibraryOptionValues.cptr, numLibraryOptions) + with nogil: + err = cydriver.cuLibraryLoadFromFile(library._pvt_ptr, fileName, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], library) @@ -26662,7 +26742,8 @@ def cuLibraryUnload(library): else: plibrary = int(CUlibrary(library)) cylibrary = plibrary - err = cydriver.cuLibraryUnload(cylibrary) + with nogil: + err = cydriver.cuLibraryUnload(cylibrary) return (_dict_CUresult[err],) {{endif}} @@ -26703,7 +26784,8 @@ def cuLibraryGetKernel(library, char* name): plibrary = int(CUlibrary(library)) cylibrary = plibrary cdef CUkernel pKernel = CUkernel() - err = cydriver.cuLibraryGetKernel(pKernel._pvt_ptr, cylibrary, name) + with nogil: + err = cydriver.cuLibraryGetKernel(pKernel._pvt_ptr, cylibrary, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pKernel) @@ -26738,7 +26820,8 @@ def cuLibraryGetKernelCount(lib): plib = int(CUlibrary(lib)) cylib = plib cdef unsigned int count = 0 - err = cydriver.cuLibraryGetKernelCount(&count, cylib) + with nogil: + err = cydriver.cuLibraryGetKernelCount(&count, cylib) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], count) @@ -26786,7 +26869,8 @@ def cuLibraryEnumerateKernels(unsigned int numKernels, lib): cykernels = calloc(numKernels, sizeof(cydriver.CUkernel)) if cykernels is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(numKernels) + 'x' + str(sizeof(cydriver.CUkernel))) - err = cydriver.cuLibraryEnumerateKernels(cykernels, numKernels, cylib) + with nogil: + err = cydriver.cuLibraryEnumerateKernels(cykernels, numKernels, cylib) if CUresult(err) == CUresult(0): pykernels = [CUkernel(init_value=cykernels[idx]) for idx in range(numKernels)] if cykernels is not NULL: @@ -26831,7 +26915,8 @@ def cuLibraryGetModule(library): plibrary = int(CUlibrary(library)) cylibrary = plibrary cdef CUmodule pMod = CUmodule() - err = cydriver.cuLibraryGetModule(pMod._pvt_ptr, cylibrary) + with nogil: + err = cydriver.cuLibraryGetModule(pMod._pvt_ptr, cylibrary) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pMod) @@ -26872,7 +26957,8 @@ def cuKernelGetFunction(kernel): pkernel = int(CUkernel(kernel)) cykernel = pkernel cdef CUfunction pFunc = CUfunction() - err = cydriver.cuKernelGetFunction(pFunc._pvt_ptr, cykernel) + with nogil: + err = cydriver.cuKernelGetFunction(pFunc._pvt_ptr, cykernel) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pFunc) @@ -26912,7 +26998,8 @@ def cuKernelGetLibrary(kernel): pkernel = int(CUkernel(kernel)) cykernel = pkernel cdef CUlibrary pLib = CUlibrary() - err = cydriver.cuKernelGetLibrary(pLib._pvt_ptr, cykernel) + with nogil: + err = cydriver.cuKernelGetLibrary(pLib._pvt_ptr, cykernel) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pLib) @@ -26960,7 +27047,8 @@ def 
cuLibraryGetGlobal(library, char* name): cylibrary = plibrary cdef CUdeviceptr dptr = CUdeviceptr() cdef size_t numbytes = 0 - err = cydriver.cuLibraryGetGlobal(dptr._pvt_ptr, &numbytes, cylibrary, name) + with nogil: + err = cydriver.cuLibraryGetGlobal(dptr._pvt_ptr, &numbytes, cylibrary, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], dptr, numbytes) @@ -27010,7 +27098,8 @@ def cuLibraryGetManaged(library, char* name): cylibrary = plibrary cdef CUdeviceptr dptr = CUdeviceptr() cdef size_t numbytes = 0 - err = cydriver.cuLibraryGetManaged(dptr._pvt_ptr, &numbytes, cylibrary, name) + with nogil: + err = cydriver.cuLibraryGetManaged(dptr._pvt_ptr, &numbytes, cylibrary, name) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], dptr, numbytes) @@ -27056,7 +27145,8 @@ def cuLibraryGetUnifiedFunction(library, char* symbol): plibrary = int(CUlibrary(library)) cylibrary = plibrary cdef void_ptr fptr = 0 - err = cydriver.cuLibraryGetUnifiedFunction(&fptr, cylibrary, symbol) + with nogil: + err = cydriver.cuLibraryGetUnifiedFunction(&fptr, cylibrary, symbol) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], fptr) @@ -27190,7 +27280,8 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev): cykernel = pkernel cdef int pi = 0 cdef cydriver.CUfunction_attribute cyattrib = attrib.value - err = cydriver.cuKernelGetAttribute(&pi, cyattrib, cykernel, cydev) + with nogil: + err = cydriver.cuKernelGetAttribute(&pi, cyattrib, cykernel, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pi) @@ -27306,7 +27397,8 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel pkernel = int(CUkernel(kernel)) cykernel = pkernel cdef cydriver.CUfunction_attribute cyattrib = attrib.value - err = cydriver.cuKernelSetAttribute(cyattrib, val, cykernel, cydev) + with nogil: + err = cydriver.cuKernelSetAttribute(cyattrib, val, cykernel, cydev) return (_dict_CUresult[err],) {{endif}} @@ -27389,7 +27481,8 @@ def cuKernelSetCacheConfig(kernel, config not None : CUfunc_cache, dev): pkernel = int(CUkernel(kernel)) cykernel = pkernel cdef cydriver.CUfunc_cache cyconfig = config.value - err = cydriver.cuKernelSetCacheConfig(cykernel, cyconfig, cydev) + with nogil: + err = cydriver.cuKernelSetCacheConfig(cykernel, cyconfig, cydev) return (_dict_CUresult[err],) {{endif}} @@ -27428,7 +27521,8 @@ def cuKernelGetName(hfunc): phfunc = int(CUkernel(hfunc)) cyhfunc = phfunc cdef const char* name = NULL - err = cydriver.cuKernelGetName(&name, cyhfunc) + with nogil: + err = cydriver.cuKernelGetName(&name, cyhfunc) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], name if name != NULL else None) @@ -27482,7 +27576,8 @@ def cuKernelGetParamInfo(kernel, size_t paramIndex): cykernel = pkernel cdef size_t paramOffset = 0 cdef size_t paramSize = 0 - err = cydriver.cuKernelGetParamInfo(cykernel, paramIndex, &paramOffset, &paramSize) + with nogil: + err = cydriver.cuKernelGetParamInfo(cykernel, paramIndex, &paramOffset, &paramSize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], paramOffset, paramSize) @@ -27526,7 +27621,8 @@ def cuMemGetInfo(): """ cdef size_t free = 0 cdef size_t total = 0 - err = cydriver.cuMemGetInfo(&free, &total) + with nogil: + err = cydriver.cuMemGetInfo(&free, &total) if err !=
cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], free, total) @@ -27561,7 +27657,8 @@ def cuMemAlloc(size_t bytesize): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc` """ cdef CUdeviceptr dptr = CUdeviceptr() - err = cydriver.cuMemAlloc(dptr._pvt_ptr, bytesize) + with nogil: + err = cydriver.cuMemAlloc(dptr._pvt_ptr, bytesize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dptr) @@ -27629,7 +27726,8 @@ def cuMemAllocPitch(size_t WidthInBytes, size_t Height, unsigned int ElementSize """ cdef CUdeviceptr dptr = CUdeviceptr() cdef size_t pPitch = 0 - err = cydriver.cuMemAllocPitch(dptr._pvt_ptr, &pPitch, WidthInBytes, Height, ElementSizeBytes) + with nogil: + err = cydriver.cuMemAllocPitch(dptr._pvt_ptr, &pPitch, WidthInBytes, Height, ElementSizeBytes) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], dptr, pPitch) @@ -27678,7 +27776,8 @@ def cuMemFree(dptr): else: pdptr = int(CUdeviceptr(dptr)) cydptr = pdptr - err = cydriver.cuMemFree(cydptr) + with nogil: + err = cydriver.cuMemFree(cydptr) return (_dict_CUresult[err],) {{endif}} @@ -27721,7 +27820,8 @@ def cuMemGetAddressRange(dptr): cydptr = pdptr cdef CUdeviceptr pbase = CUdeviceptr() cdef size_t psize = 0 - err = cydriver.cuMemGetAddressRange(pbase._pvt_ptr, &psize, cydptr) + with nogil: + err = cydriver.cuMemGetAddressRange(pbase._pvt_ptr, &psize, cydptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], pbase, psize) @@ -27777,7 +27877,8 @@ def cuMemAllocHost(size_t bytesize): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, 
:py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocHost` """ cdef void_ptr pp = 0 - err = cydriver.cuMemAllocHost(&pp, bytesize) + with nogil: + err = cydriver.cuMemAllocHost(&pp, bytesize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pp) @@ -27808,7 +27909,8 @@ def cuMemFreeHost(p): """ cyp = utils.HelperInputVoidPtr(p) cdef void* cyp_ptr = cyp.cptr - err = cydriver.cuMemFreeHost(cyp_ptr) + with nogil: + err = cydriver.cuMemFreeHost(cyp_ptr) return (_dict_CUresult[err],) {{endif}} @@ -27898,7 +28000,8 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaHostAlloc` """ cdef void_ptr pp = 0 - err = cydriver.cuMemHostAlloc(&pp, bytesize, Flags) + with nogil: + err = cydriver.cuMemHostAlloc(&pp, bytesize, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pp) @@ -27960,7 +28063,8 @@ def cuMemHostGetDevicePointer(p, unsigned int Flags): cdef CUdeviceptr pdptr = CUdeviceptr() cyp = utils.HelperInputVoidPtr(p) cdef void* cyp_ptr = cyp.cptr - err = cydriver.cuMemHostGetDevicePointer(pdptr._pvt_ptr, cyp_ptr, Flags) + with nogil: + err = cydriver.cuMemHostGetDevicePointer(pdptr._pvt_ptr, cyp_ptr, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pdptr) @@ -27998,7 +28102,8 @@ def cuMemHostGetFlags(p): cdef unsigned int pFlags = 0 cyp = utils.HelperInputVoidPtr(p) cdef void* cyp_ptr = cyp.cptr - err = cydriver.cuMemHostGetFlags(&pFlags, cyp_ptr) + with nogil: + err = cydriver.cuMemHostGetFlags(&pFlags, cyp_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pFlags) @@ -28134,7 +28239,8 @@ def cuMemAllocManaged(size_t bytesize, unsigned int flags): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, 
:py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuStreamAttachMemAsync`, :py:obj:`~.cudaMallocManaged` """ cdef CUdeviceptr dptr = CUdeviceptr() - err = cydriver.cuMemAllocManaged(dptr._pvt_ptr, bytesize, flags) + with nogil: + err = cydriver.cuMemAllocManaged(dptr._pvt_ptr, bytesize, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dptr) @@ -28226,7 +28332,6 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData): cdef CUasyncCallbackHandle callback = CUasyncCallbackHandle() with nogil: err = cydriver.cuDeviceRegisterAsyncNotification(cydevice, cuAsyncNotificationCallbackWrapper, cbData, callback._pvt_ptr) - if err != cydriver.CUDA_SUCCESS: free(cbData) else: @@ -28278,7 +28383,8 @@ def cuDeviceUnregisterAsyncNotification(device, callback): else: pdevice = int(CUdevice(device)) cydevice = pdevice - err = cydriver.cuDeviceUnregisterAsyncNotification(cydevice, cycallback) + with nogil: + err = cydriver.cuDeviceUnregisterAsyncNotification(cydevice, cycallback) if err == cydriver.CUDA_SUCCESS: free(m_global._allocated[pcallback]) m_global._allocated.erase(pcallback) @@ -28313,7 +28419,8 @@ def cuDeviceGetByPCIBusId(char* pciBusId): :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetPCIBusId`, :py:obj:`~.cudaDeviceGetByPCIBusId` """ cdef CUdevice dev = CUdevice() - err = cydriver.cuDeviceGetByPCIBusId(dev._pvt_ptr, pciBusId) + with nogil: + err = cydriver.cuDeviceGetByPCIBusId(dev._pvt_ptr, pciBusId) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dev) @@ -28361,7 +28468,8 @@ def cuDeviceGetPCIBusId(int length, dev): cydev = pdev pypciBusId = b" " * length cdef char* pciBusId = pypciBusId - err = cydriver.cuDeviceGetPCIBusId(pciBusId, length, cydev) + with nogil: + err = cydriver.cuDeviceGetPCIBusId(pciBusId, length, cydev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pypciBusId) @@ -28421,7 +28529,8 @@ def cuIpcGetEventHandle(event): pevent = int(CUevent(event)) cyevent = pevent cdef CUipcEventHandle pHandle = CUipcEventHandle() - err = cydriver.cuIpcGetEventHandle(pHandle._pvt_ptr, cyevent) + with nogil: + err = cydriver.cuIpcGetEventHandle(pHandle._pvt_ptr, cyevent) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle) @@ -28467,7 +28576,8 @@ def cuIpcOpenEventHandle(handle not None : CUipcEventHandle): :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cudaIpcOpenEventHandle` """ cdef CUevent phEvent = CUevent() - err = cydriver.cuIpcOpenEventHandle(phEvent._pvt_ptr, handle._pvt_ptr[0]) + with nogil: + err = cydriver.cuIpcOpenEventHandle(phEvent._pvt_ptr, handle._pvt_ptr[0]) if err != cydriver.CUDA_SUCCESS: return 
(_dict_CUresult[err], None) return (_dict_CUresult[err], phEvent) @@ -28522,7 +28632,8 @@ def cuIpcGetMemHandle(dptr): pdptr = int(CUdeviceptr(dptr)) cydptr = pdptr cdef CUipcMemHandle pHandle = CUipcMemHandle() - err = cydriver.cuIpcGetMemHandle(pHandle._pvt_ptr, cydptr) + with nogil: + err = cydriver.cuIpcGetMemHandle(pHandle._pvt_ptr, cydptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle) @@ -28590,7 +28701,8 @@ def cuIpcOpenMemHandle(handle not None : CUipcMemHandle, unsigned int Flags): No guarantees are made about the address returned in `*pdptr`. In particular, multiple processes may not receive the same address for the same `handle`. """ cdef CUdeviceptr pdptr = CUdeviceptr() - err = cydriver.cuIpcOpenMemHandle(pdptr._pvt_ptr, handle._pvt_ptr[0], Flags) + with nogil: + err = cydriver.cuIpcOpenMemHandle(pdptr._pvt_ptr, handle._pvt_ptr[0], Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pdptr) @@ -28640,7 +28752,8 @@ def cuIpcCloseMemHandle(dptr): else: pdptr = int(CUdeviceptr(dptr)) cydptr = pdptr - err = cydriver.cuIpcCloseMemHandle(cydptr) + with nogil: + err = cydriver.cuIpcCloseMemHandle(cydptr) return (_dict_CUresult[err],) {{endif}} @@ -28744,7 +28857,8 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags): """ cyp = utils.HelperInputVoidPtr(p) cdef void* cyp_ptr = cyp.cptr - err = cydriver.cuMemHostRegister(cyp_ptr, bytesize, Flags) + with nogil: + err = cydriver.cuMemHostRegister(cyp_ptr, bytesize, Flags) return (_dict_CUresult[err],) {{endif}} @@ -28776,7 +28890,8 @@ def cuMemHostUnregister(p): """ cyp = utils.HelperInputVoidPtr(p) cdef void* cyp_ptr = cyp.cptr - err = cydriver.cuMemHostUnregister(cyp_ptr) + with nogil: + err = cydriver.cuMemHostUnregister(cyp_ptr) return (_dict_CUresult[err],) {{endif}} @@ -28827,7 +28942,8 @@ def cuMemcpy(dst, src, size_t ByteCount): else: pdst = int(CUdeviceptr(dst)) cydst = pdst - err = cydriver.cuMemcpy(cydst, cysrc, ByteCount) + with nogil: + err = cydriver.cuMemcpy(cydst, cysrc, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -28897,7 +29013,8 @@ def cuMemcpyPeer(dstDevice, dstContext, srcDevice, srcContext, size_t ByteCount) else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemcpyPeer(cydstDevice, cydstContext, cysrcDevice, cysrcContext, ByteCount) + with nogil: + err = cydriver.cuMemcpyPeer(cydstDevice, cydstContext, cysrcDevice, cysrcContext, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -28939,7 +29056,8 @@ def cuMemcpyHtoD(dstDevice, srcHost, size_t ByteCount): cydstDevice = pdstDevice cysrcHost = utils.HelperInputVoidPtr(srcHost) cdef void* cysrcHost_ptr = cysrcHost.cptr - err = cydriver.cuMemcpyHtoD(cydstDevice, cysrcHost_ptr, ByteCount) + with nogil: + err = cydriver.cuMemcpyHtoD(cydstDevice, cysrcHost_ptr, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -28981,7 +29099,8 @@ def cuMemcpyDtoH(dstHost, srcDevice, size_t ByteCount): cysrcDevice = psrcDevice cydstHost = utils.HelperInputVoidPtr(dstHost) cdef void* cydstHost_ptr = cydstHost.cptr - err = cydriver.cuMemcpyDtoH(cydstHost_ptr, cysrcDevice, ByteCount) + with nogil: + err = cydriver.cuMemcpyDtoH(cydstHost_ptr, cysrcDevice, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29029,7 +29148,8 @@ def cuMemcpyDtoD(dstDevice, srcDevice, size_t ByteCount): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemcpyDtoD(cydstDevice, cysrcDevice, 
ByteCount) + with nogil: + err = cydriver.cuMemcpyDtoD(cydstDevice, cysrcDevice, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29080,7 +29200,8 @@ def cuMemcpyDtoA(dstArray, size_t dstOffset, srcDevice, size_t ByteCount): else: pdstArray = int(CUarray(dstArray)) cydstArray = pdstArray - err = cydriver.cuMemcpyDtoA(cydstArray, dstOffset, cysrcDevice, ByteCount) + with nogil: + err = cydriver.cuMemcpyDtoA(cydstArray, dstOffset, cysrcDevice, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29133,7 +29254,8 @@ def cuMemcpyAtoD(dstDevice, srcArray, size_t srcOffset, size_t ByteCount): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemcpyAtoD(cydstDevice, cysrcArray, srcOffset, ByteCount) + with nogil: + err = cydriver.cuMemcpyAtoD(cydstDevice, cysrcArray, srcOffset, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29178,7 +29300,8 @@ def cuMemcpyHtoA(dstArray, size_t dstOffset, srcHost, size_t ByteCount): cydstArray = pdstArray cysrcHost = utils.HelperInputVoidPtr(srcHost) cdef void* cysrcHost_ptr = cysrcHost.cptr - err = cydriver.cuMemcpyHtoA(cydstArray, dstOffset, cysrcHost_ptr, ByteCount) + with nogil: + err = cydriver.cuMemcpyHtoA(cydstArray, dstOffset, cysrcHost_ptr, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29223,7 +29346,8 @@ def cuMemcpyAtoH(dstHost, srcArray, size_t srcOffset, size_t ByteCount): cysrcArray = psrcArray cydstHost = utils.HelperInputVoidPtr(dstHost) cdef void* cydstHost_ptr = cydstHost.cptr - err = cydriver.cuMemcpyAtoH(cydstHost_ptr, cysrcArray, srcOffset, ByteCount) + with nogil: + err = cydriver.cuMemcpyAtoH(cydstHost_ptr, cysrcArray, srcOffset, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29279,7 +29403,8 @@ def cuMemcpyAtoA(dstArray, size_t dstOffset, srcArray, size_t srcOffset, size_t else: pdstArray = int(CUarray(dstArray)) cydstArray = pdstArray - err = cydriver.cuMemcpyAtoA(cydstArray, dstOffset, cysrcArray, srcOffset, ByteCount) + with nogil: + err = cydriver.cuMemcpyAtoA(cydstArray, dstOffset, cysrcArray, srcOffset, ByteCount) return (_dict_CUresult[err],) {{endif}} @@ -29405,7 +29530,8 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray` """ cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy2D(cypCopy_ptr) + with nogil: + err = cydriver.cuMemcpy2D(cypCopy_ptr) return (_dict_CUresult[err],) {{endif}} 
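The `with nogil:` wrappers being added throughout these hunks mean a blocking driver call no longer stalls unrelated Python threads. A minimal sketch of the effect (illustrative; `buf` is assumed to be a writable host buffer and `dptr` a device allocation of matching size):

    import threading
    from cuda.bindings import driver

    def blocking_copy(buf, dptr, nbytes):
        # The GIL is released inside the binding, so other Python
        # threads keep running while this synchronous copy executes.
        err, = driver.cuMemcpyDtoH(buf, dptr, nbytes)
        assert err == driver.CUresult.CUDA_SUCCESS

    t = threading.Thread(target=blocking_copy, args=(buf, dptr, nbytes))
    t.start()
    # ... unrelated Python work proceeds here concurrently ...
    t.join()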
@@ -29531,7 +29657,8 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray` """ cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy2DUnaligned(cypCopy_ptr) + with nogil: + err = cydriver.cuMemcpy2DUnaligned(cypCopy_ptr) return (_dict_CUresult[err],) {{endif}} @@ -29660,7 +29787,8 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy3D` """ cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy3D(cypCopy_ptr) + with nogil: + err = cydriver.cuMemcpy3D(cypCopy_ptr) return (_dict_CUresult[err],) {{endif}} @@ -29689,7 +29817,8 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]): :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeer` """ cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy3DPeer(cypCopy_ptr) + with nogil: + err = cydriver.cuMemcpy3DPeer(cypCopy_ptr) return (_dict_CUresult[err],) {{endif}} @@ -29750,7 +29879,8 @@ def cuMemcpyAsync(dst, src, size_t ByteCount, hStream): else: pdst = int(CUdeviceptr(dst)) cydst = pdst - err = cydriver.cuMemcpyAsync(cydst, cysrc, ByteCount, cyhStream) + with nogil: + err = 
cydriver.cuMemcpyAsync(cydst, cysrc, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -29830,7 +29960,8 @@ def cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext, size_t ByteC else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemcpyPeerAsync(cydstDevice, cydstContext, cysrcDevice, cysrcContext, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyPeerAsync(cydstDevice, cydstContext, cysrcDevice, cysrcContext, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -29882,7 +30013,8 @@ def cuMemcpyHtoDAsync(dstDevice, srcHost, size_t ByteCount, hStream): cydstDevice = pdstDevice cysrcHost = utils.HelperInputVoidPtr(srcHost) cdef void* cysrcHost_ptr = cysrcHost.cptr - err = cydriver.cuMemcpyHtoDAsync(cydstDevice, cysrcHost_ptr, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyHtoDAsync(cydstDevice, cysrcHost_ptr, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -29934,7 +30066,8 @@ def cuMemcpyDtoHAsync(dstHost, srcDevice, size_t ByteCount, hStream): cysrcDevice = psrcDevice cydstHost = utils.HelperInputVoidPtr(dstHost) cdef void* cydstHost_ptr = cydstHost.cptr - err = cydriver.cuMemcpyDtoHAsync(cydstHost_ptr, cysrcDevice, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyDtoHAsync(cydstHost_ptr, cysrcDevice, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -29992,7 +30125,8 @@ def cuMemcpyDtoDAsync(dstDevice, srcDevice, size_t ByteCount, hStream): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemcpyDtoDAsync(cydstDevice, cysrcDevice, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyDtoDAsync(cydstDevice, cysrcDevice, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -30047,7 +30181,8 @@ def cuMemcpyHtoAAsync(dstArray, size_t dstOffset, srcHost, size_t ByteCount, hSt cydstArray = pdstArray cysrcHost = utils.HelperInputVoidPtr(srcHost) cdef void* cysrcHost_ptr = cysrcHost.cptr - err = cydriver.cuMemcpyHtoAAsync(cydstArray, dstOffset, cysrcHost_ptr, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyHtoAAsync(cydstArray, dstOffset, cysrcHost_ptr, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -30102,7 +30237,8 @@ def cuMemcpyAtoHAsync(dstHost, srcArray, size_t srcOffset, size_t ByteCount, hSt cysrcArray = psrcArray cydstHost = utils.HelperInputVoidPtr(dstHost) cdef void* cydstHost_ptr = cydstHost.cptr - err = cydriver.cuMemcpyAtoHAsync(cydstHost_ptr, cysrcArray, srcOffset, ByteCount, cyhStream) + with nogil: + err = cydriver.cuMemcpyAtoHAsync(cydstHost_ptr, cysrcArray, srcOffset, ByteCount, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -30245,7 +30381,8 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy2DAsync(cypCopy_ptr, cyhStream) + with nogil: + err = cydriver.cuMemcpy2DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -30384,7 +30521,8 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL - err = cydriver.cuMemcpy3DAsync(cypCopy_ptr, cyhStream) + with nogil: + err = cydriver.cuMemcpy3DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -30423,7 +30561,8 @@ def 
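The async variants above take an explicit stream, and nothing is guaranteed complete until that stream is synchronized. A round-trip sketch (illustrative, assuming an initialized context):

    import numpy as np
    from cuda.bindings import driver

    src = np.arange(1024, dtype=np.float32)
    dst = np.empty_like(src)
    err, stream = driver.cuStreamCreate(0)
    err, dptr = driver.cuMemAlloc(src.nbytes)
    err, = driver.cuMemcpyHtoDAsync(dptr, src, src.nbytes, stream)
    err, = driver.cuMemcpyDtoHAsync(dst, dptr, src.nbytes, stream)
    err, = driver.cuStreamSynchronize(stream)  # both copies done after this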
cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream):
         phStream = int(CUstream(hStream))
         cyhStream = phStream
     cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL
-    err = cydriver.cuMemcpy3DPeerAsync(cypCopy_ptr, cyhStream)
+    with nogil:
+        err = cydriver.cuMemcpy3DPeerAsync(cypCopy_ptr, cyhStream)
     return (_dict_CUresult[err],)
 {{endif}}
@@ -30553,41 +30692,48 @@ def cuMemcpyBatchAsync(dsts : Optional[Tuple[CUdeviceptr] | List[CUdeviceptr]],
     if not all(isinstance(_x, (CUdeviceptr,)) for _x in dsts):
         raise TypeError("Argument 'dsts' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]")
     cdef cydriver.CUdeviceptr* cydsts = NULL
-    if len(dsts) > 0:
+    if len(dsts) > 1:
         cydsts = calloc(len(dsts), sizeof(cydriver.CUdeviceptr))
         if cydsts is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dsts)) + 'x' + str(sizeof(cydriver.CUdeviceptr)))
         else:
             for idx in range(len(dsts)):
                 cydsts[idx] = (dsts[idx])._pvt_ptr[0]
+    elif len(dsts) == 1:
+        cydsts = (dsts[0])._pvt_ptr
     cdef cydriver.CUdeviceptr* cysrcs = NULL
-    if len(srcs) > 0:
+    if len(srcs) > 1:
         cysrcs = calloc(len(srcs), sizeof(cydriver.CUdeviceptr))
         if cysrcs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(srcs)) + 'x' + str(sizeof(cydriver.CUdeviceptr)))
         else:
             for idx in range(len(srcs)):
                 cysrcs[idx] = (srcs[idx])._pvt_ptr[0]
+    elif len(srcs) == 1:
+        cysrcs = (srcs[0])._pvt_ptr
     cdef vector[size_t] cysizes = sizes
     if count > len(dsts): raise RuntimeError("List is too small: " + str(len(dsts)) + " < " + str(count))
     if count > len(srcs): raise RuntimeError("List is too small: " + str(len(srcs)) + " < " + str(count))
     if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count))
     cdef cydriver.CUmemcpyAttributes* cyattrs = NULL
-    if len(attrs) > 0:
+    if len(attrs) > 1:
         cyattrs = calloc(len(attrs), sizeof(cydriver.CUmemcpyAttributes))
         if cyattrs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(attrs)) + 'x' + str(sizeof(cydriver.CUmemcpyAttributes)))
         for idx in range(len(attrs)):
             string.memcpy(&cyattrs[idx], (attrs[idx])._pvt_ptr, sizeof(cydriver.CUmemcpyAttributes))
+    elif len(attrs) == 1:
+        cyattrs = (attrs[0])._pvt_ptr
     cdef vector[size_t] cyattrsIdxs = attrsIdxs
     if numAttrs > len(attrs): raise RuntimeError("List is too small: " + str(len(attrs)) + " < " + str(numAttrs))
     if numAttrs > len(attrsIdxs): raise RuntimeError("List is too small: " + str(len(attrsIdxs)) + " < " + str(numAttrs))
-    err = cydriver.cuMemcpyBatchAsync((dsts[0])._pvt_ptr if len(dsts) == 1 else cydsts, (srcs[0])._pvt_ptr if len(srcs) == 1 else cysrcs, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cyhStream)
-    if cydsts is not NULL:
+    with nogil:
+        err = cydriver.cuMemcpyBatchAsync(cydsts, cysrcs, cysizes.data(), count, cyattrs, cyattrsIdxs.data(), numAttrs, cyhStream)
+    if len(dsts) > 1 and cydsts is not NULL:
         free(cydsts)
-    if cysrcs is not NULL:
+    if len(srcs) > 1 and cysrcs is not NULL:
         free(cysrcs)
-    if cyattrs is not NULL:
+    if len(attrs) > 1 and cyattrs is not NULL:
         free(cyattrs)
     return (_dict_CUresult[err],)
 {{endif}}
@@ -30708,14 +30854,17 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[CUDA_MEMCPY3D_BA
         raise TypeError("Argument 'opList' is not instance of type (expected Tuple[cydriver.CUDA_MEMCPY3D_BATCH_OP,] or List[cydriver.CUDA_MEMCPY3D_BATCH_OP,]")
     if numOps > len(opList):
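With the `len(...) == 1` branches added above, a one-element batch borrows the wrapper object's pointer directly instead of calloc'ing a temporary array. A sketch of a single-pair call (illustrative; the `CUmemcpyAttributes` field and enum names are taken from the current bindings, and `dst_ptr`, `src_ptr`, `nbytes`, and `stream` are assumed to exist):

    from cuda.bindings import driver

    attrs = driver.CUmemcpyAttributes()
    attrs.srcAccessOrder = driver.CUmemcpySrcAccessOrder.CU_MEMCPY_SRC_ACCESS_ORDER_STREAM
    err, = driver.cuMemcpyBatchAsync(
        [dst_ptr], [src_ptr], [nbytes], 1,  # single copy: hits the no-calloc path
        [attrs], [0], 1, stream)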
raise RuntimeError("List is too small: " + str(len(opList)) + " < " + str(numOps)) cdef cydriver.CUDA_MEMCPY3D_BATCH_OP* cyopList = NULL - if len(opList) > 0: + if len(opList) > 1: cyopList = calloc(len(opList), sizeof(cydriver.CUDA_MEMCPY3D_BATCH_OP)) if cyopList is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(opList)) + 'x' + str(sizeof(cydriver.CUDA_MEMCPY3D_BATCH_OP))) for idx in range(len(opList)): string.memcpy(&cyopList[idx], (opList[idx])._pvt_ptr, sizeof(cydriver.CUDA_MEMCPY3D_BATCH_OP)) - err = cydriver.cuMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, flags, cyhStream) - if cyopList is not NULL: + elif len(opList) == 1: + cyopList = (opList[0])._pvt_ptr + with nogil: + err = cydriver.cuMemcpy3DBatchAsync(numOps, cyopList, flags, cyhStream) + if len(opList) > 1 and cyopList is not NULL: free(cyopList) return (_dict_CUresult[err],) {{endif}} @@ -30754,7 +30903,8 @@ def cuMemsetD8(dstDevice, unsigned char uc, size_t N): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD8(cydstDevice, uc, N) + with nogil: + err = cydriver.cuMemsetD8(cydstDevice, uc, N) return (_dict_CUresult[err],) {{endif}} @@ -30793,7 +30943,8 @@ def cuMemsetD16(dstDevice, unsigned short us, size_t N): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD16(cydstDevice, us, N) + with nogil: + err = cydriver.cuMemsetD16(cydstDevice, us, N) return (_dict_CUresult[err],) {{endif}} @@ -30832,7 +30983,8 @@ def cuMemsetD32(dstDevice, unsigned int ui, size_t N): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD32(cydstDevice, ui, N) + with nogil: + err = cydriver.cuMemsetD32(cydstDevice, ui, N) return (_dict_CUresult[err],) {{endif}} @@ -30878,7 +31030,8 @@ def cuMemsetD2D8(dstDevice, size_t dstPitch, unsigned char uc, size_t Width, siz else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D8(cydstDevice, dstPitch, uc, Width, Height) + with nogil: + err = cydriver.cuMemsetD2D8(cydstDevice, dstPitch, uc, Width, Height) return (_dict_CUresult[err],) {{endif}} @@ -30925,7 +31078,8 @@ def cuMemsetD2D16(dstDevice, size_t dstPitch, unsigned short us, size_t Width, s else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D16(cydstDevice, dstPitch, us, Width, Height) + with nogil: + err = cydriver.cuMemsetD2D16(cydstDevice, dstPitch, us, Width, Height) return (_dict_CUresult[err],) {{endif}} @@ -30972,7 +31126,8 @@ def cuMemsetD2D32(dstDevice, size_t dstPitch, unsigned int ui, size_t Width, siz else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D32(cydstDevice, dstPitch, ui, Width, Height) + with nogil: + err = cydriver.cuMemsetD2D32(cydstDevice, dstPitch, ui, Width, Height) return (_dict_CUresult[err],) {{endif}} @@ -31020,7 +31175,8 @@ def cuMemsetD8Async(dstDevice, unsigned char uc, size_t N, hStream): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD8Async(cydstDevice, uc, N, cyhStream) + with nogil: + err = cydriver.cuMemsetD8Async(cydstDevice, uc, N, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31069,7 +31225,8 @@ def cuMemsetD16Async(dstDevice, unsigned short us, size_t N, hStream): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD16Async(cydstDevice, us, N, cyhStream) + with 
nogil: + err = cydriver.cuMemsetD16Async(cydstDevice, us, N, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31118,7 +31275,8 @@ def cuMemsetD32Async(dstDevice, unsigned int ui, size_t N, hStream): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD32Async(cydstDevice, ui, N, cyhStream) + with nogil: + err = cydriver.cuMemsetD32Async(cydstDevice, ui, N, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31174,7 +31332,8 @@ def cuMemsetD2D8Async(dstDevice, size_t dstPitch, unsigned char uc, size_t Width else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D8Async(cydstDevice, dstPitch, uc, Width, Height, cyhStream) + with nogil: + err = cydriver.cuMemsetD2D8Async(cydstDevice, dstPitch, uc, Width, Height, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31231,7 +31390,8 @@ def cuMemsetD2D16Async(dstDevice, size_t dstPitch, unsigned short us, size_t Wid else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D16Async(cydstDevice, dstPitch, us, Width, Height, cyhStream) + with nogil: + err = cydriver.cuMemsetD2D16Async(cydstDevice, dstPitch, us, Width, Height, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31288,7 +31448,8 @@ def cuMemsetD2D32Async(dstDevice, size_t dstPitch, unsigned int ui, size_t Width else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - err = cydriver.cuMemsetD2D32Async(cydstDevice, dstPitch, ui, Width, Height, cyhStream) + with nogil: + err = cydriver.cuMemsetD2D32Async(cydstDevice, dstPitch, ui, Width, Height, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -31356,7 +31517,8 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]): """ cdef CUarray pHandle = CUarray() cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL - err = cydriver.cuArrayCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) + with nogil: + err = cydriver.cuArrayCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle) @@ -31398,7 +31560,8 @@ def cuArrayGetDescriptor(hArray): phArray = int(CUarray(hArray)) cyhArray = phArray cdef CUDA_ARRAY_DESCRIPTOR pArrayDescriptor = CUDA_ARRAY_DESCRIPTOR() - err = cydriver.cuArrayGetDescriptor(pArrayDescriptor._pvt_ptr, cyhArray) + with nogil: + err = cydriver.cuArrayGetDescriptor(pArrayDescriptor._pvt_ptr, cyhArray) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pArrayDescriptor) @@ -31453,7 +31616,8 @@ def cuArrayGetSparseProperties(array): parray = int(CUarray(array)) cyarray = parray cdef CUDA_ARRAY_SPARSE_PROPERTIES sparseProperties = CUDA_ARRAY_SPARSE_PROPERTIES() - err = cydriver.cuArrayGetSparseProperties(sparseProperties._pvt_ptr, cyarray) + with nogil: + err = cydriver.cuArrayGetSparseProperties(sparseProperties._pvt_ptr, cyarray) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], sparseProperties) @@ -31510,7 +31674,8 @@ def cuMipmappedArrayGetSparseProperties(mipmap): pmipmap = int(CUmipmappedArray(mipmap)) cymipmap = pmipmap cdef CUDA_ARRAY_SPARSE_PROPERTIES sparseProperties = CUDA_ARRAY_SPARSE_PROPERTIES() - err = cydriver.cuMipmappedArrayGetSparseProperties(sparseProperties._pvt_ptr, cymipmap) + with nogil: + err = cydriver.cuMipmappedArrayGetSparseProperties(sparseProperties._pvt_ptr, cymipmap) if err 
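A sketch for the stream-ordered memset wrappers above (illustrative, assuming an initialized context):

    from cuda.bindings import driver

    err, stream = driver.cuStreamCreate(0)
    err, dptr = driver.cuMemAlloc(4 * 1024)
    # Fill 1024 32-bit words with a pattern, ordered in `stream`.
    err, = driver.cuMemsetD32Async(dptr, 0xDEADBEEF, 1024, stream)
    err, = driver.cuStreamSynchronize(stream)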
!= cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], sparseProperties) @@ -31567,7 +31732,8 @@ def cuArrayGetMemoryRequirements(array, device): parray = int(CUarray(array)) cyarray = parray cdef CUDA_ARRAY_MEMORY_REQUIREMENTS memoryRequirements = CUDA_ARRAY_MEMORY_REQUIREMENTS() - err = cydriver.cuArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cyarray, cydevice) + with nogil: + err = cydriver.cuArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cyarray, cydevice) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], memoryRequirements) @@ -31625,7 +31791,8 @@ def cuMipmappedArrayGetMemoryRequirements(mipmap, device): pmipmap = int(CUmipmappedArray(mipmap)) cymipmap = pmipmap cdef CUDA_ARRAY_MEMORY_REQUIREMENTS memoryRequirements = CUDA_ARRAY_MEMORY_REQUIREMENTS() - err = cydriver.cuMipmappedArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cymipmap, cydevice) + with nogil: + err = cydriver.cuMipmappedArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cymipmap, cydevice) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], memoryRequirements) @@ -31680,7 +31847,8 @@ def cuArrayGetPlane(hArray, unsigned int planeIdx): phArray = int(CUarray(hArray)) cyhArray = phArray cdef CUarray pPlaneArray = CUarray() - err = cydriver.cuArrayGetPlane(pPlaneArray._pvt_ptr, cyhArray, planeIdx) + with nogil: + err = cydriver.cuArrayGetPlane(pPlaneArray._pvt_ptr, cyhArray, planeIdx) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pPlaneArray) @@ -31716,7 +31884,8 @@ def cuArrayDestroy(hArray): else: phArray = int(CUarray(hArray)) cyhArray = phArray - err = cydriver.cuArrayDestroy(cyhArray) + with nogil: + err = cydriver.cuArrayDestroy(cyhArray) return (_dict_CUresult[err],) {{endif}} @@ -31848,7 +32017,8 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]): """ cdef CUarray pHandle = CUarray() cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL - err = cydriver.cuArray3DCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) + with nogil: + err = cydriver.cuArray3DCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle) @@ -31894,7 +32064,8 @@ def cuArray3DGetDescriptor(hArray): phArray = int(CUarray(hArray)) cyhArray = phArray cdef CUDA_ARRAY3D_DESCRIPTOR pArrayDescriptor = CUDA_ARRAY3D_DESCRIPTOR() - err = cydriver.cuArray3DGetDescriptor(pArrayDescriptor._pvt_ptr, cyhArray) + with nogil: + err = cydriver.cuArray3DGetDescriptor(pArrayDescriptor._pvt_ptr, cyhArray) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pArrayDescriptor) @@ -32016,7 +32187,8 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO """ cdef CUmipmappedArray pHandle = CUmipmappedArray() cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc != None else NULL - err = cydriver.cuMipmappedArrayCreate(pHandle._pvt_ptr, cypMipmappedArrayDesc_ptr, numMipmapLevels) + with nogil: + err = cydriver.cuMipmappedArrayCreate(pHandle._pvt_ptr, cypMipmappedArrayDesc_ptr, numMipmapLevels) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle) @@ -32061,7 +32233,8 @@ def 
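A descriptor round-trip for the CUDA array wrappers above (illustrative):

    from cuda.bindings import driver

    desc = driver.CUDA_ARRAY3D_DESCRIPTOR()
    desc.Width, desc.Height, desc.Depth = 64, 64, 8
    desc.Format = driver.CUarray_format.CU_AD_FORMAT_FLOAT
    desc.NumChannels = 1
    desc.Flags = 0
    err, arr = driver.cuArray3DCreate(desc)
    err, back = driver.cuArray3DGetDescriptor(arr)  # mirrors `desc`
    err, = driver.cuArrayDestroy(arr)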
cuMipmappedArrayGetLevel(hMipmappedArray, unsigned int level): phMipmappedArray = int(CUmipmappedArray(hMipmappedArray)) cyhMipmappedArray = phMipmappedArray cdef CUarray pLevelArray = CUarray() - err = cydriver.cuMipmappedArrayGetLevel(pLevelArray._pvt_ptr, cyhMipmappedArray, level) + with nogil: + err = cydriver.cuMipmappedArrayGetLevel(pLevelArray._pvt_ptr, cyhMipmappedArray, level) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pLevelArray) @@ -32097,7 +32270,8 @@ def cuMipmappedArrayDestroy(hMipmappedArray): else: phMipmappedArray = int(CUmipmappedArray(hMipmappedArray)) cyhMipmappedArray = phMipmappedArray - err = cydriver.cuMipmappedArrayDestroy(cyhMipmappedArray) + with nogil: + err = cydriver.cuMipmappedArrayDestroy(cyhMipmappedArray) return (_dict_CUresult[err],) {{endif}} @@ -32176,7 +32350,8 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem cdef int handle = 0 cdef void* cyhandle_ptr = &handle cdef cydriver.CUmemRangeHandleType cyhandleType = handleType.value - err = cydriver.cuMemGetHandleForAddressRange(cyhandle_ptr, cydptr, size, cyhandleType, flags) + with nogil: + err = cydriver.cuMemGetHandleForAddressRange(cyhandle_ptr, cydptr, size, cyhandleType, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], handle) @@ -32260,7 +32435,8 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz cystream = pstream cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray != None else NULL cdef size_t errorIndex = 0 - err = cydriver.cuMemBatchDecompressAsync(cyparamsArray_ptr, count, flags, &errorIndex, cystream) + with nogil: + err = cydriver.cuMemBatchDecompressAsync(cyparamsArray_ptr, count, flags, &errorIndex, cystream) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], errorIndex) @@ -32312,7 +32488,8 @@ def cuMemAddressReserve(size_t size, size_t alignment, addr, unsigned long long paddr = int(CUdeviceptr(addr)) cyaddr = paddr cdef CUdeviceptr ptr = CUdeviceptr() - err = cydriver.cuMemAddressReserve(ptr._pvt_ptr, size, alignment, cyaddr, flags) + with nogil: + err = cydriver.cuMemAddressReserve(ptr._pvt_ptr, size, alignment, cyaddr, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], ptr) @@ -32352,7 +32529,8 @@ def cuMemAddressFree(ptr, size_t size): else: pptr = int(CUdeviceptr(ptr)) cyptr = pptr - err = cydriver.cuMemAddressFree(cyptr, size) + with nogil: + err = cydriver.cuMemAddressFree(cyptr, size) return (_dict_CUresult[err],) {{endif}} @@ -32442,7 +32620,8 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long """ cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL - err = cydriver.cuMemCreate(handle._pvt_ptr, size, cyprop_ptr, flags) + with nogil: + err = cydriver.cuMemCreate(handle._pvt_ptr, size, cyprop_ptr, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], handle) @@ -32487,7 +32666,8 @@ def cuMemRelease(handle): else: phandle = int(CUmemGenericAllocationHandle(handle)) cyhandle = phandle - err = cydriver.cuMemRelease(cyhandle) + with nogil: + err = cydriver.cuMemRelease(cyhandle) return (_dict_CUresult[err],) {{endif}} @@ -32569,7 +32749,8 @@ def cuMemMap(ptr, size_t size, size_t 
offset, handle, unsigned long long flags): else: pptr = int(CUdeviceptr(ptr)) cyptr = pptr - err = cydriver.cuMemMap(cyptr, size, offset, cyhandle, flags) + with nogil: + err = cydriver.cuMemMap(cyptr, size, offset, cyhandle, flags) return (_dict_CUresult[err],) {{endif}} @@ -32731,15 +32912,18 @@ def cuMemMapArrayAsync(mapInfoList : Optional[Tuple[CUarrayMapInfo] | List[CUarr if not all(isinstance(_x, (CUarrayMapInfo,)) for _x in mapInfoList): raise TypeError("Argument 'mapInfoList' is not instance of type (expected Tuple[cydriver.CUarrayMapInfo,] or List[cydriver.CUarrayMapInfo,]") cdef cydriver.CUarrayMapInfo* cymapInfoList = NULL - if len(mapInfoList) > 0: + if len(mapInfoList) > 1: cymapInfoList = calloc(len(mapInfoList), sizeof(cydriver.CUarrayMapInfo)) if cymapInfoList is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(mapInfoList)) + 'x' + str(sizeof(cydriver.CUarrayMapInfo))) for idx in range(len(mapInfoList)): string.memcpy(&cymapInfoList[idx], (mapInfoList[idx])._pvt_ptr, sizeof(cydriver.CUarrayMapInfo)) + elif len(mapInfoList) == 1: + cymapInfoList = (mapInfoList[0])._pvt_ptr if count > len(mapInfoList): raise RuntimeError("List is too small: " + str(len(mapInfoList)) + " < " + str(count)) - err = cydriver.cuMemMapArrayAsync((mapInfoList[0])._pvt_ptr if len(mapInfoList) == 1 else cymapInfoList, count, cyhStream) - if cymapInfoList is not NULL: + with nogil: + err = cydriver.cuMemMapArrayAsync(cymapInfoList, count, cyhStream) + if len(mapInfoList) > 1 and cymapInfoList is not NULL: free(cymapInfoList) return (_dict_CUresult[err],) {{endif}} @@ -32787,7 +32971,8 @@ def cuMemUnmap(ptr, size_t size): else: pptr = int(CUdeviceptr(ptr)) cyptr = pptr - err = cydriver.cuMemUnmap(cyptr, size) + with nogil: + err = cydriver.cuMemUnmap(cyptr, size) return (_dict_CUresult[err],) {{endif}} @@ -32848,15 +33033,18 @@ def cuMemSetAccess(ptr, size_t size, desc : Optional[Tuple[CUmemAccessDesc] | Li pptr = int(CUdeviceptr(ptr)) cyptr = pptr cdef cydriver.CUmemAccessDesc* cydesc = NULL - if len(desc) > 0: + if len(desc) > 1: cydesc = calloc(len(desc), sizeof(cydriver.CUmemAccessDesc)) if cydesc is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(desc)) + 'x' + str(sizeof(cydriver.CUmemAccessDesc))) for idx in range(len(desc)): string.memcpy(&cydesc[idx], (desc[idx])._pvt_ptr, sizeof(cydriver.CUmemAccessDesc)) + elif len(desc) == 1: + cydesc = (desc[0])._pvt_ptr if count > len(desc): raise RuntimeError("List is too small: " + str(len(desc)) + " < " + str(count)) - err = cydriver.cuMemSetAccess(cyptr, size, (desc[0])._pvt_ptr if len(desc) == 1 else cydesc, count) - if cydesc is not NULL: + with nogil: + err = cydriver.cuMemSetAccess(cyptr, size, cydesc, count) + if len(desc) > 1 and cydesc is not NULL: free(cydesc) return (_dict_CUresult[err],) {{endif}} @@ -32895,7 +33083,8 @@ def cuMemGetAccess(location : Optional[CUmemLocation], ptr): cyptr = pptr cdef unsigned long long flags = 0 cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL - err = cydriver.cuMemGetAccess(&flags, cylocation_ptr, cyptr) + with nogil: + err = cydriver.cuMemGetAccess(&flags, cylocation_ptr, cyptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], flags) @@ -32954,7 +33143,8 @@ def cuMemExportToShareableHandle(handle, handleType not None : CUmemAllocationHa cdef utils.HelperCUmemAllocationHandleType cyshareableHandle = utils.HelperCUmemAllocationHandleType(handleType) cdef void* 
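The virtual-memory-management entry points above compose into the usual create/reserve/map/set-access sequence. A minimal sketch for one allocation granule on device 0 (illustrative):

    from cuda.bindings import driver

    prop = driver.CUmemAllocationProp()
    prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    prop.location.id = 0
    err, gran = driver.cuMemGetAllocationGranularity(
        prop, driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM)
    err, handle = driver.cuMemCreate(gran, prop, 0)
    err, ptr = driver.cuMemAddressReserve(gran, 0, 0, 0)
    err, = driver.cuMemMap(ptr, gran, 0, handle, 0)
    access = driver.CUmemAccessDesc()
    access.location.type = prop.location.type
    access.location.id = prop.location.id
    access.flags = driver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
    err, = driver.cuMemSetAccess(ptr, gran, [access], 1)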
cyshareableHandle_ptr = cyshareableHandle.cptr cdef cydriver.CUmemAllocationHandleType cyhandleType = handleType.value - err = cydriver.cuMemExportToShareableHandle(cyshareableHandle_ptr, cyhandle, cyhandleType, flags) + with nogil: + err = cydriver.cuMemExportToShareableHandle(cyshareableHandle_ptr, cyhandle, cyhandleType, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyshareableHandle.pyObj()) @@ -33003,7 +33193,8 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca cyosHandle = utils.HelperInputVoidPtr(osHandle) cdef void* cyosHandle_ptr = cyosHandle.cptr cdef cydriver.CUmemAllocationHandleType cyshHandleType = shHandleType.value - err = cydriver.cuMemImportFromShareableHandle(handle._pvt_ptr, cyosHandle_ptr, cyshHandleType) + with nogil: + err = cydriver.cuMemImportFromShareableHandle(handle._pvt_ptr, cyosHandle_ptr, cyshHandleType) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], handle) @@ -33041,7 +33232,8 @@ def cuMemGetAllocationGranularity(prop : Optional[CUmemAllocationProp], option n cdef size_t granularity = 0 cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmemAllocationGranularity_flags cyoption = option.value - err = cydriver.cuMemGetAllocationGranularity(&granularity, cyprop_ptr, cyoption) + with nogil: + err = cydriver.cuMemGetAllocationGranularity(&granularity, cyprop_ptr, cyoption) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], granularity) @@ -33079,7 +33271,8 @@ def cuMemGetAllocationPropertiesFromHandle(handle): phandle = int(CUmemGenericAllocationHandle(handle)) cyhandle = phandle cdef CUmemAllocationProp prop = CUmemAllocationProp() - err = cydriver.cuMemGetAllocationPropertiesFromHandle(prop._pvt_ptr, cyhandle) + with nogil: + err = cydriver.cuMemGetAllocationPropertiesFromHandle(prop._pvt_ptr, cyhandle) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], prop) @@ -33119,7 +33312,8 @@ def cuMemRetainAllocationHandle(addr): cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() cyaddr = utils.HelperInputVoidPtr(addr) cdef void* cyaddr_ptr = cyaddr.cptr - err = cydriver.cuMemRetainAllocationHandle(handle._pvt_ptr, cyaddr_ptr) + with nogil: + err = cydriver.cuMemRetainAllocationHandle(handle._pvt_ptr, cyaddr_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], handle) @@ -33168,7 +33362,8 @@ def cuMemFreeAsync(dptr, hStream): else: pdptr = int(CUdeviceptr(dptr)) cydptr = pdptr - err = cydriver.cuMemFreeAsync(cydptr, cyhStream) + with nogil: + err = cydriver.cuMemFreeAsync(cydptr, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -33219,7 +33414,8 @@ def cuMemAllocAsync(size_t bytesize, hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef CUdeviceptr dptr = CUdeviceptr() - err = cydriver.cuMemAllocAsync(dptr._pvt_ptr, bytesize, cyhStream) + with nogil: + err = cydriver.cuMemAllocAsync(dptr._pvt_ptr, bytesize, cyhStream) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dptr) @@ -33270,7 +33466,8 @@ def cuMemPoolTrimTo(pool, size_t minBytesToKeep): else: ppool = int(CUmemoryPool(pool)) cypool = ppool - err = cydriver.cuMemPoolTrimTo(cypool, minBytesToKeep) + with nogil: + err = cydriver.cuMemPoolTrimTo(cypool, minBytesToKeep) return 
(_dict_CUresult[err],) {{endif}} @@ -33344,7 +33541,8 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value): cdef cydriver.CUmemPool_attribute cyattr = attr.value cdef utils.HelperCUmemPool_attribute cyvalue = utils.HelperCUmemPool_attribute(attr, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr - err = cydriver.cuMemPoolSetAttribute(cypool, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuMemPoolSetAttribute(cypool, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) {{endif}} @@ -33425,7 +33623,8 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute): cdef cydriver.CUmemPool_attribute cyattr = attr.value cdef utils.HelperCUmemPool_attribute cyvalue = utils.HelperCUmemPool_attribute(attr, 0, is_getter=True) cdef void* cyvalue_ptr = cyvalue.cptr - err = cydriver.cuMemPoolGetAttribute(cypool, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuMemPoolGetAttribute(cypool, cyattr, cyvalue_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyvalue.pyObj()) @@ -33468,15 +33667,18 @@ def cuMemPoolSetAccess(pool, map : Optional[Tuple[CUmemAccessDesc] | List[CUmemA ppool = int(CUmemoryPool(pool)) cypool = ppool cdef cydriver.CUmemAccessDesc* cymap = NULL - if len(map) > 0: + if len(map) > 1: cymap = calloc(len(map), sizeof(cydriver.CUmemAccessDesc)) if cymap is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(map)) + 'x' + str(sizeof(cydriver.CUmemAccessDesc))) for idx in range(len(map)): string.memcpy(&cymap[idx], (map[idx])._pvt_ptr, sizeof(cydriver.CUmemAccessDesc)) + elif len(map) == 1: + cymap = (map[0])._pvt_ptr if count > len(map): raise RuntimeError("List is too small: " + str(len(map)) + " < " + str(count)) - err = cydriver.cuMemPoolSetAccess(cypool, (map[0])._pvt_ptr if len(map) == 1 else cymap, count) - if cymap is not NULL: + with nogil: + err = cydriver.cuMemPoolSetAccess(cypool, cymap, count) + if len(map) > 1 and cymap is not NULL: free(cymap) return (_dict_CUresult[err],) {{endif}} @@ -33518,7 +33720,8 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]): cymemPool = pmemPool cdef cydriver.CUmemAccess_flags flags cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL - err = cydriver.cuMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) + with nogil: + err = cydriver.cuMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUmemAccess_flags(flags)) @@ -33601,7 +33804,8 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]): """ cdef CUmemoryPool pool = CUmemoryPool() cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL - err = cydriver.cuMemPoolCreate(pool._pvt_ptr, cypoolProps_ptr) + with nogil: + err = cydriver.cuMemPoolCreate(pool._pvt_ptr, cypoolProps_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool) @@ -33648,7 +33852,8 @@ def cuMemPoolDestroy(pool): else: ppool = int(CUmemoryPool(pool)) cypool = ppool - err = cydriver.cuMemPoolDestroy(cypool) + with nogil: + err = cydriver.cuMemPoolDestroy(cypool) return (_dict_CUresult[err],) {{endif}} @@ -33690,7 +33895,8 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None cdef CUmemoryPool pool_out = CUmemoryPool() cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef 
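A pool-attribute sketch for the wrappers above (illustrative; the release threshold keeps freed memory cached in the pool across synchronizations):

    from cuda.bindings import driver

    props = driver.CUmemPoolProps()
    props.allocType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    props.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    props.location.id = 0
    err, pool = driver.cuMemPoolCreate(props)
    err, = driver.cuMemPoolSetAttribute(
        pool, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
        driver.cuuint64_t(32 << 20))  # cache up to 32 MiB
    err, stream = driver.cuStreamCreate(0)
    err, dptr = driver.cuMemAllocFromPoolAsync(1 << 20, pool, stream)
    err, = driver.cuMemFreeAsync(dptr, stream)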
cydriver.CUmemAllocationType cytypename = typename.value - err = cydriver.cuMemGetDefaultMemPool(pool_out._pvt_ptr, cylocation_ptr, cytypename) + with nogil: + err = cydriver.cuMemGetDefaultMemPool(pool_out._pvt_ptr, cylocation_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool_out) @@ -33742,7 +33948,8 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme cdef CUmemoryPool pool = CUmemoryPool() cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value - err = cydriver.cuMemGetMemPool(pool._pvt_ptr, cylocation_ptr, cytypename) + with nogil: + err = cydriver.cuMemGetMemPool(pool._pvt_ptr, cylocation_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool) @@ -33809,7 +34016,8 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme cypool = ppool cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value - err = cydriver.cuMemSetMemPool(cylocation_ptr, cytypename, cypool) + with nogil: + err = cydriver.cuMemSetMemPool(cylocation_ptr, cytypename, cypool) return (_dict_CUresult[err],) {{endif}} @@ -33865,7 +34073,8 @@ def cuMemAllocFromPoolAsync(size_t bytesize, pool, hStream): ppool = int(CUmemoryPool(pool)) cypool = ppool cdef CUdeviceptr dptr = CUdeviceptr() - err = cydriver.cuMemAllocFromPoolAsync(dptr._pvt_ptr, bytesize, cypool, cyhStream) + with nogil: + err = cydriver.cuMemAllocFromPoolAsync(dptr._pvt_ptr, bytesize, cypool, cyhStream) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dptr) @@ -33921,7 +34130,8 @@ def cuMemPoolExportToShareableHandle(pool, handleType not None : CUmemAllocation cdef utils.HelperCUmemAllocationHandleType cyhandle_out = utils.HelperCUmemAllocationHandleType(handleType) cdef void* cyhandle_out_ptr = cyhandle_out.cptr cdef cydriver.CUmemAllocationHandleType cyhandleType = handleType.value - err = cydriver.cuMemPoolExportToShareableHandle(cyhandle_out_ptr, cypool, cyhandleType, flags) + with nogil: + err = cydriver.cuMemPoolExportToShareableHandle(cyhandle_out_ptr, cypool, cyhandleType, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyhandle_out.pyObj()) @@ -33969,7 +34179,8 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca cyhandle = utils.HelperInputVoidPtr(handle) cdef void* cyhandle_ptr = cyhandle.cptr cdef cydriver.CUmemAllocationHandleType cyhandleType = handleType.value - err = cydriver.cuMemPoolImportFromShareableHandle(pool_out._pvt_ptr, cyhandle_ptr, cyhandleType, flags) + with nogil: + err = cydriver.cuMemPoolImportFromShareableHandle(pool_out._pvt_ptr, cyhandle_ptr, cyhandleType, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pool_out) @@ -34011,7 +34222,8 @@ def cuMemPoolExportPointer(ptr): pptr = int(CUdeviceptr(ptr)) cyptr = pptr cdef CUmemPoolPtrExportData shareData_out = CUmemPoolPtrExportData() - err = cydriver.cuMemPoolExportPointer(shareData_out._pvt_ptr, cyptr) + with nogil: + err = cydriver.cuMemPoolExportPointer(shareData_out._pvt_ptr, cyptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], shareData_out) @@ -34063,7 
+34275,8 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]): cypool = ppool cdef CUdeviceptr ptr_out = CUdeviceptr() cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = shareData._pvt_ptr if shareData != None else NULL - err = cydriver.cuMemPoolImportPointer(ptr_out._pvt_ptr, cypool, cyshareData_ptr) + with nogil: + err = cydriver.cuMemPoolImportPointer(ptr_out._pvt_ptr, cypool, cyshareData_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], ptr_out) @@ -34123,7 +34336,8 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]): """ cdef CUmemGenericAllocationHandle mcHandle = CUmemGenericAllocationHandle() cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL - err = cydriver.cuMulticastCreate(mcHandle._pvt_ptr, cyprop_ptr) + with nogil: + err = cydriver.cuMulticastCreate(mcHandle._pvt_ptr, cyprop_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], mcHandle) @@ -34180,7 +34394,8 @@ def cuMulticastAddDevice(mcHandle, dev): else: pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) cymcHandle = pmcHandle - err = cydriver.cuMulticastAddDevice(cymcHandle, cydev) + with nogil: + err = cydriver.cuMulticastAddDevice(cymcHandle, cydev) return (_dict_CUresult[err],) {{endif}} @@ -34259,7 +34474,8 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s else: pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) cymcHandle = pmcHandle - err = cydriver.cuMulticastBindMem(cymcHandle, mcOffset, cymemHandle, memOffset, size, flags) + with nogil: + err = cydriver.cuMulticastBindMem(cymcHandle, mcOffset, cymemHandle, memOffset, size, flags) return (_dict_CUresult[err],) {{endif}} @@ -34334,7 +34550,8 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned else: pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) cymcHandle = pmcHandle - err = cydriver.cuMulticastBindAddr(cymcHandle, mcOffset, cymemptr, size, flags) + with nogil: + err = cydriver.cuMulticastBindAddr(cymcHandle, mcOffset, cymemptr, size, flags) return (_dict_CUresult[err],) {{endif}} @@ -34392,7 +34609,8 @@ def cuMulticastUnbind(mcHandle, dev, size_t mcOffset, size_t size): else: pmcHandle = int(CUmemGenericAllocationHandle(mcHandle)) cymcHandle = pmcHandle - err = cydriver.cuMulticastUnbind(cymcHandle, cydev, mcOffset, size) + with nogil: + err = cydriver.cuMulticastUnbind(cymcHandle, cydev, mcOffset, size) return (_dict_CUresult[err],) {{endif}} @@ -34428,7 +34646,8 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not cdef size_t granularity = 0 cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmulticastGranularity_flags cyoption = option.value - err = cydriver.cuMulticastGetGranularity(&granularity, cyprop_ptr, cyoption) + with nogil: + err = cydriver.cuMulticastGetGranularity(&granularity, cyprop_ptr, cyoption) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], granularity) @@ -34640,7 +34859,8 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr): cdef utils.HelperCUpointer_attribute cydata = utils.HelperCUpointer_attribute(attribute, 0, is_getter=True) cdef void* cydata_ptr = cydata.cptr cdef cydriver.CUpointer_attribute cyattribute = attribute.value - err = cydriver.cuPointerGetAttribute(cydata_ptr, cyattribute, cyptr) + with nogil: + err = 
cydriver.cuPointerGetAttribute(cydata_ptr, cyattribute, cyptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cydata.pyObj()) @@ -34763,7 +34983,8 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, else: pdevPtr = int(CUdeviceptr(devPtr)) cydevPtr = pdevPtr - err = cydriver.cuMemPrefetchAsync(cydevPtr, count, location._pvt_ptr[0], flags, cyhStream) + with nogil: + err = cydriver.cuMemPrefetchAsync(cydevPtr, count, location._pvt_ptr[0], flags, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -34967,7 +35188,8 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n pdevPtr = int(CUdeviceptr(devPtr)) cydevPtr = pdevPtr cdef cydriver.CUmem_advise cyadvice = advice.value - err = cydriver.cuMemAdvise(cydevPtr, count, cyadvice, location._pvt_ptr[0]) + with nogil: + err = cydriver.cuMemAdvise(cydevPtr, count, cyadvice, location._pvt_ptr[0]) return (_dict_CUresult[err],) {{endif}} @@ -35058,30 +35280,35 @@ def cuMemPrefetchBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List[CUdevicep if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") cdef cydriver.CUdeviceptr* cydptrs = NULL - if len(dptrs) > 0: + if len(dptrs) > 1: cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) if cydptrs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) else: for idx in range(len(dptrs)): cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + elif len(dptrs) == 1: + cydptrs = (dptrs[0])._pvt_ptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) cdef cydriver.CUmemLocation* cyprefetchLocs = NULL - if len(prefetchLocs) > 0: + if len(prefetchLocs) > 1: cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cydriver.CUmemLocation)) if cyprefetchLocs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cydriver.CUmemLocation))) for idx in range(len(prefetchLocs)): string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cydriver.CUmemLocation)) + elif len(prefetchLocs) == 1: + cyprefetchLocs = (prefetchLocs[0])._pvt_ptr cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) - err = cydriver.cuMemPrefetchBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) - if cydptrs is not NULL: + with nogil: + err = cydriver.cuMemPrefetchBatchAsync(cydptrs, cysizes.data(), count, cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) + if len(dptrs) > 1 and cydptrs is not NULL: free(cydptrs) - if cyprefetchLocs is not NULL: + if len(prefetchLocs) > 1 and cyprefetchLocs is not NULL: free(cyprefetchLocs) return (_dict_CUresult[err],) {{endif}} @@ -35152,18 +35379,21 @@ def 
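The managed-memory hints above take a `CUmemLocation` describing the target. A prefetch/advise sketch (illustrative):

    from cuda.bindings import driver

    loc = driver.CUmemLocation()
    loc.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    loc.id = 0
    err, stream = driver.cuStreamCreate(0)
    err, dptr = driver.cuMemAllocManaged(
        1 << 20, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL)
    err, = driver.cuMemAdvise(
        dptr, 1 << 20, driver.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, loc)
    err, = driver.cuMemPrefetchAsync(dptr, 1 << 20, loc, 0, stream)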
cuMemDiscardBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List[CUdevicept if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") cdef cydriver.CUdeviceptr* cydptrs = NULL - if len(dptrs) > 0: + if len(dptrs) > 1: cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) if cydptrs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) else: for idx in range(len(dptrs)): cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + elif len(dptrs) == 1: + cydptrs = (dptrs[0])._pvt_ptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) - err = cydriver.cuMemDiscardBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, flags, cyhStream) - if cydptrs is not NULL: + with nogil: + err = cydriver.cuMemDiscardBatchAsync(cydptrs, cysizes.data(), count, flags, cyhStream) + if len(dptrs) > 1 and cydptrs is not NULL: free(cydptrs) return (_dict_CUresult[err],) {{endif}} @@ -35263,30 +35493,35 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[Tuple[CUdeviceptr] | List if not all(isinstance(_x, (CUdeviceptr,)) for _x in dptrs): raise TypeError("Argument 'dptrs' is not instance of type (expected Tuple[cydriver.CUdeviceptr,] or List[cydriver.CUdeviceptr,]") cdef cydriver.CUdeviceptr* cydptrs = NULL - if len(dptrs) > 0: + if len(dptrs) > 1: cydptrs = calloc(len(dptrs), sizeof(cydriver.CUdeviceptr)) if cydptrs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dptrs)) + 'x' + str(sizeof(cydriver.CUdeviceptr))) else: for idx in range(len(dptrs)): cydptrs[idx] = (dptrs[idx])._pvt_ptr[0] + elif len(dptrs) == 1: + cydptrs = (dptrs[0])._pvt_ptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) cdef cydriver.CUmemLocation* cyprefetchLocs = NULL - if len(prefetchLocs) > 0: + if len(prefetchLocs) > 1: cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cydriver.CUmemLocation)) if cyprefetchLocs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cydriver.CUmemLocation))) for idx in range(len(prefetchLocs)): string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cydriver.CUmemLocation)) + elif len(prefetchLocs) == 1: + cyprefetchLocs = (prefetchLocs[0])._pvt_ptr cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) - err = cydriver.cuMemDiscardAndPrefetchBatchAsync((dptrs[0])._pvt_ptr if len(dptrs) == 1 else cydptrs, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) - if cydptrs is not NULL: + with nogil: + err = cydriver.cuMemDiscardAndPrefetchBatchAsync(cydptrs, cysizes.data(), count, cyprefetchLocs, 
cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cyhStream) + if len(dptrs) > 1 and cydptrs is not NULL: free(cydptrs) - if cyprefetchLocs is not NULL: + if len(prefetchLocs) > 1 and cyprefetchLocs is not NULL: free(cyprefetchLocs) return (_dict_CUresult[err],) {{endif}} @@ -35442,7 +35677,8 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att cdef utils.HelperCUmem_range_attribute cydata = utils.HelperCUmem_range_attribute(attribute, dataSize) cdef void* cydata_ptr = cydata.cptr cdef cydriver.CUmem_range_attribute cyattribute = attribute.value - err = cydriver.cuMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr, count) + with nogil: + err = cydriver.cuMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr, count) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cydata.pyObj()) @@ -35528,7 +35764,8 @@ def cuMemRangeGetAttributes(dataSizes : Tuple[int] | List[int], attributes : Opt cdef vector[cydriver.CUmem_range_attribute] cyattributes = [pyattributes.value for pyattributes in (attributes)] if numAttributes > len(dataSizes): raise RuntimeError("List is too small: " + str(len(dataSizes)) + " < " + str(numAttributes)) if numAttributes > len(attributes): raise RuntimeError("List is too small: " + str(len(attributes)) + " < " + str(numAttributes)) - err = cydriver.cuMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr, count) + with nogil: + err = cydriver.cuMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr, count) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], [obj.pyObj() for obj in pylist]) @@ -35585,7 +35822,8 @@ def cuPointerSetAttribute(value, attribute not None : CUpointer_attribute, ptr): cdef utils.HelperCUpointer_attribute cyvalue = utils.HelperCUpointer_attribute(attribute, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr cdef cydriver.CUpointer_attribute cyattribute = attribute.value - err = cydriver.cuPointerSetAttribute(cyvalue_ptr, cyattribute, cyptr) + with nogil: + err = cydriver.cuPointerSetAttribute(cyvalue_ptr, cyattribute, cyptr) return (_dict_CUresult[err],) {{endif}} @@ -35676,7 +35914,8 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[Tup pylist = [utils.HelperCUpointer_attribute(pyattributes, 0, is_getter=True) for pyattributes in attributes] cdef utils.InputVoidPtrPtrHelper voidStarHelperdata = utils.InputVoidPtrPtrHelper(pylist) cdef void** cyvoidStarHelper_ptr = voidStarHelperdata.cptr - err = cydriver.cuPointerGetAttributes(numAttributes, cyattributes.data(), cyvoidStarHelper_ptr, cyptr) + with nogil: + err = cydriver.cuPointerGetAttributes(numAttributes, cyattributes.data(), cyvoidStarHelper_ptr, cyptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], [obj.pyObj() for obj in pylist]) @@ -35717,7 +35956,8 @@ def cuStreamCreate(unsigned int Flags): :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice` :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags` """ cdef CUstream phStream = CUstream() - err = 
cydriver.cuStreamCreate(phStream._pvt_ptr, Flags) + with nogil: + err = cydriver.cuStreamCreate(phStream._pvt_ptr, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phStream) @@ -35771,7 +36011,8 @@ def cuStreamCreateWithPriority(unsigned int flags, int priority): In the current implementation, only compute kernels launched in priority streams are affected by the stream's priority. Stream priorities have no effect on host-to-device and device-to-host memory operations. """ cdef CUstream phStream = CUstream() - err = cydriver.cuStreamCreateWithPriority(phStream._pvt_ptr, flags, priority) + with nogil: + err = cydriver.cuStreamCreateWithPriority(phStream._pvt_ptr, flags, priority) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phStream) @@ -35818,7 +36059,8 @@ def cuStreamGetPriority(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef int priority = 0 - err = cydriver.cuStreamGetPriority(cyhStream, &priority) + with nogil: + err = cydriver.cuStreamGetPriority(cyhStream, &priority) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], priority) @@ -35857,7 +36099,8 @@ def cuStreamGetDevice(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef CUdevice device = CUdevice() - err = cydriver.cuStreamGetDevice(cyhStream, device._pvt_ptr) + with nogil: + err = cydriver.cuStreamGetDevice(cyhStream, device._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], device) @@ -35901,7 +36144,8 @@ def cuStreamGetFlags(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef unsigned int flags = 0 - err = cydriver.cuStreamGetFlags(cyhStream, &flags) + with nogil: + err = cydriver.cuStreamGetFlags(cyhStream, &flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], flags) @@ -35957,7 +36201,8 @@ def cuStreamGetId(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef unsigned long long streamId = 0 - err = cydriver.cuStreamGetId(cyhStream, &streamId) + with nogil: + err = cydriver.cuStreamGetId(cyhStream, &streamId) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], streamId) @@ -36021,7 +36266,8 @@ def cuStreamGetCtx(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef CUcontext pctx = CUcontext() - err = cydriver.cuStreamGetCtx(cyhStream, pctx._pvt_ptr) + with nogil: + err = cydriver.cuStreamGetCtx(cyhStream, pctx._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pctx) @@ -36098,7 +36344,8 @@ def cuStreamGetCtx_v2(hStream): cyhStream = phStream cdef CUcontext pCtx = CUcontext() cdef CUgreenCtx pGreenCtx = CUgreenCtx() - err = cydriver.cuStreamGetCtx_v2(cyhStream, pCtx._pvt_ptr, pGreenCtx._pvt_ptr) + with nogil: + err = cydriver.cuStreamGetCtx_v2(cyhStream, pCtx._pvt_ptr, pGreenCtx._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], pCtx, pGreenCtx) @@ -36158,7 +36405,8 @@ def cuStreamWaitEvent(hStream, hEvent, unsigned int Flags): else: phStream = int(CUstream(hStream)) cyhStream = phStream - err = cydriver.cuStreamWaitEvent(cyhStream, cyhEvent, Flags) + with nogil: + err = cydriver.cuStreamWaitEvent(cyhStream, cyhEvent, Flags) return (_dict_CUresult[err],) {{endif}} @@ -36274,7 +36522,6 @@ def cuStreamAddCallback(hStream, 
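A priority-stream sketch for the creation wrappers above (illustrative; numerically lower values mean higher priority, and the valid range comes from the current context):

    from cuda.bindings import driver

    err, least, greatest = driver.cuCtxGetStreamPriorityRange()
    err, stream = driver.cuStreamCreateWithPriority(
        driver.CUstream_flags.CU_STREAM_NON_BLOCKING, greatest)
    err, prio = driver.cuStreamGetPriority(stream)  # reports `greatest`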
callback, userData, unsigned int flags): with nogil: err = cydriver.cuStreamAddCallback(cyhStream, cuStreamCallbackWrapper, cbData, flags) - if err != cydriver.CUDA_SUCCESS: free(cbData) return (_dict_CUresult[err],) @@ -36331,7 +36578,8 @@ def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode): phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUstreamCaptureMode cymode = mode.value - err = cydriver.cuStreamBeginCapture(cyhStream, cymode) + with nogil: + err = cydriver.cuStreamBeginCapture(cyhStream, cymode) return (_dict_CUresult[err],) {{endif}} @@ -36412,27 +36660,32 @@ def cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies : Optional[Tuple[C phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr cdef cydriver.CUgraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = calloc(len(dependencyData), sizeof(cydriver.CUgraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) + elif len(dependencyData) == 1: + cydependencyData = (dependencyData[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) if numDependencies > len(dependencyData): raise RuntimeError("List is too small: " + str(len(dependencyData)) + " < " + str(numDependencies)) cdef cydriver.CUstreamCaptureMode cymode = mode.value - err = cydriver.cuStreamBeginCaptureToGraph(cyhStream, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cymode) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuStreamBeginCaptureToGraph(cyhStream, cyhGraph, cydependencies, cydependencyData, numDependencies, cymode) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) return (_dict_CUresult[err],) {{endif}} @@ -36504,7 +36757,8 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode): :py:obj:`~.cuStreamBeginCapture` """ cdef cydriver.CUstreamCaptureMode cymode = mode.value - err = cydriver.cuThreadExchangeStreamCaptureMode(&cymode) + with nogil: + err = cydriver.cuThreadExchangeStreamCaptureMode(&cymode) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUstreamCaptureMode(cymode)) @@ -36551,7 +36805,8 @@ def cuStreamEndCapture(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef CUgraph phGraph = CUgraph() - err = cydriver.cuStreamEndCapture(cyhStream, phGraph._pvt_ptr) + with nogil: + err = 
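The second recurring change, visible above in cuStreamBeginCaptureToGraph and repeated through the graph APIs below, is in how Python sequences are marshalled into C arrays: `calloc`/`free` are now reserved for two or more elements, while a single-element sequence simply borrows the internal pointer of its only member. A shape sketch of the idiom rather than a compilable unit; it assumes the generated module's cimports (`cydriver`, wrapper classes exposing `_pvt_ptr`), and `cuSomeGraphCall` is a hypothetical nogil driver entry point:

    from libc.stdlib cimport calloc, free

    cdef cydriver.CUresult err
    cdef size_t n = len(nodes)
    cdef cydriver.CUgraphNode* cynodes = NULL
    if n > 1:
        # Two or more handles: copy them into a temporary C array.
        cynodes = <cydriver.CUgraphNode*>calloc(n, sizeof(cydriver.CUgraphNode))
        if cynodes is NULL:
            raise MemoryError()
        for idx in range(n):
            cynodes[idx] = (nodes[idx])._pvt_ptr[0]
    elif n == 1:
        # Exactly one handle: point at its existing storage; nothing to allocate.
        cynodes = (nodes[0])._pvt_ptr
    with nogil:
        err = cuSomeGraphCall(cynodes, n)  # hypothetical driver call
    # Free only what this wrapper allocated; the n == 1 pointer is borrowed.
    if n > 1 and cynodes is not NULL:
        free(cynodes)

The borrowed pointer stays valid because the single wrapper object is kept alive by the argument sequence for the duration of the call; the payoff is that the common one-element case no longer pays for a heap allocation and copy.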
cydriver.cuStreamEndCapture(cyhStream, phGraph._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraph) @@ -36613,7 +36868,8 @@ def cuStreamIsCapturing(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUstreamCaptureStatus captureStatus - err = cydriver.cuStreamIsCapturing(cyhStream, &captureStatus) + with nogil: + err = cydriver.cuStreamIsCapturing(cyhStream, &captureStatus) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUstreamCaptureStatus(captureStatus)) @@ -36708,7 +36964,8 @@ def cuStreamGetCaptureInfo(hStream): cdef const cydriver.CUgraphEdgeData* cyedgeData_out = NULL pyedgeData_out = [] cdef size_t numDependencies_out = 0 - err = cydriver.cuStreamGetCaptureInfo(cyhStream, &captureStatus_out, id_out._pvt_ptr, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) + with nogil: + err = cydriver.cuStreamGetCaptureInfo(cyhStream, &captureStatus_out, id_out._pvt_ptr, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) if CUresult(err) == CUresult(0): pydependencies_out = [CUgraphNode(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] if CUresult(err) == CUresult(0): @@ -36778,24 +37035,29 @@ def cuStreamUpdateCaptureDependencies(hStream, dependencies : Optional[Tuple[CUg phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr cdef cydriver.CUgraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = calloc(len(dependencyData), sizeof(cydriver.CUgraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuStreamUpdateCaptureDependencies(cyhStream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) - if cydependencies is not NULL: + elif len(dependencyData) == 1: + cydependencyData = (dependencyData[0])._pvt_ptr + with nogil: + err = cydriver.cuStreamUpdateCaptureDependencies(cyhStream, cydependencies, cydependencyData, numDependencies, flags) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) return (_dict_CUresult[err],) {{endif}} @@ -36908,7 +37170,8 @@ def cuStreamAttachMemAsync(hStream, dptr, size_t length, unsigned int flags): else: phStream = int(CUstream(hStream)) cyhStream = phStream - err = cydriver.cuStreamAttachMemAsync(cyhStream, cydptr, length, flags) + with nogil: + err = cydriver.cuStreamAttachMemAsync(cyhStream, cydptr, length, flags) return 
(_dict_CUresult[err],) {{endif}} @@ -36948,7 +37211,8 @@ def cuStreamQuery(hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - err = cydriver.cuStreamQuery(cyhStream) + with nogil: + err = cydriver.cuStreamQuery(cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -36987,7 +37251,8 @@ def cuStreamSynchronize(hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - err = cydriver.cuStreamSynchronize(cyhStream) + with nogil: + err = cydriver.cuStreamSynchronize(cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -37027,7 +37292,8 @@ def cuStreamDestroy(hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - err = cydriver.cuStreamDestroy(cyhStream) + with nogil: + err = cydriver.cuStreamDestroy(cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -37072,7 +37338,8 @@ def cuStreamCopyAttributes(dst, src): else: pdst = int(CUstream(dst)) cydst = pdst - err = cydriver.cuStreamCopyAttributes(cydst, cysrc) + with nogil: + err = cydriver.cuStreamCopyAttributes(cydst, cysrc) return (_dict_CUresult[err],) {{endif}} @@ -37113,7 +37380,8 @@ def cuStreamGetAttribute(hStream, attr not None : CUstreamAttrID): cyhStream = phStream cdef cydriver.CUstreamAttrID cyattr = attr.value cdef CUstreamAttrValue value_out = CUstreamAttrValue() - err = cydriver.cuStreamGetAttribute(cyhStream, cyattr, value_out._pvt_ptr) + with nogil: + err = cydriver.cuStreamGetAttribute(cyhStream, cyattr, value_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], value_out) @@ -37157,7 +37425,8 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option cyhStream = phStream cdef cydriver.CUstreamAttrID cyattr = attr.value cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL - err = cydriver.cuStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) {{endif}} @@ -37205,7 +37474,8 @@ def cuEventCreate(unsigned int Flags): :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventCreateWithFlags` """ cdef CUevent phEvent = CUevent() - err = cydriver.cuEventCreate(phEvent._pvt_ptr, Flags) + with nogil: + err = cydriver.cuEventCreate(phEvent._pvt_ptr, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phEvent) @@ -37266,7 +37536,8 @@ def cuEventRecord(hEvent, hStream): else: phEvent = int(CUevent(hEvent)) cyhEvent = phEvent - err = cydriver.cuEventRecord(cyhEvent, cyhStream) + with nogil: + err = cydriver.cuEventRecord(cyhEvent, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -37335,7 +37606,8 @@ def cuEventRecordWithFlags(hEvent, hStream, unsigned int flags): else: phEvent = int(CUevent(hEvent)) cyhEvent = phEvent - err = cydriver.cuEventRecordWithFlags(cyhEvent, cyhStream, flags) + with nogil: + err = cydriver.cuEventRecordWithFlags(cyhEvent, cyhStream, flags) return (_dict_CUresult[err],) {{endif}} @@ -37379,7 +37651,8 @@ def cuEventQuery(hEvent): else: phEvent = int(CUevent(hEvent)) cyhEvent = phEvent - err = cydriver.cuEventQuery(cyhEvent) + with nogil: + err = cydriver.cuEventQuery(cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -37422,7 +37695,8 @@ def cuEventSynchronize(hEvent): else: phEvent = int(CUevent(hEvent)) cyhEvent = phEvent - err = 
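Caller-visible signatures do not change anywhere in this patch: each wrapper still returns a tuple whose first element is the CUresult, followed by any outputs, and result-only wrappers return a 1-tuple. What changes is behavior under threads; a blocking call such as cuEventSynchronize no longer holds the GIL while it waits. A caller-side sketch of the event APIs shown in these hunks, assuming cuInit has run, a context is current, and `stream` is an existing CUstream:

    from cuda.bindings import driver

    def time_region(stream):
        # Each call returns (CUresult, outputs...); 1-tuples unpack with `err,`.
        err, start = driver.cuEventCreate(0)
        err, stop = driver.cuEventCreate(0)
        err, = driver.cuEventRecord(start, stream)
        # enqueue work on `stream` here
        err, = driver.cuEventRecord(stop, stream)
        err, = driver.cuEventSynchronize(stop)  # blocks without holding the GIL
        err, ms = driver.cuEventElapsedTime(start, stop)
        assert err == driver.CUresult.CUDA_SUCCESS
        err, = driver.cuEventDestroy(start)
        err, = driver.cuEventDestroy(stop)
        return ms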
cydriver.cuEventSynchronize(cyhEvent) + with nogil: + err = cydriver.cuEventSynchronize(cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -37462,7 +37736,8 @@ def cuEventDestroy(hEvent): else: phEvent = int(CUevent(hEvent)) cyhEvent = phEvent - err = cydriver.cuEventDestroy(cyhEvent) + with nogil: + err = cydriver.cuEventDestroy(cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -37532,7 +37807,8 @@ def cuEventElapsedTime(hStart, hEnd): phStart = int(CUevent(hStart)) cyhStart = phStart cdef float pMilliseconds = 0 - err = cydriver.cuEventElapsedTime(&pMilliseconds, cyhStart, cyhEnd) + with nogil: + err = cydriver.cuEventElapsedTime(&pMilliseconds, cyhStart, cyhEnd) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pMilliseconds) @@ -37697,7 +37973,8 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ """ cdef CUexternalMemory extMem_out = CUexternalMemory() cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL - err = cydriver.cuImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) + with nogil: + err = cydriver.cuImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], extMem_out) @@ -37766,7 +38043,8 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_ cyextMem = pextMem cdef CUdeviceptr devPtr = CUdeviceptr() cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL - err = cydriver.cuExternalMemoryGetMappedBuffer(devPtr._pvt_ptr, cyextMem, cybufferDesc_ptr) + with nogil: + err = cydriver.cuExternalMemoryGetMappedBuffer(devPtr._pvt_ptr, cyextMem, cybufferDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], devPtr) @@ -37841,7 +38119,8 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E cyextMem = pextMem cdef CUmipmappedArray mipmap = CUmipmappedArray() cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL - err = cydriver.cuExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) + with nogil: + err = cydriver.cuExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], mipmap) @@ -37880,7 +38159,8 @@ def cuDestroyExternalMemory(extMem): else: pextMem = int(CUexternalMemory(extMem)) cyextMem = pextMem - err = cydriver.cuDestroyExternalMemory(cyextMem) + with nogil: + err = cydriver.cuDestroyExternalMemory(cyextMem) return (_dict_CUresult[err],) {{endif}} @@ -38028,7 +38308,8 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H """ cdef CUexternalSemaphore extSem_out = CUexternalSemaphore() cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL - err = cydriver.cuImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) + with nogil: + err = cydriver.cuImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], extSem_out) @@ -38158,26 +38439,31 @@ def cuSignalExternalSemaphoresAsync(extSemArray : 
Optional[Tuple[CUexternalSemap if not all(isinstance(_x, (CUexternalSemaphore,)) for _x in extSemArray): raise TypeError("Argument 'extSemArray' is not instance of type (expected Tuple[cydriver.CUexternalSemaphore,] or List[cydriver.CUexternalSemaphore,]") cdef cydriver.CUexternalSemaphore* cyextSemArray = NULL - if len(extSemArray) > 0: + if len(extSemArray) > 1: cyextSemArray = calloc(len(extSemArray), sizeof(cydriver.CUexternalSemaphore)) if cyextSemArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(extSemArray)) + 'x' + str(sizeof(cydriver.CUexternalSemaphore))) else: for idx in range(len(extSemArray)): cyextSemArray[idx] = (extSemArray[idx])._pvt_ptr[0] + elif len(extSemArray) == 1: + cyextSemArray = (extSemArray[0])._pvt_ptr cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* cyparamsArray = NULL - if len(paramsArray) > 0: + if len(paramsArray) > 1: cyparamsArray = calloc(len(paramsArray), sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS)) if cyparamsArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramsArray)) + 'x' + str(sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS))) for idx in range(len(paramsArray)): string.memcpy(&cyparamsArray[idx], (paramsArray[idx])._pvt_ptr, sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS)) + elif len(paramsArray) == 1: + cyparamsArray = (paramsArray[0])._pvt_ptr if numExtSems > len(extSemArray): raise RuntimeError("List is too small: " + str(len(extSemArray)) + " < " + str(numExtSems)) if numExtSems > len(paramsArray): raise RuntimeError("List is too small: " + str(len(paramsArray)) + " < " + str(numExtSems)) - err = cydriver.cuSignalExternalSemaphoresAsync((extSemArray[0])._pvt_ptr if len(extSemArray) == 1 else cyextSemArray, (paramsArray[0])._pvt_ptr if len(paramsArray) == 1 else cyparamsArray, numExtSems, cystream) - if cyextSemArray is not NULL: + with nogil: + err = cydriver.cuSignalExternalSemaphoresAsync(cyextSemArray, cyparamsArray, numExtSems, cystream) + if len(extSemArray) > 1 and cyextSemArray is not NULL: free(cyextSemArray) - if cyparamsArray is not NULL: + if len(paramsArray) > 1 and cyparamsArray is not NULL: free(cyparamsArray) return (_dict_CUresult[err],) {{endif}} @@ -38283,26 +38569,31 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[Tuple[CUexternalSemapho if not all(isinstance(_x, (CUexternalSemaphore,)) for _x in extSemArray): raise TypeError("Argument 'extSemArray' is not instance of type (expected Tuple[cydriver.CUexternalSemaphore,] or List[cydriver.CUexternalSemaphore,]") cdef cydriver.CUexternalSemaphore* cyextSemArray = NULL - if len(extSemArray) > 0: + if len(extSemArray) > 1: cyextSemArray = calloc(len(extSemArray), sizeof(cydriver.CUexternalSemaphore)) if cyextSemArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(extSemArray)) + 'x' + str(sizeof(cydriver.CUexternalSemaphore))) else: for idx in range(len(extSemArray)): cyextSemArray[idx] = (extSemArray[idx])._pvt_ptr[0] + elif len(extSemArray) == 1: + cyextSemArray = (extSemArray[0])._pvt_ptr cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* cyparamsArray = NULL - if len(paramsArray) > 0: + if len(paramsArray) > 1: cyparamsArray = calloc(len(paramsArray), sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS)) if cyparamsArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramsArray)) + 'x' + str(sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS))) for idx in range(len(paramsArray)): 
string.memcpy(&cyparamsArray[idx], (paramsArray[idx])._pvt_ptr, sizeof(cydriver.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS)) + elif len(paramsArray) == 1: + cyparamsArray = (paramsArray[0])._pvt_ptr if numExtSems > len(extSemArray): raise RuntimeError("List is too small: " + str(len(extSemArray)) + " < " + str(numExtSems)) if numExtSems > len(paramsArray): raise RuntimeError("List is too small: " + str(len(paramsArray)) + " < " + str(numExtSems)) - err = cydriver.cuWaitExternalSemaphoresAsync((extSemArray[0])._pvt_ptr if len(extSemArray) == 1 else cyextSemArray, (paramsArray[0])._pvt_ptr if len(paramsArray) == 1 else cyparamsArray, numExtSems, cystream) - if cyextSemArray is not NULL: + with nogil: + err = cydriver.cuWaitExternalSemaphoresAsync(cyextSemArray, cyparamsArray, numExtSems, cystream) + if len(extSemArray) > 1 and cyextSemArray is not NULL: free(cyextSemArray) - if cyparamsArray is not NULL: + if len(paramsArray) > 1 and cyparamsArray is not NULL: free(cyparamsArray) return (_dict_CUresult[err],) {{endif}} @@ -38339,7 +38630,8 @@ def cuDestroyExternalSemaphore(extSem): else: pextSem = int(CUexternalSemaphore(extSem)) cyextSem = pextSem - err = cydriver.cuDestroyExternalSemaphore(cyextSem) + with nogil: + err = cydriver.cuDestroyExternalSemaphore(cyextSem) return (_dict_CUresult[err],) {{endif}} @@ -38412,7 +38704,8 @@ def cuStreamWaitValue32(stream, addr, value, unsigned int flags): else: pstream = int(CUstream(stream)) cystream = pstream - err = cydriver.cuStreamWaitValue32(cystream, cyaddr, cyvalue, flags) + with nogil: + err = cydriver.cuStreamWaitValue32(cystream, cyaddr, cyvalue, flags) return (_dict_CUresult[err],) {{endif}} @@ -38483,7 +38776,8 @@ def cuStreamWaitValue64(stream, addr, value, unsigned int flags): else: pstream = int(CUstream(stream)) cystream = pstream - err = cydriver.cuStreamWaitValue64(cystream, cyaddr, cyvalue, flags) + with nogil: + err = cydriver.cuStreamWaitValue64(cystream, cyaddr, cyvalue, flags) return (_dict_CUresult[err],) {{endif}} @@ -38544,7 +38838,8 @@ def cuStreamWriteValue32(stream, addr, value, unsigned int flags): else: pstream = int(CUstream(stream)) cystream = pstream - err = cydriver.cuStreamWriteValue32(cystream, cyaddr, cyvalue, flags) + with nogil: + err = cydriver.cuStreamWriteValue32(cystream, cyaddr, cyvalue, flags) return (_dict_CUresult[err],) {{endif}} @@ -38607,7 +38902,8 @@ def cuStreamWriteValue64(stream, addr, value, unsigned int flags): else: pstream = int(CUstream(stream)) cystream = pstream - err = cydriver.cuStreamWriteValue64(cystream, cyaddr, cyvalue, flags) + with nogil: + err = cydriver.cuStreamWriteValue64(cystream, cyaddr, cyvalue, flags) return (_dict_CUresult[err],) {{endif}} @@ -38669,14 +38965,17 @@ def cuStreamBatchMemOp(stream, unsigned int count, paramArray : Optional[Tuple[C cystream = pstream if count > len(paramArray): raise RuntimeError("List is too small: " + str(len(paramArray)) + " < " + str(count)) cdef cydriver.CUstreamBatchMemOpParams* cyparamArray = NULL - if len(paramArray) > 0: + if len(paramArray) > 1: cyparamArray = calloc(len(paramArray), sizeof(cydriver.CUstreamBatchMemOpParams)) if cyparamArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramArray)) + 'x' + str(sizeof(cydriver.CUstreamBatchMemOpParams))) for idx in range(len(paramArray)): string.memcpy(&cyparamArray[idx], (paramArray[idx])._pvt_ptr, sizeof(cydriver.CUstreamBatchMemOpParams)) - err = cydriver.cuStreamBatchMemOp(cystream, count, (paramArray[0])._pvt_ptr if len(paramArray) == 1 else 
cyparamArray, flags) - if cyparamArray is not NULL: + elif len(paramArray) == 1: + cyparamArray = (paramArray[0])._pvt_ptr + with nogil: + err = cydriver.cuStreamBatchMemOp(cystream, count, cyparamArray, flags) + if len(paramArray) > 1 and cyparamArray is not NULL: free(cyparamArray) return (_dict_CUresult[err],) {{endif}} @@ -38810,7 +39109,8 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc): cyhfunc = phfunc cdef int pi = 0 cdef cydriver.CUfunction_attribute cyattrib = attrib.value - err = cydriver.cuFuncGetAttribute(&pi, cyattrib, cyhfunc) + with nogil: + err = cydriver.cuFuncGetAttribute(&pi, cyattrib, cyhfunc) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pi) @@ -38905,7 +39205,8 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value) phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc cdef cydriver.CUfunction_attribute cyattrib = attrib.value - err = cydriver.cuFuncSetAttribute(cyhfunc, cyattrib, value) + with nogil: + err = cydriver.cuFuncSetAttribute(cyhfunc, cyattrib, value) return (_dict_CUresult[err],) {{endif}} @@ -38970,7 +39271,8 @@ def cuFuncSetCacheConfig(hfunc, config not None : CUfunc_cache): phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc cdef cydriver.CUfunc_cache cyconfig = config.value - err = cydriver.cuFuncSetCacheConfig(cyhfunc, cyconfig) + with nogil: + err = cydriver.cuFuncSetCacheConfig(cyhfunc, cyconfig) return (_dict_CUresult[err],) {{endif}} @@ -39011,7 +39313,8 @@ def cuFuncGetModule(hfunc): phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc cdef CUmodule hmod = CUmodule() - err = cydriver.cuFuncGetModule(hmod._pvt_ptr, cyhfunc) + with nogil: + err = cydriver.cuFuncGetModule(hmod._pvt_ptr, cyhfunc) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], hmod) @@ -39052,7 +39355,8 @@ def cuFuncGetName(hfunc): phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc cdef const char* name = NULL - err = cydriver.cuFuncGetName(&name, cyhfunc) + with nogil: + err = cydriver.cuFuncGetName(&name, cyhfunc) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], name if name != NULL else None) @@ -39106,7 +39410,8 @@ def cuFuncGetParamInfo(func, size_t paramIndex): cyfunc = pfunc cdef size_t paramOffset = 0 cdef size_t paramSize = 0 - err = cydriver.cuFuncGetParamInfo(cyfunc, paramIndex, &paramOffset, &paramSize) + with nogil: + err = cydriver.cuFuncGetParamInfo(cyfunc, paramIndex, &paramOffset, &paramSize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], paramOffset, paramSize) @@ -39145,7 +39450,8 @@ def cuFuncIsLoaded(function): pfunction = int(CUfunction(function)) cyfunction = pfunction cdef cydriver.CUfunctionLoadingState state - err = cydriver.cuFuncIsLoaded(&state, cyfunction) + with nogil: + err = cydriver.cuFuncIsLoaded(&state, cyfunction) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUfunctionLoadingState(state)) @@ -39182,7 +39488,8 @@ def cuFuncLoad(function): else: pfunction = int(CUfunction(function)) cyfunction = pfunction - err = cydriver.cuFuncLoad(cyfunction) + with nogil: + err = cydriver.cuFuncLoad(cyfunction) return (_dict_CUresult[err],) {{endif}} @@ -39313,7 +39620,9 @@ def cuLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int pf = int(CUfunction(f)) cyf = pf cykernelParams = utils.HelperKernelParams(kernelParams) - err = cydriver.cuLaunchKernel(cyf, gridDimX,
gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, cyhStream, cykernelParams.ckernelParams, extra) + cdef void** cykernelParams_ptr = cykernelParams.ckernelParams + with nogil: + err = cydriver.cuLaunchKernel(cyf, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, cyhStream, cykernelParams_ptr, extra) return (_dict_CUresult[err],) {{endif}} @@ -39552,7 +39861,9 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt cyf = pf cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL cykernelParams = utils.HelperKernelParams(kernelParams) - err = cydriver.cuLaunchKernelEx(cyconfig_ptr, cyf, cykernelParams.ckernelParams, extra) + cdef void** cykernelParams_ptr = cykernelParams.ckernelParams + with nogil: + err = cydriver.cuLaunchKernelEx(cyconfig_ptr, cyf, cykernelParams_ptr, extra) return (_dict_CUresult[err],) {{endif}} @@ -39663,7 +39974,9 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u pf = int(CUfunction(f)) cyf = pf cykernelParams = utils.HelperKernelParams(kernelParams) - err = cydriver.cuLaunchCooperativeKernel(cyf, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, cyhStream, cykernelParams.ckernelParams) + cdef void** cykernelParams_ptr = cykernelParams.ckernelParams + with nogil: + err = cydriver.cuLaunchCooperativeKernel(cyf, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, cyhStream, cykernelParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -39825,15 +40138,18 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[Tuple[CUDA_ if not all(isinstance(_x, (CUDA_LAUNCH_PARAMS,)) for _x in launchParamsList): raise TypeError("Argument 'launchParamsList' is not instance of type (expected Tuple[cydriver.CUDA_LAUNCH_PARAMS,] or List[cydriver.CUDA_LAUNCH_PARAMS,]") cdef cydriver.CUDA_LAUNCH_PARAMS* cylaunchParamsList = NULL - if len(launchParamsList) > 0: + if len(launchParamsList) > 1: cylaunchParamsList = calloc(len(launchParamsList), sizeof(cydriver.CUDA_LAUNCH_PARAMS)) if cylaunchParamsList is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(launchParamsList)) + 'x' + str(sizeof(cydriver.CUDA_LAUNCH_PARAMS))) for idx in range(len(launchParamsList)): string.memcpy(&cylaunchParamsList[idx], (launchParamsList[idx])._pvt_ptr, sizeof(cydriver.CUDA_LAUNCH_PARAMS)) + elif len(launchParamsList) == 1: + cylaunchParamsList = (launchParamsList[0])._pvt_ptr if numDevices > len(launchParamsList): raise RuntimeError("List is too small: " + str(len(launchParamsList)) + " < " + str(numDevices)) - err = cydriver.cuLaunchCooperativeKernelMultiDevice((launchParamsList[0])._pvt_ptr if len(launchParamsList) == 1 else cylaunchParamsList, numDevices, flags) - if cylaunchParamsList is not NULL: + with nogil: + err = cydriver.cuLaunchCooperativeKernelMultiDevice(cylaunchParamsList, numDevices, flags) + if len(launchParamsList) > 1 and cylaunchParamsList is not NULL: free(cylaunchParamsList) return (_dict_CUresult[err],) {{endif}} @@ -39942,7 +40258,6 @@ def cuLaunchHostFunc(hStream, fn, userData): with nogil: err = cydriver.cuLaunchHostFunc(cyhStream, cuHostCallbackWrapper, cbData) - if err != cydriver.CUDA_SUCCESS: free(cbData) return (_dict_CUresult[err],) @@ -39987,7 +40302,8 @@ def cuFuncSetBlockShape(hfunc, int x, int y, int z): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuFuncSetBlockShape(cyhfunc, x, y, z) + with nogil: + err = 
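The launch wrappers above gain one line beyond the nogil block itself: `cykernelParams` is held as an untyped Python reference, so `cykernelParams.ckernelParams` is a GIL-requiring attribute lookup and cannot appear inside `with nogil:`. The pointer is therefore read into a `cdef void**` local first, and only raw C data crosses into the block. Annotated with the same names as the generated code (comments added here for explanation):

    cykernelParams = utils.HelperKernelParams(kernelParams)  # plain Python object
    # Attribute access on a Python object needs the GIL, so capture the raw
    # pointer before entering the nogil block.
    cdef void** cykernelParams_ptr = cykernelParams.ckernelParams
    with nogil:
        err = cydriver.cuLaunchKernel(cyf, gridDimX, gridDimY, gridDimZ,
                                      blockDimX, blockDimY, blockDimZ,
                                      sharedMemBytes, cyhStream,
                                      cykernelParams_ptr, extra)

The same hoist appears in cuLaunchKernelEx and cuLaunchCooperativeKernel; `extra` needs no treatment because it is already typed `void_ptr` in the wrapper signature.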
cydriver.cuFuncSetBlockShape(cyhfunc, x, y, z) return (_dict_CUresult[err],) {{endif}} @@ -40027,7 +40343,8 @@ def cuFuncSetSharedSize(hfunc, unsigned int numbytes): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuFuncSetSharedSize(cyhfunc, numbytes) + with nogil: + err = cydriver.cuFuncSetSharedSize(cyhfunc, numbytes) return (_dict_CUresult[err],) {{endif}} @@ -40066,7 +40383,8 @@ def cuParamSetSize(hfunc, unsigned int numbytes): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuParamSetSize(cyhfunc, numbytes) + with nogil: + err = cydriver.cuParamSetSize(cyhfunc, numbytes) return (_dict_CUresult[err],) {{endif}} @@ -40108,7 +40426,8 @@ def cuParamSeti(hfunc, int offset, unsigned int value): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuParamSeti(cyhfunc, offset, value) + with nogil: + err = cydriver.cuParamSeti(cyhfunc, offset, value) return (_dict_CUresult[err],) {{endif}} @@ -40150,7 +40469,8 @@ def cuParamSetf(hfunc, int offset, float value): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuParamSetf(cyhfunc, offset, value) + with nogil: + err = cydriver.cuParamSetf(cyhfunc, offset, value) return (_dict_CUresult[err],) {{endif}} @@ -40196,7 +40516,8 @@ def cuParamSetv(hfunc, int offset, ptr, unsigned int numbytes): cyhfunc = phfunc cyptr = utils.HelperInputVoidPtr(ptr) cdef void* cyptr_ptr = cyptr.cptr - err = cydriver.cuParamSetv(cyhfunc, offset, cyptr_ptr, numbytes) + with nogil: + err = cydriver.cuParamSetv(cyhfunc, offset, cyptr_ptr, numbytes) return (_dict_CUresult[err],) {{endif}} @@ -40246,7 +40567,8 @@ def cuLaunch(f): else: pf = int(CUfunction(f)) cyf = pf - err = cydriver.cuLaunch(cyf) + with nogil: + err = cydriver.cuLaunch(cyf) return (_dict_CUresult[err],) {{endif}} @@ -40300,7 +40622,8 @@ def cuLaunchGrid(f, int grid_width, int grid_height): else: pf = int(CUfunction(f)) cyf = pf - err = cydriver.cuLaunchGrid(cyf, grid_width, grid_height) + with nogil: + err = cydriver.cuLaunchGrid(cyf, grid_width, grid_height) return (_dict_CUresult[err],) {{endif}} @@ -40370,7 +40693,8 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream): else: pf = int(CUfunction(f)) cyf = pf - err = cydriver.cuLaunchGridAsync(cyf, grid_width, grid_height, cyhStream) + with nogil: + err = cydriver.cuLaunchGridAsync(cyf, grid_width, grid_height, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -40418,7 +40742,8 @@ def cuParamSetTexRef(hfunc, int texunit, hTexRef): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - err = cydriver.cuParamSetTexRef(cyhfunc, texunit, cyhTexRef) + with nogil: + err = cydriver.cuParamSetTexRef(cyhfunc, texunit, cyhTexRef) return (_dict_CUresult[err],) {{endif}} @@ -40490,7 +40815,8 @@ def cuFuncSetSharedMemConfig(hfunc, config not None : CUsharedconfig): phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc cdef cydriver.CUsharedconfig cyconfig = config.value - err = cydriver.cuFuncSetSharedMemConfig(cyhfunc, cyconfig) + with nogil: + err = cydriver.cuFuncSetSharedMemConfig(cyhfunc, cyconfig) return (_dict_CUresult[err],) {{endif}} @@ -40519,7 +40845,8 @@ def cuGraphCreate(unsigned int flags): :py:obj:`~.cuGraphAddChildGraphNode`, :py:obj:`~.cuGraphAddEmptyNode`, :py:obj:`~.cuGraphAddKernelNode`, :py:obj:`~.cuGraphAddHostNode`, :py:obj:`~.cuGraphAddMemcpyNode`, :py:obj:`~.cuGraphAddMemsetNode`, :py:obj:`~.cuGraphInstantiate`, :py:obj:`~.cuGraphDestroy`, :py:obj:`~.cuGraphGetNodes`, :py:obj:`~.cuGraphGetRootNodes`, :py:obj:`~.cuGraphGetEdges`, 
:py:obj:`~.cuGraphClone` """ cdef CUgraph phGraph = CUgraph() - err = cydriver.cuGraphCreate(phGraph._pvt_ptr, flags) + with nogil: + err = cydriver.cuGraphCreate(phGraph._pvt_ptr, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraph) @@ -40629,17 +40956,20 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddKernelNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddKernelNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -40688,7 +41018,8 @@ def cuGraphKernelNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_KERNEL_NODE_PARAMS nodeParams = CUDA_KERNEL_NODE_PARAMS() - err = cydriver.cuGraphKernelNodeGetParams(cyhNode, nodeParams._pvt_ptr) + with nogil: + err = cydriver.cuGraphKernelNodeGetParams(cyhNode, nodeParams._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], nodeParams) @@ -40727,7 +41058,8 @@ def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphKernelNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphKernelNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -40802,17 +41134,20 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL - err 
= cydriver.cuGraphAddMemcpyNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cycopyParams_ptr, cyctx) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddMemcpyNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cycopyParams_ptr, cyctx) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -40852,7 +41187,8 @@ def cuGraphMemcpyNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_MEMCPY3D nodeParams = CUDA_MEMCPY3D() - err = cydriver.cuGraphMemcpyNodeGetParams(cyhNode, nodeParams._pvt_ptr) + with nogil: + err = cydriver.cuGraphMemcpyNodeGetParams(cyhNode, nodeParams._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], nodeParams) @@ -40891,7 +41227,8 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphMemcpyNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphMemcpyNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -40956,17 +41293,20 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL - err = cydriver.cuGraphAddMemsetNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cymemsetParams_ptr, cyctx) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddMemsetNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cymemsetParams_ptr, cyctx) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41006,7 +41346,8 @@ def cuGraphMemsetNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_MEMSET_NODE_PARAMS nodeParams = CUDA_MEMSET_NODE_PARAMS() - err = cydriver.cuGraphMemsetNodeGetParams(cyhNode, nodeParams._pvt_ptr) + with nogil: + err = cydriver.cuGraphMemsetNodeGetParams(cyhNode, nodeParams._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], nodeParams) @@ -41045,7 +41386,8 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = 
cydriver.cuGraphMemsetNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphMemsetNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -41100,17 +41442,20 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddHostNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddHostNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41150,7 +41495,8 @@ def cuGraphHostNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_HOST_NODE_PARAMS nodeParams = CUDA_HOST_NODE_PARAMS() - err = cydriver.cuGraphHostNodeGetParams(cyhNode, nodeParams._pvt_ptr) + with nogil: + err = cydriver.cuGraphHostNodeGetParams(cyhNode, nodeParams._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], nodeParams) @@ -41189,7 +41535,8 @@ def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS] phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphHostNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphHostNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -41255,16 +41602,19 @@ def cuGraphAddChildGraphNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuGraphAddChildGraphNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cychildGraph) - if cydependencies is not NULL: + with nogil: + err = 
cydriver.cuGraphAddChildGraphNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cychildGraph) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41309,7 +41659,8 @@ def cuGraphChildGraphNodeGetGraph(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUgraph phGraph = CUgraph() - err = cydriver.cuGraphChildGraphNodeGetGraph(cyhNode, phGraph._pvt_ptr) + with nogil: + err = cydriver.cuGraphChildGraphNodeGetGraph(cyhNode, phGraph._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraph) @@ -41367,16 +41718,19 @@ def cuGraphAddEmptyNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Lis cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuGraphAddEmptyNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddEmptyNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41442,16 +41796,19 @@ def cuGraphAddEventRecordNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuGraphAddEventRecordNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cyevent) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddEventRecordNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cyevent) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41491,7 +41848,8 @@ def cuGraphEventRecordNodeGetEvent(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUevent event_out = CUevent() - err = cydriver.cuGraphEventRecordNodeGetEvent(cyhNode, event_out._pvt_ptr) + with nogil: + err = 
cydriver.cuGraphEventRecordNodeGetEvent(cyhNode, event_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], event_out) @@ -41537,7 +41895,8 @@ def cuGraphEventRecordNodeSetEvent(hNode, event): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - err = cydriver.cuGraphEventRecordNodeSetEvent(cyhNode, cyevent) + with nogil: + err = cydriver.cuGraphEventRecordNodeSetEvent(cyhNode, cyevent) return (_dict_CUresult[err],) {{endif}} @@ -41602,16 +41961,19 @@ def cuGraphAddEventWaitNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuGraphAddEventWaitNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cyevent) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddEventWaitNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cyevent) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41651,7 +42013,8 @@ def cuGraphEventWaitNodeGetEvent(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUevent event_out = CUevent() - err = cydriver.cuGraphEventWaitNodeGetEvent(cyhNode, event_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphEventWaitNodeGetEvent(cyhNode, event_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], event_out) @@ -41697,7 +42060,8 @@ def cuGraphEventWaitNodeSetEvent(hNode, event): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - err = cydriver.cuGraphEventWaitNodeSetEvent(cyhNode, cyevent) + with nogil: + err = cydriver.cuGraphEventWaitNodeSetEvent(cyhNode, cyevent) return (_dict_CUresult[err],) {{endif}} @@ -41753,17 +42117,20 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[Tuple cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddExternalSemaphoresSignalNode(phGraphNode._pvt_ptr, cyhGraph, 
(dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddExternalSemaphoresSignalNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41809,7 +42176,8 @@ def cuGraphExternalSemaphoresSignalNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS params_out = CUDA_EXT_SEM_SIGNAL_NODE_PARAMS() - err = cydriver.cuGraphExternalSemaphoresSignalNodeGetParams(cyhNode, params_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphExternalSemaphoresSignalNodeGetParams(cyhNode, params_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], params_out) @@ -41849,7 +42217,8 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -41905,17 +42274,20 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[Tuple[C cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddExternalSemaphoresWaitNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddExternalSemaphoresWaitNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -41961,7 +42333,8 @@ def cuGraphExternalSemaphoresWaitNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_EXT_SEM_WAIT_NODE_PARAMS params_out = CUDA_EXT_SEM_WAIT_NODE_PARAMS() - err = cydriver.cuGraphExternalSemaphoresWaitNodeGetParams(cyhNode, params_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphExternalSemaphoresWaitNodeGetParams(cyhNode, params_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], params_out) @@ -42001,7 +42374,8 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA phNode 
= int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -42060,17 +42434,20 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddBatchMemOpNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddBatchMemOpNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -42115,7 +42492,8 @@ def cuGraphBatchMemOpNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_BATCH_MEM_OP_NODE_PARAMS nodeParams_out = CUDA_BATCH_MEM_OP_NODE_PARAMS() - err = cydriver.cuGraphBatchMemOpNodeGetParams(cyhNode, nodeParams_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphBatchMemOpNodeGetParams(cyhNode, nodeParams_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], nodeParams_out) @@ -42157,7 +42535,8 @@ def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_O phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphBatchMemOpNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphBatchMemOpNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -42227,7 +42606,8 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[ phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecBatchMemOpNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecBatchMemOpNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -42323,17 +42703,20 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if 
len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddMemAllocNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddMemAllocNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -42376,7 +42759,8 @@ def cuGraphMemAllocNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUDA_MEM_ALLOC_NODE_PARAMS params_out = CUDA_MEM_ALLOC_NODE_PARAMS() - err = cydriver.cuGraphMemAllocNodeGetParams(cyhNode, params_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphMemAllocNodeGetParams(cyhNode, params_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], params_out) @@ -42459,16 +42843,19 @@ def cuGraphAddMemFreeNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | L cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - err = cydriver.cuGraphAddMemFreeNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, numDependencies, cydptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddMemFreeNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cydptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -42508,7 +42895,8 @@ def cuGraphMemFreeNodeGetParams(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef CUdeviceptr dptr_out = CUdeviceptr() - err = cydriver.cuGraphMemFreeNodeGetParams(cyhNode, dptr_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphMemFreeNodeGetParams(cyhNode, dptr_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dptr_out) @@ -42546,7 +42934,8 @@ def cuDeviceGraphMemTrim(device): else: pdevice = int(CUdevice(device)) cydevice = pdevice - err = cydriver.cuDeviceGraphMemTrim(cydevice) + 
with nogil: + err = cydriver.cuDeviceGraphMemTrim(cydevice) return (_dict_CUresult[err],) {{endif}} @@ -42602,7 +42991,8 @@ def cuDeviceGetGraphMemAttribute(device, attr not None : CUgraphMem_attribute): cdef cydriver.CUgraphMem_attribute cyattr = attr.value cdef utils.HelperCUgraphMem_attribute cyvalue = utils.HelperCUgraphMem_attribute(attr, 0, is_getter=True) cdef void* cyvalue_ptr = cyvalue.cptr - err = cydriver.cuDeviceGetGraphMemAttribute(cydevice, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuDeviceGetGraphMemAttribute(cydevice, cyattr, cyvalue_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyvalue.pyObj()) @@ -42653,7 +43043,8 @@ def cuDeviceSetGraphMemAttribute(device, attr not None : CUgraphMem_attribute, v cdef cydriver.CUgraphMem_attribute cyattr = attr.value cdef utils.HelperCUgraphMem_attribute cyvalue = utils.HelperCUgraphMem_attribute(attr, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr - err = cydriver.cuDeviceSetGraphMemAttribute(cydevice, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuDeviceSetGraphMemAttribute(cydevice, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) {{endif}} @@ -42700,7 +43091,8 @@ def cuGraphClone(originalGraph): poriginalGraph = int(CUgraph(originalGraph)) cyoriginalGraph = poriginalGraph cdef CUgraph phGraphClone = CUgraph() - err = cydriver.cuGraphClone(phGraphClone._pvt_ptr, cyoriginalGraph) + with nogil: + err = cydriver.cuGraphClone(phGraphClone._pvt_ptr, cyoriginalGraph) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraphClone) @@ -42756,7 +43148,8 @@ def cuGraphNodeFindInClone(hOriginalNode, hClonedGraph): phOriginalNode = int(CUgraphNode(hOriginalNode)) cyhOriginalNode = phOriginalNode cdef CUgraphNode phNode = CUgraphNode() - err = cydriver.cuGraphNodeFindInClone(phNode._pvt_ptr, cyhOriginalNode, cyhClonedGraph) + with nogil: + err = cydriver.cuGraphNodeFindInClone(phNode._pvt_ptr, cyhOriginalNode, cyhClonedGraph) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phNode) @@ -42795,7 +43188,8 @@ def cuGraphNodeGetType(hNode): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUgraphNodeType typename - err = cydriver.cuGraphNodeGetType(cyhNode, &typename) + with nogil: + err = cydriver.cuGraphNodeGetType(cyhNode, &typename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUgraphNodeType(typename)) @@ -42849,7 +43243,8 @@ def cuGraphGetNodes(hGraph, size_t numNodes = 0): cynodes = <cydriver.CUgraphNode*>calloc(_graph_length, sizeof(cydriver.CUgraphNode)) if cynodes is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - err = cydriver.cuGraphGetNodes(cyhGraph, cynodes, &numNodes) + with nogil: + err = cydriver.cuGraphGetNodes(cyhGraph, cynodes, &numNodes) if CUresult(err) == CUresult(0): pynodes = [CUgraphNode(init_value=cynodes[idx]) for idx in range(_graph_length)] if cynodes is not NULL: @@ -42907,7 +43302,8 @@ def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0): cyrootNodes = <cydriver.CUgraphNode*>calloc(_graph_length, sizeof(cydriver.CUgraphNode)) if cyrootNodes is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphNode))) - err = cydriver.cuGraphGetRootNodes(cyhGraph, cyrootNodes, &numRootNodes) + with nogil: + err = cydriver.cuGraphGetRootNodes(cyhGraph, cyrootNodes, 
&numRootNodes) if CUresult(err) == CUresult(0): pyrootNodes = [CUgraphNode(init_value=cyrootNodes[idx]) for idx in range(_graph_length)] if cyrootNodes is not NULL: @@ -42988,7 +43384,8 @@ def cuGraphGetEdges(hGraph, size_t numEdges = 0): cyedgeData = <cydriver.CUgraphEdgeData*>calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphGetEdges(cyhGraph, cyfrom_, cyto, cyedgeData, &numEdges) + with nogil: + err = cydriver.cuGraphGetEdges(cyhGraph, cyfrom_, cyto, cyedgeData, &numEdges) if CUresult(err) == CUresult(0): pyfrom_ = [CUgraphNode(init_value=cyfrom_[idx]) for idx in range(_graph_length)] if cyfrom_ is not NULL: @@ -43068,7 +43465,8 @@ def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0): cyedgeData = <cydriver.CUgraphEdgeData*>calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphNodeGetDependencies(cyhNode, cydependencies, cyedgeData, &numDependencies) + with nogil: + err = cydriver.cuGraphNodeGetDependencies(cyhNode, cydependencies, cyedgeData, &numDependencies) if CUresult(err) == CUresult(0): pydependencies = [CUgraphNode(init_value=cydependencies[idx]) for idx in range(_graph_length)] if cydependencies is not NULL: @@ -43144,7 +43542,8 @@ def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0): cyedgeData = <cydriver.CUgraphEdgeData*>calloc(_graph_length, sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) - err = cydriver.cuGraphNodeGetDependentNodes(cyhNode, cydependentNodes, cyedgeData, &numDependentNodes) + with nogil: + err = cydriver.cuGraphNodeGetDependentNodes(cyhNode, cydependentNodes, cyedgeData, &numDependentNodes) if CUresult(err) == CUresult(0): pydependentNodes = [CUgraphNode(init_value=cydependentNodes[idx]) for idx in range(_graph_length)] if cydependentNodes is not NULL: @@ -43212,34 +43611,41 @@ def cuGraphAddDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List[CU phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef cydriver.CUgraphNode* cyfrom_ = NULL - if len(from_) > 0: + if len(from_) > 1: cyfrom_ = <cydriver.CUgraphNode*>calloc(len(from_), sizeof(cydriver.CUgraphNode)) if cyfrom_ is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(from_)): cyfrom_[idx] = (from_[idx])._pvt_ptr[0] + elif len(from_) == 1: + cyfrom_ = (from_[0])._pvt_ptr cdef cydriver.CUgraphNode* cyto = NULL - if len(to) > 0: + if len(to) > 1: cyto = <cydriver.CUgraphNode*>calloc(len(to), sizeof(cydriver.CUgraphNode)) if cyto is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(to)): cyto[idx] = (to[idx])._pvt_ptr[0] + elif len(to) == 1: + cyto = (to[0])._pvt_ptr cdef cydriver.CUgraphEdgeData* cyedgeData = NULL - if len(edgeData) > 0: + if len(edgeData) > 1: cyedgeData = <cydriver.CUgraphEdgeData*>calloc(len(edgeData), sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, 
sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuGraphAddDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) - if cyfrom_ is not NULL: + elif len(edgeData) == 1: + cyedgeData = (edgeData[0])._pvt_ptr + with nogil: + err = cydriver.cuGraphAddDependencies(cyhGraph, cyfrom_, cyto, cyedgeData, numDependencies) + if len(from_) > 1 and cyfrom_ is not NULL: free(cyfrom_) - if cyto is not NULL: + if len(to) > 1 and cyto is not NULL: free(cyto) - if cyedgeData is not NULL: + if len(edgeData) > 1 and cyedgeData is not NULL: free(cyedgeData) return (_dict_CUresult[err],) {{endif}} @@ -43304,34 +43710,41 @@ def cuGraphRemoveDependencies(hGraph, from_ : Optional[Tuple[CUgraphNode] | List phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef cydriver.CUgraphNode* cyfrom_ = NULL - if len(from_) > 0: + if len(from_) > 1: cyfrom_ = <cydriver.CUgraphNode*>calloc(len(from_), sizeof(cydriver.CUgraphNode)) if cyfrom_ is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(from_)): cyfrom_[idx] = (from_[idx])._pvt_ptr[0] + elif len(from_) == 1: + cyfrom_ = (from_[0])._pvt_ptr cdef cydriver.CUgraphNode* cyto = NULL - if len(to) > 0: + if len(to) > 1: cyto = <cydriver.CUgraphNode*>calloc(len(to), sizeof(cydriver.CUgraphNode)) if cyto is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(to)): cyto[idx] = (to[idx])._pvt_ptr[0] + elif len(to) == 1: + cyto = (to[0])._pvt_ptr cdef cydriver.CUgraphEdgeData* cyedgeData = NULL - if len(edgeData) > 0: + if len(edgeData) > 1: cyedgeData = <cydriver.CUgraphEdgeData*>calloc(len(edgeData), sizeof(cydriver.CUgraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) - err = cydriver.cuGraphRemoveDependencies(cyhGraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) - if cyfrom_ is not NULL: + elif len(edgeData) == 1: + cyedgeData = (edgeData[0])._pvt_ptr + with nogil: + err = cydriver.cuGraphRemoveDependencies(cyhGraph, cyfrom_, cyto, cyedgeData, numDependencies) + if len(from_) > 1 and cyfrom_ is not NULL: free(cyfrom_) - if cyto is not NULL: + if len(to) > 1 and cyto is not NULL: free(cyto) - if cyedgeData is not NULL: + if len(edgeData) > 1 and cyedgeData is not NULL: free(cyedgeData) return (_dict_CUresult[err],) {{endif}} @@ -43370,7 +43783,8 @@ def cuGraphDestroyNode(hNode): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - err = cydriver.cuGraphDestroyNode(cyhNode) + with nogil: + err = cydriver.cuGraphDestroyNode(cyhNode) return (_dict_CUresult[err],) {{endif}} @@ -43472,7 +43886,8 @@ def cuGraphInstantiate(hGraph, unsigned long long flags): phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef CUgraphExec phGraphExec = CUgraphExec() - err = cydriver.cuGraphInstantiate(phGraphExec._pvt_ptr, cyhGraph, flags) + with nogil: + err = cydriver.cuGraphInstantiate(phGraphExec._pvt_ptr, cyhGraph, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraphExec) @@ -43617,7 +44032,8 
@@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH cyhGraph = phGraph cdef CUgraphExec phGraphExec = CUgraphExec() cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL - err = cydriver.cuGraphInstantiateWithParams(phGraphExec._pvt_ptr, cyhGraph, cyinstantiateParams_ptr) + with nogil: + err = cydriver.cuGraphInstantiateWithParams(phGraphExec._pvt_ptr, cyhGraph, cyinstantiateParams_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phGraphExec) @@ -43659,7 +44075,8 @@ def cuGraphExecGetFlags(hGraphExec): phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cuuint64_t flags = cuuint64_t() - err = cydriver.cuGraphExecGetFlags(cyhGraphExec, flags._pvt_ptr) + with nogil: + err = cydriver.cuGraphExecGetFlags(cyhGraphExec, flags._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], flags) @@ -43741,7 +44158,8 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecKernelNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecKernelNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -43815,7 +44233,8 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL - err = cydriver.cuGraphExecMemcpyNodeSetParams(cyhGraphExec, cyhNode, cycopyParams_ptr, cyctx) + with nogil: + err = cydriver.cuGraphExecMemcpyNodeSetParams(cyhGraphExec, cyhNode, cycopyParams_ptr, cyctx) return (_dict_CUresult[err],) {{endif}} @@ -43894,7 +44313,8 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL - err = cydriver.cuGraphExecMemsetNodeSetParams(cyhGraphExec, cyhNode, cymemsetParams_ptr, cyctx) + with nogil: + err = cydriver.cuGraphExecMemsetNodeSetParams(cyhGraphExec, cyhNode, cymemsetParams_ptr, cyctx) return (_dict_CUresult[err],) {{endif}} @@ -43948,7 +44368,8 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecHostNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecHostNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -44017,7 +44438,8 @@ def cuGraphExecChildGraphNodeSetParams(hGraphExec, hNode, childGraph): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphExecChildGraphNodeSetParams(cyhGraphExec, cyhNode, cychildGraph) + with nogil: + err = cydriver.cuGraphExecChildGraphNodeSetParams(cyhGraphExec, cyhNode, cychildGraph) return (_dict_CUresult[err],) {{endif}} @@ -44079,7 +44501,8 @@ 
def cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphExecEventRecordNodeSetEvent(cyhGraphExec, cyhNode, cyevent) + with nogil: + err = cydriver.cuGraphExecEventRecordNodeSetEvent(cyhGraphExec, cyhNode, cyevent) return (_dict_CUresult[err],) {{endif}} @@ -44141,7 +44564,8 @@ def cuGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphExecEventWaitNodeSetEvent(cyhGraphExec, cyhNode, cyevent) + with nogil: + err = cydriver.cuGraphExecEventWaitNodeSetEvent(cyhGraphExec, cyhNode, cyevent) return (_dict_CUresult[err],) {{endif}} @@ -44200,7 +44624,8 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -44259,7 +44684,8 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -44327,7 +44753,8 @@ def cuGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphNodeSetEnabled(cyhGraphExec, cyhNode, isEnabled) + with nogil: + err = cydriver.cuGraphNodeSetEnabled(cyhGraphExec, cyhNode, isEnabled) return (_dict_CUresult[err],) {{endif}} @@ -44385,7 +44812,8 @@ def cuGraphNodeGetEnabled(hGraphExec, hNode): phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef unsigned int isEnabled = 0 - err = cydriver.cuGraphNodeGetEnabled(cyhGraphExec, cyhNode, &isEnabled) + with nogil: + err = cydriver.cuGraphNodeGetEnabled(cyhGraphExec, cyhNode, &isEnabled) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], isEnabled) @@ -44435,7 +44863,8 @@ def cuGraphUpload(hGraphExec, hStream): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphUpload(cyhGraphExec, cyhStream) + with nogil: + err = cydriver.cuGraphUpload(cyhGraphExec, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -44488,7 +44917,8 @@ def cuGraphLaunch(hGraphExec, hStream): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphLaunch(cyhGraphExec, cyhStream) + with nogil: + err = cydriver.cuGraphLaunch(cyhGraphExec, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -44524,7 +44954,8 @@ def cuGraphExecDestroy(hGraphExec): else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - err = cydriver.cuGraphExecDestroy(cyhGraphExec) + with nogil: + err = cydriver.cuGraphExecDestroy(cyhGraphExec) return 
(_dict_CUresult[err],) {{endif}} @@ -44558,7 +44989,8 @@ def cuGraphDestroy(hGraph): else: phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph - err = cydriver.cuGraphDestroy(cyhGraph) + with nogil: + err = cydriver.cuGraphDestroy(cyhGraph) return (_dict_CUresult[err],) {{endif}} @@ -44735,7 +45167,8 @@ def cuGraphExecUpdate(hGraphExec, hGraph): phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef CUgraphExecUpdateResultInfo resultInfo = CUgraphExecUpdateResultInfo() - err = cydriver.cuGraphExecUpdate(cyhGraphExec, cyhGraph, resultInfo._pvt_ptr) + with nogil: + err = cydriver.cuGraphExecUpdate(cyhGraphExec, cyhGraph, resultInfo._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], resultInfo) @@ -44783,7 +45216,8 @@ def cuGraphKernelNodeCopyAttributes(dst, src): else: pdst = int(CUgraphNode(dst)) cydst = pdst - err = cydriver.cuGraphKernelNodeCopyAttributes(cydst, cysrc) + with nogil: + err = cydriver.cuGraphKernelNodeCopyAttributes(cydst, cysrc) return (_dict_CUresult[err],) {{endif}} @@ -44824,7 +45258,8 @@ def cuGraphKernelNodeGetAttribute(hNode, attr not None : CUkernelNodeAttrID): cyhNode = phNode cdef cydriver.CUkernelNodeAttrID cyattr = attr.value cdef CUkernelNodeAttrValue value_out = CUkernelNodeAttrValue() - err = cydriver.cuGraphKernelNodeGetAttribute(cyhNode, cyattr, value_out._pvt_ptr) + with nogil: + err = cydriver.cuGraphKernelNodeGetAttribute(cyhNode, cyattr, value_out._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], value_out) @@ -44867,7 +45302,8 @@ def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val cyhNode = phNode cdef cydriver.CUkernelNodeAttrID cyattr = attr.value cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL - err = cydriver.cuGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) + with nogil: + err = cydriver.cuGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) {{endif}} @@ -44906,7 +45342,8 @@ def cuGraphDebugDotPrint(hGraph, char* path, unsigned int flags): else: phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph - err = cydriver.cuGraphDebugDotPrint(cyhGraph, path, flags) + with nogil: + err = cydriver.cuGraphDebugDotPrint(cyhGraph, path, flags) return (_dict_CUresult[err],) {{endif}} @@ -44965,7 +45402,8 @@ def cuUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned int cdef CUuserObject object_out = CUuserObject() cyptr = utils.HelperInputVoidPtr(ptr) cdef void* cyptr_ptr = cyptr.cptr - err = cydriver.cuUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) + with nogil: + err = cydriver.cuUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], object_out) @@ -45008,7 +45446,8 @@ def cuUserObjectRetain(object, unsigned int count): else: pobject = int(CUuserObject(object)) cyobject = pobject - err = cydriver.cuUserObjectRetain(cyobject, count) + with nogil: + err = cydriver.cuUserObjectRetain(cyobject, count) return (_dict_CUresult[err],) {{endif}} @@ -45052,7 +45491,8 @@ def cuUserObjectRelease(object, unsigned int count): else: pobject = int(CUuserObject(object)) cyobject = pobject - err = cydriver.cuUserObjectRelease(cyobject, count) + with nogil: + err = cydriver.cuUserObjectRelease(cyobject, count) return (_dict_CUresult[err],) {{endif}} @@ 
-45107,7 +45547,8 @@ def cuGraphRetainUserObject(graph, object, unsigned int count, unsigned int flag else: pgraph = int(CUgraph(graph)) cygraph = pgraph - err = cydriver.cuGraphRetainUserObject(cygraph, cyobject, count, flags) + with nogil: + err = cydriver.cuGraphRetainUserObject(cygraph, cyobject, count, flags) return (_dict_CUresult[err],) {{endif}} @@ -45157,7 +45598,8 @@ def cuGraphReleaseUserObject(graph, object, unsigned int count): else: pgraph = int(CUgraph(graph)) cygraph = pgraph - err = cydriver.cuGraphReleaseUserObject(cygraph, cyobject, count) + with nogil: + err = cydriver.cuGraphReleaseUserObject(cygraph, cyobject, count) return (_dict_CUresult[err],) {{endif}} @@ -45227,25 +45669,30 @@ def cuGraphAddNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUg cyhGraph = phGraph cdef CUgraphNode phGraphNode = CUgraphNode() cdef cydriver.CUgraphNode* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = <cydriver.CUgraphNode*>calloc(len(dependencies), sizeof(cydriver.CUgraphNode)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cydriver.CUgraphNode))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr cdef cydriver.CUgraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = <cydriver.CUgraphEdgeData*>calloc(len(dependencyData), sizeof(cydriver.CUgraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cydriver.CUgraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) + elif len(dependencyData) == 1: + cydependencyData = (dependencyData[0])._pvt_ptr cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) - if cydependencies is not NULL: + with nogil: + err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, cydependencyData, numDependencies, cynodeParams_ptr) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -45291,7 +45738,8 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]): phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -45350,7 +45798,8 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cydriver.cuGraphExecNodeSetParams(cyhGraphExec, cyhNode, 
cynodeParams_ptr) + with nogil: + err = cydriver.cuGraphExecNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) {{endif}} @@ -45412,7 +45861,8 @@ def cuGraphConditionalHandleCreate(hGraph, ctx, unsigned int defaultLaunchValue, phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef CUgraphConditionalHandle pHandle_out = CUgraphConditionalHandle() - err = cydriver.cuGraphConditionalHandleCreate(pHandle_out._pvt_ptr, cyhGraph, cyctx, defaultLaunchValue, flags) + with nogil: + err = cydriver.cuGraphConditionalHandleCreate(pHandle_out._pvt_ptr, cyhGraph, cyctx, defaultLaunchValue, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pHandle_out) @@ -45462,7 +45912,8 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dyna pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int numBlocks = 0 - err = cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc, blockSize, dynamicSMemSize) + with nogil: + err = cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc, blockSize, dynamicSMemSize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], numBlocks) @@ -45530,7 +45981,8 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, si pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int numBlocks = 0 - err = cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc, blockSize, dynamicSMemSize, flags) + with nogil: + err = cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc, blockSize, dynamicSMemSize, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], numBlocks) @@ -45616,7 +46068,8 @@ def cuOccupancyMaxPotentialBlockSize(func, blockSizeToDynamicSMemSize, size_t dy cyfunc = pfunc cdef int minGridSize = 0 cdef int blockSize = 0 - err = cydriver.cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cyfunc, cyblockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit) + with nogil: + err = cydriver.cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cyfunc, cyblockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], minGridSize, blockSize) @@ -45701,7 +46154,8 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize, cyfunc = pfunc cdef int minGridSize = 0 cdef int blockSize = 0 - err = cydriver.cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, cyfunc, cyblockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags) + with nogil: + err = cydriver.cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, cyfunc, cyblockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], minGridSize, blockSize) @@ -45747,7 +46201,8 @@ def cuOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize): pfunc = int(CUfunction(func)) cyfunc = pfunc cdef size_t dynamicSmemSize = 0 - err = cydriver.cuOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc, numBlocks, blockSize) + with nogil: + err = cydriver.cuOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc, numBlocks, blockSize) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], dynamicSmemSize) @@ 
-45806,7 +46261,8 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]): cyfunc = pfunc cdef int clusterSize = 0 cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL - err = cydriver.cuOccupancyMaxPotentialClusterSize(&clusterSize, cyfunc, cyconfig_ptr) + with nogil: + err = cydriver.cuOccupancyMaxPotentialClusterSize(&clusterSize, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], clusterSize) @@ -45865,7 +46321,8 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]): cyfunc = pfunc cdef int numClusters = 0 cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL - err = cydriver.cuOccupancyMaxActiveClusters(&numClusters, cyfunc, cyconfig_ptr) + with nogil: + err = cydriver.cuOccupancyMaxActiveClusters(&numClusters, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], numClusters) @@ -45919,7 +46376,8 @@ def cuTexRefSetArray(hTexRef, hArray, unsigned int Flags): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetArray(cyhTexRef, cyhArray, Flags) + with nogil: + err = cydriver.cuTexRefSetArray(cyhTexRef, cyhArray, Flags) return (_dict_CUresult[err],) {{endif}} @@ -45971,7 +46429,8 @@ def cuTexRefSetMipmappedArray(hTexRef, hMipmappedArray, unsigned int Flags): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetMipmappedArray(cyhTexRef, cyhMipmappedArray, Flags) + with nogil: + err = cydriver.cuTexRefSetMipmappedArray(cyhTexRef, cyhMipmappedArray, Flags) return (_dict_CUresult[err],) {{endif}} @@ -46043,7 +46502,8 @@ def cuTexRefSetAddress(hTexRef, dptr, size_t numbytes): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef size_t ByteOffset = 0 - err = cydriver.cuTexRefSetAddress(&ByteOffset, cyhTexRef, cydptr, numbytes) + with nogil: + err = cydriver.cuTexRefSetAddress(&ByteOffset, cyhTexRef, cydptr, numbytes) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], ByteOffset) @@ -46125,7 +46585,8 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr, phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc != None else NULL - err = cydriver.cuTexRefSetAddress2D(cyhTexRef, cydesc_ptr, cydptr, Pitch) + with nogil: + err = cydriver.cuTexRefSetAddress2D(cyhTexRef, cydesc_ptr, cydptr, Pitch) return (_dict_CUresult[err],) {{endif}} @@ -46170,7 +46631,8 @@ def cuTexRefSetFormat(hTexRef, fmt not None : CUarray_format, int NumPackedCompo phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUarray_format cyfmt = fmt.value - err = cydriver.cuTexRefSetFormat(cyhTexRef, cyfmt, NumPackedComponents) + with nogil: + err = cydriver.cuTexRefSetFormat(cyhTexRef, cyfmt, NumPackedComponents) return (_dict_CUresult[err],) {{endif}} @@ -46222,7 +46684,8 @@ def cuTexRefSetAddressMode(hTexRef, int dim, am not None : CUaddress_mode): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUaddress_mode cyam = am.value - err = cydriver.cuTexRefSetAddressMode(cyhTexRef, dim, cyam) + with nogil: + err = cydriver.cuTexRefSetAddressMode(cyhTexRef, dim, cyam) return (_dict_CUresult[err],) {{endif}} @@ -46268,7 +46731,8 @@ def cuTexRefSetFilterMode(hTexRef, fm not None : CUfilter_mode): phTexRef = int(CUtexref(hTexRef)) 
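Every hunk in this stretch of the patch applies the same mechanical rewrite: the direct cydriver call is wrapped in a `with nogil:` block, so the GIL is released while the (possibly blocking) driver call executes and other Python threads can make progress. A minimal sketch of the resulting shape, where `cuFoo` is a hypothetical stand-in for any wrapped entry point and `cyhandle` is an already-converted C handle, not names from this patch:

    cdef cydriver.CUresult err
    with nogil:
        # No Python objects may be touched inside this block, which is
        # why every argument is converted to a plain C value beforehand.
        err = cydriver.cuFoo(cyhandle)
    # GIL re-acquired: safe to build the Python return tuple again.
    return (_dict_CUresult[err],)

For the same reason, all argument marshaling and any MemoryError or TypeError raising stays outside the block in the generated wrappers.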
cyhTexRef = phTexRef cdef cydriver.CUfilter_mode cyfm = fm.value - err = cydriver.cuTexRefSetFilterMode(cyhTexRef, cyfm) + with nogil: + err = cydriver.cuTexRefSetFilterMode(cyhTexRef, cyfm) return (_dict_CUresult[err],) {{endif}} @@ -46314,7 +46778,8 @@ def cuTexRefSetMipmapFilterMode(hTexRef, fm not None : CUfilter_mode): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUfilter_mode cyfm = fm.value - err = cydriver.cuTexRefSetMipmapFilterMode(cyhTexRef, cyfm) + with nogil: + err = cydriver.cuTexRefSetMipmapFilterMode(cyhTexRef, cyfm) return (_dict_CUresult[err],) {{endif}} @@ -46357,7 +46822,8 @@ def cuTexRefSetMipmapLevelBias(hTexRef, float bias): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetMipmapLevelBias(cyhTexRef, bias) + with nogil: + err = cydriver.cuTexRefSetMipmapLevelBias(cyhTexRef, bias) return (_dict_CUresult[err],) {{endif}} @@ -46402,7 +46868,8 @@ def cuTexRefSetMipmapLevelClamp(hTexRef, float minMipmapLevelClamp, float maxMip else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetMipmapLevelClamp(cyhTexRef, minMipmapLevelClamp, maxMipmapLevelClamp) + with nogil: + err = cydriver.cuTexRefSetMipmapLevelClamp(cyhTexRef, minMipmapLevelClamp, maxMipmapLevelClamp) return (_dict_CUresult[err],) {{endif}} @@ -46444,7 +46911,8 @@ def cuTexRefSetMaxAnisotropy(hTexRef, unsigned int maxAniso): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetMaxAnisotropy(cyhTexRef, maxAniso) + with nogil: + err = cydriver.cuTexRefSetMaxAnisotropy(cyhTexRef, maxAniso) return (_dict_CUresult[err],) {{endif}} @@ -46491,7 +46959,8 @@ def cuTexRefSetBorderColor(hTexRef, float pBorderColor): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetBorderColor(cyhTexRef, &pBorderColor) + with nogil: + err = cydriver.cuTexRefSetBorderColor(cyhTexRef, &pBorderColor) return (_dict_CUresult[err],) {{endif}} @@ -46548,7 +47017,8 @@ def cuTexRefSetFlags(hTexRef, unsigned int Flags): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefSetFlags(cyhTexRef, Flags) + with nogil: + err = cydriver.cuTexRefSetFlags(cyhTexRef, Flags) return (_dict_CUresult[err],) {{endif}} @@ -46589,7 +47059,8 @@ def cuTexRefGetAddress(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef CUdeviceptr pdptr = CUdeviceptr() - err = cydriver.cuTexRefGetAddress(pdptr._pvt_ptr, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetAddress(pdptr._pvt_ptr, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pdptr) @@ -46632,7 +47103,8 @@ def cuTexRefGetArray(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef CUarray phArray = CUarray() - err = cydriver.cuTexRefGetArray(phArray._pvt_ptr, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetArray(phArray._pvt_ptr, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phArray) @@ -46676,7 +47148,8 @@ def cuTexRefGetMipmappedArray(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef CUmipmappedArray phMipmappedArray = CUmipmappedArray() - err = cydriver.cuTexRefGetMipmappedArray(phMipmappedArray._pvt_ptr, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetMipmappedArray(phMipmappedArray._pvt_ptr, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phMipmappedArray) @@ -46721,7 
+47194,8 @@ def cuTexRefGetAddressMode(hTexRef, int dim): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUaddress_mode pam - err = cydriver.cuTexRefGetAddressMode(&pam, cyhTexRef, dim) + with nogil: + err = cydriver.cuTexRefGetAddressMode(&pam, cyhTexRef, dim) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUaddress_mode(pam)) @@ -46763,7 +47237,8 @@ def cuTexRefGetFilterMode(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUfilter_mode pfm - err = cydriver.cuTexRefGetFilterMode(&pfm, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetFilterMode(&pfm, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUfilter_mode(pfm)) @@ -46809,7 +47284,8 @@ def cuTexRefGetFormat(hTexRef): cyhTexRef = phTexRef cdef cydriver.CUarray_format pFormat cdef int pNumChannels = 0 - err = cydriver.cuTexRefGetFormat(&pFormat, &pNumChannels, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetFormat(&pFormat, &pNumChannels, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], CUarray_format(pFormat), pNumChannels) @@ -46851,7 +47327,8 @@ def cuTexRefGetMipmapFilterMode(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef cydriver.CUfilter_mode pfm - err = cydriver.cuTexRefGetMipmapFilterMode(&pfm, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetMipmapFilterMode(&pfm, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUfilter_mode(pfm)) @@ -46894,7 +47371,8 @@ def cuTexRefGetMipmapLevelBias(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef float pbias = 0 - err = cydriver.cuTexRefGetMipmapLevelBias(&pbias, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetMipmapLevelBias(&pbias, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pbias) @@ -46940,7 +47418,8 @@ def cuTexRefGetMipmapLevelClamp(hTexRef): cyhTexRef = phTexRef cdef float pminMipmapLevelClamp = 0 cdef float pmaxMipmapLevelClamp = 0 - err = cydriver.cuTexRefGetMipmapLevelClamp(&pminMipmapLevelClamp, &pmaxMipmapLevelClamp, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetMipmapLevelClamp(&pminMipmapLevelClamp, &pmaxMipmapLevelClamp, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], pminMipmapLevelClamp, pmaxMipmapLevelClamp) @@ -46982,7 +47461,8 @@ def cuTexRefGetMaxAnisotropy(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef int pmaxAniso = 0 - err = cydriver.cuTexRefGetMaxAnisotropy(&pmaxAniso, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetMaxAnisotropy(&pmaxAniso, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pmaxAniso) @@ -47027,7 +47507,8 @@ def cuTexRefGetBorderColor(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef float pBorderColor = 0 - err = cydriver.cuTexRefGetBorderColor(&pBorderColor, cyhTexRef) + with nogil: + err = cydriver.cuTexRefGetBorderColor(&pBorderColor, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pBorderColor) @@ -47068,7 +47549,8 @@ def cuTexRefGetFlags(hTexRef): phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef cdef unsigned int pFlags = 0 - err = cydriver.cuTexRefGetFlags(&pFlags, cyhTexRef) + with nogil: + err 
= cydriver.cuTexRefGetFlags(&pFlags, cyhTexRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pFlags) @@ -47101,7 +47583,8 @@ def cuTexRefCreate(): :py:obj:`~.cuTexRefDestroy` """ cdef CUtexref pTexRef = CUtexref() - err = cydriver.cuTexRefCreate(pTexRef._pvt_ptr) + with nogil: + err = cydriver.cuTexRefCreate(pTexRef._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pTexRef) @@ -47139,7 +47622,8 @@ def cuTexRefDestroy(hTexRef): else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - err = cydriver.cuTexRefDestroy(cyhTexRef) + with nogil: + err = cydriver.cuTexRefDestroy(cyhTexRef) return (_dict_CUresult[err],) {{endif}} @@ -47192,7 +47676,8 @@ def cuSurfRefSetArray(hSurfRef, hArray, unsigned int Flags): else: phSurfRef = int(CUsurfref(hSurfRef)) cyhSurfRef = phSurfRef - err = cydriver.cuSurfRefSetArray(cyhSurfRef, cyhArray, Flags) + with nogil: + err = cydriver.cuSurfRefSetArray(cyhSurfRef, cyhArray, Flags) return (_dict_CUresult[err],) {{endif}} @@ -47233,7 +47718,8 @@ def cuSurfRefGetArray(hSurfRef): phSurfRef = int(CUsurfref(hSurfRef)) cyhSurfRef = phSurfRef cdef CUarray phArray = CUarray() - err = cydriver.cuSurfRefGetArray(phArray._pvt_ptr, cyhSurfRef) + with nogil: + err = cydriver.cuSurfRefGetArray(phArray._pvt_ptr, cyhSurfRef) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phArray) @@ -47472,7 +47958,8 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL - err = cydriver.cuTexObjectCreate(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) + with nogil: + err = cydriver.cuTexObjectCreate(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pTexObject) @@ -47508,7 +47995,8 @@ def cuTexObjectDestroy(texObject): else: ptexObject = int(CUtexObject(texObject)) cytexObject = ptexObject - err = cydriver.cuTexObjectDestroy(cytexObject) + with nogil: + err = cydriver.cuTexObjectDestroy(cytexObject) return (_dict_CUresult[err],) {{endif}} @@ -47546,7 +48034,8 @@ def cuTexObjectGetResourceDesc(texObject): ptexObject = int(CUtexObject(texObject)) cytexObject = ptexObject cdef CUDA_RESOURCE_DESC pResDesc = CUDA_RESOURCE_DESC() - err = cydriver.cuTexObjectGetResourceDesc(pResDesc._pvt_ptr, cytexObject) + with nogil: + err = cydriver.cuTexObjectGetResourceDesc(pResDesc._pvt_ptr, cytexObject) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pResDesc) @@ -47586,7 +48075,8 @@ def cuTexObjectGetTextureDesc(texObject): ptexObject = int(CUtexObject(texObject)) cytexObject = ptexObject cdef CUDA_TEXTURE_DESC pTexDesc = CUDA_TEXTURE_DESC() - err = cydriver.cuTexObjectGetTextureDesc(pTexDesc._pvt_ptr, cytexObject) + with nogil: + err = cydriver.cuTexObjectGetTextureDesc(pTexDesc._pvt_ptr, cytexObject) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pTexDesc) @@ -47627,7 +48117,8 @@ def cuTexObjectGetResourceViewDesc(texObject): ptexObject = int(CUtexObject(texObject)) 
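The sequence-marshaling hunks that follow change two things together: a single-element sequence now passes the element's own `_pvt_ptr` instead of a calloc'd copy, and the matching `free` is guarded by the same `len(...) > 1` test so the borrowed pointer is never freed. A condensed sketch of the new shape, assuming a wrapper class whose `_pvt_ptr` is a typed pointer; `items`, `cuBar`, and `n` are illustrative names, not from the patch:

    from libc.stdlib cimport calloc, free

    cdef cydriver.CUresult err
    cdef cydriver.CUgraphNode* cyitems = NULL
    cdef size_t n = len(items)  # hoisted: len() needs the GIL
    if n > 1:
        cyitems = <cydriver.CUgraphNode*>calloc(n, sizeof(cydriver.CUgraphNode))
        if cyitems is NULL:
            raise MemoryError('Failed to allocate ' + str(n) + ' nodes')
        for idx in range(n):
            cyitems[idx] = (items[idx])._pvt_ptr[0]
    elif n == 1:
        cyitems = (items[0])._pvt_ptr  # borrowed, not allocated
    with nogil:
        err = cydriver.cuBar(cyitems, n)
    if n > 1 and cyitems is not NULL:  # only free what calloc produced
        free(cyitems)

The single-element fast path avoids a heap allocation and a copy on the most common call shape (one dependency, one node), at the cost of the guarded free.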
cytexObject = ptexObject cdef CUDA_RESOURCE_VIEW_DESC pResViewDesc = CUDA_RESOURCE_VIEW_DESC() - err = cydriver.cuTexObjectGetResourceViewDesc(pResViewDesc._pvt_ptr, cytexObject) + with nogil: + err = cydriver.cuTexObjectGetResourceViewDesc(pResViewDesc._pvt_ptr, cytexObject) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pResViewDesc) @@ -47669,7 +48160,8 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]): """ cdef CUsurfObject pSurfObject = CUsurfObject() cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL - err = cydriver.cuSurfObjectCreate(pSurfObject._pvt_ptr, cypResDesc_ptr) + with nogil: + err = cydriver.cuSurfObjectCreate(pSurfObject._pvt_ptr, cypResDesc_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pSurfObject) @@ -47705,7 +48197,8 @@ def cuSurfObjectDestroy(surfObject): else: psurfObject = int(CUsurfObject(surfObject)) cysurfObject = psurfObject - err = cydriver.cuSurfObjectDestroy(cysurfObject) + with nogil: + err = cydriver.cuSurfObjectDestroy(cysurfObject) return (_dict_CUresult[err],) {{endif}} @@ -47743,7 +48236,8 @@ def cuSurfObjectGetResourceDesc(surfObject): psurfObject = int(CUsurfObject(surfObject)) cysurfObject = psurfObject cdef CUDA_RESOURCE_DESC pResDesc = CUDA_RESOURCE_DESC() - err = cydriver.cuSurfObjectGetResourceDesc(pResDesc._pvt_ptr, cysurfObject) + with nogil: + err = cydriver.cuSurfObjectGetResourceDesc(pResDesc._pvt_ptr, cysurfObject) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pResDesc) @@ -48002,49 +48496,58 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr cdef cydriver.cuuint64_t* cyglobalDim = NULL - if len(globalDim) > 0: + if len(globalDim) > 1: cyglobalDim = <cydriver.cuuint64_t*>calloc(len(globalDim), sizeof(cydriver.cuuint64_t)) if cyglobalDim is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalDim)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalDim)): cyglobalDim[idx] = (globalDim[idx])._pvt_ptr[0] + elif len(globalDim) == 1: + cyglobalDim = (globalDim[0])._pvt_ptr cdef cydriver.cuuint64_t* cyglobalStrides = NULL - if len(globalStrides) > 0: + if len(globalStrides) > 1: cyglobalStrides = <cydriver.cuuint64_t*>calloc(len(globalStrides), sizeof(cydriver.cuuint64_t)) if cyglobalStrides is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalStrides)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalStrides)): cyglobalStrides[idx] = (globalStrides[idx])._pvt_ptr[0] + elif len(globalStrides) == 1: + cyglobalStrides = (globalStrides[0])._pvt_ptr cdef cydriver.cuuint32_t* cyboxDim = NULL - if len(boxDim) > 0: + if len(boxDim) > 1: cyboxDim = <cydriver.cuuint32_t*>calloc(len(boxDim), sizeof(cydriver.cuuint32_t)) if cyboxDim is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(boxDim)) + 'x' + str(sizeof(cydriver.cuuint32_t))) else: for idx in range(len(boxDim)): cyboxDim[idx] = (boxDim[idx])._pvt_ptr[0] + elif len(boxDim) == 1: + cyboxDim = (boxDim[0])._pvt_ptr cdef cydriver.cuuint32_t* cyelementStrides = NULL - if len(elementStrides) > 0: + if len(elementStrides) > 1: cyelementStrides = <cydriver.cuuint32_t*>calloc(len(elementStrides), sizeof(cydriver.cuuint32_t)) if cyelementStrides is NULL: raise MemoryError('Failed to allocate 
length x size memory: ' + str(len(elementStrides)) + 'x' + str(sizeof(cydriver.cuuint32_t))) else: for idx in range(len(elementStrides)): cyelementStrides[idx] = (elementStrides[idx])._pvt_ptr[0] + elif len(elementStrides) == 1: + cyelementStrides = (elementStrides[0])._pvt_ptr cdef cydriver.CUtensorMapInterleave cyinterleave = interleave.value cdef cydriver.CUtensorMapSwizzle cyswizzle = swizzle.value cdef cydriver.CUtensorMapL2promotion cyl2Promotion = l2Promotion.value cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = oobFill.value - err = cydriver.cuTensorMapEncodeTiled(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, (globalDim[0])._pvt_ptr if len(globalDim) == 1 else cyglobalDim, (globalStrides[0])._pvt_ptr if len(globalStrides) == 1 else cyglobalStrides, (boxDim[0])._pvt_ptr if len(boxDim) == 1 else cyboxDim, (elementStrides[0])._pvt_ptr if len(elementStrides) == 1 else cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) - if cyglobalDim is not NULL: + with nogil: + err = cydriver.cuTensorMapEncodeTiled(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, cyboxDim, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + if len(globalDim) > 1 and cyglobalDim is not NULL: free(cyglobalDim) - if cyglobalStrides is not NULL: + if len(globalStrides) > 1 and cyglobalStrides is not NULL: free(cyglobalStrides) - if cyboxDim is not NULL: + if len(boxDim) > 1 and cyboxDim is not NULL: free(cyboxDim) - if cyelementStrides is not NULL: + if len(elementStrides) > 1 and cyelementStrides is not NULL: free(cyelementStrides) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -48350,41 +48853,48 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr cdef cydriver.cuuint64_t* cyglobalDim = NULL - if len(globalDim) > 0: + if len(globalDim) > 1: cyglobalDim = <cydriver.cuuint64_t*>calloc(len(globalDim), sizeof(cydriver.cuuint64_t)) if cyglobalDim is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalDim)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalDim)): cyglobalDim[idx] = (globalDim[idx])._pvt_ptr[0] + elif len(globalDim) == 1: + cyglobalDim = (globalDim[0])._pvt_ptr cdef cydriver.cuuint64_t* cyglobalStrides = NULL - if len(globalStrides) > 0: + if len(globalStrides) > 1: cyglobalStrides = <cydriver.cuuint64_t*>calloc(len(globalStrides), sizeof(cydriver.cuuint64_t)) if cyglobalStrides is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalStrides)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalStrides)): cyglobalStrides[idx] = (globalStrides[idx])._pvt_ptr[0] + elif len(globalStrides) == 1: + cyglobalStrides = (globalStrides[0])._pvt_ptr cdef vector[int] cypixelBoxLowerCorner = pixelBoxLowerCorner cdef vector[int] cypixelBoxUpperCorner = pixelBoxUpperCorner cdef cydriver.cuuint32_t* cyelementStrides = NULL - if len(elementStrides) > 0: + if len(elementStrides) > 1: cyelementStrides = <cydriver.cuuint32_t*>calloc(len(elementStrides), sizeof(cydriver.cuuint32_t)) if cyelementStrides is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(elementStrides)) + 'x' + str(sizeof(cydriver.cuuint32_t))) else: for idx in range(len(elementStrides)): cyelementStrides[idx] = (elementStrides[idx])._pvt_ptr[0] + elif len(elementStrides) == 1: + cyelementStrides = 
(elementStrides[0])._pvt_ptr cdef cydriver.CUtensorMapInterleave cyinterleave = interleave.value cdef cydriver.CUtensorMapSwizzle cyswizzle = swizzle.value cdef cydriver.CUtensorMapL2promotion cyl2Promotion = l2Promotion.value cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = oobFill.value - err = cydriver.cuTensorMapEncodeIm2col(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, (globalDim[0])._pvt_ptr if len(globalDim) == 1 else cyglobalDim, (globalStrides[0])._pvt_ptr if len(globalStrides) == 1 else cyglobalStrides, cypixelBoxLowerCorner.data(), cypixelBoxUpperCorner.data(), cychannelsPerPixel, cypixelsPerColumn, (elementStrides[0])._pvt_ptr if len(elementStrides) == 1 else cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) - if cyglobalDim is not NULL: + with nogil: + err = cydriver.cuTensorMapEncodeIm2col(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, cypixelBoxLowerCorner.data(), cypixelBoxUpperCorner.data(), cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + if len(globalDim) > 1 and cyglobalDim is not NULL: free(cyglobalDim) - if cyglobalStrides is not NULL: + if len(globalStrides) > 1 and cyglobalStrides is not NULL: free(cyglobalStrides) - if cyelementStrides is not NULL: + if len(elementStrides) > 1 and cyelementStrides is not NULL: free(cyelementStrides) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -48671,40 +49181,47 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr cdef cydriver.cuuint64_t* cyglobalDim = NULL - if len(globalDim) > 0: + if len(globalDim) > 1: cyglobalDim = <cydriver.cuuint64_t*>calloc(len(globalDim), sizeof(cydriver.cuuint64_t)) if cyglobalDim is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalDim)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalDim)): cyglobalDim[idx] = (globalDim[idx])._pvt_ptr[0] + elif len(globalDim) == 1: + cyglobalDim = (globalDim[0])._pvt_ptr cdef cydriver.cuuint64_t* cyglobalStrides = NULL - if len(globalStrides) > 0: + if len(globalStrides) > 1: cyglobalStrides = <cydriver.cuuint64_t*>calloc(len(globalStrides), sizeof(cydriver.cuuint64_t)) if cyglobalStrides is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(globalStrides)) + 'x' + str(sizeof(cydriver.cuuint64_t))) else: for idx in range(len(globalStrides)): cyglobalStrides[idx] = (globalStrides[idx])._pvt_ptr[0] + elif len(globalStrides) == 1: + cyglobalStrides = (globalStrides[0])._pvt_ptr cdef cydriver.cuuint32_t* cyelementStrides = NULL - if len(elementStrides) > 0: + if len(elementStrides) > 1: cyelementStrides = <cydriver.cuuint32_t*>calloc(len(elementStrides), sizeof(cydriver.cuuint32_t)) if cyelementStrides is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(elementStrides)) + 'x' + str(sizeof(cydriver.cuuint32_t))) else: for idx in range(len(elementStrides)): cyelementStrides[idx] = (elementStrides[idx])._pvt_ptr[0] + elif len(elementStrides) == 1: + cyelementStrides = (elementStrides[0])._pvt_ptr cdef cydriver.CUtensorMapInterleave cyinterleave = interleave.value cdef cydriver.CUtensorMapIm2ColWideMode cymode = mode.value cdef cydriver.CUtensorMapSwizzle cyswizzle = swizzle.value cdef cydriver.CUtensorMapL2promotion cyl2Promotion = l2Promotion.value cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = 
oobFill.value - err = cydriver.cuTensorMapEncodeIm2colWide(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, (globalDim[0])._pvt_ptr if len(globalDim) == 1 else cyglobalDim, (globalStrides[0])._pvt_ptr if len(globalStrides) == 1 else cyglobalStrides, pixelBoxLowerCornerWidth, pixelBoxUpperCornerWidth, cychannelsPerPixel, cypixelsPerColumn, (elementStrides[0])._pvt_ptr if len(elementStrides) == 1 else cyelementStrides, cyinterleave, cymode, cyswizzle, cyl2Promotion, cyoobFill) - if cyglobalDim is not NULL: + with nogil: + err = cydriver.cuTensorMapEncodeIm2colWide(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, pixelBoxLowerCornerWidth, pixelBoxUpperCornerWidth, cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cymode, cyswizzle, cyl2Promotion, cyoobFill) + if len(globalDim) > 1 and cyglobalDim is not NULL: free(cyglobalDim) - if cyglobalStrides is not NULL: + if len(globalStrides) > 1 and cyglobalStrides is not NULL: free(cyglobalStrides) - if cyelementStrides is not NULL: + if len(elementStrides) > 1 and cyelementStrides is not NULL: free(cyelementStrides) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -48744,7 +49261,8 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress): cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap != None else NULL cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr - err = cydriver.cuTensorMapReplaceAddress(cytensorMap_ptr, cyglobalAddress_ptr) + with nogil: + err = cydriver.cuTensorMapReplaceAddress(cytensorMap_ptr, cyglobalAddress_ptr) return (_dict_CUresult[err],) {{endif}} @@ -48797,7 +49315,8 @@ def cuDeviceCanAccessPeer(dev, peerDev): pdev = int(CUdevice(dev)) cydev = pdev cdef int canAccessPeer = 0 - err = cydriver.cuDeviceCanAccessPeer(&canAccessPeer, cydev, cypeerDev) + with nogil: + err = cydriver.cuDeviceCanAccessPeer(&canAccessPeer, cydev, cypeerDev) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], canAccessPeer) @@ -48868,7 +49387,8 @@ def cuCtxEnablePeerAccess(peerContext, unsigned int Flags): else: ppeerContext = int(CUcontext(peerContext)) cypeerContext = ppeerContext - err = cydriver.cuCtxEnablePeerAccess(cypeerContext, Flags) + with nogil: + err = cydriver.cuCtxEnablePeerAccess(cypeerContext, Flags) return (_dict_CUresult[err],) {{endif}} @@ -48907,7 +49427,8 @@ def cuCtxDisablePeerAccess(peerContext): else: ppeerContext = int(CUcontext(peerContext)) cypeerContext = ppeerContext - err = cydriver.cuCtxDisablePeerAccess(cypeerContext) + with nogil: + err = cydriver.cuCtxDisablePeerAccess(cypeerContext) return (_dict_CUresult[err],) {{endif}} @@ -48983,7 +49504,8 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice, cysrcDevice = psrcDevice cdef int value = 0 cdef cydriver.CUdevice_P2PAttribute cyattrib = attrib.value - err = cydriver.cuDeviceGetP2PAttribute(&value, cyattrib, cysrcDevice, cydstDevice) + with nogil: + err = cydriver.cuDeviceGetP2PAttribute(&value, cyattrib, cysrcDevice, cydstDevice) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], value) @@ -49061,7 +49583,8 @@ def cuDeviceGetP2PAtomicCapabilities(operations : Optional[Tuple[CUatomicOperati raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) cdef 
vector[cydriver.CUatomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) - err = cydriver.cuDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, cysrcDevice, cydstDevice) + with nogil: + err = cydriver.cuDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, cysrcDevice, cydstDevice) if CUresult(err) == CUresult(0): pycapabilities = [cycapabilities[idx] for idx in range(count)] if cycapabilities is not NULL: @@ -49105,7 +49628,8 @@ def cuGraphicsUnregisterResource(resource): else: presource = int(CUgraphicsResource(resource)) cyresource = presource - err = cydriver.cuGraphicsUnregisterResource(cyresource) + with nogil: + err = cydriver.cuGraphicsUnregisterResource(cyresource) return (_dict_CUresult[err],) {{endif}} @@ -49160,7 +49684,8 @@ def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsig presource = int(CUgraphicsResource(resource)) cyresource = presource cdef CUarray pArray = CUarray() - err = cydriver.cuGraphicsSubResourceGetMappedArray(pArray._pvt_ptr, cyresource, arrayIndex, mipLevel) + with nogil: + err = cydriver.cuGraphicsSubResourceGetMappedArray(pArray._pvt_ptr, cyresource, arrayIndex, mipLevel) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pArray) @@ -49206,7 +49731,8 @@ def cuGraphicsResourceGetMappedMipmappedArray(resource): presource = int(CUgraphicsResource(resource)) cyresource = presource cdef CUmipmappedArray pMipmappedArray = CUmipmappedArray() - err = cydriver.cuGraphicsResourceGetMappedMipmappedArray(pMipmappedArray._pvt_ptr, cyresource) + with nogil: + err = cydriver.cuGraphicsResourceGetMappedMipmappedArray(pMipmappedArray._pvt_ptr, cyresource) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pMipmappedArray) @@ -49252,7 +49778,8 @@ def cuGraphicsResourceGetMappedPointer(resource): cyresource = presource cdef CUdeviceptr pDevPtr = CUdeviceptr() cdef size_t pSize = 0 - err = cydriver.cuGraphicsResourceGetMappedPointer(pDevPtr._pvt_ptr, &pSize, cyresource) + with nogil: + err = cydriver.cuGraphicsResourceGetMappedPointer(pDevPtr._pvt_ptr, &pSize, cyresource) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], pDevPtr, pSize) @@ -49312,7 +49839,8 @@ def cuGraphicsResourceSetMapFlags(resource, unsigned int flags): else: presource = int(CUgraphicsResource(resource)) cyresource = presource - err = cydriver.cuGraphicsResourceSetMapFlags(cyresource, flags) + with nogil: + err = cydriver.cuGraphicsResourceSetMapFlags(cyresource, flags) return (_dict_CUresult[err],) {{endif}} @@ -49374,7 +49902,8 @@ def cuGraphicsMapResources(unsigned int count, resources, hStream): cyresources = resources else: raise TypeError("Argument 'resources' is not instance of type (expected , found " + str(type(resources))) - err = cydriver.cuGraphicsMapResources(count, cyresources, cyhStream) + with nogil: + err = cydriver.cuGraphicsMapResources(count, cyresources, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -49434,7 +49963,8 @@ def cuGraphicsUnmapResources(unsigned int count, resources, hStream): cyresources = resources else: raise TypeError("Argument 'resources' is not instance of type (expected , found " + str(type(resources))) - err = cydriver.cuGraphicsUnmapResources(count, cyresources, cyhStream) + with nogil: + err = 
cydriver.cuGraphicsUnmapResources(count, cyresources, cyhStream) return (_dict_CUresult[err],) {{endif}} @@ -49537,7 +50067,8 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags): cyflags = pflags cdef void_ptr pfn = 0 cdef cydriver.CUdriverProcAddressQueryResult symbolStatus - err = cydriver.cuGetProcAddress(symbol, &pfn, cudaVersion, cyflags, &symbolStatus) + with nogil: + err = cydriver.cuGetProcAddress(symbol, &pfn, cudaVersion, cyflags, &symbolStatus) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], pfn, CUdriverProcAddressQueryResult(symbolStatus)) @@ -49656,7 +50187,8 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings): cdef utils.HelperCUcoredumpSettings cyvalue = utils.HelperCUcoredumpSettings(attrib, 0, is_getter=True) cdef void* cyvalue_ptr = cyvalue.cptr cdef size_t size = cyvalue.size() - err = cydriver.cuCoredumpGetAttribute(cyattrib, cyvalue_ptr, &size) + with nogil: + err = cydriver.cuCoredumpGetAttribute(cyattrib, cyvalue_ptr, &size) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyvalue.pyObj()) @@ -49771,7 +50303,8 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings): cdef utils.HelperCUcoredumpSettings cyvalue = utils.HelperCUcoredumpSettings(attrib, 0, is_getter=True) cdef void* cyvalue_ptr = cyvalue.cptr cdef size_t size = cyvalue.size() - err = cydriver.cuCoredumpGetAttributeGlobal(cyattrib, cyvalue_ptr, &size) + with nogil: + err = cydriver.cuCoredumpGetAttributeGlobal(cyattrib, cyvalue_ptr, &size) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], cyvalue.pyObj()) @@ -49893,7 +50426,8 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value): cdef utils.HelperCUcoredumpSettings cyvalue = utils.HelperCUcoredumpSettings(attrib, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr cdef size_t size = cyvalue.size() - err = cydriver.cuCoredumpSetAttribute(cyattrib, cyvalue_ptr, &size) + with nogil: + err = cydriver.cuCoredumpSetAttribute(cyattrib, cyvalue_ptr, &size) return (_dict_CUresult[err],) {{endif}} @@ -50018,7 +50552,8 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value): cdef utils.HelperCUcoredumpSettings cyvalue = utils.HelperCUcoredumpSettings(attrib, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr cdef size_t size = cyvalue.size() - err = cydriver.cuCoredumpSetAttributeGlobal(cyattrib, cyvalue_ptr, &size) + with nogil: + err = cydriver.cuCoredumpSetAttributeGlobal(cyattrib, cyvalue_ptr, &size) return (_dict_CUresult[err],) {{endif}} @@ -50042,7 +50577,8 @@ def cuGetExportTable(pExportTableId : Optional[CUuuid]): """ cdef void_ptr ppExportTable = 0 cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL - err = cydriver.cuGetExportTable(&ppExportTable, cypExportTableId_ptr) + with nogil: + err = cydriver.cuGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], ppExportTable) @@ -50117,7 +50653,8 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags): pdesc = int(CUdevResourceDesc(desc)) cydesc = pdesc cdef CUgreenCtx phCtx = CUgreenCtx() - err = cydriver.cuGreenCtxCreate(phCtx._pvt_ptr, cydesc, cydev, flags) + with nogil: + err = cydriver.cuGreenCtxCreate(phCtx._pvt_ptr, cydesc, cydev, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], 
None) return (_dict_CUresult[err], phCtx) @@ -50163,7 +50700,8 @@ def cuGreenCtxDestroy(hCtx): else: phCtx = int(CUgreenCtx(hCtx)) cyhCtx = phCtx - err = cydriver.cuGreenCtxDestroy(cyhCtx) + with nogil: + err = cydriver.cuGreenCtxDestroy(cyhCtx) return (_dict_CUresult[err],) {{endif}} @@ -50209,7 +50747,8 @@ def cuCtxFromGreenCtx(hCtx): phCtx = int(CUgreenCtx(hCtx)) cyhCtx = phCtx cdef CUcontext pContext = CUcontext() - err = cydriver.cuCtxFromGreenCtx(pContext._pvt_ptr, cyhCtx) + with nogil: + err = cydriver.cuCtxFromGreenCtx(pContext._pvt_ptr, cyhCtx) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pContext) @@ -50255,7 +50794,8 @@ def cuDeviceGetDevResource(device, typename not None : CUdevResourceType): cydevice = pdevice cdef CUdevResource resource = CUdevResource() cdef cydriver.CUdevResourceType cytypename = typename.value - err = cydriver.cuDeviceGetDevResource(cydevice, resource._pvt_ptr, cytypename) + with nogil: + err = cydriver.cuDeviceGetDevResource(cydevice, resource._pvt_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], resource) @@ -50298,7 +50838,8 @@ def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType): cyhCtx = phCtx cdef CUdevResource resource = CUdevResource() cdef cydriver.CUdevResourceType cytypename = typename.value - err = cydriver.cuCtxGetDevResource(cyhCtx, resource._pvt_ptr, cytypename) + with nogil: + err = cydriver.cuCtxGetDevResource(cyhCtx, resource._pvt_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], resource) @@ -50341,7 +50882,8 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType): cyhCtx = phCtx cdef CUdevResource resource = CUdevResource() cdef cydriver.CUdevResourceType cytypename = typename.value - err = cydriver.cuGreenCtxGetDevResource(cyhCtx, resource._pvt_ptr, cytypename) + with nogil: + err = cydriver.cuGreenCtxGetDevResource(cyhCtx, resource._pvt_ptr, cytypename) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], resource) @@ -50459,7 +51001,8 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe cdef unsigned int cynbGroups = nbGroups cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ != None else NULL cdef CUdevResource remaining = CUdevResource() - err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remaining._pvt_ptr, useFlags, minCount) + with nogil: + err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remaining._pvt_ptr, useFlags, minCount) if CUresult(err) == CUresult(0): for idx in range(nbGroups): string.memcpy((pyresult[idx])._pvt_ptr, &cyresult[idx], sizeof(cydriver.CUdevResource)) @@ -50519,15 +51062,18 @@ def cuDevResourceGenerateDesc(resources : Optional[Tuple[CUdevResource] | List[C raise TypeError("Argument 'resources' is not instance of type (expected Tuple[cydriver.CUdevResource,] or List[cydriver.CUdevResource,]") cdef CUdevResourceDesc phDesc = CUdevResourceDesc() cdef cydriver.CUdevResource* cyresources = NULL - if len(resources) > 0: + if len(resources) > 1: cyresources = <cydriver.CUdevResource*>calloc(len(resources), sizeof(cydriver.CUdevResource)) if cyresources is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(resources)) + 'x' + str(sizeof(cydriver.CUdevResource))) for idx in range(len(resources)): string.memcpy(&cyresources[idx],
(resources[idx])._pvt_ptr, sizeof(cydriver.CUdevResource)) + elif len(resources) == 1: + cyresources = (resources[0])._pvt_ptr if nbResources > len(resources): raise RuntimeError("List is too small: " + str(len(resources)) + " < " + str(nbResources)) - err = cydriver.cuDevResourceGenerateDesc(phDesc._pvt_ptr, (resources[0])._pvt_ptr if len(resources) == 1 else cyresources, nbResources) - if cyresources is not NULL: + with nogil: + err = cydriver.cuDevResourceGenerateDesc(phDesc._pvt_ptr, cyresources, nbResources) + if len(resources) > 1 and cyresources is not NULL: free(cyresources) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) @@ -50584,7 +51130,8 @@ def cuGreenCtxRecordEvent(hCtx, hEvent): else: phCtx = int(CUgreenCtx(hCtx)) cyhCtx = phCtx - err = cydriver.cuGreenCtxRecordEvent(cyhCtx, cyhEvent) + with nogil: + err = cydriver.cuGreenCtxRecordEvent(cyhCtx, cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -50638,7 +51185,8 @@ def cuGreenCtxWaitEvent(hCtx, hEvent): else: phCtx = int(CUgreenCtx(hCtx)) cyhCtx = phCtx - err = cydriver.cuGreenCtxWaitEvent(cyhCtx, cyhEvent) + with nogil: + err = cydriver.cuGreenCtxWaitEvent(cyhCtx, cyhEvent) return (_dict_CUresult[err],) {{endif}} @@ -50695,7 +51243,8 @@ def cuStreamGetGreenCtx(hStream): phStream = int(CUstream(hStream)) cyhStream = phStream cdef CUgreenCtx phCtx = CUgreenCtx() - err = cydriver.cuStreamGetGreenCtx(cyhStream, phCtx._pvt_ptr) + with nogil: + err = cydriver.cuStreamGetGreenCtx(cyhStream, phCtx._pvt_ptr) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phCtx) @@ -50769,7 +51318,8 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority): pgreenCtx = int(CUgreenCtx(greenCtx)) cygreenCtx = pgreenCtx cdef CUstream phStream = CUstream() - err = cydriver.cuGreenCtxStreamCreate(phStream._pvt_ptr, cygreenCtx, flags, priority) + with nogil: + err = cydriver.cuGreenCtxStreamCreate(phStream._pvt_ptr, cygreenCtx, flags, priority) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phStream) @@ -50812,7 +51362,8 @@ def cuGreenCtxGetId(greenCtx): pgreenCtx = int(CUgreenCtx(greenCtx)) cygreenCtx = pgreenCtx cdef unsigned long long greenCtxId = 0 - err = cydriver.cuGreenCtxGetId(cygreenCtx, &greenCtxId) + with nogil: + err = cydriver.cuGreenCtxGetId(cygreenCtx, &greenCtxId) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], greenCtxId) @@ -50873,7 +51424,6 @@ def cuLogsRegisterCallback(callbackFunc, userData): cdef CUlogsCallbackHandle callback_out = CUlogsCallbackHandle() with nogil: err = cydriver.cuLogsRegisterCallback(cuLogsCallbackWrapper, cbData, callback_out._pvt_ptr) - if err != cydriver.CUDA_SUCCESS: free(cbData) else: @@ -50907,7 +51457,8 @@ def cuLogsUnregisterCallback(callback): else: pcallback = int(CUlogsCallbackHandle(callback)) cycallback = pcallback - err = cydriver.cuLogsUnregisterCallback(cycallback) + with nogil: + err = cydriver.cuLogsUnregisterCallback(cycallback) if err == cydriver.CUDA_SUCCESS: free(m_global._allocated[pcallback]) m_global._allocated.erase(pcallback) @@ -50933,7 +51484,8 @@ def cuLogsCurrent(unsigned int flags): Location to store an iterator to the current tail of the logs """ cdef CUlogIterator iterator_out = CUlogIterator() - err = cydriver.cuLogsCurrent(iterator_out._pvt_ptr, flags) + with nogil: + err = cydriver.cuLogsCurrent(iterator_out._pvt_ptr, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) 
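The pattern repeated through these hunks is worth stating once: one-element tuples and lists borrow the wrapped object's existing storage instead of heap-allocating a copy, the driver entry point runs with the GIL released, and only buffers the wrapper itself allocated are freed afterwards. A minimal Cython sketch of that shape (invoke_with_resources and cuSomeCall are illustrative placeholders, not symbols in these bindings):

    from libc.stdlib cimport calloc, free
    from libc.string cimport memcpy

    cdef int invoke_with_resources(list resources) except -1:
        cdef size_t n = len(resources)   # hoisted: len() needs the GIL
        cdef cydriver.CUdevResource* buf = NULL
        cdef cydriver.CUresult err
        if n > 1:
            buf = <cydriver.CUdevResource*>calloc(n, sizeof(cydriver.CUdevResource))
            if buf is NULL:
                raise MemoryError()
            for idx in range(n):
                memcpy(&buf[idx], (<CUdevResource>resources[idx])._pvt_ptr, sizeof(cydriver.CUdevResource))
        elif n == 1:
            buf = (<CUdevResource>resources[0])._pvt_ptr   # borrowed, never freed here
        with nogil:
            err = cydriver.cuSomeCall(buf, <unsigned int>n)   # placeholder driver call
        if n > 1 and buf is not NULL:
            free(buf)   # free only what this function allocated
        return <int>err

Guarding both the calloc and the free with len(...) > 1 is what keeps the borrowed single-element pointer from ever reaching free().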
return (_dict_CUresult[err], iterator_out) @@ -50973,7 +51525,11 @@ def cuLogsDumpToFile(iterator : Optional[CUlogIterator], char* pathToFile, unsig The driver reserves limited memory for storing logs. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination output if the logs have been truncated. Call dump after each failed API to mitigate this risk. """ - err = cydriver.cuLogsDumpToFile(iterator._pvt_ptr if iterator != None else NULL, pathToFile, flags) + cdef cydriver.CUlogIterator* cyiterator = NULL + if iterator is not None: + cyiterator = iterator._pvt_ptr + with nogil: + err = cydriver.cuLogsDumpToFile(cyiterator, pathToFile, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], iterator) @@ -51027,7 +51583,11 @@ def cuLogsDumpToMemory(iterator : Optional[CUlogIterator], char* buffer, size_t If the provided value in `*size` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in `buffer` and writes it out. The final message in `buffer` will always be the most recent log message as of when the API is called. """ - err = cydriver.cuLogsDumpToMemory(iterator._pvt_ptr if iterator != None else NULL, buffer, &size, flags) + cdef cydriver.CUlogIterator* cyiterator = NULL + if iterator is not None: + cyiterator = iterator._pvt_ptr + with nogil: + err = cydriver.cuLogsDumpToMemory(cyiterator, buffer, &size, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None, None) return (_dict_CUresult[err], iterator, size) @@ -51055,7 +51615,8 @@ def cuCheckpointProcessGetRestoreThreadId(int pid): Returned restore thread ID """ cdef int tid = 0 - err = cydriver.cuCheckpointProcessGetRestoreThreadId(pid, &tid) + with nogil: + err = cydriver.cuCheckpointProcessGetRestoreThreadId(pid, &tid) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], tid) @@ -51083,7 +51644,8 @@ def cuCheckpointProcessGetState(int pid): Returned CUDA process state """ cdef cydriver.CUprocessState state - err = cydriver.cuCheckpointProcessGetState(pid, &state) + with nogil: + err = cydriver.cuCheckpointProcessGetState(pid, &state) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], CUprocessState(state)) @@ -51116,7 +51678,8 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]): :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` :py:obj:`~.CUDA_ERROR_NOT_READY` """ cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL - err = cydriver.cuCheckpointProcessLock(pid, cyargs_ptr) + with nogil: + err = cydriver.cuCheckpointProcessLock(pid, cyargs_ptr) return (_dict_CUresult[err],) {{endif}} @@ -51146,7 +51709,8 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL - err = cydriver.cuCheckpointProcessCheckpoint(pid, cyargs_ptr) + with nogil: + err = cydriver.cuCheckpointProcessCheckpoint(pid, cyargs_ptr) return
(_dict_CUresult[err],) {{endif}} @@ -51174,7 +51738,8 @@ def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]): :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL - err = cydriver.cuCheckpointProcessUnlock(pid, cyargs_ptr) + with nogil: + err = cydriver.cuCheckpointProcessUnlock(pid, cyargs_ptr) return (_dict_CUresult[err],) {{endif}} @@ -51201,7 +51766,8 @@ def cuProfilerStart(): -------- :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStop`, :py:obj:`~.cudaProfilerStart` """ - err = cydriver.cuProfilerStart() + with nogil: + err = cydriver.cuProfilerStart() return (_dict_CUresult[err],) {{endif}} @@ -51228,7 +51794,8 @@ def cuProfilerStop(): -------- :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStart`, :py:obj:`~.cudaProfilerStop` """ - err = cydriver.cuProfilerStop() + with nogil: + err = cydriver.cuProfilerStop() return (_dict_CUresult[err],) {{endif}} @@ -51301,7 +51868,8 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags): pimage = int(EGLImageKHR(image)) cyimage = pimage cdef CUgraphicsResource pCudaResource = CUgraphicsResource() - err = cydriver.cuGraphicsEGLRegisterImage(pCudaResource._pvt_ptr, cyimage, flags) + with nogil: + err = cydriver.cuGraphicsEGLRegisterImage(pCudaResource._pvt_ptr, cyimage, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCudaResource) @@ -51343,7 +51911,8 @@ def cuEGLStreamConsumerConnect(stream): pstream = int(EGLStreamKHR(stream)) cystream = pstream cdef CUeglStreamConnection conn = CUeglStreamConnection() - err = cydriver.cuEGLStreamConsumerConnect(conn._pvt_ptr, cystream) + with nogil: + err = cydriver.cuEGLStreamConsumerConnect(conn._pvt_ptr, cystream) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], conn) @@ -51389,7 +51958,8 @@ def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags): pstream = int(EGLStreamKHR(stream)) cystream = pstream cdef CUeglStreamConnection conn = CUeglStreamConnection() - err = cydriver.cuEGLStreamConsumerConnectWithFlags(conn._pvt_ptr, cystream, flags) + with nogil: + err = cydriver.cuEGLStreamConsumerConnectWithFlags(conn._pvt_ptr, cystream, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], conn) @@ -51427,7 +51997,8 @@ def cuEGLStreamConsumerDisconnect(conn): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - err = cydriver.cuEGLStreamConsumerDisconnect(cyconn) + with nogil: + err = cydriver.cuEGLStreamConsumerDisconnect(cyconn) return (_dict_CUresult[err],) {{endif}} @@ -51498,7 +52069,8 @@ def cuEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int t cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - err = cydriver.cuEGLStreamConsumerAcquireFrame(cyconn, cypCudaResource, cypStream, timeout) + with nogil: + err = cydriver.cuEGLStreamConsumerAcquireFrame(cyconn, cypCudaResource, cypStream, timeout) return (_dict_CUresult[err],) {{endif}} @@ -51560,7 +52132,8 @@ def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + 
str(type(conn))) - err = cydriver.cuEGLStreamConsumerReleaseFrame(cyconn, cypCudaResource, cypStream) + with nogil: + err = cydriver.cuEGLStreamConsumerReleaseFrame(cyconn, cypCudaResource, cypStream) return (_dict_CUresult[err],) {{endif}} @@ -51620,7 +52193,8 @@ def cuEGLStreamProducerConnect(stream, width, height): pstream = int(EGLStreamKHR(stream)) cystream = pstream cdef CUeglStreamConnection conn = CUeglStreamConnection() - err = cydriver.cuEGLStreamProducerConnect(conn._pvt_ptr, cystream, cywidth, cyheight) + with nogil: + err = cydriver.cuEGLStreamProducerConnect(conn._pvt_ptr, cystream, cywidth, cyheight) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], conn) @@ -51658,7 +52232,8 @@ def cuEGLStreamProducerDisconnect(conn): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - err = cydriver.cuEGLStreamProducerDisconnect(cyconn) + with nogil: + err = cydriver.cuEGLStreamProducerDisconnect(cyconn) return (_dict_CUresult[err],) {{endif}} @@ -51727,7 +52302,8 @@ def cuEGLStreamProducerPresentFrame(conn, eglframe not None : CUeglFrame, pStrea cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - err = cydriver.cuEGLStreamProducerPresentFrame(cyconn, eglframe._pvt_ptr[0], cypStream) + with nogil: + err = cydriver.cuEGLStreamProducerPresentFrame(cyconn, eglframe._pvt_ptr[0], cypStream) return (_dict_CUresult[err],) {{endif}} @@ -51781,7 +52357,8 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL - err = cydriver.cuEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) + with nogil: + err = cydriver.cuEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_CUresult[err],) {{endif}} @@ -51827,7 +52404,8 @@ def cuGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned i presource = int(CUgraphicsResource(resource)) cyresource = presource cdef CUeglFrame eglFrame = CUeglFrame() - err = cydriver.cuGraphicsResourceGetMappedEglFrame(eglFrame._pvt_ptr, cyresource, index, mipLevel) + with nogil: + err = cydriver.cuGraphicsResourceGetMappedEglFrame(eglFrame._pvt_ptr, cyresource, index, mipLevel) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], eglFrame) @@ -51885,7 +52463,8 @@ def cuEventCreateFromEGLSync(eglSync, unsigned int flags): peglSync = int(EGLSyncKHR(eglSync)) cyeglSync = peglSync cdef CUevent phEvent = CUevent() - err = cydriver.cuEventCreateFromEGLSync(phEvent._pvt_ptr, cyeglSync, flags) + with nogil: + err = cydriver.cuEventCreateFromEGLSync(phEvent._pvt_ptr, cyeglSync, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], phEvent) @@ -51941,7 +52520,8 @@ def cuGraphicsGLRegisterBuffer(buffer, unsigned int Flags): pbuffer = int(GLuint(buffer)) cybuffer = pbuffer cdef CUgraphicsResource pCudaResource = CUgraphicsResource() - err = cydriver.cuGraphicsGLRegisterBuffer(pCudaResource._pvt_ptr, cybuffer, Flags) + with nogil: + err = cydriver.cuGraphicsGLRegisterBuffer(pCudaResource._pvt_ptr, cybuffer, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCudaResource) @@ -52040,7 +52620,8 @@ def 
cuGraphicsGLRegisterImage(image, target, unsigned int Flags): pimage = int(GLuint(image)) cyimage = pimage cdef CUgraphicsResource pCudaResource = CUgraphicsResource() - err = cydriver.cuGraphicsGLRegisterImage(pCudaResource._pvt_ptr, cyimage, cytarget, Flags) + with nogil: + err = cydriver.cuGraphicsGLRegisterImage(pCudaResource._pvt_ptr, cyimage, cytarget, Flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCudaResource) @@ -52104,7 +52685,8 @@ def cuGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : CUGLDevic if cypCudaDevices is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(cudaDeviceCount) + 'x' + str(sizeof(cydriver.CUdevice))) cdef cydriver.CUGLDeviceList cydeviceList = deviceList.value - err = cydriver.cuGLGetDevices(&pCudaDeviceCount, cypCudaDevices, cudaDeviceCount, cydeviceList) + with nogil: + err = cydriver.cuGLGetDevices(&pCudaDeviceCount, cypCudaDevices, cudaDeviceCount, cydeviceList) if CUresult(err) == CUresult(0): pypCudaDevices = [CUdevice(init_value=cypCudaDevices[idx]) for idx in range(cudaDeviceCount)] if cypCudaDevices is not NULL: @@ -52160,7 +52742,8 @@ def cuVDPAUGetDevice(vdpDevice, vdpGetProcAddress): pvdpDevice = int(VdpDevice(vdpDevice)) cyvdpDevice = pvdpDevice cdef CUdevice pDevice = CUdevice() - err = cydriver.cuVDPAUGetDevice(pDevice._pvt_ptr, cyvdpDevice, cyvdpGetProcAddress) + with nogil: + err = cydriver.cuVDPAUGetDevice(pDevice._pvt_ptr, cyvdpDevice, cyvdpGetProcAddress) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pDevice) @@ -52227,7 +52810,8 @@ def cuVDPAUCtxCreate(unsigned int flags, device, vdpDevice, vdpGetProcAddress): pdevice = int(CUdevice(device)) cydevice = pdevice cdef CUcontext pCtx = CUcontext() - err = cydriver.cuVDPAUCtxCreate(pCtx._pvt_ptr, flags, cydevice, cyvdpDevice, cyvdpGetProcAddress) + with nogil: + err = cydriver.cuVDPAUCtxCreate(pCtx._pvt_ptr, flags, cydevice, cyvdpDevice, cyvdpGetProcAddress) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCtx) @@ -52289,7 +52873,8 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags): pvdpSurface = int(VdpVideoSurface(vdpSurface)) cyvdpSurface = pvdpSurface cdef CUgraphicsResource pCudaResource = CUgraphicsResource() - err = cydriver.cuGraphicsVDPAURegisterVideoSurface(pCudaResource._pvt_ptr, cyvdpSurface, flags) + with nogil: + err = cydriver.cuGraphicsVDPAURegisterVideoSurface(pCudaResource._pvt_ptr, cyvdpSurface, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCudaResource) @@ -52351,7 +52936,8 @@ def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags): pvdpSurface = int(VdpOutputSurface(vdpSurface)) cyvdpSurface = pvdpSurface cdef CUgraphicsResource pCudaResource = CUgraphicsResource() - err = cydriver.cuGraphicsVDPAURegisterOutputSurface(pCudaResource._pvt_ptr, cyvdpSurface, flags) + with nogil: + err = cydriver.cuGraphicsVDPAURegisterOutputSurface(pCudaResource._pvt_ptr, cyvdpSurface, flags) if err != cydriver.CUDA_SUCCESS: return (_dict_CUresult[err], None) return (_dict_CUresult[err], pCudaResource) diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index a30e54d061..34405105fd 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -136,7 +136,8 @@ def nvrtcGetErrorString(result not None : nvrtcResult): 
Message string for the given :py:obj:`~.nvrtcResult` code. """ cdef cynvrtc.nvrtcResult cyresult = result.value - err = cynvrtc.nvrtcGetErrorString(cyresult) + with nogil: + err = cynvrtc.nvrtcGetErrorString(cyresult) return (nvrtcResult.NVRTC_SUCCESS, err) {{endif}} @@ -158,7 +159,8 @@ def nvrtcVersion(): """ cdef int major = 0 cdef int minor = 0 - err = cynvrtc.nvrtcVersion(&major, &minor) + with nogil: + err = cynvrtc.nvrtcVersion(&major, &minor) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None, None) return (_dict_nvrtcResult[err], major, minor) @@ -181,7 +183,8 @@ def nvrtcGetNumSupportedArchs(): number of supported architectures. """ cdef int numArchs = 0 - err = cynvrtc.nvrtcGetNumSupportedArchs(&numArchs) + with nogil: + err = cynvrtc.nvrtcGetNumSupportedArchs(&numArchs) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], numArchs) @@ -207,7 +210,8 @@ def nvrtcGetSupportedArchs(): _, s = nvrtcGetNumSupportedArchs() supportedArchs.resize(s) - err = cynvrtc.nvrtcGetSupportedArchs(supportedArchs.data()) + with nogil: + err = cynvrtc.nvrtcGetSupportedArchs(supportedArchs.data()) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], supportedArchs) @@ -264,7 +268,8 @@ def nvrtcCreateProgram(char* src, char* name, int numHeaders, headers : Optional if numHeaders > len(includeNames): raise RuntimeError("List is too small: " + str(len(includeNames)) + " < " + str(numHeaders)) cdef vector[const char*] cyheaders = headers cdef vector[const char*] cyincludeNames = includeNames - err = cynvrtc.nvrtcCreateProgram(prog._pvt_ptr, src, name, numHeaders, cyheaders.data(), cyincludeNames.data()) + with nogil: + err = cynvrtc.nvrtcCreateProgram(prog._pvt_ptr, src, name, numHeaders, cyheaders.data(), cyincludeNames.data()) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], prog) @@ -301,7 +306,8 @@ def nvrtcDestroyProgram(prog): cyprog = prog else: raise TypeError("Argument 'prog' is not instance of type (expected , found " + str(type(prog))) - err = cynvrtc.nvrtcDestroyProgram(cyprog) + with nogil: + err = cynvrtc.nvrtcDestroyProgram(cyprog) return (_dict_nvrtcResult[err],) {{endif}} @@ -350,7 +356,8 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[Tuple[bytes] | cyprog = pprog if numOptions > len(options): raise RuntimeError("List is too small: " + str(len(options)) + " < " + str(numOptions)) cdef vector[const char*] cyoptions = options - err = cynvrtc.nvrtcCompileProgram(cyprog, numOptions, cyoptions.data()) + with nogil: + err = cynvrtc.nvrtcCompileProgram(cyprog, numOptions, cyoptions.data()) return (_dict_nvrtcResult[err],) {{endif}} @@ -387,7 +394,8 @@ def nvrtcGetPTXSize(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t ptxSizeRet = 0 - err = cynvrtc.nvrtcGetPTXSize(cyprog, &ptxSizeRet) + with nogil: + err = cynvrtc.nvrtcGetPTXSize(cyprog, &ptxSizeRet) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], ptxSizeRet) @@ -425,7 +433,8 @@ def nvrtcGetPTX(prog, char* ptx): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetPTX(cyprog, ptx) + with nogil: + err = cynvrtc.nvrtcGetPTX(cyprog, ptx) return (_dict_nvrtcResult[err],) {{endif}} @@ -462,7 +471,8 @@ def nvrtcGetCUBINSize(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t cubinSizeRet = 0 - err = cynvrtc.nvrtcGetCUBINSize(cyprog, &cubinSizeRet) + 
with nogil: + err = cynvrtc.nvrtcGetCUBINSize(cyprog, &cubinSizeRet) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], cubinSizeRet) @@ -500,7 +510,8 @@ def nvrtcGetCUBIN(prog, char* cubin): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetCUBIN(cyprog, cubin) + with nogil: + err = cynvrtc.nvrtcGetCUBIN(cyprog, cubin) return (_dict_nvrtcResult[err],) {{endif}} @@ -537,7 +548,8 @@ def nvrtcGetLTOIRSize(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t LTOIRSizeRet = 0 - err = cynvrtc.nvrtcGetLTOIRSize(cyprog, &LTOIRSizeRet) + with nogil: + err = cynvrtc.nvrtcGetLTOIRSize(cyprog, &LTOIRSizeRet) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], LTOIRSizeRet) @@ -575,7 +587,8 @@ def nvrtcGetLTOIR(prog, char* LTOIR): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetLTOIR(cyprog, LTOIR) + with nogil: + err = cynvrtc.nvrtcGetLTOIR(cyprog, LTOIR) return (_dict_nvrtcResult[err],) {{endif}} @@ -612,7 +625,8 @@ def nvrtcGetOptiXIRSize(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t optixirSizeRet = 0 - err = cynvrtc.nvrtcGetOptiXIRSize(cyprog, &optixirSizeRet) + with nogil: + err = cynvrtc.nvrtcGetOptiXIRSize(cyprog, &optixirSizeRet) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], optixirSizeRet) @@ -650,7 +664,8 @@ def nvrtcGetOptiXIR(prog, char* optixir): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetOptiXIR(cyprog, optixir) + with nogil: + err = cynvrtc.nvrtcGetOptiXIR(cyprog, optixir) return (_dict_nvrtcResult[err],) {{endif}} @@ -690,7 +705,8 @@ def nvrtcGetProgramLogSize(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t logSizeRet = 0 - err = cynvrtc.nvrtcGetProgramLogSize(cyprog, &logSizeRet) + with nogil: + err = cynvrtc.nvrtcGetProgramLogSize(cyprog, &logSizeRet) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], logSizeRet) @@ -728,7 +744,8 @@ def nvrtcGetProgramLog(prog, char* log): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetProgramLog(cyprog, log) + with nogil: + err = cynvrtc.nvrtcGetProgramLog(cyprog, log) return (_dict_nvrtcResult[err],) {{endif}} @@ -769,7 +786,8 @@ def nvrtcAddNameExpression(prog, char* name_expression): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcAddNameExpression(cyprog, name_expression) + with nogil: + err = cynvrtc.nvrtcAddNameExpression(cyprog, name_expression) return (_dict_nvrtcResult[err],) {{endif}} @@ -811,7 +829,8 @@ def nvrtcGetLoweredName(prog, char* name_expression): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef const char* lowered_name = NULL - err = cynvrtc.nvrtcGetLoweredName(cyprog, name_expression, &lowered_name) + with nogil: + err = cynvrtc.nvrtcGetLoweredName(cyprog, name_expression, &lowered_name) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], lowered_name if lowered_name != NULL else None) @@ -832,7 +851,8 @@ def nvrtcGetPCHHeapSize(): pointer to location where the size of the PCH Heap will be stored """ cdef size_t ret = 0 - err = cynvrtc.nvrtcGetPCHHeapSize(&ret) + with nogil: + err = cynvrtc.nvrtcGetPCHHeapSize(&ret) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], ret)
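The size/getter pairs above compose into a short caller-side flow. A usage sketch (the tiny kernel source and the compute_75 architecture flag are illustrative only, and error handling is abbreviated):

    from cuda.bindings import nvrtc

    src = b"__global__ void k() {}"
    err, prog = nvrtc.nvrtcCreateProgram(src, b"k.cu", 0, [], [])
    err, = nvrtc.nvrtcCompileProgram(prog, 1, [b"--gpu-architecture=compute_75"])
    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
        _, log_size = nvrtc.nvrtcGetProgramLogSize(prog)
        log = b" " * log_size                 # preallocated, filled in place
        nvrtc.nvrtcGetProgramLog(prog, log)
        raise RuntimeError(log.decode())
    err, ptx_size = nvrtc.nvrtcGetPTXSize(prog)
    ptx = b" " * ptx_size                     # preallocated, filled by the getter
    err, = nvrtc.nvrtcGetPTX(prog, ptx)
    err, = nvrtc.nvrtcDestroyProgram(prog)

@@ -858,7 +878,8 @@ def nvrtcSetPCHHeapSize(size_t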
size): nvrtcResult - :py:obj:`~.NVRTC_SUCCESS` """ - err = cynvrtc.nvrtcSetPCHHeapSize(size) + with nogil: + err = cynvrtc.nvrtcSetPCHHeapSize(size) return (_dict_nvrtcResult[err],) {{endif}} @@ -905,7 +926,8 @@ def nvrtcGetPCHCreateStatus(prog): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - err = cynvrtc.nvrtcGetPCHCreateStatus(cyprog) + with nogil: + err = cynvrtc.nvrtcGetPCHCreateStatus(cyprog) return (_dict_nvrtcResult[err],) {{endif}} @@ -939,7 +961,8 @@ def nvrtcGetPCHHeapSizeRequired(prog): pprog = int(nvrtcProgram(prog)) cyprog = pprog cdef size_t size = 0 - err = cynvrtc.nvrtcGetPCHHeapSizeRequired(cyprog, &size) + with nogil: + err = cynvrtc.nvrtcGetPCHHeapSizeRequired(cyprog, &size) if err != cynvrtc.NVRTC_SUCCESS: return (_dict_nvrtcResult[err], None) return (_dict_nvrtcResult[err], size) @@ -1001,7 +1024,8 @@ def nvrtcSetFlowCallback(prog, callback, payload): cdef void* cycallback_ptr = cycallback.cptr cypayload = utils.HelperInputVoidPtr(payload) cdef void* cypayload_ptr = cypayload.cptr - err = cynvrtc.nvrtcSetFlowCallback(cyprog, cycallback_ptr, cypayload_ptr) + with nogil: + err = cynvrtc.nvrtcSetFlowCallback(cyprog, cycallback_ptr, cypayload_ptr) return (_dict_nvrtcResult[err],) {{endif}} diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index ec00a16e67..939e1dfcc1 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -18401,7 +18401,8 @@ def cudaDeviceReset(): If a non-primary :py:obj:`~.CUcontext` is current to the thread, :py:obj:`~.cudaDeviceReset()` will destroy only the internal CUDA RT state for that :py:obj:`~.CUcontext`. """ - err = cyruntime.cudaDeviceReset() + with nogil: + err = cyruntime.cudaDeviceReset() return (_dict_cudaError_t[err],) {{endif}} @@ -18429,7 +18430,6 @@ def cudaDeviceSynchronize(): """ with nogil: err = cyruntime.cudaDeviceSynchronize() - return (_dict_cudaError_t[err],) {{endif}} @@ -18531,7 +18531,8 @@ def cudaDeviceSetLimit(limit not None : cudaLimit, size_t value): :py:obj:`~.cudaDeviceGetLimit`, :py:obj:`~.cuCtxSetLimit` """ cdef cyruntime.cudaLimit cylimit = limit.value - err = cyruntime.cudaDeviceSetLimit(cylimit, value) + with nogil: + err = cyruntime.cudaDeviceSetLimit(cylimit, value) return (_dict_cudaError_t[err],) {{endif}} @@ -18588,7 +18589,8 @@ def cudaDeviceGetLimit(limit not None : cudaLimit): """ cdef size_t pValue = 0 cdef cyruntime.cudaLimit cylimit = limit.value - err = cyruntime.cudaDeviceGetLimit(&pValue, cylimit) + with nogil: + err = cyruntime.cudaDeviceGetLimit(&pValue, cylimit) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pValue) @@ -18625,7 +18627,8 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes """ cdef size_t maxWidthInElements = 0 cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc != None else NULL - err = cyruntime.cudaDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cyfmtDesc_ptr, device) + with nogil: + err = cyruntime.cudaDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cyfmtDesc_ptr, device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], maxWidthInElements) @@ -18674,7 +18677,8 @@ def cudaDeviceGetCacheConfig(): :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxGetCacheConfig` """ cdef cyruntime.cudaFuncCache pCacheConfig - 
err = cyruntime.cudaDeviceGetCacheConfig(&pCacheConfig) + with nogil: + err = cyruntime.cudaDeviceGetCacheConfig(&pCacheConfig) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaFuncCache(pCacheConfig)) @@ -18720,7 +18724,8 @@ def cudaDeviceGetStreamPriorityRange(): """ cdef int leastPriority = 0 cdef int greatestPriority = 0 - err = cyruntime.cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority) + with nogil: + err = cyruntime.cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], leastPriority, greatestPriority) @@ -18779,7 +18784,8 @@ def cudaDeviceSetCacheConfig(cacheConfig not None : cudaFuncCache): :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxSetCacheConfig` """ cdef cyruntime.cudaFuncCache cycacheConfig = cacheConfig.value - err = cyruntime.cudaDeviceSetCacheConfig(cycacheConfig) + with nogil: + err = cyruntime.cudaDeviceSetCacheConfig(cycacheConfig) return (_dict_cudaError_t[err],) {{endif}} @@ -18811,7 +18817,8 @@ def cudaDeviceGetByPCIBusId(char* pciBusId): :py:obj:`~.cudaDeviceGetPCIBusId`, :py:obj:`~.cuDeviceGetByPCIBusId` """ cdef int device = 0 - err = cyruntime.cudaDeviceGetByPCIBusId(&device, pciBusId) + with nogil: + err = cyruntime.cudaDeviceGetByPCIBusId(&device, pciBusId) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], device) @@ -18851,7 +18858,8 @@ def cudaDeviceGetPCIBusId(int length, int device): """ pypciBusId = b" " * length cdef char* pciBusId = pypciBusId - err = cyruntime.cudaDeviceGetPCIBusId(pciBusId, length, device) + with nogil: + err = cyruntime.cudaDeviceGetPCIBusId(pciBusId, length, device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pypciBusId) @@ -18911,7 +18919,8 @@ def cudaIpcGetEventHandle(event): pevent = int(cudaEvent_t(event)) cyevent = pevent cdef cudaIpcEventHandle_t handle = cudaIpcEventHandle_t() - err = cyruntime.cudaIpcGetEventHandle(handle._pvt_ptr, cyevent) + with nogil: + err = cyruntime.cudaIpcGetEventHandle(handle._pvt_ptr, cyevent) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], handle) @@ -18957,7 +18966,8 @@ def cudaIpcOpenEventHandle(handle not None : cudaIpcEventHandle_t): :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cuIpcOpenEventHandle` """ cdef cudaEvent_t event = cudaEvent_t() - err = cyruntime.cudaIpcOpenEventHandle(event._pvt_ptr, handle._pvt_ptr[0]) + with nogil: + err = cyruntime.cudaIpcOpenEventHandle(event._pvt_ptr, handle._pvt_ptr[0]) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], event) @@ -19006,7 +19016,8 @@ def cudaIpcGetMemHandle(devPtr): cdef cudaIpcMemHandle_t handle = cudaIpcMemHandle_t() cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaIpcGetMemHandle(handle._pvt_ptr, cydevPtr_ptr) + with nogil: + err = cyruntime.cudaIpcGetMemHandle(handle._pvt_ptr, cydevPtr_ptr) if err != cyruntime.cudaSuccess: return 
(_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], handle) @@ -19078,7 +19089,8 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag In particular, multiple processes may not receive the same address for the same `handle`. """ cdef void_ptr devPtr = 0 - err = cyruntime.cudaIpcOpenMemHandle(&devPtr, handle._pvt_ptr[0], flags) + with nogil: + err = cyruntime.cudaIpcOpenMemHandle(&devPtr, handle._pvt_ptr[0], flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], devPtr) @@ -19122,7 +19134,8 @@ def cudaIpcCloseMemHandle(devPtr): """ cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaIpcCloseMemHandle(cydevPtr_ptr) + with nogil: + err = cyruntime.cudaIpcCloseMemHandle(cydevPtr_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -19164,7 +19177,8 @@ def cudaDeviceFlushGPUDirectRDMAWrites(target not None : cudaFlushGPUDirectRDMAW """ cdef cyruntime.cudaFlushGPUDirectRDMAWritesTarget cytarget = target.value cdef cyruntime.cudaFlushGPUDirectRDMAWritesScope cyscope = scope.value - err = cyruntime.cudaDeviceFlushGPUDirectRDMAWrites(cytarget, cyscope) + with nogil: + err = cyruntime.cudaDeviceFlushGPUDirectRDMAWrites(cytarget, cyscope) return (_dict_cudaError_t[err],) {{endif}} @@ -19246,7 +19260,6 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): cdef cudaAsyncCallbackHandle_t callback = cudaAsyncCallbackHandle_t() with nogil: err = cyruntime.cudaDeviceRegisterAsyncNotification(device, cudaAsyncNotificationCallbackWrapper, cbData, callback._pvt_ptr) - if err != cyruntime.cudaSuccess: free(cbData) else: @@ -19290,7 +19303,8 @@ def cudaDeviceUnregisterAsyncNotification(int device, callback): else: pcallback = int(cudaAsyncCallbackHandle_t(callback)) cycallback = pcallback - err = cyruntime.cudaDeviceUnregisterAsyncNotification(device, cycallback) + with nogil: + err = cyruntime.cudaDeviceUnregisterAsyncNotification(device, cycallback) if err == cyruntime.cudaSuccess: free(m_global._allocated[pcallback]) m_global._allocated.erase(pcallback) @@ -19334,7 +19348,8 @@ def cudaDeviceGetSharedMemConfig(): :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuCtxGetSharedMemConfig` """ cdef cyruntime.cudaSharedMemConfig pConfig - err = cyruntime.cudaDeviceGetSharedMemConfig(&pConfig) + with nogil: + err = cyruntime.cudaDeviceGetSharedMemConfig(&pConfig) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaSharedMemConfig(pConfig)) @@ -19392,7 +19407,8 @@ def cudaDeviceSetSharedMemConfig(config not None : cudaSharedMemConfig): :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuCtxSetSharedMemConfig` """ cdef cyruntime.cudaSharedMemConfig cyconfig = config.value - err = cyruntime.cudaDeviceSetSharedMemConfig(cyconfig) + with nogil: + err = cyruntime.cudaDeviceSetSharedMemConfig(cyconfig) return (_dict_cudaError_t[err],) {{endif}} @@ -19419,7 +19435,8 @@ def cudaGetLastError(): -------- :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaGetErrorName`, :py:obj:`~.cudaGetErrorString`, :py:obj:`~.cudaError` """ - err = cyruntime.cudaGetLastError() + with nogil: + err = cyruntime.cudaGetLastError() return (_dict_cudaError_t[err],) {{endif}} @@ -19447,7 +19464,8 @@ 
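The next hunk gives cudaPeekAtLastError the same treatment as cudaGetLastError above. From the caller's side the pair behaves as in this usage sketch (peek reads the sticky error without clearing it; get returns the same code and clears it):

    from cuda.bindings import runtime

    err, = runtime.cudaPeekAtLastError()          # non-destructive read
    if err != runtime.cudaError_t.cudaSuccess:
        _, name = runtime.cudaGetErrorName(err)
        print("pending error:", name.decode())
        err, = runtime.cudaGetLastError()         # same code, now cleared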
def cudaPeekAtLastError(): -------- :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaGetErrorName`, :py:obj:`~.cudaGetErrorString`, :py:obj:`~.cudaError` """ - err = cyruntime.cudaPeekAtLastError() + with nogil: + err = cyruntime.cudaPeekAtLastError() return (_dict_cudaError_t[err],) {{endif}} @@ -19478,7 +19496,8 @@ def cudaGetErrorName(error not None : cudaError_t): :py:obj:`~.cudaGetErrorString`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :py:obj:`~.cuGetErrorName` """ cdef cyruntime.cudaError_t cyerror = error.value - err = cyruntime.cudaGetErrorName(cyerror) + with nogil: + err = cyruntime.cudaGetErrorName(cyerror) return (cudaError_t.cudaSuccess, err) {{endif}} @@ -19508,7 +19527,8 @@ def cudaGetErrorString(error not None : cudaError_t): :py:obj:`~.cudaGetErrorName`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :py:obj:`~.cuGetErrorString` """ cdef cyruntime.cudaError_t cyerror = error.value - err = cyruntime.cudaGetErrorString(cyerror) + with nogil: + err = cyruntime.cudaGetErrorString(cyerror) return (cudaError_t.cudaSuccess, err) {{endif}} @@ -19534,7 +19554,8 @@ def cudaGetDeviceCount(): :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetCount` """ cdef int count = 0 - err = cyruntime.cudaGetDeviceCount(&count) + with nogil: + err = cyruntime.cudaGetDeviceCount(&count) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], count) @@ -19565,7 +19586,8 @@ def cudaGetDeviceProperties(int device): :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName` """ cdef cudaDeviceProp prop = cudaDeviceProp() - err = cyruntime.cudaGetDeviceProperties(prop._pvt_ptr, device) + with nogil: + err = cyruntime.cudaGetDeviceProperties(prop._pvt_ptr, device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], prop) @@ -19600,7 +19622,8 @@ def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device): """ cdef int value = 0 cdef cyruntime.cudaDeviceAttr cyattr = attr.value - err = cyruntime.cudaDeviceGetAttribute(&value, cyattr, device) + with nogil: + err = cyruntime.cudaDeviceGetAttribute(&value, cyattr, device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], value) @@ -19658,7 +19681,8 @@ def cudaDeviceGetHostAtomicCapabilities(operations : Optional[Tuple[cudaAtomicOp raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) cdef vector[cyruntime.cudaAtomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) - err = cyruntime.cudaDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, device) + with nogil: + err = cyruntime.cudaDeviceGetHostAtomicCapabilities(cycapabilities, cyoperations.data(), count, device) if cudaError_t(err) == cudaError_t(0): pycapabilities = [cycapabilities[idx] for idx in range(count)] if cycapabilities is not NULL: @@ -19696,7 +19720,6 @@ def cudaDeviceGetDefaultMemPool(int device): cdef cudaMemPool_t memPool = cudaMemPool_t() 
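cudaDeviceGetDefaultMemPool, whose body continues below, and the pool setter that follows it share the tuple-return convention of every wrapper in this file. A caller-side sketch (assumes device 0 exists and supports memory pools):

    from cuda.bindings import runtime

    err, count = runtime.cudaGetDeviceCount()
    assert err == runtime.cudaError_t.cudaSuccess and count > 0
    err, pool = runtime.cudaDeviceGetDefaultMemPool(0)
    if err == runtime.cudaError_t.cudaSuccess:
        err, = runtime.cudaDeviceSetMemPool(0, pool)   # round-trip to the default pool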
with nogil: err = cyruntime.cudaDeviceGetDefaultMemPool(memPool._pvt_ptr, device) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -19744,7 +19767,6 @@ def cudaDeviceSetMemPool(int device, memPool): cymemPool = pmemPool with nogil: err = cyruntime.cudaDeviceSetMemPool(device, cymemPool) - return (_dict_cudaError_t[err],) {{endif}} @@ -19780,7 +19802,6 @@ def cudaDeviceGetMemPool(int device): cdef cudaMemPool_t memPool = cudaMemPool_t() with nogil: err = cyruntime.cudaDeviceGetMemPool(memPool._pvt_ptr, device) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -19867,7 +19888,8 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags): """ cynvSciSyncAttrList = utils.HelperInputVoidPtr(nvSciSyncAttrList) cdef void* cynvSciSyncAttrList_ptr = cynvSciSyncAttrList.cptr - err = cyruntime.cudaDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, device, flags) + with nogil: + err = cyruntime.cudaDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, device, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -19929,7 +19951,8 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice, """ cdef int value = 0 cdef cyruntime.cudaDeviceP2PAttr cyattr = attr.value - err = cyruntime.cudaDeviceGetP2PAttribute(&value, cyattr, srcDevice, dstDevice) + with nogil: + err = cyruntime.cudaDeviceGetP2PAttribute(&value, cyattr, srcDevice, dstDevice) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], value) @@ -19991,7 +20014,8 @@ def cudaDeviceGetP2PAtomicCapabilities(operations : Optional[Tuple[cudaAtomicOpe raise MemoryError('Failed to allocate length x size memory: ' + str(count) + 'x' + str(sizeof(unsigned int))) cdef vector[cyruntime.cudaAtomicOperation] cyoperations = [pyoperations.value for pyoperations in (operations)] if count > len(operations): raise RuntimeError("List is too small: " + str(len(operations)) + " < " + str(count)) - err = cyruntime.cudaDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, srcDevice, dstDevice) + with nogil: + err = cyruntime.cudaDeviceGetP2PAtomicCapabilities(cycapabilities, cyoperations.data(), count, srcDevice, dstDevice) if cudaError_t(err) == cudaError_t(0): pycapabilities = [cycapabilities[idx] for idx in range(count)] if cycapabilities is not NULL: @@ -20028,7 +20052,8 @@ def cudaChooseDevice(prop : Optional[cudaDeviceProp]): """ cdef int device = 0 cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL - err = cyruntime.cudaChooseDevice(&device, cyprop_ptr) + with nogil: + err = cyruntime.cudaChooseDevice(&device, cyprop_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], device) @@ -20072,7 +20097,8 @@ def cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags): -------- :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaSetDevice` :py:obj:`~.cuCtxSetCurrent` """ - err = cyruntime.cudaInitDevice(device, deviceFlags, flags) + with nogil: + err = cyruntime.cudaInitDevice(device, deviceFlags, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -20127,7 +20153,8 @@ def cudaSetDevice(int device): -------- :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, 
:py:obj:`~.cudaInitDevice`, :py:obj:`~.cuCtxSetCurrent` """ - err = cyruntime.cudaSetDevice(device) + with nogil: + err = cyruntime.cudaSetDevice(device) return (_dict_cudaError_t[err],) {{endif}} @@ -20152,7 +20179,8 @@ def cudaGetDevice(): :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cuCtxGetCurrent` """ cdef int device = 0 - err = cyruntime.cudaGetDevice(&device) + with nogil: + err = cyruntime.cudaGetDevice(&device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], device) @@ -20240,7 +20268,8 @@ def cudaSetDeviceFlags(unsigned int flags): -------- :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetValidDevices`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cuDevicePrimaryCtxSetFlags` """ - err = cyruntime.cudaSetDeviceFlags(flags) + with nogil: + err = cyruntime.cudaSetDeviceFlags(flags) return (_dict_cudaError_t[err],) {{endif}} @@ -20287,7 +20316,8 @@ def cudaGetDeviceFlags(): :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuDevicePrimaryCtxGetState` """ cdef unsigned int flags = 0 - err = cyruntime.cudaGetDeviceFlags(&flags) + with nogil: + err = cyruntime.cudaGetDeviceFlags(&flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], flags) @@ -20316,7 +20346,8 @@ def cudaStreamCreate(): :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` """ cdef cudaStream_t pStream = cudaStream_t() - err = cyruntime.cudaStreamCreate(pStream._pvt_ptr) + with nogil: + err = cyruntime.cudaStreamCreate(pStream._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pStream) @@ -20359,7 +20390,8 @@ def cudaStreamCreateWithFlags(unsigned int flags): :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate` """ cdef cudaStream_t pStream = cudaStream_t() - err = cyruntime.cudaStreamCreateWithFlags(pStream._pvt_ptr, flags) + with nogil: + err = cyruntime.cudaStreamCreateWithFlags(pStream._pvt_ptr, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pStream) @@ -20419,7 +20451,8 @@ def cudaStreamCreateWithPriority(unsigned int flags, int priority): In the current implementation, only compute kernels launched in priority streams are affected by the stream's priority. Stream priorities have no effect on host-to-device and device-to-host memory operations. 
""" cdef cudaStream_t pStream = cudaStream_t() - err = cyruntime.cudaStreamCreateWithPriority(pStream._pvt_ptr, flags, priority) + with nogil: + err = cyruntime.cudaStreamCreateWithPriority(pStream._pvt_ptr, flags, priority) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pStream) @@ -20464,7 +20497,8 @@ def cudaStreamGetPriority(hStream): phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef int priority = 0 - err = cyruntime.cudaStreamGetPriority(cyhStream, &priority) + with nogil: + err = cyruntime.cudaStreamGetPriority(cyhStream, &priority) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], priority) @@ -20505,7 +20539,8 @@ def cudaStreamGetFlags(hStream): phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef unsigned int flags = 0 - err = cyruntime.cudaStreamGetFlags(cyhStream, &flags) + with nogil: + err = cyruntime.cudaStreamGetFlags(cyhStream, &flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], flags) @@ -20560,7 +20595,8 @@ def cudaStreamGetId(hStream): phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef unsigned long long streamId = 0 - err = cyruntime.cudaStreamGetId(cyhStream, &streamId) + with nogil: + err = cyruntime.cudaStreamGetId(cyhStream, &streamId) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], streamId) @@ -20599,7 +20635,8 @@ def cudaStreamGetDevice(hStream): phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef int device = 0 - err = cyruntime.cudaStreamGetDevice(cyhStream, &device) + with nogil: + err = cyruntime.cudaStreamGetDevice(cyhStream, &device) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], device) @@ -20623,7 +20660,8 @@ def cudaCtxResetPersistingL2Cache(): -------- :py:obj:`~.cudaAccessPolicyWindow` """ - err = cyruntime.cudaCtxResetPersistingL2Cache() + with nogil: + err = cyruntime.cudaCtxResetPersistingL2Cache() return (_dict_cudaError_t[err],) {{endif}} @@ -20668,7 +20706,8 @@ def cudaStreamCopyAttributes(dst, src): else: pdst = int(cudaStream_t(dst)) cydst = pdst - err = cyruntime.cudaStreamCopyAttributes(cydst, cysrc) + with nogil: + err = cyruntime.cudaStreamCopyAttributes(cydst, cysrc) return (_dict_cudaError_t[err],) {{endif}} @@ -20709,7 +20748,8 @@ def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID): cyhStream = phStream cdef cyruntime.cudaStreamAttrID cyattr = attr.value cdef cudaStreamAttrValue value_out = cudaStreamAttrValue() - err = cyruntime.cudaStreamGetAttribute(cyhStream, cyattr, value_out._pvt_ptr) + with nogil: + err = cyruntime.cudaStreamGetAttribute(cyhStream, cyattr, value_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], value_out) @@ -20753,7 +20793,8 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op cyhStream = phStream cdef cyruntime.cudaStreamAttrID cyattr = attr.value cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL - err = cyruntime.cudaStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) + with nogil: + err = cyruntime.cudaStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -20792,7 +20833,8 @@ def cudaStreamDestroy(stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - err = 
cyruntime.cudaStreamDestroy(cystream) + with nogil: + err = cyruntime.cudaStreamDestroy(cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -20851,7 +20893,6 @@ def cudaStreamWaitEvent(stream, event, unsigned int flags): cystream = pstream with nogil: err = cyruntime.cudaStreamWaitEvent(cystream, cyevent, flags) - return (_dict_cudaError_t[err],) {{endif}} @@ -20967,7 +21008,6 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags): with nogil: err = cyruntime.cudaStreamAddCallback(cystream, cudaStreamRtCallbackWrapper, cbData, flags) - if err != cyruntime.cudaSuccess: free(cbData) return (_dict_cudaError_t[err],) @@ -21008,7 +21048,6 @@ def cudaStreamSynchronize(stream): cystream = pstream with nogil: err = cyruntime.cudaStreamSynchronize(cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -21047,7 +21086,8 @@ def cudaStreamQuery(stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - err = cyruntime.cudaStreamQuery(cystream) + with nogil: + err = cyruntime.cudaStreamQuery(cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -21157,7 +21197,8 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags): cystream = pstream cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaStreamAttachMemAsync(cystream, cydevPtr_ptr, length, flags) + with nogil: + err = cyruntime.cudaStreamAttachMemAsync(cystream, cydevPtr_ptr, length, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -21213,7 +21254,8 @@ def cudaStreamBeginCapture(stream, mode not None : cudaStreamCaptureMode): pstream = int(cudaStream_t(stream)) cystream = pstream cdef cyruntime.cudaStreamCaptureMode cymode = mode.value - err = cyruntime.cudaStreamBeginCapture(cystream, cymode) + with nogil: + err = cyruntime.cudaStreamBeginCapture(cystream, cymode) return (_dict_cudaError_t[err],) {{endif}} @@ -21293,26 +21335,31 @@ def cudaStreamBeginCaptureToGraph(stream, graph, dependencies : Optional[Tuple[c pstream = int(cudaStream_t(stream)) cystream = pstream cdef cyruntime.cudaGraphNode_t* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = <cyruntime.cudaGraphNode_t*>calloc(len(dependencies), sizeof(cyruntime.cudaGraphNode_t)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr cdef cyruntime.cudaGraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = <cyruntime.cudaGraphEdgeData*>calloc(len(dependencyData), sizeof(cyruntime.cudaGraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) + elif len(dependencyData) == 1: + cydependencyData = (dependencyData[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaStreamCaptureMode cymode = mode.value - err = cyruntime.cudaStreamBeginCaptureToGraph(cystream, cygraph, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr
if len(dependencyData) == 1 else cydependencyData, numDependencies, cymode) - if cydependencies is not NULL: + with nogil: + err = cyruntime.cudaStreamBeginCaptureToGraph(cystream, cygraph, cydependencies, cydependencyData, numDependencies, cymode) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) return (_dict_cudaError_t[err],) {{endif}} @@ -21384,7 +21431,8 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode): :py:obj:`~.cudaStreamBeginCapture` """ cdef cyruntime.cudaStreamCaptureMode cymode = mode.value - err = cyruntime.cudaThreadExchangeStreamCaptureMode(&cymode) + with nogil: + err = cyruntime.cudaThreadExchangeStreamCaptureMode(&cymode) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaStreamCaptureMode(cymode)) @@ -21431,7 +21479,8 @@ def cudaStreamEndCapture(stream): pstream = int(cudaStream_t(stream)) cystream = pstream cdef cudaGraph_t pGraph = cudaGraph_t() - err = cyruntime.cudaStreamEndCapture(cystream, pGraph._pvt_ptr) + with nogil: + err = cyruntime.cudaStreamEndCapture(cystream, pGraph._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraph) @@ -21492,7 +21541,8 @@ def cudaStreamIsCapturing(stream): pstream = int(cudaStream_t(stream)) cystream = pstream cdef cyruntime.cudaStreamCaptureStatus pCaptureStatus - err = cyruntime.cudaStreamIsCapturing(cystream, &pCaptureStatus) + with nogil: + err = cyruntime.cudaStreamIsCapturing(cystream, &pCaptureStatus) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaStreamCaptureStatus(pCaptureStatus)) @@ -21587,7 +21637,8 @@ def cudaStreamGetCaptureInfo(stream): cdef const cyruntime.cudaGraphEdgeData* cyedgeData_out = NULL pyedgeData_out = [] cdef size_t numDependencies_out = 0 - err = cyruntime.cudaStreamGetCaptureInfo(cystream, &captureStatus_out, &id_out, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) + with nogil: + err = cyruntime.cudaStreamGetCaptureInfo(cystream, &captureStatus_out, &id_out, graph_out._pvt_ptr, &cydependencies_out, &cyedgeData_out, &numDependencies_out) if cudaError_t(err) == cudaError_t(0): pydependencies_out = [cudaGraphNode_t(init_value=cydependencies_out[idx]) for idx in range(numDependencies_out)] if cudaError_t(err) == cudaError_t(0): @@ -21657,24 +21708,29 @@ def cudaStreamUpdateCaptureDependencies(stream, dependencies : Optional[Tuple[cu pstream = int(cudaStream_t(stream)) cystream = pstream cdef cyruntime.cudaGraphNode_t* cydependencies = NULL - if len(dependencies) > 0: + if len(dependencies) > 1: cydependencies = calloc(len(dependencies), sizeof(cyruntime.cudaGraphNode_t)) if cydependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(dependencies)): cydependencies[idx] = (dependencies[idx])._pvt_ptr[0] + elif len(dependencies) == 1: + cydependencies = (dependencies[0])._pvt_ptr cdef cyruntime.cudaGraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = calloc(len(dependencyData), sizeof(cyruntime.cudaGraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 
'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaStreamUpdateCaptureDependencies(cystream, (dependencies[0])._pvt_ptr if len(dependencies) == 1 else cydependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, flags) - if cydependencies is not NULL: + elif len(dependencyData) == 1: + cydependencyData = (dependencyData[0])._pvt_ptr + with nogil: + err = cyruntime.cudaStreamUpdateCaptureDependencies(cystream, cydependencies, cydependencyData, numDependencies, flags) + if len(dependencies) > 1 and cydependencies is not NULL: free(cydependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) return (_dict_cudaError_t[err],) {{endif}} @@ -21700,7 +21756,8 @@ def cudaEventCreate(): cudaEventCreate (C++ API), :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuEventCreate` """ cdef cudaEvent_t event = cudaEvent_t() - err = cyruntime.cudaEventCreate(event._pvt_ptr) + with nogil: + err = cyruntime.cudaEventCreate(event._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], event) @@ -21751,7 +21808,8 @@ def cudaEventCreateWithFlags(unsigned int flags): :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuEventCreate` """ cdef cudaEvent_t event = cudaEvent_t() - err = cyruntime.cudaEventCreateWithFlags(event._pvt_ptr, flags) + with nogil: + err = cyruntime.cudaEventCreateWithFlags(event._pvt_ptr, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], event) @@ -21811,7 +21869,8 @@ def cudaEventRecord(event, stream): else: pevent = int(cudaEvent_t(event)) cyevent = pevent - err = cyruntime.cudaEventRecord(cyevent, cystream) + with nogil: + err = cyruntime.cudaEventRecord(cyevent, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -21878,7 +21937,8 @@ def cudaEventRecordWithFlags(event, stream, unsigned int flags): else: pevent = int(cudaEvent_t(event)) cyevent = pevent - err = cyruntime.cudaEventRecordWithFlags(cyevent, cystream, flags) + with nogil: + err = cyruntime.cudaEventRecordWithFlags(cyevent, cystream, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -21922,7 +21982,8 @@ def cudaEventQuery(event): else: pevent = int(cudaEvent_t(event)) cyevent = pevent - err = cyruntime.cudaEventQuery(cyevent) + with nogil: + err = cyruntime.cudaEventQuery(cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -21965,7 +22026,8 @@ def cudaEventSynchronize(event): else: pevent = int(cudaEvent_t(event)) cyevent = pevent - err = cyruntime.cudaEventSynchronize(cyevent) + with nogil: + err = cyruntime.cudaEventSynchronize(cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -22005,7 +22067,8 @@ def cudaEventDestroy(event): else: pevent = int(cudaEvent_t(event)) cyevent = pevent - err = cyruntime.cudaEventDestroy(cyevent) + with nogil: + err = cyruntime.cudaEventDestroy(cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -22075,7 +22138,8 @@ def cudaEventElapsedTime(start, end): pstart = 
int(cudaEvent_t(start)) cystart = pstart cdef float ms = 0 - err = cyruntime.cudaEventElapsedTime(&ms, cystart, cyend) + with nogil: + err = cyruntime.cudaEventElapsedTime(&ms, cystart, cyend) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ms) @@ -22228,7 +22292,8 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe """ cdef cudaExternalMemory_t extMem_out = cudaExternalMemory_t() cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL - err = cyruntime.cudaImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) + with nogil: + err = cyruntime.cudaImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], extMem_out) @@ -22295,7 +22360,8 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal cyextMem = pextMem cdef void_ptr devPtr = 0 cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL - err = cyruntime.cudaExternalMemoryGetMappedBuffer(&devPtr, cyextMem, cybufferDesc_ptr) + with nogil: + err = cyruntime.cudaExternalMemoryGetMappedBuffer(&devPtr, cyextMem, cybufferDesc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], devPtr) @@ -22366,7 +22432,8 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda cyextMem = pextMem cdef cudaMipmappedArray_t mipmap = cudaMipmappedArray_t() cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL - err = cyruntime.cudaExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) + with nogil: + err = cyruntime.cudaExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], mipmap) @@ -22405,7 +22472,8 @@ def cudaDestroyExternalMemory(extMem): else: pextMem = int(cudaExternalMemory_t(extMem)) cyextMem = pextMem - err = cyruntime.cudaDestroyExternalMemory(cyextMem) + with nogil: + err = cyruntime.cudaDestroyExternalMemory(cyextMem) return (_dict_cudaError_t[err],) {{endif}} @@ -22551,7 +22619,8 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa """ cdef cudaExternalSemaphore_t extSem_out = cudaExternalSemaphore_t() cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL - err = cyruntime.cudaImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) + with nogil: + err = cyruntime.cudaImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], extSem_out) @@ -22684,26 +22753,31 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalS if not all(isinstance(_x, (cudaExternalSemaphore_t,)) for _x in extSemArray): raise TypeError("Argument 'extSemArray' is not instance of type (expected Tuple[cyruntime.cudaExternalSemaphore_t,] or List[cyruntime.cudaExternalSemaphore_t,]") cdef cyruntime.cudaExternalSemaphore_t* cyextSemArray = NULL - if len(extSemArray) > 0: + if len(extSemArray) > 1: cyextSemArray = calloc(len(extSemArray), 
sizeof(cyruntime.cudaExternalSemaphore_t)) if cyextSemArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(extSemArray)) + 'x' + str(sizeof(cyruntime.cudaExternalSemaphore_t))) else: for idx in range(len(extSemArray)): cyextSemArray[idx] = (extSemArray[idx])._pvt_ptr[0] + elif len(extSemArray) == 1: + cyextSemArray = (extSemArray[0])._pvt_ptr cdef cyruntime.cudaExternalSemaphoreSignalParams* cyparamsArray = NULL - if len(paramsArray) > 0: + if len(paramsArray) > 1: cyparamsArray = calloc(len(paramsArray), sizeof(cyruntime.cudaExternalSemaphoreSignalParams)) if cyparamsArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramsArray)) + 'x' + str(sizeof(cyruntime.cudaExternalSemaphoreSignalParams))) for idx in range(len(paramsArray)): string.memcpy(&cyparamsArray[idx], (paramsArray[idx])._pvt_ptr, sizeof(cyruntime.cudaExternalSemaphoreSignalParams)) + elif len(paramsArray) == 1: + cyparamsArray = (paramsArray[0])._pvt_ptr if numExtSems > len(extSemArray): raise RuntimeError("List is too small: " + str(len(extSemArray)) + " < " + str(numExtSems)) if numExtSems > len(paramsArray): raise RuntimeError("List is too small: " + str(len(paramsArray)) + " < " + str(numExtSems)) - err = cyruntime.cudaSignalExternalSemaphoresAsync((extSemArray[0])._pvt_ptr if len(extSemArray) == 1 else cyextSemArray, (paramsArray[0])._pvt_ptr if len(paramsArray) == 1 else cyparamsArray, numExtSems, cystream) - if cyextSemArray is not NULL: + with nogil: + err = cyruntime.cudaSignalExternalSemaphoresAsync(cyextSemArray, cyparamsArray, numExtSems, cystream) + if len(extSemArray) > 1 and cyextSemArray is not NULL: free(cyextSemArray) - if cyparamsArray is not NULL: + if len(paramsArray) > 1 and cyparamsArray is not NULL: free(cyparamsArray) return (_dict_cudaError_t[err],) {{endif}} @@ -22808,26 +22882,31 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[Tuple[cudaExternalSem if not all(isinstance(_x, (cudaExternalSemaphore_t,)) for _x in extSemArray): raise TypeError("Argument 'extSemArray' is not instance of type (expected Tuple[cyruntime.cudaExternalSemaphore_t,] or List[cyruntime.cudaExternalSemaphore_t,]") cdef cyruntime.cudaExternalSemaphore_t* cyextSemArray = NULL - if len(extSemArray) > 0: + if len(extSemArray) > 1: cyextSemArray = calloc(len(extSemArray), sizeof(cyruntime.cudaExternalSemaphore_t)) if cyextSemArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(extSemArray)) + 'x' + str(sizeof(cyruntime.cudaExternalSemaphore_t))) else: for idx in range(len(extSemArray)): cyextSemArray[idx] = (extSemArray[idx])._pvt_ptr[0] + elif len(extSemArray) == 1: + cyextSemArray = (extSemArray[0])._pvt_ptr cdef cyruntime.cudaExternalSemaphoreWaitParams* cyparamsArray = NULL - if len(paramsArray) > 0: + if len(paramsArray) > 1: cyparamsArray = calloc(len(paramsArray), sizeof(cyruntime.cudaExternalSemaphoreWaitParams)) if cyparamsArray is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(paramsArray)) + 'x' + str(sizeof(cyruntime.cudaExternalSemaphoreWaitParams))) for idx in range(len(paramsArray)): string.memcpy(&cyparamsArray[idx], (paramsArray[idx])._pvt_ptr, sizeof(cyruntime.cudaExternalSemaphoreWaitParams)) + elif len(paramsArray) == 1: + cyparamsArray = (paramsArray[0])._pvt_ptr if numExtSems > len(extSemArray): raise RuntimeError("List is too small: " + str(len(extSemArray)) + " < " + str(numExtSems)) if numExtSems > len(paramsArray): raise RuntimeError("List is too small: " + 
str(len(paramsArray)) + " < " + str(numExtSems)) - err = cyruntime.cudaWaitExternalSemaphoresAsync((extSemArray[0])._pvt_ptr if len(extSemArray) == 1 else cyextSemArray, (paramsArray[0])._pvt_ptr if len(paramsArray) == 1 else cyparamsArray, numExtSems, cystream) - if cyextSemArray is not NULL: + with nogil: + err = cyruntime.cudaWaitExternalSemaphoresAsync(cyextSemArray, cyparamsArray, numExtSems, cystream) + if len(extSemArray) > 1 and cyextSemArray is not NULL: free(cyextSemArray) - if cyparamsArray is not NULL: + if len(paramsArray) > 1 and cyparamsArray is not NULL: free(cyparamsArray) return (_dict_cudaError_t[err],) {{endif}} @@ -22864,7 +22943,8 @@ def cudaDestroyExternalSemaphore(extSem): else: pextSem = int(cudaExternalSemaphore_t(extSem)) cyextSem = pextSem - err = cyruntime.cudaDestroyExternalSemaphore(cyextSem) + with nogil: + err = cyruntime.cudaDestroyExternalSemaphore(cyextSem) return (_dict_cudaError_t[err],) {{endif}} @@ -22930,7 +23010,8 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache): cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr cdef cyruntime.cudaFuncCache cycacheConfig = cacheConfig.value - err = cyruntime.cudaFuncSetCacheConfig(cyfunc_ptr, cycacheConfig) + with nogil: + err = cyruntime.cudaFuncSetCacheConfig(cyfunc_ptr, cycacheConfig) return (_dict_cudaError_t[err],) {{endif}} @@ -22971,7 +23052,8 @@ def cudaFuncGetAttributes(func): cdef cudaFuncAttributes attr = cudaFuncAttributes() cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr - err = cyruntime.cudaFuncGetAttributes(attr._pvt_ptr, cyfunc_ptr) + with nogil: + err = cyruntime.cudaFuncGetAttributes(attr._pvt_ptr, cyfunc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], attr) @@ -23058,7 +23140,8 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value): cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr cdef cyruntime.cudaFuncAttribute cyattr = attr.value - err = cyruntime.cudaFuncSetAttribute(cyfunc_ptr, cyattr, value) + with nogil: + err = cyruntime.cudaFuncSetAttribute(cyfunc_ptr, cyattr, value) return (_dict_cudaError_t[err],) {{endif}} @@ -23167,7 +23250,6 @@ def cudaLaunchHostFunc(stream, fn, userData): with nogil: err = cyruntime.cudaLaunchHostFunc(cystream, cudaStreamRtHostCallbackWrapper, cbData) - if err != cyruntime.cudaSuccess: free(cbData) return (_dict_cudaError_t[err],) @@ -23236,7 +23318,8 @@ def cudaFuncSetSharedMemConfig(func, config not None : cudaSharedMemConfig): cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr cdef cyruntime.cudaSharedMemConfig cyconfig = config.value - err = cyruntime.cudaFuncSetSharedMemConfig(cyfunc_ptr, cyconfig) + with nogil: + err = cyruntime.cudaFuncSetSharedMemConfig(cyfunc_ptr, cyconfig) return (_dict_cudaError_t[err],) {{endif}} @@ -23272,7 +23355,8 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dy cdef int numBlocks = 0 cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr - err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize) + with nogil: + err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], numBlocks) @@ -23310,7 +23394,8 @@ def cudaOccupancyAvailableDynamicSMemPerBlock(func, 
int numBlocks, int blockSize cdef size_t dynamicSmemSize = 0 cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr - err = cyruntime.cudaOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc_ptr, numBlocks, blockSize) + with nogil: + err = cyruntime.cudaOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc_ptr, numBlocks, blockSize) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], dynamicSmemSize) @@ -23365,7 +23450,8 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, cdef int numBlocks = 0 cyfunc = utils.HelperInputVoidPtr(func) cdef void* cyfunc_ptr = cyfunc.cptr - err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize, flags) + with nogil: + err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], numBlocks) @@ -23500,7 +23586,6 @@ def cudaMallocManaged(size_t size, unsigned int flags): cdef void_ptr devPtr = 0 with nogil: err = cyruntime.cudaMallocManaged(&devPtr, size, flags) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], devPtr) @@ -23540,7 +23625,6 @@ def cudaMalloc(size_t size): cdef void_ptr devPtr = 0 with nogil: err = cyruntime.cudaMalloc(&devPtr, size) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], devPtr) @@ -23587,7 +23671,8 @@ def cudaMallocHost(size_t size): :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, cudaMallocHost (C++ API), :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemAllocHost` """ cdef void_ptr ptr = 0 - err = cyruntime.cudaMallocHost(&ptr, size) + with nogil: + err = cyruntime.cudaMallocHost(&ptr, size) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ptr) @@ -23642,7 +23727,8 @@ def cudaMallocPitch(size_t width, size_t height): """ cdef void_ptr devPtr = 0 cdef size_t pitch = 0 - err = cyruntime.cudaMallocPitch(&devPtr, &pitch, width, height) + with nogil: + err = cyruntime.cudaMallocPitch(&devPtr, &pitch, width, height) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], devPtr, pitch) @@ -23720,7 +23806,6 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMallocArray(array._pvt_ptr, cydesc_ptr, width, height, flags) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], array) @@ -23773,7 +23858,6 @@ def cudaFree(devPtr): cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: err = cyruntime.cudaFree(cydevPtr_ptr) - return (_dict_cudaError_t[err],) {{endif}} @@ -23805,7 +23889,6 @@ def cudaFreeHost(ptr): cdef void* cyptr_ptr = cyptr.cptr with nogil: err = cyruntime.cudaFreeHost(cyptr_ptr) - return (_dict_cudaError_t[err],) {{endif}} @@ -23843,7 +23926,6 @@ def cudaFreeArray(array): cyarray = parray with nogil: err = cyruntime.cudaFreeArray(cyarray) - return (_dict_cudaError_t[err],) 
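
The change repeated throughout these hunks has a single shape: the blocking `cyruntime` call is moved under `with nogil:` so that other Python threads can keep running while the CUDA runtime blocks (stream synchronization, host-memory registration, large copies, and so on). Cython permits only C-level operations inside such a block, which is why each wrapper first unpacks its Python arguments into typed C locals and only then releases the GIL. A minimal sketch of that shape, with hypothetical names throughout (`my_blocking_call` stands in for any `cyruntime` entry point and is not part of these bindings):

    # shape.pyx -- sketch of the wrapper pattern; all names hypothetical
    cdef extern from *:
        """
        /* stand-in for a blocking CUDA runtime call */
        static int my_blocking_call(int x) { return x; }
        """
        int my_blocking_call(int x) nogil

    def wrapper(x):
        # Python-object work (argument unpacking, pulling raw pointers
        # into typed C locals) happens here, while the GIL is held...
        cdef int cyx = x
        cdef int err
        with nogil:
            # ...because nothing in this block may touch a Python object.
            # While the C call blocks, other Python threads can run.
            err = my_blocking_call(cyx)
        return err

The same restriction explains the hoisted pointer assignments in the later hunks (for example `cdef const void** cydsts_ptr = voidStarHelperdsts.cptr` in `cudaMemcpyBatchAsync`): attribute access on a Python helper object is not allowed inside `with nogil:`, so the raw pointer is extracted into a C local before the block is entered.
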
 {{endif}}

@@ -23879,7 +23961,8 @@ def cudaFreeMipmappedArray(mipmappedArray):
     else:
         pmipmappedArray = int(cudaMipmappedArray_t(mipmappedArray))
         cymipmappedArray = pmipmappedArray
-    err = cyruntime.cudaFreeMipmappedArray(cymipmappedArray)
+    with nogil:
+        err = cyruntime.cudaFreeMipmappedArray(cymipmappedArray)
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -23963,7 +24046,6 @@ def cudaHostAlloc(size_t size, unsigned int flags):
     cdef void_ptr pHost = 0
     with nogil:
         err = cyruntime.cudaHostAlloc(&pHost, size, flags)
-
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], pHost)
@@ -24083,7 +24165,6 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
     cdef void* cyptr_ptr = cyptr.cptr
     with nogil:
         err = cyruntime.cudaHostRegister(cyptr_ptr, size, flags)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24117,7 +24198,6 @@ def cudaHostUnregister(ptr):
     cdef void* cyptr_ptr = cyptr.cptr
     with nogil:
         err = cyruntime.cudaHostUnregister(cyptr_ptr)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24176,7 +24256,8 @@ def cudaHostGetDevicePointer(pHost, unsigned int flags):
     cdef void_ptr pDevice = 0
     cypHost = utils.HelperInputVoidPtr(pHost)
     cdef void* cypHost_ptr = cypHost.cptr
-    err = cyruntime.cudaHostGetDevicePointer(&pDevice, cypHost_ptr, flags)
+    with nogil:
+        err = cyruntime.cudaHostGetDevicePointer(&pDevice, cypHost_ptr, flags)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], pDevice)
@@ -24210,7 +24291,8 @@ def cudaHostGetFlags(pHost):
     cdef unsigned int pFlags = 0
     cypHost = utils.HelperInputVoidPtr(pHost)
     cdef void* cypHost_ptr = cypHost.cptr
-    err = cyruntime.cudaHostGetFlags(&pFlags, cypHost_ptr)
+    with nogil:
+        err = cyruntime.cudaHostGetFlags(&pFlags, cypHost_ptr)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], pFlags)
@@ -24258,7 +24340,8 @@ def cudaMalloc3D(extent not None : cudaExtent):
     :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch`
     """
     cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr()
-    err = cyruntime.cudaMalloc3D(pitchedDevPtr._pvt_ptr, extent._pvt_ptr[0])
+    with nogil:
+        err = cyruntime.cudaMalloc3D(pitchedDevPtr._pvt_ptr, extent._pvt_ptr[0])
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], pitchedDevPtr)
@@ -24382,7 +24465,6 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL
     with nogil:
         err = cyruntime.cudaMalloc3DArray(array._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], flags)
-
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], array)
@@ -24507,7 +24589,8 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
     """
     cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL
-    err = cyruntime.cudaMallocMipmappedArray(mipmappedArray._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], numLevels, flags)
+    with nogil:
+        err = cyruntime.cudaMallocMipmappedArray(mipmappedArray._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], numLevels, flags)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], mipmappedArray)
@@ -24555,7 +24638,8 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
         pmipmappedArray = int(cudaMipmappedArray_const_t(mipmappedArray))
         cymipmappedArray = pmipmappedArray
     cdef cudaArray_t levelArray = cudaArray_t()
-    err = cyruntime.cudaGetMipmappedArrayLevel(levelArray._pvt_ptr, cymipmappedArray, level)
+    with nogil:
+        err = cyruntime.cudaGetMipmappedArrayLevel(levelArray._pvt_ptr, cymipmappedArray, level)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], levelArray)
@@ -24642,7 +24726,6 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3D(cyp_ptr)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24678,7 +24761,8 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]):
     :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeer`
     """
     cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL
-    err = cyruntime.cudaMemcpy3DPeer(cyp_ptr)
+    with nogil:
+        err = cyruntime.cudaMemcpy3DPeer(cyp_ptr)
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24784,7 +24868,6 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL
     with nogil:
         err = cyruntime.cudaMemcpy3DAsync(cyp_ptr, cystream)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24823,7 +24906,8 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
         pstream = int(cudaStream_t(stream))
         cystream = pstream
     cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL
-    err = cyruntime.cudaMemcpy3DPeerAsync(cyp_ptr, cystream)
+    with nogil:
+        err = cyruntime.cudaMemcpy3DPeerAsync(cyp_ptr, cystream)
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -24865,7 +24949,8 @@ def cudaMemGetInfo():
     """
     cdef size_t free = 0
     cdef size_t total = 0
-    err = cyruntime.cudaMemGetInfo(&free, &total)
+    with nogil:
+        err = cyruntime.cudaMemGetInfo(&free, &total)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None, None)
     return (_dict_cudaError_t[err], free, total)
@@ -24913,7 +24998,8 @@ def cudaArrayGetInfo(array):
     cdef cudaChannelFormatDesc desc = cudaChannelFormatDesc()
     cdef cudaExtent extent = cudaExtent()
     cdef unsigned int flags = 0
-    err = cyruntime.cudaArrayGetInfo(desc._pvt_ptr, extent._pvt_ptr, &flags, cyarray)
+    with nogil:
+        err = cyruntime.cudaArrayGetInfo(desc._pvt_ptr, extent._pvt_ptr, &flags, cyarray)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None, None, None)
     return (_dict_cudaError_t[err], desc, extent, flags)
@@ -24968,7 +25054,8 @@ def cudaArrayGetPlane(hArray, unsigned int planeIdx):
         phArray = int(cudaArray_t(hArray))
         cyhArray = phArray
     cdef cudaArray_t pPlaneArray = cudaArray_t()
-    err = cyruntime.cudaArrayGetPlane(pPlaneArray._pvt_ptr, cyhArray, planeIdx)
+    with nogil:
+        err = cyruntime.cudaArrayGetPlane(pPlaneArray._pvt_ptr, cyhArray, planeIdx)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], pPlaneArray)
@@ -25017,7 +25104,8 @@ def cudaArrayGetMemoryRequirements(array, int device):
         parray = int(cudaArray_t(array))
         cyarray = parray
     cdef cudaArrayMemoryRequirements memoryRequirements = cudaArrayMemoryRequirements()
-    err = cyruntime.cudaArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cyarray, device)
+    with nogil:
+        err = cyruntime.cudaArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cyarray, device)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], memoryRequirements)
@@ -25066,7 +25154,8 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
         pmipmap = int(cudaMipmappedArray_t(mipmap))
        cymipmap = pmipmap
     cdef cudaArrayMemoryRequirements memoryRequirements = cudaArrayMemoryRequirements()
-    err = cyruntime.cudaMipmappedArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cymipmap, device)
+    with nogil:
+        err = cyruntime.cudaMipmappedArrayGetMemoryRequirements(memoryRequirements._pvt_ptr, cymipmap, device)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], memoryRequirements)
@@ -25121,7 +25210,8 @@ def cudaArrayGetSparseProperties(array):
         parray = int(cudaArray_t(array))
         cyarray = parray
     cdef cudaArraySparseProperties sparseProperties = cudaArraySparseProperties()
-    err = cyruntime.cudaArrayGetSparseProperties(sparseProperties._pvt_ptr, cyarray)
+    with nogil:
+        err = cyruntime.cudaArrayGetSparseProperties(sparseProperties._pvt_ptr, cyarray)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], sparseProperties)
@@ -25176,7 +25266,8 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
         pmipmap = int(cudaMipmappedArray_t(mipmap))
         cymipmap = pmipmap
     cdef cudaArraySparseProperties sparseProperties = cudaArraySparseProperties()
-    err = cyruntime.cudaMipmappedArrayGetSparseProperties(sparseProperties._pvt_ptr, cymipmap)
+    with nogil:
+        err = cyruntime.cudaMipmappedArrayGetSparseProperties(sparseProperties._pvt_ptr, cymipmap)
     if err != cyruntime.cudaSuccess:
         return (_dict_cudaError_t[err], None)
     return (_dict_cudaError_t[err], sparseProperties)
@@ -25229,7 +25320,6 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
     cdef cyruntime.cudaMemcpyKind cykind = kind.value
     with nogil:
        err = cyruntime.cudaMemcpy(cydst_ptr, cysrc_ptr, count, cykind)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -25278,7 +25368,6 @@ def cudaMemcpyPeer(dst, int dstDevice, src, int srcDevice, size_t count):
     cdef void* cysrc_ptr = cysrc.cptr
     with nogil:
         err = cyruntime.cudaMemcpyPeer(cydst_ptr, dstDevice, cysrc_ptr, srcDevice, count)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -25339,7 +25428,6 @@ def cudaMemcpy2D(dst, size_t dpitch, src, size_t spitch, size_t width, size_t he
     cdef cyruntime.cudaMemcpyKind cykind = kind.value
     with nogil:
         err = cyruntime.cudaMemcpy2D(cydst_ptr, dpitch, cysrc_ptr, spitch, width, height, cykind)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -25407,7 +25495,6 @@ def cudaMemcpy2DToArray(dst, size_t wOffset, size_t hOffset, src, size_t spitch,
     cdef cyruntime.cudaMemcpyKind cykind = kind.value
     with nogil:
         err = cyruntime.cudaMemcpy2DToArray(cydst, wOffset, hOffset, cysrc_ptr, spitch, width, height, cykind)
-
     return (_dict_cudaError_t[err],)
 {{endif}}

@@ -25475,7 +25562,6 @@ def cudaMemcpy2DFromArray(dst, size_t dpitch, src, size_t wOffset, size_t hOffse
     cdef cyruntime.cudaMemcpyKind cykind = kind.value
     with nogil:
         err = cyruntime.cudaMemcpy2DFromArray(cydst_ptr, dpitch, cysrc, wOffset, hOffset, width, height, cykind)
-
     return (_dict_cudaError_t[err],)
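
The other recurring change in this file is the array-marshalling fast path. A temporary buffer is now `calloc`'d only when a sequence holds more than one element (`len(...) > 1`); a single-element sequence instead borrows the element's own `_pvt_ptr`, saving an allocation and a copy in the common case, and the matching `free` is guarded by the same length test so a borrowed pointer is never freed. The call site also becomes uniform (for example `cudaMemPoolSetAccess(cymemPool, cydescList, count)` replaces the earlier inline conditional expression). Reduced to a self-contained sketch with invented names (`Item`, `Wrapper`, and `_ptr` are stand-ins for the generated structs and `_pvt_ptr`, not real binding types):

    from libc.stdlib cimport calloc, free
    from libc.string cimport memcpy

    cdef struct Item:
        int value

    cdef class Wrapper:
        cdef Item _ptr[1]          # plays the role of the real _pvt_ptr storage

    cdef long csum(const Item* p, size_t n) nogil:
        # Stand-in for the C API call that consumes the marshalled array.
        cdef long s = 0
        cdef size_t i
        for i in range(n):
            s += p[i].value
        return s

    def consume(items):
        cdef Item* citems = NULL
        cdef size_t n = len(items)
        cdef long s
        if n > 1:
            # Heap copy only in the multi-element case.
            citems = <Item*>calloc(n, sizeof(Item))
            if citems is NULL:
                raise MemoryError()
            for idx in range(n):
                memcpy(&citems[idx], (<Wrapper?>items[idx])._ptr, sizeof(Item))
        elif n == 1:
            # Borrow the single wrapper's internal pointer: no allocation,
            # no copy, nothing to free afterwards.
            citems = (<Wrapper?>items[0])._ptr
        try:
            with nogil:
                s = csum(citems, n)
        finally:
            # Free only what was calloc'd; never the borrowed pointer.
            if n > 1 and citems is not NULL:
                free(citems)
        return s
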
{{endif}} @@ -25547,7 +25633,8 @@ def cudaMemcpy2DArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, siz pdst = int(cudaArray_t(dst)) cydst = pdst cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpy2DArrayToArray(cydst, wOffsetDst, hOffsetDst, cysrc, wOffsetSrc, hOffsetSrc, width, height, cykind) + with nogil: + err = cyruntime.cudaMemcpy2DArrayToArray(cydst, wOffsetDst, hOffsetDst, cysrc, wOffsetSrc, hOffsetSrc, width, height, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -25618,7 +25705,6 @@ def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stre cdef cyruntime.cudaMemcpyKind cykind = kind.value with nogil: err = cyruntime.cudaMemcpyAsync(cydst_ptr, cysrc_ptr, count, cykind, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -25675,7 +25761,6 @@ def cudaMemcpyPeerAsync(dst, int dstDevice, src, int srcDevice, size_t count, st cdef void* cysrc_ptr = cysrc.cptr with nogil: err = cyruntime.cudaMemcpyPeerAsync(cydst_ptr, dstDevice, cysrc_ptr, srcDevice, count, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -25802,24 +25887,29 @@ def cudaMemcpyBatchAsync(dsts : Optional[Tuple[Any] | List[Any]], srcs : Optiona dsts = [] if dsts is None else dsts pylist = [utils.HelperInputVoidPtr(pydsts) for pydsts in dsts] cdef utils.InputVoidPtrPtrHelper voidStarHelperdsts = utils.InputVoidPtrPtrHelper(pylist) + cdef const void** cydsts_ptr = voidStarHelperdsts.cptr pylist = [utils.HelperInputVoidPtr(pysrcs) for pysrcs in srcs] cdef utils.InputVoidPtrPtrHelper voidStarHelpersrcs = utils.InputVoidPtrPtrHelper(pylist) + cdef const void** cysrcs_ptr = voidStarHelpersrcs.cptr cdef vector[size_t] cysizes = sizes if count > len(dsts): raise RuntimeError("List is too small: " + str(len(dsts)) + " < " + str(count)) if count > len(srcs): raise RuntimeError("List is too small: " + str(len(srcs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) cdef cyruntime.cudaMemcpyAttributes* cyattrs = NULL - if len(attrs) > 0: + if len(attrs) > 1: cyattrs = calloc(len(attrs), sizeof(cyruntime.cudaMemcpyAttributes)) if cyattrs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(attrs)) + 'x' + str(sizeof(cyruntime.cudaMemcpyAttributes))) for idx in range(len(attrs)): string.memcpy(&cyattrs[idx], (attrs[idx])._pvt_ptr, sizeof(cyruntime.cudaMemcpyAttributes)) + elif len(attrs) == 1: + cyattrs = (attrs[0])._pvt_ptr cdef vector[size_t] cyattrsIdxs = attrsIdxs if numAttrs > len(attrs): raise RuntimeError("List is too small: " + str(len(attrs)) + " < " + str(numAttrs)) if numAttrs > len(attrsIdxs): raise RuntimeError("List is too small: " + str(len(attrsIdxs)) + " < " + str(numAttrs)) - err = cyruntime.cudaMemcpyBatchAsync(voidStarHelperdsts.cptr, voidStarHelpersrcs.cptr, cysizes.data(), count, (attrs[0])._pvt_ptr if len(attrs) == 1 else cyattrs, cyattrsIdxs.data(), numAttrs, cystream) - if cyattrs is not NULL: + with nogil: + err = cyruntime.cudaMemcpyBatchAsync(cydsts_ptr, cysrcs_ptr, cysizes.data(), count, cyattrs, cyattrsIdxs.data(), numAttrs, cystream) + if len(attrs) > 1 and cyattrs is not NULL: free(cyattrs) return (_dict_cudaError_t[err],) {{endif}} @@ -25939,14 +26029,17 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[Tuple[cudaMemcpy3DBa raise TypeError("Argument 'opList' is not instance of type (expected Tuple[cyruntime.cudaMemcpy3DBatchOp,] or List[cyruntime.cudaMemcpy3DBatchOp,]") if numOps > len(opList): raise 
RuntimeError("List is too small: " + str(len(opList)) + " < " + str(numOps)) cdef cyruntime.cudaMemcpy3DBatchOp* cyopList = NULL - if len(opList) > 0: + if len(opList) > 1: cyopList = calloc(len(opList), sizeof(cyruntime.cudaMemcpy3DBatchOp)) if cyopList is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(opList)) + 'x' + str(sizeof(cyruntime.cudaMemcpy3DBatchOp))) for idx in range(len(opList)): string.memcpy(&cyopList[idx], (opList[idx])._pvt_ptr, sizeof(cyruntime.cudaMemcpy3DBatchOp)) - err = cyruntime.cudaMemcpy3DBatchAsync(numOps, (opList[0])._pvt_ptr if len(opList) == 1 else cyopList, flags, cystream) - if cyopList is not NULL: + elif len(opList) == 1: + cyopList = (opList[0])._pvt_ptr + with nogil: + err = cyruntime.cudaMemcpy3DBatchAsync(numOps, cyopList, flags, cystream) + if len(opList) > 1 and cyopList is not NULL: free(cyopList) return (_dict_cudaError_t[err],) {{endif}} @@ -26029,7 +26122,6 @@ def cudaMemcpy2DAsync(dst, size_t dpitch, src, size_t spitch, size_t width, size cdef cyruntime.cudaMemcpyKind cykind = kind.value with nogil: err = cyruntime.cudaMemcpy2DAsync(cydst_ptr, dpitch, cysrc_ptr, spitch, width, height, cykind, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -26118,7 +26210,6 @@ def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t sp cdef cyruntime.cudaMemcpyKind cykind = kind.value with nogil: err = cyruntime.cudaMemcpy2DToArrayAsync(cydst, wOffset, hOffset, cysrc_ptr, spitch, width, height, cykind, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -26206,7 +26297,6 @@ def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t h cdef cyruntime.cudaMemcpyKind cykind = kind.value with nogil: err = cyruntime.cudaMemcpy2DFromArrayAsync(cydst_ptr, dpitch, cysrc, wOffset, hOffset, width, height, cykind, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -26244,7 +26334,6 @@ def cudaMemset(devPtr, int value, size_t count): cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: err = cyruntime.cudaMemset(cydevPtr_ptr, value, count) - return (_dict_cudaError_t[err],) {{endif}} @@ -26287,7 +26376,8 @@ def cudaMemset2D(devPtr, size_t pitch, int value, size_t width, size_t height): """ cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaMemset2D(cydevPtr_ptr, pitch, value, width, height) + with nogil: + err = cyruntime.cudaMemset2D(cydevPtr_ptr, pitch, value, width, height) return (_dict_cudaError_t[err],) {{endif}} @@ -26340,7 +26430,8 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not -------- :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent` """ - err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0]) + with nogil: + err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0]) return (_dict_cudaError_t[err],) {{endif}} @@ -26394,7 +26485,6 @@ def cudaMemsetAsync(devPtr, int value, size_t count, stream): cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: err = cyruntime.cudaMemsetAsync(cydevPtr_ptr, value, count, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -26453,7 +26543,8 @@ def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t heig cystream = pstream cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = 
cydevPtr.cptr - err = cyruntime.cudaMemset2DAsync(cydevPtr_ptr, pitch, value, width, height, cystream) + with nogil: + err = cyruntime.cudaMemset2DAsync(cydevPtr_ptr, pitch, value, width, height, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -26522,7 +26613,8 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent else: pstream = int(cudaStream_t(stream)) cystream = pstream - err = cyruntime.cudaMemset3DAsync(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0], cystream) + with nogil: + err = cyruntime.cudaMemset3DAsync(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0], cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -26640,7 +26732,6 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: err = cyruntime.cudaMemPrefetchAsync(cydevPtr_ptr, count, location._pvt_ptr[0], flags, cystream) - return (_dict_cudaError_t[err],) {{endif}} @@ -26730,21 +26821,25 @@ def cudaMemPrefetchBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]], sizes : dptrs = [] if dptrs is None else dptrs pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cydptrs_ptr = voidStarHelperdptrs.cptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) cdef cyruntime.cudaMemLocation* cyprefetchLocs = NULL - if len(prefetchLocs) > 0: + if len(prefetchLocs) > 1: cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cyruntime.cudaMemLocation)) if cyprefetchLocs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cyruntime.cudaMemLocation))) for idx in range(len(prefetchLocs)): string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cyruntime.cudaMemLocation)) + elif len(prefetchLocs) == 1: + cyprefetchLocs = (prefetchLocs[0])._pvt_ptr cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) - err = cyruntime.cudaMemPrefetchBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) - if cyprefetchLocs is not NULL: + with nogil: + err = cyruntime.cudaMemPrefetchBatchAsync(cydptrs_ptr, cysizes.data(), count, cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) + if len(prefetchLocs) > 1 and cyprefetchLocs is not NULL: free(cyprefetchLocs) return (_dict_cudaError_t[err],) {{endif}} @@ -26814,10 +26909,12 @@ def cudaMemDiscardBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]], sizes : T dptrs = [] if dptrs is None else dptrs pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cydptrs_ptr = voidStarHelperdptrs.cptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise 
RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) - err = cyruntime.cudaMemDiscardBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, flags, cystream) + with nogil: + err = cyruntime.cudaMemDiscardBatchAsync(cydptrs_ptr, cysizes.data(), count, flags, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -26915,21 +27012,25 @@ def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[Tuple[Any] | List[Any]] dptrs = [] if dptrs is None else dptrs pylist = [utils.HelperInputVoidPtr(pydptrs) for pydptrs in dptrs] cdef utils.InputVoidPtrPtrHelper voidStarHelperdptrs = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cydptrs_ptr = voidStarHelperdptrs.cptr cdef vector[size_t] cysizes = sizes if count > len(dptrs): raise RuntimeError("List is too small: " + str(len(dptrs)) + " < " + str(count)) if count > len(sizes): raise RuntimeError("List is too small: " + str(len(sizes)) + " < " + str(count)) cdef cyruntime.cudaMemLocation* cyprefetchLocs = NULL - if len(prefetchLocs) > 0: + if len(prefetchLocs) > 1: cyprefetchLocs = calloc(len(prefetchLocs), sizeof(cyruntime.cudaMemLocation)) if cyprefetchLocs is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(prefetchLocs)) + 'x' + str(sizeof(cyruntime.cudaMemLocation))) for idx in range(len(prefetchLocs)): string.memcpy(&cyprefetchLocs[idx], (prefetchLocs[idx])._pvt_ptr, sizeof(cyruntime.cudaMemLocation)) + elif len(prefetchLocs) == 1: + cyprefetchLocs = (prefetchLocs[0])._pvt_ptr cdef vector[size_t] cyprefetchLocIdxs = prefetchLocIdxs if numPrefetchLocs > len(prefetchLocs): raise RuntimeError("List is too small: " + str(len(prefetchLocs)) + " < " + str(numPrefetchLocs)) if numPrefetchLocs > len(prefetchLocIdxs): raise RuntimeError("List is too small: " + str(len(prefetchLocIdxs)) + " < " + str(numPrefetchLocs)) - err = cyruntime.cudaMemDiscardAndPrefetchBatchAsync(voidStarHelperdptrs.cptr, cysizes.data(), count, (prefetchLocs[0])._pvt_ptr if len(prefetchLocs) == 1 else cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) - if cyprefetchLocs is not NULL: + with nogil: + err = cyruntime.cudaMemDiscardAndPrefetchBatchAsync(cydptrs_ptr, cysizes.data(), count, cyprefetchLocs, cyprefetchLocIdxs.data(), numPrefetchLocs, flags, cystream) + if len(prefetchLocs) > 1 and cyprefetchLocs is not NULL: free(cyprefetchLocs) return (_dict_cudaError_t[err],) {{endif}} @@ -27128,7 +27229,6 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca cdef cyruntime.cudaMemoryAdvise cyadvice = advice.value with nogil: err = cyruntime.cudaMemAdvise(cydevPtr_ptr, count, cyadvice, location._pvt_ptr[0]) - return (_dict_cudaError_t[err],) {{endif}} @@ -27275,7 +27375,8 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA cdef cyruntime.cudaMemRangeAttribute cyattribute = attribute.value cydevPtr = utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr_ptr, count) + with nogil: + err = cyruntime.cudaMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr_ptr, count) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cydata.pyObj()) @@ -27355,7 +27456,8 @@ def cudaMemRangeGetAttributes(dataSizes : Tuple[int] | List[int], attributes : O if numAttributes > len(attributes): raise RuntimeError("List is too small: " + str(len(attributes)) + " < " + str(numAttributes)) cydevPtr = 
utils.HelperInputVoidPtr(devPtr) cdef void* cydevPtr_ptr = cydevPtr.cptr - err = cyruntime.cudaMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr_ptr, count) + with nogil: + err = cyruntime.cudaMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr_ptr, count) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], [obj.pyObj() for obj in pylist]) @@ -27415,7 +27517,8 @@ def cudaMemcpyToArray(dst, size_t wOffset, size_t hOffset, src, size_t count, ki cysrc = utils.HelperInputVoidPtr(src) cdef void* cysrc_ptr = cysrc.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpyToArray(cydst, wOffset, hOffset, cysrc_ptr, count, cykind) + with nogil: + err = cyruntime.cudaMemcpyToArray(cydst, wOffset, hOffset, cysrc_ptr, count, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -27473,7 +27576,8 @@ def cudaMemcpyFromArray(dst, src, size_t wOffset, size_t hOffset, size_t count, cydst = utils.HelperInputVoidPtr(dst) cdef void* cydst_ptr = cydst.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpyFromArray(cydst_ptr, cysrc, wOffset, hOffset, count, cykind) + with nogil: + err = cyruntime.cudaMemcpyFromArray(cydst_ptr, cysrc, wOffset, hOffset, count, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -27542,7 +27646,8 @@ def cudaMemcpyArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, size_ pdst = int(cudaArray_t(dst)) cydst = pdst cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpyArrayToArray(cydst, wOffsetDst, hOffsetDst, cysrc, wOffsetSrc, hOffsetSrc, count, cykind) + with nogil: + err = cyruntime.cudaMemcpyArrayToArray(cydst, wOffsetDst, hOffsetDst, cysrc, wOffsetSrc, hOffsetSrc, count, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -27617,7 +27722,8 @@ def cudaMemcpyToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t coun cysrc = utils.HelperInputVoidPtr(src) cdef void* cysrc_ptr = cysrc.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpyToArrayAsync(cydst, wOffset, hOffset, cysrc_ptr, count, cykind, cystream) + with nogil: + err = cyruntime.cudaMemcpyToArrayAsync(cydst, wOffset, hOffset, cysrc_ptr, count, cykind, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -27692,7 +27798,8 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co cydst = utils.HelperInputVoidPtr(dst) cdef void* cydst_ptr = cydst.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaMemcpyFromArrayAsync(cydst_ptr, cysrc, wOffset, hOffset, count, cykind, cystream) + with nogil: + err = cyruntime.cudaMemcpyFromArrayAsync(cydst_ptr, cysrc, wOffset, hOffset, count, cykind, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -27746,7 +27853,6 @@ def cudaMallocAsync(size_t size, hStream): cdef void_ptr devPtr = 0 with nogil: err = cyruntime.cudaMallocAsync(&devPtr, size, cyhStream) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], devPtr) @@ -27795,7 +27901,6 @@ def cudaFreeAsync(devPtr, hStream): cdef void* cydevPtr_ptr = cydevPtr.cptr with nogil: err = cyruntime.cudaFreeAsync(cydevPtr_ptr, cyhStream) - return (_dict_cudaError_t[err],) {{endif}} @@ -27846,7 +27951,6 @@ def cudaMemPoolTrimTo(memPool, size_t minBytesToKeep): cymemPool = pmemPool with nogil: err = cyruntime.cudaMemPoolTrimTo(cymemPool, 
minBytesToKeep) - return (_dict_cudaError_t[err],) {{endif}} @@ -27923,7 +28027,6 @@ def cudaMemPoolSetAttribute(memPool, attr not None : cudaMemPoolAttr, value): cdef void* cyvalue_ptr = cyvalue.cptr with nogil: err = cyruntime.cudaMemPoolSetAttribute(cymemPool, cyattr, cyvalue_ptr) - return (_dict_cudaError_t[err],) {{endif}} @@ -28006,7 +28109,6 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr): cdef void* cyvalue_ptr = cyvalue.cptr with nogil: err = cyruntime.cudaMemPoolGetAttribute(cymemPool, cyattr, cyvalue_ptr) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cyvalue.pyObj()) @@ -28049,15 +28151,18 @@ def cudaMemPoolSetAccess(memPool, descList : Optional[Tuple[cudaMemAccessDesc] | pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef cyruntime.cudaMemAccessDesc* cydescList = NULL - if len(descList) > 0: + if len(descList) > 1: cydescList = calloc(len(descList), sizeof(cyruntime.cudaMemAccessDesc)) if cydescList is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(descList)) + 'x' + str(sizeof(cyruntime.cudaMemAccessDesc))) for idx in range(len(descList)): string.memcpy(&cydescList[idx], (descList[idx])._pvt_ptr, sizeof(cyruntime.cudaMemAccessDesc)) + elif len(descList) == 1: + cydescList = (descList[0])._pvt_ptr if count > len(descList): raise RuntimeError("List is too small: " + str(len(descList)) + " < " + str(count)) - err = cyruntime.cudaMemPoolSetAccess(cymemPool, (descList[0])._pvt_ptr if len(descList) == 1 else cydescList, count) - if cydescList is not NULL: + with nogil: + err = cyruntime.cudaMemPoolSetAccess(cymemPool, cydescList, count) + if len(descList) > 1 and cydescList is not NULL: free(cydescList) return (_dict_cudaError_t[err],) {{endif}} @@ -28099,7 +28204,8 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]): cymemPool = pmemPool cdef cyruntime.cudaMemAccessFlags flags cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL - err = cyruntime.cudaMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) + with nogil: + err = cyruntime.cudaMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaMemAccessFlags(flags)) @@ -28182,7 +28288,8 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]): """ cdef cudaMemPool_t memPool = cudaMemPool_t() cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL - err = cyruntime.cudaMemPoolCreate(memPool._pvt_ptr, cypoolProps_ptr) + with nogil: + err = cyruntime.cudaMemPoolCreate(memPool._pvt_ptr, cypoolProps_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -28229,7 +28336,8 @@ def cudaMemPoolDestroy(memPool): else: pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool - err = cyruntime.cudaMemPoolDestroy(cymemPool) + with nogil: + err = cyruntime.cudaMemPoolDestroy(cymemPool) return (_dict_cudaError_t[err],) {{endif}} @@ -28271,7 +28379,8 @@ def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not cdef cudaMemPool_t memPool = cudaMemPool_t() cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value - err = cyruntime.cudaMemGetDefaultMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) + with nogil: + 
err = cyruntime.cudaMemGetDefaultMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -28324,7 +28433,8 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None : cdef cudaMemPool_t memPool = cudaMemPool_t() cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value - err = cyruntime.cudaMemGetMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) + with nogil: + err = cyruntime.cudaMemGetMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -28392,7 +28502,8 @@ def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None : cymemPool = pmemPool cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value - err = cyruntime.cudaMemSetMemPool(cylocation_ptr, cytypename, cymemPool) + with nogil: + err = cyruntime.cudaMemSetMemPool(cylocation_ptr, cytypename, cymemPool) return (_dict_cudaError_t[err],) {{endif}} @@ -28448,7 +28559,8 @@ def cudaMallocFromPoolAsync(size_t size, memPool, stream): pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef void_ptr ptr = 0 - err = cyruntime.cudaMallocFromPoolAsync(&ptr, size, cymemPool, cystream) + with nogil: + err = cyruntime.cudaMallocFromPoolAsync(&ptr, size, cymemPool, cystream) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ptr) @@ -28504,7 +28616,8 @@ def cudaMemPoolExportToShareableHandle(memPool, handleType not None : cudaMemAll cdef utils.HelperCUmemAllocationHandleType cyshareableHandle = utils.HelperCUmemAllocationHandleType(handleType) cdef void* cyshareableHandle_ptr = cyshareableHandle.cptr cdef cyruntime.cudaMemAllocationHandleType cyhandleType = handleType.value - err = cyruntime.cudaMemPoolExportToShareableHandle(cyshareableHandle_ptr, cymemPool, cyhandleType, flags) + with nogil: + err = cyruntime.cudaMemPoolExportToShareableHandle(cyshareableHandle_ptr, cymemPool, cyhandleType, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cyshareableHandle.pyObj()) @@ -28547,7 +28660,8 @@ def cudaMemPoolImportFromShareableHandle(shareableHandle, handleType not None : cyshareableHandle = utils.HelperInputVoidPtr(shareableHandle) cdef void* cyshareableHandle_ptr = cyshareableHandle.cptr cdef cyruntime.cudaMemAllocationHandleType cyhandleType = handleType.value - err = cyruntime.cudaMemPoolImportFromShareableHandle(memPool._pvt_ptr, cyshareableHandle_ptr, cyhandleType, flags) + with nogil: + err = cyruntime.cudaMemPoolImportFromShareableHandle(memPool._pvt_ptr, cyshareableHandle_ptr, cyhandleType, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], memPool) @@ -28583,7 +28697,8 @@ def cudaMemPoolExportPointer(ptr): cdef cudaMemPoolPtrExportData exportData = cudaMemPoolPtrExportData() cyptr = utils.HelperInputVoidPtr(ptr) cdef void* cyptr_ptr = cyptr.cptr - err = cyruntime.cudaMemPoolExportPointer(exportData._pvt_ptr, cyptr_ptr) + with nogil: + err = cyruntime.cudaMemPoolExportPointer(exportData._pvt_ptr, cyptr_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return 
(_dict_cudaError_t[err], exportData) @@ -28635,7 +28750,8 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport cymemPool = pmemPool cdef void_ptr ptr = 0 cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = exportData._pvt_ptr if exportData != None else NULL - err = cyruntime.cudaMemPoolImportPointer(&ptr, cymemPool, cyexportData_ptr) + with nogil: + err = cyruntime.cudaMemPoolImportPointer(&ptr, cymemPool, cyexportData_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ptr) @@ -28706,7 +28822,8 @@ def cudaPointerGetAttributes(ptr): cdef cudaPointerAttributes attributes = cudaPointerAttributes() cyptr = utils.HelperInputVoidPtr(ptr) cdef void* cyptr_ptr = cyptr.cptr - err = cyruntime.cudaPointerGetAttributes(attributes._pvt_ptr, cyptr_ptr) + with nogil: + err = cyruntime.cudaPointerGetAttributes(attributes._pvt_ptr, cyptr_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], attributes) @@ -28744,7 +28861,8 @@ def cudaDeviceCanAccessPeer(int device, int peerDevice): :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer` """ cdef int canAccessPeer = 0 - err = cyruntime.cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice) + with nogil: + err = cyruntime.cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], canAccessPeer) @@ -28796,7 +28914,8 @@ def cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags): -------- :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cuCtxEnablePeerAccess` """ - err = cyruntime.cudaDeviceEnablePeerAccess(peerDevice, flags) + with nogil: + err = cyruntime.cudaDeviceEnablePeerAccess(peerDevice, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -28824,7 +28943,8 @@ def cudaDeviceDisablePeerAccess(int peerDevice): -------- :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess` """ - err = cyruntime.cudaDeviceDisablePeerAccess(peerDevice) + with nogil: + err = cyruntime.cudaDeviceDisablePeerAccess(peerDevice) return (_dict_cudaError_t[err],) {{endif}} @@ -28862,7 +28982,8 @@ def cudaGraphicsUnregisterResource(resource): else: presource = int(cudaGraphicsResource_t(resource)) cyresource = presource - err = cyruntime.cudaGraphicsUnregisterResource(cyresource) + with nogil: + err = cyruntime.cudaGraphicsUnregisterResource(cyresource) return (_dict_cudaError_t[err],) {{endif}} @@ -28917,7 +29038,8 @@ def cudaGraphicsResourceSetMapFlags(resource, unsigned int flags): else: presource = int(cudaGraphicsResource_t(resource)) cyresource = presource - err = cyruntime.cudaGraphicsResourceSetMapFlags(cyresource, flags) + with nogil: + err = cyruntime.cudaGraphicsResourceSetMapFlags(cyresource, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -28979,7 +29101,8 @@ def cudaGraphicsMapResources(int count, resources, stream): cyresources = resources else: raise TypeError("Argument 'resources' is not instance of type (expected , found " + str(type(resources))) - err = cyruntime.cudaGraphicsMapResources(count, cyresources, cystream) + with nogil: + err = cyruntime.cudaGraphicsMapResources(count, cyresources, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -29039,7 +29162,8 @@ def cudaGraphicsUnmapResources(int count, resources, stream): cyresources = 
resources else: raise TypeError("Argument 'resources' is not instance of type (expected , found " + str(type(resources))) - err = cyruntime.cudaGraphicsUnmapResources(count, cyresources, cystream) + with nogil: + err = cyruntime.cudaGraphicsUnmapResources(count, cyresources, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -29082,7 +29206,8 @@ def cudaGraphicsResourceGetMappedPointer(resource): cyresource = presource cdef void_ptr devPtr = 0 cdef size_t size = 0 - err = cyruntime.cudaGraphicsResourceGetMappedPointer(&devPtr, &size, cyresource) + with nogil: + err = cyruntime.cudaGraphicsResourceGetMappedPointer(&devPtr, &size, cyresource) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], devPtr, size) @@ -29138,7 +29263,8 @@ def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, uns presource = int(cudaGraphicsResource_t(resource)) cyresource = presource cdef cudaArray_t array = cudaArray_t() - err = cyruntime.cudaGraphicsSubResourceGetMappedArray(array._pvt_ptr, cyresource, arrayIndex, mipLevel) + with nogil: + err = cyruntime.cudaGraphicsSubResourceGetMappedArray(array._pvt_ptr, cyresource, arrayIndex, mipLevel) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], array) @@ -29183,7 +29309,8 @@ def cudaGraphicsResourceGetMappedMipmappedArray(resource): presource = int(cudaGraphicsResource_t(resource)) cyresource = presource cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t() - err = cyruntime.cudaGraphicsResourceGetMappedMipmappedArray(mipmappedArray._pvt_ptr, cyresource) + with nogil: + err = cyruntime.cudaGraphicsResourceGetMappedMipmappedArray(mipmappedArray._pvt_ptr, cyresource) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], mipmappedArray) @@ -29224,7 +29351,6 @@ def cudaGetChannelDesc(array): cdef cudaChannelFormatDesc desc = cudaChannelFormatDesc() with nogil: err = cyruntime.cudaGetChannelDesc(desc._pvt_ptr, cyarray) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], desc) @@ -29272,7 +29398,8 @@ def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFo cudaCreateChannelDesc (C++ API), :py:obj:`~.cudaGetChannelDesc`, :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cudaCreateSurfaceObject` """ cdef cyruntime.cudaChannelFormatKind cyf = f.value - err = cyruntime.cudaCreateChannelDesc(x, y, z, w, cyf) + with nogil: + err = cyruntime.cudaCreateChannelDesc(x, y, z, w, cyf) cdef cudaChannelFormatDesc wrapper = cudaChannelFormatDesc() wrapper._pvt_ptr[0] = err return (cudaError_t.cudaSuccess, wrapper) @@ -29516,7 +29643,8 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL - err = cyruntime.cudaCreateTextureObject(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) + with nogil: + err = cyruntime.cudaCreateTextureObject(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pTexObject) @@ -29554,7 +29682,6 @@ def 
cudaDestroyTextureObject(texObject): cytexObject = ptexObject with nogil: err = cyruntime.cudaDestroyTextureObject(cytexObject) - return (_dict_cudaError_t[err],) {{endif}} @@ -29594,7 +29721,6 @@ def cudaGetTextureObjectResourceDesc(texObject): cdef cudaResourceDesc pResDesc = cudaResourceDesc() with nogil: err = cyruntime.cudaGetTextureObjectResourceDesc(pResDesc._pvt_ptr, cytexObject) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pResDesc) @@ -29636,7 +29762,6 @@ def cudaGetTextureObjectTextureDesc(texObject): cdef cudaTextureDesc pTexDesc = cudaTextureDesc() with nogil: err = cyruntime.cudaGetTextureObjectTextureDesc(pTexDesc._pvt_ptr, cytexObject) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pTexDesc) @@ -29677,7 +29802,8 @@ def cudaGetTextureObjectResourceViewDesc(texObject): ptexObject = int(cudaTextureObject_t(texObject)) cytexObject = ptexObject cdef cudaResourceViewDesc pResViewDesc = cudaResourceViewDesc() - err = cyruntime.cudaGetTextureObjectResourceViewDesc(pResViewDesc._pvt_ptr, cytexObject) + with nogil: + err = cyruntime.cudaGetTextureObjectResourceViewDesc(pResViewDesc._pvt_ptr, cytexObject) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pResViewDesc) @@ -29720,7 +29846,6 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]): cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL with nogil: err = cyruntime.cudaCreateSurfaceObject(pSurfObject._pvt_ptr, cypResDesc_ptr) - if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pSurfObject) @@ -29758,7 +29883,6 @@ def cudaDestroySurfaceObject(surfObject): cysurfObject = psurfObject with nogil: err = cyruntime.cudaDestroySurfaceObject(cysurfObject) - return (_dict_cudaError_t[err],) {{endif}} @@ -29793,7 +29917,8 @@ def cudaGetSurfaceObjectResourceDesc(surfObject): psurfObject = int(cudaSurfaceObject_t(surfObject)) cysurfObject = psurfObject cdef cudaResourceDesc pResDesc = cudaResourceDesc() - err = cyruntime.cudaGetSurfaceObjectResourceDesc(pResDesc._pvt_ptr, cysurfObject) + with nogil: + err = cyruntime.cudaGetSurfaceObjectResourceDesc(pResDesc._pvt_ptr, cysurfObject) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pResDesc) @@ -29825,7 +29950,8 @@ def cudaDriverGetVersion(): :py:obj:`~.cudaRuntimeGetVersion`, :py:obj:`~.cuDriverGetVersion` """ cdef int driverVersion = 0 - err = cyruntime.cudaDriverGetVersion(&driverVersion) + with nogil: + err = cyruntime.cudaDriverGetVersion(&driverVersion) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], driverVersion) @@ -29860,7 +29986,8 @@ def cudaRuntimeGetVersion(): :py:obj:`~.cudaDriverGetVersion`, :py:obj:`~.cuDriverGetVersion` """ cdef int runtimeVersion = 0 - err = cyruntime.cudaRuntimeGetVersion(&runtimeVersion) + with nogil: + err = cyruntime.cudaRuntimeGetVersion(&runtimeVersion) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], runtimeVersion) @@ -29901,7 +30028,6 @@ def cudaLogsRegisterCallback(callbackFunc, userData): cdef cudaLogsCallbackHandle callback_out = cudaLogsCallbackHandle() with nogil: err = cyruntime.cudaLogsRegisterCallback(cycallbackFunc, cyuserData_ptr, callback_out._pvt_ptr) - if err != cyruntime.cudaSuccess: return 
(_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], callback_out) @@ -29931,7 +30057,8 @@ def cudaLogsUnregisterCallback(callback): else: pcallback = int(cudaLogsCallbackHandle(callback)) cycallback = pcallback - err = cyruntime.cudaLogsUnregisterCallback(cycallback) + with nogil: + err = cyruntime.cudaLogsUnregisterCallback(cycallback) return (_dict_cudaError_t[err],) {{endif}} @@ -29954,7 +30081,8 @@ def cudaLogsCurrent(unsigned int flags): Location to store an iterator to the current tail of the logs """ cdef cudaLogIterator iterator_out = cudaLogIterator() - err = cyruntime.cudaLogsCurrent(iterator_out._pvt_ptr, flags) + with nogil: + err = cyruntime.cudaLogsCurrent(iterator_out._pvt_ptr, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], iterator_out) @@ -29994,7 +30122,11 @@ def cudaLogsDumpToFile(iterator : Optional[cudaLogIterator], char* pathToFile, u The driver reserves limited memory for storing logs. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination output if the logs have been truncated. Call dump after each failed API to mitigate this risk. """ - err = cyruntime.cudaLogsDumpToFile(iterator._pvt_ptr if iterator != None else NULL, pathToFile, flags) + cdef cyruntime.cudaLogIterator* cyiterator = NULL + if iterator is not None: + cyiterator = iterator._pvt_ptr + with nogil: + err = cyruntime.cudaLogsDumpToFile(cyiterator, pathToFile, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], iterator) @@ -30048,7 +30180,11 @@ def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, siz If the provided value in `*size` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in `buffer` and writes it out. The final message in `buffer` will always be the most recent log message as of when the API is called.
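A minimal usage sketch (illustrative only, not part of the generated binding: the import path, the 4096-byte capacity, the bytes-based buffer handling, and the meaning of the returned size are assumptions layered on the signatures above): take an iterator at the current log tail, dump into a caller-provided buffer, and slice off what the driver reports back.

    from cuda.bindings import runtime

    err, it = runtime.cudaLogsCurrent(0)   # iterator at the current log tail
    buf = bytes(4096)                      # example capacity only
    err, it, written = runtime.cudaLogsDumpToMemory(it, buf, 4096, 0)
    if err == runtime.cudaError_t.cudaSuccess:
        print(buf[:written].decode(errors="replace"))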
""" - err = cyruntime.cudaLogsDumpToMemory(iterator._pvt_ptr if iterator != None else NULL, buffer, &size, flags) + cdef cyruntime.cudaLogIterator* cyiterator = NULL + if iterator is not None: + cyiterator = iterator._pvt_ptr + with nogil: + err = cyruntime.cudaLogsDumpToMemory(cyiterator, buffer, &size, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], iterator, size) @@ -30079,7 +30215,8 @@ def cudaGraphCreate(unsigned int flags): :py:obj:`~.cudaGraphAddChildGraphNode`, :py:obj:`~.cudaGraphAddEmptyNode`, :py:obj:`~.cudaGraphAddKernelNode`, :py:obj:`~.cudaGraphAddHostNode`, :py:obj:`~.cudaGraphAddMemcpyNode`, :py:obj:`~.cudaGraphAddMemsetNode`, :py:obj:`~.cudaGraphInstantiate`, :py:obj:`~.cudaGraphDestroy`, :py:obj:`~.cudaGraphGetNodes`, :py:obj:`~.cudaGraphGetRootNodes`, :py:obj:`~.cudaGraphGetEdges`, :py:obj:`~.cudaGraphClone` """ cdef cudaGraph_t pGraph = cudaGraph_t() - err = cyruntime.cudaGraphCreate(pGraph._pvt_ptr, flags) + with nogil: + err = cyruntime.cudaGraphCreate(pGraph._pvt_ptr, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraph) @@ -30187,17 +30324,20 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphAddKernelNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cypNodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddKernelNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -30246,7 +30386,8 @@ def cudaGraphKernelNodeGetParams(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaKernelNodeParams pNodeParams = cudaKernelNodeParams() - err = cyruntime.cudaGraphKernelNodeGetParams(cynode, pNodeParams._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphKernelNodeGetParams(cynode, pNodeParams._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pNodeParams) @@ -30285,7 +30426,8 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphKernelNodeSetParams(cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphKernelNodeSetParams(cynode, 
cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -30331,7 +30473,8 @@ def cudaGraphKernelNodeCopyAttributes(hDst, hSrc): else: phDst = int(cudaGraphNode_t(hDst)) cyhDst = phDst - err = cyruntime.cudaGraphKernelNodeCopyAttributes(cyhDst, cyhSrc) + with nogil: + err = cyruntime.cudaGraphKernelNodeCopyAttributes(cyhDst, cyhSrc) return (_dict_cudaError_t[err],) {{endif}} @@ -30372,7 +30515,8 @@ def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID) cyhNode = phNode cdef cyruntime.cudaKernelNodeAttrID cyattr = attr.value cdef cudaKernelNodeAttrValue value_out = cudaKernelNodeAttrValue() - err = cyruntime.cudaGraphKernelNodeGetAttribute(cyhNode, cyattr, value_out._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphKernelNodeGetAttribute(cyhNode, cyattr, value_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], value_out) @@ -30415,7 +30559,8 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID, cyhNode = phNode cdef cyruntime.cudaKernelNodeAttrID cyattr = attr.value cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL - err = cyruntime.cudaGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) + with nogil: + err = cyruntime.cudaGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -30475,17 +30620,20 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams != None else NULL - err = cyruntime.cudaGraphAddMemcpyNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cypCopyParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddMemcpyNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypCopyParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -30564,20 +30712,23 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[Tuple[cudaGraphNode cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) 
== 1: + cypDependencies = (pDependencies[0])._pvt_ptr cydst = utils.HelperInputVoidPtr(dst) cdef void* cydst_ptr = cydst.cptr cysrc = utils.HelperInputVoidPtr(src) cdef void* cysrc_ptr = cysrc.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaGraphAddMemcpyNode1D(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cydst_ptr, cysrc_ptr, count, cykind) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddMemcpyNode1D(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydst_ptr, cysrc_ptr, count, cykind) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -30617,7 +30768,8 @@ def cudaGraphMemcpyNodeGetParams(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaMemcpy3DParms pNodeParams = cudaMemcpy3DParms() - err = cyruntime.cudaGraphMemcpyNodeGetParams(cynode, pNodeParams._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphMemcpyNodeGetParams(cynode, pNodeParams._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pNodeParams) @@ -30656,7 +30808,8 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms] pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphMemcpyNodeSetParams(cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphMemcpyNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -30717,7 +30870,8 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : cysrc = utils.HelperInputVoidPtr(src) cdef void* cysrc_ptr = cysrc.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaGraphMemcpyNodeSetParams1D(cynode, cydst_ptr, cysrc_ptr, count, cykind) + with nogil: + err = cyruntime.cudaGraphMemcpyNodeSetParams1D(cynode, cydst_ptr, cysrc_ptr, count, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -30771,17 +30925,20 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams != None else NULL - err = cyruntime.cudaGraphAddMemsetNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cypMemsetParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddMemsetNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypMemsetParams_ptr) + if len(pDependencies) > 1 and 
cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -30821,7 +30978,8 @@ def cudaGraphMemsetNodeGetParams(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaMemsetParams pNodeParams = cudaMemsetParams() - err = cyruntime.cudaGraphMemsetNodeGetParams(cynode, pNodeParams._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphMemsetNodeGetParams(cynode, pNodeParams._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pNodeParams) @@ -30860,7 +31018,8 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams]) pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphMemsetNodeSetParams(cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphMemsetNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -30915,17 +31074,20 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphAddHostNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cypNodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddHostNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -30965,7 +31127,8 @@ def cudaGraphHostNodeGetParams(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaHostNodeParams pNodeParams = cudaHostNodeParams() - err = cyruntime.cudaGraphHostNodeGetParams(cynode, pNodeParams._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphHostNodeGetParams(cynode, pNodeParams._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pNodeParams) @@ -31004,7 +31167,8 @@ def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams]) pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphHostNodeSetParams(cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphHostNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -31070,16 +31234,19 @@ def cudaGraphAddChildGraphNode(graph, pDependencies : Optional[Tuple[cudaGraphNo cygraph = pgraph cdef 
cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphAddChildGraphNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cychildGraph) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddChildGraphNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cychildGraph) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31124,7 +31291,8 @@ def cudaGraphChildGraphNodeGetGraph(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaGraph_t pGraph = cudaGraph_t() - err = cyruntime.cudaGraphChildGraphNodeGetGraph(cynode, pGraph._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphChildGraphNodeGetGraph(cynode, pGraph._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraph) @@ -31182,16 +31350,19 @@ def cudaGraphAddEmptyNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphAddEmptyNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddEmptyNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31259,16 +31430,19 @@ def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[Tuple[cudaGraphN cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in 
range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphAddEventRecordNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cyevent) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddEventRecordNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cyevent) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31308,7 +31482,8 @@ def cudaGraphEventRecordNodeGetEvent(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaEvent_t event_out = cudaEvent_t() - err = cyruntime.cudaGraphEventRecordNodeGetEvent(cynode, event_out._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphEventRecordNodeGetEvent(cynode, event_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], event_out) @@ -31354,7 +31529,8 @@ def cudaGraphEventRecordNodeSetEvent(node, event): else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - err = cyruntime.cudaGraphEventRecordNodeSetEvent(cynode, cyevent) + with nogil: + err = cyruntime.cudaGraphEventRecordNodeSetEvent(cynode, cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -31422,16 +31598,19 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[Tuple[cudaGraphNod cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - err = cyruntime.cudaGraphAddEventWaitNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cyevent) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddEventWaitNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cyevent) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31471,7 +31650,8 @@ def cudaGraphEventWaitNodeGetEvent(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaEvent_t event_out = cudaEvent_t() - err = cyruntime.cudaGraphEventWaitNodeGetEvent(cynode, event_out._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphEventWaitNodeGetEvent(cynode, event_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], event_out) @@ -31517,7 +31697,8 @@ def cudaGraphEventWaitNodeSetEvent(node, event): else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - err = cyruntime.cudaGraphEventWaitNodeSetEvent(cynode, 
cyevent) + with nogil: + err = cyruntime.cudaGraphEventWaitNodeSetEvent(cynode, cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -31573,17 +31754,20 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[Tup cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddExternalSemaphoresSignalNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cynodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddExternalSemaphoresSignalNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31629,7 +31813,8 @@ def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode): phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cudaExternalSemaphoreSignalNodeParams params_out = cudaExternalSemaphoreSignalNodeParams() - err = cyruntime.cudaGraphExternalSemaphoresSignalNodeGetParams(cyhNode, params_out._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphExternalSemaphoresSignalNodeGetParams(cyhNode, params_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], params_out) @@ -31669,7 +31854,8 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[ phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -31725,17 +31911,20 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[Tuple cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + 
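# numDependencies must not exceed the number of node wrappers actually supplied;
# note the marshaling pattern above: the wrapper now heap-allocates (calloc/free)
# only when more than one dependency is passed, and aliases the single node's
# _pvt_ptr directly in the one-element case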
" < " + str(numDependencies)) cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddExternalSemaphoresWaitNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cynodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddExternalSemaphoresWaitNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31781,7 +31970,8 @@ def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode): phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cudaExternalSemaphoreWaitNodeParams params_out = cudaExternalSemaphoreWaitNodeParams() - err = cyruntime.cudaGraphExternalSemaphoresWaitNodeGetParams(cyhNode, params_out._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphExternalSemaphoresWaitNodeGetParams(cyhNode, params_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], params_out) @@ -31821,7 +32011,8 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -31916,17 +32107,20 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[Tuple[cudaGraphNode cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddMemAllocNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cynodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddMemAllocNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -31969,7 +32163,8 @@ def cudaGraphMemAllocNodeGetParams(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cudaMemAllocNodeParams params_out = cudaMemAllocNodeParams() - err = cyruntime.cudaGraphMemAllocNodeGetParams(cynode, params_out._pvt_ptr) + with nogil: + err = 
cyruntime.cudaGraphMemAllocNodeGetParams(cynode, params_out._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], params_out) @@ -32044,18 +32239,21 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_ cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: + cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) cydptr = utils.HelperInputVoidPtr(dptr) cdef void* cydptr_ptr = cydptr.cptr - err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, numDependencies, cydptr_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -32096,7 +32294,8 @@ def cudaGraphMemFreeNodeGetParams(node): cynode = pnode cdef void_ptr dptr_out = 0 cdef void* cydptr_out_ptr = &dptr_out - err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr) + with nogil: + err = cyruntime.cudaGraphMemFreeNodeGetParams(cynode, cydptr_out_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], dptr_out) @@ -32126,7 +32325,8 @@ def cudaDeviceGraphMemTrim(int device): -------- :py:obj:`~.cudaGraphAddMemAllocNode`, :py:obj:`~.cudaGraphAddMemFreeNode`, :py:obj:`~.cudaDeviceGetGraphMemAttribute`, :py:obj:`~.cudaDeviceSetGraphMemAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync` """ - err = cyruntime.cudaDeviceGraphMemTrim(device) + with nogil: + err = cyruntime.cudaDeviceGraphMemTrim(device) return (_dict_cudaError_t[err],) {{endif}} @@ -32174,7 +32374,8 @@ def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri cdef cyruntime.cudaGraphMemAttributeType cyattr = attr.value cdef utils.HelperCUgraphMem_attribute cyvalue = utils.HelperCUgraphMem_attribute(attr, 0, is_getter=True) cdef void* cyvalue_ptr = cyvalue.cptr - err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr) + with nogil: + err = cyruntime.cudaDeviceGetGraphMemAttribute(device, cyattr, cyvalue_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cyvalue.pyObj()) @@ -32217,7 +32418,8 @@ def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri cdef cyruntime.cudaGraphMemAttributeType cyattr = attr.value cdef utils.HelperCUgraphMem_attribute cyvalue = utils.HelperCUgraphMem_attribute(attr, value, is_getter=False) cdef void* cyvalue_ptr = cyvalue.cptr - err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, cyvalue_ptr) + with nogil: + err = cyruntime.cudaDeviceSetGraphMemAttribute(device, cyattr, 
cyvalue_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -32264,7 +32466,8 @@ def cudaGraphClone(originalGraph): poriginalGraph = int(cudaGraph_t(originalGraph)) cyoriginalGraph = poriginalGraph cdef cudaGraph_t pGraphClone = cudaGraph_t() - err = cyruntime.cudaGraphClone(pGraphClone._pvt_ptr, cyoriginalGraph) + with nogil: + err = cyruntime.cudaGraphClone(pGraphClone._pvt_ptr, cyoriginalGraph) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraphClone) @@ -32320,7 +32523,8 @@ def cudaGraphNodeFindInClone(originalNode, clonedGraph): poriginalNode = int(cudaGraphNode_t(originalNode)) cyoriginalNode = poriginalNode cdef cudaGraphNode_t pNode = cudaGraphNode_t() - err = cyruntime.cudaGraphNodeFindInClone(pNode._pvt_ptr, cyoriginalNode, cyclonedGraph) + with nogil: + err = cyruntime.cudaGraphNodeFindInClone(pNode._pvt_ptr, cyoriginalNode, cyclonedGraph) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pNode) @@ -32359,7 +32563,8 @@ def cudaGraphNodeGetType(node): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaGraphNodeType pType - err = cyruntime.cudaGraphNodeGetType(cynode, &pType) + with nogil: + err = cyruntime.cudaGraphNodeGetType(cynode, &pType) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], cudaGraphNodeType(pType)) @@ -32413,7 +32618,8 @@ def cudaGraphGetNodes(graph, size_t numNodes = 0): cynodes = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) if cynodes is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - err = cyruntime.cudaGraphGetNodes(cygraph, cynodes, &numNodes) + with nogil: + err = cyruntime.cudaGraphGetNodes(cygraph, cynodes, &numNodes) if cudaError_t(err) == cudaError_t(0): pynodes = [cudaGraphNode_t(init_value=cynodes[idx]) for idx in range(_graph_length)] if cynodes is not NULL: @@ -32471,7 +32677,8 @@ def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0): cypRootNodes = calloc(_graph_length, sizeof(cyruntime.cudaGraphNode_t)) if cypRootNodes is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) - err = cyruntime.cudaGraphGetRootNodes(cygraph, cypRootNodes, &pNumRootNodes) + with nogil: + err = cyruntime.cudaGraphGetRootNodes(cygraph, cypRootNodes, &pNumRootNodes) if cudaError_t(err) == cudaError_t(0): pypRootNodes = [cudaGraphNode_t(init_value=cypRootNodes[idx]) for idx in range(_graph_length)] if cypRootNodes is not NULL: @@ -32552,7 +32759,8 @@ def cudaGraphGetEdges(graph, size_t numEdges = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphGetEdges(cygraph, cyfrom_, cyto, cyedgeData, &numEdges) + with nogil: + err = cyruntime.cudaGraphGetEdges(cygraph, cyfrom_, cyto, cyedgeData, &numEdges) if cudaError_t(err) == cudaError_t(0): pyfrom_ = [cudaGraphNode_t(init_value=cyfrom_[idx]) for idx in range(_graph_length)] if cyfrom_ is not NULL: @@ -32632,7 +32840,8 @@ def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + 
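# allocation of _graph_length cudaGraphEdgeData entries failed; raising here
# aborts before the runtime is ever called with a partially built buffer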
str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphNodeGetDependencies(cynode, cypDependencies, cyedgeData, &pNumDependencies) + with nogil: + err = cyruntime.cudaGraphNodeGetDependencies(cynode, cypDependencies, cyedgeData, &pNumDependencies) if cudaError_t(err) == cudaError_t(0): pypDependencies = [cudaGraphNode_t(init_value=cypDependencies[idx]) for idx in range(_graph_length)] if cypDependencies is not NULL: @@ -32708,7 +32917,8 @@ def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0): cyedgeData = calloc(_graph_length, sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(_graph_length) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) - err = cyruntime.cudaGraphNodeGetDependentNodes(cynode, cypDependentNodes, cyedgeData, &pNumDependentNodes) + with nogil: + err = cyruntime.cudaGraphNodeGetDependentNodes(cynode, cypDependentNodes, cyedgeData, &pNumDependentNodes) if cudaError_t(err) == cudaError_t(0): pypDependentNodes = [cudaGraphNode_t(init_value=cypDependentNodes[idx]) for idx in range(_graph_length)] if cypDependentNodes is not NULL: @@ -32776,34 +32986,41 @@ def cudaGraphAddDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | Li pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cyruntime.cudaGraphNode_t* cyfrom_ = NULL - if len(from_) > 0: + if len(from_) > 1: cyfrom_ = calloc(len(from_), sizeof(cyruntime.cudaGraphNode_t)) if cyfrom_ is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(from_)): cyfrom_[idx] = (from_[idx])._pvt_ptr[0] + elif len(from_) == 1: + cyfrom_ = (from_[0])._pvt_ptr cdef cyruntime.cudaGraphNode_t* cyto = NULL - if len(to) > 0: + if len(to) > 1: cyto = calloc(len(to), sizeof(cyruntime.cudaGraphNode_t)) if cyto is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(to)): cyto[idx] = (to[idx])._pvt_ptr[0] + elif len(to) == 1: + cyto = (to[0])._pvt_ptr cdef cyruntime.cudaGraphEdgeData* cyedgeData = NULL - if len(edgeData) > 0: + if len(edgeData) > 1: cyedgeData = calloc(len(edgeData), sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaGraphAddDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) - if cyfrom_ is not NULL: + elif len(edgeData) == 1: + cyedgeData = (edgeData[0])._pvt_ptr + with nogil: + err = cyruntime.cudaGraphAddDependencies(cygraph, cyfrom_, cyto, cyedgeData, numDependencies) + if len(from_) > 1 and cyfrom_ is not NULL: free(cyfrom_) - if cyto is not NULL: + if len(to) > 1 and cyto is not NULL: free(cyto) - if cyedgeData is not NULL: + if len(edgeData) > 1 and cyedgeData is not NULL: free(cyedgeData) return (_dict_cudaError_t[err],) {{endif}} @@ -32865,34 +33082,41 @@ def cudaGraphRemoveDependencies(graph, from_ : Optional[Tuple[cudaGraphNode_t] | pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cyruntime.cudaGraphNode_t* cyfrom_ = NULL - if 
len(from_) > 0: + if len(from_) > 1: cyfrom_ = calloc(len(from_), sizeof(cyruntime.cudaGraphNode_t)) if cyfrom_ is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(from_)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(from_)): cyfrom_[idx] = (from_[idx])._pvt_ptr[0] + elif len(from_) == 1: + cyfrom_ = (from_[0])._pvt_ptr cdef cyruntime.cudaGraphNode_t* cyto = NULL - if len(to) > 0: + if len(to) > 1: cyto = calloc(len(to), sizeof(cyruntime.cudaGraphNode_t)) if cyto is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(to)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(to)): cyto[idx] = (to[idx])._pvt_ptr[0] + elif len(to) == 1: + cyto = (to[0])._pvt_ptr cdef cyruntime.cudaGraphEdgeData* cyedgeData = NULL - if len(edgeData) > 0: + if len(edgeData) > 1: cyedgeData = calloc(len(edgeData), sizeof(cyruntime.cudaGraphEdgeData)) if cyedgeData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(edgeData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(edgeData)): string.memcpy(&cyedgeData[idx], (edgeData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) - err = cyruntime.cudaGraphRemoveDependencies(cygraph, (from_[0])._pvt_ptr if len(from_) == 1 else cyfrom_, (to[0])._pvt_ptr if len(to) == 1 else cyto, (edgeData[0])._pvt_ptr if len(edgeData) == 1 else cyedgeData, numDependencies) - if cyfrom_ is not NULL: + elif len(edgeData) == 1: + cyedgeData = (edgeData[0])._pvt_ptr + with nogil: + err = cyruntime.cudaGraphRemoveDependencies(cygraph, cyfrom_, cyto, cyedgeData, numDependencies) + if len(from_) > 1 and cyfrom_ is not NULL: free(cyfrom_) - if cyto is not NULL: + if len(to) > 1 and cyto is not NULL: free(cyto) - if cyedgeData is not NULL: + if len(edgeData) > 1 and cyedgeData is not NULL: free(cyedgeData) return (_dict_cudaError_t[err],) {{endif}} @@ -32931,7 +33155,8 @@ def cudaGraphDestroyNode(node): else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - err = cyruntime.cudaGraphDestroyNode(cynode) + with nogil: + err = cyruntime.cudaGraphDestroyNode(cynode) return (_dict_cudaError_t[err],) {{endif}} @@ -33032,7 +33257,8 @@ def cudaGraphInstantiate(graph, unsigned long long flags): pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() - err = cyruntime.cudaGraphInstantiate(pGraphExec._pvt_ptr, cygraph, flags) + with nogil: + err = cyruntime.cudaGraphInstantiate(pGraphExec._pvt_ptr, cygraph, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraphExec) @@ -33137,7 +33363,8 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags): pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() - err = cyruntime.cudaGraphInstantiateWithFlags(pGraphExec._pvt_ptr, cygraph, flags) + with nogil: + err = cyruntime.cudaGraphInstantiateWithFlags(pGraphExec._pvt_ptr, cygraph, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraphExec) @@ -33283,7 +33510,8 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph cygraph = pgraph cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL - err = cyruntime.cudaGraphInstantiateWithParams(pGraphExec._pvt_ptr, cygraph, 
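# cyinstantiateParams_ptr is instantiateParams._pvt_ptr, or NULL when
# instantiateParams is None, as declared just above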
cyinstantiateParams_ptr) + with nogil: + err = cyruntime.cudaGraphInstantiateWithParams(pGraphExec._pvt_ptr, cygraph, cyinstantiateParams_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pGraphExec) @@ -33325,7 +33553,8 @@ def cudaGraphExecGetFlags(graphExec): pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec cdef unsigned long long flags = 0 - err = cyruntime.cudaGraphExecGetFlags(cygraphExec, &flags) + with nogil: + err = cyruntime.cudaGraphExecGetFlags(cygraphExec, &flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], flags) @@ -33408,7 +33637,8 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphExecKernelNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecKernelNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -33472,7 +33702,8 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphExecMemcpyNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecMemcpyNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -33544,7 +33775,8 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, cysrc = utils.HelperInputVoidPtr(src) cdef void* cysrc_ptr = cysrc.cptr cdef cyruntime.cudaMemcpyKind cykind = kind.value - err = cyruntime.cudaGraphExecMemcpyNodeSetParams1D(cyhGraphExec, cynode, cydst_ptr, cysrc_ptr, count, cykind) + with nogil: + err = cyruntime.cudaGraphExecMemcpyNodeSetParams1D(cyhGraphExec, cynode, cydst_ptr, cysrc_ptr, count, cykind) return (_dict_cudaError_t[err],) {{endif}} @@ -33613,7 +33845,8 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphExecMemsetNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecMemsetNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -33667,7 +33900,8 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL - err = cyruntime.cudaGraphExecHostNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecHostNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -33736,7 +33970,8 @@ def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph): else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - err = cyruntime.cudaGraphExecChildGraphNodeSetParams(cyhGraphExec, cynode, 
cychildGraph) + with nogil: + err = cyruntime.cudaGraphExecChildGraphNodeSetParams(cyhGraphExec, cynode, cychildGraph) return (_dict_cudaError_t[err],) {{endif}} @@ -33798,7 +34033,8 @@ def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event): else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - err = cyruntime.cudaGraphExecEventRecordNodeSetEvent(cyhGraphExec, cyhNode, cyevent) + with nogil: + err = cyruntime.cudaGraphExecEventRecordNodeSetEvent(cyhGraphExec, cyhNode, cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -33860,7 +34096,8 @@ def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event): else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - err = cyruntime.cudaGraphExecEventWaitNodeSetEvent(cyhGraphExec, cyhNode, cyevent) + with nogil: + err = cyruntime.cudaGraphExecEventWaitNodeSetEvent(cyhGraphExec, cyhNode, cyevent) return (_dict_cudaError_t[err],) {{endif}} @@ -33919,7 +34156,8 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -33978,7 +34216,8 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -34040,7 +34279,8 @@ def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled): else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - err = cyruntime.cudaGraphNodeSetEnabled(cyhGraphExec, cyhNode, isEnabled) + with nogil: + err = cyruntime.cudaGraphNodeSetEnabled(cyhGraphExec, cyhNode, isEnabled) return (_dict_cudaError_t[err],) {{endif}} @@ -34096,7 +34336,8 @@ def cudaGraphNodeGetEnabled(hGraphExec, hNode): phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef unsigned int isEnabled = 0 - err = cyruntime.cudaGraphNodeGetEnabled(cyhGraphExec, cyhNode, &isEnabled) + with nogil: + err = cyruntime.cudaGraphNodeGetEnabled(cyhGraphExec, cyhNode, &isEnabled) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], isEnabled) @@ -34271,7 +34512,8 @@ def cudaGraphExecUpdate(hGraphExec, hGraph): phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec cdef cudaGraphExecUpdateResultInfo resultInfo = cudaGraphExecUpdateResultInfo() - err = cyruntime.cudaGraphExecUpdate(cyhGraphExec, cyhGraph, resultInfo._pvt_ptr) + with nogil: + err = cyruntime.cudaGraphExecUpdate(cyhGraphExec, cyhGraph, resultInfo._pvt_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], resultInfo) @@ -34321,7 +34563,8 @@ def cudaGraphUpload(graphExec, stream): else: 
pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec - err = cyruntime.cudaGraphUpload(cygraphExec, cystream) + with nogil: + err = cyruntime.cudaGraphUpload(cygraphExec, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -34374,7 +34617,8 @@ def cudaGraphLaunch(graphExec, stream): else: pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec - err = cyruntime.cudaGraphLaunch(cygraphExec, cystream) + with nogil: + err = cyruntime.cudaGraphLaunch(cygraphExec, cystream) return (_dict_cudaError_t[err],) {{endif}} @@ -34408,7 +34652,8 @@ def cudaGraphExecDestroy(graphExec): else: pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec - err = cyruntime.cudaGraphExecDestroy(cygraphExec) + with nogil: + err = cyruntime.cudaGraphExecDestroy(cygraphExec) return (_dict_cudaError_t[err],) {{endif}} @@ -34442,7 +34687,8 @@ def cudaGraphDestroy(graph): else: pgraph = int(cudaGraph_t(graph)) cygraph = pgraph - err = cyruntime.cudaGraphDestroy(cygraph) + with nogil: + err = cyruntime.cudaGraphDestroy(cygraph) return (_dict_cudaError_t[err],) {{endif}} @@ -34481,7 +34727,8 @@ def cudaGraphDebugDotPrint(graph, char* path, unsigned int flags): else: pgraph = int(cudaGraph_t(graph)) cygraph = pgraph - err = cyruntime.cudaGraphDebugDotPrint(cygraph, path, flags) + with nogil: + err = cyruntime.cudaGraphDebugDotPrint(cygraph, path, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -34540,7 +34787,8 @@ def cudaUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned in cdef cudaUserObject_t object_out = cudaUserObject_t() cyptr = utils.HelperInputVoidPtr(ptr) cdef void* cyptr_ptr = cyptr.cptr - err = cyruntime.cudaUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) + with nogil: + err = cyruntime.cudaUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], object_out) @@ -34583,7 +34831,8 @@ def cudaUserObjectRetain(object, unsigned int count): else: pobject = int(cudaUserObject_t(object)) cyobject = pobject - err = cyruntime.cudaUserObjectRetain(cyobject, count) + with nogil: + err = cyruntime.cudaUserObjectRetain(cyobject, count) return (_dict_cudaError_t[err],) {{endif}} @@ -34627,7 +34876,8 @@ def cudaUserObjectRelease(object, unsigned int count): else: pobject = int(cudaUserObject_t(object)) cyobject = pobject - err = cyruntime.cudaUserObjectRelease(cyobject, count) + with nogil: + err = cyruntime.cudaUserObjectRelease(cyobject, count) return (_dict_cudaError_t[err],) {{endif}} @@ -34682,7 +34932,8 @@ def cudaGraphRetainUserObject(graph, object, unsigned int count, unsigned int fl else: pgraph = int(cudaGraph_t(graph)) cygraph = pgraph - err = cyruntime.cudaGraphRetainUserObject(cygraph, cyobject, count, flags) + with nogil: + err = cyruntime.cudaGraphRetainUserObject(cygraph, cyobject, count, flags) return (_dict_cudaError_t[err],) {{endif}} @@ -34732,7 +34983,8 @@ def cudaGraphReleaseUserObject(graph, object, unsigned int count): else: pgraph = int(cudaGraph_t(graph)) cygraph = pgraph - err = cyruntime.cudaGraphReleaseUserObject(cygraph, cyobject, count) + with nogil: + err = cyruntime.cudaGraphReleaseUserObject(cygraph, cyobject, count) return (_dict_cudaError_t[err],) {{endif}} @@ -34802,25 +35054,30 @@ def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | Li cygraph = pgraph cdef cudaGraphNode_t pGraphNode = cudaGraphNode_t() cdef 
cyruntime.cudaGraphNode_t* cypDependencies = NULL - if len(pDependencies) > 0: + if len(pDependencies) > 1: cypDependencies = <cyruntime.cudaGraphNode_t*>calloc(len(pDependencies), sizeof(cyruntime.cudaGraphNode_t)) if cypDependencies is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(pDependencies)) + 'x' + str(sizeof(cyruntime.cudaGraphNode_t))) else: for idx in range(len(pDependencies)): cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] + elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr cdef cyruntime.cudaGraphEdgeData* cydependencyData = NULL - if len(dependencyData) > 0: + if len(dependencyData) > 1: cydependencyData = <cyruntime.cudaGraphEdgeData*>calloc(len(dependencyData), sizeof(cyruntime.cudaGraphEdgeData)) if cydependencyData is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(len(dependencyData)) + 'x' + str(sizeof(cyruntime.cudaGraphEdgeData))) for idx in range(len(dependencyData)): string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) + elif len(dependencyData) == 1: cydependencyData = (dependencyData[0])._pvt_ptr cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, (pDependencies[0])._pvt_ptr if len(pDependencies) == 1 else cypDependencies, (dependencyData[0])._pvt_ptr if len(dependencyData) == 1 else cydependencyData, numDependencies, cynodeParams_ptr) - if cypDependencies is not NULL: + with nogil: + err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, cydependencyData, numDependencies, cynodeParams_ptr) + if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) - if cydependencyData is not NULL: + if len(dependencyData) > 1 and cydependencyData is not NULL: free(cydependencyData) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) @@ -34866,7 +35123,8 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]): pnode = int(cudaGraphNode_t(node)) cynode = pnode cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphNodeSetParams(cynode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphNodeSetParams(cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -34925,7 +35183,8 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL - err = cyruntime.cudaGraphExecNodeSetParams(cygraphExec, cynode, cynodeParams_ptr) + with nogil: + err = cyruntime.cudaGraphExecNodeSetParams(cygraphExec, cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) {{endif}} @@ -34974,7 +35233,8 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cudaGraphConditionalHandle pHandle_out = cudaGraphConditionalHandle() - err = cyruntime.cudaGraphConditionalHandleCreate(pHandle_out._pvt_ptr, cygraph, defaultLaunchValue, flags) + with nogil: + err = cyruntime.cudaGraphConditionalHandleCreate(pHandle_out._pvt_ptr, cygraph, defaultLaunchValue, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pHandle_out) @@ -35077,7 +35337,8 @@ def cudaGetDriverEntryPoint(char* symbol, 
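For context on the dependency-marshalling change in the cudaGraphAddNode hunk above: the condition moves from > 0 to > 1 so that a heap copy of the dependency array is made only when more than one node is passed; a single dependency now borrows the wrapper's own storage, and the free() calls are guarded to match. A stripped-down sketch of the same idea, with hypothetical Node and launch_with_deps names standing in for the generated types:

    from libc.stdlib cimport calloc, free

    ctypedef void* NodeHandle  # stand-in for cyruntime.cudaGraphNode_t

    cdef class Node:
        cdef NodeHandle _pvt_ptr[1]

    def launch_with_deps(deps):
        cdef NodeHandle* cdeps = NULL
        cdef size_t n = len(deps)
        if n > 1:
            # More than one dependency: copy the handles into a fresh C array.
            cdeps = <NodeHandle*>calloc(n, sizeof(NodeHandle))
            if cdeps is NULL:
                raise MemoryError()
            for i in range(n):
                cdeps[i] = (<Node>deps[i])._pvt_ptr[0]
        elif n == 1:
            # Exactly one dependency: borrow its storage, no allocation needed.
            cdeps = (<Node>deps[0])._pvt_ptr
        try:
            pass  # the C API would be called here with (cdeps, n)
        finally:
            # Only free what was actually allocated.
            if n > 1 and cdeps is not NULL:
                free(cdeps)

The payoff is that the common single-dependency case no longer pays for a calloc/free round trip, while the call site itself becomes a plain pointer argument that is safe to pass inside the nogil block.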
unsigned long long flags): """ cdef void_ptr funcPtr = 0 cdef cyruntime.cudaDriverEntryPointQueryResult driverStatus - err = cyruntime.cudaGetDriverEntryPoint(symbol, &funcPtr, flags, &driverStatus) + with nogil: + err = cyruntime.cudaGetDriverEntryPoint(symbol, &funcPtr, flags, &driverStatus) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], funcPtr, cudaDriverEntryPointQueryResult(driverStatus)) @@ -35184,7 +35445,8 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns """ cdef void_ptr funcPtr = 0 cdef cyruntime.cudaDriverEntryPointQueryResult driverStatus - err = cyruntime.cudaGetDriverEntryPointByVersion(symbol, &funcPtr, cudaVersion, flags, &driverStatus) + with nogil: + err = cyruntime.cudaGetDriverEntryPointByVersion(symbol, &funcPtr, cudaVersion, flags, &driverStatus) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], funcPtr, cudaDriverEntryPointQueryResult(driverStatus)) @@ -35273,14 +35535,17 @@ def cudaLibraryLoadData(code, jitOptions : Optional[Tuple[cudaJitOption] | List[ cdef vector[cyruntime.cudaJitOption] cyjitOptions = [pyjitOptions.value for pyjitOptions in (jitOptions)] pylist = [utils.HelperCudaJitOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cyjitOptionsValues_ptr = voidStarHelperjitOptionsValues.cptr if numJitOptions > len(jitOptions): raise RuntimeError("List is too small: " + str(len(jitOptions)) + " < " + str(numJitOptions)) if numJitOptions > len(jitOptionsValues): raise RuntimeError("List is too small: " + str(len(jitOptionsValues)) + " < " + str(numJitOptions)) cdef vector[cyruntime.cudaLibraryOption] cylibraryOptions = [pylibraryOptions.value for pylibraryOptions in (libraryOptions)] pylist = [utils.HelperCudaLibraryOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(libraryOptions, libraryOptionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperlibraryOptionValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cylibraryOptionValues_ptr = voidStarHelperlibraryOptionValues.cptr if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " < " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) - err = cyruntime.cudaLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), voidStarHelperjitOptionsValues.cptr, numJitOptions, cylibraryOptions.data(), voidStarHelperlibraryOptionValues.cptr, numLibraryOptions) + with nogil: + err = cyruntime.cudaLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], library) @@ -35367,14 +35632,17 @@ def cudaLibraryLoadFromFile(char* fileName, jitOptions : Optional[Tuple[cudaJitO cdef vector[cyruntime.cudaJitOption] cyjitOptions = [pyjitOptions.value for pyjitOptions in (jitOptions)] pylist = [utils.HelperCudaJitOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = 
utils.InputVoidPtrPtrHelper(pylist) + cdef void** cyjitOptionsValues_ptr = voidStarHelperjitOptionsValues.cptr if numJitOptions > len(jitOptions): raise RuntimeError("List is too small: " + str(len(jitOptions)) + " < " + str(numJitOptions)) if numJitOptions > len(jitOptionsValues): raise RuntimeError("List is too small: " + str(len(jitOptionsValues)) + " < " + str(numJitOptions)) cdef vector[cyruntime.cudaLibraryOption] cylibraryOptions = [pylibraryOptions.value for pylibraryOptions in (libraryOptions)] pylist = [utils.HelperCudaLibraryOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(libraryOptions, libraryOptionValues)] cdef utils.InputVoidPtrPtrHelper voidStarHelperlibraryOptionValues = utils.InputVoidPtrPtrHelper(pylist) + cdef void** cylibraryOptionValues_ptr = voidStarHelperlibraryOptionValues.cptr if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " < " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) - err = cyruntime.cudaLibraryLoadFromFile(library._pvt_ptr, fileName, cyjitOptions.data(), voidStarHelperjitOptionsValues.cptr, numJitOptions, cylibraryOptions.data(), voidStarHelperlibraryOptionValues.cptr, numLibraryOptions) + with nogil: + err = cyruntime.cudaLibraryLoadFromFile(library._pvt_ptr, fileName, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], library) @@ -35410,7 +35678,8 @@ def cudaLibraryUnload(library): else: plibrary = int(cudaLibrary_t(library)) cylibrary = plibrary - err = cyruntime.cudaLibraryUnload(cylibrary) + with nogil: + err = cyruntime.cudaLibraryUnload(cylibrary) return (_dict_cudaError_t[err],) {{endif}} @@ -35451,7 +35720,8 @@ def cudaLibraryGetKernel(library, char* name): plibrary = int(cudaLibrary_t(library)) cylibrary = plibrary cdef cudaKernel_t pKernel = cudaKernel_t() - err = cyruntime.cudaLibraryGetKernel(pKernel._pvt_ptr, cylibrary, name) + with nogil: + err = cyruntime.cudaLibraryGetKernel(pKernel._pvt_ptr, cylibrary, name) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pKernel) @@ -35502,7 +35772,8 @@ def cudaLibraryGetGlobal(library, char* name): cylibrary = plibrary cdef void_ptr dptr = 0 cdef size_t numbytes = 0 - err = cyruntime.cudaLibraryGetGlobal(&dptr, &numbytes, cylibrary, name) + with nogil: + err = cyruntime.cudaLibraryGetGlobal(&dptr, &numbytes, cylibrary, name) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], dptr, numbytes) @@ -35555,7 +35826,8 @@ def cudaLibraryGetManaged(library, char* name): cylibrary = plibrary cdef void_ptr dptr = 0 cdef size_t numbytes = 0 - err = cyruntime.cudaLibraryGetManaged(&dptr, &numbytes, cylibrary, name) + with nogil: + err = cyruntime.cudaLibraryGetManaged(&dptr, &numbytes, cylibrary, name) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None, None) return (_dict_cudaError_t[err], dptr, numbytes) @@ -35600,7 +35872,8 @@ def cudaLibraryGetUnifiedFunction(library, char* symbol): plibrary = int(cudaLibrary_t(library)) cylibrary = plibrary cdef void_ptr fptr = 0 - err = cyruntime.cudaLibraryGetUnifiedFunction(&fptr, cylibrary, symbol) + with nogil: + err = 
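The added cdef void** cyjitOptionsValues_ptr = ... lines in the two library-loading hunks above hoist the helper object's cptr attribute into a plain C local before the GIL is dropped, so the with nogil: call site only touches C values. Roughly, with a hypothetical PtrHelper and consume in place of the generated helpers and runtime call:

    cdef extern from *:
        """
        static int consume(void** p) { (void)p; return 0; }
        """
        int consume(void** p) nogil

    cdef class PtrHelper:
        cdef void** cptr  # populated elsewhere while holding the GIL

    def call_with_helper(PtrHelper helper):
        # Read the raw pointer out of the helper while the GIL is held...
        cdef void** values = helper.cptr
        cdef int err
        with nogil:
            # ...and hand only the C local to the no-GIL call.
            err = consume(values)
        return err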
cyruntime.cudaLibraryGetUnifiedFunction(&fptr, cylibrary, symbol) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], fptr) @@ -35639,7 +35912,8 @@ def cudaLibraryGetKernelCount(lib): plib = int(cudaLibrary_t(lib)) cylib = plib cdef unsigned int count = 0 - err = cyruntime.cudaLibraryGetKernelCount(&count, cylib) + with nogil: + err = cyruntime.cudaLibraryGetKernelCount(&count, cylib) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], count) @@ -35687,7 +35961,8 @@ def cudaLibraryEnumerateKernels(unsigned int numKernels, lib): cykernels = <cyruntime.cudaKernel_t*>calloc(numKernels, sizeof(cyruntime.cudaKernel_t)) if cykernels is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(numKernels) + 'x' + str(sizeof(cyruntime.cudaKernel_t))) - err = cyruntime.cudaLibraryEnumerateKernels(cykernels, numKernels, cylib) + with nogil: + err = cyruntime.cudaLibraryEnumerateKernels(cykernels, numKernels, cylib) if cudaError_t(err) == cudaError_t(0): pykernels = [cudaKernel_t(init_value=cykernels[idx]) for idx in range(numKernels)] if cykernels is not NULL: @@ -35798,7 +36073,8 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i pkernel = int(cudaKernel_t(kernel)) cykernel = pkernel cdef cyruntime.cudaFuncAttribute cyattr = attr.value - err = cyruntime.cudaKernelSetAttributeForDevice(cykernel, cyattr, value, device) + with nogil: + err = cyruntime.cudaKernelSetAttributeForDevice(cykernel, cyattr, value, device) return (_dict_cudaError_t[err],) {{endif}} @@ -35809,7 +36085,8 @@ def cudaGetExportTable(pExportTableId : Optional[cudaUUID_t]): """""" cdef void_ptr ppExportTable = 0 cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL - err = cyruntime.cudaGetExportTable(&ppExportTable, cypExportTableId_ptr) + with nogil: + err = cyruntime.cudaGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], ppExportTable) @@ -35852,7 +36129,8 @@ def cudaGetKernel(entryFuncAddr): cdef cudaKernel_t kernelPtr = cudaKernel_t() cyentryFuncAddr = utils.HelperInputVoidPtr(entryFuncAddr) cdef void* cyentryFuncAddr_ptr = cyentryFuncAddr.cptr - err = cyruntime.cudaGetKernel(kernelPtr._pvt_ptr, cyentryFuncAddr_ptr) + with nogil: + err = cyruntime.cudaGetKernel(kernelPtr._pvt_ptr, cyentryFuncAddr_ptr) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], kernelPtr) @@ -35891,7 +36169,8 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz): """ cyd = utils.HelperInputVoidPtr(d) cdef void* cyd_ptr = cyd.cptr - err = cyruntime.make_cudaPitchedPtr(cyd_ptr, p, xsz, ysz) + with nogil: + err = cyruntime.make_cudaPitchedPtr(cyd_ptr, p, xsz, ysz) cdef cudaPitchedPtr wrapper = cudaPitchedPtr() wrapper._pvt_ptr[0] = err return wrapper @@ -35926,7 +36205,8 @@ def make_cudaPos(size_t x, size_t y, size_t z): -------- make_cudaExtent, make_cudaPitchedPtr """ - err = cyruntime.make_cudaPos(x, y, z) + with nogil: + err = cyruntime.make_cudaPos(x, y, z) cdef cudaPos wrapper = cudaPos() wrapper._pvt_ptr[0] = err return wrapper @@ -35962,7 +36242,8 @@ def make_cudaExtent(size_t w, size_t h, size_t d): -------- make_cudaPitchedPtr, make_cudaPos """ - err = cyruntime.make_cudaExtent(w, h, d) + with nogil: + err = cyruntime.make_cudaExtent(w, h, d) cdef cudaExtent wrapper = cudaExtent() wrapper._pvt_ptr[0] = 
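The cudaLibraryEnumerateKernels hunk above follows the generated code's usual output-array recipe: allocate a C scratch array sized by the caller's count, let the runtime fill it inside the nogil block, build the Python-level list only on success, then free the scratch array. The same shape in miniature, with a hypothetical fill_ints producer standing in for the runtime call:

    from libc.stdlib cimport calloc, free

    cdef extern from *:
        """
        static int fill_ints(int* out, unsigned n) {
            for (unsigned i = 0; i < n; ++i) out[i] = (int)i;
            return 0;  /* success */
        }
        """
        int fill_ints(int* out, unsigned n) nogil

    def enumerate_ints(unsigned n):
        cdef int* buf = <int*>calloc(n, sizeof(int))
        if buf is NULL:
            raise MemoryError()
        cdef int err
        with nogil:
            err = fill_ints(buf, n)
        # Convert to Python objects only after the no-GIL region ends.
        result = [buf[i] for i in range(n)] if err == 0 else None
        free(buf)
        return (err, result)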
err return wrapper @@ -36037,7 +36318,8 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags): pimage = int(EGLImageKHR(image)) cyimage = pimage cdef cudaGraphicsResource_t pCudaResource = cudaGraphicsResource_t() - err = cyruntime.cudaGraphicsEGLRegisterImage(pCudaResource._pvt_ptr, cyimage, flags) + with nogil: + err = cyruntime.cudaGraphicsEGLRegisterImage(pCudaResource._pvt_ptr, cyimage, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], pCudaResource) @@ -36079,7 +36361,8 @@ def cudaEGLStreamConsumerConnect(eglStream): peglStream = int(EGLStreamKHR(eglStream)) cyeglStream = peglStream cdef cudaEglStreamConnection conn = cudaEglStreamConnection() - err = cyruntime.cudaEGLStreamConsumerConnect(conn._pvt_ptr, cyeglStream) + with nogil: + err = cyruntime.cudaEGLStreamConsumerConnect(conn._pvt_ptr, cyeglStream) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], conn) @@ -36125,7 +36408,8 @@ def cudaEGLStreamConsumerConnectWithFlags(eglStream, unsigned int flags): peglStream = int(EGLStreamKHR(eglStream)) cyeglStream = peglStream cdef cudaEglStreamConnection conn = cudaEglStreamConnection() - err = cyruntime.cudaEGLStreamConsumerConnectWithFlags(conn._pvt_ptr, cyeglStream, flags) + with nogil: + err = cyruntime.cudaEGLStreamConsumerConnectWithFlags(conn._pvt_ptr, cyeglStream, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], conn) @@ -36163,7 +36447,8 @@ def cudaEGLStreamConsumerDisconnect(conn): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - err = cyruntime.cudaEGLStreamConsumerDisconnect(cyconn) + with nogil: + err = cyruntime.cudaEGLStreamConsumerDisconnect(cyconn) return (_dict_cudaError_t[err],) {{endif}} @@ -36228,7 +36513,8 @@ def cudaEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - err = cyruntime.cudaEGLStreamConsumerAcquireFrame(cyconn, cypCudaResource, cypStream, timeout) + with nogil: + err = cyruntime.cudaEGLStreamConsumerAcquireFrame(cyconn, cypCudaResource, cypStream, timeout) return (_dict_cudaError_t[err],) {{endif}} @@ -36287,7 +36573,8 @@ def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - err = cyruntime.cudaEGLStreamConsumerReleaseFrame(cyconn, cypCudaResource, cypStream) + with nogil: + err = cyruntime.cudaEGLStreamConsumerReleaseFrame(cyconn, cypCudaResource, cypStream) return (_dict_cudaError_t[err],) {{endif}} @@ -36347,7 +36634,8 @@ def cudaEGLStreamProducerConnect(eglStream, width, height): peglStream = int(EGLStreamKHR(eglStream)) cyeglStream = peglStream cdef cudaEglStreamConnection conn = cudaEglStreamConnection() - err = cyruntime.cudaEGLStreamProducerConnect(conn._pvt_ptr, cyeglStream, cywidth, cyheight) + with nogil: + err = cyruntime.cudaEGLStreamProducerConnect(conn._pvt_ptr, cyeglStream, cywidth, cyheight) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], conn) @@ -36385,7 +36673,8 @@ def cudaEGLStreamProducerDisconnect(conn): cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - err = 
cyruntime.cudaEGLStreamProducerDisconnect(cyconn) + with nogil: + err = cyruntime.cudaEGLStreamProducerDisconnect(cyconn) return (_dict_cudaError_t[err],) {{endif}} @@ -36444,7 +36733,8 @@ def cudaEGLStreamProducerPresentFrame(conn, eglframe not None : cudaEglFrame, pS cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - err = cyruntime.cudaEGLStreamProducerPresentFrame(cyconn, eglframe._pvt_ptr[0], cypStream) + with nogil: + err = cyruntime.cudaEGLStreamProducerPresentFrame(cyconn, eglframe._pvt_ptr[0], cypStream) return (_dict_cudaError_t[err],) {{endif}} @@ -36498,7 +36788,8 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL - err = cyruntime.cudaEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) + with nogil: + err = cyruntime.cudaEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_cudaError_t[err],) {{endif}} @@ -36549,7 +36840,8 @@ def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned presource = int(cudaGraphicsResource_t(resource)) cyresource = presource cdef cudaEglFrame eglFrame = cudaEglFrame() - err = cyruntime.cudaGraphicsResourceGetMappedEglFrame(eglFrame._pvt_ptr, cyresource, index, mipLevel) + with nogil: + err = cyruntime.cudaGraphicsResourceGetMappedEglFrame(eglFrame._pvt_ptr, cyresource, index, mipLevel) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], eglFrame) @@ -36604,7 +36896,8 @@ def cudaEventCreateFromEGLSync(eglSync, unsigned int flags): peglSync = int(EGLSyncKHR(eglSync)) cyeglSync = peglSync cdef cudaEvent_t phEvent = cudaEvent_t() - err = cyruntime.cudaEventCreateFromEGLSync(phEvent._pvt_ptr, cyeglSync, flags) + with nogil: + err = cyruntime.cudaEventCreateFromEGLSync(phEvent._pvt_ptr, cyeglSync, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], phEvent) @@ -36633,7 +36926,8 @@ def cudaProfilerStart(): -------- :py:obj:`~.cudaProfilerStop`, :py:obj:`~.cuProfilerStart` """ - err = cyruntime.cudaProfilerStart() + with nogil: + err = cyruntime.cudaProfilerStart() return (_dict_cudaError_t[err],) {{endif}} @@ -36660,7 +36954,8 @@ def cudaProfilerStop(): -------- :py:obj:`~.cudaProfilerStart`, :py:obj:`~.cuProfilerStop` """ - err = cyruntime.cudaProfilerStop() + with nogil: + err = cyruntime.cudaProfilerStop() return (_dict_cudaError_t[err],) {{endif}} @@ -36721,7 +37016,8 @@ def cudaGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : cudaGLD if cypCudaDevices is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(cudaDeviceCount) + 'x' + str(sizeof(int))) cdef cyruntime.cudaGLDeviceList cydeviceList = deviceList.value - err = cyruntime.cudaGLGetDevices(&pCudaDeviceCount, cypCudaDevices, cudaDeviceCount, cydeviceList) + with nogil: + err = cyruntime.cudaGLGetDevices(&pCudaDeviceCount, cypCudaDevices, cudaDeviceCount, cydeviceList) if cudaError_t(err) == cudaError_t(0): pypCudaDevices = [cypCudaDevices[idx] for idx in range(cudaDeviceCount)] if cypCudaDevices is not NULL: @@ -36824,7 +37120,8 @@ def cudaGraphicsGLRegisterImage(image, target, unsigned int flags): pimage = int(GLuint(image)) cyimage = pimage cdef cudaGraphicsResource_t resource = 
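The EGL entry points above all normalize their connection argument the same way: a typed wrapper is unwrapped directly, a plain int is treated as a raw handle value, and anything else raises a TypeError. A toy version of that coercion, with a hypothetical Handle wrapper in place of cudaEglStreamConnection:

    cdef class Handle:
        cdef unsigned long long _pvt_ptr[1]
        def __int__(self):
            return self._pvt_ptr[0]

    cdef unsigned long long coerce_handle(arg) except? 0:
        # Accept the typed wrapper directly, or a raw integer address,
        # mirroring the generated argument handling.
        if isinstance(arg, Handle):
            return (<Handle>arg)._pvt_ptr[0]
        elif isinstance(arg, int):
            return <unsigned long long>arg
        raise TypeError("expected Handle or int, found " + str(type(arg)))

    def use_handle(arg):
        cdef unsigned long long h = coerce_handle(arg)
        return h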
cudaGraphicsResource_t() - err = cyruntime.cudaGraphicsGLRegisterImage(resource._pvt_ptr, cyimage, cytarget, flags) + with nogil: + err = cyruntime.cudaGraphicsGLRegisterImage(resource._pvt_ptr, cyimage, cytarget, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], resource) @@ -36880,7 +37177,8 @@ def cudaGraphicsGLRegisterBuffer(buffer, unsigned int flags): pbuffer = int(GLuint(buffer)) cybuffer = pbuffer cdef cudaGraphicsResource_t resource = cudaGraphicsResource_t() - err = cyruntime.cudaGraphicsGLRegisterBuffer(resource._pvt_ptr, cybuffer, flags) + with nogil: + err = cyruntime.cudaGraphicsGLRegisterBuffer(resource._pvt_ptr, cybuffer, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], resource) @@ -36932,7 +37230,8 @@ def cudaVDPAUGetDevice(vdpDevice, vdpGetProcAddress): pvdpDevice = int(VdpDevice(vdpDevice)) cyvdpDevice = pvdpDevice cdef int device = 0 - err = cyruntime.cudaVDPAUGetDevice(&device, cyvdpDevice, cyvdpGetProcAddress) + with nogil: + err = cyruntime.cudaVDPAUGetDevice(&device, cyvdpDevice, cyvdpGetProcAddress) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], device) @@ -36992,7 +37291,8 @@ def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress): else: pvdpDevice = int(VdpDevice(vdpDevice)) cyvdpDevice = pvdpDevice - err = cyruntime.cudaVDPAUSetVDPAUDevice(device, cyvdpDevice, cyvdpGetProcAddress) + with nogil: + err = cyruntime.cudaVDPAUSetVDPAUDevice(device, cyvdpDevice, cyvdpGetProcAddress) return (_dict_cudaError_t[err],) {{endif}} @@ -37046,7 +37346,8 @@ def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags): pvdpSurface = int(VdpVideoSurface(vdpSurface)) cyvdpSurface = pvdpSurface cdef cudaGraphicsResource_t resource = cudaGraphicsResource_t() - err = cyruntime.cudaGraphicsVDPAURegisterVideoSurface(resource._pvt_ptr, cyvdpSurface, flags) + with nogil: + err = cyruntime.cudaGraphicsVDPAURegisterVideoSurface(resource._pvt_ptr, cyvdpSurface, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], resource) @@ -37102,7 +37403,8 @@ def cudaGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags): pvdpSurface = int(VdpOutputSurface(vdpSurface)) cyvdpSurface = pvdpSurface cdef cudaGraphicsResource_t resource = cudaGraphicsResource_t() - err = cyruntime.cudaGraphicsVDPAURegisterOutputSurface(resource._pvt_ptr, cyvdpSurface, flags) + with nogil: + err = cyruntime.cudaGraphicsVDPAURegisterOutputSurface(resource._pvt_ptr, cyvdpSurface, flags) if err != cyruntime.cudaSuccess: return (_dict_cudaError_t[err], None) return (_dict_cudaError_t[err], resource) diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 0fc7976b67..04e0390d12 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. 
SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE ------ diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index ed5230c0a4..079cd39aad 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE ----- diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index 22b3426a37..d155f85ebc 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE ------- From 8681e715b6901e2ecd1283d09e2569bc2e3990f2 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 21 Jul 2025 15:42:07 -0700 Subject: [PATCH 26/65] cython-gen changes due to release_gil_revert_leos_commits_fix_cast_error (#116) --- cuda_bindings/cuda/bindings/driver.pyx.in | 140 ++++++++++----------- cuda_bindings/cuda/bindings/runtime.pyx.in | 100 +++++++-------- 2 files changed, 120 insertions(+), 120 deletions(-) diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 26cac65c09..e6be2eec47 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -24512,7 +24512,7 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag pdev = int(CUdevice(dev)) cydev = pdev cdef CUcontext pctx = CUcontext() - cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL + cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL with nogil: err = cydriver.cuCtxCreate(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) if err != cydriver.CUDA_SUCCESS: @@ -29529,7 +29529,7 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, 
:py:obj:`~.cudaMemcpy2DFromArray` """ - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2D(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29656,7 +29656,7 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray` """ - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2DUnaligned(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29786,7 +29786,7 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy3D` """ - cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3D(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29816,7 +29816,7 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]): -------- :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeer` """ - cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + 
cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DPeer(cypCopy_ptr) return (_dict_CUresult[err],) @@ -30380,7 +30380,7 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -30520,7 +30520,7 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -30560,7 +30560,7 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DPeerAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -31516,7 +31516,7 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocArray` """ cdef CUarray pHandle = CUarray() - cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL + cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL with nogil: err = cydriver.cuArrayCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: @@ -32016,7 +32016,7 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]): :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, 
:py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc3DArray` """ cdef CUarray pHandle = CUarray() - cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL with nogil: err = cydriver.cuArray3DCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: @@ -32186,7 +32186,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaMallocMipmappedArray` """ cdef CUmipmappedArray pHandle = CUmipmappedArray() - cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc != None else NULL + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc != None else NULL with nogil: err = cydriver.cuMipmappedArrayCreate(pHandle._pvt_ptr, cypMipmappedArrayDesc_ptr, numMipmapLevels) if err != cydriver.CUDA_SUCCESS: @@ -32433,7 +32433,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz else: pstream = int(CUstream(stream)) cystream = pstream - cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray != None else NULL + cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray != None else NULL cdef size_t errorIndex = 0 with nogil: err = cydriver.cuMemBatchDecompressAsync(cyparamsArray_ptr, count, flags, &errorIndex, cystream) @@ -32619,7 +32619,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle` """ cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() - cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cydriver.cuMemCreate(handle._pvt_ptr, size, cyprop_ptr, flags) if err != cydriver.CUDA_SUCCESS: @@ -33082,7 +33082,7 @@ def cuMemGetAccess(location : Optional[CUmemLocation], ptr): pptr = int(CUdeviceptr(ptr)) cyptr = pptr cdef unsigned long long flags = 0 - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cydriver.cuMemGetAccess(&flags, cylocation_ptr, cyptr) if err != cydriver.CUDA_SUCCESS: @@ -33230,7 +33230,7 @@ def cuMemGetAllocationGranularity(prop : Optional[CUmemAllocationProp], option n :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap` """ cdef size_t granularity = 0 - cdef 
cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmemAllocationGranularity_flags cyoption = option.value with nogil: err = cydriver.cuMemGetAllocationGranularity(&granularity, cyprop_ptr, cyoption) @@ -33719,7 +33719,7 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]): pmemPool = int(CUmemoryPool(memPool)) cymemPool = pmemPool cdef cydriver.CUmemAccess_flags flags - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cydriver.cuMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cydriver.CUDA_SUCCESS: @@ -33803,7 +33803,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]): Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. """ cdef CUmemoryPool pool = CUmemoryPool() - cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL + cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL with nogil: err = cydriver.cuMemPoolCreate(pool._pvt_ptr, cypoolProps_ptr) if err != cydriver.CUDA_SUCCESS: @@ -33893,7 +33893,7 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, :py:obj:`~.cuMemPoolSetAccess`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate` """ cdef CUmemoryPool pool_out = CUmemoryPool() - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemGetDefaultMemPool(pool_out._pvt_ptr, cylocation_ptr, cytypename) @@ -33946,7 +33946,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` """ cdef CUmemoryPool pool = CUmemoryPool() - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemGetMemPool(pool._pvt_ptr, cylocation_ptr, cytypename) @@ -34014,7 +34014,7 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme else: ppool = int(CUmemoryPool(pool)) cypool = ppool - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemSetMemPool(cylocation_ptr, cytypename, cypool) @@ -34274,7 +34274,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]): ppool = int(CUmemoryPool(pool)) cypool = ppool cdef CUdeviceptr ptr_out = CUdeviceptr() - cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = shareData._pvt_ptr if shareData != None else NULL + cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = 
shareData._pvt_ptr if shareData != None else NULL with nogil: err = cydriver.cuMemPoolImportPointer(ptr_out._pvt_ptr, cypool, cyshareData_ptr) if err != cydriver.CUDA_SUCCESS: @@ -34335,7 +34335,7 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]): :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle` """ cdef CUmemGenericAllocationHandle mcHandle = CUmemGenericAllocationHandle() - cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cydriver.cuMulticastCreate(mcHandle._pvt_ptr, cyprop_ptr) if err != cydriver.CUDA_SUCCESS: @@ -34644,7 +34644,7 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr`, :py:obj:`~.cuMulticastUnbind` """ cdef size_t granularity = 0 - cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmulticastGranularity_flags cyoption = option.value with nogil: err = cydriver.cuMulticastGetGranularity(&granularity, cyprop_ptr, cyoption) @@ -37424,7 +37424,7 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUstreamAttrID cyattr = attr.value - cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cydriver.cuStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) @@ -37972,7 +37972,7 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ and Cache Control" chapter from Vulkan specification. 
""" cdef CUexternalMemory extMem_out = CUexternalMemory() - cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL with nogil: err = cydriver.cuImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38042,7 +38042,7 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_ pextMem = int(CUexternalMemory(extMem)) cyextMem = pextMem cdef CUdeviceptr devPtr = CUdeviceptr() - cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL with nogil: err = cydriver.cuExternalMemoryGetMappedBuffer(devPtr._pvt_ptr, cyextMem, cybufferDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38118,7 +38118,7 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E pextMem = int(CUexternalMemory(extMem)) cyextMem = pextMem cdef CUmipmappedArray mipmap = CUmipmappedArray() - cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL with nogil: err = cydriver.cuExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38307,7 +38307,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H :py:obj:`~.cuDestroyExternalSemaphore`, :py:obj:`~.cuSignalExternalSemaphoresAsync`, :py:obj:`~.cuWaitExternalSemaphoresAsync` """ cdef CUexternalSemaphore extSem_out = CUexternalSemaphore() - cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL with nogil: err = cydriver.cuImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -39859,7 +39859,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt else: pf = int(CUfunction(f)) cyf = pf - cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL cykernelParams = utils.HelperKernelParams(kernelParams) cdef void** cykernelParams_ptr = cykernelParams.ckernelParams with nogil: @@ -40966,7 +40966,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddKernelNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41057,7 +41057,7 @@ def 
cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphKernelNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41144,7 +41144,7 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL with nogil: err = cydriver.cuGraphAddMemcpyNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cycopyParams_ptr, cyctx) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41226,7 +41226,7 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphMemcpyNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41303,7 +41303,7 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL with nogil: err = cydriver.cuGraphAddMemsetNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cymemsetParams_ptr, cyctx) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41385,7 +41385,7 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphMemsetNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41452,7 +41452,7 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddHostNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, 
cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41534,7 +41534,7 @@ def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS] else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphHostNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42127,7 +42127,7 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[Tuple elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddExternalSemaphoresSignalNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42216,7 +42216,7 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42284,7 +42284,7 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[Tuple[C elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddExternalSemaphoresWaitNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42373,7 +42373,7 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42444,7 +42444,7 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != 
None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddBatchMemOpNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42534,7 +42534,7 @@ def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_O else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphBatchMemOpNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42605,7 +42605,7 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[ else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecBatchMemOpNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42713,7 +42713,7 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddMemAllocNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -44031,7 +44031,7 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef CUgraphExec phGraphExec = CUgraphExec() - cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL + cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL with nogil: err = cydriver.cuGraphInstantiateWithParams(phGraphExec._pvt_ptr, cyhGraph, cyinstantiateParams_ptr) if err != cydriver.CUDA_SUCCESS: @@ -44157,7 +44157,7 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecKernelNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44232,7 +44232,7 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None 
else NULL + cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL with nogil: err = cydriver.cuGraphExecMemcpyNodeSetParams(cyhGraphExec, cyhNode, cycopyParams_ptr, cyctx) return (_dict_CUresult[err],) @@ -44312,7 +44312,7 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL with nogil: err = cydriver.cuGraphExecMemsetNodeSetParams(cyhGraphExec, cyhNode, cymemsetParams_ptr, cyctx) return (_dict_CUresult[err],) @@ -44367,7 +44367,7 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecHostNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44623,7 +44623,7 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44683,7 +44683,7 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -45301,7 +45301,7 @@ def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUkernelNodeAttrID cyattr = attr.value - cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cydriver.cuGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) @@ -45687,7 +45687,7 @@ def cuGraphAddNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUg string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) elif len(dependencyData) == 1: cydependencyData = (dependencyData[0])._pvt_ptr - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, 
cydependencies, cydependencyData, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -45737,7 +45737,7 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -45797,7 +45797,7 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -46260,7 +46260,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]): pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int clusterSize = 0 - cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL with nogil: err = cydriver.cuOccupancyMaxPotentialClusterSize(&clusterSize, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: @@ -46320,7 +46320,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]): pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int numClusters = 0 - cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL with nogil: err = cydriver.cuOccupancyMaxActiveClusters(&numClusters, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: @@ -46584,7 +46584,7 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr, else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cydriver.cuTexRefSetAddress2D(cyhTexRef, cydesc_ptr, cydptr, Pitch) return (_dict_CUresult[err],) @@ -47955,9 +47955,9 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option :py:obj:`~.cuTexObjectDestroy`, :py:obj:`~.cudaCreateTextureObject` """ cdef CUtexObject pTexObject = CUtexObject() - cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL - cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL - cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL with nogil: err = cydriver.cuTexObjectCreate(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cydriver.CUDA_SUCCESS: 
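The hunks above all touch one generated lowering rule: every optional Python struct argument becomes a C pointer through `arg._pvt_ptr if arg != None else NULL`, so passing `None` from Python hands the driver a NULL pointer. A minimal caller-side sketch of that contract, using the `cuTexObjectCreate` signature shown above; the default-constructed descriptors are placeholders and would need to be populated before the call can succeed:

    from cuda.bindings import driver

    res_desc = driver.CUDA_RESOURCE_DESC()  # wrapper owning the underlying C struct
    tex_desc = driver.CUDA_TEXTURE_DESC()
    # ...populate res_desc and tex_desc for a real array or linear buffer...
    # pResViewDesc is optional: None is lowered to a NULL CUDA_RESOURCE_VIEW_DESC*
    # exactly as the generated code above shows.
    err, tex_obj = driver.cuTexObjectCreate(res_desc, tex_desc, None)
    assert err == driver.CUresult.CUDA_SUCCESS
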
@@ -48159,7 +48159,7 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]): :py:obj:`~.cuSurfObjectDestroy`, :py:obj:`~.cudaCreateSurfaceObject` """ cdef CUsurfObject pSurfObject = CUsurfObject() - cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL with nogil: err = cydriver.cuSurfObjectCreate(pSurfObject._pvt_ptr, cypResDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -49258,7 +49258,7 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress): -------- :py:obj:`~.cuTensorMapEncodeTiled`, :py:obj:`~.cuTensorMapEncodeIm2col`, :py:obj:`~.cuTensorMapEncodeIm2colWide` """ - cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap != None else NULL + cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap != None else NULL cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr with nogil: @@ -50576,7 +50576,7 @@ def cuGetExportTable(pExportTableId : Optional[CUuuid]): None """ cdef void_ptr ppExportTable = 0 - cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL + cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL with nogil: err = cydriver.cuGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cydriver.CUDA_SUCCESS: @@ -50999,7 +50999,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe if cyresult is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(nbGroups) + 'x' + str(sizeof(cydriver.CUdevResource))) cdef unsigned int cynbGroups = nbGroups - cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ != None else NULL + cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ != None else NULL cdef CUdevResource remaining = CUdevResource() with nogil: err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remaining._pvt_ptr, useFlags, minCount) @@ -51677,7 +51677,7 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]): CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` :py:obj:`~.CUDA_ERROR_NOT_READY` """ - cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessLock(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -51708,7 +51708,7 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ - cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessCheckpoint(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -51737,7 +51737,7 @@ def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]): CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` 
:py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ - cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessUnlock(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -52356,7 +52356,7 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL + cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL with nogil: err = cydriver.cuEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_CUresult[err],) diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 939e1dfcc1..4f4ef937dd 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -18626,7 +18626,7 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth` """ cdef size_t maxWidthInElements = 0 - cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc != None else NULL with nogil: err = cyruntime.cudaDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cyfmtDesc_ptr, device) if err != cyruntime.cudaSuccess: @@ -20051,7 +20051,7 @@ def cudaChooseDevice(prop : Optional[cudaDeviceProp]): :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaInitDevice` """ cdef int device = 0 - cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cyruntime.cudaChooseDevice(&device, cyprop_ptr) if err != cyruntime.cudaSuccess: @@ -20792,7 +20792,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef cyruntime.cudaStreamAttrID cyattr = attr.value - cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cyruntime.cudaStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) @@ -22291,7 +22291,7 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe and Cache Control" chapter from Vulkan specification. 
""" cdef cudaExternalMemory_t extMem_out = cudaExternalMemory_t() - cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL + cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL with nogil: err = cyruntime.cudaImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22359,7 +22359,7 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal pextMem = int(cudaExternalMemory_t(extMem)) cyextMem = pextMem cdef void_ptr devPtr = 0 - cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL + cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL with nogil: err = cyruntime.cudaExternalMemoryGetMappedBuffer(&devPtr, cyextMem, cybufferDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22431,7 +22431,7 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda pextMem = int(cudaExternalMemory_t(extMem)) cyextMem = pextMem cdef cudaMipmappedArray_t mipmap = cudaMipmappedArray_t() - cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL + cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL with nogil: err = cyruntime.cudaExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22618,7 +22618,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa :py:obj:`~.cudaDestroyExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` """ cdef cudaExternalSemaphore_t extSem_out = cudaExternalSemaphore_t() - cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL + cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL with nogil: err = cyruntime.cudaImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cyruntime.cudaSuccess: @@ -23803,7 +23803,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuArrayCreate` """ cdef cudaArray_t array = cudaArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMallocArray(array._pvt_ptr, cydesc_ptr, width, height, flags) if err != cyruntime.cudaSuccess: @@ -24462,7 +24462,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None : :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate` """ cdef cudaArray_t array = cudaArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = 
desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMalloc3DArray(array._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], flags) if err != cyruntime.cudaSuccess: @@ -24588,7 +24588,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate` """ cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMallocMipmappedArray(mipmappedArray._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], numLevels, flags) if err != cyruntime.cudaSuccess: @@ -24723,7 +24723,7 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]): -------- :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D` """ - cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3D(cyp_ptr) return (_dict_cudaError_t[err],) @@ -24760,7 +24760,7 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]): -------- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeer` """ - cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DPeer(cyp_ptr) return (_dict_cudaError_t[err],) @@ -24865,7 +24865,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DAsync(cyp_ptr, cystream) return (_dict_cudaError_t[err],) @@ -24905,7 +24905,7 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DPeerAsync(cyp_ptr, cystream) return (_dict_cudaError_t[err],) @@ -28203,7 +28203,7 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]): pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef 
cyruntime.cudaMemAccessFlags flags - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cyruntime.cudaMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cyruntime.cudaSuccess: @@ -28287,7 +28287,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]): Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC. """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL + cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL with nogil: err = cyruntime.cudaMemPoolCreate(memPool._pvt_ptr, cypoolProps_ptr) if err != cyruntime.cudaSuccess: @@ -28377,7 +28377,7 @@ def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, cuMemPoolSetAccess, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate` """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemGetDefaultMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) @@ -28431,7 +28431,7 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None : :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemGetMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) @@ -28500,7 +28500,7 @@ def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None : else: pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemSetMemPool(cylocation_ptr, cytypename, cymemPool) @@ -28749,7 +28749,7 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef void_ptr ptr = 0 - cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = exportData._pvt_ptr if exportData != None else NULL + cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = exportData._pvt_ptr if exportData != None else NULL with nogil: err = cyruntime.cudaMemPoolImportPointer(&ptr, cymemPool, cyexportData_ptr) if err != cyruntime.cudaSuccess: @@ -29640,9 +29640,9 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op :py:obj:`~.cudaDestroyTextureObject`, :py:obj:`~.cuTexObjectCreate` """ cdef cudaTextureObject_t pTexObject = cudaTextureObject_t() - cdef 
cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL - cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL - cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL + cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL + cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL with nogil: err = cyruntime.cudaCreateTextureObject(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cyruntime.cudaSuccess: @@ -29843,7 +29843,7 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]): :py:obj:`~.cudaDestroySurfaceObject`, :py:obj:`~.cuSurfObjectCreate` """ cdef cudaSurfaceObject_t pSurfObject = cudaSurfaceObject_t() - cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL with nogil: err = cyruntime.cudaCreateSurfaceObject(pSurfObject._pvt_ptr, cypResDesc_ptr) if err != cyruntime.cudaSuccess: @@ -30334,7 +30334,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddKernelNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -30425,7 +30425,7 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphKernelNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -30558,7 +30558,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID, phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cyruntime.cudaKernelNodeAttrID cyattr = attr.value - cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cyruntime.cudaGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) @@ -30630,7 +30630,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams != None else NULL + cdef 
cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemcpyNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypCopyParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -30807,7 +30807,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms] else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphMemcpyNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -30935,7 +30935,7 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemsetNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypMemsetParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31017,7 +31017,7 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams]) else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphMemsetNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31084,7 +31084,7 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddHostNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31166,7 +31166,7 @@ def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams]) else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphHostNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31764,7 +31764,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[Tup elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + 
str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddExternalSemaphoresSignalNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31853,7 +31853,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[ else: phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31921,7 +31921,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[Tuple elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddExternalSemaphoresWaitNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -32010,7 +32010,7 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu else: phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -32117,7 +32117,7 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[Tuple[cudaGraphNode elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemAllocNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -33509,7 +33509,7 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() - cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL + cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = 
instantiateParams._pvt_ptr if instantiateParams != None else NULL with nogil: err = cyruntime.cudaGraphInstantiateWithParams(pGraphExec._pvt_ptr, cygraph, cyinstantiateParams_ptr) if err != cyruntime.cudaSuccess: @@ -33636,7 +33636,7 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecKernelNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33701,7 +33701,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecMemcpyNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33844,7 +33844,7 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecMemsetNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33899,7 +33899,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecHostNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -34155,7 +34155,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -34215,7 +34215,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) 
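Each wrapper in these hunks also follows the same return convention: a tuple whose first element is the translated error enum (the `_dict_cudaError_t[err]` lookup visible above), followed by any out-parameters, with optional structs again mapping `None` to NULL. A short sketch under those assumptions, instantiating an (empty, for brevity) graph through `cudaGraphInstantiateWithParams`; error handling between calls is elided:

    from cuda.bindings import runtime

    err, graph = runtime.cudaGraphCreate(0)
    # cudaGraphInstantiateParams is the optional struct lowered to a pointer above;
    # the driver writes the outcome back into result_out through that pointer.
    params = runtime.cudaGraphInstantiateParams()
    err, graph_exec = runtime.cudaGraphInstantiateWithParams(graph, params)
    if params.result_out != runtime.cudaGraphInstantiateResult.cudaGraphInstantiateSuccess:
        raise RuntimeError(f"instantiation failed: {params.result_out}")
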
@@ -35072,7 +35072,7 @@ def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | Li string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) elif len(dependencyData) == 1: cydependencyData = (dependencyData[0])._pvt_ptr - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, cydependencyData, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -35122,7 +35122,7 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]): else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphNodeSetParams(cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -35182,7 +35182,7 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN else: pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecNodeSetParams(cygraphExec, cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -36084,7 +36084,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i def cudaGetExportTable(pExportTableId : Optional[cudaUUID_t]): """""" cdef void_ptr ppExportTable = 0 - cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL + cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL with nogil: err = cyruntime.cudaGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cyruntime.cudaSuccess: @@ -36787,7 +36787,7 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL + cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL with nogil: err = cyruntime.cudaEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_cudaError_t[err],) From 6c55c804b6a755ac3cd1e686c92fec5344f41736 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Tue, 22 Jul 2025 10:47:33 -0700 Subject: [PATCH 27/65] cython-gen changes due to cython-gen PR #118 (#120) --- cuda_bindings/cuda/bindings/driver.pyx.in | 140 ++++++++++----------- cuda_bindings/cuda/bindings/runtime.pyx.in | 100 +++++++-------- 2 files changed, 120 insertions(+), 120 deletions(-) diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index e6be2eec47..26cac65c09 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -24512,7 +24512,7 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag pdev = int(CUdevice(dev)) cydev = pdev cdef CUcontext pctx = CUcontext() - cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL + cdef cydriver.CUctxCreateParams* cyctxCreateParams_ptr = ctxCreateParams._pvt_ptr if ctxCreateParams != None else NULL with nogil: err = cydriver.cuCtxCreate(pctx._pvt_ptr, cyctxCreateParams_ptr, flags, cydev) if err != cydriver.CUDA_SUCCESS: @@ -29529,7 +29529,7 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray` """ - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2D(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29656,7 +29656,7 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, 
:py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray` """ - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2DUnaligned(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29786,7 +29786,7 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy3D` """ - cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3D(cypCopy_ptr) return (_dict_CUresult[err],) @@ -29816,7 +29816,7 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]): -------- :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeer` """ - cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DPeer(cypCopy_ptr) return (_dict_CUresult[err],) @@ -30380,7 +30380,7 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy2DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -30520,7 +30520,7 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -30560,7 +30560,7 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL + 
cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = pCopy._pvt_ptr if pCopy != None else NULL with nogil: err = cydriver.cuMemcpy3DPeerAsync(cypCopy_ptr, cyhStream) return (_dict_CUresult[err],) @@ -31516,7 +31516,7 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocArray` """ cdef CUarray pHandle = CUarray() - cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL + cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL with nogil: err = cydriver.cuArrayCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: @@ -32016,7 +32016,7 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]): :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc3DArray` """ cdef CUarray pHandle = CUarray() - cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = pAllocateArray._pvt_ptr if pAllocateArray != None else NULL with nogil: err = cydriver.cuArray3DCreate(pHandle._pvt_ptr, cypAllocateArray_ptr) if err != cydriver.CUDA_SUCCESS: @@ -32186,7 +32186,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, 
:py:obj:`~.cudaMallocMipmappedArray` """ cdef CUmipmappedArray pHandle = CUmipmappedArray() - cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc != None else NULL + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc != None else NULL with nogil: err = cydriver.cuMipmappedArrayCreate(pHandle._pvt_ptr, cypMipmappedArrayDesc_ptr, numMipmapLevels) if err != cydriver.CUDA_SUCCESS: @@ -32433,7 +32433,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz else: pstream = int(CUstream(stream)) cystream = pstream - cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray != None else NULL + cdef cydriver.CUmemDecompressParams* cyparamsArray_ptr = paramsArray._pvt_ptr if paramsArray != None else NULL cdef size_t errorIndex = 0 with nogil: err = cydriver.cuMemBatchDecompressAsync(cyparamsArray_ptr, count, flags, &errorIndex, cystream) @@ -32619,7 +32619,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle` """ cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() - cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cydriver.cuMemCreate(handle._pvt_ptr, size, cyprop_ptr, flags) if err != cydriver.CUDA_SUCCESS: @@ -33082,7 +33082,7 @@ def cuMemGetAccess(location : Optional[CUmemLocation], ptr): pptr = int(CUdeviceptr(ptr)) cyptr = pptr cdef unsigned long long flags = 0 - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cydriver.cuMemGetAccess(&flags, cylocation_ptr, cyptr) if err != cydriver.CUDA_SUCCESS: @@ -33230,7 +33230,7 @@ def cuMemGetAllocationGranularity(prop : Optional[CUmemAllocationProp], option n :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap` """ cdef size_t granularity = 0 - cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmemAllocationProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmemAllocationGranularity_flags cyoption = option.value with nogil: err = cydriver.cuMemGetAllocationGranularity(&granularity, cyprop_ptr, cyoption) @@ -33719,7 +33719,7 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]): pmemPool = int(CUmemoryPool(memPool)) cymemPool = pmemPool cdef cydriver.CUmemAccess_flags flags - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cydriver.cuMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cydriver.CUDA_SUCCESS: @@ -33803,7 +33803,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]): Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. 
""" cdef CUmemoryPool pool = CUmemoryPool() - cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL + cdef cydriver.CUmemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL with nogil: err = cydriver.cuMemPoolCreate(pool._pvt_ptr, cypoolProps_ptr) if err != cydriver.CUDA_SUCCESS: @@ -33893,7 +33893,7 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, :py:obj:`~.cuMemPoolSetAccess`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate` """ cdef CUmemoryPool pool_out = CUmemoryPool() - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemGetDefaultMemPool(pool_out._pvt_ptr, cylocation_ptr, cytypename) @@ -33946,7 +33946,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` """ cdef CUmemoryPool pool = CUmemoryPool() - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemGetMemPool(pool._pvt_ptr, cylocation_ptr, cytypename) @@ -34014,7 +34014,7 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme else: ppool = int(CUmemoryPool(pool)) cypool = ppool - cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cydriver.CUmemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cydriver.CUmemAllocationType cytypename = typename.value with nogil: err = cydriver.cuMemSetMemPool(cylocation_ptr, cytypename, cypool) @@ -34274,7 +34274,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]): ppool = int(CUmemoryPool(pool)) cypool = ppool cdef CUdeviceptr ptr_out = CUdeviceptr() - cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = shareData._pvt_ptr if shareData != None else NULL + cdef cydriver.CUmemPoolPtrExportData* cyshareData_ptr = shareData._pvt_ptr if shareData != None else NULL with nogil: err = cydriver.cuMemPoolImportPointer(ptr_out._pvt_ptr, cypool, cyshareData_ptr) if err != cydriver.CUDA_SUCCESS: @@ -34335,7 +34335,7 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]): :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemRelease`, :py:obj:`~.cuMemExportToShareableHandle`, :py:obj:`~.cuMemImportFromShareableHandle` """ cdef CUmemGenericAllocationHandle mcHandle = CUmemGenericAllocationHandle() - cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cydriver.cuMulticastCreate(mcHandle._pvt_ptr, cyprop_ptr) if err != cydriver.CUDA_SUCCESS: @@ -34644,7 +34644,7 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not :py:obj:`~.cuMulticastCreate`, :py:obj:`~.cuMulticastBindMem`, :py:obj:`~.cuMulticastBindAddr`, :py:obj:`~.cuMulticastUnbind` """ cdef 
size_t granularity = 0 - cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cydriver.CUmulticastObjectProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL cdef cydriver.CUmulticastGranularity_flags cyoption = option.value with nogil: err = cydriver.cuMulticastGetGranularity(&granularity, cyprop_ptr, cyoption) @@ -37424,7 +37424,7 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option phStream = int(CUstream(hStream)) cyhStream = phStream cdef cydriver.CUstreamAttrID cyattr = attr.value - cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cydriver.CUstreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cydriver.cuStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) @@ -37972,7 +37972,7 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_ and Cache Control" chapter from Vulkan specification. """ cdef CUexternalMemory extMem_out = CUexternalMemory() - cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_HANDLE_DESC* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL with nogil: err = cydriver.cuImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38042,7 +38042,7 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_ pextMem = int(CUexternalMemory(extMem)) cyextMem = pextMem cdef CUdeviceptr devPtr = CUdeviceptr() - cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_BUFFER_DESC* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL with nogil: err = cydriver.cuExternalMemoryGetMappedBuffer(devPtr._pvt_ptr, cyextMem, cybufferDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38118,7 +38118,7 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E pextMem = int(CUexternalMemory(extMem)) cyextMem = pextMem cdef CUmipmappedArray mipmap = CUmipmappedArray() - cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL with nogil: err = cydriver.cuExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -38307,7 +38307,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H :py:obj:`~.cuDestroyExternalSemaphore`, :py:obj:`~.cuSignalExternalSemaphoresAsync`, :py:obj:`~.cuWaitExternalSemaphoresAsync` """ cdef CUexternalSemaphore extSem_out = CUexternalSemaphore() - cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL + cdef cydriver.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL with nogil: err = cydriver.cuImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -39859,7 +39859,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt else: pf = int(CUfunction(f)) cyf = pf - cdef cydriver.CUlaunchConfig* cyconfig_ptr = 
config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL cykernelParams = utils.HelperKernelParams(kernelParams) cdef void** cykernelParams_ptr = cykernelParams.ckernelParams with nogil: @@ -40966,7 +40966,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddKernelNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41057,7 +41057,7 @@ def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphKernelNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41144,7 +41144,7 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL with nogil: err = cydriver.cuGraphAddMemcpyNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cycopyParams_ptr, cyctx) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41226,7 +41226,7 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphMemcpyNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41303,7 +41303,7 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | Li elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL with nogil: err = cydriver.cuGraphAddMemsetNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cymemsetParams_ptr, cyctx) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41385,7 +41385,7 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR else: phNode = 
int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphMemsetNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -41452,7 +41452,7 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddHostNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -41534,7 +41534,7 @@ def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS] else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphHostNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42127,7 +42127,7 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[Tuple elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddExternalSemaphoresSignalNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42216,7 +42216,7 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42284,7 +42284,7 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[Tuple[C elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddExternalSemaphoresWaitNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, 
numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42373,7 +42373,7 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42444,7 +42444,7 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddBatchMemOpNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -42534,7 +42534,7 @@ def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_O else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphBatchMemOpNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42605,7 +42605,7 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[ else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_BATCH_MEM_OP_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecBatchMemOpNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -42713,7 +42713,7 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | elif len(dependencies) == 1: cydependencies = (dependencies[0])._pvt_ptr if numDependencies > len(dependencies): raise RuntimeError("List is too small: " + str(len(dependencies)) + " < " + str(numDependencies)) - cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_MEM_ALLOC_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddMemAllocNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -44031,7 +44031,7 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH phGraph = int(CUgraph(hGraph)) cyhGraph = phGraph cdef CUgraphExec phGraphExec = CUgraphExec() - cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL + cdef cydriver.CUDA_GRAPH_INSTANTIATE_PARAMS* 
cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL with nogil: err = cydriver.cuGraphInstantiateWithParams(phGraphExec._pvt_ptr, cyhGraph, cyinstantiateParams_ptr) if err != cydriver.CUDA_SUCCESS: @@ -44157,7 +44157,7 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_KERNEL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecKernelNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44232,7 +44232,7 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL + cdef cydriver.CUDA_MEMCPY3D* cycopyParams_ptr = copyParams._pvt_ptr if copyParams != None else NULL with nogil: err = cydriver.cuGraphExecMemcpyNodeSetParams(cyhGraphExec, cyhNode, cycopyParams_ptr, cyctx) return (_dict_CUresult[err],) @@ -44312,7 +44312,7 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL + cdef cydriver.CUDA_MEMSET_NODE_PARAMS* cymemsetParams_ptr = memsetParams._pvt_ptr if memsetParams != None else NULL with nogil: err = cydriver.cuGraphExecMemsetNodeSetParams(cyhGraphExec, cyhNode, cymemsetParams_ptr, cyctx) return (_dict_CUresult[err],) @@ -44367,7 +44367,7 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_HOST_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecHostNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44623,7 +44623,7 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -44683,7 +44683,7 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUDA_EXT_SEM_WAIT_NODE_PARAMS* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -45301,7 +45301,7 @@ def 
cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val phNode = int(CUgraphNode(hNode)) cyhNode = phNode cdef cydriver.CUkernelNodeAttrID cyattr = attr.value - cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cydriver.CUkernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cydriver.cuGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_CUresult[err],) @@ -45687,7 +45687,7 @@ def cuGraphAddNode(hGraph, dependencies : Optional[Tuple[CUgraphNode] | List[CUg string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cydriver.CUgraphEdgeData)) elif len(dependencyData) == 1: cydependencyData = (dependencyData[0])._pvt_ptr - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphAddNode(phGraphNode._pvt_ptr, cyhGraph, cydependencies, cydependencyData, numDependencies, cynodeParams_ptr) if len(dependencies) > 1 and cydependencies is not NULL: @@ -45737,7 +45737,7 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]): else: phNode = int(CUgraphNode(hNode)) cyhNode = phNode - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -45797,7 +45797,7 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod else: phGraphExec = int(CUgraphExec(hGraphExec)) cyhGraphExec = phGraphExec - cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cydriver.CUgraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cydriver.cuGraphExecNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_CUresult[err],) @@ -46260,7 +46260,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]): pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int clusterSize = 0 - cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL with nogil: err = cydriver.cuOccupancyMaxPotentialClusterSize(&clusterSize, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: @@ -46320,7 +46320,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]): pfunc = int(CUfunction(func)) cyfunc = pfunc cdef int numClusters = 0 - cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL + cdef cydriver.CUlaunchConfig* cyconfig_ptr = config._pvt_ptr if config != None else NULL with nogil: err = cydriver.cuOccupancyMaxActiveClusters(&numClusters, cyfunc, cyconfig_ptr) if err != cydriver.CUDA_SUCCESS: @@ -46584,7 +46584,7 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr, else: phTexRef = int(CUtexref(hTexRef)) cyhTexRef = phTexRef - cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cydriver.cuTexRefSetAddress2D(cyhTexRef, 
cydesc_ptr, cydptr, Pitch) return (_dict_CUresult[err],) @@ -47955,9 +47955,9 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option :py:obj:`~.cuTexObjectDestroy`, :py:obj:`~.cudaCreateTextureObject` """ cdef CUtexObject pTexObject = CUtexObject() - cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL - cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL - cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cydriver.CUDA_TEXTURE_DESC* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_VIEW_DESC* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL with nogil: err = cydriver.cuTexObjectCreate(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -48159,7 +48159,7 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]): :py:obj:`~.cuSurfObjectDestroy`, :py:obj:`~.cudaCreateSurfaceObject` """ cdef CUsurfObject pSurfObject = CUsurfObject() - cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL with nogil: err = cydriver.cuSurfObjectCreate(pSurfObject._pvt_ptr, cypResDesc_ptr) if err != cydriver.CUDA_SUCCESS: @@ -49258,7 +49258,7 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress): -------- :py:obj:`~.cuTensorMapEncodeTiled`, :py:obj:`~.cuTensorMapEncodeIm2col`, :py:obj:`~.cuTensorMapEncodeIm2colWide` """ - cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap != None else NULL + cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap != None else NULL cyglobalAddress = utils.HelperInputVoidPtr(globalAddress) cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr with nogil: @@ -50576,7 +50576,7 @@ def cuGetExportTable(pExportTableId : Optional[CUuuid]): None """ cdef void_ptr ppExportTable = 0 - cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL + cdef cydriver.CUuuid* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL with nogil: err = cydriver.cuGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cydriver.CUDA_SUCCESS: @@ -50999,7 +50999,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe if cyresult is NULL: raise MemoryError('Failed to allocate length x size memory: ' + str(nbGroups) + 'x' + str(sizeof(cydriver.CUdevResource))) cdef unsigned int cynbGroups = nbGroups - cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ != None else NULL + cdef cydriver.CUdevResource* cyinput__ptr = input_._pvt_ptr if input_ != None else NULL cdef CUdevResource remaining = CUdevResource() with nogil: err = cydriver.cuDevSmResourceSplitByCount(cyresult, &cynbGroups, cyinput__ptr, remaining._pvt_ptr, useFlags, minCount) @@ -51677,7 +51677,7 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]): CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` 
:py:obj:`~.CUDA_ERROR_NOT_READY` """ - cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessLock(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -51708,7 +51708,7 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ - cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessCheckpoint(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -51737,7 +51737,7 @@ def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]): CUresult :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` """ - cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL + cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = args._pvt_ptr if args != None else NULL with nogil: err = cydriver.cuCheckpointProcessUnlock(pid, cyargs_ptr) return (_dict_CUresult[err],) @@ -52356,7 +52356,7 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected , found " + str(type(conn))) - cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL + cdef cydriver.CUeglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL with nogil: err = cydriver.cuEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_CUresult[err],) diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 4f4ef937dd..939e1dfcc1 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -18626,7 +18626,7 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth` """ cdef size_t maxWidthInElements = 0 - cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = fmtDesc._pvt_ptr if fmtDesc != None else NULL with nogil: err = cyruntime.cudaDeviceGetTexture1DLinearMaxWidth(&maxWidthInElements, cyfmtDesc_ptr, device) if err != cyruntime.cudaSuccess: @@ -20051,7 +20051,7 @@ def cudaChooseDevice(prop : Optional[cudaDeviceProp]): :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaInitDevice` """ cdef int device = 0 - cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL + cdef cyruntime.cudaDeviceProp* cyprop_ptr = prop._pvt_ptr if prop != None else NULL with nogil: err = cyruntime.cudaChooseDevice(&device, cyprop_ptr) if err != cyruntime.cudaSuccess: @@ -20792,7 +20792,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op phStream = int(cudaStream_t(hStream)) cyhStream = phStream cdef cyruntime.cudaStreamAttrID cyattr = attr.value - cdef 
cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cyruntime.cudaStreamAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cyruntime.cudaStreamSetAttribute(cyhStream, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) @@ -22291,7 +22291,7 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe and Cache Control" chapter from Vulkan specification. """ cdef cudaExternalMemory_t extMem_out = cudaExternalMemory_t() - cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL + cdef cyruntime.cudaExternalMemoryHandleDesc* cymemHandleDesc_ptr = memHandleDesc._pvt_ptr if memHandleDesc != None else NULL with nogil: err = cyruntime.cudaImportExternalMemory(extMem_out._pvt_ptr, cymemHandleDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22359,7 +22359,7 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal pextMem = int(cudaExternalMemory_t(extMem)) cyextMem = pextMem cdef void_ptr devPtr = 0 - cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL + cdef cyruntime.cudaExternalMemoryBufferDesc* cybufferDesc_ptr = bufferDesc._pvt_ptr if bufferDesc != None else NULL with nogil: err = cyruntime.cudaExternalMemoryGetMappedBuffer(&devPtr, cyextMem, cybufferDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22431,7 +22431,7 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda pextMem = int(cudaExternalMemory_t(extMem)) cyextMem = pextMem cdef cudaMipmappedArray_t mipmap = cudaMipmappedArray_t() - cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL + cdef cyruntime.cudaExternalMemoryMipmappedArrayDesc* cymipmapDesc_ptr = mipmapDesc._pvt_ptr if mipmapDesc != None else NULL with nogil: err = cyruntime.cudaExternalMemoryGetMappedMipmappedArray(mipmap._pvt_ptr, cyextMem, cymipmapDesc_ptr) if err != cyruntime.cudaSuccess: @@ -22618,7 +22618,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa :py:obj:`~.cudaDestroyExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` """ cdef cudaExternalSemaphore_t extSem_out = cudaExternalSemaphore_t() - cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL + cdef cyruntime.cudaExternalSemaphoreHandleDesc* cysemHandleDesc_ptr = semHandleDesc._pvt_ptr if semHandleDesc != None else NULL with nogil: err = cyruntime.cudaImportExternalSemaphore(extSem_out._pvt_ptr, cysemHandleDesc_ptr) if err != cyruntime.cudaSuccess: @@ -23803,7 +23803,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuArrayCreate` """ cdef cudaArray_t array = cudaArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMallocArray(array._pvt_ptr, cydesc_ptr, width, height, flags) if err != cyruntime.cudaSuccess: @@ -24462,7 +24462,7 @@ def 
cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None : :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate` """ cdef cudaArray_t array = cudaArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMalloc3DArray(array._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], flags) if err != cyruntime.cudaSuccess: @@ -24588,7 +24588,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate` """ cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t() - cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL + cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = desc._pvt_ptr if desc != None else NULL with nogil: err = cyruntime.cudaMallocMipmappedArray(mipmappedArray._pvt_ptr, cydesc_ptr, extent._pvt_ptr[0], numLevels, flags) if err != cyruntime.cudaSuccess: @@ -24723,7 +24723,7 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]): -------- :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D` """ - cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3D(cyp_ptr) return (_dict_cudaError_t[err],) @@ -24760,7 +24760,7 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]): -------- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeer` """ - cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DPeer(cyp_ptr) return (_dict_cudaError_t[err],) @@ -24865,7 +24865,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DAsync(cyp_ptr, cystream) return (_dict_cudaError_t[err],) @@ -24905,7 +24905,7 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream): else: pstream = int(cudaStream_t(stream)) cystream 
= pstream - cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL + cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = p._pvt_ptr if p != None else NULL with nogil: err = cyruntime.cudaMemcpy3DPeerAsync(cyp_ptr, cystream) return (_dict_cudaError_t[err],) @@ -28203,7 +28203,7 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]): pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef cyruntime.cudaMemAccessFlags flags - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL with nogil: err = cyruntime.cudaMemPoolGetAccess(&flags, cymemPool, cylocation_ptr) if err != cyruntime.cudaSuccess: @@ -28287,7 +28287,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]): Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC. """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL + cdef cyruntime.cudaMemPoolProps* cypoolProps_ptr = poolProps._pvt_ptr if poolProps != None else NULL with nogil: err = cyruntime.cudaMemPoolCreate(memPool._pvt_ptr, cypoolProps_ptr) if err != cyruntime.cudaSuccess: @@ -28377,7 +28377,7 @@ def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, cuMemPoolSetAccess, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate` """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemGetDefaultMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) @@ -28431,7 +28431,7 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None : :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool` """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemGetMemPool(memPool._pvt_ptr, cylocation_ptr, cytypename) @@ -28500,7 +28500,7 @@ def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None : else: pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool - cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL + cdef cyruntime.cudaMemLocation* cylocation_ptr = location._pvt_ptr if location != None else NULL cdef cyruntime.cudaMemAllocationType cytypename = typename.value with nogil: err = cyruntime.cudaMemSetMemPool(cylocation_ptr, cytypename, cymemPool) @@ -28749,7 +28749,7 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport pmemPool = int(cudaMemPool_t(memPool)) cymemPool = pmemPool cdef void_ptr ptr = 0 - cdef cyruntime.cudaMemPoolPtrExportData* cyexportData_ptr = exportData._pvt_ptr if exportData != None else NULL + cdef cyruntime.cudaMemPoolPtrExportData* 
cyexportData_ptr = exportData._pvt_ptr if exportData != None else NULL with nogil: err = cyruntime.cudaMemPoolImportPointer(&ptr, cymemPool, cyexportData_ptr) if err != cyruntime.cudaSuccess: @@ -29640,9 +29640,9 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op :py:obj:`~.cudaDestroyTextureObject`, :py:obj:`~.cuTexObjectCreate` """ cdef cudaTextureObject_t pTexObject = cudaTextureObject_t() - cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL - cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL - cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL + cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cyruntime.cudaTextureDesc* cypTexDesc_ptr = pTexDesc._pvt_ptr if pTexDesc != None else NULL + cdef cyruntime.cudaResourceViewDesc* cypResViewDesc_ptr = pResViewDesc._pvt_ptr if pResViewDesc != None else NULL with nogil: err = cyruntime.cudaCreateTextureObject(pTexObject._pvt_ptr, cypResDesc_ptr, cypTexDesc_ptr, cypResViewDesc_ptr) if err != cyruntime.cudaSuccess: @@ -29843,7 +29843,7 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]): :py:obj:`~.cudaDestroySurfaceObject`, :py:obj:`~.cuSurfObjectCreate` """ cdef cudaSurfaceObject_t pSurfObject = cudaSurfaceObject_t() - cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL + cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = pResDesc._pvt_ptr if pResDesc != None else NULL with nogil: err = cyruntime.cudaCreateSurfaceObject(pSurfObject._pvt_ptr, cypResDesc_ptr) if err != cyruntime.cudaSuccess: @@ -30334,7 +30334,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddKernelNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -30425,7 +30425,7 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphKernelNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -30558,7 +30558,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID, phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode cdef cyruntime.cudaKernelNodeAttrID cyattr = attr.value - cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL + cdef cyruntime.cudaKernelNodeAttrValue* cyvalue_ptr = value._pvt_ptr if value != None else NULL with nogil: err = cyruntime.cudaGraphKernelNodeSetAttribute(cyhNode, cyattr, cyvalue_ptr) return (_dict_cudaError_t[err],) @@ 
-30630,7 +30630,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cypCopyParams_ptr = pCopyParams._pvt_ptr if pCopyParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemcpyNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypCopyParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -30807,7 +30807,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms] else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphMemcpyNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -30935,7 +30935,7 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypMemsetParams_ptr = pMemsetParams._pvt_ptr if pMemsetParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemsetNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypMemsetParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31017,7 +31017,7 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams]) else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphMemsetNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31084,7 +31084,7 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddHostNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cypNodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31166,7 +31166,7 @@ def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams]) else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr 
if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphHostNodeSetParams(cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31764,7 +31764,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[Tup elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddExternalSemaphoresSignalNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -31853,7 +31853,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[ else: phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExternalSemaphoresSignalNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -31921,7 +31921,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[Tuple elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddExternalSemaphoresWaitNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -32010,7 +32010,7 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu else: phNode = int(cudaGraphNode_t(hNode)) cyhNode = phNode - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExternalSemaphoresWaitNodeSetParams(cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -32117,7 +32117,7 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[Tuple[cudaGraphNode elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaMemAllocNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddMemAllocNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: 
@@ -33509,7 +33509,7 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph pgraph = int(cudaGraph_t(graph)) cygraph = pgraph cdef cudaGraphExec_t pGraphExec = cudaGraphExec_t() - cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL + cdef cyruntime.cudaGraphInstantiateParams* cyinstantiateParams_ptr = instantiateParams._pvt_ptr if instantiateParams != None else NULL with nogil: err = cyruntime.cudaGraphInstantiateWithParams(pGraphExec._pvt_ptr, cygraph, cyinstantiateParams_ptr) if err != cyruntime.cudaSuccess: @@ -33636,7 +33636,7 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaKernelNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecKernelNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33701,7 +33701,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemcpy3DParms* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecMemcpyNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33844,7 +33844,7 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaMemsetParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecMemsetNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -33899,7 +33899,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL + cdef cyruntime.cudaHostNodeParams* cypNodeParams_ptr = pNodeParams._pvt_ptr if pNodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecHostNodeSetParams(cyhGraphExec, cynode, cypNodeParams_ptr) return (_dict_cudaError_t[err],) @@ -34155,7 +34155,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreSignalNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecExternalSemaphoresSignalNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -34215,7 +34215,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec 
= phGraphExec - cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaExternalSemaphoreWaitNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecExternalSemaphoresWaitNodeSetParams(cyhGraphExec, cyhNode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -35072,7 +35072,7 @@ def cudaGraphAddNode(graph, pDependencies : Optional[Tuple[cudaGraphNode_t] | Li string.memcpy(&cydependencyData[idx], (dependencyData[idx])._pvt_ptr, sizeof(cyruntime.cudaGraphEdgeData)) elif len(dependencyData) == 1: cydependencyData = (dependencyData[0])._pvt_ptr - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphAddNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, cydependencyData, numDependencies, cynodeParams_ptr) if len(pDependencies) > 1 and cypDependencies is not NULL: @@ -35122,7 +35122,7 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]): else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphNodeSetParams(cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -35182,7 +35182,7 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN else: pgraphExec = int(cudaGraphExec_t(graphExec)) cygraphExec = pgraphExec - cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL + cdef cyruntime.cudaGraphNodeParams* cynodeParams_ptr = nodeParams._pvt_ptr if nodeParams != None else NULL with nogil: err = cyruntime.cudaGraphExecNodeSetParams(cygraphExec, cynode, cynodeParams_ptr) return (_dict_cudaError_t[err],) @@ -36084,7 +36084,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i def cudaGetExportTable(pExportTableId : Optional[cudaUUID_t]): """""" cdef void_ptr ppExportTable = 0 - cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL + cdef cyruntime.cudaUUID_t* cypExportTableId_ptr = pExportTableId._pvt_ptr if pExportTableId != None else NULL with nogil: err = cyruntime.cudaGetExportTable(&ppExportTable, cypExportTableId_ptr) if err != cyruntime.cudaSuccess: @@ -36787,7 +36787,7 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS cyconn = conn else: raise TypeError("Argument 'conn' is not instance of type (expected <class 'cuda.bindings.runtime.cudaEglStreamConnection'>, found " + str(type(conn))) - cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL + cdef cyruntime.cudaEglFrame* cyeglframe_ptr = eglframe._pvt_ptr if eglframe != None else NULL with nogil: err = cyruntime.cudaEGLStreamProducerReturnFrame(cyconn, cyeglframe_ptr, cypStream) return (_dict_cudaError_t[err],)
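Every hunk in the two generated files above follows the same calling convention: an Optional Python struct wrapper is lowered to a C struct pointer (NULL when the argument is None), the GIL is released around the driver or runtime call, and the status code is returned as the first element of a tuple. A minimal pure-Python sketch of that shape, using ctypes; the names Params, bound_call, and fake_driver_call are illustrative and not part of the bindings:

    import ctypes
    from typing import Optional

    class Params(ctypes.Structure):
        # Hypothetical stand-in for a generated CUDA_* params struct.
        _fields_ = [("value", ctypes.c_int)]

    def fake_driver_call(ptr) -> int:
        # Stands in for a cydriver.cu* / cyruntime.cuda* call; 0 plays
        # the role of CUDA_SUCCESS / cudaSuccess here.
        return 0 if ptr is not None else 1

    def bound_call(params: Optional[Params]):
        # None becomes a NULL pointer, any other wrapper is passed by
        # address, mirroring `x._pvt_ptr if x != None else NULL` above.
        params_ptr = ctypes.byref(params) if params is not None else None
        err = fake_driver_call(params_ptr)
        return (err,)  # the status code is always the first tuple element

    assert bound_call(Params(7)) == (0,)
    assert bound_call(None) == (1,)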
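The graph-node hunks also share a second convention: numDependencies is supplied separately from the dependency sequence, so the sequence length is validated before the C array is built, and a single-element sequence skips the array allocation. A sketch of just the validation step; the helper name pack_dependencies is illustrative:

    from typing import Sequence

    def pack_dependencies(deps: Sequence[int], num_dependencies: int) -> list:
        # Fail fast when the caller-supplied count exceeds the sequence,
        # matching the generated "List is too small" RuntimeError.
        if num_dependencies > len(deps):
            raise RuntimeError(
                "List is too small: %d < %d" % (len(deps), num_dependencies))
        return list(deps[:num_dependencies])

    assert pack_dependencies([10, 20, 30], 2) == [10, 20]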
Grosse-Kunstleve" Date: Tue, 22 Jul 2025 15:19:47 -0700 Subject: [PATCH 28/65] test_cufile.py: pytest.skip("NEEDS DEBUGGING (unreleased-13.0)") --- cuda_bindings/tests/test_cufile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 4962a7feda..259afe57ab 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -20,6 +20,8 @@ if cufile is None: pytest.skip("skipping tests on Windows", allow_module_level=True) +else: + pytest.skip("NEEDS DEBUGGING (unreleased-13.0)", allow_module_level=True) def cufileLibraryAvailable(): From 9fc982fe902f28b7aa3edddb32446a586ad723ad Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 23 Jul 2025 14:33:36 -0700 Subject: [PATCH 29/65] cython-gen changes on top of `git merge world-main -X ours` product. --- cuda_bindings/cuda/bindings/runtime.pyx.in | 6 ++++++ cuda_bindings/cuda/bindings/utils/_get_handle.pyx.in | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index fea2c66db0..43ce11ee88 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -5767,6 +5767,12 @@ cdef class cudaLogsCallbackHandle: return '' def __index__(self): return self.__int__() + def __eq__(self, other): + if not isinstance(other, cudaLogsCallbackHandle): + return False + return self._pvt_ptr[0] == (other)._pvt_ptr[0] + def __hash__(self): + return hash((self._pvt_ptr[0])) def __int__(self): return self._pvt_ptr[0] def getPtr(self): diff --git a/cuda_bindings/cuda/bindings/utils/_get_handle.pyx.in b/cuda_bindings/cuda/bindings/utils/_get_handle.pyx.in index 2d40133dba..30718591e6 100644 --- a/cuda_bindings/cuda/bindings/utils/_get_handle.pyx.in +++ b/cuda_bindings/cuda/bindings/utils/_get_handle.pyx.in @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -# This code was automatically generated with version 12.9.0. Do not modify it directly. +# This code was automatically generated with version 13.0.0. Do not modify it directly. from libc.stdint cimport uintptr_t cimport cython @@ -220,6 +220,10 @@ def get_cuda_native_handle(obj) -> int: def cudaAsyncCallbackHandle_t_getter(runtime.cudaAsyncCallbackHandle_t x): return (x._pvt_ptr[0]) _handle_getters[runtime.cudaAsyncCallbackHandle_t] = cudaAsyncCallbackHandle_t_getter {{endif}} + {{if 'cudaLogsCallbackHandle' in found_types}} + def cudaLogsCallbackHandle_getter(runtime.cudaLogsCallbackHandle x): return (x._pvt_ptr[0]) + _handle_getters[runtime.cudaLogsCallbackHandle] = cudaLogsCallbackHandle_getter + {{endif}} {{if True}} def cudaEglStreamConnection_getter(runtime.cudaEglStreamConnection x): return (x._pvt_ptr[0]) _handle_getters[runtime.cudaEglStreamConnection] = cudaEglStreamConnection_getter From 29628eaef60e13f950c7a9f9cd7d6dd9407b3fc5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 23 Jul 2025 15:03:15 -0700 Subject: [PATCH 30/65] Add missing imports (related to NVIDIA/cuda-python#769). 
These got lost due to merging with `-X ours` --- cuda_bindings/tests/test_cuda.py | 1 + cuda_bindings/tests/test_cudart.py | 1 + 2 files changed, 2 insertions(+) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 4db3939166..8479c2dc04 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -10,6 +10,7 @@ import cuda.bindings.driver as cuda import cuda.bindings.runtime as cudart +from cuda.bindings import driver def driverVersionLessThan(target): diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index 100882f32b..21e902733f 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -9,6 +9,7 @@ import cuda.bindings.driver as cuda import cuda.bindings.runtime as cudart +from cuda.bindings import runtime def isSuccess(err): From 3f033123bcc80e6af7a37e246367e99345aa22fc Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 24 Jul 2025 15:52:00 -0700 Subject: [PATCH 31/65] Revert obsolete aec7d10c6d608e9184a81cc52583f1de38217e3a Made obsolete by https://github.com/NVIDIA/cuda-python/pull/778 --- cuda_bindings/tests/test_cufile.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 843e83319b..1a51348680 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -35,8 +35,6 @@ def platform_is_wsl(): if cufile is None: pytest.skip("skipping tests on Windows", allow_module_level=True) -else: - pytest.skip("NEEDS DEBUGGING (unreleased-13.0)", allow_module_level=True) if platform_is_wsl(): pytest.skip("skipping cuFile tests on WSL", allow_module_level=True) From 27d84d9d99a24a9df4be0eb1d5bc652831061388 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 24 Jul 2025 15:54:38 -0700 Subject: [PATCH 32/65] Remove cuda_bindings/site-packages entirely. 
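The redirector being deleted here existed only to serve the deprecated ``cuda.__version__`` attribute; its FutureWarning (visible in the removed file below) already pointed users at the supported spelling. A minimal sketch of the replacement, assuming only that ``cuda.bindings`` is installed:

    # Deprecated spelling (previously intercepted by the .pth redirector):
    #   import cuda; cuda.__version__
    # Supported spelling after this change:
    from cuda import bindings

    print(bindings.__version__)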
--- cuda_bindings/setup.py | 70 ------------------- .../_cuda_bindings_redirector.pth | 4 -- .../_cuda_bindings_redirector.py | 29 -------- 3 files changed, 103 deletions(-) delete mode 100644 cuda_bindings/site-packages/_cuda_bindings_redirector.pth delete mode 100644 cuda_bindings/site-packages/_cuda_bindings_redirector.py diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py index 409c48eda8..0e5d608db5 100644 --- a/cuda_bindings/setup.py +++ b/cuda_bindings/setup.py @@ -382,79 +382,9 @@ def build_extension(self, ext): super().build_extension(ext) -################################################################################ -# Adapted from NVIDIA/numba-cuda -# TODO: Remove this block once we get rid of cuda.__version__ and the .pth files - -REDIRECTOR_PTH = "_cuda_bindings_redirector.pth" -REDIRECTOR_PY = "_cuda_bindings_redirector.py" -SITE_PACKAGES = pathlib.Path("site-packages") - - -class build_py_with_redirector(build_py): # noqa: N801 - """Include the redirector files in the generated wheel.""" - - def copy_redirector_file(self, source, destination="."): - destination = pathlib.Path(self.build_lib) / destination - self.copy_file(str(source), str(destination), preserve_mode=0) - - def run(self): - super().run() - self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PTH) - self.copy_redirector_file(SITE_PACKAGES / REDIRECTOR_PY) - - def get_source_files(self): - src = super().get_source_files() - src.extend( - [ - str(SITE_PACKAGES / REDIRECTOR_PTH), - str(SITE_PACKAGES / REDIRECTOR_PY), - ] - ) - return src - - def get_output_mapping(self): - mapping = super().get_output_mapping() - build_lib = pathlib.Path(self.build_lib) - mapping[str(build_lib / REDIRECTOR_PTH)] = REDIRECTOR_PTH - mapping[str(build_lib / REDIRECTOR_PY)] = REDIRECTOR_PY - return mapping - - -class TopLevelFinderWithRedirector(_TopLevelFinder): - """Include the redirector files in the editable wheel.""" - - def get_implementation(self): - for item in super().get_implementation(): # noqa: UP028 - yield item - - with open(SITE_PACKAGES / REDIRECTOR_PTH) as f: - yield (REDIRECTOR_PTH, f.read()) - - with open(SITE_PACKAGES / REDIRECTOR_PY) as f: - yield (REDIRECTOR_PY, f.read()) - - -class editable_wheel_with_redirector(editable_wheel): - def _select_strategy(self, name, tag, build_lib): - # The default mode is "lenient" - others are "strict" and "compat". - # "compat" is deprecated. "strict" creates a tree of links to files in - # the repo. It could be implemented, but we only handle the default - # case for now. - if self.mode is not None and self.mode != "lenient": - raise RuntimeError(f"Only lenient mode is supported for editable install. Current mode is {self.mode}") - - return TopLevelFinderWithRedirector(self.distribution, name) - - -################################################################################ - - cmdclass = { "bdist_wheel": WheelsBuildExtensions, "build_ext": ParallelBuildExtensions, - "build_py": build_py_with_redirector, - "editable_wheel": editable_wheel_with_redirector, } # ---------------------------------------------------------------------- diff --git a/cuda_bindings/site-packages/_cuda_bindings_redirector.pth b/cuda_bindings/site-packages/_cuda_bindings_redirector.pth deleted file mode 100644 index 9371fb3645..0000000000 --- a/cuda_bindings/site-packages/_cuda_bindings_redirector.pth +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import _cuda_bindings_redirector diff --git a/cuda_bindings/site-packages/_cuda_bindings_redirector.py b/cuda_bindings/site-packages/_cuda_bindings_redirector.py deleted file mode 100644 index cce666aa80..0000000000 --- a/cuda_bindings/site-packages/_cuda_bindings_redirector.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import sys -from types import ModuleType - - -class LazyCudaModule(ModuleType): - - def __getattr__(self, name): - if name == '__version__': - import warnings - warnings.warn( - "accessing cuda.__version__ is deprecated, " "please switch to use cuda.bindings.__version__ instead", - FutureWarning, - stacklevel=2, - ) - from cuda.bindings import __version__ - - return __version__ - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - -# Important: We need to populate the cuda namespace module first, otherwise -# we'd lose access to any of its submodules. This is a cheap op because there -# is nothing under cuda.bindings. -import cuda.bindings -sys.modules['cuda'].__class__ = LazyCudaModule From fa4f39c0ae0b5563e66b09d8bc5766b3381da063 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 26 Jul 2025 21:18:59 -0700 Subject: [PATCH 33/65] Change test_batch_io_large_operations to avoid a flood of output (`assert read_data == expected_data` failure). --- cuda_bindings/tests/test_cufile.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 1a51348680..3ab2d4220f 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -1573,7 +1573,14 @@ def test_batch_io_large_operations(): repetitions = buf_size // test_string_len expected_data = (test_string * repetitions)[:buf_size] - assert read_data == expected_data, f"Read data doesn't match written data for operation {i}" + if read_data != expected_data: + n = 100 # Show first n bytes + raise RuntimeError( + f"Read data doesn't match written data for operation {i}: " + f"{len(read_data)=}, {len(expected_data)=}, " + f"first {n} bytes: read {read_data[:n]!r}, " + f"expected {expected_data[:n]!r}" + ) # Clean up batch IO cufile.batch_io_destroy(batch_handle) From 68fc5e9e943515a82467bc94ee988d768c035190 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 26 Jul 2025 22:09:58 -0700 Subject: [PATCH 34/65] Remove `(scope="module")` from `cufile_env_json` fixture: resolves test_batch_io_large_operations failure. --- cuda_bindings/tests/test_cufile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 3ab2d4220f..02053a2a24 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -40,7 +40,7 @@ def platform_is_wsl(): pytest.skip("skipping cuFile tests on WSL", allow_module_level=True) -@pytest.fixture(scope="module") +@pytest.fixture def cufile_env_json(): """Set CUFILE_ENV_PATH_JSON environment variable for async tests.""" original_value = os.environ.get("CUFILE_ENV_PATH_JSON") From d4204da309e1e41d067def3dbd94e7424e5aaab1 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Wed, 30 Jul 2025 10:22:52 -0700 Subject: [PATCH 35/65] [unreleased-13.0] `test_cufile.py`: Remove fallback to `/etc/cufile.json` (#126) * test_cufile.py: NEVER USE /etc/cufile.json * Remove /etc/cufile.json code entirely. --- cuda_bindings/tests/test_cufile.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 02053a2a24..6d5ef5699e 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -45,17 +45,14 @@ def cufile_env_json(): """Set CUFILE_ENV_PATH_JSON environment variable for async tests.""" original_value = os.environ.get("CUFILE_ENV_PATH_JSON") - # Use /etc/cufile.json if it exists, otherwise fallback to cufile.json in tests directory - if os.path.exists("/etc/cufile.json"): - config_path = "/etc/cufile.json" - else: - # Get absolute path to cufile.json in the same directory as this test file - test_dir = os.path.dirname(os.path.abspath(__file__)) - config_path = os.path.join(test_dir, "cufile.json") - + # Get absolute path to cufile.json in the same directory as this test file + test_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(test_dir, "cufile.json") logging.info(f"Using cuFile config: {config_path}") + assert os.path.isfile(config_path) os.environ["CUFILE_ENV_PATH_JSON"] = config_path yield + # Restore original value or remove if it wasn't set if original_value is not None: os.environ["CUFILE_ENV_PATH_JSON"] = original_value From 5934b31b1aa68e3d3e605a861f5d75becfab6fc5 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 4 Aug 2025 15:42:43 +0000 Subject: [PATCH 36/65] update win driver to 580.88 --- .github/workflows/install_gpu_driver.ps1 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/install_gpu_driver.ps1 b/.github/workflows/install_gpu_driver.ps1 index 8b94901989..256c5cf3a9 100644 --- a/.github/workflows/install_gpu_driver.ps1 +++ b/.github/workflows/install_gpu_driver.ps1 @@ -6,9 +6,9 @@ function Install-Driver { # Set the correct URL, filename, and arguments to the installer - # This driver is picked to support Windows 11 & CUDA 12.8 - $url = 'https://us.download.nvidia.com/tesla/572.13/572.13-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe'; - $file_dir = 'C:\NVIDIA-Driver\572.13-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe'; + # This driver is picked to support Windows 11 & CUDA 13.0 + $url = 'https://us.download.nvidia.com/tesla/580.88/580.88-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe'; + $file_dir = 'C:\NVIDIA-Driver\580.88-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe'; $install_args = '/s /noeula /noreboot'; # Create the folder for the driver download From c41290f91bc514eec1561cfcd471ffd89ddd0eff Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 4 Aug 2025 15:43:26 +0000 Subject: [PATCH 37/65] change backport branch to 12.9.x --- .github/BACKPORT_BRANCH | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/BACKPORT_BRANCH b/.github/BACKPORT_BRANCH index 9266e6784d..1ba33f6aec 100644 --- a/.github/BACKPORT_BRANCH +++ b/.github/BACKPORT_BRANCH @@ -1 +1 @@ -11.8.x +12.9.x From de1e084f9e21fad9ee81a4dcab3fa95e4241b7cd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 4 Aug 2025 15:44:48 +0000 Subject: [PATCH 38/65] update build ver to 13.0.0 --- ci/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ci/versions.json b/ci/versions.json index 5608eeb1d9..5eb48beb83 100644 --- a/ci/versions.json +++ b/ci/versions.json @@ -1,7 +1,7 @@ { "cuda": { "build": { - "version": "12.9.0" + "version": "13.0.0" } } } From dc956cac8f3d6c9e5fdd019b60a4f17fc71f9254 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 4 Aug 2025 15:58:15 +0000 Subject: [PATCH 39/65] crt headers are now split from cudart (or nvcc?) --- .github/actions/fetch_ctk/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 43a0188806..5ba17feaa8 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -17,7 +17,7 @@ inputs: description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false type: string - default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + default: "cuda_nvcc,cuda_cudart,cuda_crt,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" runs: using: composite From 02cf5f708e413220390ca5cecfe4ac78fe9023ec Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 4 Aug 2025 16:15:04 +0000 Subject: [PATCH 40/65] remove the outdated cufile skip condition (it was buggy anyway) --- .github/actions/fetch_ctk/action.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 5ba17feaa8..3db1827cdb 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -36,11 +36,6 @@ runs: if [[ "${{ inputs.host-platform }}" == win-* ]]; then CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}" fi - # Conditionally strip out libcufile for CUDA versions < 12.2.0 + aarch64 (redist not available) - CUDA_MINOR_VER="$(cut -d '.' -f 2 <<< ${{ inputs.cuda-version }})" - if [[ ("$CUDA_MAJOR_VER" -lt 12 || "$CUDA_MINOR_VER" -lt 2) && "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then - CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}" - fi # Cleanup stray commas after removing components CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}" From 1bae8f155a77d8071d809ebf4483004783468de3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 01:22:36 +0000 Subject: [PATCH 41/65] remove 11.8 CI and add 13.0 CI --- .github/workflows/test-wheel-linux.yml | 18 ++++++++---------- .github/workflows/test-wheel-windows.yml | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 546e6aa956..ed9ecfaa8f 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -51,7 +51,7 @@ jobs: # Add a special entry for the H100 runner on amd64. 
special_runner="" if [[ "${ARCH}" == "amd64" ]]; then - special_runner="- { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.9.0', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' }" + special_runner="- { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' }" fi # Please keep the matrices sorted in ascending order by the following: @@ -62,18 +62,16 @@ jobs: # export MATRICES=" pull-request: - - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '11.8.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } - - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '12.0.1', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.10', CUDA_VER: '11.8.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.10', CUDA_VER: '12.9.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.11', CUDA_VER: '11.8.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } + - { ARCH: ${ARCH}, PY_VER: '3.10', CUDA_VER: '13.0.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.11', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '12.0.1', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.11', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '12.9.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '11.8.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.0.1', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } + - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '13.0.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } + - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } ${special_runner} nightly: - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '11.8.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 0f175af78b..34059373ec 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -47,10 +47,10 @@ jobs: # export MATRICES=" pull-request: - - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '11.8.0', LOCAL_CTK: '0' } - - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '11.8.0', LOCAL_CTK: '1' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '12.9.0', LOCAL_CTK: '0' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '12.9.0', LOCAL_CTK: '1' } + - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '13.0.0', LOCAL_CTK: '0' } + - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '13.0.0', LOCAL_CTK: '1' } nightly: - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '11.8.0', LOCAL_CTK: '0' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '11.8.0', LOCAL_CTK: '1' } From c09bb7aac57c511f9ab707579bf86a700a9ff8c1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 01:25:30 
+0000 Subject: [PATCH 42/65] update cuda-bindings optional dependencies --- cuda_bindings/pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index f1546e2999..c6ee2b2042 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -33,10 +33,10 @@ dependencies = [ [project.optional-dependencies] all = [ - "nvidia-cuda-nvcc-cu12", - "nvidia-cuda-nvrtc-cu12", - "nvidia-nvjitlink-cu12>=12.3", - "nvidia-cufile-cu12; sys_platform == 'linux'", + "nvidia-cuda-nvcc~=13.0", + "nvidia-cuda-nvrtc~=13.0", + "nvidia-nvjitlink~=13.0", + "nvidia-cufile; sys_platform == 'linux'", ] test = [ From 6edac58f2d0fcf4727fb90cea939d29ec22bd93c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 01:27:57 +0000 Subject: [PATCH 43/65] update release notes --- .../{12.X.Y-notes.rst => 12.9.1-notes.rst} | 2 +- .../docs/source/release/13.0.0-notes.rst | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) rename cuda_bindings/docs/source/release/{12.X.Y-notes.rst => 12.9.1-notes.rst} (97%) create mode 100644 cuda_bindings/docs/source/release/13.0.0-notes.rst diff --git a/cuda_bindings/docs/source/release/12.X.Y-notes.rst b/cuda_bindings/docs/source/release/12.9.1-notes.rst similarity index 97% rename from cuda_bindings/docs/source/release/12.X.Y-notes.rst rename to cuda_bindings/docs/source/release/12.9.1-notes.rst index 80cd405308..c81f102aaf 100644 --- a/cuda_bindings/docs/source/release/12.X.Y-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.1-notes.rst @@ -3,7 +3,7 @@ .. module:: cuda.bindings -``cuda-bindings`` 12.X.Y Release notes +``cuda-bindings`` 12.9.1 Release notes ====================================== Released on MM DD, 2025 diff --git a/cuda_bindings/docs/source/release/13.0.0-notes.rst b/cuda_bindings/docs/source/release/13.0.0-notes.rst new file mode 100644 index 0000000000..37de83d5c2 --- /dev/null +++ b/cuda_bindings/docs/source/release/13.0.0-notes.rst @@ -0,0 +1,47 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +.. module:: cuda.bindings + +``cuda-bindings`` 13.0.0 Release notes +====================================== + +Released on MM DD, 2025 + + +Highlights +---------- + +* Support CUDA 13.0 + +* A utility module :mod:`cuda.bindings.utils` is added + + * Using ``int(cuda_obj)`` to retrieve the underlying address of a CUDA object is deprecated and + subject to future removal. Please switch to use :func:`~cuda.bindings.utils.get_cuda_native_handle` + instead. + +* The ``cuda.bindings.cufile`` Python module was added, wrapping the + `cuFile C APIs `_. + Supported on Linux only. + + * Currently using this module requires NumPy to be present. Any recent NumPy 1.x or 2.x should work. + +* Python bindings in every module, including ``driver``, ``runtime``, and ``nvrtc``, now have the GIL + released before calling the underlying C APIs. + + +Bug fixes +--------- + + +Miscellaneous +------------- + +* Added PTX utilities including :func:`~utils.get_minimal_required_cuda_ver_from_ptx_ver` and :func:`~utils.get_ptx_ver`. +* Common CUDA objects such as :class:`~runtime.cudaStream_t` now compare equal if the underlying address is the same. + + +Known issues +------------ + +* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. 
Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. From f014791ee2d150996746ef858eecf33cde28a5bc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 01:35:15 +0000 Subject: [PATCH 44/65] update cuda-bindings docs --- README.md | 1 + cuda_bindings/docs/source/install.md | 11 ++++++----- cuda_bindings/docs/source/release.rst | 3 ++- cuda_bindings/docs/versions.json | 2 ++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 97d9800cce..7cc64fafa8 100644 --- a/README.md +++ b/README.md @@ -38,3 +38,4 @@ The list of available interfaces is: * NVRTC * nvJitLink * NVVM +* cuFile diff --git a/cuda_bindings/docs/source/install.md b/cuda_bindings/docs/source/install.md index 175e304e61..f7e0e36693 100644 --- a/cuda_bindings/docs/source/install.md +++ b/cuda_bindings/docs/source/install.md @@ -6,8 +6,8 @@ * Linux (x86-64, arm64) and Windows (x86-64) * Python 3.9 - 3.13 -* Driver: Linux (450.80.02 or later) Windows (456.38 or later) -* Optionally, NVRTC, nvJitLink, and NVVM from CUDA Toolkit 12.x +* Driver: Linux (580.65.06 or later) Windows (580.88 or later) +* Optionally, NVRTC, nvJitLink, NVVM, and cuFile from CUDA Toolkit 13.x ```{note} The optional CUDA Toolkit components can be installed via PyPI, Conda, OS-specific package managers, or local installers (as described in the CUDA Toolkit [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) and [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) Installation Guides). @@ -29,9 +29,10 @@ pip install -U cuda-python[all] Where the optional dependencies are: -* nvidia-cuda-nvrtc-cu12 (Provides NVRTC shared library) -* nvidia-nvjitlink-cu12>=12.3 (Provides nvJitLink shared library) -* nvidia-cuda-nvcc-cu12 (Provides NVVM shared library) +* nvidia-cuda-nvrtc (Provides NVRTC shared library) +* nvidia-nvjitlink (Provides nvJitLink shared library) +* nvidia-cuda-nvcc (Provides NVVM shared library) +* nvidia-cufile (Provides cuFile shared library) ## Installing from Conda diff --git a/cuda_bindings/docs/source/release.rst b/cuda_bindings/docs/source/release.rst index 23e1eca808..057a1c666d 100644 --- a/cuda_bindings/docs/source/release.rst +++ b/cuda_bindings/docs/source/release.rst @@ -7,7 +7,8 @@ Release Notes .. 
toctree:: :maxdepth: 3 - 12.X.Y + 13.0.0 + 12.9.1 12.9.0 12.8.0 12.6.2 diff --git a/cuda_bindings/docs/versions.json b/cuda_bindings/docs/versions.json index cc1299896a..43354f1cb4 100644 --- a/cuda_bindings/docs/versions.json +++ b/cuda_bindings/docs/versions.json @@ -1,5 +1,7 @@ { "latest" : "latest", + "13.0.0" : "13.0.0", + "12.9.1" : "12.9.1", "12.9.0" : "12.9.0", "12.8.0" : "12.8.0", "12.6.2" : "12.6.2", From 1458112a445d2d0f150fdde5a79d8883ad092c3c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 01:43:07 +0000 Subject: [PATCH 45/65] update cuda-python docs --- cuda_python/DESCRIPTION.rst | 1 + cuda_python/docs/source/release.md | 3 ++- .../{12.X.Y-notes.rst => 12.9.1-notes.rst} | 6 ++++- .../docs/source/release/13.0.0-notes.rst | 25 +++++++++++++++++++ cuda_python/docs/versions.json | 2 ++ 5 files changed, 35 insertions(+), 2 deletions(-) rename cuda_python/docs/source/release/{12.X.Y-notes.rst => 12.9.1-notes.rst} (76%) create mode 100644 cuda_python/docs/source/release/13.0.0-notes.rst diff --git a/cuda_python/DESCRIPTION.rst b/cuda_python/DESCRIPTION.rst index 01da48eac1..154c698938 100644 --- a/cuda_python/DESCRIPTION.rst +++ b/cuda_python/DESCRIPTION.rst @@ -47,3 +47,4 @@ The list of available interfaces are: * NVRTC * nvJitLink * NVVM +* cuFile diff --git a/cuda_python/docs/source/release.md b/cuda_python/docs/source/release.md index 467c9c8e6d..e7e264bd18 100644 --- a/cuda_python/docs/source/release.md +++ b/cuda_python/docs/source/release.md @@ -5,7 +5,8 @@ maxdepth: 3 --- - 12.X.Y + 13.0.0 + 12.9.1 12.9.0 12.8.0 12.6.2 diff --git a/cuda_python/docs/source/release/12.X.Y-notes.rst b/cuda_python/docs/source/release/12.9.1-notes.rst similarity index 76% rename from cuda_python/docs/source/release/12.X.Y-notes.rst rename to cuda_python/docs/source/release/12.9.1-notes.rst index d75a5aadcd..18523efd46 100644 --- a/cuda_python/docs/source/release/12.X.Y-notes.rst +++ b/cuda_python/docs/source/release/12.9.1-notes.rst @@ -1,7 +1,7 @@ .. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -CUDA Python 12.X.Y Release notes +CUDA Python 12.9.1 Release notes ================================ Released on MM DD, 2025. @@ -10,10 +10,14 @@ Released on MM DD, 2025. Included components ------------------- +* `cuda.bindings 12.9.1 `_ + Highlights ---------- +* Add bindings for cuFile + Known issues ------------ diff --git a/cuda_python/docs/source/release/13.0.0-notes.rst b/cuda_python/docs/source/release/13.0.0-notes.rst new file mode 100644 index 0000000000..e8ad920119 --- /dev/null +++ b/cuda_python/docs/source/release/13.0.0-notes.rst @@ -0,0 +1,25 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +CUDA Python 13.0.0 Release notes +================================ + +Released on MM DD, 2025. + + +Included components +------------------- + +* `cuda.bindings 13.0.0 `_ + + +Highlights +---------- + +* Add bindings for cuFile + + +Known issues +------------ + +* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. 
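Both release-note sets above point at the new ``cuda.bindings.utils`` helper and the address-based equality of handle types; a minimal usage sketch (status checks elided, and constructing a ``cudaStream_t`` from a raw address is shown purely for illustration):

    from cuda.bindings import runtime
    from cuda.bindings.utils import get_cuda_native_handle

    err, stream = runtime.cudaStreamCreate()     # wrappers return (status, value)
    addr = get_cuda_native_handle(stream)        # preferred over deprecated int(stream)
    assert runtime.cudaStream_t(addr) == stream  # equality now compares the underlying address
    err, = runtime.cudaStreamDestroy(stream)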
diff --git a/cuda_python/docs/versions.json b/cuda_python/docs/versions.json index cc1299896a..43354f1cb4 100644 --- a/cuda_python/docs/versions.json +++ b/cuda_python/docs/versions.json @@ -1,5 +1,7 @@ { "latest" : "latest", + "13.0.0" : "13.0.0", + "12.9.1" : "12.9.1", "12.9.0" : "12.9.0", "12.8.0" : "12.8.0", "12.6.2" : "12.6.2", From d65d0f0b2c9e41d9ec334c930f1da111ab1a3d51 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 03:05:27 +0000 Subject: [PATCH 46/65] libnvvm is also split out --- .github/actions/fetch_ctk/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 3db1827cdb..8e1dd617b8 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -17,7 +17,7 @@ inputs: description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'" required: false type: string - default: "cuda_nvcc,cuda_cudart,cuda_crt,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" runs: using: composite From 11664e72ffcd9c93090e2a4ed11008fad0284574 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 03:33:54 +0000 Subject: [PATCH 47/65] ensure using sanitizer from the latest release of the same major ver --- .github/workflows/guess_latest.sh | 13 +++++++++++-- ci/tools/env-vars | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/guess_latest.sh b/.github/workflows/guess_latest.sh index d2e8427eb2..8a0a13034c 100644 --- a/.github/workflows/guess_latest.sh +++ b/.github/workflows/guess_latest.sh @@ -6,18 +6,27 @@ # URL to search URL="https://developer.download.nvidia.com/compute/cuda/redist/" +# Ensure exactly one argument is provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Accept major version as the first argument +MAJOR_VERSION="$1" + # Fetch the directory listing and extract the latest version number get_latest_version() { # Get the HTML content of the page local html_content=$(wget -q -O - "$URL") # Extract links matching the pattern redistrib_?.?.?.json - local files=$(echo "$html_content" | grep -oP 'redistrib_[0-9]+\.[0-9]+\.[0-9]+\.json' | cut -d'"' -f2) + local files=$(echo "$html_content" | grep -oP "redistrib_${MAJOR_VERSION}\.[0-9]+\.[0-9]+\.json" | cut -d'"' -f2) # If files were found, extract the version numbers and find the latest if [ -n "$files" ]; then # Extract just the version numbers using regex - local versions=$(echo "$files" | grep -oP 'redistrib_\K[0-9]+\.[0-9]+\.[0-9]+(?=\.json)') + local versions=$(echo "$files" | grep -oP "redistrib_\K${MAJOR_VERSION}\.[0-9]+\.[0-9]+(?=\.json)") # Sort the versions and get the latest local latest_version=$(echo "$versions" | sort -V | tail -n 1) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 8b68540fc4..3dcb81a4c0 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -60,7 +60,7 @@ elif [[ "${1}" == "test" ]]; then # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix # Only local ctk installs have compute-sanitizer; there is no wheel for it if [[ "${PY_VER}" == "3.12" && "${CUDA_VER}" != "11.8.0" && "${LOCAL_CTK}" == 1 && "${HOST_PLATFORM}" == linux* ]]; then - echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV + echo "LATEST_CUDA_VERSION=$(bash 
.github/workflows/guess_latest.sh $TEST_CUDA_MAJOR)" >> $GITHUB_ENV SETUP_SANITIZER=1 else SETUP_SANITIZER=0 From de3c184498f60a4ef658af7565de6d2899b00739 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 5 Aug 2025 00:51:26 -0700 Subject: [PATCH 48/65] Remove -cu12 suffixes and add nvidia-nvvm in cuda_pathfinder/pyproject.toml. Make related changes in .github/workflows --- .github/workflows/test-wheel-linux.yml | 8 +++---- .github/workflows/test-wheel-windows.yml | 8 +++---- cuda_pathfinder/pyproject.toml | 29 ++++++++++++------------ 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index ed9ecfaa8f..62badccde3 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -316,16 +316,16 @@ jobs: pip install $(ls cuda_python*.whl)[all] fi - - name: Install cuda.pathfinder nvidia_wheels_cu12 - if: startsWith(matrix.CUDA_VER, '12.') + - name: Install cuda.pathfinder nvidia_wheels_cu13 + if: startsWith(matrix.CUDA_VER, '13.') run: | pushd cuda_pathfinder - pip install -v .[nvidia_wheels_cu12] + pip install -v .[nvidia_wheels_cu13] pip freeze popd - name: Run cuda.pathfinder tests with all_must_work - if: startsWith(matrix.CUDA_VER, '12.') + if: startsWith(matrix.CUDA_VER, '13.') env: CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work run: run-tests pathfinder diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 34059373ec..0ead454e14 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -285,17 +285,17 @@ jobs: pip install "$((Get-ChildItem -Filter cuda_python*.whl).FullName)[all]" } - - name: Install cuda.pathfinder nvidia_wheels_cu12 - if: startsWith(matrix.CUDA_VER, '12.') + - name: Install cuda.pathfinder nvidia_wheels_cu13 + if: startsWith(matrix.CUDA_VER, '13.') shell: bash --noprofile --norc -xeuo pipefail {0} run: | pushd cuda_pathfinder - pip install -v .[nvidia_wheels_cu12] + pip install -v .[nvidia_wheels_cu13] pip freeze popd - name: Run cuda.pathfinder tests with all_must_work - if: startsWith(matrix.CUDA_VER, '12.') + if: startsWith(matrix.CUDA_VER, '13.') env: CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS: all_must_work shell: bash --noprofile --norc -xeuo pipefail {0} diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 161dd66971..c1eaf18a8b 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -14,20 +14,21 @@ dependencies = [] test = [ "pytest>=6.2.4", ] -nvidia_wheels_cu12 = [ - "nvidia-cublas-cu12", - "nvidia-cuda-nvcc-cu12", - "nvidia-cuda-nvrtc-cu12", - "nvidia-cuda-runtime-cu12", - "nvidia-cufft-cu12", - "nvidia-cufile-cu12; sys_platform != 'win32'", - "nvidia-curand-cu12", - "nvidia-cusolver-cu12", - "nvidia-cusparse-cu12", - "nvidia-npp-cu12", - "nvidia-nvfatbin-cu12", - "nvidia-nvjitlink-cu12", - "nvidia-nvjpeg-cu12", +nvidia_wheels_cu13 = [ + "nvidia-cublas", + "nvidia-cuda-nvcc", + "nvidia-cuda-nvrtc", + "nvidia-cuda-runtime", + "nvidia-cufft", + "nvidia-cufile; sys_platform != 'win32'", + "nvidia-curand", + "nvidia-cusolver", + "nvidia-cusparse", + "nvidia-npp", + "nvidia-nvfatbin", + "nvidia-nvjitlink", + "nvidia-nvjpeg", + "nvidia-nvvm", ] [project.urls] From 074bfa937ef54e5254d877ec8f2756d647b472fe Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 10:33:36 +0000 Subject: [PATCH 49/65] fix backport branch's ci 
name --- .github/workflows/test-wheel-linux.yml | 2 +- .github/workflows/test-wheel-windows.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 62badccde3..60d5842ecc 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -198,7 +198,7 @@ jobs: OLD_BRANCH=$(cat .github/BACKPORT_BRANCH) OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" - LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "build-and-test.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then echo "LATEST_PRIOR_RUN_ID not found!" exit 1 diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 0ead454e14..7648f427bc 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -178,7 +178,7 @@ jobs: run: | $OLD_BRANCH = Get-Content .github/BACKPORT_BRANCH $OLD_BASENAME = "cuda-bindings-python${env:PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" - $runData = gh run list -b $OLD_BRANCH -L 1 -w "build-and-test.yml" -s completed -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json + $runData = gh run list -b $OLD_BRANCH -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | ConvertFrom-Json if (-not $runData -or $runData.Length -eq 0 -or -not $runData[0].databaseId -or [string]::IsNullOrEmpty($runData[0].databaseId)) { Write-Host "LATEST_PRIOR_RUN_ID not found!" 
exit 1 From 34f84f03faf8d3d4225ce3d0b60081fbf381101d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 10:36:03 +0000 Subject: [PATCH 50/65] restore nvidia_wheels_cu12 --- cuda_pathfinder/pyproject.toml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index c1eaf18a8b..ac6724277e 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -14,6 +14,21 @@ dependencies = [] test = [ "pytest>=6.2.4", ] +nvidia_wheels_cu12 = [ + "nvidia-cublas-cu12", + "nvidia-cuda-nvcc-cu12", + "nvidia-cuda-nvrtc-cu12", + "nvidia-cuda-runtime-cu12", + "nvidia-cufft-cu12", + "nvidia-cufile-cu12; sys_platform != 'win32'", + "nvidia-curand-cu12", + "nvidia-cusolver-cu12", + "nvidia-cusparse-cu12", + "nvidia-npp-cu12", + "nvidia-nvfatbin-cu12", + "nvidia-nvjitlink-cu12", + "nvidia-nvjpeg-cu12", +] nvidia_wheels_cu13 = [ "nvidia-cublas", "nvidia-cuda-nvcc", From 0d4450b60d61fc601e333fb0b3ea9e0f8f7bb265 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 11:11:18 +0000 Subject: [PATCH 51/65] remove tests --- .github/workflows/test-wheel-linux.yml | 1 + .github/workflows/test-wheel-windows.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 60d5842ecc..95d5736f4f 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -205,6 +205,7 @@ jobs: fi gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python + rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts ls -al $OLD_BASENAME mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"/ diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 7648f427bc..99cdca6c39 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -186,6 +186,7 @@ jobs: $LATEST_PRIOR_RUN_ID = $runData[0].databaseId gh run download $LATEST_PRIOR_RUN_ID -p $OLD_BASENAME -R NVIDIA/cuda-python + Remove-Item -Recurse -Force "${OLD_BASENAME}-tests" # exclude cython test artifacts Get-ChildItem -Path $OLD_BASENAME New-Item -Path "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" -ItemType Directory -Force Move-Item -Path "$OLD_BASENAME/*.whl" -Destination "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" From 47b961f730a04796fc41a2e2c4c43788ca23780b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 13:00:25 +0000 Subject: [PATCH 52/65] always test 12.9.x with the latest driver --- .github/workflows/test-wheel-linux.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 95d5736f4f..f7b1e60642 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -62,7 +62,7 @@ jobs: # export MATRICES=" pull-request: - - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.9', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.10', CUDA_VER: '12.9.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.10', CUDA_VER: '13.0.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } @@ -70,7 +70,7 @@ jobs: - 
{ ARCH: ${ARCH}, PY_VER: '3.11', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '12.9.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.12', CUDA_VER: '13.0.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'earliest' } + - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '12.9.0', LOCAL_CTK: '0', GPU: ${gpu}, DRIVER: 'latest' } - { ARCH: ${ARCH}, PY_VER: '3.13', CUDA_VER: '13.0.0', LOCAL_CTK: '1', GPU: ${gpu}, DRIVER: 'latest' } ${special_runner} nightly: From d4bf846f4b0498fdfccdc871fd230758c2454709 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 5 Aug 2025 13:02:59 +0000 Subject: [PATCH 53/65] ensure fetch_ctk works with 12.x --- .github/actions/fetch_ctk/action.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 8e1dd617b8..83b447f0ce 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -32,6 +32,12 @@ runs: if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}" fi + # Conditionally strip out cuda_crt and libnvvm for CUDA versions < 13 + CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})" + if [[ "$CUDA_MAJOR_VER" -lt 13 ]]; then + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//cuda_crt/}" + CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvvm/}" + fi # Conditionally strip out libcufile since it does not support Windows if [[ "${{ inputs.host-platform }}" == win-* ]]; then CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}" From d1d8748751e22d3e79c4efc4a6640cd15678a6d6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 5 Aug 2025 18:32:45 -0700 Subject: [PATCH 54/65] Fix Linux libnvvm site-packages search for CTK 13 CTK 12: site-packages/nvidia/cuda_nvcc/nvvm/lib64/libnvvm.so CTK 13: site-packages/nvidia/cu13/lib/libnvvm.so.4 --- .../_dynamic_libs/find_nvidia_dynamic_lib.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py index b91407b753..a6a7f7fb54 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py @@ -28,18 +28,21 @@ def _no_such_file_in_sub_dirs( def _find_so_using_nvidia_lib_dirs( libname: str, so_basename: str, error_messages: list[str], attachments: list[str] ) -> Optional[str]: - nvidia_sub_dirs = ("nvidia", "*", "nvvm", "lib64") if libname == "nvvm" else ("nvidia", "*", "lib") file_wild = so_basename + "*" - for lib_dir in find_sub_dirs_all_sitepackages(nvidia_sub_dirs): - # First look for an exact match - so_name = os.path.join(lib_dir, so_basename) - if os.path.isfile(so_name): - return so_name - # Look for a versioned library - # Using sort here mainly to make the result deterministic. 
- for so_name in sorted(glob.glob(os.path.join(lib_dir, file_wild))): + nvidia_sub_dirs_list: list[tuple[str, ...]] = [("nvidia", "*", "lib")] # works also for CTK 13 nvvm + if libname == "nvvm": + nvidia_sub_dirs_list.append(("nvidia", "*", "nvvm", "lib64")) # CTK 12 + for nvidia_sub_dirs in nvidia_sub_dirs_list: + for lib_dir in find_sub_dirs_all_sitepackages(nvidia_sub_dirs): + # First look for an exact match + so_name = os.path.join(lib_dir, so_basename) if os.path.isfile(so_name): return so_name + # Look for a versioned library + # Using sort here mainly to make the result deterministic. + for so_name in sorted(glob.glob(os.path.join(lib_dir, file_wild))): + if os.path.isfile(so_name): + return so_name _no_such_file_in_sub_dirs(nvidia_sub_dirs, file_wild, error_messages, attachments) return None From 8f5bfe574f6aa2bb541b7f447e756e87e9a93ed7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 Aug 2025 02:15:12 +0000 Subject: [PATCH 55/65] update docs --- cuda_bindings/docs/source/release/12.9.1-notes.rst | 5 +++++ cuda_bindings/docs/versions.json | 1 - cuda_python/docs/source/release/12.9.1-notes.rst | 2 +- cuda_python/docs/versions.json | 1 - 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cuda_bindings/docs/source/release/12.9.1-notes.rst b/cuda_bindings/docs/source/release/12.9.1-notes.rst index c81f102aaf..9c7dd326ca 100644 --- a/cuda_bindings/docs/source/release/12.9.1-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.1-notes.rst @@ -31,12 +31,17 @@ Highlights Bug fixes --------- +* Fix a library loading bug that preferred shared libraries without a SOVERSION. + Miscellaneous ------------- +* All Python bindings now have the GIL released when calling into the underlying C APIs. * Added PTX utilities including :func:`~utils.get_minimal_required_cuda_ver_from_ptx_ver` and :func:`~utils.get_ptx_ver`. * Common CUDA objects such as :class:`~runtime.cudaStream_t` now compare equal if the underlying address is the same. +* Add a binding to ``nvvmGetErrorString()``. +* Build the bindings with Cython profile hooks disabled. Known issues diff --git a/cuda_bindings/docs/versions.json b/cuda_bindings/docs/versions.json index 43354f1cb4..c174c4eee3 100644 --- a/cuda_bindings/docs/versions.json +++ b/cuda_bindings/docs/versions.json @@ -1,7 +1,6 @@ { "latest" : "latest", "13.0.0" : "13.0.0", - "12.9.1" : "12.9.1", "12.9.0" : "12.9.0", "12.8.0" : "12.8.0", "12.6.2" : "12.6.2", diff --git a/cuda_python/docs/source/release/12.9.1-notes.rst b/cuda_python/docs/source/release/12.9.1-notes.rst index 18523efd46..7b395ec82c 100644 --- a/cuda_python/docs/source/release/12.9.1-notes.rst +++ b/cuda_python/docs/source/release/12.9.1-notes.rst @@ -10,7 +10,7 @@ Released on MM DD, 2025. 
Included components ------------------- -* `cuda.bindings 12.9.1 `_ +* `cuda.bindings 12.9.1 `_ Highlights diff --git a/cuda_python/docs/versions.json b/cuda_python/docs/versions.json index 43354f1cb4..c174c4eee3 100644 --- a/cuda_python/docs/versions.json +++ b/cuda_python/docs/versions.json @@ -1,7 +1,6 @@ { "latest" : "latest", "13.0.0" : "13.0.0", - "12.9.1" : "12.9.1", "12.9.0" : "12.9.0", "12.8.0" : "12.8.0", "12.6.2" : "12.6.2", From 89250c4a76e065fa6065d6feea0f681c53b87290 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 Aug 2025 02:16:22 +0000 Subject: [PATCH 56/65] add PTX ISA 9.0 to utils --- cuda_bindings/cuda/bindings/utils/_ptx_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py index d303d5980b..038492f6ab 100644 --- a/cuda_bindings/cuda/bindings/utils/_ptx_utils.py +++ b/cuda_bindings/cuda/bindings/utils/_ptx_utils.py @@ -47,6 +47,7 @@ "8.6": (12, 7), "8.7": (12, 8), "8.8": (12, 9), + "9.0": (13, 0), } From f6ebdbc0cf8ebbb80e5cb7499c210615349a10fc Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 6 Aug 2025 02:20:29 +0000 Subject: [PATCH 57/65] sync 13.0.0 docs --- cuda_bindings/docs/source/release/13.0.0-notes.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cuda_bindings/docs/source/release/13.0.0-notes.rst b/cuda_bindings/docs/source/release/13.0.0-notes.rst index 37de83d5c2..f67ede2112 100644 --- a/cuda_bindings/docs/source/release/13.0.0-notes.rst +++ b/cuda_bindings/docs/source/release/13.0.0-notes.rst @@ -12,7 +12,7 @@ Released on MM DD, 2025 Highlights ---------- -* Support CUDA 13.0 +* Support CUDA 13.0. * A utility module :mod:`cuda.bindings.utils` is added @@ -33,12 +33,17 @@ Highlights Bug fixes --------- +* Fix a library loading bug that preferred shared libraries without a SOVERSION. + Miscellaneous ------------- +* All Python bindings now have the GIL released when calling into the underlying C APIs. * Added PTX utilities including :func:`~utils.get_minimal_required_cuda_ver_from_ptx_ver` and :func:`~utils.get_ptx_ver`. * Common CUDA objects such as :class:`~runtime.cudaStream_t` now compare equal if the underlying address is the same. +* Add a binding to ``nvvmGetErrorString()``. +* Build the bindings with Cython profile hooks disabled. Known issues From de1e83a236fb0000c86d44861287f6de32cebe49 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 5 Aug 2025 19:28:54 -0700 Subject: [PATCH 58/65] Fix Windows site-packages search for CTK 13 --- .../cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py index a6a7f7fb54..bb6c32b63a 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py @@ -59,7 +59,10 @@ def _find_dll_under_dir(dirpath: str, file_wild: str) -> Optional[str]: def _find_dll_using_nvidia_bin_dirs( libname: str, lib_searched_for: str, error_messages: list[str], attachments: list[str] ) -> Optional[str]: - nvidia_sub_dirs_list: list[tuple[str, ...]] = [("nvidia", "*", "bin")] + nvidia_sub_dirs_list: list[tuple[str, ...]] = [ + ("nvidia", "*", "bin"), # CTK 12 + ("nvidia", "*", "bin", "*"), # CTK 13, e.g. 
site-packages\nvidia\cu13\bin\x86_64\ + ] if libname == "nvvm": nvidia_sub_dirs_list.append(("nvidia", "*", "nvvm", "bin")) # Only for CTK 12 for nvidia_sub_dirs in nvidia_sub_dirs_list: From e18a5e8b294e60ed8ac9b9ba95c08cf177e78ab6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 5 Aug 2025 21:51:22 -0700 Subject: [PATCH 59/65] Also add "nvidia-nvvm~=13.0" in cuda_bindings/pyproject.toml --- cuda_bindings/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index c6ee2b2042..bd6471cb1d 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -36,6 +36,7 @@ all = [ "nvidia-cuda-nvcc~=13.0", "nvidia-cuda-nvrtc~=13.0", "nvidia-nvjitlink~=13.0", + "nvidia-nvvm~=13.0", "nvidia-cufile; sys_platform == 'linux'", ] From ff339b62855907e7f7770dae5d86069d89f94eb8 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 5 Aug 2025 23:53:46 -0700 Subject: [PATCH 60/65] Add _work_around_known_bugs() in load_dl_linux.py To resolve this issue: https://github.com/NVIDIA/cuda-python/pull/792#issuecomment-3157455586 --- .../pathfinder/_dynamic_libs/load_dl_linux.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py index 251e0593a2..29192ec4c3 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_dl_linux.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import contextlib import ctypes import ctypes.util import os @@ -109,7 +110,26 @@ def load_with_system_search(libname: str) -> Optional[LoadedDL]: return None -def load_with_abs_path(_libname: str, found_path: str) -> LoadedDL: +def _work_around_known_bugs(libname: str, found_path: str) -> None: + if libname == "nvrtc": + # Work around bug/oversight in + # nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl + # Issue: libnvrtc.so.13 RUNPATH is not set. + # This workaround is highly specific + # - for simplicity. + # - to not mask bugs in future nvidia-cuda-nvrtc releases. + # - because a more general workaround is complicated. + dirname, basename = os.path.split(found_path) + if basename == "libnvrtc.so.13": + dep_basename = "libnvrtc-builtins.so.13.0" + dep_path = os.path.join(dirname, dep_basename) + if os.path.isfile(dep_path): + # In case of failure, defer to primary load, which is almost certain to fail, too. + with contextlib.suppress(OSError): + ctypes.CDLL(dep_path, CDLL_MODE) + + +def load_with_abs_path(libname: str, found_path: str) -> LoadedDL: """Load a dynamic library from the given path. Args: @@ -122,6 +142,7 @@ def load_with_abs_path(_libname: str, found_path: str) -> LoadedDL: Raises: RuntimeError: If the library cannot be loaded """ + _work_around_known_bugs(libname, found_path) try: handle = ctypes.CDLL(found_path, CDLL_MODE) except OSError as e: From 25d4b0adac31662ac355664a1e5ee6d30f1263f4 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Wed, 6 Aug 2025 01:59:25 -0700 Subject: [PATCH 61/65] driver_cu_result_explanations.py, runtime_cuda_error_explanations.py refresh (no-op) --- .../core/experimental/_utils/driver_cu_result_explanations.py | 2 -- .../core/experimental/_utils/runtime_cuda_error_explanations.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py index 86e9f87231..c961e82ac5 100644 --- a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py @@ -8,8 +8,6 @@ # ruff: noqa: E501 # CUDA Toolkit v13.0.0 -# 036 -# TODO: Update from posted .run files before merging into public main. DRIVER_CU_RESULT_EXPLANATIONS = { 0: ( "The API call returned with no errors. In the case of query calls, this" diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py index 0195e4af45..126897f2b5 100644 --- a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py +++ b/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py @@ -8,8 +8,6 @@ # ruff: noqa: E501 # CUDA Toolkit v13.0.0 -# 036 -# TODO: Update from posted .run files before merging into public main. RUNTIME_CUDA_ERROR_EXPLANATIONS = { 0: ( "The API call returned with no errors. In the case of query calls, this" From db83fbee605e815d8f61dbb43b1ed57b9795d6a6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 6 Aug 2025 02:07:59 -0700 Subject: [PATCH 62/65] SUPPORTED_LINUX_SONAMES refresh (no-op) --- .../cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py index 3ea0ce8122..24395e4222 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py @@ -101,10 +101,9 @@ # cuda_12.4.1_550.54.15_linux.run # cuda_12.5.1_555.42.06_linux.run # cuda_12.6.2_560.35.03_linux.run -# cuda_12.8.0_570.86.10_linux.run -# cuda_12.9.0_575.51.03_linux.run -# 036 -# TODO: Update from posted .run files before merging into public main. +# cuda_12.8.1_570.124.06_linux.run +# cuda_12.9.1_575.57.08_linux.run +# cuda_13.0.0_580.65.06_linux.run # Generated with toolshed/build_pathfinder_sonames.py SUPPORTED_LINUX_SONAMES = { "cublas": ( From 36d9f0654ff69df58086b9b1d81f0cab6f2d2364 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Wed, 6 Aug 2025 02:12:48 -0700 Subject: [PATCH 63/65] SUPPORTED_WINDOWS_DLLS refresh (no-op) --- .../cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py index 24395e4222..14901c3e1f 100644 --- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py +++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/supported_nvidia_libs.py @@ -252,10 +252,9 @@ # cuda_12.5.1_555.85_windows.exe # cuda_12.6.2_560.94_windows.exe # cuda_12.8.1_572.61_windows.exe -# cuda_12.9.0_576.02_windows.txt -# 036 -# TODO: Update from posted .exe files before merging into public main. -# Generated with toolshed/build_pathfinder_dlls.py (WITH MANUAL EDITS) +# cuda_12.9.1_576.57_windows.exe +# cuda_13.0.0_windows.exe +# Generated with toolshed/build_pathfinder_dlls.py SUPPORTED_WINDOWS_DLLS = { "cublas": ( "cublas64_11.dll", From 37ef8e058b9513ea21fd89552390d9c2fe5dfea9 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 6 Aug 2025 02:33:16 -0700 Subject: [PATCH 64/65] Update generated files: nvjitlink, nvvm (trivial changes, functional no-op) --- cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd | 2 +- cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx | 2 +- .../cuda/bindings/_internal/nvjitlink_windows.pyx | 2 +- cuda_bindings/cuda/bindings/_internal/utils.pxd | 8 ++++---- cuda_bindings/cuda/bindings/_internal/utils.pyx | 2 +- cuda_bindings/cuda/bindings/cynvjitlink.pxd | 2 +- cuda_bindings/cuda/bindings/cynvjitlink.pyx | 2 +- cuda_bindings/cuda/bindings/nvjitlink.pxd | 2 +- cuda_bindings/cuda/bindings/nvjitlink.pyx | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd index 184239c311..c2ca56bdea 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx index 87422dcc7b..62890c2405 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_linux.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE # diff --git a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx index edcc4a0de6..d08c43fde9 100644 --- a/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx +++ b/cuda_bindings/cuda/bindings/_internal/nvjitlink_windows.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
index 30f7935afb..50484727b7 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
@@ -24,7 +24,7 @@ cdef extern from * nogil:
             if (own_data)
                 manager_.reset(data);
             else
-                raw_data_ = data;
+                raw_data_ = data;
         }
 
         nullable_unique_ptr(const nullable_unique_ptr&) = delete;
@@ -39,7 +39,7 @@ cdef extern from * nogil:
             {
                 manager_ = std::move(other.manager_);
                 raw_data_ = nullptr;  // just in case
-            }
+            }
             else
             {
                 manager_.reset(nullptr);  // just in case
@@ -55,7 +55,7 @@ cdef extern from * nogil:
             {
                 manager_ = std::move(other.manager_);
                 raw_data_ = nullptr;  // just in case
-            }
+            }
             else
             {
                 manager_.reset(nullptr);  // just in case
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
index aa78e6cff0..bf2422f791 100644
--- a/cuda_bindings/cuda/bindings/_internal/utils.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pxd b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
index 4898f5a2bc..60ea8b1d12 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/cynvjitlink.pyx b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
index 13f9c2741f..8a65590e06 100644
--- a/cuda_bindings/cuda/bindings/cynvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/cynvjitlink.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pxd b/cuda_bindings/cuda/bindings/nvjitlink.pxd
index 87d2d4751c..40f21351a8 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pxd
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvjitlink.pyx b/cuda_bindings/cuda/bindings/nvjitlink.pyx
index e25cbc3add..a05b63feaa 100644
--- a/cuda_bindings/cuda/bindings/nvjitlink.pyx
+++ b/cuda_bindings/cuda/bindings/nvjitlink.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #

From ba95fc90df64bac1d5a182a905397a0f1cdb21c2 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Wed, 6 Aug 2025 15:18:18 +0000
Subject: [PATCH 65/65] update release dates

---
 cuda_bindings/docs/source/release/12.9.1-notes.rst | 2 +-
 cuda_bindings/docs/source/release/13.0.0-notes.rst | 2 +-
 cuda_python/docs/source/release/12.9.1-notes.rst   | 2 +-
 cuda_python/docs/source/release/13.0.0-notes.rst   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_bindings/docs/source/release/12.9.1-notes.rst b/cuda_bindings/docs/source/release/12.9.1-notes.rst
index 9c7dd326ca..881d49d32c 100644
--- a/cuda_bindings/docs/source/release/12.9.1-notes.rst
+++ b/cuda_bindings/docs/source/release/12.9.1-notes.rst
@@ -6,7 +6,7 @@
 ``cuda-bindings`` 12.9.1 Release notes
 ======================================
 
-Released on MM DD, 2025
+Released on Aug 6, 2025
 
 
 Highlights
diff --git a/cuda_bindings/docs/source/release/13.0.0-notes.rst b/cuda_bindings/docs/source/release/13.0.0-notes.rst
index f67ede2112..3df3ca48ad 100644
--- a/cuda_bindings/docs/source/release/13.0.0-notes.rst
+++ b/cuda_bindings/docs/source/release/13.0.0-notes.rst
@@ -6,7 +6,7 @@
 ``cuda-bindings`` 13.0.0 Release notes
 ======================================
 
-Released on MM DD, 2025
+Released on Aug 6, 2025
 
 
 Highlights
diff --git a/cuda_python/docs/source/release/12.9.1-notes.rst b/cuda_python/docs/source/release/12.9.1-notes.rst
index 7b395ec82c..282cd56f7b 100644
--- a/cuda_python/docs/source/release/12.9.1-notes.rst
+++ b/cuda_python/docs/source/release/12.9.1-notes.rst
@@ -4,7 +4,7 @@
 CUDA Python 12.9.1 Release notes
 ================================
 
-Released on MM DD, 2025.
+Released on Aug 6, 2025.
 
 
 Included components
diff --git a/cuda_python/docs/source/release/13.0.0-notes.rst b/cuda_python/docs/source/release/13.0.0-notes.rst
index e8ad920119..140c28839c 100644
--- a/cuda_python/docs/source/release/13.0.0-notes.rst
+++ b/cuda_python/docs/source/release/13.0.0-notes.rst
@@ -4,7 +4,7 @@
 CUDA Python 13.0.0 Release notes
 ================================
 
-Released on MM DD, 2025.
+Released on Aug 6, 2025.
 
 
 Included components