From 3921ca628dff3b75314e780b53daec594338820b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 12 Jan 2026 14:23:55 -0800 Subject: [PATCH] Use __pyx_capi__ for CUDA driver function pointers (#1450) Replace the _CUDA_DRIVER_API_V1 capsule with direct extraction of function pointers from cuda.bindings.cydriver.__pyx_capi__ at module import time. This simplifies the architecture by eliminating the custom capsule struct and its associated loading machinery (load_driver_api, ensure_driver_loaded, cuGetProcAddress resolution). The driver function pointers are now populated directly from Cython's built-in cross-module API mechanism. Closes #1450 --- cuda_core/cuda/core/_cpp/DESIGN.md | 50 ++-- cuda_core/cuda/core/_cpp/resource_handles.cpp | 272 +++--------------- cuda_core/cuda/core/_cpp/resource_handles.hpp | 37 +++ cuda_core/cuda/core/_resource_handles.pyx | 211 +++++++------- 4 files changed, 203 insertions(+), 367 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md index 3accf1bcd0..965904ea67 100644 --- a/cuda_core/cuda/core/_cpp/DESIGN.md +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -117,7 +117,8 @@ link against this code directly—they access it through a capsule mechanism ## Capsule Architecture -The implementation uses **two separate capsule mechanisms** for different purposes: +The implementation uses a capsule mechanism for cross-module C++ function sharing, +and Cython's `__pyx_capi__` for CUDA driver function resolution: ### Capsule 1: C++ API Table (`_CXX_API`) @@ -160,38 +161,45 @@ cdef inline StreamHandle create_stream_handle(...) except * nogil: Importing modules are expected to call `_init_handles_table()` prior to calling any wrapper functions. -### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`) +### CUDA Driver Function Pointers via `__pyx_capi__` **Problem**: cuda.core cannot directly call CUDA driver functions because: 1. We don't want to link against `libcuda.so` at build time. 2. The driver symbols must be resolved dynamically through cuda-bindings. -**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver -function pointers obtained from cuda-bindings: +**Solution**: The C++ code declares extern function pointer variables: ```cpp -struct CudaDriverApiV1 { - uint32_t abi_version; - uint32_t struct_size; - - uintptr_t cuDevicePrimaryCtxRetain; - uintptr_t cuDevicePrimaryCtxRelease; - uintptr_t cuStreamCreateWithPriority; - uintptr_t cuStreamDestroy; - // ... etc -}; +// resource_handles.hpp +extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; +extern decltype(&cuMemPoolCreate) p_cuMemPoolCreate; +// ... etc ``` -The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the -function pointers for subsequent use. +At module import time, `_resource_handles.pyx` populates these pointers by +extracting them from `cuda.bindings.cydriver.__pyx_capi__`: + +```cython +import cuda.bindings.cydriver as cydriver + +cdef void* _get_driver_fn(str name): + capsule = cydriver.__pyx_capi__[name] + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") +``` -### Why Two Capsules? +The `__pyx_capi__` dictionary contains PyCapsules that Cython automatically +generates for each `cdef` function declared in a `.pxd` file. Each capsule's +name is the function's C signature; we query it with `PyCapsule_GetName()` +rather than hardcoding signatures. 
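+
+For illustration, the signature recorded for a given entry point can be
+inspected directly (a minimal sketch; the exact signature text depends on the
+installed cuda-bindings version):
+
+```cython
+from cpython.pycapsule cimport PyCapsule_GetName
+
+import cuda.bindings.cydriver as cydriver
+
+cap = cydriver.__pyx_capi__["cuStreamCreateWithPriority"]
+# The capsule name is the C signature Cython recorded, e.g.
+# b"CUresult (CUstream *, unsigned int, int)" (exact text may vary).
+print(PyCapsule_GetName(cap))
+```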
-| Capsule | Direction | Purpose | -|---------|-----------|---------| -| `_CXX_API` | C++ → Cython | Share handle functions across modules | -| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols | +This approach: +- Avoids linking against `libcuda.so` at build time +- Works on CPU-only machines (capsule extraction succeeds; actual driver calls + will return errors like `CUDA_ERROR_NO_DEVICE`) +- Requires no custom capsule infrastructure—uses Cython's built-in mechanism ## Key Implementation Details diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 2cbdce2fa9..179ad369eb 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,13 +16,47 @@ namespace cuda_core { // ============================================================================ -// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC) +// CUDA driver function pointers +// +// These are populated by _resource_handles.pyx at module import time using +// function pointers extracted from cuda.bindings.cydriver.__pyx_capi__. // ============================================================================ -namespace { +decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain = nullptr; +decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease = nullptr; +decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent = nullptr; + +decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority = nullptr; +decltype(&cuStreamDestroy) p_cuStreamDestroy = nullptr; + +decltype(&cuEventCreate) p_cuEventCreate = nullptr; +decltype(&cuEventDestroy) p_cuEventDestroy = nullptr; +decltype(&cuIpcOpenEventHandle) p_cuIpcOpenEventHandle = nullptr; + +decltype(&cuDeviceGetCount) p_cuDeviceGetCount = nullptr; + +decltype(&cuMemPoolSetAccess) p_cuMemPoolSetAccess = nullptr; +decltype(&cuMemPoolDestroy) p_cuMemPoolDestroy = nullptr; +decltype(&cuMemPoolCreate) p_cuMemPoolCreate = nullptr; +decltype(&cuDeviceGetMemPool) p_cuDeviceGetMemPool = nullptr; +decltype(&cuMemPoolImportFromShareableHandle) p_cuMemPoolImportFromShareableHandle = nullptr; + +decltype(&cuMemAllocFromPoolAsync) p_cuMemAllocFromPoolAsync = nullptr; +decltype(&cuMemAllocAsync) p_cuMemAllocAsync = nullptr; +decltype(&cuMemAlloc) p_cuMemAlloc = nullptr; +decltype(&cuMemAllocHost) p_cuMemAllocHost = nullptr; + +decltype(&cuMemFreeAsync) p_cuMemFreeAsync = nullptr; +decltype(&cuMemFree) p_cuMemFree = nullptr; +decltype(&cuMemFreeHost) p_cuMemFreeHost = nullptr; + +decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer = nullptr; + +// ============================================================================ +// GIL management helpers +// ============================================================================ -std::once_flag driver_load_once; -std::atomic driver_loaded{false}; +namespace { #if PY_VERSION_HEX < 0x030D0000 extern "C" int _Py_IsFinalizing(void); @@ -39,10 +72,6 @@ inline bool py_is_finalizing() noexcept { #endif } -// ============================================================================ -// GIL management helpers -// ============================================================================ - // Helper to release the GIL while calling into the CUDA driver. // This guard is *conditional*: if the caller already dropped the GIL, // we avoid calling PyEval_SaveThread (which requires holding the GIL). 
@@ -110,178 +139,6 @@ class GILAcquireGuard { bool acquired_; }; - -#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); name##_t p_##name = nullptr - -DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); -DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease); -DECLARE_DRIVER_FN(cuCtxGetCurrent); - -DECLARE_DRIVER_FN(cuStreamCreateWithPriority); -DECLARE_DRIVER_FN(cuStreamDestroy); - -DECLARE_DRIVER_FN(cuEventCreate); -DECLARE_DRIVER_FN(cuEventDestroy); -DECLARE_DRIVER_FN(cuIpcOpenEventHandle); - -DECLARE_DRIVER_FN(cuDeviceGetCount); - -DECLARE_DRIVER_FN(cuMemPoolSetAccess); -DECLARE_DRIVER_FN(cuMemPoolDestroy); -DECLARE_DRIVER_FN(cuMemPoolCreate); -DECLARE_DRIVER_FN(cuDeviceGetMemPool); -DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle); - -DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync); -DECLARE_DRIVER_FN(cuMemAllocAsync); -DECLARE_DRIVER_FN(cuMemAlloc); -DECLARE_DRIVER_FN(cuMemAllocHost); - -DECLARE_DRIVER_FN(cuMemFreeAsync); -DECLARE_DRIVER_FN(cuMemFree); -DECLARE_DRIVER_FN(cuMemFreeHost); - -DECLARE_DRIVER_FN(cuMemPoolImportPointer); - -#undef DECLARE_DRIVER_FN - -bool load_driver_api() noexcept { - struct CudaDriverApiV1 { - std::uint32_t abi_version; - std::uint32_t struct_size; - - std::uintptr_t cuDevicePrimaryCtxRetain; - std::uintptr_t cuDevicePrimaryCtxRelease; - std::uintptr_t cuCtxGetCurrent; - - std::uintptr_t cuStreamCreateWithPriority; - std::uintptr_t cuStreamDestroy; - - std::uintptr_t cuEventCreate; - std::uintptr_t cuEventDestroy; - std::uintptr_t cuIpcOpenEventHandle; - - std::uintptr_t cuDeviceGetCount; - - std::uintptr_t cuMemPoolSetAccess; - std::uintptr_t cuMemPoolDestroy; - std::uintptr_t cuMemPoolCreate; - std::uintptr_t cuDeviceGetMemPool; - std::uintptr_t cuMemPoolImportFromShareableHandle; - - std::uintptr_t cuMemAllocFromPoolAsync; - std::uintptr_t cuMemAllocAsync; - std::uintptr_t cuMemAlloc; - std::uintptr_t cuMemAllocHost; - - std::uintptr_t cuMemFreeAsync; - std::uintptr_t cuMemFree; - std::uintptr_t cuMemFreeHost; - - std::uintptr_t cuMemPoolImportPointer; - }; - - static constexpr const char* capsule_name = - "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - - GILAcquireGuard gil; - if (!gil.acquired()) { - return false; - } - - // `_resource_handles` is already loaded (it exports the handle API capsule), - // so avoid import machinery and just grab the module object. 
-    PyObject* mod = PyImport_AddModule("cuda.core._resource_handles");  // borrowed
-    if (!mod) {
-        PyErr_Clear();
-        return false;
-    }
-
-    PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule");  // new ref
-    if (!fn) {
-        PyErr_Clear();
-        return false;
-    }
-
-    PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr);
-    Py_DECREF(fn);
-    if (!cap) {
-        PyErr_Clear();
-        return false;
-    }
-
-    const auto* api = static_cast<const CudaDriverApiV1*>(PyCapsule_GetPointer(cap, capsule_name));
-    Py_DECREF(cap);
-
-    if (!api) {
-        PyErr_Clear();
-        return false;
-    }
-    if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) {
-        return false;
-    }
-
-#define LOAD_ADDR(name) \
-    do { \
-        if (api->name == 0) { \
-            return false; \
-        } \
-        p_##name = reinterpret_cast<name##_t>(api->name); \
-    } while (0)
-
-    LOAD_ADDR(cuDevicePrimaryCtxRetain);
-    LOAD_ADDR(cuDevicePrimaryCtxRelease);
-    LOAD_ADDR(cuCtxGetCurrent);
-
-    LOAD_ADDR(cuStreamCreateWithPriority);
-    LOAD_ADDR(cuStreamDestroy);
-
-    LOAD_ADDR(cuEventCreate);
-    LOAD_ADDR(cuEventDestroy);
-    LOAD_ADDR(cuIpcOpenEventHandle);
-
-    LOAD_ADDR(cuDeviceGetCount);
-
-    LOAD_ADDR(cuMemPoolSetAccess);
-    LOAD_ADDR(cuMemPoolDestroy);
-    LOAD_ADDR(cuMemPoolCreate);
-    LOAD_ADDR(cuDeviceGetMemPool);
-    LOAD_ADDR(cuMemPoolImportFromShareableHandle);
-
-    LOAD_ADDR(cuMemAllocFromPoolAsync);
-    LOAD_ADDR(cuMemAllocAsync);
-    LOAD_ADDR(cuMemAlloc);
-    LOAD_ADDR(cuMemAllocHost);
-
-    LOAD_ADDR(cuMemFreeAsync);
-    LOAD_ADDR(cuMemFree);
-    LOAD_ADDR(cuMemFreeHost);
-
-    LOAD_ADDR(cuMemPoolImportPointer);
-
-#undef LOAD_ADDR
-
-    return true;
-}
-
-bool ensure_driver_loaded() noexcept {
-    // Fast path: already loaded (no locking needed)
-    if (driver_loaded.load(std::memory_order_acquire)) {
-        return true;
-    }
-
-    // Slow path: release GIL before acquiring call_once guard.
-    // This ensures lock order is always: guard mutex -> GIL, preventing deadlock.
-    // See DESIGN.md "Static Initialization and Deadlock Hazards".
-    GILReleaseGuard release_gil;
-    std::call_once(driver_load_once, []() {
-        // Inside call_once, safe to acquire GIL (correct lock order).
-        // load_driver_api() acquires GIL internally via GILAcquireGuard.
- driver_loaded.store(load_driver_api(), std::memory_order_release); - }); - return driver_loaded.load(std::memory_order_acquire); -} - } // namespace // ============================================================================ @@ -324,10 +181,6 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { static thread_local std::vector primary_context_cache; ContextHandle get_primary_context(int device_id) noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } // Check thread-local cache if (static_cast(device_id) < primary_context_cache.size()) { if (auto cached = primary_context_cache[device_id]) { @@ -361,10 +214,6 @@ ContextHandle get_primary_context(int device_id) noexcept { } ContextHandle get_current_context() noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUcontext ctx = nullptr; if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) { @@ -387,10 +236,6 @@ struct StreamBox { } // namespace StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUstream stream; if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { @@ -449,10 +294,6 @@ struct EventBox { } // namespace EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { @@ -475,10 +316,6 @@ EventHandle create_event_handle(unsigned int flags) { } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { @@ -537,10 +374,6 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { @@ -555,10 +388,6 @@ MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { } MemoryPoolHandle get_device_mempool(int device_id) noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { @@ -568,10 +397,6 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept { } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; auto handle_ptr = reinterpret_cast(static_cast(fd)); @@ -616,10 +441,6 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) { @@ -638,10 +459,6 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle 
h_pool, } DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) { @@ -660,10 +477,6 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc(size_t size) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) { @@ -682,10 +495,6 @@ DevicePtrHandle deviceptr_alloc(size_t size) { } DevicePtrHandle deviceptr_alloc_host(size_t size) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; void* ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) { @@ -785,11 +594,6 @@ static std::mutex ipc_ptr_cache_mutex; static std::unordered_map, ExportDataKeyHash> ipc_ptr_cache; DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } - auto data = const_cast( reinterpret_cast(export_data)); diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 6b82594c2b..adff5c37d7 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -24,6 +24,43 @@ CUresult peek_last_error() noexcept; // Explicitly clear the last error void clear_last_error() noexcept; +// ============================================================================ +// CUDA driver function pointers +// +// These are populated by _resource_handles.pyx at module import time using +// function pointers extracted from cuda.bindings.cydriver.__pyx_capi__. 
+// ============================================================================ + +extern decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain; +extern decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease; +extern decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent; + +extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; +extern decltype(&cuStreamDestroy) p_cuStreamDestroy; + +extern decltype(&cuEventCreate) p_cuEventCreate; +extern decltype(&cuEventDestroy) p_cuEventDestroy; +extern decltype(&cuIpcOpenEventHandle) p_cuIpcOpenEventHandle; + +extern decltype(&cuDeviceGetCount) p_cuDeviceGetCount; + +extern decltype(&cuMemPoolSetAccess) p_cuMemPoolSetAccess; +extern decltype(&cuMemPoolDestroy) p_cuMemPoolDestroy; +extern decltype(&cuMemPoolCreate) p_cuMemPoolCreate; +extern decltype(&cuDeviceGetMemPool) p_cuDeviceGetMemPool; +extern decltype(&cuMemPoolImportFromShareableHandle) p_cuMemPoolImportFromShareableHandle; + +extern decltype(&cuMemAllocFromPoolAsync) p_cuMemAllocFromPoolAsync; +extern decltype(&cuMemAllocAsync) p_cuMemAllocAsync; +extern decltype(&cuMemAlloc) p_cuMemAlloc; +extern decltype(&cuMemAllocHost) p_cuMemAllocHost; + +extern decltype(&cuMemFreeAsync) p_cuMemFreeAsync; +extern decltype(&cuMemFree) p_cuMemFree; +extern decltype(&cuMemFreeHost) p_cuMemFreeHost; + +extern decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index f15d524430..5f1b016884 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -2,23 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 -# This module exists to compile _cpp/resource_handles.cpp into a shared library. -# The helper functions (cu, intptr, py) are implemented as inline C++ functions -# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. +# This module compiles _cpp/resource_handles.cpp into a shared library. +# At import time, it populates the C++ driver function pointers using +# capsules from cuda.bindings.cydriver.__pyx_capi__. -from cpython.pycapsule cimport PyCapsule_New -from libc.stdint cimport uint32_t, uint64_t, uintptr_t +from cpython.pycapsule cimport PyCapsule_GetName, PyCapsule_GetPointer, PyCapsule_New from ._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) -import cython - +import cuda.bindings.cydriver as cydriver cdef const char* _CXX_API_NAME = b"cuda.core._resource_handles._CXX_API" -cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" # Export the C++ handles dispatch table as a PyCapsule. # Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. 
@@ -31,107 +28,97 @@ if _CXX_API is None: raise RuntimeError("Failed to create _CXX_API capsule") -cdef struct CudaDriverApiV1: - uint32_t abi_version - uint32_t struct_size - - uintptr_t cuDevicePrimaryCtxRetain - uintptr_t cuDevicePrimaryCtxRelease - uintptr_t cuCtxGetCurrent - - uintptr_t cuStreamCreateWithPriority - uintptr_t cuStreamDestroy - - uintptr_t cuEventCreate - uintptr_t cuEventDestroy - uintptr_t cuIpcOpenEventHandle - - uintptr_t cuDeviceGetCount - - uintptr_t cuMemPoolSetAccess - uintptr_t cuMemPoolDestroy - uintptr_t cuMemPoolCreate - uintptr_t cuDeviceGetMemPool - uintptr_t cuMemPoolImportFromShareableHandle - - uintptr_t cuMemAllocFromPoolAsync - uintptr_t cuMemAllocAsync - uintptr_t cuMemAlloc - uintptr_t cuMemAllocHost - - uintptr_t cuMemFreeAsync - uintptr_t cuMemFree - uintptr_t cuMemFreeHost - - uintptr_t cuMemPoolImportPointer - - -cdef CudaDriverApiV1 _cuda_driver_api_v1 -cdef bint _cuda_driver_api_v1_inited = False - - -cdef inline uintptr_t _as_addr(object pfn) except 0: - return int(pfn) - - -cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: - err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) - if int(err) != 0 or pfn is None: - raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") - return _as_addr(pfn) - - -def _get_cuda_driver_api_v1_capsule(): - """Return a PyCapsule containing cached CUDA driver entrypoints. - - This is evaluated lazily on first use so cuda-core remains importable on - CPU-only machines. - """ - global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 - if not _cuda_driver_api_v1_inited: - import cuda.bindings.driver as d - - err, ver = d.cuDriverGetVersion() - if int(err) != 0: - raise RuntimeError(f"cuDriverGetVersion failed: {err}") - driver_ver = int(ver) - - flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT - - _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") - _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") - _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") - - _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") - _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") - - _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") - _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") - _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") - - _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") - - _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") - _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") - _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") - _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") - _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( - d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" - ) - - _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") - _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") - _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, 
b"cuMemAlloc") - _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") - - _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") - _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") - _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") - - _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") - - _cuda_driver_api_v1.abi_version = 1 - _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) - _cuda_driver_api_v1_inited = True - - return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) +# ============================================================================= +# CUDA driver function pointer initialization +# +# The C++ code declares extern function pointers (p_cuXxx) that need to be +# populated before any handle creation functions are called. We extract these +# from cuda.bindings.cydriver.__pyx_capi__ at module import time. +# +# The Cython string substitution (e.g., "reinterpret_cast(...)") +# allows us to assign void* values to typed function pointer variables. +# ============================================================================= + +# Declare extern variables with reinterpret_cast to allow void* assignment +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # Context + void* p_cuDevicePrimaryCtxRetain "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRetain)" + void* p_cuDevicePrimaryCtxRelease "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRelease)" + void* p_cuCtxGetCurrent "reinterpret_cast(cuda_core::p_cuCtxGetCurrent)" + + # Stream + void* p_cuStreamCreateWithPriority "reinterpret_cast(cuda_core::p_cuStreamCreateWithPriority)" + void* p_cuStreamDestroy "reinterpret_cast(cuda_core::p_cuStreamDestroy)" + + # Event + void* p_cuEventCreate "reinterpret_cast(cuda_core::p_cuEventCreate)" + void* p_cuEventDestroy "reinterpret_cast(cuda_core::p_cuEventDestroy)" + void* p_cuIpcOpenEventHandle "reinterpret_cast(cuda_core::p_cuIpcOpenEventHandle)" + + # Device + void* p_cuDeviceGetCount "reinterpret_cast(cuda_core::p_cuDeviceGetCount)" + + # Memory pool + void* p_cuMemPoolSetAccess "reinterpret_cast(cuda_core::p_cuMemPoolSetAccess)" + void* p_cuMemPoolDestroy "reinterpret_cast(cuda_core::p_cuMemPoolDestroy)" + void* p_cuMemPoolCreate "reinterpret_cast(cuda_core::p_cuMemPoolCreate)" + void* p_cuDeviceGetMemPool "reinterpret_cast(cuda_core::p_cuDeviceGetMemPool)" + void* p_cuMemPoolImportFromShareableHandle "reinterpret_cast(cuda_core::p_cuMemPoolImportFromShareableHandle)" + + # Memory allocation + void* p_cuMemAllocFromPoolAsync "reinterpret_cast(cuda_core::p_cuMemAllocFromPoolAsync)" + void* p_cuMemAllocAsync "reinterpret_cast(cuda_core::p_cuMemAllocAsync)" + void* p_cuMemAlloc "reinterpret_cast(cuda_core::p_cuMemAlloc)" + void* p_cuMemAllocHost "reinterpret_cast(cuda_core::p_cuMemAllocHost)" + + # Memory deallocation + void* p_cuMemFreeAsync "reinterpret_cast(cuda_core::p_cuMemFreeAsync)" + void* p_cuMemFree "reinterpret_cast(cuda_core::p_cuMemFree)" + void* p_cuMemFreeHost "reinterpret_cast(cuda_core::p_cuMemFreeHost)" + + # IPC + void* p_cuMemPoolImportPointer "reinterpret_cast(cuda_core::p_cuMemPoolImportPointer)" + + +# Initialize driver function pointers from cydriver.__pyx_capi__ at module load +cdef void* _get_driver_fn(str name): + capsule = cydriver.__pyx_capi__[name] + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +# Context 
+p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") +p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") +p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") + +# Stream +p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") +p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") + +# Event +p_cuEventCreate = _get_driver_fn("cuEventCreate") +p_cuEventDestroy = _get_driver_fn("cuEventDestroy") +p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") + +# Device +p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") + +# Memory pool +p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") +p_cuMemPoolDestroy = _get_driver_fn("cuMemPoolDestroy") +p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") +p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") +p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") + +# Memory allocation +p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") +p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") +p_cuMemAlloc = _get_driver_fn("cuMemAlloc") +p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") + +# Memory deallocation +p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") +p_cuMemFree = _get_driver_fn("cuMemFree") +p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") + +# IPC +p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer")
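
The cname substitution used above reduces to a small, self-contained pattern; a
minimal sketch with hypothetical names (`example.hpp`, `g_fn`), assuming the
extension is compiled as C++:

```cython
# Hypothetical illustration only; example.hpp would declare:
#     extern int (*g_fn)(int);
cdef extern from "example.hpp":
    # Expose the typed global to Cython as a writable void* lvalue.
    void* g_fn "reinterpret_cast<void*&>(g_fn)"

cdef void* p = NULL
# Generated C++ for the next line:  reinterpret_cast<void*&>(g_fn) = p;
g_fn = p
```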