From 3921ca628dff3b75314e780b53daec594338820b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 12 Jan 2026 14:23:55 -0800 Subject: [PATCH] Use __pyx_capi__ for CUDA driver function pointers (#1450) Replace the _CUDA_DRIVER_API_V1 capsule with direct extraction of function pointers from cuda.bindings.cydriver.__pyx_capi__ at module import time. This simplifies the architecture by eliminating the custom capsule struct and its associated loading machinery (load_driver_api, ensure_driver_loaded, cuGetProcAddress resolution). The driver function pointers are now populated directly from Cython's built-in cross-module API mechanism. Closes #1450 --- cuda_core/cuda/core/_cpp/DESIGN.md | 50 ++-- cuda_core/cuda/core/_cpp/resource_handles.cpp | 272 +++--------------- cuda_core/cuda/core/_cpp/resource_handles.hpp | 37 +++ cuda_core/cuda/core/_resource_handles.pyx | 211 +++++++------- 4 files changed, 203 insertions(+), 367 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md index 3accf1bcd0..965904ea67 100644 --- a/cuda_core/cuda/core/_cpp/DESIGN.md +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -117,7 +117,8 @@ link against this code directly—they access it through a capsule mechanism ## Capsule Architecture -The implementation uses **two separate capsule mechanisms** for different purposes: +The implementation uses a capsule mechanism for cross-module C++ function sharing, +and Cython's `__pyx_capi__` for CUDA driver function resolution: ### Capsule 1: C++ API Table (`_CXX_API`) @@ -160,38 +161,45 @@ cdef inline StreamHandle create_stream_handle(...) except * nogil: Importing modules are expected to call `_init_handles_table()` prior to calling any wrapper functions. -### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`) +### CUDA Driver Function Pointers via `__pyx_capi__` **Problem**: cuda.core cannot directly call CUDA driver functions because: 1. We don't want to link against `libcuda.so` at build time. 2. The driver symbols must be resolved dynamically through cuda-bindings. -**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver -function pointers obtained from cuda-bindings: +**Solution**: The C++ code declares extern function pointer variables: ```cpp -struct CudaDriverApiV1 { - uint32_t abi_version; - uint32_t struct_size; - - uintptr_t cuDevicePrimaryCtxRetain; - uintptr_t cuDevicePrimaryCtxRelease; - uintptr_t cuStreamCreateWithPriority; - uintptr_t cuStreamDestroy; - // ... etc -}; +// resource_handles.hpp +extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; +extern decltype(&cuMemPoolCreate) p_cuMemPoolCreate; +// ... etc ``` -The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the -function pointers for subsequent use. +At module import time, `_resource_handles.pyx` populates these pointers by +extracting them from `cuda.bindings.cydriver.__pyx_capi__`: + +```cython +import cuda.bindings.cydriver as cydriver + +cdef void* _get_driver_fn(str name): + capsule = cydriver.__pyx_capi__[name] + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") +``` -### Why Two Capsules? +The `__pyx_capi__` dictionary contains PyCapsules that Cython automatically +generates for each `cdef` function declared in a `.pxd` file. Each capsule's +name is the function's C signature; we query it with `PyCapsule_GetName()` +rather than hardcoding signatures. 
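+
+For illustration, the signature recorded for a given entry point can be
+inspected directly (a minimal sketch; the exact signature text depends on the
+installed cuda-bindings version):
+
+```cython
+from cpython.pycapsule cimport PyCapsule_GetName
+
+import cuda.bindings.cydriver as cydriver
+
+cap = cydriver.__pyx_capi__["cuStreamCreateWithPriority"]
+# The capsule name is the C signature Cython recorded, e.g.
+# b"CUresult (CUstream *, unsigned int, int)" (exact text may vary).
+print(PyCapsule_GetName(cap))
+```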
-| Capsule | Direction | Purpose | -|---------|-----------|---------| -| `_CXX_API` | C++ → Cython | Share handle functions across modules | -| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols | +This approach: +- Avoids linking against `libcuda.so` at build time +- Works on CPU-only machines (capsule extraction succeeds; actual driver calls + will return errors like `CUDA_ERROR_NO_DEVICE`) +- Requires no custom capsule infrastructure—uses Cython's built-in mechanism ## Key Implementation Details diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 2cbdce2fa9..179ad369eb 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,13 +16,47 @@ namespace cuda_core { // ============================================================================ -// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC) +// CUDA driver function pointers +// +// These are populated by _resource_handles.pyx at module import time using +// function pointers extracted from cuda.bindings.cydriver.__pyx_capi__. // ============================================================================ -namespace { +decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain = nullptr; +decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease = nullptr; +decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent = nullptr; + +decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority = nullptr; +decltype(&cuStreamDestroy) p_cuStreamDestroy = nullptr; + +decltype(&cuEventCreate) p_cuEventCreate = nullptr; +decltype(&cuEventDestroy) p_cuEventDestroy = nullptr; +decltype(&cuIpcOpenEventHandle) p_cuIpcOpenEventHandle = nullptr; + +decltype(&cuDeviceGetCount) p_cuDeviceGetCount = nullptr; + +decltype(&cuMemPoolSetAccess) p_cuMemPoolSetAccess = nullptr; +decltype(&cuMemPoolDestroy) p_cuMemPoolDestroy = nullptr; +decltype(&cuMemPoolCreate) p_cuMemPoolCreate = nullptr; +decltype(&cuDeviceGetMemPool) p_cuDeviceGetMemPool = nullptr; +decltype(&cuMemPoolImportFromShareableHandle) p_cuMemPoolImportFromShareableHandle = nullptr; + +decltype(&cuMemAllocFromPoolAsync) p_cuMemAllocFromPoolAsync = nullptr; +decltype(&cuMemAllocAsync) p_cuMemAllocAsync = nullptr; +decltype(&cuMemAlloc) p_cuMemAlloc = nullptr; +decltype(&cuMemAllocHost) p_cuMemAllocHost = nullptr; + +decltype(&cuMemFreeAsync) p_cuMemFreeAsync = nullptr; +decltype(&cuMemFree) p_cuMemFree = nullptr; +decltype(&cuMemFreeHost) p_cuMemFreeHost = nullptr; + +decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer = nullptr; + +// ============================================================================ +// GIL management helpers +// ============================================================================ -std::once_flag driver_load_once; -std::atomic driver_loaded{false}; +namespace { #if PY_VERSION_HEX < 0x030D0000 extern "C" int _Py_IsFinalizing(void); @@ -39,10 +72,6 @@ inline bool py_is_finalizing() noexcept { #endif } -// ============================================================================ -// GIL management helpers -// ============================================================================ - // Helper to release the GIL while calling into the CUDA driver. // This guard is *conditional*: if the caller already dropped the GIL, // we avoid calling PyEval_SaveThread (which requires holding the GIL). 
@@ -110,178 +139,6 @@ class GILAcquireGuard { bool acquired_; }; - -#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); name##_t p_##name = nullptr - -DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); -DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease); -DECLARE_DRIVER_FN(cuCtxGetCurrent); - -DECLARE_DRIVER_FN(cuStreamCreateWithPriority); -DECLARE_DRIVER_FN(cuStreamDestroy); - -DECLARE_DRIVER_FN(cuEventCreate); -DECLARE_DRIVER_FN(cuEventDestroy); -DECLARE_DRIVER_FN(cuIpcOpenEventHandle); - -DECLARE_DRIVER_FN(cuDeviceGetCount); - -DECLARE_DRIVER_FN(cuMemPoolSetAccess); -DECLARE_DRIVER_FN(cuMemPoolDestroy); -DECLARE_DRIVER_FN(cuMemPoolCreate); -DECLARE_DRIVER_FN(cuDeviceGetMemPool); -DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle); - -DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync); -DECLARE_DRIVER_FN(cuMemAllocAsync); -DECLARE_DRIVER_FN(cuMemAlloc); -DECLARE_DRIVER_FN(cuMemAllocHost); - -DECLARE_DRIVER_FN(cuMemFreeAsync); -DECLARE_DRIVER_FN(cuMemFree); -DECLARE_DRIVER_FN(cuMemFreeHost); - -DECLARE_DRIVER_FN(cuMemPoolImportPointer); - -#undef DECLARE_DRIVER_FN - -bool load_driver_api() noexcept { - struct CudaDriverApiV1 { - std::uint32_t abi_version; - std::uint32_t struct_size; - - std::uintptr_t cuDevicePrimaryCtxRetain; - std::uintptr_t cuDevicePrimaryCtxRelease; - std::uintptr_t cuCtxGetCurrent; - - std::uintptr_t cuStreamCreateWithPriority; - std::uintptr_t cuStreamDestroy; - - std::uintptr_t cuEventCreate; - std::uintptr_t cuEventDestroy; - std::uintptr_t cuIpcOpenEventHandle; - - std::uintptr_t cuDeviceGetCount; - - std::uintptr_t cuMemPoolSetAccess; - std::uintptr_t cuMemPoolDestroy; - std::uintptr_t cuMemPoolCreate; - std::uintptr_t cuDeviceGetMemPool; - std::uintptr_t cuMemPoolImportFromShareableHandle; - - std::uintptr_t cuMemAllocFromPoolAsync; - std::uintptr_t cuMemAllocAsync; - std::uintptr_t cuMemAlloc; - std::uintptr_t cuMemAllocHost; - - std::uintptr_t cuMemFreeAsync; - std::uintptr_t cuMemFree; - std::uintptr_t cuMemFreeHost; - - std::uintptr_t cuMemPoolImportPointer; - }; - - static constexpr const char* capsule_name = - "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; - - GILAcquireGuard gil; - if (!gil.acquired()) { - return false; - } - - // `_resource_handles` is already loaded (it exports the handle API capsule), - // so avoid import machinery and just grab the module object. 
-    PyObject* mod = PyImport_AddModule("cuda.core._resource_handles");  // borrowed
-    if (!mod) {
-        PyErr_Clear();
-        return false;
-    }
-
-    PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule");  // new ref
-    if (!fn) {
-        PyErr_Clear();
-        return false;
-    }
-
-    PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr);
-    Py_DECREF(fn);
-    if (!cap) {
-        PyErr_Clear();
-        return false;
-    }
-
-    const auto* api = static_cast<const CudaDriverApiV1*>(PyCapsule_GetPointer(cap, capsule_name));
-    Py_DECREF(cap);
-
-    if (!api) {
-        PyErr_Clear();
-        return false;
-    }
-    if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) {
-        return false;
-    }
-
-#define LOAD_ADDR(name) \
-    do { \
-        if (api->name == 0) { \
-            return false; \
-        } \
-        p_##name = reinterpret_cast<name##_t>(api->name); \
-    } while (0)
-
-    LOAD_ADDR(cuDevicePrimaryCtxRetain);
-    LOAD_ADDR(cuDevicePrimaryCtxRelease);
-    LOAD_ADDR(cuCtxGetCurrent);
-
-    LOAD_ADDR(cuStreamCreateWithPriority);
-    LOAD_ADDR(cuStreamDestroy);
-
-    LOAD_ADDR(cuEventCreate);
-    LOAD_ADDR(cuEventDestroy);
-    LOAD_ADDR(cuIpcOpenEventHandle);
-
-    LOAD_ADDR(cuDeviceGetCount);
-
-    LOAD_ADDR(cuMemPoolSetAccess);
-    LOAD_ADDR(cuMemPoolDestroy);
-    LOAD_ADDR(cuMemPoolCreate);
-    LOAD_ADDR(cuDeviceGetMemPool);
-    LOAD_ADDR(cuMemPoolImportFromShareableHandle);
-
-    LOAD_ADDR(cuMemAllocFromPoolAsync);
-    LOAD_ADDR(cuMemAllocAsync);
-    LOAD_ADDR(cuMemAlloc);
-    LOAD_ADDR(cuMemAllocHost);
-
-    LOAD_ADDR(cuMemFreeAsync);
-    LOAD_ADDR(cuMemFree);
-    LOAD_ADDR(cuMemFreeHost);
-
-    LOAD_ADDR(cuMemPoolImportPointer);
-
-#undef LOAD_ADDR
-
-    return true;
-}
-
-bool ensure_driver_loaded() noexcept {
-    // Fast path: already loaded (no locking needed)
-    if (driver_loaded.load(std::memory_order_acquire)) {
-        return true;
-    }
-
-    // Slow path: release GIL before acquiring call_once guard.
-    // This ensures lock order is always: guard mutex -> GIL, preventing deadlock.
-    // See DESIGN.md "Static Initialization and Deadlock Hazards".
-    GILReleaseGuard release_gil;
-    std::call_once(driver_load_once, []() {
-        // Inside call_once, safe to acquire GIL (correct lock order).
-        // load_driver_api() acquires GIL internally via GILAcquireGuard.
- driver_loaded.store(load_driver_api(), std::memory_order_release); - }); - return driver_loaded.load(std::memory_order_acquire); -} - } // namespace // ============================================================================ @@ -324,10 +181,6 @@ ContextHandle create_context_handle_ref(CUcontext ctx) { static thread_local std::vector primary_context_cache; ContextHandle get_primary_context(int device_id) noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } // Check thread-local cache if (static_cast(device_id) < primary_context_cache.size()) { if (auto cached = primary_context_cache[device_id]) { @@ -361,10 +214,6 @@ ContextHandle get_primary_context(int device_id) noexcept { } ContextHandle get_current_context() noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUcontext ctx = nullptr; if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) { @@ -387,10 +236,6 @@ struct StreamBox { } // namespace StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUstream stream; if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { @@ -449,10 +294,6 @@ struct EventBox { } // namespace EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { @@ -475,10 +316,6 @@ EventHandle create_event_handle(unsigned int flags) { } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { @@ -537,10 +374,6 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { } MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { @@ -555,10 +388,6 @@ MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { } MemoryPoolHandle get_device_mempool(int device_id) noexcept { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { @@ -568,10 +397,6 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept { } MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUmemoryPool pool; auto handle_ptr = reinterpret_cast(static_cast(fd)); @@ -616,10 +441,6 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) { @@ -638,10 +459,6 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle 
h_pool, } DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) { @@ -660,10 +477,6 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { } DevicePtrHandle deviceptr_alloc(size_t size) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; CUdeviceptr ptr; if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) { @@ -682,10 +495,6 @@ DevicePtrHandle deviceptr_alloc(size_t size) { } DevicePtrHandle deviceptr_alloc_host(size_t size) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } GILReleaseGuard gil; void* ptr; if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) { @@ -785,11 +594,6 @@ static std::mutex ipc_ptr_cache_mutex; static std::unordered_map, ExportDataKeyHash> ipc_ptr_cache; DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { - if (!ensure_driver_loaded()) { - err = CUDA_ERROR_NOT_INITIALIZED; - return {}; - } - auto data = const_cast( reinterpret_cast(export_data)); diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 6b82594c2b..adff5c37d7 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -24,6 +24,43 @@ CUresult peek_last_error() noexcept; // Explicitly clear the last error void clear_last_error() noexcept; +// ============================================================================ +// CUDA driver function pointers +// +// These are populated by _resource_handles.pyx at module import time using +// function pointers extracted from cuda.bindings.cydriver.__pyx_capi__. 
+// ============================================================================ + +extern decltype(&cuDevicePrimaryCtxRetain) p_cuDevicePrimaryCtxRetain; +extern decltype(&cuDevicePrimaryCtxRelease) p_cuDevicePrimaryCtxRelease; +extern decltype(&cuCtxGetCurrent) p_cuCtxGetCurrent; + +extern decltype(&cuStreamCreateWithPriority) p_cuStreamCreateWithPriority; +extern decltype(&cuStreamDestroy) p_cuStreamDestroy; + +extern decltype(&cuEventCreate) p_cuEventCreate; +extern decltype(&cuEventDestroy) p_cuEventDestroy; +extern decltype(&cuIpcOpenEventHandle) p_cuIpcOpenEventHandle; + +extern decltype(&cuDeviceGetCount) p_cuDeviceGetCount; + +extern decltype(&cuMemPoolSetAccess) p_cuMemPoolSetAccess; +extern decltype(&cuMemPoolDestroy) p_cuMemPoolDestroy; +extern decltype(&cuMemPoolCreate) p_cuMemPoolCreate; +extern decltype(&cuDeviceGetMemPool) p_cuDeviceGetMemPool; +extern decltype(&cuMemPoolImportFromShareableHandle) p_cuMemPoolImportFromShareableHandle; + +extern decltype(&cuMemAllocFromPoolAsync) p_cuMemAllocFromPoolAsync; +extern decltype(&cuMemAllocAsync) p_cuMemAllocAsync; +extern decltype(&cuMemAlloc) p_cuMemAlloc; +extern decltype(&cuMemAllocHost) p_cuMemAllocHost; + +extern decltype(&cuMemFreeAsync) p_cuMemFreeAsync; +extern decltype(&cuMemFree) p_cuMemFree; +extern decltype(&cuMemFreeHost) p_cuMemFreeHost; + +extern decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index f15d524430..5f1b016884 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -2,23 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 -# This module exists to compile _cpp/resource_handles.cpp into a shared library. -# The helper functions (cu, intptr, py) are implemented as inline C++ functions -# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. +# This module compiles _cpp/resource_handles.cpp into a shared library. +# At import time, it populates the C++ driver function pointers using +# capsules from cuda.bindings.cydriver.__pyx_capi__. -from cpython.pycapsule cimport PyCapsule_New -from libc.stdint cimport uint32_t, uint64_t, uintptr_t +from cpython.pycapsule cimport PyCapsule_GetName, PyCapsule_GetPointer, PyCapsule_New from ._resource_handles_cxx_api cimport ( ResourceHandlesCxxApiV1, get_resource_handles_cxx_api_v1, ) -import cython - +import cuda.bindings.cydriver as cydriver cdef const char* _CXX_API_NAME = b"cuda.core._resource_handles._CXX_API" -cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" # Export the C++ handles dispatch table as a PyCapsule. # Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. 
@@ -31,107 +28,97 @@ if _CXX_API is None: raise RuntimeError("Failed to create _CXX_API capsule") -cdef struct CudaDriverApiV1: - uint32_t abi_version - uint32_t struct_size - - uintptr_t cuDevicePrimaryCtxRetain - uintptr_t cuDevicePrimaryCtxRelease - uintptr_t cuCtxGetCurrent - - uintptr_t cuStreamCreateWithPriority - uintptr_t cuStreamDestroy - - uintptr_t cuEventCreate - uintptr_t cuEventDestroy - uintptr_t cuIpcOpenEventHandle - - uintptr_t cuDeviceGetCount - - uintptr_t cuMemPoolSetAccess - uintptr_t cuMemPoolDestroy - uintptr_t cuMemPoolCreate - uintptr_t cuDeviceGetMemPool - uintptr_t cuMemPoolImportFromShareableHandle - - uintptr_t cuMemAllocFromPoolAsync - uintptr_t cuMemAllocAsync - uintptr_t cuMemAlloc - uintptr_t cuMemAllocHost - - uintptr_t cuMemFreeAsync - uintptr_t cuMemFree - uintptr_t cuMemFreeHost - - uintptr_t cuMemPoolImportPointer - - -cdef CudaDriverApiV1 _cuda_driver_api_v1 -cdef bint _cuda_driver_api_v1_inited = False - - -cdef inline uintptr_t _as_addr(object pfn) except 0: - return int(pfn) - - -cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: - err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) - if int(err) != 0 or pfn is None: - raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") - return _as_addr(pfn) - - -def _get_cuda_driver_api_v1_capsule(): - """Return a PyCapsule containing cached CUDA driver entrypoints. - - This is evaluated lazily on first use so cuda-core remains importable on - CPU-only machines. - """ - global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 - if not _cuda_driver_api_v1_inited: - import cuda.bindings.driver as d - - err, ver = d.cuDriverGetVersion() - if int(err) != 0: - raise RuntimeError(f"cuDriverGetVersion failed: {err}") - driver_ver = int(ver) - - flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT - - _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") - _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") - _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") - - _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") - _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") - - _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") - _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") - _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") - - _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") - - _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") - _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") - _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") - _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") - _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( - d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" - ) - - _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") - _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") - _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, 
b"cuMemAlloc") - _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") - - _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") - _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") - _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") - - _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") - - _cuda_driver_api_v1.abi_version = 1 - _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) - _cuda_driver_api_v1_inited = True - - return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) +# ============================================================================= +# CUDA driver function pointer initialization +# +# The C++ code declares extern function pointers (p_cuXxx) that need to be +# populated before any handle creation functions are called. We extract these +# from cuda.bindings.cydriver.__pyx_capi__ at module import time. +# +# The Cython string substitution (e.g., "reinterpret_cast(...)") +# allows us to assign void* values to typed function pointer variables. +# ============================================================================= + +# Declare extern variables with reinterpret_cast to allow void* assignment +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # Context + void* p_cuDevicePrimaryCtxRetain "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRetain)" + void* p_cuDevicePrimaryCtxRelease "reinterpret_cast(cuda_core::p_cuDevicePrimaryCtxRelease)" + void* p_cuCtxGetCurrent "reinterpret_cast(cuda_core::p_cuCtxGetCurrent)" + + # Stream + void* p_cuStreamCreateWithPriority "reinterpret_cast(cuda_core::p_cuStreamCreateWithPriority)" + void* p_cuStreamDestroy "reinterpret_cast(cuda_core::p_cuStreamDestroy)" + + # Event + void* p_cuEventCreate "reinterpret_cast(cuda_core::p_cuEventCreate)" + void* p_cuEventDestroy "reinterpret_cast(cuda_core::p_cuEventDestroy)" + void* p_cuIpcOpenEventHandle "reinterpret_cast(cuda_core::p_cuIpcOpenEventHandle)" + + # Device + void* p_cuDeviceGetCount "reinterpret_cast(cuda_core::p_cuDeviceGetCount)" + + # Memory pool + void* p_cuMemPoolSetAccess "reinterpret_cast(cuda_core::p_cuMemPoolSetAccess)" + void* p_cuMemPoolDestroy "reinterpret_cast(cuda_core::p_cuMemPoolDestroy)" + void* p_cuMemPoolCreate "reinterpret_cast(cuda_core::p_cuMemPoolCreate)" + void* p_cuDeviceGetMemPool "reinterpret_cast(cuda_core::p_cuDeviceGetMemPool)" + void* p_cuMemPoolImportFromShareableHandle "reinterpret_cast(cuda_core::p_cuMemPoolImportFromShareableHandle)" + + # Memory allocation + void* p_cuMemAllocFromPoolAsync "reinterpret_cast(cuda_core::p_cuMemAllocFromPoolAsync)" + void* p_cuMemAllocAsync "reinterpret_cast(cuda_core::p_cuMemAllocAsync)" + void* p_cuMemAlloc "reinterpret_cast(cuda_core::p_cuMemAlloc)" + void* p_cuMemAllocHost "reinterpret_cast(cuda_core::p_cuMemAllocHost)" + + # Memory deallocation + void* p_cuMemFreeAsync "reinterpret_cast(cuda_core::p_cuMemFreeAsync)" + void* p_cuMemFree "reinterpret_cast(cuda_core::p_cuMemFree)" + void* p_cuMemFreeHost "reinterpret_cast(cuda_core::p_cuMemFreeHost)" + + # IPC + void* p_cuMemPoolImportPointer "reinterpret_cast(cuda_core::p_cuMemPoolImportPointer)" + + +# Initialize driver function pointers from cydriver.__pyx_capi__ at module load +cdef void* _get_driver_fn(str name): + capsule = cydriver.__pyx_capi__[name] + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +# Context 
+p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") +p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") +p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") + +# Stream +p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") +p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") + +# Event +p_cuEventCreate = _get_driver_fn("cuEventCreate") +p_cuEventDestroy = _get_driver_fn("cuEventDestroy") +p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") + +# Device +p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") + +# Memory pool +p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") +p_cuMemPoolDestroy = _get_driver_fn("cuMemPoolDestroy") +p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") +p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") +p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") + +# Memory allocation +p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") +p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") +p_cuMemAlloc = _get_driver_fn("cuMemAlloc") +p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") + +# Memory deallocation +p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") +p_cuMemFree = _get_driver_fn("cuMemFree") +p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") + +# IPC +p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer")
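
The cname substitution used above reduces to a small, self-contained pattern; a
minimal sketch with hypothetical names (`example.hpp`, `g_fn`), assuming the
extension is compiled as C++:

```cython
# Hypothetical illustration only; example.hpp would declare:
#     extern int (*g_fn)(int);
cdef extern from "example.hpp":
    # Expose the typed global to Cython as a writable void* lvalue.
    void* g_fn "reinterpret_cast<void*&>(g_fn)"

cdef void* p = NULL
# Generated C++ for the next line:  reinterpret_cast<void*&>(g_fn) = p;
g_fn = p
```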