From 6ec833021e5bcda2a5b9e5f8f61e996f434e3647 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 12 Jan 2026 12:45:58 -0800 Subject: [PATCH] Eliminate _CXX_API capsule for resource handle functions Replace the PyCapsule-based function pointer table with direct Cython cimport. Consumer modules now call resource handle functions directly through _resource_handles.so, simplifying the architecture while correctly sharing static/thread-local state. Changes: - Remove _CXX_API capsule infrastructure (resource_handles_cxx_api.hpp, _resource_handles_cxx_api.pxd, get_resource_handles_cxx_api_v1()) - Remove _init_handles_table() calls from all consumer modules - Rename create_event_handle(flags) to create_event_handle_noctx(flags) to avoid C++ overload ambiguity for Cython binding - Update DESIGN.md to reflect the simplified architecture - Add clarifying comment in build_hooks.py for cpp file discovery Closes #1452 --- cuda_core/build_hooks.py | 1 + cuda_core/cuda/core/_cpp/DESIGN.md | 80 ++---- cuda_core/cuda/core/_cpp/resource_handles.cpp | 59 +--- cuda_core/cuda/core/_cpp/resource_handles.hpp | 2 +- .../core/_cpp/resource_handles_cxx_api.hpp | 79 ------ cuda_core/cuda/core/_device.pyx | 4 - cuda_core/cuda/core/_event.pyx | 4 - cuda_core/cuda/core/_memory/_buffer.pyx | 4 - .../core/_memory/_graph_memory_resource.pyx | 4 - cuda_core/cuda/core/_memory/_ipc.pyx | 4 - cuda_core/cuda/core/_memory/_memory_pool.pyx | 4 - cuda_core/cuda/core/_memoryview.pyx | 4 - cuda_core/cuda/core/_resource_handles.pxd | 264 ++++-------------- cuda_core/cuda/core/_resource_handles.pyx | 106 +++++-- .../cuda/core/_resource_handles_cxx_api.pxd | 68 ----- cuda_core/cuda/core/_stream.pyx | 4 - 16 files changed, 179 insertions(+), 512 deletions(-) delete mode 100644 cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp delete mode 100644 cuda_core/cuda/core/_resource_handles_cxx_api.pxd diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 63f32020d1..bb7951db62 100644 --- 
a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -105,6 +105,7 @@ def get_sources(mod_name): sources = [f"cuda/core/{mod_name}.pyx"] # Add module-specific .cpp file from _cpp/ directory if it exists + # Example: _resource_handles.pyx finds _cpp/resource_handles.cpp. cpp_file = f"cuda/core/_cpp/{mod_name.lstrip('_')}.cpp" if os.path.exists(cpp_file): sources.append(cpp_file) diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md index 3accf1bcd0..61f11e1ec9 100644 --- a/cuda_core/cuda/core/_cpp/DESIGN.md +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -101,25 +101,20 @@ return as_py(h_stream) # cuda.bindings.driver.CUstream ``` cuda/core/ ├── _resource_handles.pyx # Cython module (compiles resource_handles.cpp) -├── _resource_handles.pxd # Cython declarations and dispatch wrappers +├── _resource_handles.pxd # Cython declarations for consumer modules └── _cpp/ ├── resource_handles.hpp # C++ API declarations - ├── resource_handles.cpp # C++ implementation - └── resource_handles_cxx_api.hpp # Capsule struct definition + └── resource_handles.cpp # C++ implementation ``` ### Build Implications The `_cpp/` subdirectory contains C++ source files that are compiled into the `_resource_handles` extension module. Other Cython modules in cuda.core do **not** -link against this code directly—they access it through a capsule mechanism -(explained below). +link against this code directly—they `cimport` functions from +`_resource_handles.pxd`, and calls go through `_resource_handles.so` at runtime. -## Capsule Architecture - -The implementation uses **two separate capsule mechanisms** for different purposes: - -### Capsule 1: C++ API Table (`_CXX_API`) +## Cross-Module Function Sharing **Problem**: Cython extension modules compile independently. If multiple modules (`_memory.pyx`, `_ipc.pyx`, etc.) 
each linked `resource_handles.cpp`, they would @@ -129,38 +124,32 @@ each have their own copies of: - Thread-local error state - Other static data, including global caches -**Solution**: Only `_resource_handles.so` links the C++ code. It exports a capsule -containing function pointers: - -```cpp -struct ResourceHandlesCxxApiV1 { - uint32_t abi_version; - uint32_t struct_size; - - // Thread-local error handling - CUresult (*get_last_error)() noexcept; - CUresult (*peek_last_error)() noexcept; - void (*clear_last_error)() noexcept; +**Solution**: Only `_resource_handles.so` links the C++ code. The `.pyx` file +uses `cdef extern from` to declare C++ functions with Cython-accessible names: - // Handle creation functions - ContextHandle (*get_primary_context)(int device_id) noexcept; - StreamHandle (*create_stream_handle)(...) noexcept; - // ... etc -}; +```cython +# In _resource_handles.pyx +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + StreamHandle create_stream_handle "cuda_core::create_stream_handle" ( + ContextHandle h_ctx, unsigned int flags, int priority) nogil + # ... other functions ``` -Other Cython modules import this capsule at runtime and call through the function -pointers. The `.pxd` file provides inline wrappers that hide this indirection: +The `.pxd` file declares these same functions so other modules can `cimport` them: ```cython -cdef inline StreamHandle create_stream_handle(...) except * nogil: - return _handles_table.create_stream_handle(...) +# In _resource_handles.pxd +cdef StreamHandle create_stream_handle( + ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil ``` -Importing modules are expected to call `_init_handles_table()` prior to calling -any wrapper functions. +The `cdef extern from` declaration in the `.pyx` satisfies the `.pxd` declaration +directly—no wrapper functions are needed. When consumer modules `cimport` these +functions, Cython generates calls through `_resource_handles.so` at runtime. 
+This ensures all static and thread-local state lives in a single shared library, +avoiding the duplicate state problem. -### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`) +## CUDA Driver API Capsule (`_CUDA_DRIVER_API_V1`) **Problem**: cuda.core cannot directly call CUDA driver functions because: @@ -186,13 +175,6 @@ struct CudaDriverApiV1 { The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the function pointers for subsequent use. -### Why Two Capsules? - -| Capsule | Direction | Purpose | -|---------|-----------|---------| -| `_CXX_API` | C++ → Cython | Share handle functions across modules | -| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols | - ## Key Implementation Details ### Structural Dependencies @@ -276,14 +258,12 @@ Related functions: from cuda.core._resource_handles cimport ( StreamHandle, create_stream_handle, - cu, - intptr, + as_cu, + as_intptr, + as_py, get_last_error, - _init_handles_table, ) -_init_handles_table() # prerequisite before calling handle API functions - # Create a stream cdef StreamHandle h_stream = create_stream_handle(h_ctx, flags, priority) if not h_stream: @@ -302,10 +282,10 @@ The resource handle design: 1. **Separates resource management** into its own layer, independent of Python objects. 2. **Encodes lifetimes structurally** via embedded handle dependencies. -3. **Uses capsules** to solve two distinct problems: - - Sharing C++ code across Cython modules without duplicate statics. - - Resolving CUDA driver symbols dynamically through cuda-bindings. -4. **Provides overloaded accessors** (`cu`, `intptr`, `py`) since handles cannot +3. **Uses Cython's `cimport` mechanism** to share C++ code across modules without + duplicate static/thread-local state. +4. **Uses a capsule** to resolve CUDA driver symbols dynamically through cuda-bindings. +5. 
**Provides overloaded accessors** (`as_cu`, `as_intptr`, `as_py`) since handles cannot have attributes without unnecessary Python object wrappers. This architecture ensures CUDA resources are managed correctly regardless of Python diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 2cbdce2fa9..82dd234a00 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -5,7 +5,6 @@ #include #include "resource_handles.hpp" -#include "resource_handles_cxx_api.hpp" #include #include #include @@ -470,7 +469,7 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { return EventHandle(box, &box->resource); } -EventHandle create_event_handle(unsigned int flags) { +EventHandle create_event_handle_noctx(unsigned int flags) { return create_event_handle(ContextHandle{}, flags); } @@ -857,60 +856,4 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export } } -// ============================================================================ -// Capsule C++ API table -// ============================================================================ - -const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept { - static const ResourceHandlesCxxApiV1 table = []() { - ResourceHandlesCxxApiV1 t{}; - t.abi_version = RESOURCE_HANDLES_CXX_API_VERSION; - t.struct_size = static_cast(sizeof(ResourceHandlesCxxApiV1)); - - // Error handling - t.get_last_error = &get_last_error; - t.peek_last_error = &peek_last_error; - t.clear_last_error = &clear_last_error; - - // Context - t.create_context_handle_ref = &create_context_handle_ref; - t.get_primary_context = &get_primary_context; - t.get_current_context = &get_current_context; - - // Stream - t.create_stream_handle = &create_stream_handle; - t.create_stream_handle_ref = &create_stream_handle_ref; - t.create_stream_handle_with_owner = &create_stream_handle_with_owner; - 
t.get_legacy_stream = &get_legacy_stream; - t.get_per_thread_stream = &get_per_thread_stream; - - // Event (resolve overloads explicitly) - t.create_event_handle = - static_cast(&create_event_handle); - t.create_event_handle_noctx = - static_cast(&create_event_handle); - t.create_event_handle_ipc = &create_event_handle_ipc; - - // Memory pool - t.create_mempool_handle = &create_mempool_handle; - t.create_mempool_handle_ref = &create_mempool_handle_ref; - t.get_device_mempool = &get_device_mempool; - t.create_mempool_handle_ipc = &create_mempool_handle_ipc; - - // Device pointer - t.deviceptr_alloc_from_pool = &deviceptr_alloc_from_pool; - t.deviceptr_alloc_async = &deviceptr_alloc_async; - t.deviceptr_alloc = &deviceptr_alloc; - t.deviceptr_alloc_host = &deviceptr_alloc_host; - t.deviceptr_create_ref = &deviceptr_create_ref; - t.deviceptr_create_with_owner = &deviceptr_create_with_owner; - t.deviceptr_import_ipc = &deviceptr_import_ipc; - t.deallocation_stream = &deallocation_stream; - t.set_deallocation_stream = &set_deallocation_stream; - - return t; - }(); - return &table; -} - } // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 6b82594c2b..0338b052f6 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -91,7 +91,7 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); // Use for temporary events that are created and destroyed in the same scope. // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). -EventHandle create_event_handle(unsigned int flags); +EventHandle create_event_handle_noctx(unsigned int flags); // Create an owning event handle from an IPC handle. // The originating process owns the event and its context. 
diff --git a/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp deleted file mode 100644 index 6ff07a6ee0..0000000000 --- a/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "resource_handles.hpp" - -namespace cuda_core { - -// C++ capsule API for cross-extension-module calls. -// -// The function-pointer table is exported from the Python extension module -// `cuda.core._resource_handles` as a PyCapsule named: -// -// "cuda.core._resource_handles._CXX_API" -// -// Other extension modules import the capsule and dispatch through the table to -// ensure there is a single owner of all correctness-critical static/thread_local -// state in resource_handles.cpp (caches, last-error state, etc.). - -static constexpr std::uint32_t RESOURCE_HANDLES_CXX_API_VERSION = 1; - -struct ResourceHandlesCxxApiV1 { - std::uint32_t abi_version; - std::uint32_t struct_size; - - // Thread-local error handling - CUresult (*get_last_error)() noexcept; - CUresult (*peek_last_error)() noexcept; - void (*clear_last_error)() noexcept; - - // Context handles - ContextHandle (*create_context_handle_ref)(CUcontext ctx); - ContextHandle (*get_primary_context)(int device_id) noexcept; - ContextHandle (*get_current_context)() noexcept; - - // Stream handles - StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority); - StreamHandle (*create_stream_handle_ref)(CUstream stream); - StreamHandle (*create_stream_handle_with_owner)(CUstream stream, PyObject* owner); - StreamHandle (*get_legacy_stream)() noexcept; - StreamHandle (*get_per_thread_stream)() noexcept; - - // Event handles - EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags); - EventHandle 
(*create_event_handle_noctx)(unsigned int flags); - EventHandle (*create_event_handle_ipc)(const CUipcEventHandle& ipc_handle); - - // Memory pool handles - MemoryPoolHandle (*create_mempool_handle)(const CUmemPoolProps& props); - MemoryPoolHandle (*create_mempool_handle_ref)(CUmemoryPool pool); - MemoryPoolHandle (*get_device_mempool)(int device_id) noexcept; - MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, CUmemAllocationHandleType handle_type); - - // Device pointer handles - DevicePtrHandle (*deviceptr_alloc_from_pool)( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream); - DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream); - DevicePtrHandle (*deviceptr_alloc)(size_t size); - DevicePtrHandle (*deviceptr_alloc_host)(size_t size); - DevicePtrHandle (*deviceptr_create_ref)(CUdeviceptr ptr); - DevicePtrHandle (*deviceptr_create_with_owner)(CUdeviceptr ptr, PyObject* owner); - DevicePtrHandle (*deviceptr_import_ipc)( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream); - StreamHandle (*deallocation_stream)(const DevicePtrHandle& h); - void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream); -}; - -// Return pointer to a process-wide singleton table. 
-const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; - -} // namespace cuda_core diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 60c9ae5973..dc7b0b69c9 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -17,15 +17,11 @@ from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions from cuda.core._resource_handles cimport ( ContextHandle, - _init_handles_table, create_context_handle_ref, get_primary_context, as_cu, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._graph import GraphBuilder from cuda.core._stream import IsStreamT, Stream, StreamOptions from cuda.core._utils.clear_error_support import assert_type diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 2a47faf789..0d25959a22 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -11,7 +11,6 @@ from cuda.core._context cimport Context from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, - _init_handles_table, create_event_handle, create_event_handle_ipc, as_intptr, @@ -19,9 +18,6 @@ from cuda.core._resource_handles cimport ( as_py, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index bd131ab8ee..6dc53a3b96 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -16,16 +16,12 @@ from cuda.core._memory cimport _ipc from cuda.core._resource_handles cimport ( DevicePtrHandle, StreamHandle, - _init_handles_table, deviceptr_create_with_owner, as_intptr, as_cu, set_deallocation_stream, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) 
-_init_handles_table() - from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 6742d7c12f..090dd4468a 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -10,14 +10,10 @@ from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core._resource_handles cimport ( DevicePtrHandle, - _init_handles_table, deviceptr_alloc_async, as_cu, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx index 690cfcdb65..9e89ad90d9 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -10,16 +10,12 @@ from cuda.core._memory._memory_pool cimport _MemPool from cuda.core._stream cimport Stream from cuda.core._resource_handles cimport ( DevicePtrHandle, - _init_handles_table, create_mempool_handle_ipc, deviceptr_import_ipc, get_last_error, as_cu, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._stream cimport default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import check_multiprocessing_start_method diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 60c85a1a46..563f556015 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -16,7 +16,6 @@ from cuda.core._stream cimport default_stream, Stream_accept, Stream from 
cuda.core._resource_handles cimport ( MemoryPoolHandle, DevicePtrHandle, - _init_handles_table, create_mempool_handle, create_mempool_handle_ref, get_device_mempool, @@ -25,9 +24,6 @@ from cuda.core._resource_handles cimport ( as_py, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 6d909af89b..24aa2e8ff7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -16,14 +16,10 @@ import numpy from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ( EventHandle, - _init_handles_table, create_event_handle_noctx, as_cu, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._utils.cuda_utils import handle_return, driver from cuda.core._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 3f9eef72c7..5560b5c526 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -3,15 +3,19 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stddef cimport size_t -from libc.stdint cimport intptr_t, uint32_t -from libcpp.memory cimport shared_ptr +from libc.stdint cimport intptr_t -from cpython.pycapsule cimport PyCapsule_Import +from libcpp.memory cimport shared_ptr from cuda.bindings cimport cydriver -# Declare the C++ namespace and types (inline helpers live in the header). 
+ +# ============================================================================= +# Handle type aliases and inline helpers (declared from C++ header) +# ============================================================================= + cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # Handle types ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle ctypedef shared_ptr[const cydriver.CUstream] StreamHandle ctypedef shared_ptr[const cydriver.CUevent] EventHandle @@ -40,207 +44,53 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(DevicePtrHandle h) -# The resource handles API table is exported from `cuda.core._resource_handles` -# as a PyCapsule named: +# ============================================================================= +# Wrapper function declarations (implemented in _resource_handles.pyx) # -# "cuda.core._resource_handles._CXX_API" -# -# Consumers dispatch through this table to avoid relying on RTLD_GLOBAL and to -# ensure a single owner of correctness-critical static/thread_local state. 
-cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": - cdef struct ResourceHandlesCxxApiV1: - uint32_t abi_version - uint32_t struct_size - - # Thread-local error handling - cydriver.CUresult (*get_last_error)() nogil - cydriver.CUresult (*peek_last_error)() nogil - void (*clear_last_error)() nogil - - # Context handles - ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil - ContextHandle (*get_primary_context)(int device_id) nogil - ContextHandle (*get_current_context)() nogil - - # Stream handles - StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil - StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil - StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) - StreamHandle (*get_legacy_stream)() nogil - StreamHandle (*get_per_thread_stream)() nogil - - # Event handles - EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil - EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil - EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil - - # Memory pool handles - MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil - MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil - MemoryPoolHandle (*get_device_mempool)(int device_id) nogil - MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil - - # Device pointer handles - DevicePtrHandle (*deviceptr_alloc_from_pool)( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) nogil - DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil - DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil - DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil - DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil - DevicePtrHandle 
(*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) - DevicePtrHandle (*deviceptr_import_ipc)( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream) nogil - StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil - void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil - - const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil - - -cdef const ResourceHandlesCxxApiV1* _handles_table = NULL - - -cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogil: - global _handles_table - if _handles_table == NULL: - with gil: - if _handles_table == NULL: - _handles_table = PyCapsule_Import( - b"cuda.core._resource_handles._CXX_API", 0 - ) - if _handles_table == NULL: - raise ImportError("Failed to import cuda.core._resource_handles._CXX_API capsule") - if _handles_table.abi_version != 1: - raise ImportError("Unsupported resource handles C++ API version") - if _handles_table.struct_size < sizeof(ResourceHandlesCxxApiV1): - raise ImportError("Resource handles C++ API table is too small") - return _handles_table - - -# ----------------------------------------------------------------------------- -# Dispatch wrappers -# -# These wrappers assume _handles_table has been initialized. Consumers must call -# _init_handles_table() at module level before using these functions in nogil blocks. -# ----------------------------------------------------------------------------- - -cdef inline void _init_handles_table() except *: - """Initialize the handles table. 
Call at module level before using wrappers.""" - _get_handles_table() - - -cdef inline cydriver.CUresult get_last_error() noexcept nogil: - return _handles_table.get_last_error() - - -cdef inline cydriver.CUresult peek_last_error() noexcept nogil: - return _handles_table.peek_last_error() - - -cdef inline void clear_last_error() noexcept nogil: - _handles_table.clear_last_error() - - -cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil: - return _handles_table.create_context_handle_ref(ctx) - - -cdef inline ContextHandle get_primary_context(int device_id) noexcept nogil: - return _handles_table.get_primary_context(device_id) - - -cdef inline ContextHandle get_current_context() noexcept nogil: - return _handles_table.get_current_context() - - -cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil: - return _handles_table.create_stream_handle(h_ctx, flags, priority) - - -cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil: - return _handles_table.create_stream_handle_ref(stream) - - -cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner): - return _handles_table.create_stream_handle_with_owner(stream, owner) - - -cdef inline StreamHandle get_legacy_stream() noexcept nogil: - return _handles_table.get_legacy_stream() - - -cdef inline StreamHandle get_per_thread_stream() noexcept nogil: - return _handles_table.get_per_thread_stream() - - -cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil: - return _handles_table.create_event_handle(h_ctx, flags) - - -cdef inline EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil: - return _handles_table.create_event_handle_noctx(flags) - - -cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil: - return 
_handles_table.create_event_handle_ipc(ipc_handle) - - -cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) noexcept nogil: - return _handles_table.create_mempool_handle(props) - - -cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil: - return _handles_table.create_mempool_handle_ref(pool) - - -cdef inline MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil: - return _handles_table.get_device_mempool(device_id) - - -cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil: - return _handles_table.create_mempool_handle_ipc(fd, handle_type) - - -cdef inline DevicePtrHandle deviceptr_alloc_from_pool( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) noexcept nogil: - return _handles_table.deviceptr_alloc_from_pool(size, h_pool, h_stream) - - -cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil: - return _handles_table.deviceptr_alloc_async(size, h_stream) - - -cdef inline DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil: - return _handles_table.deviceptr_alloc(size) - - -cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil: - return _handles_table.deviceptr_alloc_host(size) - - -cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil: - return _handles_table.deviceptr_create_ref(ptr) - - -cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner): - return _handles_table.deviceptr_create_with_owner(ptr, owner) - - -cdef inline DevicePtrHandle deviceptr_import_ipc( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream) noexcept nogil: - return _handles_table.deviceptr_import_ipc(h_pool, export_data, h_stream) - - -cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil: - return 
_handles_table.deallocation_stream(h) - - -cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil: - _handles_table.set_deallocation_stream(h, h_stream) +# Consumer modules cimport these. Calls go through _resource_handles.so. +# ============================================================================= + +# Thread-local error handling +cdef cydriver.CUresult get_last_error() noexcept nogil +cdef cydriver.CUresult peek_last_error() noexcept nogil +cdef void clear_last_error() noexcept nogil + +# Context handles +cdef ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil +cdef ContextHandle get_primary_context(int device_id) noexcept nogil +cdef ContextHandle get_current_context() noexcept nogil + +# Stream handles +cdef StreamHandle create_stream_handle( + ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil +cdef StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil +cdef StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner) +cdef StreamHandle get_legacy_stream() noexcept nogil +cdef StreamHandle get_per_thread_stream() noexcept nogil + +# Event handles +cdef EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil +cdef EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil +cdef EventHandle create_event_handle_ipc( + const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil + +# Memory pool handles +cdef MemoryPoolHandle create_mempool_handle( + const cydriver.CUmemPoolProps& props) noexcept nogil +cdef MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil +cdef MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil +cdef MemoryPoolHandle create_mempool_handle_ipc( + int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil + +# Device pointer handles +cdef DevicePtrHandle deviceptr_alloc_from_pool( + size_t 
size, MemoryPoolHandle h_pool, StreamHandle h_stream) noexcept nogil +cdef DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil +cdef DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil +cdef DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil +cdef DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil +cdef DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) +cdef DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) noexcept nogil +cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil +cdef void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index f15d524430..d98886be6d 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -2,33 +2,105 @@ # # SPDX-License-Identifier: Apache-2.0 -# This module exists to compile _cpp/resource_handles.cpp into a shared library. -# The helper functions (cu, intptr, py) are implemented as inline C++ functions -# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. +# This module compiles _cpp/resource_handles.cpp into a shared library. +# Consumer modules cimport the functions declared in _resource_handles.pxd. +# Since there is only one copy of the C++ code (in this .so), all static and +# thread-local state is shared correctly across all consumer modules. +# +# The cdef extern from declarations below satisfy the .pxd declarations directly, +# without needing separate wrapper functions. 
from cpython.pycapsule cimport PyCapsule_New +from libc.stddef cimport size_t from libc.stdint cimport uint32_t, uint64_t, uintptr_t -from ._resource_handles_cxx_api cimport ( - ResourceHandlesCxxApiV1, - get_resource_handles_cxx_api_v1, +from cuda.bindings cimport cydriver + +from ._resource_handles cimport ( + ContextHandle, + StreamHandle, + EventHandle, + MemoryPoolHandle, + DevicePtrHandle, ) import cython -cdef const char* _CXX_API_NAME = b"cuda.core._resource_handles._CXX_API" -cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" - -# Export the C++ handles dispatch table as a PyCapsule. -# Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. -cdef const ResourceHandlesCxxApiV1* _handles_table = get_resource_handles_cxx_api_v1() -if _handles_table == NULL: - raise RuntimeError("Failed to initialize resource handles C++ API table") +# ============================================================================= +# C++ function declarations (non-inline, implemented in resource_handles.cpp) +# +# These declarations satisfy the cdef function declarations in _resource_handles.pxd. +# Consumer modules cimport these functions and calls go through this .so. 
+# ============================================================================= + +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + # Thread-local error handling + cydriver.CUresult get_last_error "cuda_core::get_last_error" () noexcept nogil + cydriver.CUresult peek_last_error "cuda_core::peek_last_error" () noexcept nogil + void clear_last_error "cuda_core::clear_last_error" () noexcept nogil + + # Context handles + ContextHandle create_context_handle_ref "cuda_core::create_context_handle_ref" ( + cydriver.CUcontext ctx) nogil + ContextHandle get_primary_context "cuda_core::get_primary_context" ( + int device_id) noexcept nogil + ContextHandle get_current_context "cuda_core::get_current_context" () noexcept nogil + + # Stream handles + StreamHandle create_stream_handle "cuda_core::create_stream_handle" ( + ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle create_stream_handle_ref "cuda_core::create_stream_handle_ref" ( + cydriver.CUstream stream) nogil + StreamHandle create_stream_handle_with_owner "cuda_core::create_stream_handle_with_owner" ( + cydriver.CUstream stream, object owner) + StreamHandle get_legacy_stream "cuda_core::get_legacy_stream" () noexcept nogil + StreamHandle get_per_thread_stream "cuda_core::get_per_thread_stream" () noexcept nogil + + # Event handles (the flags-only variant is named *_noctx to avoid C++ overload ambiguity) + EventHandle create_event_handle "cuda_core::create_event_handle" ( + ContextHandle h_ctx, unsigned int flags) nogil + EventHandle create_event_handle_noctx "cuda_core::create_event_handle_noctx" ( + unsigned int flags) nogil + EventHandle create_event_handle_ipc "cuda_core::create_event_handle_ipc" ( + const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle create_mempool_handle "cuda_core::create_mempool_handle" ( + const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle create_mempool_handle_ref
"cuda_core::create_mempool_handle_ref" ( + cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle get_device_mempool "cuda_core::get_device_mempool" ( + int device_id) noexcept nogil + MemoryPoolHandle create_mempool_handle_ipc "cuda_core::create_mempool_handle_ipc" ( + int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle deviceptr_alloc_from_pool "cuda_core::deviceptr_alloc_from_pool" ( + size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) nogil + DevicePtrHandle deviceptr_alloc_async "cuda_core::deviceptr_alloc_async" ( + size_t size, StreamHandle h_stream) nogil + DevicePtrHandle deviceptr_alloc "cuda_core::deviceptr_alloc" (size_t size) nogil + DevicePtrHandle deviceptr_alloc_host "cuda_core::deviceptr_alloc_host" (size_t size) nogil + DevicePtrHandle deviceptr_create_ref "cuda_core::deviceptr_create_ref" ( + cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle deviceptr_create_with_owner "cuda_core::deviceptr_create_with_owner" ( + cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle deviceptr_import_ipc "cuda_core::deviceptr_import_ipc" ( + MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) nogil + StreamHandle deallocation_stream "cuda_core::deallocation_stream" ( + const DevicePtrHandle& h) noexcept nogil + void set_deallocation_stream "cuda_core::set_deallocation_stream" ( + const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil + + +# ============================================================================= +# CUDA Driver API capsule +# +# This provides resolved CUDA driver function pointers to the C++ code. 
+# ============================================================================= -_CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) -if _CXX_API is None: - raise RuntimeError("Failed to create _CXX_API capsule") +cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" cdef struct CudaDriverApiV1: diff --git a/cuda_core/cuda/core/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/_resource_handles_cxx_api.pxd deleted file mode 100644 index da3d8d4fd3..0000000000 --- a/cuda_core/cuda/core/_resource_handles_cxx_api.pxd +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -from libc.stdint cimport uint32_t -from libc.stddef cimport size_t - -from cuda.bindings cimport cydriver -from ._resource_handles cimport ( - ContextHandle, - DevicePtrHandle, - EventHandle, - MemoryPoolHandle, - StreamHandle, -) - - -cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": - cdef struct ResourceHandlesCxxApiV1: - uint32_t abi_version - uint32_t struct_size - - # Thread-local error handling - cydriver.CUresult (*get_last_error)() nogil - cydriver.CUresult (*peek_last_error)() nogil - void (*clear_last_error)() nogil - - # Context handles - ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil - ContextHandle (*get_primary_context)(int device_id) nogil - ContextHandle (*get_current_context)() nogil - - # Stream handles - StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil - StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil - StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) - StreamHandle (*get_legacy_stream)() nogil - StreamHandle (*get_per_thread_stream)() nogil - - # Event handles - EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil 
- EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil - EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil - - # Memory pool handles - MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil - MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil - MemoryPoolHandle (*get_device_mempool)(int device_id) nogil - MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil - - # Device pointer handles - DevicePtrHandle (*deviceptr_alloc_from_pool)( - size_t size, - MemoryPoolHandle h_pool, - StreamHandle h_stream) nogil - DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil - DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil - DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil - DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil - DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) - DevicePtrHandle (*deviceptr_import_ipc)( - MemoryPoolHandle h_pool, - const void* export_data, - StreamHandle h_stream) nogil - StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil - void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil - - const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 7814586b48..d1747abe2d 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -29,7 +29,6 @@ from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, StreamHandle, - _init_handles_table, create_context_handle_ref, create_event_handle_noctx, create_stream_handle, @@ -42,9 +41,6 @@ from cuda.core._resource_handles cimport ( as_py, ) -# Prerequisite before calling handle API functions (see _cpp/DESIGN.md) -_init_handles_table() - from cuda.core._graph import 
GraphBuilder