From b17ffa80a62d60d0d04bf9940767830d78786566 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 16 Dec 2025 16:29:15 -0800 Subject: [PATCH 01/19] wip --- cuda_core/cuda/core/experimental/_module.py | 93 +++++++++++++++++++- cuda_core/cuda/core/experimental/_program.py | 71 +++++++++++++++ cuda_core/tests/test_module.py | 69 +++++++++++++++ cuda_core/tests/test_program.py | 43 +++++++++ 4 files changed, 275 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 9af722465b..f52817323f 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -436,7 +436,51 @@ def occupancy(self) -> KernelOccupancy: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy - # TODO: implement from_handle() + @staticmethod + def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": + """Create a new :obj:`Kernel` object from a foreign kernel handle. + + Uses a CUfunction or CUkernel pointer address represented as a Python int + to create a new :obj:`Kernel` object. + + Note + ---- + Kernel lifetime is not managed, foreign object must remain + alive while this kernel is active. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUfunction or CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. If not provided, + a placeholder ObjectCode will be created. Note that without a proper + ObjectCode, certain operations may be limited. + + Returns + ------- + :obj:`Kernel` + Newly created kernel object. + + """ + _lazy_init() + # Convert the integer handle to the appropriate driver type + if _py_major_ver >= 12 and _driver_ver >= 12000: + # Try CUkernel first for newer CUDA versions + kernel_obj = driver.CUkernel(handle) + else: + # Use CUfunction for older versions + kernel_obj = driver.CUfunction(handle) + + # If no module provided, create a placeholder + if mod is None: + # Create a placeholder ObjectCode that won't try to load anything + mod = ObjectCode._init(b"", "cubin") + # Set a dummy handle to prevent lazy loading + mod._handle = 1 # Non-null placeholder + + return Kernel._from_obj(kernel_obj, mod) CodeTypeT = Union[bytes, bytearray, str] @@ -605,6 +649,53 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d """ return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) + @staticmethod + def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + """Create a new :obj:`ObjectCode` object from a foreign module handle. + + Uses a CUmodule or CUlibrary pointer address represented as a Python int + to create a new :obj:`ObjectCode` object. + + Note + ---- + Module lifetime is not managed, foreign object must remain + alive while this object code is active. + + Parameters + ---------- + handle : int + Module handle representing the address of a foreign + module object (CUmodule or CUlibrary). + code_type : str, optional + The type of code object this handle represents. Must be one of + "cubin", "ptx", "ltoir", "fatbin", "object", or "library". + (Default: "cubin") + name : str, optional + A human-readable identifier representing this code object. + symbol_mapping : dict, optional + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + + Returns + ------- + :obj:`ObjectCode` + Newly created object code. + + """ + _lazy_init() + # Create an ObjectCode instance with a placeholder module + # The handle will be set directly, bypassing the lazy loading + obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping) + + # Set the handle directly from the foreign handle + if obj._backend_version == "new": + obj._handle = driver.CUlibrary(handle) + else: + obj._handle = driver.CUmodule(handle) + + return obj + # TODO: do we want to unload in a finalizer? Probably not.. def _lazy_load_module(self, *args, **kwargs): diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index f3ad9af644..d6e051e651 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -860,3 +860,74 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. """ return self._mnff.handle + + @staticmethod + def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Program: + """Create a new :obj:`Program` object from a foreign program handle. + + Uses a nvrtcProgram or NVVM program handle represented as a Python int + to create a new :obj:`Program` object. + + Note + ---- + Program lifetime is not managed, foreign object must remain + alive while this program is active. The handle should not be + destroyed externally while this Program object is in use. + + Parameters + ---------- + handle : int + Program handle representing the address of a foreign + program object (nvrtcProgram for NVRTC backend, or + NVVM program handle for NVVM backend). + backend : str + The backend type of the program handle. Must be either + "NVRTC" or "NVVM". This determines how the handle is + interpreted and used. + options : :obj:`ProgramOptions`, optional + Program options that may be used for subsequent operations. + If not provided, default options will be created. + + Returns + ------- + :obj:`Program` + Newly created program object. + + Raises + ------ + ValueError + If an unsupported backend is specified. + + """ + backend = backend.upper() + if backend not in ("NVRTC", "NVVM"): + raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'") + + # Create a new Program instance without going through __init__ + prog = object.__new__(Program) + + # Initialize the members needed for finalization + # Note: We pass None as the program_obj to avoid finalization since + # we don't own the handle + prog._mnff = Program._MembersNeededForFinalize.__new__( + Program._MembersNeededForFinalize + ) + prog._mnff.handle = None # Don't manage the foreign handle + prog._mnff.backend = backend + + # Store the backend and options + prog._backend = backend + prog._options = check_or_create_options(ProgramOptions, options, "Program options") + prog._linker = None + + # Store the handle directly without taking ownership + # This means the finalizer won't destroy it + if backend == "NVRTC": + prog._mnff.handle = nvrtc.nvrtcProgram(handle) + elif backend == "NVVM": + nvvm_module = _get_nvvm_module() + # For NVVM, we just store the handle as-is + # The actual NVVM program handle is opaque + prog._mnff.handle = handle + + return prog diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 25b8d5dd86..e150064129 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -420,3 +420,72 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode.code == result.code assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type + + +def test_object_code_from_handle(get_saxpy_kernel_cubin): + """Test ObjectCode.from_handle() with a valid handle""" + kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the handle from the original object code + handle = int(original_objcode.handle) + + # Create a new ObjectCode from the handle + objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map) + assert isinstance(objcode_from_handle, ObjectCode) + assert objcode_from_handle.code_type == "cubin" + + # Try to get a kernel from the new object code + # Note: This should work since we're reusing the handle + kernel_from_handle = objcode_from_handle.get_kernel("saxpy") + assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) + + +def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx): + """Test ObjectCode.from_handle() with PTX code type""" + ptx, original_objcode = get_saxpy_kernel_ptx + + if not Program._can_load_generated_ptx(): + pytest.skip("PTX version too new for current driver") + + # Force loading to get a handle + _ = original_objcode.get_kernel("saxpy") + handle = int(original_objcode.handle) + + # Create a new ObjectCode from the handle with PTX code type + objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map) + assert isinstance(objcode_from_handle, ObjectCode) + assert objcode_from_handle.code_type == "ptx" + + +def test_kernel_from_handle(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with a valid handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle + kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) + + # Verify we can access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() without providing a module""" + original_kernel, _ = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle without a module + kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) + + # Verify we can still access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 2b0ac5d617..9e1bbb5d20 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -538,3 +538,46 @@ def test_program_options_as_bytes_nvvm_unsupported_option(): options = ProgramOptions(arch="sm_80", lineinfo=True) with pytest.raises(CUDAError, match="not supported by NVVM backend"): options.as_bytes("nvvm") + + +def test_program_from_handle_nvrtc(init_cuda): + """Test Program.from_handle() with NVRTC backend""" + # Create a regular program to get a handle + code = 'extern "C" __global__ void test_kernel() {}' + original_program = Program(code, "c++") + assert original_program.backend == "NVRTC" + + # Get the handle + handle = int(original_program.handle) + + # Create a new program from the handle + program_from_handle = Program.from_handle(handle, "NVRTC") + assert program_from_handle.backend == "NVRTC" + # Note: We don't own the handle, so we shouldn't close it in the from_handle instance + + # Clean up the original program + original_program.close() + + +@nvvm_available +def test_program_from_handle_nvvm(init_cuda, nvvm_ir): + """Test Program.from_handle() with NVVM backend""" + # Create a regular NVVM program to get a handle + original_program = Program(nvvm_ir, "nvvm") + assert original_program.backend == "NVVM" + + # Get the handle + handle = int(original_program.handle) if hasattr(original_program.handle, '__int__') else original_program.handle + + # Create a new program from the handle + program_from_handle = Program.from_handle(handle, "NVVM") + assert program_from_handle.backend == "NVVM" + + # Clean up the original program + original_program.close() + + +def test_program_from_handle_invalid_backend(): + """Test Program.from_handle() with invalid backend""" + with pytest.raises(ValueError, match="Unsupported backend 'INVALID'"): + Program.from_handle(0, "INVALID") From 62eceae550cb5b0c9bd5db05309ee4fe564f8ed2 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 08:51:00 -0800 Subject: [PATCH 02/19] adding thread safety --- cuda_core/cuda/core/_module.py | 94 ++++++++++++++------ cuda_core/cuda/core/experimental/__init__.py | 1 + 2 files changed, 69 insertions(+), 26 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 842e316d20..cceb8d19ef 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import sys +import threading import weakref from collections import namedtuple from typing import Union @@ -27,7 +29,10 @@ } -# TODO: revisit this treatment for py313t builds +# Lazy initialization state and synchronization +# For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. +# For regular Python builds with GIL, the lock overhead is minimal and the code remains safe. +_init_lock = threading.Lock() _inited = False _py_major_ver = None _driver_ver = None @@ -35,27 +40,66 @@ def _lazy_init(): + """ + Initialize module-level state in a thread-safe manner. + + This function is thread-safe and suitable for both: + - Regular Python builds (with GIL) + - Python 3.13t free-threaded builds (without GIL) + + Uses double-checked locking pattern for performance: + - Fast path: check without lock if already initialized + - Slow path: acquire lock and initialize if needed + """ global _inited + # Fast path: already initialized (no lock needed for read) if _inited: return - global _py_major_ver, _driver_ver, _kernel_ctypes - # binding availability depends on cuda-python version - _py_major_ver, _ = get_binding_version() - if _py_major_ver >= 12: - _backend["new"] = { - "file": driver.cuLibraryLoadFromFile, - "data": driver.cuLibraryLoadData, - "kernel": driver.cuLibraryGetKernel, - "attribute": driver.cuKernelGetAttribute, - } - _kernel_ctypes = (driver.CUfunction, driver.CUkernel) - else: - _kernel_ctypes = (driver.CUfunction,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _py_major_ver >= 12 and _driver_ver >= 12040: - _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo - _inited = True + # Slow path: acquire lock and initialize + with _init_lock: + # Double-check: another thread might have initialized while we waited + if _inited: + return + + global _py_major_ver, _driver_ver, _kernel_ctypes + # binding availability depends on cuda-python version + _py_major_ver, _ = get_binding_version() + if _py_major_ver >= 12: + _backend["new"] = { + "file": driver.cuLibraryLoadFromFile, + "data": driver.cuLibraryLoadData, + "kernel": driver.cuLibraryGetKernel, + "attribute": driver.cuKernelGetAttribute, + } + _kernel_ctypes = (driver.CUfunction, driver.CUkernel) + else: + _kernel_ctypes = (driver.CUfunction,) + _driver_ver = handle_return(driver.cuDriverGetVersion()) + if _py_major_ver >= 12 and _driver_ver >= 12040: + _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo + + # Mark as initialized (must be last to ensure all state is set) + _inited = True + + +# Auto-initializing property accessors +def _get_py_major_ver(): + """Get the Python binding major version, initializing if needed.""" + _lazy_init() + return _py_major_ver + + +def _get_driver_ver(): + """Get the CUDA driver version, initializing if needed.""" + _lazy_init() + return _driver_ver + + +def _get_kernel_ctypes(): + """Get the kernel ctypes tuple, initializing if needed.""" + _lazy_init() + return _kernel_ctypes class KernelAttributes: @@ -70,7 +114,7 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" + self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" self._loader = _backend[self._backend_version] return self @@ -378,7 +422,7 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert_type(obj, _kernel_ctypes) + assert_type(obj, _get_kernel_ctypes()) assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj @@ -399,9 +443,10 @@ def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: if attr_impl._backend_version != "new": raise NotImplementedError("New backend is required") if "paraminfo" not in attr_impl._loader: + driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " - f"Using driver version {_driver_ver // 1000}.{(_driver_ver % 1000) // 10}" + f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) arg_pos = 0 param_info_data = [] @@ -464,9 +509,8 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": Newly created kernel object. """ - _lazy_init() # Convert the integer handle to the appropriate driver type - if _py_major_ver >= 12 and _driver_ver >= 12000: + if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: # Try CUkernel first for newer CUDA versions kernel_obj = driver.CUkernel(handle) else: @@ -518,12 +562,11 @@ def __new__(self, *args, **kwargs): def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): self = super().__new__(cls) assert code_type in self._supported_code_type, f"{code_type=} is not supported" - _lazy_init() # handle is assigned during _lazy_load self._handle = None - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" + self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" self._loader = _backend[self._backend_version] self._code_type = code_type @@ -683,7 +726,6 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol Newly created object code. """ - _lazy_init() # Create an ObjectCode instance with a placeholder module # The handle will be set directly, bypassing the lazy loading obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 3dbf3b7440..406c2e7e26 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -70,6 +70,7 @@ def _warn_deprecated(): VirtualMemoryResource, VirtualMemoryResourceOptions, ) +import cuda.core._module as _module # noqa: E402 from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 From 4e751a68ef7e4191eb5366f357f8493c1860af44 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 09:00:43 -0800 Subject: [PATCH 03/19] cleanup work --- cuda_core/cuda/core/_module.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index cceb8d19ef..762f562c4f 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -19,15 +19,6 @@ ) from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition -_backend = { - "old": { - "file": driver.cuModuleLoad, - "data": driver.cuModuleLoadDataEx, - "kernel": driver.cuModuleGetFunction, - "attribute": driver.cuFuncGetAttribute, - }, -} - # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. @@ -37,7 +28,14 @@ _py_major_ver = None _driver_ver = None _kernel_ctypes = None - +_backend = { + "old": { + "file": driver.cuModuleLoad, + "data": driver.cuModuleLoadDataEx, + "kernel": driver.cuModuleGetFunction, + "attribute": driver.cuFuncGetAttribute, + }, +} def _lazy_init(): """ @@ -102,6 +100,14 @@ def _get_kernel_ctypes(): return _kernel_ctypes +def _get_backend_version(): + """Get the backend version ("new" or "old") based on CUDA version. + + Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API). + """ + return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" + + class KernelAttributes: def __new__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") @@ -114,7 +120,7 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" + self._backend_version = _get_backend_version() self._loader = _backend[self._backend_version] return self @@ -241,7 +247,9 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - class KernelOccupancy: - """ """ + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ def __new__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") @@ -566,7 +574,7 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None # handle is assigned during _lazy_load self._handle = None - self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" + self._backend_version = _get_backend_version() self._loader = _backend[self._backend_version] self._code_type = code_type From 0dca71048e0793555fee7ecc05d3d0c033c52133 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 09:14:35 -0800 Subject: [PATCH 04/19] updating docstrings --- cuda_core/cuda/core/_module.py | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 762f562c4f..a2ecf4517f 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -491,15 +491,9 @@ def occupancy(self) -> KernelOccupancy: @staticmethod def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": - """Create a new :obj:`Kernel` object from a foreign kernel handle. + """Creates a new :obj:`Kernel` object from a foreign kernel handle. - Uses a CUfunction or CUkernel pointer address represented as a Python int - to create a new :obj:`Kernel` object. - - Note - ---- - Kernel lifetime is not managed, foreign object must remain - alive while this kernel is active. + Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. Parameters ---------- @@ -510,12 +504,6 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": The ObjectCode object associated with this kernel. If not provided, a placeholder ObjectCode will be created. Note that without a proper ObjectCode, certain operations may be limited. - - Returns - ------- - :obj:`Kernel` - Newly created kernel object. - """ # Convert the integer handle to the appropriate driver type if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: @@ -704,13 +692,7 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": """Create a new :obj:`ObjectCode` object from a foreign module handle. - Uses a CUmodule or CUlibrary pointer address represented as a Python int - to create a new :obj:`ObjectCode` object. - - Note - ---- - Module lifetime is not managed, foreign object must remain - alive while this object code is active. + Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object. Parameters ---------- @@ -727,12 +709,6 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol A dictionary specifying how the unmangled symbol names (as keys) should be mapped to the mangled names before trying to retrieve them (default to no mappings). - - Returns - ------- - :obj:`ObjectCode` - Newly created object code. - """ # Create an ObjectCode instance with a placeholder module # The handle will be set directly, bypassing the lazy loading From 8457870fa4b9c4c64b17d5af4ef61f4ab2814dda Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 09:32:42 -0800 Subject: [PATCH 05/19] removing experimental namespace --- cuda_core/tests/test_module.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index bfb0aafb19..31f7a3e303 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -425,32 +425,32 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): def test_object_code_from_handle(get_saxpy_kernel_cubin): """Test ObjectCode.from_handle() with a valid handle""" kernel, original_objcode = get_saxpy_kernel_cubin - + # Get the handle from the original object code handle = int(original_objcode.handle) - + # Create a new ObjectCode from the handle objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map) assert isinstance(objcode_from_handle, ObjectCode) assert objcode_from_handle.code_type == "cubin" - + # Try to get a kernel from the new object code # Note: This should work since we're reusing the handle kernel_from_handle = objcode_from_handle.get_kernel("saxpy") - assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx): """Test ObjectCode.from_handle() with PTX code type""" ptx, original_objcode = get_saxpy_kernel_ptx - + if not Program._can_load_generated_ptx(): pytest.skip("PTX version too new for current driver") - + # Force loading to get a handle _ = original_objcode.get_kernel("saxpy") handle = int(original_objcode.handle) - + # Create a new ObjectCode from the handle with PTX code type objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map) assert isinstance(objcode_from_handle, ObjectCode) @@ -460,14 +460,14 @@ def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx) def test_kernel_from_handle(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with a valid handle""" original_kernel, objcode = get_saxpy_kernel_cubin - + # Get the handle from the original kernel handle = int(original_kernel._handle) - + # Create a new Kernel from the handle - kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle, objcode) - assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) - + kernel_from_handle = cuda.core._module.Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + # Verify we can access kernel attributes max_threads = kernel_from_handle.attributes.max_threads_per_block() assert isinstance(max_threads, int) @@ -477,14 +477,14 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin): def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): """Test Kernel.from_handle() without providing a module""" original_kernel, _ = get_saxpy_kernel_cubin - + # Get the handle from the original kernel handle = int(original_kernel._handle) - + # Create a new Kernel from the handle without a module - kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle) - assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel) - + kernel_from_handle = cuda.core._module.Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + # Verify we can still access kernel attributes max_threads = kernel_from_handle.attributes.max_threads_per_block() assert isinstance(max_threads, int) From 2178dcbb8083d8f7bb8e82aa94e0caebe05ecba4 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 09:42:03 -0800 Subject: [PATCH 06/19] clean up --- cuda_core/tests/test_module.py | 1 - cuda_core/tests/test_program.py | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 31f7a3e303..f0c28fd673 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -435,7 +435,6 @@ def test_object_code_from_handle(get_saxpy_kernel_cubin): assert objcode_from_handle.code_type == "cubin" # Try to get a kernel from the new object code - # Note: This should work since we're reusing the handle kernel_from_handle = objcode_from_handle.get_kernel("saxpy") assert isinstance(kernel_from_handle, cuda.core._module.Kernel) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 2d9d5c5cd5..105595839a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -555,15 +555,15 @@ def test_program_from_handle_nvrtc(init_cuda): code = 'extern "C" __global__ void test_kernel() {}' original_program = Program(code, "c++") assert original_program.backend == "NVRTC" - + # Get the handle handle = int(original_program.handle) - + # Create a new program from the handle program_from_handle = Program.from_handle(handle, "NVRTC") assert program_from_handle.backend == "NVRTC" # Note: We don't own the handle, so we shouldn't close it in the from_handle instance - + # Clean up the original program original_program.close() @@ -574,14 +574,14 @@ def test_program_from_handle_nvvm(init_cuda, nvvm_ir): # Create a regular NVVM program to get a handle original_program = Program(nvvm_ir, "nvvm") assert original_program.backend == "NVVM" - + # Get the handle - handle = int(original_program.handle) if hasattr(original_program.handle, '__int__') else original_program.handle - + handle = int(original_program.handle) + # Create a new program from the handle program_from_handle = Program.from_handle(handle, "NVVM") assert program_from_handle.backend == "NVVM" - + # Clean up the original program original_program.close() From 180eeb0a3bbde71541353c9d4d5ef8d6e04fc961 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 11:47:09 -0800 Subject: [PATCH 07/19] adding more unit test to cover edge cases --- cuda_core/cuda/core/_module.py | 5 + cuda_core/tests/test_module.py | 216 +++++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index a2ecf4517f..e3bdc5ecf0 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -505,6 +505,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": a placeholder ObjectCode will be created. Note that without a proper ObjectCode, certain operations may be limited. """ + + # Validate that handle is an integer + if not isinstance(handle, int): + raise TypeError(f"handle must be an integer, got {type(handle).__name__}") + # Convert the integer handle to the appropriate driver type if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: # Try CUkernel first for newer CUDA versions diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index f0c28fd673..73a1c35299 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -488,3 +488,219 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): max_threads = kernel_from_handle.attributes.max_threads_per_block() assert isinstance(max_threads, int) assert max_threads > 0 + + +# Edge case tests for from_handle methods + + +# @pytest.mark.parametrize("invalid_handle", [0, -1, 0xDEADBEEF]) +# def test_object_code_from_handle_invalid_handles(invalid_handle): +# """Test ObjectCode.from_handle() with various invalid handle values""" +# # Invalid handles should be accepted during construction but fail when used +# objcode = ObjectCode.from_handle(invalid_handle, "cubin", symbol_mapping={}) +# assert isinstance(objcode, ObjectCode) +# assert objcode.code_type == "cubin" +# +# # Attempting to get a kernel from an invalid handle should fail with CUDAError +# with pytest.raises(CUDAError): +# objcode.get_kernel("kernel_that_does_not_exist") + + +# @pytest.mark.parametrize( +# "invalid_value", +# [ +# pytest.param("not_an_int", id="str"), +# pytest.param(3.14, id="float"), +# pytest.param(None, id="None"), +# pytest.param([123], id="list"), +# pytest.param((123,), id="tuple"), +# pytest.param({"handle": 123}, id="dict"), +# pytest.param(b"\x00\x01\x02", id="bytes"), +# pytest.param({123}, id="set"), +# pytest.param(object(), id="object"), +# ], +# ) +# def test_object_code_from_handle_type_validation(invalid_value): +# """Test ObjectCode.from_handle() with wrong handle types""" +# with pytest.raises(TypeError): +# ObjectCode.from_handle(invalid_value, "cubin", symbol_mapping={}) +# + +@pytest.mark.parametrize( + "invalid_code_type,expected_error", + [ + pytest.param("invalid_type", AssertionError, id="invalid_str"), + pytest.param("", AssertionError, id="empty_str"), + pytest.param(None, (AssertionError, TypeError), id="None"), + pytest.param(123, (AssertionError, TypeError), id="int"), + pytest.param(3.14, (AssertionError, TypeError), id="float"), + pytest.param(["cubin"], (AssertionError, TypeError), id="list"), + pytest.param(("cubin",), (AssertionError, TypeError), id="tuple"), + pytest.param({"type": "cubin"}, (AssertionError, TypeError), id="dict"), + pytest.param(b"cubin", (AssertionError, TypeError), id="bytes"), + pytest.param({"cubin"}, (AssertionError, TypeError), id="set"), + pytest.param(object(), (AssertionError, TypeError), id="object"), + ], +) +def test_object_code_from_handle_invalid_code_type(invalid_code_type, expected_error): + """Test ObjectCode.from_handle() with invalid code_type""" + with pytest.raises(expected_error): + ObjectCode.from_handle(0, invalid_code_type, symbol_mapping={}) + + +def test_object_code_from_handle_symbol_mapping_variations(): + """Test ObjectCode.from_handle() with various symbol_mapping values""" + # None symbol_mapping (should default to empty dict) + objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None) + assert objcode1._sym_map == {} + + # Empty dict + objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={}) + assert objcode2._sym_map == {} + + # Valid symbol mapping + sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"} + objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map) + assert objcode3._sym_map == sym_map + + +def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kernel_cubin): + """Test that symbol_mapping is actually used when getting kernels""" + _, original_objcode = get_saxpy_kernel_cubin + original_handle = int(original_objcode.handle) + + # Create ObjectCode with correct symbol mapping + objcode_with_map = ObjectCode.from_handle( + original_handle, + "cubin", + symbol_mapping=original_objcode._sym_map + ) + + # Should successfully get kernel using unmangled name from symbol_mapping + kernel = objcode_with_map.get_kernel("saxpy") + assert isinstance(kernel, cuda.core._module.Kernel) + + # Create ObjectCode without symbol mapping + objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={}) + + # Should fail to get kernel using unmangled name (no mapping available) + with pytest.raises(CUDAError): + objcode_no_map.get_kernel("saxpy") + + +def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin): + """Test handle lifecycle and ownership with from_handle""" + original_kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the original handle + original_handle = int(original_objcode.handle) + + # Create a new ObjectCode from the same handle + objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + + # Both should reference the same underlying CUDA module + assert int(objcode_from_handle.handle) == original_handle + + # Get a kernel from the from_handle version + kernel_from_copy = objcode_from_handle.get_kernel("saxpy") + assert isinstance(kernel_from_copy, cuda.core._module.Kernel) + + # The original should still work + kernel_from_original = original_objcode.get_kernel("saxpy") + assert isinstance(kernel_from_original, cuda.core._module.Kernel) + + # Both kernels should reference the same underlying CUDA kernel handle + # If handles are equal, they're the same kernel - no need to check attributes + assert int(kernel_from_copy._handle) == int(kernel_from_original._handle) + + +def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple ObjectCode instances from the same handle""" + original_kernel, original_objcode = get_saxpy_kernel_cubin + + # Get the original handle + original_handle = int(original_objcode.handle) + + # Create multiple ObjectCode instances from the same handle + objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) + + # All should have the same handle + assert int(objcode1.handle) == original_handle + assert int(objcode2.handle) == original_handle + assert int(objcode3.handle) == original_handle + + # All should be able to get kernels + kernel1 = objcode1.get_kernel("saxpy") + kernel2 = objcode2.get_kernel("saxpy") + kernel3 = objcode3.get_kernel("saxpy") + + assert isinstance(kernel1, cuda.core._module.Kernel) + assert isinstance(kernel2, cuda.core._module.Kernel) + assert isinstance(kernel3, cuda.core._module.Kernel) + + +# @pytest.mark.parametrize("invalid_handle", [0, -1, 0xBADC0FFEE]) +# def test_kernel_from_handle_invalid_handles(invalid_handle): +# """Test Kernel.from_handle() with various invalid handle values""" +# # Invalid handles should be accepted during construction but fail when used +# kernel = cuda.core._module.Kernel.from_handle(invalid_handle) +# assert isinstance(kernel, cuda.core._module.Kernel) +# +# # Attempting to access attributes with invalid handle should fail +# with pytest.raises(CUDAError): +# kernel.attributes.max_threads_per_block() + + +@pytest.mark.parametrize( + "invalid_value", + [ + pytest.param("not_an_int", id="str"), + pytest.param(2.71828, id="float"), + pytest.param(None, id="None"), + pytest.param({"handle": 123}, id="dict"), + pytest.param([456], id="list"), + pytest.param((789,), id="tuple"), + pytest.param(3+4j, id="complex"), + pytest.param(b"\xde\xad\xbe\xef", id="bytes"), + pytest.param({999}, id="set"), + pytest.param(object(), id="object"), + ], +) +def test_kernel_from_handle_type_validation(invalid_value): + """Test Kernel.from_handle() with wrong handle types""" + with pytest.raises(TypeError): + cuda.core._module.Kernel.from_handle(invalid_value) + + +def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with invalid module parameter""" + original_kernel, _ = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Invalid module type (should fail type assertion in _from_obj) + with pytest.raises((TypeError, AssertionError)): + cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode") + + with pytest.raises((TypeError, AssertionError)): + cuda.core._module.Kernel.from_handle(handle, mod=12345) + + +def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple Kernel instances from the same handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Create multiple Kernel instances from the same handle + kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode) + kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode) + kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode) + + # All should be valid Kernel objects + assert isinstance(kernel1, cuda.core._module.Kernel) + assert isinstance(kernel2, cuda.core._module.Kernel) + assert isinstance(kernel3, cuda.core._module.Kernel) + + # All should reference the same underlying CUDA kernel handle + assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle From 6489a6d8dcfcd0b8807b5cf593278520ac46be06 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 13:42:35 -0800 Subject: [PATCH 08/19] removing unneeded import --- cuda_core/cuda/core/experimental/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 406c2e7e26..3dbf3b7440 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -70,7 +70,6 @@ def _warn_deprecated(): VirtualMemoryResource, VirtualMemoryResourceOptions, ) -import cuda.core._module as _module # noqa: E402 from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 From 74a80f72e3f0c198f82e79b8b4cf9f53390dca2d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 14:08:38 -0800 Subject: [PATCH 09/19] removing broken tests --- cuda_core/tests/test_module.py | 94 +++++++++------------------------- 1 file changed, 24 insertions(+), 70 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 73a1c35299..00241306e9 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -492,40 +492,6 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): # Edge case tests for from_handle methods - -# @pytest.mark.parametrize("invalid_handle", [0, -1, 0xDEADBEEF]) -# def test_object_code_from_handle_invalid_handles(invalid_handle): -# """Test ObjectCode.from_handle() with various invalid handle values""" -# # Invalid handles should be accepted during construction but fail when used -# objcode = ObjectCode.from_handle(invalid_handle, "cubin", symbol_mapping={}) -# assert isinstance(objcode, ObjectCode) -# assert objcode.code_type == "cubin" -# -# # Attempting to get a kernel from an invalid handle should fail with CUDAError -# with pytest.raises(CUDAError): -# objcode.get_kernel("kernel_that_does_not_exist") - - -# @pytest.mark.parametrize( -# "invalid_value", -# [ -# pytest.param("not_an_int", id="str"), -# pytest.param(3.14, id="float"), -# pytest.param(None, id="None"), -# pytest.param([123], id="list"), -# pytest.param((123,), id="tuple"), -# pytest.param({"handle": 123}, id="dict"), -# pytest.param(b"\x00\x01\x02", id="bytes"), -# pytest.param({123}, id="set"), -# pytest.param(object(), id="object"), -# ], -# ) -# def test_object_code_from_handle_type_validation(invalid_value): -# """Test ObjectCode.from_handle() with wrong handle types""" -# with pytest.raises(TypeError): -# ObjectCode.from_handle(invalid_value, "cubin", symbol_mapping={}) -# - @pytest.mark.parametrize( "invalid_code_type,expected_error", [ @@ -553,11 +519,11 @@ def test_object_code_from_handle_symbol_mapping_variations(): # None symbol_mapping (should default to empty dict) objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None) assert objcode1._sym_map == {} - + # Empty dict objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={}) assert objcode2._sym_map == {} - + # Valid symbol mapping sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"} objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map) @@ -568,21 +534,21 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern """Test that symbol_mapping is actually used when getting kernels""" _, original_objcode = get_saxpy_kernel_cubin original_handle = int(original_objcode.handle) - + # Create ObjectCode with correct symbol mapping objcode_with_map = ObjectCode.from_handle( - original_handle, - "cubin", + original_handle, + "cubin", symbol_mapping=original_objcode._sym_map ) - + # Should successfully get kernel using unmangled name from symbol_mapping kernel = objcode_with_map.get_kernel("saxpy") assert isinstance(kernel, cuda.core._module.Kernel) - + # Create ObjectCode without symbol mapping objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={}) - + # Should fail to get kernel using unmangled name (no mapping available) with pytest.raises(CUDAError): objcode_no_map.get_kernel("saxpy") @@ -591,24 +557,24 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin): """Test handle lifecycle and ownership with from_handle""" original_kernel, original_objcode = get_saxpy_kernel_cubin - + # Get the original handle original_handle = int(original_objcode.handle) - + # Create a new ObjectCode from the same handle objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - + # Both should reference the same underlying CUDA module assert int(objcode_from_handle.handle) == original_handle - + # Get a kernel from the from_handle version kernel_from_copy = objcode_from_handle.get_kernel("saxpy") assert isinstance(kernel_from_copy, cuda.core._module.Kernel) - + # The original should still work kernel_from_original = original_objcode.get_kernel("saxpy") assert isinstance(kernel_from_original, cuda.core._module.Kernel) - + # Both kernels should reference the same underlying CUDA kernel handle # If handles are equal, they're the same kernel - no need to check attributes assert int(kernel_from_copy._handle) == int(kernel_from_original._handle) @@ -617,42 +583,30 @@ def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin): def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin): """Test creating multiple ObjectCode instances from the same handle""" original_kernel, original_objcode = get_saxpy_kernel_cubin - + # Get the original handle original_handle = int(original_objcode.handle) - + # Create multiple ObjectCode instances from the same handle objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - + # All should have the same handle assert int(objcode1.handle) == original_handle assert int(objcode2.handle) == original_handle assert int(objcode3.handle) == original_handle - + # All should be able to get kernels kernel1 = objcode1.get_kernel("saxpy") kernel2 = objcode2.get_kernel("saxpy") kernel3 = objcode3.get_kernel("saxpy") - + assert isinstance(kernel1, cuda.core._module.Kernel) assert isinstance(kernel2, cuda.core._module.Kernel) assert isinstance(kernel3, cuda.core._module.Kernel) -# @pytest.mark.parametrize("invalid_handle", [0, -1, 0xBADC0FFEE]) -# def test_kernel_from_handle_invalid_handles(invalid_handle): -# """Test Kernel.from_handle() with various invalid handle values""" -# # Invalid handles should be accepted during construction but fail when used -# kernel = cuda.core._module.Kernel.from_handle(invalid_handle) -# assert isinstance(kernel, cuda.core._module.Kernel) -# -# # Attempting to access attributes with invalid handle should fail -# with pytest.raises(CUDAError): -# kernel.attributes.max_threads_per_block() - - @pytest.mark.parametrize( "invalid_value", [ @@ -678,11 +632,11 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with invalid module parameter""" original_kernel, _ = get_saxpy_kernel_cubin handle = int(original_kernel._handle) - + # Invalid module type (should fail type assertion in _from_obj) with pytest.raises((TypeError, AssertionError)): cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode") - + with pytest.raises((TypeError, AssertionError)): cuda.core._module.Kernel.from_handle(handle, mod=12345) @@ -691,16 +645,16 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): """Test creating multiple Kernel instances from the same handle""" original_kernel, objcode = get_saxpy_kernel_cubin handle = int(original_kernel._handle) - + # Create multiple Kernel instances from the same handle kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode) kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode) kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode) - + # All should be valid Kernel objects assert isinstance(kernel1, cuda.core._module.Kernel) assert isinstance(kernel2, cuda.core._module.Kernel) assert isinstance(kernel3, cuda.core._module.Kernel) - + # All should reference the same underlying CUDA kernel handle assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle From 30dfcdc5240f8bb08a2c9ade6a3d3592ac825304 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 14:25:43 -0800 Subject: [PATCH 10/19] pre-commit changes --- cuda_core/cuda/core/_module.py | 29 +++++++++++++++-------------- cuda_core/cuda/core/_program.py | 15 ++++++--------- cuda_core/tests/test_module.py | 9 +++------ 3 files changed, 24 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index e3bdc5ecf0..c2eb200de5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import sys import threading import weakref from collections import namedtuple @@ -19,7 +18,6 @@ ) from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition - # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. # For regular Python builds with GIL, the lock overhead is minimal and the code remains safe. @@ -37,14 +35,15 @@ }, } + def _lazy_init(): """ Initialize module-level state in a thread-safe manner. - + This function is thread-safe and suitable for both: - Regular Python builds (with GIL) - Python 3.13t free-threaded builds (without GIL) - + Uses double-checked locking pattern for performance: - Fast path: check without lock if already initialized - Slow path: acquire lock and initialize if needed @@ -76,7 +75,7 @@ def _lazy_init(): _driver_ver = handle_return(driver.cuDriverGetVersion()) if _py_major_ver >= 12 and _driver_ver >= 12040: _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo - + # Mark as initialized (must be last to ensure all state is set) _inited = True @@ -102,7 +101,7 @@ def _get_kernel_ctypes(): def _get_backend_version(): """Get the backend version ("new" or "old") based on CUDA version. - + Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API). """ return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" @@ -493,7 +492,7 @@ def occupancy(self) -> KernelOccupancy: def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": """Creates a new :obj:`Kernel` object from a foreign kernel handle. - Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. + Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. Parameters ---------- @@ -505,11 +504,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": a placeholder ObjectCode will be created. Note that without a proper ObjectCode, certain operations may be limited. """ - + # Validate that handle is an integer if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - + # Convert the integer handle to the appropriate driver type if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: # Try CUkernel first for newer CUDA versions @@ -517,14 +516,14 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": else: # Use CUfunction for older versions kernel_obj = driver.CUfunction(handle) - + # If no module provided, create a placeholder if mod is None: # Create a placeholder ObjectCode that won't try to load anything mod = ObjectCode._init(b"", "cubin") # Set a dummy handle to prevent lazy loading mod._handle = 1 # Non-null placeholder - + return Kernel._from_obj(kernel_obj, mod) @@ -694,7 +693,9 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) @staticmethod - def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode": + def from_handle( + handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None + ) -> "ObjectCode": """Create a new :obj:`ObjectCode` object from a foreign module handle. Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object. @@ -718,13 +719,13 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol # Create an ObjectCode instance with a placeholder module # The handle will be set directly, bypassing the lazy loading obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping) - + # Set the handle directly from the foreign handle if obj._backend_version == "new": obj._handle = driver.CUlibrary(handle) else: obj._handle = driver.CUmodule(handle) - + return obj # TODO: do we want to unload in a finalizer? Probably not.. diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index f0ac4212f0..cffd500310 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -902,32 +902,29 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr backend = backend.upper() if backend not in ("NVRTC", "NVVM"): raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'") - + # Create a new Program instance without going through __init__ prog = object.__new__(Program) - + # Initialize the members needed for finalization # Note: We pass None as the program_obj to avoid finalization since # we don't own the handle - prog._mnff = Program._MembersNeededForFinalize.__new__( - Program._MembersNeededForFinalize - ) + prog._mnff = Program._MembersNeededForFinalize.__new__(Program._MembersNeededForFinalize) prog._mnff.handle = None # Don't manage the foreign handle prog._mnff.backend = backend - + # Store the backend and options prog._backend = backend prog._options = check_or_create_options(ProgramOptions, options, "Program options") prog._linker = None - + # Store the handle directly without taking ownership # This means the finalizer won't destroy it if backend == "NVRTC": prog._mnff.handle = nvrtc.nvrtcProgram(handle) elif backend == "NVVM": - nvvm_module = _get_nvvm_module() # For NVVM, we just store the handle as-is # The actual NVVM program handle is opaque prog._mnff.handle = handle - + return prog diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 00241306e9..fc07fe7741 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -492,6 +492,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): # Edge case tests for from_handle methods + @pytest.mark.parametrize( "invalid_code_type,expected_error", [ @@ -536,11 +537,7 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern original_handle = int(original_objcode.handle) # Create ObjectCode with correct symbol mapping - objcode_with_map = ObjectCode.from_handle( - original_handle, - "cubin", - symbol_mapping=original_objcode._sym_map - ) + objcode_with_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) # Should successfully get kernel using unmangled name from symbol_mapping kernel = objcode_with_map.get_kernel("saxpy") @@ -616,7 +613,7 @@ def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin): pytest.param({"handle": 123}, id="dict"), pytest.param([456], id="list"), pytest.param((789,), id="tuple"), - pytest.param(3+4j, id="complex"), + pytest.param(3 + 4j, id="complex"), pytest.param(b"\xde\xad\xbe\xef", id="bytes"), pytest.param({999}, id="set"), pytest.param(object(), id="object"), From 47ad63e1629f731470238ea32ad257d949dd266d Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Fri, 19 Dec 2025 14:33:46 -0800 Subject: [PATCH 11/19] clean up --- cuda_core/cuda/core/_program.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index cffd500310..d936ca0a26 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -868,12 +868,6 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr Uses a nvrtcProgram or NVVM program handle represented as a Python int to create a new :obj:`Program` object. - Note - ---- - Program lifetime is not managed, foreign object must remain - alive while this program is active. The handle should not be - destroyed externally while this Program object is in use. - Parameters ---------- handle : int @@ -887,17 +881,6 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr options : :obj:`ProgramOptions`, optional Program options that may be used for subsequent operations. If not provided, default options will be created. - - Returns - ------- - :obj:`Program` - Newly created program object. - - Raises - ------ - ValueError - If an unsupported backend is specified. - """ backend = backend.upper() if backend not in ("NVRTC", "NVVM"): From ba14d5df7e1dfcf58c8b6028a52897c2c9608654 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 12 Jan 2026 14:37:21 -0800 Subject: [PATCH 12/19] Removing Object.from_handle and Program.from_handle --- cuda_core/cuda/core/_module.py | 36 -------- cuda_core/cuda/core/_program.py | 51 ----------- cuda_core/tests/test_module.py | 149 -------------------------------- cuda_core/tests/test_program.py | 43 --------- 4 files changed, 279 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index c2eb200de5..0877eabb99 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -692,42 +692,6 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d """ return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) - @staticmethod - def from_handle( - handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None - ) -> "ObjectCode": - """Create a new :obj:`ObjectCode` object from a foreign module handle. - - Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object. - - Parameters - ---------- - handle : int - Module handle representing the address of a foreign - module object (CUmodule or CUlibrary). - code_type : str, optional - The type of code object this handle represents. Must be one of - "cubin", "ptx", "ltoir", "fatbin", "object", or "library". - (Default: "cubin") - name : str, optional - A human-readable identifier representing this code object. - symbol_mapping : dict, optional - A dictionary specifying how the unmangled symbol names (as keys) - should be mapped to the mangled names before trying to retrieve - them (default to no mappings). - """ - # Create an ObjectCode instance with a placeholder module - # The handle will be set directly, bypassing the lazy loading - obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping) - - # Set the handle directly from the foreign handle - if obj._backend_version == "new": - obj._handle = driver.CUlibrary(handle) - else: - obj._handle = driver.CUmodule(handle) - - return obj - # TODO: do we want to unload in a finalizer? Probably not.. def _lazy_load_module(self, *args, **kwargs): diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index d936ca0a26..121dd13963 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -860,54 +860,3 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. """ return self._mnff.handle - - @staticmethod - def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Program: - """Create a new :obj:`Program` object from a foreign program handle. - - Uses a nvrtcProgram or NVVM program handle represented as a Python int - to create a new :obj:`Program` object. - - Parameters - ---------- - handle : int - Program handle representing the address of a foreign - program object (nvrtcProgram for NVRTC backend, or - NVVM program handle for NVVM backend). - backend : str - The backend type of the program handle. Must be either - "NVRTC" or "NVVM". This determines how the handle is - interpreted and used. - options : :obj:`ProgramOptions`, optional - Program options that may be used for subsequent operations. - If not provided, default options will be created. - """ - backend = backend.upper() - if backend not in ("NVRTC", "NVVM"): - raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'") - - # Create a new Program instance without going through __init__ - prog = object.__new__(Program) - - # Initialize the members needed for finalization - # Note: We pass None as the program_obj to avoid finalization since - # we don't own the handle - prog._mnff = Program._MembersNeededForFinalize.__new__(Program._MembersNeededForFinalize) - prog._mnff.handle = None # Don't manage the foreign handle - prog._mnff.backend = backend - - # Store the backend and options - prog._backend = backend - prog._options = check_or_create_options(ProgramOptions, options, "Program options") - prog._linker = None - - # Store the handle directly without taking ownership - # This means the finalizer won't destroy it - if backend == "NVRTC": - prog._mnff.handle = nvrtc.nvrtcProgram(handle) - elif backend == "NVVM": - # For NVVM, we just store the handle as-is - # The actual NVVM program handle is opaque - prog._mnff.handle = handle - - return prog diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index fc07fe7741..89110eea0e 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -421,41 +421,6 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type - -def test_object_code_from_handle(get_saxpy_kernel_cubin): - """Test ObjectCode.from_handle() with a valid handle""" - kernel, original_objcode = get_saxpy_kernel_cubin - - # Get the handle from the original object code - handle = int(original_objcode.handle) - - # Create a new ObjectCode from the handle - objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map) - assert isinstance(objcode_from_handle, ObjectCode) - assert objcode_from_handle.code_type == "cubin" - - # Try to get a kernel from the new object code - kernel_from_handle = objcode_from_handle.get_kernel("saxpy") - assert isinstance(kernel_from_handle, cuda.core._module.Kernel) - - -def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx): - """Test ObjectCode.from_handle() with PTX code type""" - ptx, original_objcode = get_saxpy_kernel_ptx - - if not Program._can_load_generated_ptx(): - pytest.skip("PTX version too new for current driver") - - # Force loading to get a handle - _ = original_objcode.get_kernel("saxpy") - handle = int(original_objcode.handle) - - # Create a new ObjectCode from the handle with PTX code type - objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map) - assert isinstance(objcode_from_handle, ObjectCode) - assert objcode_from_handle.code_type == "ptx" - - def test_kernel_from_handle(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with a valid handle""" original_kernel, objcode = get_saxpy_kernel_cubin @@ -490,120 +455,6 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): assert max_threads > 0 -# Edge case tests for from_handle methods - - -@pytest.mark.parametrize( - "invalid_code_type,expected_error", - [ - pytest.param("invalid_type", AssertionError, id="invalid_str"), - pytest.param("", AssertionError, id="empty_str"), - pytest.param(None, (AssertionError, TypeError), id="None"), - pytest.param(123, (AssertionError, TypeError), id="int"), - pytest.param(3.14, (AssertionError, TypeError), id="float"), - pytest.param(["cubin"], (AssertionError, TypeError), id="list"), - pytest.param(("cubin",), (AssertionError, TypeError), id="tuple"), - pytest.param({"type": "cubin"}, (AssertionError, TypeError), id="dict"), - pytest.param(b"cubin", (AssertionError, TypeError), id="bytes"), - pytest.param({"cubin"}, (AssertionError, TypeError), id="set"), - pytest.param(object(), (AssertionError, TypeError), id="object"), - ], -) -def test_object_code_from_handle_invalid_code_type(invalid_code_type, expected_error): - """Test ObjectCode.from_handle() with invalid code_type""" - with pytest.raises(expected_error): - ObjectCode.from_handle(0, invalid_code_type, symbol_mapping={}) - - -def test_object_code_from_handle_symbol_mapping_variations(): - """Test ObjectCode.from_handle() with various symbol_mapping values""" - # None symbol_mapping (should default to empty dict) - objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None) - assert objcode1._sym_map == {} - - # Empty dict - objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={}) - assert objcode2._sym_map == {} - - # Valid symbol mapping - sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"} - objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map) - assert objcode3._sym_map == sym_map - - -def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kernel_cubin): - """Test that symbol_mapping is actually used when getting kernels""" - _, original_objcode = get_saxpy_kernel_cubin - original_handle = int(original_objcode.handle) - - # Create ObjectCode with correct symbol mapping - objcode_with_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - - # Should successfully get kernel using unmangled name from symbol_mapping - kernel = objcode_with_map.get_kernel("saxpy") - assert isinstance(kernel, cuda.core._module.Kernel) - - # Create ObjectCode without symbol mapping - objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={}) - - # Should fail to get kernel using unmangled name (no mapping available) - with pytest.raises(CUDAError): - objcode_no_map.get_kernel("saxpy") - - -def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin): - """Test handle lifecycle and ownership with from_handle""" - original_kernel, original_objcode = get_saxpy_kernel_cubin - - # Get the original handle - original_handle = int(original_objcode.handle) - - # Create a new ObjectCode from the same handle - objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - - # Both should reference the same underlying CUDA module - assert int(objcode_from_handle.handle) == original_handle - - # Get a kernel from the from_handle version - kernel_from_copy = objcode_from_handle.get_kernel("saxpy") - assert isinstance(kernel_from_copy, cuda.core._module.Kernel) - - # The original should still work - kernel_from_original = original_objcode.get_kernel("saxpy") - assert isinstance(kernel_from_original, cuda.core._module.Kernel) - - # Both kernels should reference the same underlying CUDA kernel handle - # If handles are equal, they're the same kernel - no need to check attributes - assert int(kernel_from_copy._handle) == int(kernel_from_original._handle) - - -def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin): - """Test creating multiple ObjectCode instances from the same handle""" - original_kernel, original_objcode = get_saxpy_kernel_cubin - - # Get the original handle - original_handle = int(original_objcode.handle) - - # Create multiple ObjectCode instances from the same handle - objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map) - - # All should have the same handle - assert int(objcode1.handle) == original_handle - assert int(objcode2.handle) == original_handle - assert int(objcode3.handle) == original_handle - - # All should be able to get kernels - kernel1 = objcode1.get_kernel("saxpy") - kernel2 = objcode2.get_kernel("saxpy") - kernel3 = objcode3.get_kernel("saxpy") - - assert isinstance(kernel1, cuda.core._module.Kernel) - assert isinstance(kernel2, cuda.core._module.Kernel) - assert isinstance(kernel3, cuda.core._module.Kernel) - - @pytest.mark.parametrize( "invalid_value", [ diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 105595839a..9a9e4926ae 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -547,46 +547,3 @@ def test_program_options_as_bytes_nvvm_unsupported_option(): options = ProgramOptions(arch="sm_80", lineinfo=True) with pytest.raises(CUDAError, match="not supported by NVVM backend"): options.as_bytes("nvvm") - - -def test_program_from_handle_nvrtc(init_cuda): - """Test Program.from_handle() with NVRTC backend""" - # Create a regular program to get a handle - code = 'extern "C" __global__ void test_kernel() {}' - original_program = Program(code, "c++") - assert original_program.backend == "NVRTC" - - # Get the handle - handle = int(original_program.handle) - - # Create a new program from the handle - program_from_handle = Program.from_handle(handle, "NVRTC") - assert program_from_handle.backend == "NVRTC" - # Note: We don't own the handle, so we shouldn't close it in the from_handle instance - - # Clean up the original program - original_program.close() - - -@nvvm_available -def test_program_from_handle_nvvm(init_cuda, nvvm_ir): - """Test Program.from_handle() with NVVM backend""" - # Create a regular NVVM program to get a handle - original_program = Program(nvvm_ir, "nvvm") - assert original_program.backend == "NVVM" - - # Get the handle - handle = int(original_program.handle) - - # Create a new program from the handle - program_from_handle = Program.from_handle(handle, "NVVM") - assert program_from_handle.backend == "NVVM" - - # Clean up the original program - original_program.close() - - -def test_program_from_handle_invalid_backend(): - """Test Program.from_handle() with invalid backend""" - with pytest.raises(ValueError, match="Unsupported backend 'INVALID'"): - Program.from_handle(0, "INVALID") From 566842dcf4b1cc0fea304111f32ab4af51c7f7a0 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 12 Jan 2026 14:39:02 -0800 Subject: [PATCH 13/19] whitespace --- cuda_core/tests/test_module.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 89110eea0e..c6415c93eb 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -421,6 +421,7 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type + def test_kernel_from_handle(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with a valid handle""" original_kernel, objcode = get_saxpy_kernel_cubin From cffa98a7fbc27cbd79133369d04713b1618087e3 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 08:11:11 -0800 Subject: [PATCH 14/19] adding function memoization --- cuda_bindings/pixi.lock | 80 +++++++++++++++++----------------- cuda_core/cuda/core/_module.py | 2 + 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index f84d569dff..fb3d0ad393 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -26,7 +26,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -72,7 +72,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -220,7 +220,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -263,7 +263,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -401,7 +401,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -542,7 +542,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -588,7 +588,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -736,7 +736,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -779,7 +779,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -917,7 +917,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -1461,10 +1461,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1483,7 +1483,7 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - vc >=14.1,<15 - vc14_runtime >=14.16.27033 @@ -1502,10 +1502,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1759,39 +1759,39 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 24082 timestamp: 1764883821516 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda - sha256: d6b326bdbf6fa7bfa0fa617dda547dc585159816b8f130f2535740c4e53fd12c - md5: 7ef874b2dc4ca388ecef3b3893305459 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35479197 - timestamp: 1764880529154 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda - sha256: 5e10ce4dd84c22c73e58a9f8359fb1e5ef4596afd3a0bc12b9fbde73b388ec0d - md5: 0473ebdb01f2f4024177b024fc19fa72 + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda + sha256: a1ec61512cecb093797e00590ad381ecd5852d2a32440ff22b34f78c743f3d5a + md5: 34da2ff2c64054d65eb8f04d76c40cca depends: - arm-variant * sbsa - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 33619044 - timestamp: 1764880672755 -- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda - sha256: 3f67de8a9eb182fa20bbc80bda7185afb676cfe8894f6a0549173bd752a7d2f4 - md5: 7b42337a35cd887ec3eed254b5ed606f + size: 33616576 + timestamp: 1768272976976 +- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda + sha256: a8869b7d997722f90b9f8a602dc0b1d0d497f2a6f3561dc89383aeb2cd379a66 + md5: 372d3c612a832d5f87d8dd9702d487b2 depends: - cuda-version >=13.1,<13.2.0a0 - ucrt >=10.0.20348.0 - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31012754 - timestamp: 1764880740086 + size: 31006920 + timestamp: 1768273107962 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda sha256: 84f971ab146e2c822103cfe06f478ece244747a6f2aa565be639a4709d0a1579 md5: 9250c651d8758c8f665dff7519ef21ff @@ -3275,9 +3275,9 @@ packages: license_family: BSD size: 68079 timestamp: 1765819124349 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda - sha256: 6aabad84132b1f3ee367e5d24291febf8a11d9a7f3967a64fc07e77d9b0b22df - md5: 9cb68a85f8c08f0512931f944f6a75df +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 @@ -3285,11 +3285,11 @@ packages: - libstdcxx >=14 - rdma-core >=60.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 990030 - timestamp: 1764881892686 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda - sha256: d03963dc7708ded20340176ade987fc4c3e49da4f7b139a85e69ca7eb413f57a - md5: 315e1b144eaf890519fc63049b6e9228 + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda + sha256: 7451b3e2204e6cad21db501052dfe595c3440213ef3e22c0f9c784012f6a8419 + md5: ee60a24c702ce02de95ae1982c4841d8 depends: - __glibc >=2.28,<3.0.a0 - arm-variant * sbsa @@ -3300,8 +3300,8 @@ packages: constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 887547 - timestamp: 1764881951574 + size: 891752 + timestamp: 1768273724252 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680 md5: 6c77a605a7a689d17d4819c0f8ac9a00 diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 0877eabb99..ccdfb30c0e 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -7,6 +7,7 @@ from collections import namedtuple from typing import Union from warnings import warn +import functools from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config @@ -99,6 +100,7 @@ def _get_kernel_ctypes(): return _kernel_ctypes +@functools.cache def _get_backend_version(): """Get the backend version ("new" or "old") based on CUDA version. From b6a485d2034ecbf5b4a106cd824fce1eed85c341 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 08:34:31 -0800 Subject: [PATCH 15/19] accessing Kernel directly --- cuda_core/tests/test_module.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index c6415c93eb..20c358d46d 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,7 +7,7 @@ import cuda.core import pytest -from cuda.core import Device, ObjectCode, Program, ProgramOptions +from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: @@ -430,8 +430,8 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin): handle = int(original_kernel._handle) # Create a new Kernel from the handle - kernel_from_handle = cuda.core._module.Kernel.from_handle(handle, objcode) - assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + kernel_from_handle = Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, Kernel) # Verify we can access kernel attributes max_threads = kernel_from_handle.attributes.max_threads_per_block() @@ -447,8 +447,8 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): handle = int(original_kernel._handle) # Create a new Kernel from the handle without a module - kernel_from_handle = cuda.core._module.Kernel.from_handle(handle) - assert isinstance(kernel_from_handle, cuda.core._module.Kernel) + kernel_from_handle = Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, Kernel) # Verify we can still access kernel attributes max_threads = kernel_from_handle.attributes.max_threads_per_block() @@ -474,7 +474,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): def test_kernel_from_handle_type_validation(invalid_value): """Test Kernel.from_handle() with wrong handle types""" with pytest.raises(TypeError): - cuda.core._module.Kernel.from_handle(invalid_value) + Kernel.from_handle(invalid_value) def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): @@ -484,10 +484,10 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): # Invalid module type (should fail type assertion in _from_obj) with pytest.raises((TypeError, AssertionError)): - cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode") + Kernel.from_handle(handle, mod="not_an_objectcode") with pytest.raises((TypeError, AssertionError)): - cuda.core._module.Kernel.from_handle(handle, mod=12345) + Kernel.from_handle(handle, mod=12345) def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): @@ -496,14 +496,14 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): handle = int(original_kernel._handle) # Create multiple Kernel instances from the same handle - kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode) - kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode) - kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode) + kernel1 = Kernel.from_handle(handle, objcode) + kernel2 = Kernel.from_handle(handle, objcode) + kernel3 = Kernel.from_handle(handle, objcode) # All should be valid Kernel objects - assert isinstance(kernel1, cuda.core._module.Kernel) - assert isinstance(kernel2, cuda.core._module.Kernel) - assert isinstance(kernel3, cuda.core._module.Kernel) + assert isinstance(kernel1, Kernel) + assert isinstance(kernel2, Kernel) + assert isinstance(kernel3, Kernel) # All should reference the same underlying CUDA kernel handle assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle From df1c6e1840738a741a09918d9badbc29903508ff Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 10:52:54 -0800 Subject: [PATCH 16/19] feedback --- cuda_core/cuda/core/_module.py | 69 ++++++++++++++++++++++++++++------ cuda_core/tests/test_module.py | 7 ++++ 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index ccdfb30c0e..a103e8ab71 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -17,7 +17,7 @@ assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. @@ -25,6 +25,7 @@ _init_lock = threading.Lock() _inited = False _py_major_ver = None +_py_minor_ver = None _driver_ver = None _kernel_ctypes = None _backend = { @@ -60,9 +61,9 @@ def _lazy_init(): if _inited: return - global _py_major_ver, _driver_ver, _kernel_ctypes + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes # binding availability depends on cuda-python version - _py_major_ver, _ = get_binding_version() + _py_major_ver, _py_minor_ver = get_binding_version() if _py_major_ver >= 12: _backend["new"] = { "file": driver.cuLibraryLoadFromFile, @@ -88,6 +89,12 @@ def _get_py_major_ver(): return _py_major_ver +def _get_py_minor_ver(): + """Get the Python binding minor version, initializing if needed.""" + _lazy_init() + return _py_minor_ver + + def _get_driver_ver(): """Get the CUDA driver version, initializing if needed.""" _lazy_init() @@ -100,13 +107,37 @@ def _get_kernel_ctypes(): return _kernel_ctypes +@functools.cache +def _is_cuda_12_plus_backend() -> bool: + """Return True when the CUDA 12+ (cuLibrary) backend is available/active.""" + return _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000 + + @functools.cache def _get_backend_version(): """Get the backend version ("new" or "old") based on CUDA version. Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API). """ - return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old" + return "new" if _is_cuda_12_plus_backend() else "old" + + +@functools.cache +def _is_cukernel_get_library_supported() -> bool: + """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. + + Requires cuda-python bindings >= 12.5 and driver >= 12.5. + """ + return ( + (_get_py_major_ver(), _get_py_minor_ver()) >= (12, 5) + and _get_driver_ver() >= 12050 + and hasattr(driver, "cuKernelGetLibrary") + ) + + +def _make_dummy_library_handle(): + """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" + return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 class KernelAttributes: @@ -511,20 +542,36 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to the appropriate driver type - if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000: - # Try CUkernel first for newer CUDA versions + # Convert the integer handle to the appropriate driver type. + # + # - For the CUDA 12+ "new" backend, kernels are CUkernel handles. + # - For the legacy "old" backend, kernels are CUfunction handles. + if _is_cuda_12_plus_backend(): kernel_obj = driver.CUkernel(handle) else: - # Use CUfunction for older versions kernel_obj = driver.CUfunction(handle) # If no module provided, create a placeholder if mod is None: - # Create a placeholder ObjectCode that won't try to load anything + if not _is_cuda_12_plus_backend(): + raise NotImplementedError( + "Kernel.from_handle(..., mod=None) is only supported for CUkernel handles " + "(CUDA 12 'new' backend). For CUfunction handles, please pass the owning " + "ObjectCode explicitly via the 'mod' argument." + ) + + # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via + # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall + # back to a non-null dummy handle purely to disable lazy loading. mod = ObjectCode._init(b"", "cubin") - # Set a dummy handle to prevent lazy loading - mod._handle = 1 # Non-null placeholder + if _is_cukernel_get_library_supported(): + try: + mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) + except (CUDAError, RuntimeError): + # Best-effort: don't fail construction if inverse lookup fails. + mod._handle = _make_dummy_library_handle() + else: + mod._handle = _make_dummy_library_handle() return Kernel._from_obj(kernel_obj, mod) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 20c358d46d..9265c96bf4 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -8,6 +8,7 @@ import cuda.core import pytest from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel +from cuda.core._module import _is_cuda_12_plus_backend from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: @@ -446,6 +447,12 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): # Get the handle from the original kernel handle = int(original_kernel._handle) + # Kernel.from_handle(..., mod=None) is only supported on the CUDA 12 "new" backend. + if not _is_cuda_12_plus_backend(): + with pytest.raises(NotImplementedError): + Kernel.from_handle(handle) + pytest.skip("mod=None only supported on CUDA 12+ backend") + # Create a new Kernel from the handle without a module kernel_from_handle = Kernel.from_handle(handle) assert isinstance(kernel_from_handle, Kernel) From 5f01231ce54638d670adc2e36ba533642d17fb9b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 11:40:47 -0800 Subject: [PATCH 17/19] precommit --- cuda_core/cuda/core/_module.py | 2 +- cuda_core/tests/test_module.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index a103e8ab71..ed98044ae5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,12 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools import threading import weakref from collections import namedtuple from typing import Union from warnings import warn -import functools from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 9265c96bf4..aad867cc68 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,7 +7,7 @@ import cuda.core import pytest -from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel +from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions from cuda.core._module import _is_cuda_12_plus_backend from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return From ca0b7da2263e85855709c3f62c5c10c00df9ed54 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 14:34:24 -0800 Subject: [PATCH 18/19] removing the CUDA11 code path --- cuda_core/cuda/core/_launcher.pyx | 12 ++-- cuda_core/cuda/core/_module.py | 109 +++++++----------------------- cuda_core/tests/test_module.py | 8 +-- 3 files changed, 33 insertions(+), 96 deletions(-) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index c575ad9bd0..9559f7697a 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -78,14 +78,14 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern cdef void** args_ptr = (ker_args.ptr) # TODO: cythonize Module/Kernel/... - # Note: CUfunction and CUkernel are interchangeable + # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to + # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable + # for kernel launch purposes. cdef cydriver.CUfunction func_handle = ((kernel._handle)) - # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care - # about the CUfunction/CUkernel difference (which depends on whether the "old" or - # "new" module loading APIs are in use). We check both binding & driver versions here - # mainly to see if the "Ex" API is available and if so we use it, as it's more feature - # rich. + # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx). + # We check both binding & driver versions here mainly to see if the "Ex" API is + # available and if so we use it, as it's more feature rich. if _use_ex: drv_cfg = conf._to_native_launch_config() drv_cfg.hStream = as_cu(s._h_stream) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index ed98044ae5..603ad07eb0 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -28,14 +28,7 @@ _py_minor_ver = None _driver_ver = None _kernel_ctypes = None -_backend = { - "old": { - "file": driver.cuModuleLoad, - "data": driver.cuModuleLoadDataEx, - "kernel": driver.cuModuleGetFunction, - "attribute": driver.cuFuncGetAttribute, - }, -} +_backend = {} def _lazy_init(): @@ -61,22 +54,19 @@ def _lazy_init(): if _inited: return - global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend # binding availability depends on cuda-python version _py_major_ver, _py_minor_ver = get_binding_version() - if _py_major_ver >= 12: - _backend["new"] = { - "file": driver.cuLibraryLoadFromFile, - "data": driver.cuLibraryLoadData, - "kernel": driver.cuLibraryGetKernel, - "attribute": driver.cuKernelGetAttribute, - } - _kernel_ctypes = (driver.CUfunction, driver.CUkernel) - else: - _kernel_ctypes = (driver.CUfunction,) + _backend = { + "file": driver.cuLibraryLoadFromFile, + "data": driver.cuLibraryLoadData, + "kernel": driver.cuLibraryGetKernel, + "attribute": driver.cuKernelGetAttribute, + } + _kernel_ctypes = (driver.CUkernel,) _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _py_major_ver >= 12 and _driver_ver >= 12040: - _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo + if _driver_ver >= 12040: + _backend["paraminfo"] = driver.cuKernelGetParamInfo # Mark as initialized (must be last to ensure all state is set) _inited = True @@ -107,21 +97,6 @@ def _get_kernel_ctypes(): return _kernel_ctypes -@functools.cache -def _is_cuda_12_plus_backend() -> bool: - """Return True when the CUDA 12+ (cuLibrary) backend is available/active.""" - return _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000 - - -@functools.cache -def _get_backend_version(): - """Get the backend version ("new" or "old") based on CUDA version. - - Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API). - """ - return "new" if _is_cuda_12_plus_backend() else "old" - - @functools.cache def _is_cukernel_get_library_supported() -> bool: """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. @@ -144,7 +119,7 @@ class KernelAttributes: def __new__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") - slots = ("_kernel", "_cache", "_backend_version", "_loader") + slots = ("_kernel", "_cache", "_loader") @classmethod def _init(cls, kernel): @@ -152,8 +127,9 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = _get_backend_version() - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend return self def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: @@ -166,15 +142,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - if self._backend_version == "new": - result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) - else: # "old" backend - warn( - "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ", - RuntimeWarning, - stacklevel=2, - ) - result = handle_return(self._loader["attribute"](attribute, kernel._handle)) + result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result @@ -480,8 +448,6 @@ def attributes(self) -> KernelAttributes: def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: attr_impl = self.attributes - if attr_impl._backend_version != "new": - raise NotImplementedError("New backend is required") if "paraminfo" not in attr_impl._loader: driver_ver = _get_driver_ver() raise NotImplementedError( @@ -525,13 +491,13 @@ def occupancy(self) -> KernelOccupancy: def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": """Creates a new :obj:`Kernel` object from a foreign kernel handle. - Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. + Uses a CUkernel pointer address to create a new :obj:`Kernel` object. Parameters ---------- handle : int Kernel handle representing the address of a foreign - kernel object (CUfunction or CUkernel). + kernel object (CUkernel). mod : :obj:`ObjectCode`, optional The ObjectCode object associated with this kernel. If not provided, a placeholder ObjectCode will be created. Note that without a proper @@ -542,24 +508,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to the appropriate driver type. - # - # - For the CUDA 12+ "new" backend, kernels are CUkernel handles. - # - For the legacy "old" backend, kernels are CUfunction handles. - if _is_cuda_12_plus_backend(): - kernel_obj = driver.CUkernel(handle) - else: - kernel_obj = driver.CUfunction(handle) + # Convert the integer handle to CUkernel driver type + kernel_obj = driver.CUkernel(handle) # If no module provided, create a placeholder if mod is None: - if not _is_cuda_12_plus_backend(): - raise NotImplementedError( - "Kernel.from_handle(..., mod=None) is only supported for CUkernel handles " - "(CUDA 12 'new' backend). For CUfunction handles, please pass the owning " - "ObjectCode explicitly via the 'mod' argument." - ) - # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall # back to a non-null dummy handle purely to disable lazy loading. @@ -591,14 +544,9 @@ class ObjectCode: like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly from all other possible code types should be avoided in favor of compilation through :class:`~cuda.core.Program` - - Note - ---- - Usage under CUDA 11.x will only load to the current device - context. """ - __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map", "_name") + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") def __new__(self, *args, **kwargs): @@ -615,8 +563,9 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None # handle is assigned during _lazy_load self._handle = None - self._backend_version = _get_backend_version() - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend self._code_type = code_type self._module = module @@ -749,16 +698,10 @@ def _lazy_load_module(self, *args, **kwargs): module = self._module assert_type_str_or_bytes_like(module) if isinstance(module, str): - if self._backend_version == "new": - self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["file"](module.encode())) + self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) return if isinstance(module, (bytes, bytearray)): - if self._backend_version == "new": - self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["data"](module, 0, [], [])) + self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) return raise_code_path_meant_to_be_unreachable() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index aad867cc68..f9bbcd3e4c 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -8,7 +8,6 @@ import cuda.core import pytest from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions -from cuda.core._module import _is_cuda_12_plus_backend from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: @@ -447,13 +446,8 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): # Get the handle from the original kernel handle = int(original_kernel._handle) - # Kernel.from_handle(..., mod=None) is only supported on the CUDA 12 "new" backend. - if not _is_cuda_12_plus_backend(): - with pytest.raises(NotImplementedError): - Kernel.from_handle(handle) - pytest.skip("mod=None only supported on CUDA 12+ backend") - # Create a new Kernel from the handle without a module + # This is supported on CUDA 12+ backend (CUkernel) kernel_from_handle = Kernel.from_handle(handle) assert isinstance(kernel_from_handle, Kernel) From c9cca32c63a2a6b6357638870b464ac8b229a4db Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Tue, 13 Jan 2026 14:35:00 -0800 Subject: [PATCH 19/19] pre-commit --- cuda_core/cuda/core/_module.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index 603ad07eb0..dd3f4494d5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -7,7 +7,6 @@ import weakref from collections import namedtuple from typing import Union -from warnings import warn from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config