From b17ffa80a62d60d0d04bf9940767830d78786566 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 16 Dec 2025 16:29:15 -0800
Subject: [PATCH 01/19] wip

---
 cuda_core/cuda/core/experimental/_module.py  | 93 +++++++++++++++++++-
 cuda_core/cuda/core/experimental/_program.py | 71 +++++++++++++++
 cuda_core/tests/test_module.py               | 69 +++++++++++++++
 cuda_core/tests/test_program.py              | 43 +++++++++
 4 files changed, 275 insertions(+), 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 9af722465b..f52817323f 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -436,7 +436,51 @@ def occupancy(self) -> KernelOccupancy:
             self._occupancy = KernelOccupancy._init(self._handle)
         return self._occupancy
 
-    # TODO: implement from_handle()
+    @staticmethod
+    def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
+        """Create a new :obj:`Kernel` object from a foreign kernel handle.
+
+        Uses a CUfunction or CUkernel pointer address represented as a Python int
+        to create a new :obj:`Kernel` object.
+
+        Note
+        ----
+        Kernel lifetime is not managed, foreign object must remain
+        alive while this kernel is active.
+
+        Parameters
+        ----------
+        handle : int
+            Kernel handle representing the address of a foreign
+            kernel object (CUfunction or CUkernel).
+        mod : :obj:`ObjectCode`, optional
+            The ObjectCode object associated with this kernel. If not provided,
+            a placeholder ObjectCode will be created. Note that without a proper
+            ObjectCode, certain operations may be limited.
+
+        Returns
+        -------
+        :obj:`Kernel`
+            Newly created kernel object.
+
+        """
+        _lazy_init()
+        # Convert the integer handle to the appropriate driver type
+        if _py_major_ver >= 12 and _driver_ver >= 12000:
+            # Try CUkernel first for newer CUDA versions
+            kernel_obj = driver.CUkernel(handle)
+        else:
+            # Use CUfunction for older versions
+            kernel_obj = driver.CUfunction(handle)
+        
+        # If no module provided, create a placeholder
+        if mod is None:
+            # Create a placeholder ObjectCode that won't try to load anything
+            mod = ObjectCode._init(b"", "cubin")
+            # Set a dummy handle to prevent lazy loading
+            mod._handle = 1  # Non-null placeholder
+        
+        return Kernel._from_obj(kernel_obj, mod)
 
 
 CodeTypeT = Union[bytes, bytearray, str]
@@ -605,6 +649,53 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d
         """
         return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping)
 
+    @staticmethod
+    def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode":
+        """Create a new :obj:`ObjectCode` object from a foreign module handle.
+
+        Uses a CUmodule or CUlibrary pointer address represented as a Python int
+        to create a new :obj:`ObjectCode` object.
+
+        Note
+        ----
+        Module lifetime is not managed, foreign object must remain
+        alive while this object code is active.
+
+        Parameters
+        ----------
+        handle : int
+            Module handle representing the address of a foreign
+            module object (CUmodule or CUlibrary).
+        code_type : str, optional
+            The type of code object this handle represents. Must be one of
+            "cubin", "ptx", "ltoir", "fatbin", "object", or "library".
+            (Default: "cubin")
+        name : str, optional
+            A human-readable identifier representing this code object.
+        symbol_mapping : dict, optional
+            A dictionary specifying how the unmangled symbol names (as keys)
+            should be mapped to the mangled names before trying to retrieve
+            them (default to no mappings).
+
+        Returns
+        -------
+        :obj:`ObjectCode`
+            Newly created object code.
+
+        """
+        _lazy_init()
+        # Create an ObjectCode instance with a placeholder module
+        # The handle will be set directly, bypassing the lazy loading
+        obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping)
+        
+        # Set the handle directly from the foreign handle
+        if obj._backend_version == "new":
+            obj._handle = driver.CUlibrary(handle)
+        else:
+            obj._handle = driver.CUmodule(handle)
+        
+        return obj
+
     # TODO: do we want to unload in a finalizer? Probably not..
 
     def _lazy_load_module(self, *args, **kwargs):
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index f3ad9af644..d6e051e651 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -860,3 +860,74 @@ def handle(self) -> ProgramHandleT:
             handle, call ``int(Program.handle)``.
         """
         return self._mnff.handle
+
+    @staticmethod
+    def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Program:
+        """Create a new :obj:`Program` object from a foreign program handle.
+
+        Uses a nvrtcProgram or NVVM program handle represented as a Python int
+        to create a new :obj:`Program` object.
+
+        Note
+        ----
+        Program lifetime is not managed, foreign object must remain
+        alive while this program is active. The handle should not be
+        destroyed externally while this Program object is in use.
+
+        Parameters
+        ----------
+        handle : int
+            Program handle representing the address of a foreign
+            program object (nvrtcProgram for NVRTC backend, or
+            NVVM program handle for NVVM backend).
+        backend : str
+            The backend type of the program handle. Must be either
+            "NVRTC" or "NVVM". This determines how the handle is
+            interpreted and used.
+        options : :obj:`ProgramOptions`, optional
+            Program options that may be used for subsequent operations.
+            If not provided, default options will be created.
+
+        Returns
+        -------
+        :obj:`Program`
+            Newly created program object.
+
+        Raises
+        ------
+        ValueError
+            If an unsupported backend is specified.
+
+        """
+        backend = backend.upper()
+        if backend not in ("NVRTC", "NVVM"):
+            raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'")
+        
+        # Create a new Program instance without going through __init__
+        prog = object.__new__(Program)
+        
+        # Initialize the members needed for finalization
+        # Note: We pass None as the program_obj to avoid finalization since
+        # we don't own the handle
+        prog._mnff = Program._MembersNeededForFinalize.__new__(
+            Program._MembersNeededForFinalize
+        )
+        prog._mnff.handle = None  # Don't manage the foreign handle
+        prog._mnff.backend = backend
+        
+        # Store the backend and options
+        prog._backend = backend
+        prog._options = check_or_create_options(ProgramOptions, options, "Program options")
+        prog._linker = None
+        
+        # Store the handle directly without taking ownership
+        # This means the finalizer won't destroy it
+        if backend == "NVRTC":
+            prog._mnff.handle = nvrtc.nvrtcProgram(handle)
+        elif backend == "NVVM":
+            nvvm_module = _get_nvvm_module()
+            # For NVVM, we just store the handle as-is
+            # The actual NVVM program handle is opaque
+            prog._mnff.handle = handle
+        
+        return prog
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 25b8d5dd86..e150064129 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -420,3 +420,72 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin):
     assert objcode.code == result.code
     assert objcode._sym_map == result._sym_map
     assert objcode.code_type == result.code_type
+
+
+def test_object_code_from_handle(get_saxpy_kernel_cubin):
+    """Test ObjectCode.from_handle() with a valid handle"""
+    kernel, original_objcode = get_saxpy_kernel_cubin
+    
+    # Get the handle from the original object code
+    handle = int(original_objcode.handle)
+    
+    # Create a new ObjectCode from the handle
+    objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map)
+    assert isinstance(objcode_from_handle, ObjectCode)
+    assert objcode_from_handle.code_type == "cubin"
+    
+    # Try to get a kernel from the new object code
+    # Note: This should work since we're reusing the handle
+    kernel_from_handle = objcode_from_handle.get_kernel("saxpy<float>")
+    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
+
+
+def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx):
+    """Test ObjectCode.from_handle() with PTX code type"""
+    ptx, original_objcode = get_saxpy_kernel_ptx
+    
+    if not Program._can_load_generated_ptx():
+        pytest.skip("PTX version too new for current driver")
+    
+    # Force loading to get a handle
+    _ = original_objcode.get_kernel("saxpy<float>")
+    handle = int(original_objcode.handle)
+    
+    # Create a new ObjectCode from the handle with PTX code type
+    objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map)
+    assert isinstance(objcode_from_handle, ObjectCode)
+    assert objcode_from_handle.code_type == "ptx"
+
+
+def test_kernel_from_handle(get_saxpy_kernel_cubin):
+    """Test Kernel.from_handle() with a valid handle"""
+    original_kernel, objcode = get_saxpy_kernel_cubin
+    
+    # Get the handle from the original kernel
+    handle = int(original_kernel._handle)
+    
+    # Create a new Kernel from the handle
+    kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle, objcode)
+    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
+    
+    # Verify we can access kernel attributes
+    max_threads = kernel_from_handle.attributes.max_threads_per_block()
+    assert isinstance(max_threads, int)
+    assert max_threads > 0
+
+
+def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
+    """Test Kernel.from_handle() without providing a module"""
+    original_kernel, _ = get_saxpy_kernel_cubin
+    
+    # Get the handle from the original kernel
+    handle = int(original_kernel._handle)
+    
+    # Create a new Kernel from the handle without a module
+    kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle)
+    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
+    
+    # Verify we can still access kernel attributes
+    max_threads = kernel_from_handle.attributes.max_threads_per_block()
+    assert isinstance(max_threads, int)
+    assert max_threads > 0
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 2b0ac5d617..9e1bbb5d20 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -538,3 +538,46 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
         options.as_bytes("nvvm")
+
+
+def test_program_from_handle_nvrtc(init_cuda):
+    """Test Program.from_handle() with NVRTC backend"""
+    # Create a regular program to get a handle
+    code = 'extern "C" __global__ void test_kernel() {}'
+    original_program = Program(code, "c++")
+    assert original_program.backend == "NVRTC"
+    
+    # Get the handle
+    handle = int(original_program.handle)
+    
+    # Create a new program from the handle
+    program_from_handle = Program.from_handle(handle, "NVRTC")
+    assert program_from_handle.backend == "NVRTC"
+    # Note: We don't own the handle, so we shouldn't close it in the from_handle instance
+    
+    # Clean up the original program
+    original_program.close()
+
+
+@nvvm_available
+def test_program_from_handle_nvvm(init_cuda, nvvm_ir):
+    """Test Program.from_handle() with NVVM backend"""
+    # Create a regular NVVM program to get a handle
+    original_program = Program(nvvm_ir, "nvvm")
+    assert original_program.backend == "NVVM"
+    
+    # Get the handle
+    handle = int(original_program.handle) if hasattr(original_program.handle, '__int__') else original_program.handle
+    
+    # Create a new program from the handle
+    program_from_handle = Program.from_handle(handle, "NVVM")
+    assert program_from_handle.backend == "NVVM"
+    
+    # Clean up the original program
+    original_program.close()
+
+
+def test_program_from_handle_invalid_backend():
+    """Test Program.from_handle() with invalid backend"""
+    with pytest.raises(ValueError, match="Unsupported backend 'INVALID'"):
+        Program.from_handle(0, "INVALID")

From 62eceae550cb5b0c9bd5db05309ee4fe564f8ed2 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 08:51:00 -0800
Subject: [PATCH 02/19] adding thread safety

---
 cuda_core/cuda/core/_module.py               | 94 ++++++++++++++------
 cuda_core/cuda/core/experimental/__init__.py |  1 +
 2 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index 842e316d20..cceb8d19ef 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import sys
+import threading
 import weakref
 from collections import namedtuple
 from typing import Union
@@ -27,7 +29,10 @@
 }
 
 
-# TODO: revisit this treatment for py313t builds
+# Lazy initialization state and synchronization
+# For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization.
+# For regular Python builds with GIL, the lock overhead is minimal and the code remains safe.
+_init_lock = threading.Lock()
 _inited = False
 _py_major_ver = None
 _driver_ver = None
@@ -35,27 +40,66 @@
 
 
 def _lazy_init():
+    """
+    Initialize module-level state in a thread-safe manner.
+    
+    This function is thread-safe and suitable for both:
+    - Regular Python builds (with GIL)
+    - Python 3.13t free-threaded builds (without GIL)
+    
+    Uses double-checked locking pattern for performance:
+    - Fast path: check without lock if already initialized
+    - Slow path: acquire lock and initialize if needed
+    """
     global _inited
+    # Fast path: already initialized (no lock needed for read)
     if _inited:
         return
 
-    global _py_major_ver, _driver_ver, _kernel_ctypes
-    # binding availability depends on cuda-python version
-    _py_major_ver, _ = get_binding_version()
-    if _py_major_ver >= 12:
-        _backend["new"] = {
-            "file": driver.cuLibraryLoadFromFile,
-            "data": driver.cuLibraryLoadData,
-            "kernel": driver.cuLibraryGetKernel,
-            "attribute": driver.cuKernelGetAttribute,
-        }
-        _kernel_ctypes = (driver.CUfunction, driver.CUkernel)
-    else:
-        _kernel_ctypes = (driver.CUfunction,)
-    _driver_ver = handle_return(driver.cuDriverGetVersion())
-    if _py_major_ver >= 12 and _driver_ver >= 12040:
-        _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo
-    _inited = True
+    # Slow path: acquire lock and initialize
+    with _init_lock:
+        # Double-check: another thread might have initialized while we waited
+        if _inited:
+            return
+
+        global _py_major_ver, _driver_ver, _kernel_ctypes
+        # binding availability depends on cuda-python version
+        _py_major_ver, _ = get_binding_version()
+        if _py_major_ver >= 12:
+            _backend["new"] = {
+                "file": driver.cuLibraryLoadFromFile,
+                "data": driver.cuLibraryLoadData,
+                "kernel": driver.cuLibraryGetKernel,
+                "attribute": driver.cuKernelGetAttribute,
+            }
+            _kernel_ctypes = (driver.CUfunction, driver.CUkernel)
+        else:
+            _kernel_ctypes = (driver.CUfunction,)
+        _driver_ver = handle_return(driver.cuDriverGetVersion())
+        if _py_major_ver >= 12 and _driver_ver >= 12040:
+            _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo
+        
+        # Mark as initialized (must be last to ensure all state is set)
+        _inited = True
+
+
+# Auto-initializing property accessors
+def _get_py_major_ver():
+    """Get the Python binding major version, initializing if needed."""
+    _lazy_init()
+    return _py_major_ver
+
+
+def _get_driver_ver():
+    """Get the CUDA driver version, initializing if needed."""
+    _lazy_init()
+    return _driver_ver
+
+
+def _get_kernel_ctypes():
+    """Get the kernel ctypes tuple, initializing if needed."""
+    _lazy_init()
+    return _kernel_ctypes
 
 
 class KernelAttributes:
@@ -70,7 +114,7 @@ def _init(cls, kernel):
         self._kernel = weakref.ref(kernel)
         self._cache = {}
 
-        self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
+        self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
         self._loader = _backend[self._backend_version]
         return self
 
@@ -378,7 +422,7 @@ def __new__(self, *args, **kwargs):
 
     @classmethod
     def _from_obj(cls, obj, mod):
-        assert_type(obj, _kernel_ctypes)
+        assert_type(obj, _get_kernel_ctypes())
         assert_type(mod, ObjectCode)
         ker = super().__new__(cls)
         ker._handle = obj
@@ -399,9 +443,10 @@ def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]:
         if attr_impl._backend_version != "new":
             raise NotImplementedError("New backend is required")
         if "paraminfo" not in attr_impl._loader:
+            driver_ver = _get_driver_ver()
             raise NotImplementedError(
                 "Driver version 12.4 or newer is required for this function. "
-                f"Using driver version {_driver_ver // 1000}.{(_driver_ver % 1000) // 10}"
+                f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}"
             )
         arg_pos = 0
         param_info_data = []
@@ -464,9 +509,8 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
             Newly created kernel object.
 
         """
-        _lazy_init()
         # Convert the integer handle to the appropriate driver type
-        if _py_major_ver >= 12 and _driver_ver >= 12000:
+        if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000:
             # Try CUkernel first for newer CUDA versions
             kernel_obj = driver.CUkernel(handle)
         else:
@@ -518,12 +562,11 @@ def __new__(self, *args, **kwargs):
     def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None):
         self = super().__new__(cls)
         assert code_type in self._supported_code_type, f"{code_type=} is not supported"
-        _lazy_init()
 
         # handle is assigned during _lazy_load
         self._handle = None
 
-        self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
+        self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
         self._loader = _backend[self._backend_version]
 
         self._code_type = code_type
@@ -683,7 +726,6 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol
             Newly created object code.
 
         """
-        _lazy_init()
         # Create an ObjectCode instance with a placeholder module
         # The handle will be set directly, bypassing the lazy loading
         obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping)
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 3dbf3b7440..406c2e7e26 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -70,6 +70,7 @@ def _warn_deprecated():
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
+import cuda.core._module as _module  # noqa: E402
 from cuda.core._module import Kernel, ObjectCode  # noqa: E402
 from cuda.core._program import Program, ProgramOptions  # noqa: E402
 from cuda.core._stream import Stream, StreamOptions  # noqa: E402

From 4e751a68ef7e4191eb5366f357f8493c1860af44 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 09:00:43 -0800
Subject: [PATCH 03/19] cleanup work

---
 cuda_core/cuda/core/_module.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index cceb8d19ef..762f562c4f 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -19,15 +19,6 @@
 )
 from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition
 
-_backend = {
-    "old": {
-        "file": driver.cuModuleLoad,
-        "data": driver.cuModuleLoadDataEx,
-        "kernel": driver.cuModuleGetFunction,
-        "attribute": driver.cuFuncGetAttribute,
-    },
-}
-
 
 # Lazy initialization state and synchronization
 # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization.
@@ -37,7 +28,14 @@
 _py_major_ver = None
 _driver_ver = None
 _kernel_ctypes = None
-
+_backend = {
+    "old": {
+        "file": driver.cuModuleLoad,
+        "data": driver.cuModuleLoadDataEx,
+        "kernel": driver.cuModuleGetFunction,
+        "attribute": driver.cuFuncGetAttribute,
+    },
+}
 
 def _lazy_init():
     """
@@ -102,6 +100,14 @@ def _get_kernel_ctypes():
     return _kernel_ctypes
 
 
+def _get_backend_version():
+    """Get the backend version ("new" or "old") based on CUDA version.
+    
+    Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API).
+    """
+    return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
+
+
 class KernelAttributes:
     def __new__(self, *args, **kwargs):
         raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.")
@@ -114,7 +120,7 @@ def _init(cls, kernel):
         self._kernel = weakref.ref(kernel)
         self._cache = {}
 
-        self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
+        self._backend_version = _get_backend_version()
         self._loader = _backend[self._backend_version]
         return self
 
@@ -241,7 +247,9 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) -
 
 
 class KernelOccupancy:
-    """ """
+    """This class offers methods to query occupancy metrics that help determine optimal
+    launch parameters such as block size, grid size, and shared memory usage.
+    """
 
     def __new__(self, *args, **kwargs):
         raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.")
@@ -566,7 +574,7 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None
         # handle is assigned during _lazy_load
         self._handle = None
 
-        self._backend_version = "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
+        self._backend_version = _get_backend_version()
         self._loader = _backend[self._backend_version]
 
         self._code_type = code_type

From 0dca71048e0793555fee7ecc05d3d0c033c52133 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 09:14:35 -0800
Subject: [PATCH 04/19] updating docstrings

---
 cuda_core/cuda/core/_module.py | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index 762f562c4f..a2ecf4517f 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -491,15 +491,9 @@ def occupancy(self) -> KernelOccupancy:
 
     @staticmethod
     def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
-        """Create a new :obj:`Kernel` object from a foreign kernel handle.
+        """Creates a new :obj:`Kernel` object from a foreign kernel handle.
 
-        Uses a CUfunction or CUkernel pointer address represented as a Python int
-        to create a new :obj:`Kernel` object.
-
-        Note
-        ----
-        Kernel lifetime is not managed, foreign object must remain
-        alive while this kernel is active.
+        Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. 
 
         Parameters
         ----------
@@ -510,12 +504,6 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
             The ObjectCode object associated with this kernel. If not provided,
             a placeholder ObjectCode will be created. Note that without a proper
             ObjectCode, certain operations may be limited.
-
-        Returns
-        -------
-        :obj:`Kernel`
-            Newly created kernel object.
-
         """
         # Convert the integer handle to the appropriate driver type
         if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000:
@@ -704,13 +692,7 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d
     def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode":
         """Create a new :obj:`ObjectCode` object from a foreign module handle.
 
-        Uses a CUmodule or CUlibrary pointer address represented as a Python int
-        to create a new :obj:`ObjectCode` object.
-
-        Note
-        ----
-        Module lifetime is not managed, foreign object must remain
-        alive while this object code is active.
+        Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object.
 
         Parameters
         ----------
@@ -727,12 +709,6 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol
             A dictionary specifying how the unmangled symbol names (as keys)
             should be mapped to the mangled names before trying to retrieve
             them (default to no mappings).
-
-        Returns
-        -------
-        :obj:`ObjectCode`
-            Newly created object code.
-
         """
         # Create an ObjectCode instance with a placeholder module
         # The handle will be set directly, bypassing the lazy loading

From 8457870fa4b9c4c64b17d5af4ef61f4ab2814dda Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 09:32:42 -0800
Subject: [PATCH 05/19] removing experimental namespace

---
 cuda_core/tests/test_module.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index bfb0aafb19..31f7a3e303 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -425,32 +425,32 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin):
 def test_object_code_from_handle(get_saxpy_kernel_cubin):
     """Test ObjectCode.from_handle() with a valid handle"""
     kernel, original_objcode = get_saxpy_kernel_cubin
-    
+
     # Get the handle from the original object code
     handle = int(original_objcode.handle)
-    
+
     # Create a new ObjectCode from the handle
     objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map)
     assert isinstance(objcode_from_handle, ObjectCode)
     assert objcode_from_handle.code_type == "cubin"
-    
+
     # Try to get a kernel from the new object code
     # Note: This should work since we're reusing the handle
     kernel_from_handle = objcode_from_handle.get_kernel("saxpy<float>")
-    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
+    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
 
 
 def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx):
     """Test ObjectCode.from_handle() with PTX code type"""
     ptx, original_objcode = get_saxpy_kernel_ptx
-    
+
     if not Program._can_load_generated_ptx():
         pytest.skip("PTX version too new for current driver")
-    
+
     # Force loading to get a handle
     _ = original_objcode.get_kernel("saxpy<float>")
     handle = int(original_objcode.handle)
-    
+
     # Create a new ObjectCode from the handle with PTX code type
     objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map)
     assert isinstance(objcode_from_handle, ObjectCode)
@@ -460,14 +460,14 @@ def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx)
 def test_kernel_from_handle(get_saxpy_kernel_cubin):
     """Test Kernel.from_handle() with a valid handle"""
     original_kernel, objcode = get_saxpy_kernel_cubin
-    
+
     # Get the handle from the original kernel
     handle = int(original_kernel._handle)
-    
+
     # Create a new Kernel from the handle
-    kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle, objcode)
-    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
-    
+    kernel_from_handle = cuda.core._module.Kernel.from_handle(handle, objcode)
+    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
+
     # Verify we can access kernel attributes
     max_threads = kernel_from_handle.attributes.max_threads_per_block()
     assert isinstance(max_threads, int)
@@ -477,14 +477,14 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin):
 def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     """Test Kernel.from_handle() without providing a module"""
     original_kernel, _ = get_saxpy_kernel_cubin
-    
+
     # Get the handle from the original kernel
     handle = int(original_kernel._handle)
-    
+
     # Create a new Kernel from the handle without a module
-    kernel_from_handle = cuda.core.experimental._module.Kernel.from_handle(handle)
-    assert isinstance(kernel_from_handle, cuda.core.experimental._module.Kernel)
-    
+    kernel_from_handle = cuda.core._module.Kernel.from_handle(handle)
+    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
+
     # Verify we can still access kernel attributes
     max_threads = kernel_from_handle.attributes.max_threads_per_block()
     assert isinstance(max_threads, int)

From 2178dcbb8083d8f7bb8e82aa94e0caebe05ecba4 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 09:42:03 -0800
Subject: [PATCH 06/19] clean up

---
 cuda_core/tests/test_module.py  |  1 -
 cuda_core/tests/test_program.py | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 31f7a3e303..f0c28fd673 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -435,7 +435,6 @@ def test_object_code_from_handle(get_saxpy_kernel_cubin):
     assert objcode_from_handle.code_type == "cubin"
 
     # Try to get a kernel from the new object code
-    # Note: This should work since we're reusing the handle
     kernel_from_handle = objcode_from_handle.get_kernel("saxpy<float>")
     assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 2d9d5c5cd5..105595839a 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -555,15 +555,15 @@ def test_program_from_handle_nvrtc(init_cuda):
     code = 'extern "C" __global__ void test_kernel() {}'
     original_program = Program(code, "c++")
     assert original_program.backend == "NVRTC"
-    
+
     # Get the handle
     handle = int(original_program.handle)
-    
+
     # Create a new program from the handle
     program_from_handle = Program.from_handle(handle, "NVRTC")
     assert program_from_handle.backend == "NVRTC"
     # Note: We don't own the handle, so we shouldn't close it in the from_handle instance
-    
+
     # Clean up the original program
     original_program.close()
 
@@ -574,14 +574,14 @@ def test_program_from_handle_nvvm(init_cuda, nvvm_ir):
     # Create a regular NVVM program to get a handle
     original_program = Program(nvvm_ir, "nvvm")
     assert original_program.backend == "NVVM"
-    
+
     # Get the handle
-    handle = int(original_program.handle) if hasattr(original_program.handle, '__int__') else original_program.handle
-    
+    handle = int(original_program.handle)
+
     # Create a new program from the handle
     program_from_handle = Program.from_handle(handle, "NVVM")
     assert program_from_handle.backend == "NVVM"
-    
+
     # Clean up the original program
     original_program.close()
 

From 180eeb0a3bbde71541353c9d4d5ef8d6e04fc961 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 11:47:09 -0800
Subject: [PATCH 07/19] adding more unit test to cover edge cases

---
 cuda_core/cuda/core/_module.py |   5 +
 cuda_core/tests/test_module.py | 216 +++++++++++++++++++++++++++++++++
 2 files changed, 221 insertions(+)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index a2ecf4517f..e3bdc5ecf0 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -505,6 +505,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
             a placeholder ObjectCode will be created. Note that without a proper
             ObjectCode, certain operations may be limited.
         """
+        
+        # Validate that handle is an integer
+        if not isinstance(handle, int):
+            raise TypeError(f"handle must be an integer, got {type(handle).__name__}")
+        
         # Convert the integer handle to the appropriate driver type
         if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000:
             # Try CUkernel first for newer CUDA versions
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index f0c28fd673..73a1c35299 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -488,3 +488,219 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     max_threads = kernel_from_handle.attributes.max_threads_per_block()
     assert isinstance(max_threads, int)
     assert max_threads > 0
+
+
+# Edge case tests for from_handle methods
+
+
+#  @pytest.mark.parametrize("invalid_handle", [0, -1, 0xDEADBEEF])
+#  def test_object_code_from_handle_invalid_handles(invalid_handle):
+#      """Test ObjectCode.from_handle() with various invalid handle values"""
+#      # Invalid handles should be accepted during construction but fail when used
+#      objcode = ObjectCode.from_handle(invalid_handle, "cubin", symbol_mapping={})
+#      assert isinstance(objcode, ObjectCode)
+#      assert objcode.code_type == "cubin"
+#
+#      # Attempting to get a kernel from an invalid handle should fail with CUDAError
+#      with pytest.raises(CUDAError):
+#          objcode.get_kernel("kernel_that_does_not_exist")
+
+
+#  @pytest.mark.parametrize(
+#      "invalid_value",
+#      [
+#          pytest.param("not_an_int", id="str"),
+#          pytest.param(3.14, id="float"),
+#          pytest.param(None, id="None"),
+#          pytest.param([123], id="list"),
+#          pytest.param((123,), id="tuple"),
+#          pytest.param({"handle": 123}, id="dict"),
+#          pytest.param(b"\x00\x01\x02", id="bytes"),
+#          pytest.param({123}, id="set"),
+#          pytest.param(object(), id="object"),
+#      ],
+#  )
+#  def test_object_code_from_handle_type_validation(invalid_value):
+#      """Test ObjectCode.from_handle() with wrong handle types"""
+#      with pytest.raises(TypeError):
+#          ObjectCode.from_handle(invalid_value, "cubin", symbol_mapping={})
+#
+
+@pytest.mark.parametrize(
+    "invalid_code_type,expected_error",
+    [
+        pytest.param("invalid_type", AssertionError, id="invalid_str"),
+        pytest.param("", AssertionError, id="empty_str"),
+        pytest.param(None, (AssertionError, TypeError), id="None"),
+        pytest.param(123, (AssertionError, TypeError), id="int"),
+        pytest.param(3.14, (AssertionError, TypeError), id="float"),
+        pytest.param(["cubin"], (AssertionError, TypeError), id="list"),
+        pytest.param(("cubin",), (AssertionError, TypeError), id="tuple"),
+        pytest.param({"type": "cubin"}, (AssertionError, TypeError), id="dict"),
+        pytest.param(b"cubin", (AssertionError, TypeError), id="bytes"),
+        pytest.param({"cubin"}, (AssertionError, TypeError), id="set"),
+        pytest.param(object(), (AssertionError, TypeError), id="object"),
+    ],
+)
+def test_object_code_from_handle_invalid_code_type(invalid_code_type, expected_error):
+    """Test ObjectCode.from_handle() with invalid code_type"""
+    with pytest.raises(expected_error):
+        ObjectCode.from_handle(0, invalid_code_type, symbol_mapping={})
+
+
+def test_object_code_from_handle_symbol_mapping_variations():
+    """Test ObjectCode.from_handle() with various symbol_mapping values"""
+    # None symbol_mapping (should default to empty dict)
+    objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None)
+    assert objcode1._sym_map == {}
+    
+    # Empty dict
+    objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={})
+    assert objcode2._sym_map == {}
+    
+    # Valid symbol mapping
+    sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"}
+    objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map)
+    assert objcode3._sym_map == sym_map
+
+
+def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kernel_cubin):
+    """Test that symbol_mapping is actually used when getting kernels"""
+    _, original_objcode = get_saxpy_kernel_cubin
+    original_handle = int(original_objcode.handle)
+    
+    # Create ObjectCode with correct symbol mapping
+    objcode_with_map = ObjectCode.from_handle(
+        original_handle, 
+        "cubin", 
+        symbol_mapping=original_objcode._sym_map
+    )
+    
+    # Should successfully get kernel using unmangled name from symbol_mapping
+    kernel = objcode_with_map.get_kernel("saxpy<float>")
+    assert isinstance(kernel, cuda.core._module.Kernel)
+    
+    # Create ObjectCode without symbol mapping
+    objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={})
+    
+    # Should fail to get kernel using unmangled name (no mapping available)
+    with pytest.raises(CUDAError):
+        objcode_no_map.get_kernel("saxpy<float>")
+
+
+def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin):
+    """Test handle lifecycle and ownership with from_handle"""
+    original_kernel, original_objcode = get_saxpy_kernel_cubin
+    
+    # Get the original handle
+    original_handle = int(original_objcode.handle)
+    
+    # Create a new ObjectCode from the same handle
+    objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
+    
+    # Both should reference the same underlying CUDA module
+    assert int(objcode_from_handle.handle) == original_handle
+    
+    # Get a kernel from the from_handle version
+    kernel_from_copy = objcode_from_handle.get_kernel("saxpy<float>")
+    assert isinstance(kernel_from_copy, cuda.core._module.Kernel)
+    
+    # The original should still work
+    kernel_from_original = original_objcode.get_kernel("saxpy<float>")
+    assert isinstance(kernel_from_original, cuda.core._module.Kernel)
+    
+    # Both kernels should reference the same underlying CUDA kernel handle
+    # If handles are equal, they're the same kernel - no need to check attributes
+    assert int(kernel_from_copy._handle) == int(kernel_from_original._handle)
+
+
+def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin):
+    """Test creating multiple ObjectCode instances from the same handle"""
+    original_kernel, original_objcode = get_saxpy_kernel_cubin
+    
+    # Get the original handle
+    original_handle = int(original_objcode.handle)
+    
+    # Create multiple ObjectCode instances from the same handle
+    objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
+    objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
+    objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
+    
+    # All should have the same handle
+    assert int(objcode1.handle) == original_handle
+    assert int(objcode2.handle) == original_handle
+    assert int(objcode3.handle) == original_handle
+    
+    # All should be able to get kernels
+    kernel1 = objcode1.get_kernel("saxpy<float>")
+    kernel2 = objcode2.get_kernel("saxpy<float>")
+    kernel3 = objcode3.get_kernel("saxpy<float>")
+    
+    assert isinstance(kernel1, cuda.core._module.Kernel)
+    assert isinstance(kernel2, cuda.core._module.Kernel)
+    assert isinstance(kernel3, cuda.core._module.Kernel)
+
+
+#  @pytest.mark.parametrize("invalid_handle", [0, -1, 0xBADC0FFEE])
+#  def test_kernel_from_handle_invalid_handles(invalid_handle):
+#      """Test Kernel.from_handle() with various invalid handle values"""
+#      # Invalid handles should be accepted during construction but fail when used
+#      kernel = cuda.core._module.Kernel.from_handle(invalid_handle)
+#      assert isinstance(kernel, cuda.core._module.Kernel)
+#
+#      # Attempting to access attributes with invalid handle should fail
+#      with pytest.raises(CUDAError):
+#          kernel.attributes.max_threads_per_block()
+
+
+@pytest.mark.parametrize(
+    "invalid_value",
+    [
+        pytest.param("not_an_int", id="str"),
+        pytest.param(2.71828, id="float"),
+        pytest.param(None, id="None"),
+        pytest.param({"handle": 123}, id="dict"),
+        pytest.param([456], id="list"),
+        pytest.param((789,), id="tuple"),
+        pytest.param(3+4j, id="complex"),
+        pytest.param(b"\xde\xad\xbe\xef", id="bytes"),
+        pytest.param({999}, id="set"),
+        pytest.param(object(), id="object"),
+    ],
+)
+def test_kernel_from_handle_type_validation(invalid_value):
+    """Test Kernel.from_handle() with wrong handle types"""
+    with pytest.raises(TypeError):
+        cuda.core._module.Kernel.from_handle(invalid_value)
+
+
+def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin):
+    """Test Kernel.from_handle() with invalid module parameter"""
+    original_kernel, _ = get_saxpy_kernel_cubin
+    handle = int(original_kernel._handle)
+    
+    # Invalid module type (should fail type assertion in _from_obj)
+    with pytest.raises((TypeError, AssertionError)):
+        cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode")
+    
+    with pytest.raises((TypeError, AssertionError)):
+        cuda.core._module.Kernel.from_handle(handle, mod=12345)
+
+
+def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin):
+    """Test creating multiple Kernel instances from the same handle"""
+    original_kernel, objcode = get_saxpy_kernel_cubin
+    handle = int(original_kernel._handle)
+    
+    # Create multiple Kernel instances from the same handle
+    kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode)
+    kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode)
+    kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode)
+    
+    # All should be valid Kernel objects
+    assert isinstance(kernel1, cuda.core._module.Kernel)
+    assert isinstance(kernel2, cuda.core._module.Kernel)
+    assert isinstance(kernel3, cuda.core._module.Kernel)
+    
+    # All should reference the same underlying CUDA kernel handle
+    assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle

From 6489a6d8dcfcd0b8807b5cf593278520ac46be06 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 13:42:35 -0800
Subject: [PATCH 08/19] removing unneeded import

---
 cuda_core/cuda/core/experimental/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 406c2e7e26..3dbf3b7440 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -70,7 +70,6 @@ def _warn_deprecated():
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
-import cuda.core._module as _module  # noqa: E402
 from cuda.core._module import Kernel, ObjectCode  # noqa: E402
 from cuda.core._program import Program, ProgramOptions  # noqa: E402
 from cuda.core._stream import Stream, StreamOptions  # noqa: E402

From 74a80f72e3f0c198f82e79b8b4cf9f53390dca2d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 14:08:38 -0800
Subject: [PATCH 09/19] removing broken tests

---
 cuda_core/tests/test_module.py | 94 +++++++++-------------------------
 1 file changed, 24 insertions(+), 70 deletions(-)

diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 73a1c35299..00241306e9 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -492,40 +492,6 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
 
 # Edge case tests for from_handle methods
 
-
-#  @pytest.mark.parametrize("invalid_handle", [0, -1, 0xDEADBEEF])
-#  def test_object_code_from_handle_invalid_handles(invalid_handle):
-#      """Test ObjectCode.from_handle() with various invalid handle values"""
-#      # Invalid handles should be accepted during construction but fail when used
-#      objcode = ObjectCode.from_handle(invalid_handle, "cubin", symbol_mapping={})
-#      assert isinstance(objcode, ObjectCode)
-#      assert objcode.code_type == "cubin"
-#
-#      # Attempting to get a kernel from an invalid handle should fail with CUDAError
-#      with pytest.raises(CUDAError):
-#          objcode.get_kernel("kernel_that_does_not_exist")
-
-
-#  @pytest.mark.parametrize(
-#      "invalid_value",
-#      [
-#          pytest.param("not_an_int", id="str"),
-#          pytest.param(3.14, id="float"),
-#          pytest.param(None, id="None"),
-#          pytest.param([123], id="list"),
-#          pytest.param((123,), id="tuple"),
-#          pytest.param({"handle": 123}, id="dict"),
-#          pytest.param(b"\x00\x01\x02", id="bytes"),
-#          pytest.param({123}, id="set"),
-#          pytest.param(object(), id="object"),
-#      ],
-#  )
-#  def test_object_code_from_handle_type_validation(invalid_value):
-#      """Test ObjectCode.from_handle() with wrong handle types"""
-#      with pytest.raises(TypeError):
-#          ObjectCode.from_handle(invalid_value, "cubin", symbol_mapping={})
-#
-
 @pytest.mark.parametrize(
     "invalid_code_type,expected_error",
     [
@@ -553,11 +519,11 @@ def test_object_code_from_handle_symbol_mapping_variations():
     # None symbol_mapping (should default to empty dict)
     objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None)
     assert objcode1._sym_map == {}
-    
+
     # Empty dict
     objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={})
     assert objcode2._sym_map == {}
-    
+
     # Valid symbol mapping
     sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"}
     objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map)
@@ -568,21 +534,21 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern
     """Test that symbol_mapping is actually used when getting kernels"""
     _, original_objcode = get_saxpy_kernel_cubin
     original_handle = int(original_objcode.handle)
-    
+
     # Create ObjectCode with correct symbol mapping
     objcode_with_map = ObjectCode.from_handle(
-        original_handle, 
-        "cubin", 
+        original_handle,
+        "cubin",
         symbol_mapping=original_objcode._sym_map
     )
-    
+
     # Should successfully get kernel using unmangled name from symbol_mapping
     kernel = objcode_with_map.get_kernel("saxpy<float>")
     assert isinstance(kernel, cuda.core._module.Kernel)
-    
+
     # Create ObjectCode without symbol mapping
     objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={})
-    
+
     # Should fail to get kernel using unmangled name (no mapping available)
     with pytest.raises(CUDAError):
         objcode_no_map.get_kernel("saxpy<float>")
@@ -591,24 +557,24 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern
 def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin):
     """Test handle lifecycle and ownership with from_handle"""
     original_kernel, original_objcode = get_saxpy_kernel_cubin
-    
+
     # Get the original handle
     original_handle = int(original_objcode.handle)
-    
+
     # Create a new ObjectCode from the same handle
     objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-    
+
     # Both should reference the same underlying CUDA module
     assert int(objcode_from_handle.handle) == original_handle
-    
+
     # Get a kernel from the from_handle version
     kernel_from_copy = objcode_from_handle.get_kernel("saxpy<float>")
     assert isinstance(kernel_from_copy, cuda.core._module.Kernel)
-    
+
     # The original should still work
     kernel_from_original = original_objcode.get_kernel("saxpy<float>")
     assert isinstance(kernel_from_original, cuda.core._module.Kernel)
-    
+
     # Both kernels should reference the same underlying CUDA kernel handle
     # If handles are equal, they're the same kernel - no need to check attributes
     assert int(kernel_from_copy._handle) == int(kernel_from_original._handle)
@@ -617,42 +583,30 @@ def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin):
 def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin):
     """Test creating multiple ObjectCode instances from the same handle"""
     original_kernel, original_objcode = get_saxpy_kernel_cubin
-    
+
     # Get the original handle
     original_handle = int(original_objcode.handle)
-    
+
     # Create multiple ObjectCode instances from the same handle
     objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
     objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
     objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-    
+
     # All should have the same handle
     assert int(objcode1.handle) == original_handle
     assert int(objcode2.handle) == original_handle
     assert int(objcode3.handle) == original_handle
-    
+
     # All should be able to get kernels
     kernel1 = objcode1.get_kernel("saxpy<float>")
     kernel2 = objcode2.get_kernel("saxpy<float>")
     kernel3 = objcode3.get_kernel("saxpy<float>")
-    
+
     assert isinstance(kernel1, cuda.core._module.Kernel)
     assert isinstance(kernel2, cuda.core._module.Kernel)
     assert isinstance(kernel3, cuda.core._module.Kernel)
 
 
-#  @pytest.mark.parametrize("invalid_handle", [0, -1, 0xBADC0FFEE])
-#  def test_kernel_from_handle_invalid_handles(invalid_handle):
-#      """Test Kernel.from_handle() with various invalid handle values"""
-#      # Invalid handles should be accepted during construction but fail when used
-#      kernel = cuda.core._module.Kernel.from_handle(invalid_handle)
-#      assert isinstance(kernel, cuda.core._module.Kernel)
-#
-#      # Attempting to access attributes with invalid handle should fail
-#      with pytest.raises(CUDAError):
-#          kernel.attributes.max_threads_per_block()
-
-
 @pytest.mark.parametrize(
     "invalid_value",
     [
@@ -678,11 +632,11 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin):
     """Test Kernel.from_handle() with invalid module parameter"""
     original_kernel, _ = get_saxpy_kernel_cubin
     handle = int(original_kernel._handle)
-    
+
     # Invalid module type (should fail type assertion in _from_obj)
     with pytest.raises((TypeError, AssertionError)):
         cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode")
-    
+
     with pytest.raises((TypeError, AssertionError)):
         cuda.core._module.Kernel.from_handle(handle, mod=12345)
 
@@ -691,16 +645,16 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin):
     """Test creating multiple Kernel instances from the same handle"""
     original_kernel, objcode = get_saxpy_kernel_cubin
     handle = int(original_kernel._handle)
-    
+
     # Create multiple Kernel instances from the same handle
     kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode)
     kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode)
     kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode)
-    
+
     # All should be valid Kernel objects
     assert isinstance(kernel1, cuda.core._module.Kernel)
     assert isinstance(kernel2, cuda.core._module.Kernel)
     assert isinstance(kernel3, cuda.core._module.Kernel)
-    
+
     # All should reference the same underlying CUDA kernel handle
     assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle

From 30dfcdc5240f8bb08a2c9ade6a3d3592ac825304 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 14:25:43 -0800
Subject: [PATCH 10/19] pre-commit changes

---
 cuda_core/cuda/core/_module.py  | 29 +++++++++++++++--------------
 cuda_core/cuda/core/_program.py | 15 ++++++---------
 cuda_core/tests/test_module.py  |  9 +++------
 3 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index e3bdc5ecf0..c2eb200de5 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import sys
 import threading
 import weakref
 from collections import namedtuple
@@ -19,7 +18,6 @@
 )
 from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition
 
-
 # Lazy initialization state and synchronization
 # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization.
 # For regular Python builds with GIL, the lock overhead is minimal and the code remains safe.
@@ -37,14 +35,15 @@
     },
 }
 
+
 def _lazy_init():
     """
     Initialize module-level state in a thread-safe manner.
-    
+
     This function is thread-safe and suitable for both:
     - Regular Python builds (with GIL)
     - Python 3.13t free-threaded builds (without GIL)
-    
+
     Uses double-checked locking pattern for performance:
     - Fast path: check without lock if already initialized
     - Slow path: acquire lock and initialize if needed
@@ -76,7 +75,7 @@ def _lazy_init():
         _driver_ver = handle_return(driver.cuDriverGetVersion())
         if _py_major_ver >= 12 and _driver_ver >= 12040:
             _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo
-        
+
         # Mark as initialized (must be last to ensure all state is set)
         _inited = True
 
@@ -102,7 +101,7 @@ def _get_kernel_ctypes():
 
 def _get_backend_version():
     """Get the backend version ("new" or "old") based on CUDA version.
-    
+
     Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API).
     """
     return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
@@ -493,7 +492,7 @@ def occupancy(self) -> KernelOccupancy:
     def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
         """Creates a new :obj:`Kernel` object from a foreign kernel handle.
 
-        Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object. 
+        Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object.
 
         Parameters
         ----------
@@ -505,11 +504,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
             a placeholder ObjectCode will be created. Note that without a proper
             ObjectCode, certain operations may be limited.
         """
-        
+
         # Validate that handle is an integer
         if not isinstance(handle, int):
             raise TypeError(f"handle must be an integer, got {type(handle).__name__}")
-        
+
         # Convert the integer handle to the appropriate driver type
         if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000:
             # Try CUkernel first for newer CUDA versions
@@ -517,14 +516,14 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
         else:
             # Use CUfunction for older versions
             kernel_obj = driver.CUfunction(handle)
-        
+
         # If no module provided, create a placeholder
         if mod is None:
             # Create a placeholder ObjectCode that won't try to load anything
             mod = ObjectCode._init(b"", "cubin")
             # Set a dummy handle to prevent lazy loading
             mod._handle = 1  # Non-null placeholder
-        
+
         return Kernel._from_obj(kernel_obj, mod)
 
 
@@ -694,7 +693,9 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d
         return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping)
 
     @staticmethod
-    def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None) -> "ObjectCode":
+    def from_handle(
+        handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None
+    ) -> "ObjectCode":
         """Create a new :obj:`ObjectCode` object from a foreign module handle.
 
         Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object.
@@ -718,13 +719,13 @@ def from_handle(handle: int, code_type: str = "cubin", *, name: str = "", symbol
         # Create an ObjectCode instance with a placeholder module
         # The handle will be set directly, bypassing the lazy loading
         obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping)
-        
+
         # Set the handle directly from the foreign handle
         if obj._backend_version == "new":
             obj._handle = driver.CUlibrary(handle)
         else:
             obj._handle = driver.CUmodule(handle)
-        
+
         return obj
 
     # TODO: do we want to unload in a finalizer? Probably not..
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py
index f0ac4212f0..cffd500310 100644
--- a/cuda_core/cuda/core/_program.py
+++ b/cuda_core/cuda/core/_program.py
@@ -902,32 +902,29 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr
         backend = backend.upper()
         if backend not in ("NVRTC", "NVVM"):
             raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'")
-        
+
         # Create a new Program instance without going through __init__
         prog = object.__new__(Program)
-        
+
         # Initialize the members needed for finalization
         # Note: We pass None as the program_obj to avoid finalization since
         # we don't own the handle
-        prog._mnff = Program._MembersNeededForFinalize.__new__(
-            Program._MembersNeededForFinalize
-        )
+        prog._mnff = Program._MembersNeededForFinalize.__new__(Program._MembersNeededForFinalize)
         prog._mnff.handle = None  # Don't manage the foreign handle
         prog._mnff.backend = backend
-        
+
         # Store the backend and options
         prog._backend = backend
         prog._options = check_or_create_options(ProgramOptions, options, "Program options")
         prog._linker = None
-        
+
         # Store the handle directly without taking ownership
         # This means the finalizer won't destroy it
         if backend == "NVRTC":
             prog._mnff.handle = nvrtc.nvrtcProgram(handle)
         elif backend == "NVVM":
-            nvvm_module = _get_nvvm_module()
             # For NVVM, we just store the handle as-is
             # The actual NVVM program handle is opaque
             prog._mnff.handle = handle
-        
+
         return prog
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 00241306e9..fc07fe7741 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -492,6 +492,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
 
 # Edge case tests for from_handle methods
 
+
 @pytest.mark.parametrize(
     "invalid_code_type,expected_error",
     [
@@ -536,11 +537,7 @@ def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kern
     original_handle = int(original_objcode.handle)
 
     # Create ObjectCode with correct symbol mapping
-    objcode_with_map = ObjectCode.from_handle(
-        original_handle,
-        "cubin",
-        symbol_mapping=original_objcode._sym_map
-    )
+    objcode_with_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
 
     # Should successfully get kernel using unmangled name from symbol_mapping
     kernel = objcode_with_map.get_kernel("saxpy<float>")
@@ -616,7 +613,7 @@ def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin):
         pytest.param({"handle": 123}, id="dict"),
         pytest.param([456], id="list"),
         pytest.param((789,), id="tuple"),
-        pytest.param(3+4j, id="complex"),
+        pytest.param(3 + 4j, id="complex"),
         pytest.param(b"\xde\xad\xbe\xef", id="bytes"),
         pytest.param({999}, id="set"),
         pytest.param(object(), id="object"),

From 47ad63e1629f731470238ea32ad257d949dd266d Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Fri, 19 Dec 2025 14:33:46 -0800
Subject: [PATCH 11/19] clean up

---
 cuda_core/cuda/core/_program.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py
index cffd500310..d936ca0a26 100644
--- a/cuda_core/cuda/core/_program.py
+++ b/cuda_core/cuda/core/_program.py
@@ -868,12 +868,6 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr
         Uses a nvrtcProgram or NVVM program handle represented as a Python int
         to create a new :obj:`Program` object.
 
-        Note
-        ----
-        Program lifetime is not managed, foreign object must remain
-        alive while this program is active. The handle should not be
-        destroyed externally while this Program object is in use.
-
         Parameters
         ----------
         handle : int
@@ -887,17 +881,6 @@ def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Pr
         options : :obj:`ProgramOptions`, optional
             Program options that may be used for subsequent operations.
             If not provided, default options will be created.
-
-        Returns
-        -------
-        :obj:`Program`
-            Newly created program object.
-
-        Raises
-        ------
-        ValueError
-            If an unsupported backend is specified.
-
         """
         backend = backend.upper()
         if backend not in ("NVRTC", "NVVM"):

From ba14d5df7e1dfcf58c8b6028a52897c2c9608654 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 12 Jan 2026 14:37:21 -0800
Subject: [PATCH 12/19] Removing Object.from_handle and Program.from_handle

---
 cuda_core/cuda/core/_module.py  |  36 --------
 cuda_core/cuda/core/_program.py |  51 -----------
 cuda_core/tests/test_module.py  | 149 --------------------------------
 cuda_core/tests/test_program.py |  43 ---------
 4 files changed, 279 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index c2eb200de5..0877eabb99 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -692,42 +692,6 @@ def from_library(module: Union[bytes, str], *, name: str = "", symbol_mapping: d
         """
         return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping)
 
-    @staticmethod
-    def from_handle(
-        handle: int, code_type: str = "cubin", *, name: str = "", symbol_mapping: dict | None = None
-    ) -> "ObjectCode":
-        """Create a new :obj:`ObjectCode` object from a foreign module handle.
-
-        Uses a CUmodule or CUlibrary pointer address to create a new :obj:`ObjectCode` object.
-
-        Parameters
-        ----------
-        handle : int
-            Module handle representing the address of a foreign
-            module object (CUmodule or CUlibrary).
-        code_type : str, optional
-            The type of code object this handle represents. Must be one of
-            "cubin", "ptx", "ltoir", "fatbin", "object", or "library".
-            (Default: "cubin")
-        name : str, optional
-            A human-readable identifier representing this code object.
-        symbol_mapping : dict, optional
-            A dictionary specifying how the unmangled symbol names (as keys)
-            should be mapped to the mangled names before trying to retrieve
-            them (default to no mappings).
-        """
-        # Create an ObjectCode instance with a placeholder module
-        # The handle will be set directly, bypassing the lazy loading
-        obj = ObjectCode._init(b"", code_type, name=name, symbol_mapping=symbol_mapping)
-
-        # Set the handle directly from the foreign handle
-        if obj._backend_version == "new":
-            obj._handle = driver.CUlibrary(handle)
-        else:
-            obj._handle = driver.CUmodule(handle)
-
-        return obj
-
     # TODO: do we want to unload in a finalizer? Probably not..
 
     def _lazy_load_module(self, *args, **kwargs):
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py
index d936ca0a26..121dd13963 100644
--- a/cuda_core/cuda/core/_program.py
+++ b/cuda_core/cuda/core/_program.py
@@ -860,54 +860,3 @@ def handle(self) -> ProgramHandleT:
             handle, call ``int(Program.handle)``.
         """
         return self._mnff.handle
-
-    @staticmethod
-    def from_handle(handle: int, backend: str, options: ProgramOptions = None) -> Program:
-        """Create a new :obj:`Program` object from a foreign program handle.
-
-        Uses a nvrtcProgram or NVVM program handle represented as a Python int
-        to create a new :obj:`Program` object.
-
-        Parameters
-        ----------
-        handle : int
-            Program handle representing the address of a foreign
-            program object (nvrtcProgram for NVRTC backend, or
-            NVVM program handle for NVVM backend).
-        backend : str
-            The backend type of the program handle. Must be either
-            "NVRTC" or "NVVM". This determines how the handle is
-            interpreted and used.
-        options : :obj:`ProgramOptions`, optional
-            Program options that may be used for subsequent operations.
-            If not provided, default options will be created.
-        """
-        backend = backend.upper()
-        if backend not in ("NVRTC", "NVVM"):
-            raise ValueError(f"Unsupported backend '{backend}'. Must be 'NVRTC' or 'NVVM'")
-
-        # Create a new Program instance without going through __init__
-        prog = object.__new__(Program)
-
-        # Initialize the members needed for finalization
-        # Note: We pass None as the program_obj to avoid finalization since
-        # we don't own the handle
-        prog._mnff = Program._MembersNeededForFinalize.__new__(Program._MembersNeededForFinalize)
-        prog._mnff.handle = None  # Don't manage the foreign handle
-        prog._mnff.backend = backend
-
-        # Store the backend and options
-        prog._backend = backend
-        prog._options = check_or_create_options(ProgramOptions, options, "Program options")
-        prog._linker = None
-
-        # Store the handle directly without taking ownership
-        # This means the finalizer won't destroy it
-        if backend == "NVRTC":
-            prog._mnff.handle = nvrtc.nvrtcProgram(handle)
-        elif backend == "NVVM":
-            # For NVVM, we just store the handle as-is
-            # The actual NVVM program handle is opaque
-            prog._mnff.handle = handle
-
-        return prog
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index fc07fe7741..89110eea0e 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -421,41 +421,6 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin):
     assert objcode._sym_map == result._sym_map
     assert objcode.code_type == result.code_type
 
-
-def test_object_code_from_handle(get_saxpy_kernel_cubin):
-    """Test ObjectCode.from_handle() with a valid handle"""
-    kernel, original_objcode = get_saxpy_kernel_cubin
-
-    # Get the handle from the original object code
-    handle = int(original_objcode.handle)
-
-    # Create a new ObjectCode from the handle
-    objcode_from_handle = ObjectCode.from_handle(handle, "cubin", symbol_mapping=original_objcode._sym_map)
-    assert isinstance(objcode_from_handle, ObjectCode)
-    assert objcode_from_handle.code_type == "cubin"
-
-    # Try to get a kernel from the new object code
-    kernel_from_handle = objcode_from_handle.get_kernel("saxpy<float>")
-    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
-
-
-def test_object_code_from_handle_with_different_code_types(get_saxpy_kernel_ptx):
-    """Test ObjectCode.from_handle() with PTX code type"""
-    ptx, original_objcode = get_saxpy_kernel_ptx
-
-    if not Program._can_load_generated_ptx():
-        pytest.skip("PTX version too new for current driver")
-
-    # Force loading to get a handle
-    _ = original_objcode.get_kernel("saxpy<float>")
-    handle = int(original_objcode.handle)
-
-    # Create a new ObjectCode from the handle with PTX code type
-    objcode_from_handle = ObjectCode.from_handle(handle, "ptx", symbol_mapping=original_objcode._sym_map)
-    assert isinstance(objcode_from_handle, ObjectCode)
-    assert objcode_from_handle.code_type == "ptx"
-
-
 def test_kernel_from_handle(get_saxpy_kernel_cubin):
     """Test Kernel.from_handle() with a valid handle"""
     original_kernel, objcode = get_saxpy_kernel_cubin
@@ -490,120 +455,6 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     assert max_threads > 0
 
 
-# Edge case tests for from_handle methods
-
-
-@pytest.mark.parametrize(
-    "invalid_code_type,expected_error",
-    [
-        pytest.param("invalid_type", AssertionError, id="invalid_str"),
-        pytest.param("", AssertionError, id="empty_str"),
-        pytest.param(None, (AssertionError, TypeError), id="None"),
-        pytest.param(123, (AssertionError, TypeError), id="int"),
-        pytest.param(3.14, (AssertionError, TypeError), id="float"),
-        pytest.param(["cubin"], (AssertionError, TypeError), id="list"),
-        pytest.param(("cubin",), (AssertionError, TypeError), id="tuple"),
-        pytest.param({"type": "cubin"}, (AssertionError, TypeError), id="dict"),
-        pytest.param(b"cubin", (AssertionError, TypeError), id="bytes"),
-        pytest.param({"cubin"}, (AssertionError, TypeError), id="set"),
-        pytest.param(object(), (AssertionError, TypeError), id="object"),
-    ],
-)
-def test_object_code_from_handle_invalid_code_type(invalid_code_type, expected_error):
-    """Test ObjectCode.from_handle() with invalid code_type"""
-    with pytest.raises(expected_error):
-        ObjectCode.from_handle(0, invalid_code_type, symbol_mapping={})
-
-
-def test_object_code_from_handle_symbol_mapping_variations():
-    """Test ObjectCode.from_handle() with various symbol_mapping values"""
-    # None symbol_mapping (should default to empty dict)
-    objcode1 = ObjectCode.from_handle(0, "cubin", symbol_mapping=None)
-    assert objcode1._sym_map == {}
-
-    # Empty dict
-    objcode2 = ObjectCode.from_handle(0, "cubin", symbol_mapping={})
-    assert objcode2._sym_map == {}
-
-    # Valid symbol mapping
-    sym_map = {"kernel1": b"_Z7kernel1v", "kernel2": b"_Z7kernel2v"}
-    objcode3 = ObjectCode.from_handle(0, "cubin", symbol_mapping=sym_map)
-    assert objcode3._sym_map == sym_map
-
-
-def test_object_code_from_handle_symbol_mapping_with_valid_handle(get_saxpy_kernel_cubin):
-    """Test that symbol_mapping is actually used when getting kernels"""
-    _, original_objcode = get_saxpy_kernel_cubin
-    original_handle = int(original_objcode.handle)
-
-    # Create ObjectCode with correct symbol mapping
-    objcode_with_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-
-    # Should successfully get kernel using unmangled name from symbol_mapping
-    kernel = objcode_with_map.get_kernel("saxpy<float>")
-    assert isinstance(kernel, cuda.core._module.Kernel)
-
-    # Create ObjectCode without symbol mapping
-    objcode_no_map = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping={})
-
-    # Should fail to get kernel using unmangled name (no mapping available)
-    with pytest.raises(CUDAError):
-        objcode_no_map.get_kernel("saxpy<float>")
-
-
-def test_object_code_from_handle_lifecycle(get_saxpy_kernel_cubin):
-    """Test handle lifecycle and ownership with from_handle"""
-    original_kernel, original_objcode = get_saxpy_kernel_cubin
-
-    # Get the original handle
-    original_handle = int(original_objcode.handle)
-
-    # Create a new ObjectCode from the same handle
-    objcode_from_handle = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-
-    # Both should reference the same underlying CUDA module
-    assert int(objcode_from_handle.handle) == original_handle
-
-    # Get a kernel from the from_handle version
-    kernel_from_copy = objcode_from_handle.get_kernel("saxpy<float>")
-    assert isinstance(kernel_from_copy, cuda.core._module.Kernel)
-
-    # The original should still work
-    kernel_from_original = original_objcode.get_kernel("saxpy<float>")
-    assert isinstance(kernel_from_original, cuda.core._module.Kernel)
-
-    # Both kernels should reference the same underlying CUDA kernel handle
-    # If handles are equal, they're the same kernel - no need to check attributes
-    assert int(kernel_from_copy._handle) == int(kernel_from_original._handle)
-
-
-def test_object_code_from_handle_multiple_instances(get_saxpy_kernel_cubin):
-    """Test creating multiple ObjectCode instances from the same handle"""
-    original_kernel, original_objcode = get_saxpy_kernel_cubin
-
-    # Get the original handle
-    original_handle = int(original_objcode.handle)
-
-    # Create multiple ObjectCode instances from the same handle
-    objcode1 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-    objcode2 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-    objcode3 = ObjectCode.from_handle(original_handle, "cubin", symbol_mapping=original_objcode._sym_map)
-
-    # All should have the same handle
-    assert int(objcode1.handle) == original_handle
-    assert int(objcode2.handle) == original_handle
-    assert int(objcode3.handle) == original_handle
-
-    # All should be able to get kernels
-    kernel1 = objcode1.get_kernel("saxpy<float>")
-    kernel2 = objcode2.get_kernel("saxpy<float>")
-    kernel3 = objcode3.get_kernel("saxpy<float>")
-
-    assert isinstance(kernel1, cuda.core._module.Kernel)
-    assert isinstance(kernel2, cuda.core._module.Kernel)
-    assert isinstance(kernel3, cuda.core._module.Kernel)
-
-
 @pytest.mark.parametrize(
     "invalid_value",
     [
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 105595839a..9a9e4926ae 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -547,46 +547,3 @@ def test_program_options_as_bytes_nvvm_unsupported_option():
     options = ProgramOptions(arch="sm_80", lineinfo=True)
     with pytest.raises(CUDAError, match="not supported by NVVM backend"):
         options.as_bytes("nvvm")
-
-
-def test_program_from_handle_nvrtc(init_cuda):
-    """Test Program.from_handle() with NVRTC backend"""
-    # Create a regular program to get a handle
-    code = 'extern "C" __global__ void test_kernel() {}'
-    original_program = Program(code, "c++")
-    assert original_program.backend == "NVRTC"
-
-    # Get the handle
-    handle = int(original_program.handle)
-
-    # Create a new program from the handle
-    program_from_handle = Program.from_handle(handle, "NVRTC")
-    assert program_from_handle.backend == "NVRTC"
-    # Note: We don't own the handle, so we shouldn't close it in the from_handle instance
-
-    # Clean up the original program
-    original_program.close()
-
-
-@nvvm_available
-def test_program_from_handle_nvvm(init_cuda, nvvm_ir):
-    """Test Program.from_handle() with NVVM backend"""
-    # Create a regular NVVM program to get a handle
-    original_program = Program(nvvm_ir, "nvvm")
-    assert original_program.backend == "NVVM"
-
-    # Get the handle
-    handle = int(original_program.handle)
-
-    # Create a new program from the handle
-    program_from_handle = Program.from_handle(handle, "NVVM")
-    assert program_from_handle.backend == "NVVM"
-
-    # Clean up the original program
-    original_program.close()
-
-
-def test_program_from_handle_invalid_backend():
-    """Test Program.from_handle() with invalid backend"""
-    with pytest.raises(ValueError, match="Unsupported backend 'INVALID'"):
-        Program.from_handle(0, "INVALID")

From 566842dcf4b1cc0fea304111f32ab4af51c7f7a0 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Mon, 12 Jan 2026 14:39:02 -0800
Subject: [PATCH 13/19] whitespace

---
 cuda_core/tests/test_module.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 89110eea0e..c6415c93eb 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -421,6 +421,7 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin):
     assert objcode._sym_map == result._sym_map
     assert objcode.code_type == result.code_type
 
+
 def test_kernel_from_handle(get_saxpy_kernel_cubin):
     """Test Kernel.from_handle() with a valid handle"""
     original_kernel, objcode = get_saxpy_kernel_cubin

From cffa98a7fbc27cbd79133369d04713b1618087e3 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 08:11:11 -0800
Subject: [PATCH 14/19] adding function memoization

---
 cuda_bindings/pixi.lock        | 80 +++++++++++++++++-----------------
 cuda_core/cuda/core/_module.py |  2 +
 2 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock
index f84d569dff..fb3d0ad393 100644
--- a/cuda_bindings/pixi.lock
+++ b/cuda_bindings/pixi.lock
@@ -26,7 +26,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda
@@ -72,7 +72,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda
@@ -220,7 +220,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda
@@ -263,7 +263,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda
@@ -401,7 +401,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda
@@ -542,7 +542,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda
@@ -588,7 +588,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda
@@ -736,7 +736,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda
@@ -779,7 +779,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda
-      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda
       - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda
@@ -917,7 +917,7 @@ environments:
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda
-      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda
+      - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda
       - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda
       - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda
@@ -1461,10 +1461,10 @@ packages:
   - cuda-pathfinder >=1.1,<2
   - libnvjitlink
   - cuda-nvrtc
-  - cuda-nvrtc >=13.1.80,<14.0a0
+  - cuda-nvrtc >=13.1.115,<14.0a0
   - cuda-nvvm
   - libcufile
-  - libcufile >=1.16.0.49,<2.0a0
+  - libcufile >=1.16.1.26,<2.0a0
   - libgcc >=15
   - libgcc >=15
   - libstdcxx >=15
@@ -1483,7 +1483,7 @@ packages:
   - cuda-pathfinder >=1.1,<2
   - libnvjitlink
   - cuda-nvrtc
-  - cuda-nvrtc >=13.1.80,<14.0a0
+  - cuda-nvrtc >=13.1.115,<14.0a0
   - cuda-nvvm
   - vc >=14.1,<15
   - vc14_runtime >=14.16.27033
@@ -1502,10 +1502,10 @@ packages:
   - cuda-pathfinder >=1.1,<2
   - libnvjitlink
   - cuda-nvrtc
-  - cuda-nvrtc >=13.1.80,<14.0a0
+  - cuda-nvrtc >=13.1.115,<14.0a0
   - cuda-nvvm
   - libcufile
-  - libcufile >=1.16.0.49,<2.0a0
+  - libcufile >=1.16.1.26,<2.0a0
   - libgcc >=15
   - libgcc >=15
   - libstdcxx >=15
@@ -1759,39 +1759,39 @@ packages:
   license: LicenseRef-NVIDIA-End-User-License-Agreement
   size: 24082
   timestamp: 1764883821516
-- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda
-  sha256: d6b326bdbf6fa7bfa0fa617dda547dc585159816b8f130f2535740c4e53fd12c
-  md5: 7ef874b2dc4ca388ecef3b3893305459
+- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda
+  sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b
+  md5: df16c9049d882cdaf4f83a5b90079589
   depends:
   - __glibc >=2.17,<3.0.a0
   - cuda-version >=13.1,<13.2.0a0
   - libgcc >=14
   - libstdcxx >=14
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 35479197
-  timestamp: 1764880529154
-- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda
-  sha256: 5e10ce4dd84c22c73e58a9f8359fb1e5ef4596afd3a0bc12b9fbde73b388ec0d
-  md5: 0473ebdb01f2f4024177b024fc19fa72
+  size: 35339417
+  timestamp: 1768272955912
+- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda
+  sha256: a1ec61512cecb093797e00590ad381ecd5852d2a32440ff22b34f78c743f3d5a
+  md5: 34da2ff2c64054d65eb8f04d76c40cca
   depends:
   - arm-variant * sbsa
   - cuda-version >=13.1,<13.2.0a0
   - libgcc >=14
   - libstdcxx >=14
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 33619044
-  timestamp: 1764880672755
-- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda
-  sha256: 3f67de8a9eb182fa20bbc80bda7185afb676cfe8894f6a0549173bd752a7d2f4
-  md5: 7b42337a35cd887ec3eed254b5ed606f
+  size: 33616576
+  timestamp: 1768272976976
+- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda
+  sha256: a8869b7d997722f90b9f8a602dc0b1d0d497f2a6f3561dc89383aeb2cd379a66
+  md5: 372d3c612a832d5f87d8dd9702d487b2
   depends:
   - cuda-version >=13.1,<13.2.0a0
   - ucrt >=10.0.20348.0
   - vc >=14.3,<15
   - vc14_runtime >=14.44.35208
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 31012754
-  timestamp: 1764880740086
+  size: 31006920
+  timestamp: 1768273107962
 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda
   sha256: 84f971ab146e2c822103cfe06f478ece244747a6f2aa565be639a4709d0a1579
   md5: 9250c651d8758c8f665dff7519ef21ff
@@ -3275,9 +3275,9 @@ packages:
   license_family: BSD
   size: 68079
   timestamp: 1765819124349
-- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda
-  sha256: 6aabad84132b1f3ee367e5d24291febf8a11d9a7f3967a64fc07e77d9b0b22df
-  md5: 9cb68a85f8c08f0512931f944f6a75df
+- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda
+  sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1
+  md5: 48418c48dac04671fa46cb446122b8a5
   depends:
   - __glibc >=2.28,<3.0.a0
   - cuda-version >=13.1,<13.2.0a0
@@ -3285,11 +3285,11 @@ packages:
   - libstdcxx >=14
   - rdma-core >=60.0
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 990030
-  timestamp: 1764881892686
-- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda
-  sha256: d03963dc7708ded20340176ade987fc4c3e49da4f7b139a85e69ca7eb413f57a
-  md5: 315e1b144eaf890519fc63049b6e9228
+  size: 990938
+  timestamp: 1768273732081
+- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda
+  sha256: 7451b3e2204e6cad21db501052dfe595c3440213ef3e22c0f9c784012f6a8419
+  md5: ee60a24c702ce02de95ae1982c4841d8
   depends:
   - __glibc >=2.28,<3.0.a0
   - arm-variant * sbsa
@@ -3300,8 +3300,8 @@ packages:
   constrains:
   - arm-variant * sbsa
   license: LicenseRef-NVIDIA-End-User-License-Agreement
-  size: 887547
-  timestamp: 1764881951574
+  size: 891752
+  timestamp: 1768273724252
 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda
   sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680
   md5: 6c77a605a7a689d17d4819c0f8ac9a00
diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index 0877eabb99..ccdfb30c0e 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -7,6 +7,7 @@
 from collections import namedtuple
 from typing import Union
 from warnings import warn
+import functools
 
 from cuda.core._device import Device
 from cuda.core._launch_config import LaunchConfig, _to_native_launch_config
@@ -99,6 +100,7 @@ def _get_kernel_ctypes():
     return _kernel_ctypes
 
 
+@functools.cache
 def _get_backend_version():
     """Get the backend version ("new" or "old") based on CUDA version.
 

From b6a485d2034ecbf5b4a106cd824fce1eed85c341 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 08:34:31 -0800
Subject: [PATCH 15/19] accessing Kernel directly

---
 cuda_core/tests/test_module.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index c6415c93eb..20c358d46d 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -7,7 +7,7 @@
 
 import cuda.core
 import pytest
-from cuda.core import Device, ObjectCode, Program, ProgramOptions
+from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel
 from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
 
 try:
@@ -430,8 +430,8 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin):
     handle = int(original_kernel._handle)
 
     # Create a new Kernel from the handle
-    kernel_from_handle = cuda.core._module.Kernel.from_handle(handle, objcode)
-    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
+    kernel_from_handle = Kernel.from_handle(handle, objcode)
+    assert isinstance(kernel_from_handle, Kernel)
 
     # Verify we can access kernel attributes
     max_threads = kernel_from_handle.attributes.max_threads_per_block()
@@ -447,8 +447,8 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     handle = int(original_kernel._handle)
 
     # Create a new Kernel from the handle without a module
-    kernel_from_handle = cuda.core._module.Kernel.from_handle(handle)
-    assert isinstance(kernel_from_handle, cuda.core._module.Kernel)
+    kernel_from_handle = Kernel.from_handle(handle)
+    assert isinstance(kernel_from_handle, Kernel)
 
     # Verify we can still access kernel attributes
     max_threads = kernel_from_handle.attributes.max_threads_per_block()
@@ -474,7 +474,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
 def test_kernel_from_handle_type_validation(invalid_value):
     """Test Kernel.from_handle() with wrong handle types"""
     with pytest.raises(TypeError):
-        cuda.core._module.Kernel.from_handle(invalid_value)
+        Kernel.from_handle(invalid_value)
 
 
 def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin):
@@ -484,10 +484,10 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin):
 
     # Invalid module type (should fail type assertion in _from_obj)
     with pytest.raises((TypeError, AssertionError)):
-        cuda.core._module.Kernel.from_handle(handle, mod="not_an_objectcode")
+        Kernel.from_handle(handle, mod="not_an_objectcode")
 
     with pytest.raises((TypeError, AssertionError)):
-        cuda.core._module.Kernel.from_handle(handle, mod=12345)
+        Kernel.from_handle(handle, mod=12345)
 
 
 def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin):
@@ -496,14 +496,14 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin):
     handle = int(original_kernel._handle)
 
     # Create multiple Kernel instances from the same handle
-    kernel1 = cuda.core._module.Kernel.from_handle(handle, objcode)
-    kernel2 = cuda.core._module.Kernel.from_handle(handle, objcode)
-    kernel3 = cuda.core._module.Kernel.from_handle(handle, objcode)
+    kernel1 = Kernel.from_handle(handle, objcode)
+    kernel2 = Kernel.from_handle(handle, objcode)
+    kernel3 = Kernel.from_handle(handle, objcode)
 
     # All should be valid Kernel objects
-    assert isinstance(kernel1, cuda.core._module.Kernel)
-    assert isinstance(kernel2, cuda.core._module.Kernel)
-    assert isinstance(kernel3, cuda.core._module.Kernel)
+    assert isinstance(kernel1, Kernel)
+    assert isinstance(kernel2, Kernel)
+    assert isinstance(kernel3, Kernel)
 
     # All should reference the same underlying CUDA kernel handle
     assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle

From df1c6e1840738a741a09918d9badbc29903508ff Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 10:52:54 -0800
Subject: [PATCH 16/19] feedback

---
 cuda_core/cuda/core/_module.py | 69 ++++++++++++++++++++++++++++------
 cuda_core/tests/test_module.py |  7 ++++
 2 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index ccdfb30c0e..a103e8ab71 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -17,7 +17,7 @@
     assert_type_str_or_bytes_like,
     raise_code_path_meant_to_be_unreachable,
 )
-from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition
+from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition
 
 # Lazy initialization state and synchronization
 # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization.
@@ -25,6 +25,7 @@
 _init_lock = threading.Lock()
 _inited = False
 _py_major_ver = None
+_py_minor_ver = None
 _driver_ver = None
 _kernel_ctypes = None
 _backend = {
@@ -60,9 +61,9 @@ def _lazy_init():
         if _inited:
             return
 
-        global _py_major_ver, _driver_ver, _kernel_ctypes
+        global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes
         # binding availability depends on cuda-python version
-        _py_major_ver, _ = get_binding_version()
+        _py_major_ver, _py_minor_ver = get_binding_version()
         if _py_major_ver >= 12:
             _backend["new"] = {
                 "file": driver.cuLibraryLoadFromFile,
@@ -88,6 +89,12 @@ def _get_py_major_ver():
     return _py_major_ver
 
 
+def _get_py_minor_ver():
+    """Get the Python binding minor version, initializing if needed."""
+    _lazy_init()
+    return _py_minor_ver
+
+
 def _get_driver_ver():
     """Get the CUDA driver version, initializing if needed."""
     _lazy_init()
@@ -100,13 +107,37 @@ def _get_kernel_ctypes():
     return _kernel_ctypes
 
 
+@functools.cache
+def _is_cuda_12_plus_backend() -> bool:
+    """Return True when the CUDA 12+ (cuLibrary) backend is available/active."""
+    return _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000
+
+
 @functools.cache
 def _get_backend_version():
     """Get the backend version ("new" or "old") based on CUDA version.
 
     Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API).
     """
-    return "new" if (_get_py_major_ver() >= 12 and _get_driver_ver() >= 12000) else "old"
+    return "new" if _is_cuda_12_plus_backend() else "old"
+
+
+@functools.cache
+def _is_cukernel_get_library_supported() -> bool:
+    """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup.
+
+    Requires cuda-python bindings >= 12.5 and driver >= 12.5.
+    """
+    return (
+        (_get_py_major_ver(), _get_py_minor_ver()) >= (12, 5)
+        and _get_driver_ver() >= 12050
+        and hasattr(driver, "cuKernelGetLibrary")
+    )
+
+
+def _make_dummy_library_handle():
+    """Create a non-null placeholder CUlibrary handle to disable lazy loading."""
+    return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1
 
 
 class KernelAttributes:
@@ -511,20 +542,36 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
         if not isinstance(handle, int):
             raise TypeError(f"handle must be an integer, got {type(handle).__name__}")
 
-        # Convert the integer handle to the appropriate driver type
-        if _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000:
-            # Try CUkernel first for newer CUDA versions
+        # Convert the integer handle to the appropriate driver type.
+        #
+        # - For the CUDA 12+ "new" backend, kernels are CUkernel handles.
+        # - For the legacy "old" backend, kernels are CUfunction handles.
+        if _is_cuda_12_plus_backend():
             kernel_obj = driver.CUkernel(handle)
         else:
-            # Use CUfunction for older versions
             kernel_obj = driver.CUfunction(handle)
 
         # If no module provided, create a placeholder
         if mod is None:
-            # Create a placeholder ObjectCode that won't try to load anything
+            if not _is_cuda_12_plus_backend():
+                raise NotImplementedError(
+                    "Kernel.from_handle(..., mod=None) is only supported for CUkernel handles "
+                    "(CUDA 12 'new' backend). For CUfunction handles, please pass the owning "
+                    "ObjectCode explicitly via the 'mod' argument."
+                )
+
+            # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via
+            # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall
+            # back to a non-null dummy handle purely to disable lazy loading.
             mod = ObjectCode._init(b"", "cubin")
-            # Set a dummy handle to prevent lazy loading
-            mod._handle = 1  # Non-null placeholder
+            if _is_cukernel_get_library_supported():
+                try:
+                    mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj))
+                except (CUDAError, RuntimeError):
+                    # Best-effort: don't fail construction if inverse lookup fails.
+                    mod._handle = _make_dummy_library_handle()
+            else:
+                mod._handle = _make_dummy_library_handle()
 
         return Kernel._from_obj(kernel_obj, mod)
 
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 20c358d46d..9265c96bf4 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -8,6 +8,7 @@
 import cuda.core
 import pytest
 from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel
+from cuda.core._module import _is_cuda_12_plus_backend
 from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
 
 try:
@@ -446,6 +447,12 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     # Get the handle from the original kernel
     handle = int(original_kernel._handle)
 
+    # Kernel.from_handle(..., mod=None) is only supported on the CUDA 12 "new" backend.
+    if not _is_cuda_12_plus_backend():
+        with pytest.raises(NotImplementedError):
+            Kernel.from_handle(handle)
+        pytest.skip("mod=None only supported on CUDA 12+ backend")
+
     # Create a new Kernel from the handle without a module
     kernel_from_handle = Kernel.from_handle(handle)
     assert isinstance(kernel_from_handle, Kernel)

From 5f01231ce54638d670adc2e36ba533642d17fb9b Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 11:40:47 -0800
Subject: [PATCH 17/19] precommit

---
 cuda_core/cuda/core/_module.py | 2 +-
 cuda_core/tests/test_module.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index a103e8ab71..ed98044ae5 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -2,12 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import functools
 import threading
 import weakref
 from collections import namedtuple
 from typing import Union
 from warnings import warn
-import functools
 
 from cuda.core._device import Device
 from cuda.core._launch_config import LaunchConfig, _to_native_launch_config
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 9265c96bf4..aad867cc68 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -7,7 +7,7 @@
 
 import cuda.core
 import pytest
-from cuda.core import Device, ObjectCode, Program, ProgramOptions, Kernel
+from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions
 from cuda.core._module import _is_cuda_12_plus_backend
 from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
 

From ca0b7da2263e85855709c3f62c5c10c00df9ed54 Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 14:34:24 -0800
Subject: [PATCH 18/19] removing the CUDA11 code path

---
 cuda_core/cuda/core/_launcher.pyx |  12 ++--
 cuda_core/cuda/core/_module.py    | 109 +++++++-----------------------
 cuda_core/tests/test_module.py    |   8 +--
 3 files changed, 33 insertions(+), 96 deletions(-)

diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx
index c575ad9bd0..9559f7697a 100644
--- a/cuda_core/cuda/core/_launcher.pyx
+++ b/cuda_core/cuda/core/_launcher.pyx
@@ -78,14 +78,14 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern
     cdef void** args_ptr = <void**><uintptr_t>(ker_args.ptr)
 
     # TODO: cythonize Module/Kernel/...
-    # Note: CUfunction and CUkernel are interchangeable
+    # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to
+    # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable
+    # for kernel launch purposes.
     cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle))
 
-    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
-    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
-    # "new" module loading APIs are in use). We check both binding & driver versions here
-    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
-    # rich.
+    # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx).
+    # We check both binding & driver versions here mainly to see if the "Ex" API is
+    # available and if so we use it, as it's more feature rich.
     if _use_ex:
         drv_cfg = conf._to_native_launch_config()
         drv_cfg.hStream = as_cu(s._h_stream)
diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index ed98044ae5..603ad07eb0 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -28,14 +28,7 @@
 _py_minor_ver = None
 _driver_ver = None
 _kernel_ctypes = None
-_backend = {
-    "old": {
-        "file": driver.cuModuleLoad,
-        "data": driver.cuModuleLoadDataEx,
-        "kernel": driver.cuModuleGetFunction,
-        "attribute": driver.cuFuncGetAttribute,
-    },
-}
+_backend = {}
 
 
 def _lazy_init():
@@ -61,22 +54,19 @@ def _lazy_init():
         if _inited:
             return
 
-        global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes
+        global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend
         # binding availability depends on cuda-python version
         _py_major_ver, _py_minor_ver = get_binding_version()
-        if _py_major_ver >= 12:
-            _backend["new"] = {
-                "file": driver.cuLibraryLoadFromFile,
-                "data": driver.cuLibraryLoadData,
-                "kernel": driver.cuLibraryGetKernel,
-                "attribute": driver.cuKernelGetAttribute,
-            }
-            _kernel_ctypes = (driver.CUfunction, driver.CUkernel)
-        else:
-            _kernel_ctypes = (driver.CUfunction,)
+        _backend = {
+            "file": driver.cuLibraryLoadFromFile,
+            "data": driver.cuLibraryLoadData,
+            "kernel": driver.cuLibraryGetKernel,
+            "attribute": driver.cuKernelGetAttribute,
+        }
+        _kernel_ctypes = (driver.CUkernel,)
         _driver_ver = handle_return(driver.cuDriverGetVersion())
-        if _py_major_ver >= 12 and _driver_ver >= 12040:
-            _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo
+        if _driver_ver >= 12040:
+            _backend["paraminfo"] = driver.cuKernelGetParamInfo
 
         # Mark as initialized (must be last to ensure all state is set)
         _inited = True
@@ -107,21 +97,6 @@ def _get_kernel_ctypes():
     return _kernel_ctypes
 
 
-@functools.cache
-def _is_cuda_12_plus_backend() -> bool:
-    """Return True when the CUDA 12+ (cuLibrary) backend is available/active."""
-    return _get_py_major_ver() >= 12 and _get_driver_ver() >= 12000
-
-
-@functools.cache
-def _get_backend_version():
-    """Get the backend version ("new" or "old") based on CUDA version.
-
-    Returns "new" for CUDA 12.0+ (uses cuLibrary API), "old" otherwise (uses cuModule API).
-    """
-    return "new" if _is_cuda_12_plus_backend() else "old"
-
-
 @functools.cache
 def _is_cukernel_get_library_supported() -> bool:
     """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup.
@@ -144,7 +119,7 @@ class KernelAttributes:
     def __new__(self, *args, **kwargs):
         raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.")
 
-    slots = ("_kernel", "_cache", "_backend_version", "_loader")
+    slots = ("_kernel", "_cache", "_loader")
 
     @classmethod
     def _init(cls, kernel):
@@ -152,8 +127,9 @@ def _init(cls, kernel):
         self._kernel = weakref.ref(kernel)
         self._cache = {}
 
-        self._backend_version = _get_backend_version()
-        self._loader = _backend[self._backend_version]
+        # Ensure backend is initialized before setting loader
+        _lazy_init()
+        self._loader = _backend
         return self
 
     def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int:
@@ -166,15 +142,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun
         kernel = self._kernel()
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
-        if self._backend_version == "new":
-            result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id))
-        else:  # "old" backend
-            warn(
-                "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ",
-                RuntimeWarning,
-                stacklevel=2,
-            )
-            result = handle_return(self._loader["attribute"](attribute, kernel._handle))
+        result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id))
         self._cache[cache_key] = result
         return result
 
@@ -480,8 +448,6 @@ def attributes(self) -> KernelAttributes:
 
     def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]:
         attr_impl = self.attributes
-        if attr_impl._backend_version != "new":
-            raise NotImplementedError("New backend is required")
         if "paraminfo" not in attr_impl._loader:
             driver_ver = _get_driver_ver()
             raise NotImplementedError(
@@ -525,13 +491,13 @@ def occupancy(self) -> KernelOccupancy:
     def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
         """Creates a new :obj:`Kernel` object from a foreign kernel handle.
 
-        Uses a CUfunction or CUkernel pointer address to create a new :obj:`Kernel` object.
+        Uses a CUkernel pointer address to create a new :obj:`Kernel` object.
 
         Parameters
         ----------
         handle : int
             Kernel handle representing the address of a foreign
-            kernel object (CUfunction or CUkernel).
+            kernel object (CUkernel).
         mod : :obj:`ObjectCode`, optional
             The ObjectCode object associated with this kernel. If not provided,
             a placeholder ObjectCode will be created. Note that without a proper
@@ -542,24 +508,11 @@ def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel":
         if not isinstance(handle, int):
             raise TypeError(f"handle must be an integer, got {type(handle).__name__}")
 
-        # Convert the integer handle to the appropriate driver type.
-        #
-        # - For the CUDA 12+ "new" backend, kernels are CUkernel handles.
-        # - For the legacy "old" backend, kernels are CUfunction handles.
-        if _is_cuda_12_plus_backend():
-            kernel_obj = driver.CUkernel(handle)
-        else:
-            kernel_obj = driver.CUfunction(handle)
+        # Convert the integer handle to CUkernel driver type
+        kernel_obj = driver.CUkernel(handle)
 
         # If no module provided, create a placeholder
         if mod is None:
-            if not _is_cuda_12_plus_backend():
-                raise NotImplementedError(
-                    "Kernel.from_handle(..., mod=None) is only supported for CUkernel handles "
-                    "(CUDA 12 'new' backend). For CUfunction handles, please pass the owning "
-                    "ObjectCode explicitly via the 'mod' argument."
-                )
-
             # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via
             # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall
             # back to a non-null dummy handle purely to disable lazy loading.
@@ -591,14 +544,9 @@ class ObjectCode:
     like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly
     from all other possible code types should be avoided in favor of compilation through
     :class:`~cuda.core.Program`
-
-    Note
-    ----
-    Usage under CUDA 11.x will only load to the current device
-    context.
     """
 
-    __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map", "_name")
+    __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library")
 
     def __new__(self, *args, **kwargs):
@@ -615,8 +563,9 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None
         # handle is assigned during _lazy_load
         self._handle = None
 
-        self._backend_version = _get_backend_version()
-        self._loader = _backend[self._backend_version]
+        # Ensure backend is initialized before setting loader
+        _lazy_init()
+        self._loader = _backend
 
         self._code_type = code_type
         self._module = module
@@ -749,16 +698,10 @@ def _lazy_load_module(self, *args, **kwargs):
         module = self._module
         assert_type_str_or_bytes_like(module)
         if isinstance(module, str):
-            if self._backend_version == "new":
-                self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0))
-            else:  # "old" backend
-                self._handle = handle_return(self._loader["file"](module.encode()))
+            self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0))
             return
         if isinstance(module, (bytes, bytearray)):
-            if self._backend_version == "new":
-                self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0))
-            else:  # "old" backend
-                self._handle = handle_return(self._loader["data"](module, 0, [], []))
+            self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0))
             return
         raise_code_path_meant_to_be_unreachable()
 
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index aad867cc68..f9bbcd3e4c 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -8,7 +8,6 @@
 import cuda.core
 import pytest
 from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions
-from cuda.core._module import _is_cuda_12_plus_backend
 from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return
 
 try:
@@ -447,13 +446,8 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin):
     # Get the handle from the original kernel
     handle = int(original_kernel._handle)
 
-    # Kernel.from_handle(..., mod=None) is only supported on the CUDA 12 "new" backend.
-    if not _is_cuda_12_plus_backend():
-        with pytest.raises(NotImplementedError):
-            Kernel.from_handle(handle)
-        pytest.skip("mod=None only supported on CUDA 12+ backend")
-
     # Create a new Kernel from the handle without a module
+    # This is supported on CUDA 12+ backend (CUkernel)
     kernel_from_handle = Kernel.from_handle(handle)
     assert isinstance(kernel_from_handle, Kernel)
 

From c9cca32c63a2a6b6357638870b464ac8b229a4db Mon Sep 17 00:00:00 2001
From: Rob Parolin <rparolin@nvidia.com>
Date: Tue, 13 Jan 2026 14:35:00 -0800
Subject: [PATCH 19/19] pre-commit

---
 cuda_core/cuda/core/_module.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py
index 603ad07eb0..dd3f4494d5 100644
--- a/cuda_core/cuda/core/_module.py
+++ b/cuda_core/cuda/core/_module.py
@@ -7,7 +7,6 @@
 import weakref
 from collections import namedtuple
 from typing import Union
-from warnings import warn
 
 from cuda.core._device import Device
 from cuda.core._launch_config import LaunchConfig, _to_native_launch_config