diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index f84d569dff..fb3d0ad393 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -26,7 +26,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -72,7 +72,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -220,7 +220,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -263,7 +263,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -401,7 +401,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -542,7 +542,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -588,7 +588,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -736,7 +736,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -779,7 +779,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -917,7 +917,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -1461,10 +1461,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1483,7 +1483,7 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - vc >=14.1,<15 - vc14_runtime >=14.16.27033 @@ -1502,10 +1502,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1759,39 +1759,39 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 24082 timestamp: 1764883821516 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda - sha256: d6b326bdbf6fa7bfa0fa617dda547dc585159816b8f130f2535740c4e53fd12c - md5: 7ef874b2dc4ca388ecef3b3893305459 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35479197 - timestamp: 1764880529154 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda - sha256: 5e10ce4dd84c22c73e58a9f8359fb1e5ef4596afd3a0bc12b9fbde73b388ec0d - md5: 0473ebdb01f2f4024177b024fc19fa72 + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda + sha256: a1ec61512cecb093797e00590ad381ecd5852d2a32440ff22b34f78c743f3d5a + md5: 34da2ff2c64054d65eb8f04d76c40cca depends: - arm-variant * sbsa - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 
33619044 - timestamp: 1764880672755 -- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda - sha256: 3f67de8a9eb182fa20bbc80bda7185afb676cfe8894f6a0549173bd752a7d2f4 - md5: 7b42337a35cd887ec3eed254b5ed606f + size: 33616576 + timestamp: 1768272976976 +- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda + sha256: a8869b7d997722f90b9f8a602dc0b1d0d497f2a6f3561dc89383aeb2cd379a66 + md5: 372d3c612a832d5f87d8dd9702d487b2 depends: - cuda-version >=13.1,<13.2.0a0 - ucrt >=10.0.20348.0 - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31012754 - timestamp: 1764880740086 + size: 31006920 + timestamp: 1768273107962 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda sha256: 84f971ab146e2c822103cfe06f478ece244747a6f2aa565be639a4709d0a1579 md5: 9250c651d8758c8f665dff7519ef21ff @@ -3275,9 +3275,9 @@ packages: license_family: BSD size: 68079 timestamp: 1765819124349 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda - sha256: 6aabad84132b1f3ee367e5d24291febf8a11d9a7f3967a64fc07e77d9b0b22df - md5: 9cb68a85f8c08f0512931f944f6a75df +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 @@ -3285,11 +3285,11 @@ packages: - libstdcxx >=14 - rdma-core >=60.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 990030 - timestamp: 1764881892686 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda - sha256: d03963dc7708ded20340176ade987fc4c3e49da4f7b139a85e69ca7eb413f57a - md5: 315e1b144eaf890519fc63049b6e9228 + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda + sha256: 7451b3e2204e6cad21db501052dfe595c3440213ef3e22c0f9c784012f6a8419 + md5: ee60a24c702ce02de95ae1982c4841d8 depends: - __glibc >=2.28,<3.0.a0 - arm-variant * sbsa @@ -3300,8 +3300,8 @@ constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 887547 - timestamp: 1764881951574 + size: 891752 + timestamp: 1768273724252 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680 md5: 6c77a605a7a689d17d4819c0f8ac9a00 diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index c575ad9bd0..9559f7697a 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -78,14 +78,14 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern cdef void** args_ptr = <void**>(ker_args.ptr) # TODO: cythonize Module/Kernel/... - # Note: CUfunction and CUkernel are interchangeable + # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to + # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable + # for kernel launch purposes. cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle)) - # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care - # about the CUfunction/CUkernel difference (which depends on whether the "old" or - # "new" module loading APIs are in use). 
We check both binding & driver versions here - # mainly to see if the "Ex" API is available and if so we use it, as it's more feature - # rich. + # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx). + # We check both binding & driver versions here mainly to see if the "Ex" API is + # available, and if so we use it, as it's more feature-rich. if _use_ex: drv_cfg = conf._to_native_launch_config() drv_cfg.hStream = as_cu(s._h_stream) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index fbea314406..dd3f4494d5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,10 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools +import threading import weakref from collections import namedtuple from typing import Union -from warnings import warn from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config @@ -15,54 +16,109 @@ assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition -_backend = { - "old": { - "file": driver.cuModuleLoad, - "data": driver.cuModuleLoadDataEx, - "kernel": driver.cuModuleGetFunction, - "attribute": driver.cuFuncGetAttribute, - }, -} - - -# TODO: revisit this treatment for py313t builds +# Lazy initialization state and synchronization +# For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. +# For regular Python builds with the GIL, the lock overhead is minimal and the code remains safe. +_init_lock = threading.Lock() _inited = False _py_major_ver = None +_py_minor_ver = None _driver_ver = None _kernel_ctypes = None +_backend = {} def _lazy_init(): + """ + Initialize module-level state in a thread-safe manner. 
+ + This function is thread-safe and suitable for both: + - Regular Python builds (with GIL) + - Python 3.13t free-threaded builds (without GIL) + + Uses double-checked locking pattern for performance: + - Fast path: check without lock if already initialized + - Slow path: acquire lock and initialize if needed + """ global _inited + # Fast path: already initialized (no lock needed for read) if _inited: return - global _py_major_ver, _driver_ver, _kernel_ctypes - # binding availability depends on cuda-python version - _py_major_ver, _ = get_binding_version() - if _py_major_ver >= 12: - _backend["new"] = { + # Slow path: acquire lock and initialize + with _init_lock: + # Double-check: another thread might have initialized while we waited + if _inited: + return + + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend + # binding availability depends on cuda-python version + _py_major_ver, _py_minor_ver = get_binding_version() + _backend = { "file": driver.cuLibraryLoadFromFile, "data": driver.cuLibraryLoadData, "kernel": driver.cuLibraryGetKernel, "attribute": driver.cuKernelGetAttribute, } - _kernel_ctypes = (driver.CUfunction, driver.CUkernel) - else: - _kernel_ctypes = (driver.CUfunction,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _py_major_ver >= 12 and _driver_ver >= 12040: - _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo - _inited = True + _kernel_ctypes = (driver.CUkernel,) + _driver_ver = handle_return(driver.cuDriverGetVersion()) + if _driver_ver >= 12040: + _backend["paraminfo"] = driver.cuKernelGetParamInfo + + # Mark as initialized (must be last to ensure all state is set) + _inited = True + + +# Auto-initializing property accessors +def _get_py_major_ver(): + """Get the Python binding major version, initializing if needed.""" + _lazy_init() + return _py_major_ver + + +def _get_py_minor_ver(): + """Get the Python binding minor version, initializing if needed.""" + _lazy_init() + return _py_minor_ver + + +def _get_driver_ver(): + """Get the CUDA driver version, initializing if needed.""" + _lazy_init() + return _driver_ver + + +def _get_kernel_ctypes(): + """Get the kernel ctypes tuple, initializing if needed.""" + _lazy_init() + return _kernel_ctypes + + +@functools.cache +def _is_cukernel_get_library_supported() -> bool: + """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. + + Requires cuda-python bindings >= 12.5 and driver >= 12.5. + """ + return ( + (_get_py_major_ver(), _get_py_minor_ver()) >= (12, 5) + and _get_driver_ver() >= 12050 + and hasattr(driver, "cuKernelGetLibrary") + ) + + +def _make_dummy_library_handle(): + """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" + return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 class KernelAttributes: def __new__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.") - slots = ("_kernel", "_cache", "_backend_version", "_loader") + slots = ("_kernel", "_cache", "_loader") @classmethod def _init(cls, kernel): @@ -70,8 +126,9 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend return self def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: @@ -84,15 +141,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - if self._backend_version == "new": - result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) - else: # "old" backend - warn( - "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ", - RuntimeWarning, - stacklevel=2, - ) - result = handle_return(self._loader["attribute"](attribute, kernel._handle)) + result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result @@ -197,7 +246,9 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - class KernelOccupancy: - """ """ + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ def __new__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") @@ -378,7 +429,7 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert_type(obj, _kernel_ctypes) + assert_type(obj, _get_kernel_ctypes()) assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj @@ -396,12 +447,11 @@ def attributes(self) -> KernelAttributes: def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: attr_impl = self.attributes - if attr_impl._backend_version != "new": - raise NotImplementedError("New backend is required") if "paraminfo" not in attr_impl._loader: + driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " - f"Using driver version {_driver_ver // 1000}.{(_driver_ver % 1000) // 10}" + f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) arg_pos = 0 param_info_data = [] @@ -436,7 +486,46 @@ def occupancy(self) -> KernelOccupancy: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy - # TODO: implement from_handle() + @staticmethod + def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": + """Creates a new :obj:`Kernel` object from a foreign kernel handle. + + Uses a CUkernel pointer address to create a new :obj:`Kernel` object. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. If not provided, + a placeholder ObjectCode will be created. Note that without a proper + ObjectCode, certain operations may be limited. 
+ """ + + # Validate that handle is an integer + if not isinstance(handle, int): + raise TypeError(f"handle must be an integer, got {type(handle).__name__}") + + # Convert the integer handle to CUkernel driver type + kernel_obj = driver.CUkernel(handle) + + # If no module provided, create a placeholder + if mod is None: + # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via + # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall + # back to a non-null dummy handle purely to disable lazy loading. + mod = ObjectCode._init(b"", "cubin") + if _is_cukernel_get_library_supported(): + try: + mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) + except (CUDAError, RuntimeError): + # Best-effort: don't fail construction if inverse lookup fails. + mod._handle = _make_dummy_library_handle() + else: + mod._handle = _make_dummy_library_handle() + + return Kernel._from_obj(kernel_obj, mod) CodeTypeT = Union[bytes, bytearray, str] @@ -454,14 +543,9 @@ class ObjectCode: like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly from all other possible code types should be avoided in favor of compilation through :class:`~cuda.core.Program` - - Note - ---- - Usage under CUDA 11.x will only load to the current device - context. """ - __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map", "_name") + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") def __new__(self, *args, **kwargs): @@ -474,13 +558,13 @@ def __new__(self, *args, **kwargs): def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): self = super().__new__(cls) assert code_type in self._supported_code_type, f"{code_type=} is not supported" - _lazy_init() # handle is assigned during _lazy_load self._handle = None - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend self._code_type = code_type self._module = module @@ -613,16 +697,10 @@ def _lazy_load_module(self, *args, **kwargs): module = self._module assert_type_str_or_bytes_like(module) if isinstance(module, str): - if self._backend_version == "new": - self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["file"](module.encode())) + self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) return if isinstance(module, (bytes, bytearray)): - if self._backend_version == "new": - self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["data"](module, 0, [], [])) + self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) return raise_code_path_meant_to_be_unreachable() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 4b3817ece4..f9bbcd3e4c 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,7 +7,7 @@ import cuda.core import pytest -from cuda.core import Device, ObjectCode, Program, ProgramOptions +from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions from cuda.core._utils.cuda_utils import CUDAError, driver, 
get_binding_version, handle_return try: @@ -420,3 +420,91 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode.code == result.code assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type + + +def test_kernel_from_handle(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with a valid handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle + kernel_from_handle = Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, Kernel) + + # Verify we can access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() without providing a module""" + original_kernel, _ = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle without a module + # This is supported on CUDA 12+ backend (CUkernel) + kernel_from_handle = Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, Kernel) + + # Verify we can still access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +@pytest.mark.parametrize( + "invalid_value", + [ + pytest.param("not_an_int", id="str"), + pytest.param(2.71828, id="float"), + pytest.param(None, id="None"), + pytest.param({"handle": 123}, id="dict"), + pytest.param([456], id="list"), + pytest.param((789,), id="tuple"), + pytest.param(3 + 4j, id="complex"), + pytest.param(b"\xde\xad\xbe\xef", id="bytes"), + pytest.param({999}, id="set"), + pytest.param(object(), id="object"), + ], +) +def test_kernel_from_handle_type_validation(invalid_value): + """Test Kernel.from_handle() with wrong handle types""" + with pytest.raises(TypeError): + Kernel.from_handle(invalid_value) + + +def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with invalid module parameter""" + original_kernel, _ = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Invalid module type (should fail type assertion in _from_obj) + with pytest.raises((TypeError, AssertionError)): + Kernel.from_handle(handle, mod="not_an_objectcode") + + with pytest.raises((TypeError, AssertionError)): + Kernel.from_handle(handle, mod=12345) + + +def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple Kernel instances from the same handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Create multiple Kernel instances from the same handle + kernel1 = Kernel.from_handle(handle, objcode) + kernel2 = Kernel.from_handle(handle, objcode) + kernel3 = Kernel.from_handle(handle, objcode) + + # All should be valid Kernel objects + assert isinstance(kernel1, Kernel) + assert isinstance(kernel2, Kernel) + assert isinstance(kernel3, Kernel) + + # All should reference the same underlying CUDA kernel handle + assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle
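
Example usage (an illustrative sketch, not part of the patch): the snippet below round-trips a kernel through the new Kernel.from_handle API, mirroring the test fixtures above. The SAXPY source and all variable names are assumptions chosen for demonstration only.

# Illustrative sketch: round-tripping a kernel through Kernel.from_handle.
from cuda.core import Device, Kernel, Program, ProgramOptions

code = r'''
extern "C" __global__
void saxpy(float a, const float* x, const float* y, float* out, size_t n) {
    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = a * x[tid] + y[tid];
    }
}
'''

device = Device()
device.set_current()
arch = "".join(f"{i}" for i in device.compute_capability)
program = Program(code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}"))
objcode = program.compile("cubin")
kernel = objcode.get_kernel("saxpy")

# Re-wrap the raw CUkernel address. Passing the owning ObjectCode keeps the
# loaded library alive and all Kernel operations available.
clone = Kernel.from_handle(int(kernel._handle), objcode)
assert clone.attributes.max_threads_per_block() > 0

# Without a module, from_handle recovers the owning library via
# cuKernelGetLibrary (bindings and driver >= 12.5) or falls back to a
# placeholder handle, per the _module.py implementation above.
orphan = Kernel.from_handle(int(kernel._handle))
assert orphan.attributes.max_threads_per_block() > 0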