diff --git a/cuda_bindings/pixi.lock b/cuda_bindings/pixi.lock index f84d569dff..fb3d0ad393 100644 --- a/cuda_bindings/pixi.lock +++ b/cuda_bindings/pixi.lock @@ -26,7 +26,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -72,7 +72,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -220,7 +220,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -263,7 +263,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -401,7 +401,7 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -542,7 +542,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-static-13.1.80-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.1.80-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.80-h4bc722e_0.conda @@ -588,7 +588,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-h3ff7636_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda @@ -736,7 +736,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-cudart-static-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_linux-aarch64-13.1.80-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-aarch64-13.1.80-h8f3c8d4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-13.1.80-he9431aa_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-aarch64-13.1.80-h579c4fd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvvm-impl-13.1.80-h7b14b0b_0.conda @@ -779,7 +779,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcap-2.77-h68e9139_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda - - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.125-he30d5cf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda @@ -917,7 +917,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-cudart-static-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart-static_win-64-13.1.80-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_win-64-13.1.80-hac47afa_0.conda - - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-13.1.80-h719f0c7_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_win-64-13.1.80-h57928b3_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvvm-impl-13.1.80-h2466b09_0.conda @@ -1461,10 +1461,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1483,7 +1483,7 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - vc >=14.1,<15 - vc14_runtime >=14.16.27033 @@ -1502,10 +1502,10 @@ packages: - cuda-pathfinder >=1.1,<2 - libnvjitlink - cuda-nvrtc - - cuda-nvrtc >=13.1.80,<14.0a0 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-nvvm - libcufile - - libcufile >=1.16.0.49,<2.0a0 + - libcufile >=1.16.1.26,<2.0a0 - libgcc >=15 - libgcc >=15 - libstdcxx >=15 @@ -1759,39 +1759,39 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 24082 timestamp: 1764883821516 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.80-hecca717_0.conda - sha256: d6b326bdbf6fa7bfa0fa617dda547dc585159816b8f130f2535740c4e53fd12c - md5: 7ef874b2dc4ca388ecef3b3893305459 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 35479197 - timestamp: 1764880529154 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.80-h8f3c8d4_0.conda - sha256: 5e10ce4dd84c22c73e58a9f8359fb1e5ef4596afd3a0bc12b9fbde73b388ec0d - md5: 0473ebdb01f2f4024177b024fc19fa72 + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/cuda-nvrtc-13.1.115-h8f3c8d4_0.conda + sha256: a1ec61512cecb093797e00590ad381ecd5852d2a32440ff22b34f78c743f3d5a + md5: 34da2ff2c64054d65eb8f04d76c40cca depends: - arm-variant * sbsa - cuda-version >=13.1,<13.2.0a0 - libgcc >=14 - libstdcxx >=14 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 
33619044 - timestamp: 1764880672755 -- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.80-hac47afa_0.conda - sha256: 3f67de8a9eb182fa20bbc80bda7185afb676cfe8894f6a0549173bd752a7d2f4 - md5: 7b42337a35cd887ec3eed254b5ed606f + size: 33616576 + timestamp: 1768272976976 +- conda: https://conda.anaconda.org/conda-forge/win-64/cuda-nvrtc-13.1.115-hac47afa_0.conda + sha256: a8869b7d997722f90b9f8a602dc0b1d0d497f2a6f3561dc89383aeb2cd379a66 + md5: 372d3c612a832d5f87d8dd9702d487b2 depends: - cuda-version >=13.1,<13.2.0a0 - ucrt >=10.0.20348.0 - vc >=14.3,<15 - vc14_runtime >=14.44.35208 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 31012754 - timestamp: 1764880740086 + size: 31006920 + timestamp: 1768273107962 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.1.80-h69a702a_0.conda sha256: 84f971ab146e2c822103cfe06f478ece244747a6f2aa565be639a4709d0a1579 md5: 9250c651d8758c8f665dff7519ef21ff @@ -3275,9 +3275,9 @@ packages: license_family: BSD size: 68079 timestamp: 1765819124349 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.0.49-hd07211c_0.conda - sha256: 6aabad84132b1f3ee367e5d24291febf8a11d9a7f3967a64fc07e77d9b0b22df - md5: 9cb68a85f8c08f0512931f944f6a75df +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 depends: - __glibc >=2.28,<3.0.a0 - cuda-version >=13.1,<13.2.0a0 @@ -3285,11 +3285,11 @@ packages: - libstdcxx >=14 - rdma-core >=60.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 990030 - timestamp: 1764881892686 -- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.0.49-hbf501ad_0.conda - sha256: d03963dc7708ded20340176ade987fc4c3e49da4f7b139a85e69ca7eb413f57a - md5: 315e1b144eaf890519fc63049b6e9228 + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcufile-1.16.1.26-hbf501ad_0.conda + sha256: 7451b3e2204e6cad21db501052dfe595c3440213ef3e22c0f9c784012f6a8419 + md5: ee60a24c702ce02de95ae1982c4841d8 depends: - __glibc >=2.28,<3.0.a0 - arm-variant * sbsa @@ -3300,8 +3300,8 @@ constrains: - arm-variant * sbsa license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 887547 - timestamp: 1764881951574 + size: 891752 + timestamp: 1768273724252 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680 md5: 6c77a605a7a689d17d4819c0f8ac9a00 diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index c575ad9bd0..9559f7697a 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -78,14 +78,14 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern cdef void** args_ptr = <void**>(ker_args.ptr) # TODO: cythonize Module/Kernel/... - # Note: CUfunction and CUkernel are interchangeable + # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to + # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable + # for kernel launch purposes. cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle)) - # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care - # about the CUfunction/CUkernel difference (which depends on whether the "old" or - # "new" module loading APIs are in use). 
We check both binding & driver versions here - # mainly to see if the "Ex" API is available and if so we use it, as it's more feature - # rich. + # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx). + # We check both binding & driver versions here mainly to see if the "Ex" API is + # available, and if so we use it, as it's more feature-rich. if _use_ex: drv_cfg = conf._to_native_launch_config() drv_cfg.hStream = as_cu(s._h_stream) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.py index fbea314406..dd3f4494d5 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -2,10 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 +import functools +import threading import weakref from collections import namedtuple from typing import Union -from warnings import warn from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config @@ -15,54 +16,109 @@ assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition -_backend = { - "old": { - "file": driver.cuModuleLoad, - "data": driver.cuModuleLoadDataEx, - "kernel": driver.cuModuleGetFunction, - "attribute": driver.cuFuncGetAttribute, - }, -} - - -# TODO: revisit this treatment for py313t builds +# Lazy initialization state and synchronization +# For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. +# For regular Python builds with the GIL, the lock overhead is minimal and the code remains safe. +_init_lock = threading.Lock() _inited = False _py_major_ver = None +_py_minor_ver = None _driver_ver = None _kernel_ctypes = None +_backend = {} def _lazy_init(): + """ + Initialize module-level state in a thread-safe manner. 
+ + This function is thread-safe and suitable for both: + - Regular Python builds (with GIL) + - Python 3.13t free-threaded builds (without GIL) + + Uses double-checked locking pattern for performance: + - Fast path: check without lock if already initialized + - Slow path: acquire lock and initialize if needed + """ global _inited + # Fast path: already initialized (no lock needed for read) if _inited: return - global _py_major_ver, _driver_ver, _kernel_ctypes - # binding availability depends on cuda-python version - _py_major_ver, _ = get_binding_version() - if _py_major_ver >= 12: - _backend["new"] = { + # Slow path: acquire lock and initialize + with _init_lock: + # Double-check: another thread might have initialized while we waited + if _inited: + return + + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend + # binding availability depends on cuda-python version + _py_major_ver, _py_minor_ver = get_binding_version() + _backend = { "file": driver.cuLibraryLoadFromFile, "data": driver.cuLibraryLoadData, "kernel": driver.cuLibraryGetKernel, "attribute": driver.cuKernelGetAttribute, } - _kernel_ctypes = (driver.CUfunction, driver.CUkernel) - else: - _kernel_ctypes = (driver.CUfunction,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _py_major_ver >= 12 and _driver_ver >= 12040: - _backend["new"]["paraminfo"] = driver.cuKernelGetParamInfo - _inited = True + _kernel_ctypes = (driver.CUkernel,) + _driver_ver = handle_return(driver.cuDriverGetVersion()) + if _driver_ver >= 12040: + _backend["paraminfo"] = driver.cuKernelGetParamInfo + + # Mark as initialized (must be last to ensure all state is set) + _inited = True + + +# Auto-initializing property accessors +def _get_py_major_ver(): + """Get the Python binding major version, initializing if needed.""" + _lazy_init() + return _py_major_ver + + +def _get_py_minor_ver(): + """Get the Python binding minor version, initializing if needed.""" + _lazy_init() + return _py_minor_ver + + +def _get_driver_ver(): + """Get the CUDA driver version, initializing if needed.""" + _lazy_init() + return _driver_ver + + +def _get_kernel_ctypes(): + """Get the kernel ctypes tuple, initializing if needed.""" + _lazy_init() + return _kernel_ctypes + + +@functools.cache +def _is_cukernel_get_library_supported() -> bool: + """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. + + Requires cuda-python bindings >= 12.5 and driver >= 12.5. + """ + return ( + (_get_py_major_ver(), _get_py_minor_ver()) >= (12, 5) + and _get_driver_ver() >= 12050 + and hasattr(driver, "cuKernelGetLibrary") + ) + + +def _make_dummy_library_handle(): + """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" + return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 class KernelAttributes: def __new__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.") - slots = ("_kernel", "_cache", "_backend_version", "_loader") + slots = ("_kernel", "_cache", "_loader") @classmethod def _init(cls, kernel): @@ -70,8 +126,9 @@ def _init(cls, kernel): self._kernel = weakref.ref(kernel) self._cache = {} - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend return self def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: @@ -84,15 +141,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - if self._backend_version == "new": - result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) - else: # "old" backend - warn( - "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ", - RuntimeWarning, - stacklevel=2, - ) - result = handle_return(self._loader["attribute"](attribute, kernel._handle)) + result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result @@ -197,7 +246,9 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - class KernelOccupancy: - """ """ + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ def __new__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.") @@ -378,7 +429,7 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert_type(obj, _kernel_ctypes) + assert_type(obj, _get_kernel_ctypes()) assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj @@ -396,12 +447,11 @@ def attributes(self) -> KernelAttributes: def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: attr_impl = self.attributes - if attr_impl._backend_version != "new": - raise NotImplementedError("New backend is required") if "paraminfo" not in attr_impl._loader: + driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " - f"Using driver version {_driver_ver // 1000}.{(_driver_ver % 1000) // 10}" + f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) arg_pos = 0 param_info_data = [] @@ -436,7 +486,46 @@ def occupancy(self) -> KernelOccupancy: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy - # TODO: implement from_handle() + @staticmethod + def from_handle(handle: int, mod: "ObjectCode" = None) -> "Kernel": + """Creates a new :obj:`Kernel` object from a foreign kernel handle. + + Uses a CUkernel pointer address to create a new :obj:`Kernel` object. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. If not provided, + a placeholder ObjectCode will be created. Note that without a proper + ObjectCode, certain operations may be limited. 
+ """ + + # Validate that handle is an integer + if not isinstance(handle, int): + raise TypeError(f"handle must be an integer, got {type(handle).__name__}") + + # Convert the integer handle to CUkernel driver type + kernel_obj = driver.CUkernel(handle) + + # If no module provided, create a placeholder + if mod is None: + # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via + # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall + # back to a non-null dummy handle purely to disable lazy loading. + mod = ObjectCode._init(b"", "cubin") + if _is_cukernel_get_library_supported(): + try: + mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) + except (CUDAError, RuntimeError): + # Best-effort: don't fail construction if inverse lookup fails. + mod._handle = _make_dummy_library_handle() + else: + mod._handle = _make_dummy_library_handle() + + return Kernel._from_obj(kernel_obj, mod) CodeTypeT = Union[bytes, bytearray, str] @@ -454,14 +543,9 @@ class ObjectCode: like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly from all other possible code types should be avoided in favor of compilation through :class:`~cuda.core.Program` - - Note - ---- - Usage under CUDA 11.x will only load to the current device - context. """ - __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map", "_name") + __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") def __new__(self, *args, **kwargs): @@ -474,13 +558,13 @@ def __new__(self, *args, **kwargs): def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): self = super().__new__(cls) assert code_type in self._supported_code_type, f"{code_type=} is not supported" - _lazy_init() # handle is assigned during _lazy_load self._handle = None - self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old" - self._loader = _backend[self._backend_version] + # Ensure backend is initialized before setting loader + _lazy_init() + self._loader = _backend self._code_type = code_type self._module = module @@ -613,16 +697,10 @@ def _lazy_load_module(self, *args, **kwargs): module = self._module assert_type_str_or_bytes_like(module) if isinstance(module, str): - if self._backend_version == "new": - self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["file"](module.encode())) + self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) return if isinstance(module, (bytes, bytearray)): - if self._backend_version == "new": - self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) - else: # "old" backend - self._handle = handle_return(self._loader["data"](module, 0, [], [])) + self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) return raise_code_path_meant_to_be_unreachable() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 4b3817ece4..f9bbcd3e4c 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,7 +7,7 @@ import cuda.core import pytest -from cuda.core import Device, ObjectCode, Program, ProgramOptions +from cuda.core import Device, Kernel, ObjectCode, Program, ProgramOptions from cuda.core._utils.cuda_utils import CUDAError, driver, 
get_binding_version, handle_return try: @@ -420,3 +420,91 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert objcode.code == result.code assert objcode._sym_map == result._sym_map assert objcode.code_type == result.code_type + + +def test_kernel_from_handle(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with a valid handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle + kernel_from_handle = Kernel.from_handle(handle, objcode) + assert isinstance(kernel_from_handle, Kernel) + + # Verify we can access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() without providing a module""" + original_kernel, _ = get_saxpy_kernel_cubin + + # Get the handle from the original kernel + handle = int(original_kernel._handle) + + # Create a new Kernel from the handle without a module + # This is supported on CUDA 12+ backend (CUkernel) + kernel_from_handle = Kernel.from_handle(handle) + assert isinstance(kernel_from_handle, Kernel) + + # Verify we can still access kernel attributes + max_threads = kernel_from_handle.attributes.max_threads_per_block() + assert isinstance(max_threads, int) + assert max_threads > 0 + + +@pytest.mark.parametrize( + "invalid_value", + [ + pytest.param("not_an_int", id="str"), + pytest.param(2.71828, id="float"), + pytest.param(None, id="None"), + pytest.param({"handle": 123}, id="dict"), + pytest.param([456], id="list"), + pytest.param((789,), id="tuple"), + pytest.param(3 + 4j, id="complex"), + pytest.param(b"\xde\xad\xbe\xef", id="bytes"), + pytest.param({999}, id="set"), + pytest.param(object(), id="object"), + ], +) +def test_kernel_from_handle_type_validation(invalid_value): + """Test Kernel.from_handle() with wrong handle types""" + with pytest.raises(TypeError): + Kernel.from_handle(invalid_value) + + +def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): + """Test Kernel.from_handle() with invalid module parameter""" + original_kernel, _ = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Invalid module type (should fail type assertion in _from_obj) + with pytest.raises((TypeError, AssertionError)): + Kernel.from_handle(handle, mod="not_an_objectcode") + + with pytest.raises((TypeError, AssertionError)): + Kernel.from_handle(handle, mod=12345) + + +def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): + """Test creating multiple Kernel instances from the same handle""" + original_kernel, objcode = get_saxpy_kernel_cubin + handle = int(original_kernel._handle) + + # Create multiple Kernel instances from the same handle + kernel1 = Kernel.from_handle(handle, objcode) + kernel2 = Kernel.from_handle(handle, objcode) + kernel3 = Kernel.from_handle(handle, objcode) + + # All should be valid Kernel objects + assert isinstance(kernel1, Kernel) + assert isinstance(kernel2, Kernel) + assert isinstance(kernel3, Kernel) + + # All should reference the same underlying CUDA kernel handle + assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle
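
Example usage (an illustrative sketch, not part of the patch): the snippet below round-trips a kernel through the new Kernel.from_handle API, mirroring the test fixtures above. The SAXPY source and all variable names are assumptions chosen for demonstration only.

# Illustrative sketch: round-tripping a kernel through Kernel.from_handle.
from cuda.core import Device, Kernel, Program, ProgramOptions

code = r'''
extern "C" __global__
void saxpy(float a, const float* x, const float* y, float* out, size_t n) {
    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = a * x[tid] + y[tid];
    }
}
'''

device = Device()
device.set_current()
arch = "".join(f"{i}" for i in device.compute_capability)
program = Program(code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}"))
objcode = program.compile("cubin")
kernel = objcode.get_kernel("saxpy")

# Re-wrap the raw CUkernel address. Passing the owning ObjectCode keeps the
# loaded library alive and all Kernel operations available.
clone = Kernel.from_handle(int(kernel._handle), objcode)
assert clone.attributes.max_threads_per_block() > 0

# Without a module, from_handle recovers the owning library via
# cuKernelGetLibrary (bindings and driver >= 12.5) or falls back to a
# placeholder handle, per the _module.py implementation above.
orphan = Kernel.from_handle(int(kernel._handle))
assert orphan.attributes.max_threads_per_block() > 0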