diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd new file mode 100644 index 0000000000..ac84743a30 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from libcpp cimport vector + + +cdef class ParamHolder: + + cdef: + vector.vector[void*] data + vector.vector[void*] data_addresses + object kernel_args + readonly intptr_t ptr diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index 4cac74a25f..e805b8ad66 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -250,12 +250,6 @@ cdef inline int prepare_numpy_arg( cdef class ParamHolder: - cdef: - vector.vector[void*] data - vector.vector[void*] data_addresses - object kernel_args - readonly intptr_t ptr - def __init__(self, kernel_args): if len(kernel_args) == 0: self.ptr = 0 diff --git a/cuda_core/cuda/core/experimental/_launch_config.pxd b/cuda_core/cuda/core/experimental/_launch_config.pxd index 90e2a5b606..eeb8ce41e7 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pxd +++ b/cuda_core/cuda/core/experimental/_launch_config.pxd @@ -2,18 +2,23 @@ # # SPDX-License-Identifier: Apache-2.0 +from libcpp.vector cimport vector -cdef bint _inited -cdef bint _use_ex +from cuda.bindings cimport cydriver -cdef void _lazy_init() except * cdef class LaunchConfig: """Customizable launch options.""" - cdef public tuple grid - cdef public tuple cluster - cdef public tuple block - cdef public int shmem_size - cdef public bint cooperative_launch + cdef: + public tuple grid + public tuple cluster + public tuple block + public int shmem_size + public bint 
cooperative_launch + + vector[cydriver.CUlaunchAttribute] _attrs + + cdef cydriver.CUlaunchConfig _to_native_launch_config(self) + cpdef object _to_native_launch_config(LaunchConfig config) diff --git a/cuda_core/cuda/core/experimental/_launch_config.pyx b/cuda_core/cuda/core/experimental/_launch_config.pyx index 7d6a1ab2b9..1f9de3f999 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pyx +++ b/cuda_core/cuda/core/experimental/_launch_config.pyx @@ -2,34 +2,44 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.core.experimental._utils.cuda_utils cimport ( + HANDLE_RETURN, +) + +import threading + from cuda.core.experimental._device import Device from cuda.core.experimental._utils.cuda_utils import ( CUDAError, cast_to_3_tuple, driver, get_binding_version, - handle_return, ) -# TODO: revisit this treatment for py313t builds + cdef bint _inited = False cdef bint _use_ex = False +cdef object _lock = threading.Lock() -cdef void _lazy_init() except *: - """Initialize module-level globals for driver version checks.""" +cdef int _lazy_init() except?-1: global _inited, _use_ex if _inited: - return + return 0 cdef tuple _py_major_minor cdef int _driver_ver + with _lock: + if _inited: + return 0 + + # binding availability depends on cuda-python version + _py_major_minor = get_binding_version() + HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver)) + _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) + _inited = True - # binding availability depends on cuda-python version - _py_major_minor = get_binding_version() - _driver_ver = handle_return(driver.cuDriverGetVersion()) - _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) - _inited = True + return 0 cdef class LaunchConfig: @@ -127,7 +137,42 @@ cdef class LaunchConfig: f"block={self.block}, shmem_size={self.shmem_size}, " f"cooperative_launch={self.cooperative_launch})") + cdef cydriver.CUlaunchConfig _to_native_launch_config(self): + _lazy_init() + # TODO: memset to zero? 
+ cdef cydriver.CUlaunchConfig drv_cfg + cdef cydriver.CUlaunchAttribute attr + self._attrs.resize(0) + + # Handle grid dimensions and cluster configuration + if self.cluster is not None: + # Convert grid from cluster units to block units + drv_cfg.gridDimX = self.grid[0] * self.cluster[0] + drv_cfg.gridDimY = self.grid[1] * self.cluster[1] + drv_cfg.gridDimZ = self.grid[2] * self.cluster[2] + + # Set up cluster attribute + attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster + self._attrs.push_back(attr) + else: + drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid + + drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block + drv_cfg.sharedMemBytes = self.shmem_size + + if self.cooperative_launch: + attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE + attr.value.cooperative = 1 + self._attrs.push_back(attr) + + drv_cfg.numAttrs = self._attrs.size() + drv_cfg.attrs = self._attrs.data() + + return drv_cfg + +# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above cpdef object _to_native_launch_config(LaunchConfig config): """Convert LaunchConfig to native driver CUlaunchConfig. diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index 2cba15cbf4..0e1b9a7d4b 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -1,38 +1,51 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream cimport Stream_accept +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver + +from cuda.core.experimental._launch_config cimport LaunchConfig +from cuda.core.experimental._kernel_arg_handler cimport ParamHolder +from cuda.core.experimental._stream cimport Stream_accept, Stream +from cuda.core.experimental._utils.cuda_utils cimport ( + check_or_create_options, + HANDLE_RETURN, +) + +import threading -from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._module import Kernel from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, - check_or_create_options, - driver, get_binding_version, - handle_return, ) -# TODO: revisit this treatment for py313t builds -_inited = False -_use_ex = None + +cdef bint _inited = False +cdef bint _use_ex = False +cdef object _lock = threading.Lock() -def _lazy_init(): - global _inited +cdef int _lazy_init() except?-1: + global _inited, _use_ex if _inited: - return + return 0 + + cdef int _driver_ver + with _lock: + if _inited: + return 0 - global _use_ex - # binding availability depends on cuda-python version - _py_major_minor = get_binding_version() - _driver_ver = handle_return(driver.cuDriverGetVersion()) - _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) - _inited = True + # binding availability depends on cuda-python version + _py_major_minor = get_binding_version() + HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver)) + _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) + _inited = True + + return 0 def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): @@ -54,15 +67,18 @@ def 
launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern launching kernel. """ - stream = Stream_accept(stream, allow_stream_protocol=True) - assert_type(kernel, Kernel) + cdef Stream s = Stream_accept(stream, allow_stream_protocol=True) _lazy_init() - config = check_or_create_options(LaunchConfig, config, "launch config") + cdef LaunchConfig conf = check_or_create_options(LaunchConfig, config, "launch config") # TODO: can we ensure kernel_args is valid/safe to use here? # TODO: merge with HelperKernelParams? - kernel_args = ParamHolder(kernel_args) - args_ptr = kernel_args.ptr + cdef ParamHolder ker_args = ParamHolder(kernel_args) + cdef void** args_ptr = <void**>(ker_args.ptr) + + # TODO: cythonize Module/Kernel/... + # Note: CUfunction and CUkernel are interchangeable + cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle)) # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care # about the CUfunction/CUkernel difference (which depends on whether the "old" or @@ -70,16 +86,20 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern # mainly to see if the "Ex" API is available and if so we use it, as it's more feature # rich. if _use_ex: - drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream.handle - if config.cooperative_launch: - _check_cooperative_launch(kernel, config, stream) - handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) + drv_cfg = conf._to_native_launch_config() + drv_cfg.hStream = s._handle + if conf.cooperative_launch: + _check_cooperative_launch(kernel, conf, s) + with nogil: + HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL)) else: # TODO: check if config has any unsupported attrs - handle_return( - driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 + HANDLE_RETURN( + cydriver.cuLaunchKernel( + func_handle, + conf.grid[0], conf.grid[1], conf.grid[2], + conf.block[0], conf.block[1], conf.block[2], + conf.shmem_size, s._handle, args_ptr, NULL ) )