15 changes: 15 additions & 0 deletions cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from libc.stdint cimport intptr_t
from libcpp cimport vector


cdef class ParamHolder:

cdef:
vector.vector[void*] data
vector.vector[void*] data_addresses
object kernel_args
readonly intptr_t ptr
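
These `.pxd` declarations are what allow other Cython modules to cimport `ParamHolder` and reach its fields at C level, as `_launcher.pyx` does later in this diff. A minimal sketch of such a consumer (the helper name is made up for illustration):

```cython
# Illustrative consumer module: C-level access to ParamHolder's fields,
# enabled by the .pxd declarations above.
from libc.stdint cimport uintptr_t

from cuda.core.experimental._kernel_arg_handler cimport ParamHolder

cdef void** as_argv(ParamHolder holder):
    # holder.ptr is exposed as a readable intptr_t; cast it back to the
    # void** form that the driver launch APIs expect.
    return <void**><uintptr_t>holder.ptr
```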
6 changes: 0 additions & 6 deletions cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
@@ -250,12 +250,6 @@ cdef inline int prepare_numpy_arg(

cdef class ParamHolder:

cdef:
vector.vector[void*] data
vector.vector[void*] data_addresses
object kernel_args
readonly intptr_t ptr

def __init__(self, kernel_args):
if len(kernel_args) == 0:
self.ptr = 0
21 changes: 13 additions & 8 deletions cuda_core/cuda/core/experimental/_launch_config.pxd
@@ -2,18 +2,23 @@
#
# SPDX-License-Identifier: Apache-2.0

from libcpp.vector cimport vector

cdef bint _inited
cdef bint _use_ex
from cuda.bindings cimport cydriver

cdef void _lazy_init() except *

cdef class LaunchConfig:
"""Customizable launch options."""
cdef public tuple grid
cdef public tuple cluster
cdef public tuple block
cdef public int shmem_size
cdef public bint cooperative_launch
cdef:
public tuple grid
public tuple cluster
public tuple block
public int shmem_size
public bint cooperative_launch
Comment on lines +12 to +17
Member Author


Leaving a quick note here in case I forget.

I wasn't super happy about this PR, and it's why I was on the fence about pushing it forward: our design of LaunchConfig allows reusing the Python object across multiple launches, but it is so flexible that we pay the price of maintaining the public attributes (both readable and writable in Python) and the Python overhead that comes with them. If we don't think this is reasonable, we should break it before GA and turn LaunchConfig into an immutable object.

Collaborator


In theory we could store the native types and have getter / setter properties that translate to/from tuples as needed? I agree we should avoid paying the Python overhead here if possible.

Member Author


Yes. Though in the case of launch config here, we essentially are wrapping a struct with a flexible array member (the attributes array can be arbitrarily long), which is annoying.

Last night I was thinking about thread safety. But in the present case we still do not offer thread safety anyway (e.g., when two threads set the grid member at the same time). Something to think about in #1389.
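
A minimal Cython sketch of the getter/setter idea floated above — the storage layout and names are hypothetical, not what this PR implements:

```cython
# Hypothetical: keep dims in native C storage; build tuples only on demand.
cdef class _LaunchConfigSketch:
    cdef unsigned int _grid[3]

    @property
    def grid(self):
        # Tuple is constructed per access; no Python objects held per field.
        return (self._grid[0], self._grid[1], self._grid[2])

    @grid.setter
    def grid(self, value):
        x, y, z = value  # assumes a 3-sequence; real code would validate
        self._grid[0], self._grid[1], self._grid[2] = x, y, z
```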


vector[cydriver.CUlaunchAttribute] _attrs

cdef cydriver.CUlaunchConfig _to_native_launch_config(self)


cpdef object _to_native_launch_config(LaunchConfig config)
65 changes: 55 additions & 10 deletions cuda_core/cuda/core/experimental/_launch_config.pyx
@@ -2,34 +2,44 @@
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._utils.cuda_utils cimport (
HANDLE_RETURN,
)

import threading

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
driver,
get_binding_version,
handle_return,
)

# TODO: revisit this treatment for py313t builds

cdef bint _inited = False
cdef bint _use_ex = False
cdef object _lock = threading.Lock()


cdef void _lazy_init() except *:
"""Initialize module-level globals for driver version checks."""
cdef int _lazy_init() except?-1:
global _inited, _use_ex
if _inited:
return
return 0

cdef tuple _py_major_minor
cdef int _driver_ver
with _lock:
if _inited:
return 0

# binding availability depends on cuda-python version
_py_major_minor = get_binding_version()
HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver))
_use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
_inited = True

# binding availability depends on cuda-python version
_py_major_minor = get_binding_version()
_driver_ver = handle_return(driver.cuDriverGetVersion())
_use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
_inited = True
return 0
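
The lock plus re-check above is the standard double-checked locking idiom: the unguarded first check keeps the post-init fast path lock-free, while the second check under the lock ensures only one thread runs the setup. Stripped of the CUDA specifics, the shape is:

```python
import threading

_lock = threading.Lock()
_inited = False

def _lazy_init_sketch():
    global _inited
    if _inited:           # fast path: no lock once initialized
        return
    with _lock:
        if _inited:       # re-check: another thread may have finished first
            return
        # ... one-time setup goes here ...
        _inited = True
```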


cdef class LaunchConfig:
@@ -127,7 +137,42 @@ cdef class LaunchConfig:
f"block={self.block}, shmem_size={self.shmem_size}, "
f"cooperative_launch={self.cooperative_launch})")

cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
_lazy_init()
# TODO: memset to zero?
cdef cydriver.CUlaunchConfig drv_cfg
cdef cydriver.CUlaunchAttribute attr
self._attrs.resize(0)

# Handle grid dimensions and cluster configuration
if self.cluster is not None:
# Convert grid from cluster units to block units
drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

# Set up cluster attribute
attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
self._attrs.push_back(attr)
else:
drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid

drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block
drv_cfg.sharedMemBytes = self.shmem_size

if self.cooperative_launch:
attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
attr.value.cooperative = 1
self._attrs.push_back(attr)

drv_cfg.numAttrs = self._attrs.size()
drv_cfg.attrs = self._attrs.data()

return drv_cfg
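
To make the cluster-unit scaling above concrete, a small worked example with made-up values:

```python
# When cluster is set, grid counts clusters; the driver wants blocks,
# so each grid dim is multiplied by the matching cluster dim.
grid = (4, 2, 1)      # 4 x 2 x 1 clusters
cluster = (2, 2, 1)   # each cluster spans 2 x 2 x 1 blocks

grid_in_blocks = tuple(g * c for g, c in zip(grid, cluster))
print(grid_in_blocks)  # (8, 4, 1) -> gridDimX/Y/Z passed to the driver
```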


# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
cpdef object _to_native_launch_config(LaunchConfig config):
"""Convert LaunchConfig to native driver CUlaunchConfig.

84 changes: 52 additions & 32 deletions cuda_core/cuda/core/experimental/_launcher.pyx
@@ -1,38 +1,51 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
from cuda.core.experimental._stream cimport Stream_accept

from libc.stdint cimport uintptr_t

from cuda.bindings cimport cydriver

from cuda.core.experimental._launch_config cimport LaunchConfig
from cuda.core.experimental._kernel_arg_handler cimport ParamHolder
from cuda.core.experimental._stream cimport Stream_accept, Stream
from cuda.core.experimental._utils.cuda_utils cimport (
check_or_create_options,
HANDLE_RETURN,
)

import threading

from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
_reduce_3_tuple,
check_or_create_options,
driver,
get_binding_version,
handle_return,
)

# TODO: revisit this treatment for py313t builds
_inited = False
_use_ex = None

cdef bint _inited = False
cdef bint _use_ex = False
cdef object _lock = threading.Lock()


def _lazy_init():
global _inited
cdef int _lazy_init() except?-1:
global _inited, _use_ex
if _inited:
return
return 0

cdef int _driver_ver
with _lock:
if _inited:
return 0

global _use_ex
# binding availability depends on cuda-python version
_py_major_minor = get_binding_version()
_driver_ver = handle_return(driver.cuDriverGetVersion())
_use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
_inited = True
# binding availability depends on cuda-python version
_py_major_minor = get_binding_version()
HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver))
_use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
_inited = True

return 0
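
The switch from `except *` to `except?-1` here (and in _launch_config.pyx above) follows Cython's exception-propagation convention for C-level functions: on a raised exception the function returns -1 and sets the Python error indicator, and the `?` marks -1 as an ambiguous value that callers disambiguate via that indicator. A standalone illustration:

```cython
# Illustrative only. With "except? -1", a raise makes the function return -1
# with the Python error state set; a legitimate -1 return is told apart by
# checking the error indicator, which Cython-generated callers do for us.
cdef int might_fail(int x) except? -1:
    if x < 0:
        raise ValueError("x must be non-negative")
    return x
```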


def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
@@ -54,32 +67,39 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
launching kernel.

"""
stream = Stream_accept(stream, allow_stream_protocol=True)
assert_type(kernel, Kernel)
cdef Stream s = Stream_accept(stream, allow_stream_protocol=True)
_lazy_init()
config = check_or_create_options(LaunchConfig, config, "launch config")
cdef LaunchConfig conf = check_or_create_options(LaunchConfig, config, "launch config")

# TODO: can we ensure kernel_args is valid/safe to use here?
# TODO: merge with HelperKernelParams?
kernel_args = ParamHolder(kernel_args)
args_ptr = kernel_args.ptr
cdef ParamHolder ker_args = ParamHolder(kernel_args)
cdef void** args_ptr = <void**><uintptr_t>(ker_args.ptr)

# TODO: cythonize Module/Kernel/...
# Note: CUfunction and CUkernel are interchangeable
cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle))

# Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
# about the CUfunction/CUkernel difference (which depends on whether the "old" or
# "new" module loading APIs are in use). We check both binding & driver versions here
# mainly to see if the "Ex" API is available and if so we use it, as it's more feature
# rich.
if _use_ex:
drv_cfg = _to_native_launch_config(config)
drv_cfg.hStream = stream.handle
if config.cooperative_launch:
_check_cooperative_launch(kernel, config, stream)
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
drv_cfg = conf._to_native_launch_config()
drv_cfg.hStream = s._handle
if conf.cooperative_launch:
_check_cooperative_launch(kernel, conf, s)
with nogil:
HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL))
else:
# TODO: check if config has any unsupported attrs
handle_return(
driver.cuLaunchKernel(
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
HANDLE_RETURN(
cydriver.cuLaunchKernel(
func_handle,
conf.grid[0], conf.grid[1], conf.grid[2],
conf.block[0], conf.block[1], conf.block[2],
conf.shmem_size, s._handle, args_ptr, NULL
)
)
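
For context, a typical call into this launch path might look like the following sketch. It follows the patterns in cuda.core's published examples; the kernel source and names are invented, and details may vary across versions:

```python
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Hypothetical no-op kernel, compiled via NVRTC.
code = r'extern "C" __global__ void noop() {}'
cc = dev.compute_capability
prog = Program(code, code_type="c++",
               options=ProgramOptions(arch=f"sm_{cc.major}{cc.minor}"))
kernel = prog.compile("cubin").get_kernel("noop")

config = LaunchConfig(grid=(1, 1, 1), block=(32, 1, 1), shmem_size=0)
launch(stream, config, kernel)  # dispatches via cuLaunchKernelEx when available
stream.sync()
```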
