diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd new file mode 100644 index 0000000000..ac84743a30 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from libcpp cimport vector + + +cdef class ParamHolder: + + cdef: + vector.vector[void*] data + vector.vector[void*] data_addresses + object kernel_args + readonly intptr_t ptr diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index 4cac74a25f..e805b8ad66 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -250,12 +250,6 @@ cdef inline int prepare_numpy_arg( cdef class ParamHolder: - cdef: - vector.vector[void*] data - vector.vector[void*] data_addresses - object kernel_args - readonly intptr_t ptr - def __init__(self, kernel_args): if len(kernel_args) == 0: self.ptr = 0 diff --git a/cuda_core/cuda/core/experimental/_launch_config.pxd b/cuda_core/cuda/core/experimental/_launch_config.pxd index 90e2a5b606..eeb8ce41e7 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pxd +++ b/cuda_core/cuda/core/experimental/_launch_config.pxd @@ -2,18 +2,23 @@ # # SPDX-License-Identifier: Apache-2.0 +from libcpp.vector cimport vector -cdef bint _inited -cdef bint _use_ex +from cuda.bindings cimport cydriver -cdef void _lazy_init() except * cdef class LaunchConfig: """Customizable launch options.""" - cdef public tuple grid - cdef public tuple cluster - cdef public tuple block - cdef public int shmem_size - cdef public bint cooperative_launch + cdef: + public tuple grid + public tuple cluster + public tuple block + public int shmem_size + public bint 
cooperative_launch + + vector[cydriver.CUlaunchAttribute] _attrs + + cdef cydriver.CUlaunchConfig _to_native_launch_config(self) + cpdef object _to_native_launch_config(LaunchConfig config) diff --git a/cuda_core/cuda/core/experimental/_launch_config.pyx b/cuda_core/cuda/core/experimental/_launch_config.pyx index 7d6a1ab2b9..1f9de3f999 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pyx +++ b/cuda_core/cuda/core/experimental/_launch_config.pyx @@ -2,34 +2,44 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.core.experimental._utils.cuda_utils cimport ( + HANDLE_RETURN, +) + +import threading + from cuda.core.experimental._device import Device from cuda.core.experimental._utils.cuda_utils import ( CUDAError, cast_to_3_tuple, driver, get_binding_version, - handle_return, ) -# TODO: revisit this treatment for py313t builds + cdef bint _inited = False cdef bint _use_ex = False +cdef object _lock = threading.Lock() -cdef void _lazy_init() except *: - """Initialize module-level globals for driver version checks.""" +cdef int _lazy_init() except?-1: global _inited, _use_ex if _inited: - return + return 0 cdef tuple _py_major_minor cdef int _driver_ver + with _lock: + if _inited: + return 0 + + # binding availability depends on cuda-python version + _py_major_minor = get_binding_version() + HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver)) + _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) + _inited = True - # binding availability depends on cuda-python version - _py_major_minor = get_binding_version() - _driver_ver = handle_return(driver.cuDriverGetVersion()) - _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) - _inited = True + return 0 cdef class LaunchConfig: @@ -127,7 +137,42 @@ cdef class LaunchConfig: f"block={self.block}, shmem_size={self.shmem_size}, " f"cooperative_launch={self.cooperative_launch})") + cdef cydriver.CUlaunchConfig _to_native_launch_config(self): + _lazy_init() + # TODO: memset to zero? 
+ cdef cydriver.CUlaunchConfig drv_cfg + cdef cydriver.CUlaunchAttribute attr + self._attrs.resize(0) + + # Handle grid dimensions and cluster configuration + if self.cluster is not None: + # Convert grid from cluster units to block units + drv_cfg.gridDimX = self.grid[0] * self.cluster[0] + drv_cfg.gridDimY = self.grid[1] * self.cluster[1] + drv_cfg.gridDimZ = self.grid[2] * self.cluster[2] + + # Set up cluster attribute + attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster + self._attrs.push_back(attr) + else: + drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = self.grid + + drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block + drv_cfg.sharedMemBytes = self.shmem_size + + if self.cooperative_launch: + attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE + attr.value.cooperative = 1 + self._attrs.push_back(attr) + + drv_cfg.numAttrs = self._attrs.size() + drv_cfg.attrs = self._attrs.data() + + return drv_cfg + +# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above cpdef object _to_native_launch_config(LaunchConfig config): """Convert LaunchConfig to native driver CUlaunchConfig. diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/experimental/_launcher.pyx index 2cba15cbf4..0e1b9a7d4b 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -1,38 +1,51 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream cimport Stream_accept +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver + +from cuda.core.experimental._launch_config cimport LaunchConfig +from cuda.core.experimental._kernel_arg_handler cimport ParamHolder +from cuda.core.experimental._stream cimport Stream_accept, Stream +from cuda.core.experimental._utils.cuda_utils cimport ( + check_or_create_options, + HANDLE_RETURN, +) + +import threading -from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._module import Kernel from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, - check_or_create_options, - driver, get_binding_version, - handle_return, ) -# TODO: revisit this treatment for py313t builds -_inited = False -_use_ex = None + +cdef bint _inited = False +cdef bint _use_ex = False +cdef object _lock = threading.Lock() -def _lazy_init(): - global _inited +cdef int _lazy_init() except?-1: + global _inited, _use_ex if _inited: - return + return 0 + + cdef int _driver_ver + with _lock: + if _inited: + return 0 - global _use_ex - # binding availability depends on cuda-python version - _py_major_minor = get_binding_version() - _driver_ver = handle_return(driver.cuDriverGetVersion()) - _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) - _inited = True + # binding availability depends on cuda-python version + _py_major_minor = get_binding_version() + HANDLE_RETURN(cydriver.cuDriverGetVersion(&_driver_ver)) + _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) + _inited = True + + return 0 def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args): @@ -54,15 +67,18 @@ def 
launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern launching kernel. """ - stream = Stream_accept(stream, allow_stream_protocol=True) - assert_type(kernel, Kernel) + cdef Stream s = Stream_accept(stream, allow_stream_protocol=True) _lazy_init() - config = check_or_create_options(LaunchConfig, config, "launch config") + cdef LaunchConfig conf = check_or_create_options(LaunchConfig, config, "launch config") # TODO: can we ensure kernel_args is valid/safe to use here? # TODO: merge with HelperKernelParams? - kernel_args = ParamHolder(kernel_args) - args_ptr = kernel_args.ptr + cdef ParamHolder ker_args = ParamHolder(kernel_args) + cdef void** args_ptr = <void**>(ker_args.ptr) + + # TODO: cythonize Module/Kernel/... + # Note: CUfunction and CUkernel are interchangeable + cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel._handle)) # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care # about the CUfunction/CUkernel difference (which depends on whether the "old" or @@ -70,16 +86,20 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern # mainly to see if the "Ex" API is available and if so we use it, as it's more feature # rich. if _use_ex: - drv_cfg = _to_native_launch_config(config) - drv_cfg.hStream = stream.handle - if config.cooperative_launch: - _check_cooperative_launch(kernel, config, stream) - handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) + drv_cfg = conf._to_native_launch_config() + drv_cfg.hStream = s._handle + if conf.cooperative_launch: + _check_cooperative_launch(kernel, conf, s) + with nogil: + HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL)) else: # TODO: check if config has any unsupported attrs - handle_return( - driver.cuLaunchKernel( - int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0 + HANDLE_RETURN( + cydriver.cuLaunchKernel( + func_handle, + conf.grid[0], conf.grid[1], conf.grid[2], + conf.block[0], conf.block[1], conf.block[2], + conf.shmem_size, s._handle, args_ptr, NULL ) )