Closed

Commits (30)
f16dc1b
Move all files from cuda/core/experimental/ to cuda/core/
rwgk Dec 2, 2025
77445b3
Update all internal imports from cuda.core.experimental.* to cuda.core.*
rwgk Dec 2, 2025
3c79ca6
Update cuda/core/__init__.py to export all moved symbols
rwgk Dec 2, 2025
b54bf6a
Create forwarding stubs in experimental/__init__.py with deprecation …
rwgk Dec 2, 2025
411bbd7
Update test imports to use new cuda.core.* paths
rwgk Dec 2, 2025
29e31df
Update Cython test file to use new import paths
rwgk Dec 2, 2025
b7cb72b
Add backward compatibility tests for experimental stubs
rwgk Dec 2, 2025
55a50e6
Update example files to use new cuda.core.* imports
rwgk Dec 2, 2025
b2f28e9
Fix build_hooks.py to use new cuda.core.* paths
rwgk Dec 2, 2025
45c92f5
Fix _context.pyx to use new import path
rwgk Dec 2, 2025
b7c0cc0
Fix _memory/_legacy.py to use new import path
rwgk Dec 2, 2025
d47b29d
Fix _memory/_virtual_memory_resource.py to use new import path
rwgk Dec 2, 2025
2e0da66
Fix test helpers to use new cuda.core.* import paths
rwgk Dec 2, 2025
c414b7d
Fix test files to use new cuda.core._* import paths
rwgk Dec 2, 2025
d582448
Fix experimental.utils submodule access and test_device_id
rwgk Dec 2, 2025
3cc287c
Fix test_experimental_direct_imports to handle module caching
rwgk Dec 2, 2025
a040505
Merge main into cuda_core_experimental_deprecation_v0
rwgk Dec 2, 2025
4f34fdd
Address review comments: use pytest.deprecated_call and clarify warni…
rwgk Dec 2, 2025
6718a70
pre-commit fixes (automatic, trivial)
rwgk Dec 2, 2025
506b912
Fix import path in test_memory_peer_access.py
rwgk Dec 2, 2025
dfa0f0e
Remove experimental namespace from remaining test files
rwgk Dec 2, 2025
64a2b52
Merge main into cuda_core_experimental_deprecation_v0
rwgk Dec 9, 2025
5328873
Merge main into cuda_core_experimental_deprecation_v0
rwgk Dec 10, 2025
11f52c7
Merge main into cuda_core_experimental_deprecation_v0
rwgk Dec 15, 2025
4307b5d
Migrate StridedLayout from experimental to main namespace
rwgk Dec 15, 2025
4b9796b
Sort __getattr__ name list alphabetically and remove one duplicate (_…
rwgk Dec 15, 2025
798ce3d
Remove redundant TYPE_CHECKING block from experimental/__init__.py
rwgk Dec 15, 2025
09a78cc
Update references from cuda.core.experimental to cuda.core
rwgk Dec 15, 2025
516e088
Fix merge script to preserve Python modules in cuda/core
rwgk Dec 16, 2025
475a1df
Fix recursion in merge script by excluding versioned directories
rwgk Dec 16, 2025
6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -52,7 +52,7 @@ body:
attributes:
label: Describe the bug
description: A clear and concise description of what problem you are running into.
placeholder: "Attempting to compile a program via `cuda.core.experimental.Program.compile` throws a `ValueError`."
placeholder: "Attempting to compile a program via `cuda.core.Program.compile` throws a `ValueError`."
validations:
required: true

@@ -62,7 +62,7 @@ body:
label: How to Reproduce
description: Steps used to reproduce the bug.
placeholder: |
0. Construct a `cuda.core.experimental.Program` instance
0. Construct a `cuda.core.Program` instance
1. Call the `.compile(...)` method of the instance
2. The call throws a `ValueError` with the following:
```
@@ -76,7 +76,7 @@
attributes:
label: Expected behavior
description: A clear and concise description of what you expected to happen.
placeholder: "Using `cuda.core.experimental.Program.compile(...)` should run successfully and not throw a `ValueError`"
placeholder: "Using `cuda.core.Program.compile(...)` should run successfully and not throw a `ValueError`"
validations:
required: true

6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/feature_request.yml
@@ -36,7 +36,7 @@ body:
attributes:
label: Is your feature request related to a problem? Please describe.
description: A clear and concise description of what the problem is, e.g., "I would like to be able to..."
placeholder: I would like to be able to use the equivalent of `cuda.core.experimental.Program.compile(...)` to compile my code to PTX.
placeholder: I would like to be able to use the equivalent of `cuda.core.Program.compile(...)` to compile my code to PTX.
validations:
required: true

@@ -46,7 +46,7 @@ body:
label: Describe the solution you'd like
description: A clear and concise description of what you want to happen.
placeholder: |
Support a `ptx` target_type in the `cuda.core.experimental.Program.compile(...)` function.
Support a `ptx` target_type in the `cuda.core.Program.compile(...)` function.
validations:
required: true

@@ -57,7 +57,7 @@
description:
If applicable, please add a clear and concise description of any alternative solutions or features you've
considered.
placeholder: The alternatives to using `cuda.core.experimental.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable.
placeholder: The alternatives to using `cuda.core.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable.
validations:
required: false

2 changes: 1 addition & 1 deletion .spdx-ignore
@@ -10,6 +10,6 @@ requirements*.txt
cuda_bindings/examples/*

# Vendored
cuda_core/cuda/core/experimental/include/dlpack.h
cuda_core/cuda/core/include/dlpack.h

qa/ctk-next.drawio.svg
53 changes: 32 additions & 21 deletions ci/tools/merge_cuda_core_wheels.py
@@ -12,8 +12,8 @@

In particular, each wheel contains a CUDA-specific build of the `cuda.core` library
and the associated bindings. This script merges these directories into a single wheel
that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12`
and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py`
that supports both CUDA versions, i.e., containing both `cuda/core/cu12`
and `cuda/core/cu13`. At runtime, the code in `cuda/core/__init__.py`
is used to import the appropriate CUDA-specific bindings.

This script is based on the one in NVIDIA/CCCL.
@@ -94,27 +94,38 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
# Use the first wheel as the base and merge binaries from others
base_wheel = extracted_wheels[0]

# now copy the version-specific directory from other wheels
# into the appropriate place in the base wheel
# Copy version-specific binaries from each wheel into versioned subdirectories
# Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/
base_dir = Path("cuda") / "core"

for i, wheel_dir in enumerate(extracted_wheels):
cuda_version = wheels[i].name.split(".cu")[1].split(".")[0]
base_dir = Path("cuda") / "core" / "experimental"
# Copy from other wheels
print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr)
shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}")

# Overwrite the __init__.py in versioned dirs
os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0)

# The base dir should only contain __init__.py, the include dir, and the versioned dirs
files_to_remove = os.scandir(base_wheel / base_dir)
for f in files_to_remove:
f_abspath = f.path
if f.name not in ("__init__.py", "cu12", "cu13", "include"):
if f.is_dir():
shutil.rmtree(f_abspath)
else:
os.remove(f_abspath)
versioned_dir = base_wheel / base_dir / f"cu{cuda_version}"

# Create versioned directory
versioned_dir.mkdir(parents=True, exist_ok=True)

# Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel
# Python modules (.py, .pyx, .pxd) remain in cuda/core/
# Exclude versioned directories (cu12/, cu13/) to avoid recursion
source_dir = wheel_dir / base_dir
for item in source_dir.rglob("*"):
if item.is_dir():
continue

# Skip files in versioned directories to avoid recursion
rel_path = item.relative_to(source_dir)
if any(part in ("cu12", "cu13") for part in rel_path.parts):
continue

# Only copy binary files, not Python source files
if item.suffix in (".so", ".pyd", ".dll"):
dest_item = versioned_dir / rel_path
dest_item.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(item, dest_item)

# Create empty __init__.py in versioned dirs
(versioned_dir / "__init__.py").touch()

# Repack the merged wheel
output_dir.mkdir(parents=True, exist_ok=True)
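The module docstring above states that the merged wheel must end up with both cuda/core/cu12 and cuda/core/cu13 directories while the shared Python modules stay in cuda/core/. A quick way to sanity-check a merged wheel locally is to inspect its file list; the following is a minimal sketch under that assumption (the helper name and CLI usage are hypothetical, not part of this PR):

```python
# Hypothetical post-merge check: confirm the merged wheel carries binary
# directories for both CUDA major versions under cuda/core/.
import sys
import zipfile


def check_merged_wheel(wheel_path: str) -> None:
    with zipfile.ZipFile(wheel_path) as whl:
        names = whl.namelist()
    for subdir in ("cuda/core/cu12/", "cuda/core/cu13/"):
        if not any(n.startswith(subdir) for n in names):
            raise SystemExit(f"missing {subdir} in {wheel_path}")
    print(f"{wheel_path}: contains both cu12 and cu13 directories")


if __name__ == "__main__":
    check_merged_wheel(sys.argv[1])
```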
13 changes: 9 additions & 4 deletions cuda_core/build_hooks.py
@@ -66,7 +66,7 @@ def _build_cuda_core():

# It seems setuptools' wildcard support has problems for namespace packages,
# so we explicitly spell out all Extension instances.
root_module = "cuda.core.experimental"
root_module = "cuda.core"
root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)

@@ -84,11 +84,16 @@ def get_cuda_paths():
print("CUDA paths:", CUDA_PATH)
return CUDA_PATH

# Add local include directory for cuda/core/include
local_include_dirs = ["cuda/core"]

Comment on lines +87 to +88 (Member):
Q: Why is this needed? We do not keep any headers under cuda/core or cuda/core/experimental. If Cython runs into compilation errors without this, I suspect it has to do with the __init_experimental.pxd file below.

cuda_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths())
all_include_dirs = local_include_dirs + cuda_include_dirs

ext_modules = tuple(
Extension(
f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
sources=[f"cuda/core/experimental/{mod}.pyx"],
include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
f"cuda.core.{mod.replace(os.path.sep, '.')}",
sources=[f"cuda/core/{mod}.pyx"],
include_dirs=all_include_dirs,
language="c++",
)
for mod in module_names
56 changes: 56 additions & 0 deletions cuda_core/cuda/core/__init__.py
@@ -3,3 +3,59 @@
# SPDX-License-Identifier: Apache-2.0

from cuda.core._version import __version__

try:
from cuda import bindings
except ImportError:
raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None
else:
cuda_major, cuda_minor = bindings.__version__.split(".")[:2]
if cuda_major not in ("12", "13"):
raise ImportError("cuda.bindings 12.x or 13.x must be installed")

import importlib

subdir = f"cu{cuda_major}"
try:
versioned_mod = importlib.import_module(f".{subdir}", __package__)
# Import all symbols from the module
globals().update(versioned_mod.__dict__)
except ImportError:
# This is not a wheel build, but a conda or local build, do nothing
pass
else:
del versioned_mod
finally:
del bindings, importlib, subdir, cuda_major, cuda_minor

from cuda.core import utils # noqa: E402
from cuda.core._device import Device # noqa: E402
from cuda.core._event import Event, EventOptions # noqa: E402
from cuda.core._graph import ( # noqa: E402
Graph,
GraphBuilder,
GraphCompleteOptions,
GraphDebugPrintOptions,
)
from cuda.core._launch_config import LaunchConfig # noqa: E402
from cuda.core._launcher import launch # noqa: E402
from cuda.core._layout import StridedLayout # noqa: E402
from cuda.core._linker import Linker, LinkerOptions # noqa: E402
from cuda.core._memory import ( # noqa: E402
Buffer,
DeviceMemoryResource,
DeviceMemoryResourceOptions,
GraphMemoryResource,
LegacyPinnedMemoryResource,
MemoryResource,
VirtualMemoryResource,
VirtualMemoryResourceOptions,
)

Comment on lines +44 to +53 (Member):
This list needs to be updated to capture a few new things.

from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._system import System # noqa: E402

system = System()
__import__("sys").modules[__spec__.name + ".system"] = system
del System

Member:
Cython needs an exact filename __init__.pxd so as to find the current modules. This renaming does not make sense to me. What does it do?

Collaborator Author:
Probably an accident.

I had to git merge main already several times, a likely cause of accidents. The merges are messy. I'm using Cursor and it seems to miss some details sometimes.

This PR has become much more complex than it looked like initially (tests passed a couple weeks ago).

E.g. I don't understand why ci/tools/merge_cuda_core_wheels.py (last changed in Oct) worked before but now needs a change (it makes sense that it needs a change). Testing it locally isn't easy because the input wheels only live on the runners and disappear when the jobs finish. So I'd have to figure out how to build them locally (currently I'm not trying to).

I'll keep beating on it until I get all tests to pass again. Maybe I'll start over fresh. LLMs seem to be less successful when working incrementally.

Thanks for the hints.

Member:
Ah! Leaving a quick note before signing off...

The merge script can easily be tested locally. Create two conda envs, one with CUDA 13 and another with CUDA 12, and build cuda.core wheels in each env via `pip wheel --no-deps --no-build-isolation -v .`. The cuda-bindings package can just be conda-installed together with the CTK for simplicity; no need to build it from source. Then take the two wheels and rename them before running the merge script. This logic is relevant:

# Rename wheel to include CUDA version suffix
mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}"
for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do
if [[ -f "${wheel}" ]]; then
base_name=$(basename "${wheel}" .whl)
new_name="${base_name}.cu${BUILD_PREV_CUDA_MAJOR}.whl"
mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}/${new_name}"
echo "Renamed wheel to: ${new_name}"
fi
done

Once this PR (#1085) is revived/merged, we will be able to build cuda-core in a pure-wheel env, with no need to use conda.
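
Stepping back from the merge-script details, the user-visible effect of this PR is the flattened namespace: symbols import from cuda.core directly, and the experimental namespace forwards with a warning. A hedged sketch of the intended behavior, assuming the forwarding stubs warn on attribute access with DeprecationWarning (as the commit messages and the pytest.deprecated_call review note suggest):

```python
# Illustrative test sketch, not the actual backward-compatibility test from this PR.
import pytest

from cuda.core import Device  # new, preferred import path


def test_experimental_import_still_works_but_warns():
    with pytest.deprecated_call():
        from cuda.core.experimental import Device as LegacyDevice  # deprecated path
    assert LegacyDevice is Device  # stubs are expected to forward, not wrap
```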

File renamed without changes.
@@ -4,7 +4,7 @@

from dataclasses import dataclass

from cuda.core.experimental._utils.cuda_utils import driver
from cuda.core._utils.cuda_utils import driver


@dataclass
@@ -6,27 +6,27 @@ cimport cpython
from libc.stdint cimport uintptr_t

from cuda.bindings cimport cydriver
from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

import threading
from typing import Optional, TYPE_CHECKING, Union

from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._event import Event, EventOptions
from cuda.core.experimental._graph import GraphBuilder
from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
from cuda.core._context import Context, ContextOptions
from cuda.core._event import Event, EventOptions
from cuda.core._graph import GraphBuilder
from cuda.core._stream import IsStreamT, Stream, StreamOptions
from cuda.core._utils.clear_error_support import assert_type
from cuda.core._utils.cuda_utils import (
ComputeCapability,
CUDAError,
driver,
handle_return,
runtime,
)
from cuda.core.experimental._stream cimport default_stream
from cuda.core._stream cimport default_stream

if TYPE_CHECKING:
from cuda.core.experimental._memory import Buffer, MemoryResource
from cuda.core._memory import Buffer, MemoryResource

# TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python,
# but it seems it is very convenient to expose them for testing purposes...
@@ -1154,17 +1154,17 @@ class Device:
)
)
if attr == 1:
from cuda.core.experimental._memory import DeviceMemoryResource
from cuda.core._memory import DeviceMemoryResource
self._memory_resource = DeviceMemoryResource(self._id)
else:
from cuda.core.experimental._memory import _SynchronousMemoryResource
from cuda.core._memory import _SynchronousMemoryResource
self._memory_resource = _SynchronousMemoryResource(self._id)

return self._memory_resource

@memory_resource.setter
def memory_resource(self, mr):
from cuda.core.experimental._memory import MemoryResource
from cuda.core._memory import MemoryResource
assert_type(mr, MemoryResource)
self._memory_resource = mr

@@ -1223,7 +1223,7 @@ class Device:
Acts as an entry point of this object. Users always start a code by
calling this method, e.g.

>>> from cuda.core.experimental import Device
>>> from cuda.core import Device
>>> dev0 = Device(0)
>>> dev0.set_current()
>>> # ... do work on device 0 ...
@@ -8,7 +8,7 @@ cimport cpython
from libc.stdint cimport uintptr_t
from libc.string cimport memcpy
from cuda.bindings cimport cydriver
from cuda.core.experimental._utils.cuda_utils cimport (
from cuda.core._utils.cuda_utils cimport (
check_or_create_options,
HANDLE_RETURN
)
@@ -18,8 +18,8 @@ from dataclasses import dataclass
import multiprocessing
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._context import Context
from cuda.core.experimental._utils.cuda_utils import (
from cuda.core._context import Context
from cuda.core._utils.cuda_utils import (
CUDAError,
check_multiprocessing_start_method,
driver,
@@ -9,8 +9,8 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.cuda_utils import (
from cuda.core._stream import Stream
from cuda.core._utils.cuda_utils import (
driver,
get_binding_version,
handle_return,
@@ -15,8 +15,8 @@ import ctypes

import numpy

from cuda.core.experimental._memory import Buffer
from cuda.core.experimental._utils.cuda_utils import driver
from cuda.core._memory import Buffer
from cuda.core._utils.cuda_utils import driver
from cuda.bindings cimport cydriver


@@ -2,8 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
from cuda.core._device import Device
from cuda.core._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
driver,
@@ -1,15 +1,15 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
from cuda.core.experimental._stream cimport Stream_accept
from cuda.core._launch_config cimport LaunchConfig, _to_native_launch_config
from cuda.core._stream cimport Stream_accept


from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
from cuda.core._kernel_arg_handler import ParamHolder
from cuda.core._module import Kernel
from cuda.core._stream import Stream
from cuda.core._utils.clear_error_support import assert_type
from cuda.core._utils.cuda_utils import (
_reduce_3_tuple,
check_or_create_options,
driver,
@@ -18,7 +18,7 @@ ctypedef uint32_t property_mask_t
ctypedef vector.vector[stride_t] extents_strides_t
ctypedef vector.vector[axis_t] axis_vec_t

from cuda.core.experimental._utils cimport cuda_utils
from cuda.core._utils cimport cuda_utils


ctypedef fused integer_t: