From f87082b656c7f4e4acff1bea0cfdf820cd951b80 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 15 Dec 2025 22:03:59 -0800 Subject: [PATCH 01/21] Migrate cuda.core.experimental to cuda.core This commit migrates all code from cuda.core.experimental to cuda.core, completing the deprecation of the experimental namespace. Changes: - Move all files from cuda/core/experimental/ to cuda/core/ - Move include/ directory to _include/ (with leading underscore for implementation detail) - Update all imports from cuda.core.experimental.* to cuda.core.* - Update build_hooks.py to use cuda.core instead of cuda.core.experimental - Update pyproject.toml package-data paths - Update cuda/core/__init__.py to export all symbols - Update cuda/core/experimental/__init__.py for backward compatibility with deprecation warnings - Update Cython extern declarations to use _include/ instead of include/ - Fix import paths for _memory_pool (cuda.core._memory._memory_pool) - Update test files and test helpers to use new import paths All tests pass (1499 passed, 79 skipped). --- cuda_core/build_hooks.py | 14 +- cuda_core/cuda/core/__init__.py | 64 +++++++++ .../cuda/core/{experimental => }/_context.pyx | 2 +- .../cuda/core/{experimental => }/_device.pyx | 28 ++-- .../cuda/core/{experimental => }/_dlpack.pxd | 2 +- .../cuda/core/{experimental => }/_dlpack.pyx | 0 .../cuda/core/{experimental => }/_event.pxd | 0 .../cuda/core/{experimental => }/_event.pyx | 6 +- .../cuda/core/{experimental => }/_graph.py | 4 +- .../include => _include}/dlpack.h | 0 .../include => _include}/layout.hpp | 0 .../include => _include}/utility.hpp | 0 .../_kernel_arg_handler.pyx | 4 +- .../{experimental => }/_launch_config.pxd | 0 .../{experimental => }/_launch_config.pyx | 4 +- .../core/{experimental => }/_launcher.pyx | 14 +- .../cuda/core/{experimental => }/_layout.pxd | 4 +- .../cuda/core/{experimental => }/_layout.pyx | 0 .../cuda/core/{experimental => }/_linker.py | 10 +- .../{experimental => }/_memory/__init__.pxd | 0 .../{experimental => }/_memory/__init__.py | 0 .../{experimental => }/_memory/_buffer.pxd | 2 +- .../{experimental => }/_memory/_buffer.pyx | 18 +-- .../_memory/_device_memory_resource.pxd | 4 +- .../_memory/_device_memory_resource.pyx | 10 +- .../_memory/_graph_memory_resource.pxd | 2 +- .../_memory/_graph_memory_resource.pyx | 8 +- .../core/{experimental => }/_memory/_ipc.pxd | 4 +- .../core/{experimental => }/_memory/_ipc.pyx | 8 +- .../{experimental => }/_memory/_legacy.py | 12 +- .../_memory/_managed_memory_resource.pxd | 2 +- .../_memory/_managed_memory_resource.pyx | 4 +- .../_memory/_memory_pool.pxd | 4 +- .../_memory/_memory_pool.pyx | 12 +- .../_memory/_pinned_memory_resource.pxd | 4 +- .../_memory/_pinned_memory_resource.pyx | 10 +- .../_memory/_virtual_memory_resource.py | 10 +- .../core/{experimental => }/_memoryview.pyx | 8 +- .../cuda/core/{experimental => }/_module.py | 12 +- .../cuda/core/{experimental => }/_program.py | 10 +- .../cuda/core/{experimental => }/_stream.pxd | 0 .../cuda/core/{experimental => }/_stream.pyx | 16 +-- .../cuda/core/{experimental => }/_system.py | 4 +- .../{experimental => }/_utils/__init__.pxd | 0 .../{experimental => }/_utils/__init__.py | 0 .../_utils/clear_error_support.py | 0 .../{experimental => }/_utils/cuda_utils.pxd | 0 .../{experimental => }/_utils/cuda_utils.pyx | 4 +- .../_utils/driver_cu_result_explanations.py | 0 .../_utils/runtime_cuda_error_explanations.py | 0 cuda_core/cuda/core/experimental/__init__.py | 128 +++++++++++++----- 
.../cuda/core/{experimental => }/utils.py | 4 +- cuda_core/pyproject.toml | 2 +- cuda_core/tests/conftest.py | 8 +- .../example_tests/test_basic_examples.py | 2 +- cuda_core/tests/helpers/buffers.py | 4 +- cuda_core/tests/helpers/latch.py | 2 +- cuda_core/tests/helpers/nanosleep_kernel.py | 2 +- cuda_core/tests/memory_ipc/test_errors.py | 4 +- cuda_core/tests/memory_ipc/test_event_ipc.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 2 +- .../tests/memory_ipc/test_peer_access.py | 4 +- .../tests/memory_ipc/test_send_buffers.py | 2 +- cuda_core/tests/memory_ipc/test_serialize.py | 2 +- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- cuda_core/tests/test_comparable.py | 8 +- cuda_core/tests/test_context.py | 6 +- cuda_core/tests/test_cuda_utils.py | 2 +- cuda_core/tests/test_device.py | 8 +- cuda_core/tests/test_event.py | 6 +- cuda_core/tests/test_graph.py | 4 +- cuda_core/tests/test_graph_mem.py | 2 +- cuda_core/tests/test_hashable.py | 8 +- cuda_core/tests/test_helpers.py | 2 +- cuda_core/tests/test_launcher.py | 10 +- cuda_core/tests/test_linker.py | 6 +- cuda_core/tests/test_memory.py | 14 +- cuda_core/tests/test_memory_peer_access.py | 8 +- cuda_core/tests/test_module.py | 16 +-- .../tests/test_multiprocessing_warning.py | 10 +- cuda_core/tests/test_program.py | 20 +-- cuda_core/tests/test_stream.py | 8 +- cuda_core/tests/test_strided_layout.py | 2 +- cuda_core/tests/test_system.py | 4 +- cuda_core/tests/test_utils.py | 10 +- .../cuda_python_test_helpers/__init__.py | 2 +- 86 files changed, 388 insertions(+), 262 deletions(-) rename cuda_core/cuda/core/{experimental => }/_context.pyx (94%) rename cuda_core/cuda/core/{experimental => }/_device.pyx (98%) rename cuda_core/cuda/core/{experimental => }/_dlpack.pxd (97%) rename cuda_core/cuda/core/{experimental => }/_dlpack.pyx (100%) rename cuda_core/cuda/core/{experimental => }/_event.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_event.pyx (98%) rename cuda_core/cuda/core/{experimental => }/_graph.py (99%) rename cuda_core/cuda/core/{experimental/include => _include}/dlpack.h (100%) rename cuda_core/cuda/core/{experimental/include => _include}/layout.hpp (100%) rename cuda_core/cuda/core/{experimental/include => _include}/utility.hpp (100%) rename cuda_core/cuda/core/{experimental => }/_kernel_arg_handler.pyx (99%) rename cuda_core/cuda/core/{experimental => }/_launch_config.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_launch_config.pyx (98%) rename cuda_core/cuda/core/{experimental => }/_launcher.pyx (88%) rename cuda_core/cuda/core/{experimental => }/_layout.pxd (99%) rename cuda_core/cuda/core/{experimental => }/_layout.pyx (100%) rename cuda_core/cuda/core/{experimental => }/_linker.py (98%) rename cuda_core/cuda/core/{experimental => }/_memory/__init__.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_memory/__init__.py (100%) rename cuda_core/cuda/core/{experimental => }/_memory/_buffer.pxd (92%) rename cuda_core/cuda/core/{experimental => }/_memory/_buffer.pyx (96%) rename cuda_core/cuda/core/{experimental => }/_memory/_device_memory_resource.pxd (66%) rename cuda_core/cuda/core/{experimental => }/_memory/_device_memory_resource.pyx (97%) rename cuda_core/cuda/core/{experimental => }/_memory/_graph_memory_resource.pxd (77%) rename cuda_core/cuda/core/{experimental => }/_memory/_graph_memory_resource.pyx (96%) rename cuda_core/cuda/core/{experimental => }/_memory/_ipc.pxd (92%) rename cuda_core/cuda/core/{experimental => }/_memory/_ipc.pyx (96%) rename cuda_core/cuda/core/{experimental => 
}/_memory/_legacy.py (89%) rename cuda_core/cuda/core/{experimental => }/_memory/_managed_memory_resource.pxd (75%) rename cuda_core/cuda/core/{experimental => }/_memory/_managed_memory_resource.pyx (96%) rename cuda_core/cuda/core/{experimental => }/_memory/_memory_pool.pxd (85%) rename cuda_core/cuda/core/{experimental => }/_memory/_memory_pool.pyx (97%) rename cuda_core/cuda/core/{experimental => }/_memory/_pinned_memory_resource.pxd (60%) rename cuda_core/cuda/core/{experimental => }/_memory/_pinned_memory_resource.pyx (96%) rename cuda_core/cuda/core/{experimental => }/_memory/_virtual_memory_resource.py (98%) rename cuda_core/cuda/core/{experimental => }/_memoryview.pyx (99%) rename cuda_core/cuda/core/{experimental => }/_module.py (98%) rename cuda_core/cuda/core/{experimental => }/_program.py (99%) rename cuda_core/cuda/core/{experimental => }/_stream.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_stream.pyx (97%) rename cuda_core/cuda/core/{experimental => }/_system.py (95%) rename cuda_core/cuda/core/{experimental => }/_utils/__init__.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_utils/__init__.py (100%) rename cuda_core/cuda/core/{experimental => }/_utils/clear_error_support.py (100%) rename cuda_core/cuda/core/{experimental => }/_utils/cuda_utils.pxd (100%) rename cuda_core/cuda/core/{experimental => }/_utils/cuda_utils.pyx (98%) rename cuda_core/cuda/core/{experimental => }/_utils/driver_cu_result_explanations.py (100%) rename cuda_core/cuda/core/{experimental => }/_utils/runtime_cuda_error_explanations.py (100%) rename cuda_core/cuda/core/{experimental => }/utils.py (66%) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 6191dcb706..6fb3fefd2d 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -66,7 +66,7 @@ def _build_cuda_core(): # It seems setuptools' wildcard support has problems for namespace packages, # so we explicitly spell out all Extension instances. 
- root_module = "cuda.core.experimental" + root_module = "cuda.core" root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) @@ -84,11 +84,17 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + # Add local include directory for cuda/core/_include + # This allows Cython files to use: cdef extern from "_include/layout.hpp" + local_include_dirs = ["cuda/core"] + cuda_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) + all_include_dirs = local_include_dirs + cuda_include_dirs + ext_modules = tuple( Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + f"cuda.core.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/{mod}.pyx"], + include_dirs=all_include_dirs, language="c++", ) for mod in module_names diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 96a80d1f3e..a10812606e 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -3,3 +3,67 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.core._version import __version__ + +try: + from cuda import bindings +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None +else: + cuda_major, cuda_minor = bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib + +subdir = f"cu{cuda_major}" +try: + versioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(versioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del versioned_mod +finally: + del bindings, importlib, subdir, cuda_major, cuda_minor + +from cuda.core import utils # noqa: E402 +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 + Graph, + GraphBuilder, + GraphCompleteOptions, + GraphDebugPrintOptions, +) +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import _StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 + Buffer, + DeviceMemoryResource, + DeviceMemoryResourceOptions, + GraphMemoryResource, + LegacyPinnedMemoryResource, + ManagedMemoryResource, + ManagedMemoryResourceOptions, + MemoryResource, + PinnedMemoryResource, + PinnedMemoryResourceOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, +) +from cuda.core._memoryview import ( # noqa: E402 + StridedMemoryView, # noqa: E402 + args_viewable_as_strided_memory, # noqa: E402 +) +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 + +system = System() +__import__("sys").modules[__spec__.name + ".system"] = system +del System diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/_context.pyx similarity index 94% rename from cuda_core/cuda/core/experimental/_context.pyx rename to 
cuda_core/cuda/core/_context.pyx index f9858c1710..c1c28b3389 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,7 +4,7 @@ from dataclasses import dataclass -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver @dataclass diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/_device.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_device.pyx rename to cuda_core/cuda/core/_device.pyx index b510320f2e..2d775b6580 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -6,27 +6,27 @@ cimport cpython from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context, ContextOptions +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._stream import IsStreamT, Stream, StreamOptions +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( ComputeCapability, CUDAError, driver, handle_return, runtime, ) -from cuda.core.experimental._stream cimport default_stream +from cuda.core._stream cimport default_stream if TYPE_CHECKING: - from cuda.core.experimental._memory import Buffer, MemoryResource + from cuda.core._memory import Buffer, MemoryResource # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... @@ -1034,7 +1034,7 @@ class Device: tuple of Device A tuple containing instances of available devices. """ - from cuda.core.experimental import system + from cuda.core import system total = system.get_num_devices() return tuple(cls(device_id) for device_id in range(total)) @@ -1168,17 +1168,17 @@ class Device: ) ) if attr == 1: - from cuda.core.experimental._memory import DeviceMemoryResource + from cuda.core._memory import DeviceMemoryResource self._memory_resource = DeviceMemoryResource(self._id) else: - from cuda.core.experimental._memory import _SynchronousMemoryResource + from cuda.core._memory import _SynchronousMemoryResource self._memory_resource = _SynchronousMemoryResource(self._id) return self._memory_resource @memory_resource.setter def memory_resource(self, mr): - from cuda.core.experimental._memory import MemoryResource + from cuda.core._memory import MemoryResource assert_type(mr, MemoryResource) self._memory_resource = mr @@ -1237,7 +1237,7 @@ class Device: Acts as an entry point of this object. Users always start a code by calling this method, e.g. - >>> from cuda.core.experimental import Device + >>> from cuda.core import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... 
diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd similarity index 97% rename from cuda_core/cuda/core/experimental/_dlpack.pxd rename to cuda_core/cuda/core/_dlpack.pxd index d61b6a2bca..7b886cae10 100644 --- a/cuda_core/cuda/core/experimental/_dlpack.pxd +++ b/cuda_core/cuda/core/_dlpack.pxd @@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -cdef extern from "include/dlpack.h" nogil: +cdef extern from "_include/dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME "dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" diff --git a/cuda_core/cuda/core/experimental/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_dlpack.pyx rename to cuda_core/cuda/core/_dlpack.pyx diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/_event.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_event.pxd rename to cuda_core/cuda/core/_event.pxd diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/_event.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_event.pyx rename to cuda_core/cuda/core/_event.pyx index 149c92b8e1..e97fdfbab4 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -8,7 +8,7 @@ cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN ) @@ -18,8 +18,8 @@ from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional -from cuda.core.experimental._context import Context -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context +from cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, driver, diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/_graph.py similarity index 99% rename from cuda_core/cuda/core/experimental/_graph.py rename to cuda_core/cuda/core/_graph.py index a82bd70f55..df51126bb0 100644 --- a/cuda_core/cuda/core/experimental/_graph.py +++ b/cuda_core/cuda/core/_graph.py @@ -9,8 +9,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import ( driver, get_binding_version, handle_return, diff --git a/cuda_core/cuda/core/experimental/include/dlpack.h b/cuda_core/cuda/core/_include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/include/dlpack.h rename to cuda_core/cuda/core/_include/dlpack.h diff --git a/cuda_core/cuda/core/experimental/include/layout.hpp b/cuda_core/cuda/core/_include/layout.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/layout.hpp rename to cuda_core/cuda/core/_include/layout.hpp diff --git a/cuda_core/cuda/core/experimental/include/utility.hpp b/cuda_core/cuda/core/_include/utility.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/utility.hpp rename to cuda_core/cuda/core/_include/utility.hpp diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx similarity index 99% rename from 
cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx rename to cuda_core/cuda/core/_kernel_arg_handler.pyx index 4cac74a25f..6374605262 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -15,8 +15,8 @@ import ctypes import numpy -from cuda.core.experimental._memory import Buffer -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._memory import Buffer +from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver diff --git a/cuda_core/cuda/core/experimental/_launch_config.pxd b/cuda_core/cuda/core/_launch_config.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_launch_config.pxd rename to cuda_core/cuda/core/_launch_config.pxd diff --git a/cuda_core/cuda/core/experimental/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_launch_config.pyx rename to cuda_core/cuda/core/_launch_config.pyx index 7d6a1ab2b9..00c71ad903 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import ( CUDAError, cast_to_3_tuple, driver, diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx similarity index 88% rename from cuda_core/cuda/core/experimental/_launcher.pyx rename to cuda_core/cuda/core/_launcher.pyx index 2cba15cbf4..09900a668c 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -1,15 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream cimport Stream_accept +from cuda.core._launch_config cimport LaunchConfig, _to_native_launch_config +from cuda.core._stream cimport Stream_accept -from cuda.core.experimental._kernel_arg_handler import ParamHolder -from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._kernel_arg_handler import ParamHolder +from cuda.core._module import Kernel +from cuda.core._stream import Stream +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( _reduce_3_tuple, check_or_create_options, driver, diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/_layout.pxd similarity index 99% rename from cuda_core/cuda/core/experimental/_layout.pxd rename to cuda_core/cuda/core/_layout.pxd index 2d96a2cc83..b68e2fdabf 100644 --- a/cuda_core/cuda/core/experimental/_layout.pxd +++ b/cuda_core/cuda/core/_layout.pxd @@ -18,7 +18,7 @@ ctypedef uint32_t property_mask_t ctypedef vector.vector[stride_t] extents_strides_t ctypedef vector.vector[axis_t] axis_vec_t -from cuda.core.experimental._utils cimport cuda_utils +from cuda.core._utils cimport cuda_utils ctypedef fused integer_t: @@ -26,7 +26,7 @@ ctypedef fused integer_t: int32_t -cdef extern from "include/layout.hpp": +cdef extern from "_include/layout.hpp": cdef int STRIDED_LAYOUT_MAX_NDIM cdef axes_mask_t AXES_MASK_ALL diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/_layout.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_layout.pyx rename to cuda_core/cuda/core/_layout.pyx diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/_linker.py similarity index 98% rename from cuda_core/cuda/core/experimental/_linker.py rename to cuda_core/cuda/core/_linker.py index 2c94fb9b02..1f6f221a39 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -15,10 +15,10 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence +from cuda.core._device import Device +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence # TODO: revisit this treatment for py313t builds _driver = None # populated if nvJitLink cannot be used @@ -388,7 +388,7 @@ def _exception_manager(self): class Linker: """Represent a linking machinery to link one or multiple object codes into - :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + :obj:`~cuda.core._module.ObjectCode` with the specified options. This object provides a unified interface to multiple underlying linker libraries (such as nvJitLink or cuLink* from CUDA driver). 
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.pxd b/cuda_core/cuda/core/_memory/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/__init__.pxd rename to cuda_core/cuda/core/_memory/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/_memory/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/__init__.py rename to cuda_core/cuda/core/_memory/__init__.py diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pxd rename to cuda_core/cuda/core/_memory/_buffer.pxd index b581dcd293..730e448f63 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,7 +4,7 @@ from libc.stdint cimport uintptr_t -from cuda.core.experimental._stream cimport Stream +from cuda.core._stream cimport Stream cdef struct _MemAttrs: diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pyx rename to cuda_core/cuda/core/_memory/_buffer.pyx index 3c44915bcb..44f781ebdf 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -9,12 +9,12 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, uintptr_t from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, Py_buffer, PyBUF_SIMPLE from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._device_memory_resource import DeviceMemoryResource -from cuda.core.experimental._memory._pinned_memory_resource import PinnedMemoryResource -from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._device_memory_resource import DeviceMemoryResource +from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource +from cuda.core._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer +from cuda.core._memory cimport _ipc +from cuda.core._stream cimport Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import abc import sys @@ -25,9 +25,9 @@ if sys.version_info >= (3, 12): else: BufferProtocol = object -from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._utils.cuda_utils import driver -from cuda.core.experimental._device import Device +from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._utils.cuda_utils import driver +from cuda.core._device import Device __all__ = ['Buffer', 'MemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd similarity index 66% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_device_memory_resource.pxd index 17ee12e54f..c293d72750 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport _MemPool -from 
cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 49c590374e..f9defcd68e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -5,10 +5,10 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory cimport _ipc +from cuda.core._memory._ipc cimport IPCAllocationHandle +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) @@ -19,7 +19,7 @@ from typing import Optional, TYPE_CHECKING import platform # no-cython-lint import uuid -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method if TYPE_CHECKING: from .._device import Device diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd similarity index 77% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pxd index f9c7798e76..2f6c35d72e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core._memory._buffer cimport MemoryResource cdef class cyGraphMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index c65354b612..bda075c201 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -7,15 +7,15 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from functools import cache from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT __all__ = 
['GraphMemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/_memory/_ipc.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pxd rename to cuda_core/cuda/core/_memory/_ipc.pxd index 3fed2b7188..0c7375efdb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/_memory/_ipc.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._memory._memory_pool cimport _MemPool +from cuda.core._memory._buffer cimport Buffer +from cuda.core._memory._memory_pool cimport _MemPool # Holds _MemPool objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pyx rename to cuda_core/cuda/core/_memory/_ipc.pyx index 980e814e11..793e4168d7 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -7,10 +7,10 @@ from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._stream cimport default_stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._memory._buffer cimport Buffer +from cuda.core._stream cimport default_stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method import multiprocessing import os diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py similarity index 89% rename from cuda_core/cuda/core/experimental/_memory/_legacy.py rename to cuda_core/cuda/core/_memory/_legacy.py index 09ea0e15d2..317494ea9e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -6,16 +6,16 @@ from typing import TYPE_CHECKING -from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( driver, ) if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT __all__ = ["LegacyPinnedMemoryResource", "_SynchronousMemoryResource"] @@ -43,7 +43,7 @@ def allocate(self, size, stream=None) -> Buffer: The allocated buffer object, which is accessible on both host and device. 
""" if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAllocHost(size) @@ -93,7 +93,7 @@ def __init__(self, device_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAlloc(size) diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd similarity index 75% rename from cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_managed_memory_resource.pxd index 3e9aed7bee..46e00cd4cb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport _MemPool +from cuda.core._memory._memory_pool cimport _MemPool cdef class ManagedMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_managed_memory_resource.pyx index 7636213a63..9ca9eb8f88 100644 --- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx @@ -5,8 +5,8 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, ) diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd similarity index 85% rename from cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd rename to cuda_core/cuda/core/_memory/_memory_pool.pxd index 68b2e6438f..8d9961b68b 100644 --- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._buffer cimport MemoryResource +from cuda.core._memory._ipc cimport IPCDataForMR cdef class _MemPool(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx rename to cuda_core/cuda/core/_memory/_memory_pool.pyx index dbbcc75715..30d9a9cf3a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -10,10 +10,10 @@ from libc.string cimport memset from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport 
default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory cimport _ipc +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -21,10 +21,10 @@ from typing import TYPE_CHECKING import platform # no-cython-lint import weakref -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT from .._device import Device diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd similarity index 60% rename from cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd index df225c1860..a8262d9bd8 100644 --- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport _MemPool -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._memory._ipc cimport IPCDataForMR cdef class PinnedMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx index f5395308e5..194210dabb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx @@ -5,10 +5,10 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory cimport _ipc +from cuda.core._memory._ipc cimport IPCAllocationHandle +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) @@ -21,7 +21,7 @@ import subprocess import uuid import warnings -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method # Cache to ensure NUMA warning is only raised once per process diff --git a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py similarity index 98% rename from cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py rename to cuda_core/cuda/core/_memory/_virtual_memory_resource.py index 2806e2d0d5..43da00744a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -7,20 +7,20 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Iterable, Literal, Union -from cuda.core.experimental._device import Device -from 
cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( Transaction, check_or_create_options, driver, get_binding_version, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) if TYPE_CHECKING: - from cuda.core.experimental._stream import Stream + from cuda.core._stream import Stream __all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"] diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_memoryview.pyx rename to cuda_core/cuda/core/_memoryview.pyx index cbfc790866..6d1ac6c439 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -4,8 +4,8 @@ from ._dlpack cimport * from libc.stdint cimport intptr_t -from cuda.core.experimental._layout cimport _StridedLayout -from cuda.core.experimental._stream import Stream +from cuda.core._layout cimport _StridedLayout +from cuda.core._stream import Stream import functools import warnings @@ -13,10 +13,10 @@ from typing import Optional import numpy -from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._memory import Buffer +from cuda.core._memory import Buffer # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/_module.py similarity index 98% rename from cuda_core/cuda/core/experimental/_module.py rename to cuda_core/cuda/core/_module.py index 9af722465b..fbea314406 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -7,15 +7,15 @@ from typing import Union from warnings import warn -from cuda.core.experimental._device import Device -from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import ( +from cuda.core._device import Device +from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._stream import Stream +from cuda.core._utils.clear_error_support import ( assert_type, assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition _backend = { "old": { @@ -453,7 +453,7 @@ class ObjectCode: This class has no default constructor. If you already have a cubin that you would like to load, use the :meth:`from_cubin` alternative constructor. 
Constructing directly from all other possible code types should be avoided in favor of compilation through - :class:`~cuda.core.experimental.Program` + :class:`~cuda.core.Program` Note ---- diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py similarity index 99% rename from cuda_core/cuda/core/experimental/_program.py rename to cuda_core/cuda/core/_program.py index f3ad9af644..121dd13963 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -13,11 +13,11 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( CUDAError, _handle_boolean_option, check_or_create_options, diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/_stream.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_stream.pxd rename to cuda_core/cuda/core/_stream.pxd diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/_stream.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_stream.pyx rename to cuda_core/cuda/core/_stream.pyx index 87ec4a691a..b724f9aee3 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -9,8 +9,8 @@ from libc.stdlib cimport strtol, getenv from cuda.bindings cimport cydriver -from cuda.core.experimental._event cimport Event as cyEvent -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._event cimport Event as cyEvent +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, CU_CONTEXT_INVALID, get_device_from_ctx, @@ -24,11 +24,11 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings - from cuda.core.experimental._device import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._device import Device +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._utils.cuda_utils import ( driver, ) @@ -311,7 +311,7 @@ cdef class Stream: context is set current after a stream is created. 
""" - from cuda.core.experimental._device import Device # avoid circular import + from cuda.core._device import Device # avoid circular import self._get_device_and_context() return Device((self._device_id)) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/_system.py similarity index 95% rename from cuda_core/cuda/core/experimental/_system.py rename to cuda_core/cuda/core/_system.py index ac157f5760..6f06587b46 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/_system.py @@ -4,8 +4,8 @@ import warnings -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import driver, handle_return, runtime +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver, handle_return, runtime class System: diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.pxd b/cuda_core/cuda/core/_utils/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.pxd rename to cuda_core/cuda/core/_utils/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.py b/cuda_core/cuda/core/_utils/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.py rename to cuda_core/cuda/core/_utils/__init__.py diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/_utils/clear_error_support.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/clear_error_support.py rename to cuda_core/cuda/core/_utils/clear_error_support.py diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd rename to cuda_core/cuda/core/_utils/cuda_utils.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx rename to cuda_core/cuda/core/_utils/cuda_utils.pyx index 4489871747..0c3f6521a4 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -20,8 +20,8 @@ except ImportError: from cuda import cudart as runtime from cuda import nvrtc -from cuda.core.experimental._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS -from cuda.core.experimental._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS +from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS +from cuda.core._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS class CUDAError(Exception): diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/_utils/driver_cu_result_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py rename to cuda_core/cuda/core/_utils/driver_cu_result_explanations.py diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py rename to cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 
92174468d1..8adc1b78fd 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,43 +2,59 @@ # # SPDX-License-Identifier: Apache-2.0 -try: - from cuda import bindings -except ImportError: - raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None -else: - cuda_major, cuda_minor = bindings.__version__.split(".")[:2] - if cuda_major not in ("12", "13"): - raise ImportError("cuda.bindings 12.x or 13.x must be installed") - -import importlib - -subdir = f"cu{cuda_major}" -try: - versioned_mod = importlib.import_module(f".{subdir}", __package__) - # Import all symbols from the module - globals().update(versioned_mod.__dict__) -except ImportError: - # This is not a wheel build, but a conda or local build, do nothing - pass -else: - del versioned_mod -finally: - del bindings, importlib, subdir, cuda_major, cuda_minor - -from cuda.core.experimental import utils # noqa: E402 -from cuda.core.experimental._device import Device # noqa: E402 -from cuda.core.experimental._event import Event, EventOptions # noqa: E402 -from cuda.core.experimental._graph import ( # noqa: E402 +""" +Backward compatibility stubs for the cuda.core.experimental namespace. + +This module provides forwarding stubs that import from the new cuda.core.* +locations and emit deprecation warnings. Users should migrate to importing +directly from cuda.core instead of cuda.core.experimental. + +The experimental namespace will be removed in a future release. +""" + +import warnings + + +def _warn_deprecated(): + """Emit a deprecation warning for using the experimental namespace. + + Note: This warning is only emitted when the experimental module is first imported. + Subsequent accesses to attributes (like utils, Device, etc.) do not trigger + additional warnings since they are already set in the module namespace. + Only accessing submodules via __getattr__ (e.g., _device, _utils) will trigger + additional warnings. + """ + warnings.warn( + "The cuda.core.experimental namespace is deprecated. " + "Please import directly from cuda.core instead. " + "For example, use 'from cuda.core import Device' instead of " + "'from cuda.core.experimental import Device'. 
" + "The experimental namespace will be removed in a future release.", + DeprecationWarning, + stacklevel=3, + ) + + +# Import from new locations and re-export +_warn_deprecated() + +from cuda.core import utils # noqa: E402 + +# Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".utils"] = utils +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 -from cuda.core.experimental._launcher import launch # noqa: E402 -from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 -from cuda.core.experimental._memory import ( # noqa: E402 +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import _StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, @@ -52,11 +68,51 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 -from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 -from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 -from cuda.core.experimental._system import System # noqa: E402 +from cuda.core._memoryview import ( # noqa: E402 + StridedMemoryView, # noqa: E402 + args_viewable_as_strided_memory, # noqa: E402 +) +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system del System + + +# Also create forwarding stubs for submodules +# These will be imported lazily when accessed +def __getattr__(name): + """Forward attribute access to the new location with deprecation warning.""" + if name in ( + "_context", + "_device", + "_dlpack", + "_event", + "_graph", + "_kernel_arg_handler", + "_launch_config", + "_launcher", + "_layout", + "_linker", + "_memory", + "_memoryview", + "_module", + "_program", + "_stream", + "_system", + "_utils", + ): + _warn_deprecated() + # Import the submodule from the new location + import importlib + + new_name = name.lstrip("_") + try: + return importlib.import_module(f"cuda.core.{new_name}") + except ImportError: + # Fallback to underscore-prefixed name + return importlib.import_module(f"cuda.core.{name}") + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/utils.py similarity index 66% rename from cuda_core/cuda/core/experimental/utils.py rename to cuda_core/cuda/core/utils.py index b8dc55b478..844591391b 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._layout import _StridedLayout # noqa: F401 -from cuda.core.experimental._memoryview import ( +from cuda.core._layout import _StridedLayout # noqa: F401 +from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 
args_viewable_as_strided_memory, # noqa: F401 ) diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index af99ddd361..94a9e931cc 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -69,7 +69,7 @@ issues = "https://github.com/NVIDIA/cuda-python/issues/" include = ["cuda.core*"] [tool.setuptools.package-data] -"cuda.core.experimental.include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core._include" = ["*.h", "*.hpp", "*.cuh"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9aaf23498f..0dac8f7def 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -12,8 +12,8 @@ except ImportError: from cuda import cuda as driver -import cuda.core.experimental -from cuda.core.experimental import ( +import cuda.core +from cuda.core import ( Device, DeviceMemoryResource, DeviceMemoryResourceOptions, @@ -23,7 +23,7 @@ PinnedMemoryResourceOptions, _device, ) -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core._utils.cuda_utils import handle_return def skip_if_pinned_memory_unsupported(device): @@ -172,7 +172,7 @@ def mempool_device(): def _mempool_device_impl(num): - num_devices = len(cuda.core.experimental.Device.get_all_devices()) + num_devices = len(cuda.core.Device.get_all_devices()) if num_devices < num: pytest.skip(f"Test requires at least {num} GPUs") diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 450c60bf06..640b53c2fc 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -7,7 +7,7 @@ import os import pytest -from cuda.core.experimental import Device +from cuda.core import Device from .utils import run_example diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index b4d769eab3..3004cd0d00 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,8 +3,8 @@ import ctypes -from cuda.core.experimental import Buffer, Device, MemoryResource -from cuda.core.experimental._utils.cuda_utils import driver, handle_return +from cuda.core import Buffer, Device, MemoryResource +from cuda.core._utils.cuda_utils import driver, handle_return from . import libc diff --git a/cuda_core/tests/helpers/latch.py b/cuda_core/tests/helpers/latch.py index 46516c1b06..e35ee3325b 100644 --- a/cuda_core/tests/helpers/latch.py +++ b/cuda_core/tests/helpers/latch.py @@ -4,7 +4,7 @@ import ctypes import pytest -from cuda.core.experimental import ( +from cuda.core import ( LaunchConfig, LegacyPinnedMemoryResource, Program, diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py index ea6ae34dcf..99d32c9aa4 100644 --- a/cuda_core/tests/helpers/nanosleep_kernel.py +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import ( +from cuda.core import ( LaunchConfig, Program, ProgramOptions, diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6280ae0ec..ccb3d3b7cc 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -5,8 +5,8 @@ import pickle import re -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 20 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 5edf97f2ae..1fabaeddda 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -4,7 +4,7 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Device, EventOptions +from cuda.core import Device, EventOptions from helpers.buffers import compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54d8056865..d92a28ab5a 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,7 +3,7 @@ import multiprocessing as mp -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core import Buffer, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 87dc459ffc..5a06133c9b 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -4,8 +4,8 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 3493828c7e..2df3fe1bbc 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -5,7 +5,7 @@ from itertools import cycle import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index f5686db28c..546c8a91aa 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -5,7 +5,7 @@ import multiprocessing.reduction import os -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py 
b/cuda_core/tests/memory_ipc/test_workerpool.py index 3f3f46cd27..b13b9896a1 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -6,7 +6,7 @@ from itertools import cycle import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..a93e49e4e8 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -8,10 +8,10 @@ across Device, Stream, Event, and Context objects. """ -from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import StreamOptions +from cuda.core import Device, Stream +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import StreamOptions # ============================================================================ # Equality Contract Tests diff --git a/cuda_core/tests/test_context.py b/cuda_core/tests/test_context.py index 4fe35dc18d..5183aa1a85 100644 --- a/cuda_core/tests/test_context.py +++ b/cuda_core/tests/test_context.py @@ -1,14 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device +from cuda.core import Device def test_context_init_disabled(): with pytest.raises(RuntimeError, match=r"^Context objects cannot be instantiated directly\."): - cuda.core.experimental._context.Context() # Ensure back door is locked. + cuda.core._context.Context() # Ensure back door is locked. # ============================================================================ diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index b0a0518652..c68f8fb841 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -4,7 +4,7 @@ import pytest from cuda.bindings import driver, runtime -from cuda.core.experimental._utils import cuda_utils +from cuda.core._utils import cuda_utils def test_driver_cu_result_explanations_health(): diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index ebdc3e3ac4..e4365ac0c9 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -6,15 +6,15 @@ except ImportError: from cuda import cuda as driver from cuda import cudart as runtime -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device -from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return +from cuda.core import Device +from cuda.core._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return def test_device_init_disabled(): with pytest.raises(RuntimeError, match=r"^DeviceProperties cannot be instantiated directly\."): - cuda.core.experimental._device.DeviceProperties() # Ensure back door is locked. + cuda.core._device.DeviceProperties() # Ensure back door is locked. 
@pytest.fixture(scope="module") diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..0d8f3a3c2d 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -4,9 +4,9 @@ import math -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, Event, EventOptions, @@ -17,7 +17,7 @@ def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): - cuda.core.experimental._event.Event() # Ensure back door is locked. + cuda.core._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index e988eeebf6..aaad9304f4 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,7 +11,7 @@ from cuda.bindings import nvrtc except ImportError: from cuda import nvrtc -from cuda.core.experimental import ( +from cuda.core import ( Device, GraphBuilder, GraphCompleteOptions, @@ -22,7 +22,7 @@ ProgramOptions, launch, ) -from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return +from cuda.core._utils.cuda_utils import NVRTCError, handle_return def _common_kernels(): diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 6f39c287d6..5159fd2b2b 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, DeviceMemoryResource, GraphCompleteOptions, diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..9bc89969a2 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -12,10 +12,10 @@ 5. 
Hash/equality contract compliance (if a == b, then hash(a) must equal hash(b)) """ -from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core import Device +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import Stream, StreamOptions # ============================================================================ # Integration Tests diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py index 65df23980c..8230f08088 100644 --- a/cuda_core/tests/test_helpers.py +++ b/cuda_core/tests/test_helpers.py @@ -5,7 +5,7 @@ import time import pytest -from cuda.core.experimental import Device +from cuda.core import Device from helpers import IS_WINDOWS, IS_WSL from helpers.buffers import PatternGen, compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index d2e0a89a28..ae3e5531c1 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -12,7 +12,7 @@ cp = None import numpy as np import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, DeviceMemoryResource, LaunchConfig, @@ -21,8 +21,8 @@ ProgramOptions, launch, ) -from cuda.core.experimental._memory import _SynchronousMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core._memory import _SynchronousMemoryResource +from cuda.core._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers @@ -95,7 +95,7 @@ def test_launch_config_cluster_grid_conversion(init_cuda): def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" - from cuda.core.experimental._launch_config import _to_native_launch_config + from cuda.core._launch_config import _to_native_launch_config try: # Test case 1: 1D - Issue #867 example @@ -264,7 +264,7 @@ def test_cooperative_launch(): # # Commented out as this seems to be a sticky error... 
# config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) - # from cuda.core.experimental._utils.cuda_utils import CUDAError + # from cuda.core._utils.cuda_utils import CUDAError # with pytest.raises(CUDAError) as e: # s.sync() # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index b7af4b6ab7..0cb3b8e95f 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,9 +3,9 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker +from cuda.core._module import ObjectCode +from cuda.core._utils.cuda_utils import CUDAError ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8607139914..4348342b87 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -17,7 +17,7 @@ import re import pytest -from cuda.core.experimental import ( +from cuda.core import ( Buffer, Device, DeviceMemoryResource, @@ -31,13 +31,13 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental import ( +from cuda.core import ( system as ccx_system, ) -from cuda.core.experimental._dlpack import DLDeviceType -from cuda.core.experimental._memory import IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import CUDAError, handle_return -from cuda.core.experimental.utils import StridedMemoryView +from cuda.core._dlpack import DLDeviceType +from cuda.core._memory import IPCBufferDescriptor +from cuda.core._utils.cuda_utils import CUDAError, handle_return +from cuda.core.utils import StridedMemoryView from helpers import IS_WINDOWS from helpers.buffers import DummyUnifiedMemoryResource @@ -149,7 +149,7 @@ def test_package_contents(): "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + exec("from cuda.core._memory import *", d) # noqa: S102 d = {k: v for k, v in d.items() if not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index d17cdfd089..bcae9576da 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -1,10 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import DeviceMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import DeviceMemoryResource +from cuda.core._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer NBYTES = 1024 @@ -72,7 +72,7 @@ def check(expected): with pytest.raises(ValueError, match=r"device_id must be \>\= 0"): dmr.peer_accessible_by = [-1] # device ID out of bounds - num_devices = len(cuda.core.experimental.Device.get_all_devices()) + num_devices = len(cuda.core.Device.get_all_devices()) with pytest.raises(ValueError, match=r"device_id must be within \[0, \d+\)"): dmr.peer_accessible_by = [num_devices] # device ID out of bounds diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 25b8d5dd86..4b3817ece4 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -5,10 +5,10 @@ import pickle import warnings -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +from cuda.core import Device, ObjectCode, Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: import numba @@ -41,17 +41,17 @@ def cuda12_4_prerequisite_check(): def test_kernel_attributes_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelAttributes cannot be instantiated directly\."): - cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. + cuda.core._module.KernelAttributes() # Ensure back door is locked. def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): - cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. + cuda.core._module.KernelOccupancy() # Ensure back door is locked. def test_kernel_init_disabled(): with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."): - cuda.core.experimental._module.Kernel() # Ensure back door is locked. + cuda.core._module.Kernel() # Ensure back door is locked. 
def test_object_code_init_disabled(): @@ -387,7 +387,7 @@ def test_occupancy_max_active_clusters(get_saxpy_kernel_cubin, cluster): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64, cluster=cluster) + launch_config = cuda.core.LaunchConfig(grid=128, block=64, cluster=cluster) query_fn = kernel.occupancy.max_active_clusters max_active_clusters = query_fn(launch_config) assert isinstance(max_active_clusters, int) @@ -402,7 +402,7 @@ def test_occupancy_max_potential_cluster_size(get_saxpy_kernel_cubin): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64) + launch_config = cuda.core.LaunchConfig(grid=128, block=64) query_fn = kernel.occupancy.max_potential_cluster_size max_potential_cluster_size = query_fn(launch_config) assert isinstance(max_potential_cluster_size, int) diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 8b490af233..2248cd2cd6 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,11 +12,11 @@ import warnings from unittest.mock import patch -from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions -from cuda.core.experimental._event import _reduce_event -from cuda.core.experimental._memory._device_memory_resource import _deep_reduce_device_memory_resource -from cuda.core.experimental._memory._ipc import _reduce_allocation_handle -from cuda.core.experimental._utils.cuda_utils import reset_fork_warning +from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions +from cuda.core._event import _reduce_event +from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource +from cuda.core._memory._ipc import _reduce_allocation_handle +from cuda.core._utils.cuda_utils import reset_fork_warning def test_warn_on_fork_method_device_memory_resource(ipc_device): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index f432e3f88d..9a9e4926ae 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,11 +6,11 @@ import warnings import pytest -from cuda.core.experimental import _linker -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core import _linker +from cuda.core._device import Device +from cuda.core._module import Kernel, ObjectCode +from cuda.core._program import Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -19,7 +19,7 @@ def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module _get_nvvm_module() return True @@ -32,7 +32,7 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc + from 
cuda.core._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -92,7 +92,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -140,7 +140,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -415,7 +415,7 @@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..01b0b861af 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core import Device, Stream, StreamOptions +from cuda.core._event import Event +from cuda.core._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM +from cuda.core._utils.cuda_utils import driver from helpers.misc import StreamWrapper diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py index c615365cf8..d897f78cfe 100644 --- a/cuda_core/tests/test_strided_layout.py +++ b/cuda_core/tests/test_strided_layout.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from cuda.core.experimental._layout import _StridedLayout +from cuda.core._layout import _StridedLayout from helpers.layout import ( DenseOrder, LayoutSpec, diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index da81bbec99..60b7ef7ec7 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -7,8 +7,8 @@ from cuda import cuda as driver from cuda import cudart as runtime -from cuda.core.experimental import Device, system -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core import Device, system +from cuda.core._utils.cuda_utils import handle_return def test_system_singleton(): diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 4adff9d9c5..6a5fdd55ba 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -12,15 +12,15 @@ from numba import cuda as numba_cuda except ImportError: numba_cuda = None -import cuda.core.experimental +import cuda.core import numpy as np import pytest -from cuda.core.experimental import Device -from cuda.core.experimental.utils import StridedMemoryView, _StridedLayout, args_viewable_as_strided_memory +from cuda.core import Device +from cuda.core.utils import StridedMemoryView, _StridedLayout, args_viewable_as_strided_memory def test_cast_to_3_tuple_success(): - c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple + c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple assert c3t("", ()) == (1, 1, 1) 
assert c3t("", 2) == (2, 1, 1) assert c3t("", (2,)) == (2, 1, 1) @@ -44,7 +44,7 @@ def test_cast_to_3_tuple_success(): ) def test_cast_to_3_tuple_value_error(cfg, expected): with pytest.raises(ValueError, match=expected): - cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) + cuda.core._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) def convert_strides_to_counts(strides, itemsize): diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py index a661b4f1aa..e7829df406 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py @@ -9,7 +9,7 @@ from contextlib import suppress from typing import Union -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core._utils.cuda_utils import handle_return __all__ = [ "IS_WINDOWS", From 3eb8abb761561a800e7b367a062af54b1fb79029 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 15 Dec 2025 22:38:52 -0800 Subject: [PATCH 02/21] Remove experimental namespace from examples and test files Update all example files and test Cython files to use cuda.core instead of cuda.core.experimental: - Update all example imports from cuda.core.experimental.* to cuda.core.* - Update example utils imports from cuda.core.experimental.utils to cuda.core.utils - Update test Cython file imports - Update build_tests.sh include path from experimental/include to _include All example files now use the non-experimental import paths. --- cuda_core/examples/cuda_graphs.py | 2 +- cuda_core/examples/jit_lto_fractal.py | 2 +- cuda_core/examples/memory_ops.py | 2 +- cuda_core/examples/pytorch_example.py | 2 +- cuda_core/examples/saxpy.py | 2 +- cuda_core/examples/show_device_properties.py | 2 +- cuda_core/examples/simple_multi_gpu_example.py | 2 +- cuda_core/examples/strided_memory_view_cpu.py | 2 +- cuda_core/examples/strided_memory_view_gpu.py | 4 ++-- cuda_core/examples/thread_block_cluster.py | 2 +- cuda_core/examples/vector_add.py | 2 +- cuda_core/tests/cython/build_tests.sh | 4 ++-- cuda_core/tests/cython/test_get_cuda_native_handle.pyx | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 2d2d9833fb..9cc759b500 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -13,7 +13,7 @@ import time import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch def main(): diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index d1553f6b67..b0040708b6 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -25,7 +25,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch # ################################################################################ diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index c4abd06e2c..123b1f6a11 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -16,7 +16,7 @@ import cupy as cp import numpy as np -from cuda.core.experimental import ( +from cuda.core import 
( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index ea067302b9..433d63c9eb 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -15,7 +15,7 @@ import sys import torch -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # SAXPY kernel - passing a as a pointer to avoid any type issues code = """ diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index f38caef392..aa0d77eff9 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -14,7 +14,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 1609d8c230..8b14cf0767 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -11,7 +11,7 @@ import sys -from cuda.core.experimental import Device, system +from cuda.core import Device, system # Convert boolean to YES or NO string diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index ec997a649b..438a21c808 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -12,7 +12,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, LaunchConfig, Program, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index de6007fd26..a20377cc76 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -26,7 +26,7 @@ print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) FFI = None import numpy as np -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 3e456776a8..e91ddc25cc 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -23,8 +23,8 @@ print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e14158f8bd..f1ea8b8579 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ 
b/cuda_core/examples/thread_block_cluster.py @@ -13,7 +13,7 @@ import sys import numpy as np -from cuda.core.experimental import ( +from cuda.core import ( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 2851303c7e..d31ab77208 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -10,7 +10,7 @@ # ################################################################################ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ diff --git a/cuda_core/tests/cython/build_tests.sh b/cuda_core/tests/cython/build_tests.sh index eb3303840d..3e20136133 100755 --- a/cuda_core/tests/cython/build_tests.sh +++ b/cuda_core/tests/cython/build_tests.sh @@ -6,10 +6,10 @@ UNAME=$(uname) if [ "$UNAME" == "Linux" ] ; then SCRIPTPATH=$(dirname $(realpath "$0")) - export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/experimental/include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH + export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/_include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH elif [[ "$UNAME" == CYGWIN* || "$UNAME" == MINGW* || "$UNAME" == MSYS* ]] ; then SCRIPTPATH="$(dirname $(cygpath -w $(realpath "$0")))" - CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\experimental\include" | sed 's/\\/\\\\/g') + CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\_include" | sed 's/\\/\\\\/g') export CL="/I\"${CUDA_CORE_INCLUDE_PATH}\" /I\"${CUDA_HOME}\\include\" ${CL}" else exit 1 diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index 0c3921e925..2b105e13ae 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -13,7 +13,7 @@ from cuda.bindings.nvrtc cimport nvrtcProgram as pynvrtcProgram from cuda.bindings.cydriver cimport CUstream, CUevent from cuda.bindings.cynvrtc cimport nvrtcProgram -from cuda.core.experimental import Device, Program +from cuda.core import Device, Program cdef extern from "utility.hpp": From bf40c27d8225ff1c61fb34e8ccf5136361ee78f8 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 15 Dec 2025 22:56:42 -0800 Subject: [PATCH 03/21] Update .spdx-ignore for cuda_core/cuda/core/_include/dlpack.h --- .spdx-ignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.spdx-ignore b/.spdx-ignore index 7bbb51dcd5..7263b5414f 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -9,6 +9,6 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/include/dlpack.h +cuda_core/cuda/core/_include/dlpack.h qa/ctk-next.drawio.svg From 693599f222383213a424b2d5d275bf13fe690675 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 15 Dec 2025 23:14:39 -0800 Subject: [PATCH 04/21] Update documentation to reflect migration from experimental to core Update all documentation files to reference cuda.core instead of cuda.core.experimental: - api.rst: Change module from cuda.core.experimental to cuda.core - getting-started.rst: Update currentmodule and example imports - interoperability.rst: Update currentmodule - api_private.rst: Update currentmodule - conf.py: Update system import paths and excluded_dirs paths All documentation now reflects the new non-experimental API paths. 
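A minimal sketch, not part of the patch series itself, of what the documented migration looks like from user code; it assumes the backward-compatibility shim added in PATCH 01/21 is installed and that this is the first import of cuda.core.experimental in the process:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Old import path: still works through the shim, but warns on first import.
        from cuda.core.experimental import Device as OldDevice

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # New, documented import path: no warning, same class object.
    from cuda.core import Device

    assert Device is OldDevice

Old submodule imports such as cuda.core.experimental.utils also keep resolving, via the sys.modules aliasing shown in PATCH 01/21.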
Release notes files are left unchanged as they are historical. --- cuda_core/docs/source/api.rst | 16 +++++++--------- cuda_core/docs/source/api_private.rst | 2 +- cuda_core/docs/source/conf.py | 8 ++++---- cuda_core/docs/source/getting-started.rst | 4 ++-- cuda_core/docs/source/interoperability.rst | 2 +- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 51e505b59d..af77b70859 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -1,14 +1,12 @@ .. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. module:: cuda.core.experimental +.. module:: cuda.core -``cuda.core.experimental`` API Reference -======================================== +``cuda.core`` API Reference +=========================== -All of the APIs listed (or cross-referenced from) below are considered *experimental* -and subject to future changes without deprecation notice. Once stabilized they will be -moved out of the ``experimental`` namespace. +This is the main API reference for ``cuda.core``. All APIs are stable and ready for production use. CUDA runtime @@ -64,11 +62,11 @@ CUDA compilation toolchain CUDA system information ----------------------- -.. automethod:: cuda.core.experimental._system.System.get_driver_version -.. automethod:: cuda.core.experimental._system.System.get_num_devices +.. automethod:: cuda.core._system.System.get_driver_version +.. automethod:: cuda.core._system.System.get_num_devices -.. module:: cuda.core.experimental.utils +.. module:: cuda.core.utils Utility functions ----------------- diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index b832cfdbce..b0fcf61291 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -8,7 +8,7 @@ via returned values from public APIs. These classes must be referred in public APIs returning their instances. -.. currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core CUDA runtime ------------ diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index bab2a2b942..e5136e040a 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -99,14 +99,14 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): - if name.startswith("cuda.core.experimental._system.System"): + if name.startswith("cuda.core._system.System"): name = name.replace("._system.System", ".system") # patch the docstring (in lines) *in-place*. Should docstrings include section titles other than "Returns", # this will need to be modified to handle them. 
while lines: lines.pop() attr = name.split(".")[-1] - from cuda.core.experimental._system import System + from cuda.core._system import System original_lines = getattr(System, attr).__doc__.split("\n") new_lines = [] @@ -129,8 +129,8 @@ def skip_member(app, what, name, obj, skip, options): # are assumed to be properties (because cythonized # properties are not recognized as such by autodoc) excluded_dirs = [ - "cuda.core.experimental._layout", - "cuda.core.experimental._memoryview", + "cuda.core._layout", + "cuda.core._memoryview", ] if what == "attribute" and getattr(obj, "__doc__", None) is None: obj_module = getattr(getattr(obj, "__objclass__", None), "__module__", None) diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 2bc7c6156e..2ac779dc2b 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -1,7 +1,7 @@ .. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core Overview ======== @@ -59,7 +59,7 @@ Don't forget to use :meth:`Device.set_current`! .. code-block:: python import cupy as cp - from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch + from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch dev = Device() dev.set_current() diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 2d3657abed..9871ebb5bf 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -1,7 +1,7 @@ .. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core Interoperability ================ From 742b68d1c78103f7d901850019e75828bd71ce16 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 15 Dec 2025 23:18:41 -0800 Subject: [PATCH 05/21] Update issue templates and wheel merge script for cuda.core migration Update GitHub issue templates and wheel merge script to reference cuda.core instead of cuda.core.experimental: - bug_report.yml: Update example references from cuda.core.experimental to cuda.core - feature_request.yml: Update example references from cuda.core.experimental to cuda.core - merge_cuda_core_wheels.py: Update to merge cuda/core/ instead of cuda/core/experimental/, and adopt logic that copies only binaries into versioned subdirectories while keeping Python modules in cuda/core/ --- .github/ISSUE_TEMPLATE/bug_report.yml | 6 +-- .github/ISSUE_TEMPLATE/feature_request.yml | 6 +-- ci/tools/merge_cuda_core_wheels.py | 53 +++++++++++++--------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 4574e04bf3..1e0b4d2c20 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -52,7 +52,7 @@ body: attributes: label: Describe the bug description: A clear and concise description of what problem you are running into. - placeholder: "Attempting to compile a program via `cuda.core.experimental.Program.compile` throws a `ValueError`." + placeholder: "Attempting to compile a program via `cuda.core.Program.compile` throws a `ValueError`." 
validations: required: true @@ -62,7 +62,7 @@ body: label: How to Reproduce description: Steps used to reproduce the bug. placeholder: | - 0. Construct a `cuda.core.experimental.Program` instance + 0. Construct a `cuda.core.Program` instance 1. Call the `.compile(...)` method of the instance 2. The call throws a `ValueError` with the following: ``` @@ -76,7 +76,7 @@ body: attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. - placeholder: "Using `cuda.core.experimental.Program.compile(...)` should run successfully and not throw a `ValueError`" + placeholder: "Using `cuda.core.Program.compile(...)` should run successfully and not throw a `ValueError`" validations: required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index cbbc03c492..6d1504c4c4 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -36,7 +36,7 @@ body: attributes: label: Is your feature request related to a problem? Please describe. description: A clear and concise description of what the problem is, e.g., "I would like to be able to..." - placeholder: I would like to be able to use the equivalent of `cuda.core.experimental.Program.compile(...)` to compile my code to PTX. + placeholder: I would like to be able to use the equivalent of `cuda.core.Program.compile(...)` to compile my code to PTX. validations: required: true @@ -46,7 +46,7 @@ body: label: Describe the solution you'd like description: A clear and concise description of what you want to happen. placeholder: | - Support a `ptx` target_type in the `cuda.core.experimental.Program.compile(...)` function. + Support a `ptx` target_type in the `cuda.core.Program.compile(...)` function. validations: required: true @@ -57,7 +57,7 @@ body: description: If applicable, please add a clear and concise description of any alternative solutions or features you've considered. - placeholder: The alternatives to using `cuda.core.experimental.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable. + placeholder: The alternatives to using `cuda.core.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable. validations: required: false diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 359b98d6a7..14ed53c308 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -12,8 +12,8 @@ In particular, each wheel contains a CUDA-specific build of the `cuda.core` library and the associated bindings. This script merges these directories into a single wheel -that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` -and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` +that supports both CUDA versions, i.e., containing both `cuda/core/cu12` +and `cuda/core/cu13`. At runtime, the code in `cuda/core/__init__.py` is used to import the appropriate CUDA-specific bindings. This script is based on the one in NVIDIA/CCCL. 
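For context, a condensed sketch of the runtime selection that the docstring above refers to; it mirrors the version-dispatch block that PATCH 01/21 removed from cuda/core/experimental/__init__.py, with the actual implementation living in cuda/core/__init__.py:

    import importlib

    from cuda import bindings

    cuda_major = bindings.__version__.split(".")[0]  # "12" or "13"
    try:
        # Wheel builds ship the version-specific extension modules under
        # cuda/core/cu12 or cuda/core/cu13; import the matching subpackage.
        versioned = importlib.import_module(f"cuda.core.cu{cuda_major}")
        globals().update(versioned.__dict__)
    except ImportError:
        # Conda or local builds have no versioned subpackage, so do nothing.
        pass

The merge step below relies on this dispatch: it only needs to place the per-version artifacts under cu12/ and cu13/, and the matching subpackage is picked up at runtime and re-exported.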
@@ -94,27 +94,38 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # now copy the version-specific directory from other wheels - # into the appropriate place in the base wheel + # Copy version-specific binaries from each wheel into versioned subdirectories + # Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/ + base_dir = Path("cuda") / "core" + for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = Path("cuda") / "core" / "experimental" - # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) - shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") - - # Overwrite the __init__.py in versioned dirs - os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) - - # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.scandir(base_wheel / base_dir) - for f in files_to_remove: - f_abspath = f.path - if f.name not in ("__init__.py", "cu12", "cu13", "include"): - if f.is_dir(): - shutil.rmtree(f_abspath) - else: - os.remove(f_abspath) + versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" + + # Create versioned directory + versioned_dir.mkdir(parents=True, exist_ok=True) + + # Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel + # Python modules (.py, .pyx, .pxd) remain in cuda/core/ + # Exclude versioned directories (cu12/, cu13/) to avoid recursion + source_dir = wheel_dir / base_dir + for item in source_dir.rglob("*"): + if item.is_dir(): + continue + + # Skip files in versioned directories to avoid recursion + rel_path = item.relative_to(source_dir) + if any(part in ("cu12", "cu13") for part in rel_path.parts): + continue + + # Only copy binary files, not Python source files + if item.suffix in (".so", ".pyd", ".dll"): + dest_item = versioned_dir / rel_path + dest_item.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest_item) + + # Create empty __init__.py in versioned dirs + (versioned_dir / "__init__.py").touch() # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) From 6769cbfee84857b475a7f3679e9af4a2e0e467e6 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:28:34 -0500 Subject: [PATCH 06/21] chore: bump pixi version --- cuda_core/pixi.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 5f7368f8f7..16a0d2460f 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -1068,7 +1068,7 @@ packages: - cuda-cudart >=13.1.80,<14.0a0 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: . @@ -1088,7 +1088,7 @@ packages: - python_abi 3.14.* *_cp314 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: . 
@@ -1110,7 +1110,7 @@ packages: - cuda-cudart >=13.1.80,<14.0a0 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda From d4569e9ca9a69cb5f70bc9ddefda2b70010f65bd Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:32:14 -0500 Subject: [PATCH 07/21] test: handle deprecated calls --- cuda_core/tests/test_memory.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 4348342b87..cb575f497f 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -365,7 +365,8 @@ def test_buffer_external_host(): @pytest.mark.parametrize("change_device", [True, False]) def test_buffer_external_device(change_device): - n = ccx_system.num_devices + with pytest.deprecated_call(): + n = ccx_system.num_devices if n < 1: pytest.skip("No devices found") dev_id = n - 1 @@ -389,7 +390,8 @@ def test_buffer_external_device(change_device): @pytest.mark.parametrize("change_device", [True, False]) def test_buffer_external_pinned_alloc(change_device): - n = ccx_system.num_devices + with pytest.deprecated_call(): + n = ccx_system.num_devices if n < 1: pytest.skip("No devices found") dev_id = n - 1 @@ -414,7 +416,8 @@ def test_buffer_external_pinned_alloc(change_device): @pytest.mark.parametrize("change_device", [True, False]) def test_buffer_external_pinned_registered(change_device): - n = ccx_system.num_devices + with pytest.deprecated_call(): + n = ccx_system.num_devices if n < 1: pytest.skip("No devices found") dev_id = n - 1 @@ -447,7 +450,8 @@ def test_buffer_external_pinned_registered(change_device): @pytest.mark.parametrize("change_device", [True, False]) def test_buffer_external_managed(change_device): - n = ccx_system.num_devices + with pytest.deprecated_call(): + n = ccx_system.num_devices if n < 1: pytest.skip("No devices found") dev_id = n - 1 From 6e4266413f4befcf9f4466be72befb761941b6bf Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:32:28 -0500 Subject: [PATCH 08/21] build: remove unnecessary `local_include_dirs` --- cuda_core/build_hooks.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e78f248a54..4337783563 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -86,11 +86,7 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH - # Add local include directory for cuda/core/_include - # This allows Cython files to use: cdef extern from "_include/layout.hpp" - local_include_dirs = ["cuda/core"] - cuda_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) - all_include_dirs = local_include_dirs + cuda_include_dirs + all_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) extra_compile_args = [] if COMPILE_FOR_COVERAGE: # CYTHON_TRACE_NOGIL indicates to trace nogil functions. 
It is not From e0f784d73cffa68e96fc60de63e0c9e28793a90b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:02:40 -0500 Subject: [PATCH 09/21] chore: try reverting to the original merge wheel script --- ci/tools/merge_cuda_core_wheels.py | 49 ++++++++++++------------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 14ed53c308..8ae9a266ad 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -94,38 +94,27 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # Copy version-specific binaries from each wheel into versioned subdirectories - # Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/ - base_dir = Path("cuda") / "core" - + # now copy the version-specific directory from other wheels + # into the appropriate place in the base wheel for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" - - # Create versioned directory - versioned_dir.mkdir(parents=True, exist_ok=True) - - # Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel - # Python modules (.py, .pyx, .pxd) remain in cuda/core/ - # Exclude versioned directories (cu12/, cu13/) to avoid recursion - source_dir = wheel_dir / base_dir - for item in source_dir.rglob("*"): - if item.is_dir(): - continue - - # Skip files in versioned directories to avoid recursion - rel_path = item.relative_to(source_dir) - if any(part in ("cu12", "cu13") for part in rel_path.parts): - continue - - # Only copy binary files, not Python source files - if item.suffix in (".so", ".pyd", ".dll"): - dest_item = versioned_dir / rel_path - dest_item.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(item, dest_item) - - # Create empty __init__.py in versioned dirs - (versioned_dir / "__init__.py").touch() + base_dir = Path("cuda") / "core" + # Copy from other wheels + print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) + shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") + + # Overwrite the __init__.py in versioned dirs + os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) + + # The base dir should only contain __init__.py, the include dir, and the versioned dirs + files_to_remove = os.scandir(base_wheel / base_dir) + for f in files_to_remove: + f_abspath = f.path + if f.name not in ("__init__.py", "cu12", "cu13", "include"): + if f.is_dir(): + shutil.rmtree(f_abspath) + else: + os.remove(f_abspath) # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) From 30f03395d393596fbb64f9766d08f74f07631f45 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:27:36 -0500 Subject: [PATCH 10/21] revert: chore: try reverting to the original merge wheel script This reverts commit e0f784d73cffa68e96fc60de63e0c9e28793a90b. 
--- ci/tools/merge_cuda_core_wheels.py | 49 ++++++++++++++++++------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 8ae9a266ad..14ed53c308 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -94,27 +94,38 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # now copy the version-specific directory from other wheels - # into the appropriate place in the base wheel + # Copy version-specific binaries from each wheel into versioned subdirectories + # Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/ + base_dir = Path("cuda") / "core" + for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = Path("cuda") / "core" - # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) - shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") - - # Overwrite the __init__.py in versioned dirs - os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) - - # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.scandir(base_wheel / base_dir) - for f in files_to_remove: - f_abspath = f.path - if f.name not in ("__init__.py", "cu12", "cu13", "include"): - if f.is_dir(): - shutil.rmtree(f_abspath) - else: - os.remove(f_abspath) + versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" + + # Create versioned directory + versioned_dir.mkdir(parents=True, exist_ok=True) + + # Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel + # Python modules (.py, .pyx, .pxd) remain in cuda/core/ + # Exclude versioned directories (cu12/, cu13/) to avoid recursion + source_dir = wheel_dir / base_dir + for item in source_dir.rglob("*"): + if item.is_dir(): + continue + + # Skip files in versioned directories to avoid recursion + rel_path = item.relative_to(source_dir) + if any(part in ("cu12", "cu13") for part in rel_path.parts): + continue + + # Only copy binary files, not Python source files + if item.suffix in (".so", ".pyd", ".dll"): + dest_item = versioned_dir / rel_path + dest_item.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest_item) + + # Create empty __init__.py in versioned dirs + (versioned_dir / "__init__.py").touch() # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) From 57708908fbe2069d882653354b1896651009a09e Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:42:54 -0500 Subject: [PATCH 11/21] test: try removing unnecessary utils import --- cuda_core/cuda/core/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..ae7c93a041 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,6 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import utils # noqa: E402 from cuda.core._device import Device # noqa: E402 from cuda.core._event import Event, EventOptions # noqa: E402 from cuda.core._graph import ( # noqa: E402 From 53c380ec2a590ad9d7360efb586c3b7ea0f45d45 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 
16 Dec 2025 16:34:59 -0500 Subject: [PATCH 12/21] chore: copy everything --- ci/tools/merge_cuda_core_wheels.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 14ed53c308..5d53b82b61 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -118,11 +118,11 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: if any(part in ("cu12", "cu13") for part in rel_path.parts): continue - # Only copy binary files, not Python source files - if item.suffix in (".so", ".pyd", ".dll"): - dest_item = versioned_dir / rel_path - dest_item.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(item, dest_item) + # copy everything, because modules can't be assembled partially + # from other modules of the same name + dest_item = versioned_dir / rel_path + dest_item.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest_item) # Create empty __init__.py in versioned dirs (versioned_dir / "__init__.py").touch() From 2f8531017837578b05190cd8067e5af3e9674d1b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 16 Dec 2025 20:31:36 -0800 Subject: [PATCH 13/21] Fix merge script to selectively copy files to versioned directories Fix the wheel merge script to copy files selectively instead of copying everything, which was causing import errors: - Copy binaries (.so, .pyd, .dll) to versioned directories (version-specific) - Copy Python files (.py) to versioned directories (needed for imports like utils.py) - Do NOT copy Cython files (.pyx, .pxd) to versioned directories The previous "copy everything" approach caused failures when testing CUDA 12.9.1 because CUDA 13 .pyx files (which reference version-specific C functions like cuMemGetMemPool) were being copied into cu12/ directories, causing import errors. This selective approach fixes both: - Original issue: utils.py import failures (now .py files are copied) - Current issue: cuMemGetMemPool errors (now .pyx files are NOT copied) --- ci/tools/merge_cuda_core_wheels.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 5d53b82b61..945da3327e 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -118,11 +118,15 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: if any(part in ("cu12", "cu13") for part in rel_path.parts): continue - # copy everything, because modules can't be assembled partially - # from other modules of the same name - dest_item = versioned_dir / rel_path - dest_item.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(item, dest_item) + # Copy binaries and Python source files, but NOT Cython source files + # Binaries (.so, .pyd, .dll) are version-specific and must be in versioned dirs + # Python files (.py) like utils.py may be needed in versioned dirs for imports + # Cython files (.pyx, .pxd) should NOT be copied as they reference + # version-specific C functions and would cause import errors + if item.suffix in (".so", ".pyd", ".dll", ".py"): + dest_item = versioned_dir / rel_path + dest_item.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest_item) # Create empty __init__.py in versioned dirs (versioned_dir / "__init__.py").touch() From d6d5c34b86cb924331fecede384a64e4df2d953b Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Tue, 16 Dec 2025 20:35:33 -0800 Subject: [PATCH 14/21] Add debugging output to show wheel directory structures Add debugging output to show the directory structure of input and output wheels for troubleshooting. This will help diagnose issues with the wheel merge process. - Use Python's zipfile module (standard library) to list wheel contents - Add debugging output showing cuda/core/ directory structure for: * Each input wheel before merging * The output merged wheel after merging - Format output similar to unzip -l for readability - Filter output to show only cuda/core/ entries --- ci/tools/merge_cuda_core_wheels.py | 50 ++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 945da3327e..1e0360c9b8 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -25,6 +25,7 @@ import subprocess import sys import tempfile +import zipfile from pathlib import Path from typing import List @@ -54,6 +55,31 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: if len(wheels) == 1: raise RuntimeError("only one wheel is provided, nothing to merge") + # Debug: Show directory structure of input wheels + print("\n=== Input wheel directory structures ===", file=sys.stderr) + for i, wheel in enumerate(wheels): + print(f"\n--- Input wheel {i + 1}: {wheel.name} ---", file=sys.stderr) + try: + with zipfile.ZipFile(wheel, "r") as zf: + print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) + print("-" * 80, file=sys.stderr) + total_size = 0 + file_count = 0 + for name in sorted(zf.namelist()): + if "cuda/core/" in name: + info = zf.getinfo(name) + total_size += info.file_size + file_count += 1 + # Format similar to unzip -l output + date_time = info.date_time + date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" + time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" + print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) + print("-" * 80, file=sys.stderr) + print(f"{total_size:10d} {file_count} files", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) + # Extract all wheels to temporary directories with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) @@ -157,6 +183,30 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: merged_wheel = output_wheels[0] print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr) + + # Debug: Show directory structure of output wheel + print("\n=== Output wheel directory structure ===", file=sys.stderr) + try: + with zipfile.ZipFile(merged_wheel, "r") as zf: + print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) + print("-" * 80, file=sys.stderr) + total_size = 0 + file_count = 0 + for name in sorted(zf.namelist()): + if "cuda/core/" in name: + info = zf.getinfo(name) + total_size += info.file_size + file_count += 1 + # Format similar to unzip -l output + date_time = info.date_time + date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" + time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" + print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) + print("-" * 80, file=sys.stderr) + print(f"{total_size:10d} {file_count} files", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) + return 
merged_wheel From e8f6321bee18e9011caa0478a91428d47ac1972e Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 16 Dec 2025 22:22:58 -0800 Subject: [PATCH 15/21] Fix merge script to match main branch's working approach Based on analysis of main branch's merge script and CI logs, updated the merge logic to: 1. Use shutil.copytree() to copy entire directory trees into versioned subdirectories (cu12/, cu13/), matching main's approach for experimental/ 2. Clean up main cuda/core/ directory to only keep: - __init__.py - _include/ directory - Versioned subdirectories (cu12/, cu13/) This ensures Python imports from versioned directories instead of main directory, which may contain binaries from a different CUDA version. 3. Add DRY helper function print_wheel_directory_structure() for debugging output, refactored from duplicate code. 4. Update docstring to reference cuda/core instead of experimental namespace. This fixes the ImportError issues where Python was loading CUDA 13 binaries when running on CUDA 12.9.1, because the main directory still contained .so files from the base wheel. --- ci/tools/merge_cuda_core_wheels.py | 138 +++++++++++++---------------- 1 file changed, 64 insertions(+), 74 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 1e0360c9b8..46413033ba 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -47,6 +47,37 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub return result +def print_wheel_directory_structure(wheel_path: Path, filter_prefix: str = "cuda/core/", label: str = None): + """Print the directory structure of a wheel file, similar to unzip -l output. + + Args: + wheel_path: Path to the wheel file to inspect + filter_prefix: Only show files matching this prefix (default: "cuda/core/") + label: Optional label to print before the structure (e.g., "Input wheel 1: name.whl") + """ + if label: + print(f"\n--- {label} ---", file=sys.stderr) + try: + with zipfile.ZipFile(wheel_path, "r") as zf: + print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) + print("-" * 80, file=sys.stderr) + total_size = 0 + file_count = 0 + for name in sorted(zf.namelist()): + if filter_prefix in name: + info = zf.getinfo(name) + total_size += info.file_size + file_count += 1 + date_time = info.date_time + date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" + time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" + print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) + print("-" * 80, file=sys.stderr) + print(f"{total_size:10d} {file_count} files", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) + + def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: """Merge multiple wheels into a single wheel with version-specific binaries.""" print("\n=== Merging wheels ===", file=sys.stderr) @@ -55,31 +86,6 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: if len(wheels) == 1: raise RuntimeError("only one wheel is provided, nothing to merge") - # Debug: Show directory structure of input wheels - print("\n=== Input wheel directory structures ===", file=sys.stderr) - for i, wheel in enumerate(wheels): - print(f"\n--- Input wheel {i + 1}: {wheel.name} ---", file=sys.stderr) - try: - with zipfile.ZipFile(wheel, "r") as zf: - print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) - 
print("-" * 80, file=sys.stderr) - total_size = 0 - file_count = 0 - for name in sorted(zf.namelist()): - if "cuda/core/" in name: - info = zf.getinfo(name) - total_size += info.file_size - file_count += 1 - # Format similar to unzip -l output - date_time = info.date_time - date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" - time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" - print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) - print("-" * 80, file=sys.stderr) - print(f"{total_size:10d} {file_count} files", file=sys.stderr) - except Exception as e: - print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) - # Extract all wheels to temporary directories with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) @@ -117,46 +123,49 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: extracted_wheels.append(extract_dir) + # Debug: Show directory structure of input wheels + print("\n=== Input wheel directory structures ===", file=sys.stderr) + for i, wheel in enumerate(wheels): + print_wheel_directory_structure(wheel, label=f"Input wheel {i + 1}: {wheel.name}") + # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # Copy version-specific binaries from each wheel into versioned subdirectories - # Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/ + # Copy version-specific directories from each wheel into versioned subdirectories + # This matches the approach used on main branch: copy entire directory trees base_dir = Path("cuda") / "core" for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" - # Create versioned directory - versioned_dir.mkdir(parents=True, exist_ok=True) - - # Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel - # Python modules (.py, .pyx, .pxd) remain in cuda/core/ - # Exclude versioned directories (cu12/, cu13/) to avoid recursion - source_dir = wheel_dir / base_dir - for item in source_dir.rglob("*"): - if item.is_dir(): - continue - - # Skip files in versioned directories to avoid recursion - rel_path = item.relative_to(source_dir) - if any(part in ("cu12", "cu13") for part in rel_path.parts): - continue - - # Copy binaries and Python source files, but NOT Cython source files - # Binaries (.so, .pyd, .dll) are version-specific and must be in versioned dirs - # Python files (.py) like utils.py may be needed in versioned dirs for imports - # Cython files (.pyx, .pxd) should NOT be copied as they reference - # version-specific C functions and would cause import errors - if item.suffix in (".so", ".pyd", ".dll", ".py"): - dest_item = versioned_dir / rel_path - dest_item.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(item, dest_item) - - # Create empty __init__.py in versioned dirs + # Copy entire directory tree from source wheel to versioned directory + # This includes all files: .so, .pyx, .pxd, .py, .cpp, etc. 
+ print(f" Copying {wheel_dir / base_dir} to {versioned_dir}", file=sys.stderr) + shutil.copytree(wheel_dir / base_dir, versioned_dir, dirs_exist_ok=True) + + # Overwrite the __init__.py in versioned dirs to be empty (versioned_dir / "__init__.py").touch() + # The base dir should only contain __init__.py, the _include dir, and the versioned dirs + # Remove all other files and directories to ensure Python imports from versioned dirs + print("\n=== Removing files from main directory ===", file=sys.stderr) + files_to_remove = os.scandir(base_wheel / base_dir) + removed_count = 0 + for f in files_to_remove: + f_abspath = f.path + # Keep: __init__.py, _include directory, and versioned subdirectories (cu12, cu13) + if f.name not in ("__init__.py", "_include", "cu12", "cu13"): + if f.is_dir(): + print(f" Removing directory: {f.name}", file=sys.stderr) + shutil.rmtree(f_abspath) + removed_count += 1 + else: + print(f" Removing file: {f.name}", file=sys.stderr) + os.remove(f_abspath) + removed_count += 1 + print(f"Removed {removed_count} items from main directory", file=sys.stderr) + # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) @@ -186,26 +195,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Debug: Show directory structure of output wheel print("\n=== Output wheel directory structure ===", file=sys.stderr) - try: - with zipfile.ZipFile(merged_wheel, "r") as zf: - print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) - print("-" * 80, file=sys.stderr) - total_size = 0 - file_count = 0 - for name in sorted(zf.namelist()): - if "cuda/core/" in name: - info = zf.getinfo(name) - total_size += info.file_size - file_count += 1 - # Format similar to unzip -l output - date_time = info.date_time - date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" - time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" - print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) - print("-" * 80, file=sys.stderr) - print(f"{total_size:10d} {file_count} files", file=sys.stderr) - except Exception as e: - print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) + print_wheel_directory_structure(merged_wheel) return merged_wheel From 80194097e852f67a4f6d6430be0d51f06103c4e7 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 16 Dec 2025 23:27:16 -0800 Subject: [PATCH 16/21] Fix merge script: keep _version.py and __init__.pxd in main directory The merge script was removing _version.py and __init__.pxd from the main cuda/core/ directory, but these files are required: - _version.py: imported by __init__.py at module initialization - __init__.pxd: kept for Cython compatibility (matches main branch behavior) Updated the cleanup logic to preserve these files along with __init__.py, _include/, and versioned subdirectories (cu12/, cu13/). The code was also manually cleaned up for better readability and maintainability. 
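As a quick way to see the intended effect of this cleanup, the sketch below (not part of this patch; the helper name and the command-line wheel argument are illustrative) lists any top-level entries left under cuda/core/ in a merged wheel that fall outside the keep-list above. It uses only the standard-library zipfile module that the merge script already relies on for its debug listings.

    # Hypothetical standalone check, assuming the keep-list from this commit.
    import sys
    import zipfile
    from pathlib import PurePosixPath

    KEEP = {"__init__.py", "__init__.pxd", "_version.py", "_include", "cu12", "cu13"}

    def unexpected_entries(wheel_path: str) -> set:
        """Return top-level names under cuda/core/ that are not in the keep-list."""
        found = set()
        with zipfile.ZipFile(wheel_path) as zf:
            for name in zf.namelist():
                parts = PurePosixPath(name).parts
                # Record the first path component below the cuda/core/ prefix
                # (either a file name or a subdirectory name).
                if len(parts) > 2 and parts[:2] == ("cuda", "core"):
                    found.add(parts[2])
        return found - KEEP

    if __name__ == "__main__":
        extras = unexpected_entries(sys.argv[1])
        print("unexpected cuda/core/ entries:", sorted(extras) or "none")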
--- ci/tools/merge_cuda_core_wheels.py | 55 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 46413033ba..bc178734a1 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -78,7 +78,7 @@ def print_wheel_directory_structure(wheel_path: Path, filter_prefix: str = "cuda print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) -def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: +def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool = True) -> Path: """Merge multiple wheels into a single wheel with version-specific binaries.""" print("\n=== Merging wheels ===", file=sys.stderr) print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr) @@ -123,16 +123,15 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: extracted_wheels.append(extract_dir) - # Debug: Show directory structure of input wheels - print("\n=== Input wheel directory structures ===", file=sys.stderr) - for i, wheel in enumerate(wheels): - print_wheel_directory_structure(wheel, label=f"Input wheel {i + 1}: {wheel.name}") + if show_wheel_contents: + print("\n=== Input wheel directory structures ===", file=sys.stderr) + for i, wheel in enumerate(wheels): + print_wheel_directory_structure(wheel, label=f"Input wheel {i + 1}: {wheel.name}") # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] # Copy version-specific directories from each wheel into versioned subdirectories - # This matches the approach used on main branch: copy entire directory trees base_dir = Path("cuda") / "core" for i, wheel_dir in enumerate(extracted_wheels): @@ -140,31 +139,35 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" # Copy entire directory tree from source wheel to versioned directory - # This includes all files: .so, .pyx, .pxd, .py, .cpp, etc. 
print(f" Copying {wheel_dir / base_dir} to {versioned_dir}", file=sys.stderr) shutil.copytree(wheel_dir / base_dir, versioned_dir, dirs_exist_ok=True) # Overwrite the __init__.py in versioned dirs to be empty (versioned_dir / "__init__.py").touch() - # The base dir should only contain __init__.py, the _include dir, and the versioned dirs - # Remove all other files and directories to ensure Python imports from versioned dirs - print("\n=== Removing files from main directory ===", file=sys.stderr) - files_to_remove = os.scandir(base_wheel / base_dir) + print("\n=== Removing files from cuda/core/ directory ===", file=sys.stderr) + items_to_keep = ( + "__init__.py", + "__init__.pxd", + "_version.py", + "_include", + "cu12", + "cu13", + ) + all_items = os.scandir(base_wheel / base_dir) removed_count = 0 - for f in files_to_remove: + for f in all_items: f_abspath = f.path - # Keep: __init__.py, _include directory, and versioned subdirectories (cu12, cu13) - if f.name not in ("__init__.py", "_include", "cu12", "cu13"): - if f.is_dir(): - print(f" Removing directory: {f.name}", file=sys.stderr) - shutil.rmtree(f_abspath) - removed_count += 1 - else: - print(f" Removing file: {f.name}", file=sys.stderr) - os.remove(f_abspath) - removed_count += 1 - print(f"Removed {removed_count} items from main directory", file=sys.stderr) + if f.name in items_to_keep: + continue + if f.is_dir(): + print(f" Removing directory: {f.name}", file=sys.stderr) + shutil.rmtree(f_abspath) + else: + print(f" Removing file: {f.name}", file=sys.stderr) + os.remove(f_abspath) + removed_count += 1 + print(f"Removed {removed_count} items from cuda/core/ directory", file=sys.stderr) # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) @@ -193,9 +196,9 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: merged_wheel = output_wheels[0] print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr) - # Debug: Show directory structure of output wheel - print("\n=== Output wheel directory structure ===", file=sys.stderr) - print_wheel_directory_structure(merged_wheel) + if show_wheel_contents: + print("\n=== Output wheel directory structure ===", file=sys.stderr) + print_wheel_directory_structure(merged_wheel) return merged_wheel From 7c7e44485d7b0de8239b537d7fe01bb8c43fe714 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 17 Dec 2025 00:11:58 -0800 Subject: [PATCH 17/21] Rework top-level intro in cuda_core/docs/source/api.rst --- cuda_core/docs/source/api.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index af77b70859..e24334e476 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -6,7 +6,11 @@ ``cuda.core`` API Reference =========================== -This is the main API reference for ``cuda.core``. All APIs are stable and ready for production use. +This is the main API reference for ``cuda.core``. The package has not yet +reached version 1.0.0, and APIs may change between minor versions, possibly +without deprecation warnings. Once version 1.0.0 is released, APIs will +be considered stable and will follow semantic versioning with appropriate +deprecation periods for breaking changes. CUDA runtime From 815620e36fadefe8367b52c5708179e5bc2e8188 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Wed, 17 Dec 2025 00:48:35 -0800 Subject: [PATCH 18/21] Bug fix in ci/tools/merge_cuda_core_wheels.py: replace AI-generated .touch() with os.truncate() --- ci/tools/merge_cuda_core_wheels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index bc178734a1..e3f71ea282 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -143,7 +143,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool shutil.copytree(wheel_dir / base_dir, versioned_dir, dirs_exist_ok=True) # Overwrite the __init__.py in versioned dirs to be empty - (versioned_dir / "__init__.py").touch() + os.truncate(versioned_dir / "__init__.py", 0) print("\n=== Removing files from cuda/core/ directory ===", file=sys.stderr) items_to_keep = ( From 88c139c9b115ab3319b46c2d3288db7dce04263a Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 17 Dec 2025 06:40:02 -0800 Subject: [PATCH 19/21] Add backward compatibility tests for experimental namespace - Add test_experimental_backward_compat.py from v0 branch - Remove __getattr__ from experimental/__init__.py per reviewer feedback (underscored modules are not public APIs) - Update test to expect AttributeError when accessing underscored modules - Document in module docstring that underscored modules are intentionally not accessible through experimental namespace - Move note about underscored modules to module docstring (per pre-commit) --- cuda_core/cuda/core/experimental/__init__.py | 41 +---- .../test_experimental_backward_compat.py | 164 ++++++++++++++++++ 2 files changed, 167 insertions(+), 38 deletions(-) create mode 100644 cuda_core/tests/test_experimental_backward_compat.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 8adc1b78fd..95d548985f 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -10,6 +10,9 @@ directly from cuda.core instead of cuda.core.experimental. The experimental namespace will be removed in a future release. + +Note: Underscored modules (e.g., _device, _memory) are not public APIs +and are intentionally not made accessible here. """ import warnings @@ -21,8 +24,6 @@ def _warn_deprecated(): Note: This warning is only when the experimental module is first imported. Subsequent accesses to attributes (like utils, Device, etc.) do not trigger additional warnings since they are already set in the module namespace. - Only accessing submodules via __getattr__ (e.g., _device, _utils) will trigger - additional warnings. """ warnings.warn( "The cuda.core.experimental namespace is deprecated. 
" @@ -80,39 +81,3 @@ def _warn_deprecated(): system = System() __import__("sys").modules[__spec__.name + ".system"] = system del System - - -# Also create forwarding stubs for submodules -# These will be imported lazily when accessed -def __getattr__(name): - """Forward attribute access to the new location with deprecation warning.""" - if name in ( - "_context", - "_device", - "_dlpack", - "_event", - "_graph", - "_kernel_arg_handler", - "_launch_config", - "_launcher", - "_layout", - "_linker", - "_memory", - "_memoryview", - "_module", - "_program", - "_stream", - "_system", - "_utils", - ): - _warn_deprecated() - # Import the submodule from the new location - import importlib - - new_name = name.lstrip("_") - try: - return importlib.import_module(f"cuda.core.{new_name}") - except ImportError: - # Fallback to underscore-prefixed name - return importlib.import_module(f"cuda.core.{name}") - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py new file mode 100644 index 0000000000..4f71a082f2 --- /dev/null +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for backward compatibility of cuda.core.experimental namespace. + +These tests verify that the experimental namespace forwarding stubs work +correctly and emit appropriate deprecation warnings. +""" + +import pytest + + +# Test that experimental imports still work +def test_experimental_imports_work(): + """Test that imports from experimental namespace still work.""" + # Clear cached module to ensure warning is emitted + import sys + + if "cuda.core.experimental" in sys.modules: + del sys.modules["cuda.core.experimental"] + + # Test main module import - should emit deprecation warning + with pytest.deprecated_call(): + import cuda.core.experimental + + # Test that symbols are accessible + assert hasattr(cuda.core.experimental, "Device") + assert hasattr(cuda.core.experimental, "Stream") + assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "system") + + +def test_experimental_symbols_are_same_objects(): + """Test that experimental namespace symbols are the same objects as core.""" + import cuda.core + import cuda.core.experimental + + # Compare classes/types + assert cuda.core.experimental.Device is cuda.core.Device + assert cuda.core.experimental.Stream is cuda.core.Stream + assert cuda.core.experimental.Buffer is cuda.core.Buffer + assert cuda.core.experimental.MemoryResource is cuda.core.MemoryResource + assert cuda.core.experimental.Program is cuda.core.Program + assert cuda.core.experimental.Kernel is cuda.core.Kernel + assert cuda.core.experimental.ObjectCode is cuda.core.ObjectCode + assert cuda.core.experimental.Graph is cuda.core.Graph + assert cuda.core.experimental.GraphBuilder is cuda.core.GraphBuilder + assert cuda.core.experimental.Event is cuda.core.Event + assert cuda.core.experimental.Linker is cuda.core.Linker + + # Compare singletons + assert cuda.core.experimental.system is cuda.core.system + + +def test_experimental_direct_imports(): + """Test that direct imports from experimental submodules work.""" + # Clear any cached imports to ensure warnings are emitted + import sys + + if "cuda.core.experimental" in sys.modules: + del 
sys.modules["cuda.core.experimental"] + + # Test various import patterns - warning is emitted once at module import time + with pytest.deprecated_call(): + from cuda.core.experimental import ( + Buffer, + Device, + Stream, + ) + + # Verify objects are usable + assert Device is not None + assert Stream is not None + assert Buffer is not None + + +def test_experimental_submodule_access(): + """Test that accessing experimental underscored submodules raises AttributeError. + + Underscored modules are not public APIs and should not be accessible through + the experimental namespace. + """ + import cuda.core.experimental + + # Underscored modules should not be accessible (__getattr__ removed per reviewer feedback) + with pytest.raises(AttributeError): + _ = cuda.core.experimental._device + with pytest.raises(AttributeError): + _ = cuda.core.experimental._stream + with pytest.raises(AttributeError): + _ = cuda.core.experimental._memory + + +def test_experimental_utils_module(): + """Test that experimental.utils module works. + + Note: The deprecation warning is only emitted once at import time when + cuda.core.experimental is first imported. Accessing utils or importing + from utils does not trigger additional warnings since utils is already + set as an attribute in the module namespace. + """ + import cuda.core.experimental + + # Should be able to access utils (no warning on access, only on initial import) + assert hasattr(cuda.core.experimental, "utils") + assert cuda.core.experimental.utils is not None + + # Should have expected utilities (no warning on import from utils submodule) + from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory + + assert StridedMemoryView is not None + assert args_viewable_as_strided_memory is not None + + +def test_experimental_options_classes(): + """Test that options classes are accessible.""" + import cuda.core.experimental + + assert hasattr(cuda.core.experimental, "EventOptions") + assert hasattr(cuda.core.experimental, "StreamOptions") + assert hasattr(cuda.core.experimental, "LaunchConfig") + assert hasattr(cuda.core.experimental, "ProgramOptions") + assert hasattr(cuda.core.experimental, "LinkerOptions") + assert hasattr(cuda.core.experimental, "GraphCompleteOptions") + assert hasattr(cuda.core.experimental, "GraphDebugPrintOptions") + assert hasattr(cuda.core.experimental, "DeviceMemoryResourceOptions") + assert hasattr(cuda.core.experimental, "VirtualMemoryResourceOptions") + + # Verify they're the same objects + assert cuda.core.experimental.EventOptions is cuda.core.EventOptions + assert cuda.core.experimental.StreamOptions is cuda.core.StreamOptions + assert cuda.core.experimental.LaunchConfig is cuda.core.LaunchConfig + + +def test_experimental_memory_classes(): + """Test that memory-related classes are accessible.""" + import cuda.core.experimental + + assert hasattr(cuda.core.experimental, "MemoryResource") + assert hasattr(cuda.core.experimental, "DeviceMemoryResource") + assert hasattr(cuda.core.experimental, "LegacyPinnedMemoryResource") + assert hasattr(cuda.core.experimental, "VirtualMemoryResource") + assert hasattr(cuda.core.experimental, "GraphMemoryResource") + + # Verify they're the same objects + assert cuda.core.experimental.MemoryResource is cuda.core.MemoryResource + assert cuda.core.experimental.DeviceMemoryResource is cuda.core.DeviceMemoryResource + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_experimental_instantiations(): + """Test that objects can be instantiated 
through experimental namespace.""" + from cuda.core.experimental import Device + + # Should be able to create objects + device = Device() + assert device is not None + + # Verify it's the same type + from cuda.core import Device as CoreDevice + + assert isinstance(device, CoreDevice) From 55db7f3326bde8323a5266383ae57aaa26e54b47 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 17 Dec 2025 06:50:29 -0800 Subject: [PATCH 20/21] Revert test files to use experimental namespace imports Revert all changes under cuda_core/tests/ to match main branch, which uses cuda.core.experimental imports instead of cuda.core imports. This allows verification that the original tests still pass with the backward compatibility stubs, except possibly for direct imports of underscored (private) modules which are no longer accessible through the experimental namespace. --- cuda_core/tests/conftest.py | 8 +- cuda_core/tests/cython/build_tests.sh | 4 +- .../cython/test_get_cuda_native_handle.pyx | 2 +- .../example_tests/test_basic_examples.py | 2 +- cuda_core/tests/helpers/buffers.py | 4 +- cuda_core/tests/helpers/latch.py | 2 +- cuda_core/tests/helpers/nanosleep_kernel.py | 2 +- cuda_core/tests/memory_ipc/test_errors.py | 4 +- cuda_core/tests/memory_ipc/test_event_ipc.py | 2 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 2 +- .../tests/memory_ipc/test_peer_access.py | 4 +- .../tests/memory_ipc/test_send_buffers.py | 2 +- cuda_core/tests/memory_ipc/test_serialize.py | 2 +- cuda_core/tests/memory_ipc/test_workerpool.py | 2 +- cuda_core/tests/test_comparable.py | 8 +- cuda_core/tests/test_context.py | 6 +- cuda_core/tests/test_cuda_utils.py | 2 +- cuda_core/tests/test_device.py | 8 +- cuda_core/tests/test_event.py | 6 +- .../test_experimental_backward_compat.py | 164 ------------------ cuda_core/tests/test_graph.py | 4 +- cuda_core/tests/test_graph_mem.py | 2 +- cuda_core/tests/test_hashable.py | 8 +- cuda_core/tests/test_helpers.py | 2 +- cuda_core/tests/test_launcher.py | 10 +- cuda_core/tests/test_linker.py | 6 +- cuda_core/tests/test_memory.py | 14 +- cuda_core/tests/test_memory_peer_access.py | 8 +- cuda_core/tests/test_module.py | 16 +- .../tests/test_multiprocessing_warning.py | 10 +- cuda_core/tests/test_program.py | 20 +-- cuda_core/tests/test_stream.py | 8 +- cuda_core/tests/test_strided_layout.py | 2 +- cuda_core/tests/test_system.py | 4 +- cuda_core/tests/test_utils.py | 12 +- 35 files changed, 99 insertions(+), 263 deletions(-) delete mode 100644 cuda_core/tests/test_experimental_backward_compat.py diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 0dac8f7def..9aaf23498f 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -12,8 +12,8 @@ except ImportError: from cuda import cuda as driver -import cuda.core -from cuda.core import ( +import cuda.core.experimental +from cuda.core.experimental import ( Device, DeviceMemoryResource, DeviceMemoryResourceOptions, @@ -23,7 +23,7 @@ PinnedMemoryResourceOptions, _device, ) -from cuda.core._utils.cuda_utils import handle_return +from cuda.core.experimental._utils.cuda_utils import handle_return def skip_if_pinned_memory_unsupported(device): @@ -172,7 +172,7 @@ def mempool_device(): def _mempool_device_impl(num): - num_devices = len(cuda.core.Device.get_all_devices()) + num_devices = len(cuda.core.experimental.Device.get_all_devices()) if num_devices < num: pytest.skip(f"Test requires at least {num} GPUs") diff --git a/cuda_core/tests/cython/build_tests.sh 
b/cuda_core/tests/cython/build_tests.sh index 3e20136133..eb3303840d 100755 --- a/cuda_core/tests/cython/build_tests.sh +++ b/cuda_core/tests/cython/build_tests.sh @@ -6,10 +6,10 @@ UNAME=$(uname) if [ "$UNAME" == "Linux" ] ; then SCRIPTPATH=$(dirname $(realpath "$0")) - export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/_include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH + export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/experimental/include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH elif [[ "$UNAME" == CYGWIN* || "$UNAME" == MINGW* || "$UNAME" == MSYS* ]] ; then SCRIPTPATH="$(dirname $(cygpath -w $(realpath "$0")))" - CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\_include" | sed 's/\\/\\\\/g') + CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\experimental\include" | sed 's/\\/\\\\/g') export CL="/I\"${CUDA_CORE_INCLUDE_PATH}\" /I\"${CUDA_HOME}\\include\" ${CL}" else exit 1 diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index 2b105e13ae..0c3921e925 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -13,7 +13,7 @@ from cuda.bindings.nvrtc cimport nvrtcProgram as pynvrtcProgram from cuda.bindings.cydriver cimport CUstream, CUevent from cuda.bindings.cynvrtc cimport nvrtcProgram -from cuda.core import Device, Program +from cuda.core.experimental import Device, Program cdef extern from "utility.hpp": diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 640b53c2fc..450c60bf06 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -7,7 +7,7 @@ import os import pytest -from cuda.core import Device +from cuda.core.experimental import Device from .utils import run_example diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index 3004cd0d00..b4d769eab3 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,8 +3,8 @@ import ctypes -from cuda.core import Buffer, Device, MemoryResource -from cuda.core._utils.cuda_utils import driver, handle_return +from cuda.core.experimental import Buffer, Device, MemoryResource +from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . import libc diff --git a/cuda_core/tests/helpers/latch.py b/cuda_core/tests/helpers/latch.py index e35ee3325b..46516c1b06 100644 --- a/cuda_core/tests/helpers/latch.py +++ b/cuda_core/tests/helpers/latch.py @@ -4,7 +4,7 @@ import ctypes import pytest -from cuda.core import ( +from cuda.core.experimental import ( LaunchConfig, LegacyPinnedMemoryResource, Program, diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py index 99d32c9aa4..ea6ae34dcf 100644 --- a/cuda_core/tests/helpers/nanosleep_kernel.py +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from cuda.core import ( +from cuda.core.experimental import ( LaunchConfig, Program, ProgramOptions, diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index ccb3d3b7cc..d6280ae0ec 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -5,8 +5,8 @@ import pickle import re -from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 20 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 1fabaeddda..5edf97f2ae 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -4,7 +4,7 @@ import multiprocessing as mp import pytest -from cuda.core import Device, EventOptions +from cuda.core.experimental import Device, EventOptions from helpers.buffers import compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index d92a28ab5a..54d8056865 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,7 +3,7 @@ import multiprocessing as mp -from cuda.core import Buffer, DeviceMemoryResource +from cuda.core.experimental import Buffer, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 5a06133c9b..87dc459ffc 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -4,8 +4,8 @@ import multiprocessing as mp import pytest -from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 2df3fe1bbc..3493828c7e 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -5,7 +5,7 @@ from itertools import cycle import pytest -from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 546c8a91aa..f5686db28c 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -5,7 +5,7 @@ import multiprocessing.reduction import os -from cuda.core import Buffer, Device, DeviceMemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py 
b/cuda_core/tests/memory_ipc/test_workerpool.py index b13b9896a1..3f3f46cd27 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -6,7 +6,7 @@ from itertools import cycle import pytest -from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index a93e49e4e8..c99963cd23 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -8,10 +8,10 @@ across Device, Stream, Event, and Context objects. """ -from cuda.core import Device, Stream -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions -from cuda.core._stream import StreamOptions +from cuda.core.experimental import Device, Stream +from cuda.core.experimental._context import Context +from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._stream import StreamOptions # ============================================================================ # Equality Contract Tests diff --git a/cuda_core/tests/test_context.py b/cuda_core/tests/test_context.py index 5183aa1a85..4fe35dc18d 100644 --- a/cuda_core/tests/test_context.py +++ b/cuda_core/tests/test_context.py @@ -1,14 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import cuda.core +import cuda.core.experimental import pytest -from cuda.core import Device +from cuda.core.experimental import Device def test_context_init_disabled(): with pytest.raises(RuntimeError, match=r"^Context objects cannot be instantiated directly\."): - cuda.core._context.Context() # Ensure back door is locked. + cuda.core.experimental._context.Context() # Ensure back door is locked. # ============================================================================ diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index c68f8fb841..b0a0518652 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -4,7 +4,7 @@ import pytest from cuda.bindings import driver, runtime -from cuda.core._utils import cuda_utils +from cuda.core.experimental._utils import cuda_utils def test_driver_cu_result_explanations_health(): diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index e4365ac0c9..ebdc3e3ac4 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -6,15 +6,15 @@ except ImportError: from cuda import cuda as driver from cuda import cudart as runtime -import cuda.core +import cuda.core.experimental import pytest -from cuda.core import Device -from cuda.core._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return +from cuda.core.experimental import Device +from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return def test_device_init_disabled(): with pytest.raises(RuntimeError, match=r"^DeviceProperties cannot be instantiated directly\."): - cuda.core._device.DeviceProperties() # Ensure back door is locked. + cuda.core.experimental._device.DeviceProperties() # Ensure back door is locked. 
@pytest.fixture(scope="module") diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 0d8f3a3c2d..ec35448619 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -4,9 +4,9 @@ import math -import cuda.core +import cuda.core.experimental import pytest -from cuda.core import ( +from cuda.core.experimental import ( Device, Event, EventOptions, @@ -17,7 +17,7 @@ def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): - cuda.core._event.Event() # Ensure back door is locked. + cuda.core.experimental._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py deleted file mode 100644 index 4f71a082f2..0000000000 --- a/cuda_core/tests/test_experimental_backward_compat.py +++ /dev/null @@ -1,164 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -""" -Tests for backward compatibility of cuda.core.experimental namespace. - -These tests verify that the experimental namespace forwarding stubs work -correctly and emit appropriate deprecation warnings. -""" - -import pytest - - -# Test that experimental imports still work -def test_experimental_imports_work(): - """Test that imports from experimental namespace still work.""" - # Clear cached module to ensure warning is emitted - import sys - - if "cuda.core.experimental" in sys.modules: - del sys.modules["cuda.core.experimental"] - - # Test main module import - should emit deprecation warning - with pytest.deprecated_call(): - import cuda.core.experimental - - # Test that symbols are accessible - assert hasattr(cuda.core.experimental, "Device") - assert hasattr(cuda.core.experimental, "Stream") - assert hasattr(cuda.core.experimental, "Buffer") - assert hasattr(cuda.core.experimental, "system") - - -def test_experimental_symbols_are_same_objects(): - """Test that experimental namespace symbols are the same objects as core.""" - import cuda.core - import cuda.core.experimental - - # Compare classes/types - assert cuda.core.experimental.Device is cuda.core.Device - assert cuda.core.experimental.Stream is cuda.core.Stream - assert cuda.core.experimental.Buffer is cuda.core.Buffer - assert cuda.core.experimental.MemoryResource is cuda.core.MemoryResource - assert cuda.core.experimental.Program is cuda.core.Program - assert cuda.core.experimental.Kernel is cuda.core.Kernel - assert cuda.core.experimental.ObjectCode is cuda.core.ObjectCode - assert cuda.core.experimental.Graph is cuda.core.Graph - assert cuda.core.experimental.GraphBuilder is cuda.core.GraphBuilder - assert cuda.core.experimental.Event is cuda.core.Event - assert cuda.core.experimental.Linker is cuda.core.Linker - - # Compare singletons - assert cuda.core.experimental.system is cuda.core.system - - -def test_experimental_direct_imports(): - """Test that direct imports from experimental submodules work.""" - # Clear any cached imports to ensure warnings are emitted - import sys - - if "cuda.core.experimental" in sys.modules: - del sys.modules["cuda.core.experimental"] - - # Test various import patterns - warning is emitted once at module import time - with pytest.deprecated_call(): - from cuda.core.experimental import ( - Buffer, - Device, - Stream, - ) - - # Verify objects are usable - assert Device is not None - assert Stream 
is not None - assert Buffer is not None - - -def test_experimental_submodule_access(): - """Test that accessing experimental underscored submodules raises AttributeError. - - Underscored modules are not public APIs and should not be accessible through - the experimental namespace. - """ - import cuda.core.experimental - - # Underscored modules should not be accessible (__getattr__ removed per reviewer feedback) - with pytest.raises(AttributeError): - _ = cuda.core.experimental._device - with pytest.raises(AttributeError): - _ = cuda.core.experimental._stream - with pytest.raises(AttributeError): - _ = cuda.core.experimental._memory - - -def test_experimental_utils_module(): - """Test that experimental.utils module works. - - Note: The deprecation warning is only emitted once at import time when - cuda.core.experimental is first imported. Accessing utils or importing - from utils does not trigger additional warnings since utils is already - set as an attribute in the module namespace. - """ - import cuda.core.experimental - - # Should be able to access utils (no warning on access, only on initial import) - assert hasattr(cuda.core.experimental, "utils") - assert cuda.core.experimental.utils is not None - - # Should have expected utilities (no warning on import from utils submodule) - from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory - - assert StridedMemoryView is not None - assert args_viewable_as_strided_memory is not None - - -def test_experimental_options_classes(): - """Test that options classes are accessible.""" - import cuda.core.experimental - - assert hasattr(cuda.core.experimental, "EventOptions") - assert hasattr(cuda.core.experimental, "StreamOptions") - assert hasattr(cuda.core.experimental, "LaunchConfig") - assert hasattr(cuda.core.experimental, "ProgramOptions") - assert hasattr(cuda.core.experimental, "LinkerOptions") - assert hasattr(cuda.core.experimental, "GraphCompleteOptions") - assert hasattr(cuda.core.experimental, "GraphDebugPrintOptions") - assert hasattr(cuda.core.experimental, "DeviceMemoryResourceOptions") - assert hasattr(cuda.core.experimental, "VirtualMemoryResourceOptions") - - # Verify they're the same objects - assert cuda.core.experimental.EventOptions is cuda.core.EventOptions - assert cuda.core.experimental.StreamOptions is cuda.core.StreamOptions - assert cuda.core.experimental.LaunchConfig is cuda.core.LaunchConfig - - -def test_experimental_memory_classes(): - """Test that memory-related classes are accessible.""" - import cuda.core.experimental - - assert hasattr(cuda.core.experimental, "MemoryResource") - assert hasattr(cuda.core.experimental, "DeviceMemoryResource") - assert hasattr(cuda.core.experimental, "LegacyPinnedMemoryResource") - assert hasattr(cuda.core.experimental, "VirtualMemoryResource") - assert hasattr(cuda.core.experimental, "GraphMemoryResource") - - # Verify they're the same objects - assert cuda.core.experimental.MemoryResource is cuda.core.MemoryResource - assert cuda.core.experimental.DeviceMemoryResource is cuda.core.DeviceMemoryResource - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_experimental_instantiations(): - """Test that objects can be instantiated through experimental namespace.""" - from cuda.core.experimental import Device - - # Should be able to create objects - device = Device() - assert device is not None - - # Verify it's the same type - from cuda.core import Device as CoreDevice - - assert isinstance(device, CoreDevice) diff --git 
a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index aaad9304f4..e988eeebf6 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,7 +11,7 @@ from cuda.bindings import nvrtc except ImportError: from cuda import nvrtc -from cuda.core import ( +from cuda.core.experimental import ( Device, GraphBuilder, GraphCompleteOptions, @@ -22,7 +22,7 @@ ProgramOptions, launch, ) -from cuda.core._utils.cuda_utils import NVRTCError, handle_return +from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return def _common_kernels(): diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 5159fd2b2b..6f39c287d6 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core import ( +from cuda.core.experimental import ( Device, DeviceMemoryResource, GraphCompleteOptions, diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 9bc89969a2..4aa801866f 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -12,10 +12,10 @@ 5. Hash/equality contract compliance (if a == b, then hash(a) must equal hash(b)) """ -from cuda.core import Device -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions -from cuda.core._stream import Stream, StreamOptions +from cuda.core.experimental import Device +from cuda.core.experimental._context import Context +from cuda.core.experimental._event import Event, EventOptions +from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ # Integration Tests diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py index 8230f08088..65df23980c 100644 --- a/cuda_core/tests/test_helpers.py +++ b/cuda_core/tests/test_helpers.py @@ -5,7 +5,7 @@ import time import pytest -from cuda.core import Device +from cuda.core.experimental import Device from helpers import IS_WINDOWS, IS_WSL from helpers.buffers import PatternGen, compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index ae3e5531c1..d2e0a89a28 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -12,7 +12,7 @@ cp = None import numpy as np import pytest -from cuda.core import ( +from cuda.core.experimental import ( Device, DeviceMemoryResource, LaunchConfig, @@ -21,8 +21,8 @@ ProgramOptions, launch, ) -from cuda.core._memory import _SynchronousMemoryResource -from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental._memory import _SynchronousMemoryResource +from cuda.core.experimental._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers @@ -95,7 +95,7 @@ def test_launch_config_cluster_grid_conversion(init_cuda): def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" - from cuda.core._launch_config import _to_native_launch_config + from cuda.core.experimental._launch_config import _to_native_launch_config try: # Test case 1: 1D - Issue #867 example @@ -264,7 +264,7 @@ def test_cooperative_launch(): # # Commented out as this seems to be a sticky error... 
# config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) - # from cuda.core._utils.cuda_utils import CUDAError + # from cuda.core.experimental._utils.cuda_utils import CUDAError # with pytest.raises(CUDAError) as e: # s.sync() # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 0cb3b8e95f..b7af4b6ab7 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,9 +3,9 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker -from cuda.core._module import ObjectCode -from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker +from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils.cuda_utils import CUDAError ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0f6288bcf7..23572014bb 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -17,7 +17,7 @@ import re import pytest -from cuda.core import ( +from cuda.core.experimental import ( Buffer, Device, DeviceMemoryResource, @@ -31,13 +31,13 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core import ( +from cuda.core.experimental import ( system as ccx_system, ) -from cuda.core._dlpack import DLDeviceType -from cuda.core._memory import IPCBufferDescriptor -from cuda.core._utils.cuda_utils import CUDAError, handle_return -from cuda.core.utils import StridedMemoryView +from cuda.core.experimental._dlpack import DLDeviceType +from cuda.core.experimental._memory import IPCBufferDescriptor +from cuda.core.experimental._utils.cuda_utils import CUDAError, handle_return +from cuda.core.experimental.utils import StridedMemoryView from helpers import IS_WINDOWS from helpers.buffers import DummyUnifiedMemoryResource @@ -149,7 +149,7 @@ def test_package_contents(): "VirtualMemoryResource", ] d = {} - exec("from cuda.core._memory import *", d) # noqa: S102 + exec("from cuda.core.experimental._memory import *", d) # noqa: S102 d = {k: v for k, v in d.items() if not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index bcae9576da..d17cdfd089 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -1,10 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import cuda.core +import cuda.core.experimental import pytest -from cuda.core import DeviceMemoryResource -from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import DeviceMemoryResource +from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer NBYTES = 1024 @@ -72,7 +72,7 @@ def check(expected): with pytest.raises(ValueError, match=r"device_id must be \>\= 0"): dmr.peer_accessible_by = [-1] # device ID out of bounds - num_devices = len(cuda.core.Device.get_all_devices()) + num_devices = len(cuda.core.experimental.Device.get_all_devices()) with pytest.raises(ValueError, match=r"device_id must be within \[0, \d+\)"): dmr.peer_accessible_by = [num_devices] # device ID out of bounds diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 4b3817ece4..25b8d5dd86 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -5,10 +5,10 @@ import pickle import warnings -import cuda.core +import cuda.core.experimental import pytest -from cuda.core import Device, ObjectCode, Program, ProgramOptions -from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions +from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: import numba @@ -41,17 +41,17 @@ def cuda12_4_prerequisite_check(): def test_kernel_attributes_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelAttributes cannot be instantiated directly\."): - cuda.core._module.KernelAttributes() # Ensure back door is locked. + cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): - cuda.core._module.KernelOccupancy() # Ensure back door is locked. + cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. def test_kernel_init_disabled(): with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."): - cuda.core._module.Kernel() # Ensure back door is locked. + cuda.core.experimental._module.Kernel() # Ensure back door is locked. 
def test_object_code_init_disabled(): @@ -387,7 +387,7 @@ def test_occupancy_max_active_clusters(get_saxpy_kernel_cubin, cluster): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.LaunchConfig(grid=128, block=64, cluster=cluster) + launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64, cluster=cluster) query_fn = kernel.occupancy.max_active_clusters max_active_clusters = query_fn(launch_config) assert isinstance(max_active_clusters, int) @@ -402,7 +402,7 @@ def test_occupancy_max_potential_cluster_size(get_saxpy_kernel_cubin): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.LaunchConfig(grid=128, block=64) + launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64) query_fn = kernel.occupancy.max_potential_cluster_size max_potential_cluster_size = query_fn(launch_config) assert isinstance(max_potential_cluster_size, int) diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 2248cd2cd6..8b490af233 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,11 +12,11 @@ import warnings from unittest.mock import patch -from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions -from cuda.core._event import _reduce_event -from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource -from cuda.core._memory._ipc import _reduce_allocation_handle -from cuda.core._utils.cuda_utils import reset_fork_warning +from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions +from cuda.core.experimental._event import _reduce_event +from cuda.core.experimental._memory._device_memory_resource import _deep_reduce_device_memory_resource +from cuda.core.experimental._memory._ipc import _reduce_allocation_handle +from cuda.core.experimental._utils.cuda_utils import reset_fork_warning def test_warn_on_fork_method_device_memory_resource(ipc_device): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9a9e4926ae..f432e3f88d 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,11 +6,11 @@ import warnings import pytest -from cuda.core import _linker -from cuda.core._device import Device -from cuda.core._module import Kernel, ObjectCode -from cuda.core._program import Program, ProgramOptions -from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core.experimental import _linker +from cuda.core.experimental._device import Device +from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions +from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -19,7 +19,7 @@ def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core._program import _get_nvvm_module + from cuda.core.experimental._program import _get_nvvm_module _get_nvvm_module() return True @@ -32,7 +32,7 @@ def _is_nvvm_available(): ) try: - from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc + from 
cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -92,7 +92,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core._program import _get_nvvm_module + from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -140,7 +140,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core._program import _get_nvvm_module + from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -415,7 +415,7 @@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core._program import _get_nvvm_module + from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 01b0b861af..695a70e931 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from cuda.core import Device, Stream, StreamOptions -from cuda.core._event import Event -from cuda.core._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM -from cuda.core._utils.cuda_utils import driver +from cuda.core.experimental import Device, Stream, StreamOptions +from cuda.core.experimental._event import Event +from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM +from cuda.core.experimental._utils.cuda_utils import driver from helpers.misc import StreamWrapper diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py index d897f78cfe..c615365cf8 100644 --- a/cuda_core/tests/test_strided_layout.py +++ b/cuda_core/tests/test_strided_layout.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from cuda.core._layout import _StridedLayout +from cuda.core.experimental._layout import _StridedLayout from helpers.layout import ( DenseOrder, LayoutSpec, diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index 60b7ef7ec7..da81bbec99 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -7,8 +7,8 @@ from cuda import cuda as driver from cuda import cudart as runtime -from cuda.core import Device, system -from cuda.core._utils.cuda_utils import handle_return +from cuda.core.experimental import Device, system +from cuda.core.experimental._utils.cuda_utils import handle_return def test_system_singleton(): diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 1bb432de6c..8bb66ef60d 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -12,16 +12,16 @@ from numba import cuda as numba_cuda except ImportError: numba_cuda = None -import cuda.core +import cuda.core.experimental import numpy as np import pytest -from cuda.core import Device -from cuda.core._layout import _StridedLayout -from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core.experimental import Device +from cuda.core.experimental._layout import _StridedLayout +from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory def test_cast_to_3_tuple_success(): - c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple + c3t = 
cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple assert c3t("", ()) == (1, 1, 1) assert c3t("", 2) == (2, 1, 1) assert c3t("", (2,)) == (2, 1, 1) @@ -45,7 +45,7 @@ def test_cast_to_3_tuple_success(): ) def test_cast_to_3_tuple_value_error(cfg, expected): with pytest.raises(ValueError, match=expected): - cuda.core._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) + cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) def convert_strides_to_counts(strides, itemsize): From c75fe3f6f98914071607a4f5b721b1a91765bf43 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Wed, 17 Dec 2025 07:03:12 -0800 Subject: [PATCH 21/21] Replace all cuda.core.experimental._ with cuda.core._ --- cuda_core/tests/conftest.py | 4 ++-- cuda_core/tests/helpers/buffers.py | 2 +- cuda_core/tests/memory_ipc/test_errors.py | 2 +- .../tests/memory_ipc/test_peer_access.py | 2 +- cuda_core/tests/test_comparable.py | 6 +++--- cuda_core/tests/test_context.py | 2 +- cuda_core/tests/test_cuda_utils.py | 2 +- cuda_core/tests/test_device.py | 4 ++-- cuda_core/tests/test_event.py | 2 +- cuda_core/tests/test_graph.py | 2 +- cuda_core/tests/test_hashable.py | 6 +++--- cuda_core/tests/test_launcher.py | 8 ++++---- cuda_core/tests/test_linker.py | 7 ++++--- cuda_core/tests/test_memory.py | 8 ++++---- cuda_core/tests/test_memory_peer_access.py | 2 +- cuda_core/tests/test_module.py | 8 ++++---- .../tests/test_multiprocessing_warning.py | 8 ++++---- cuda_core/tests/test_program.py | 20 +++++++++---------- cuda_core/tests/test_stream.py | 6 +++--- cuda_core/tests/test_strided_layout.py | 2 +- cuda_core/tests/test_system.py | 2 +- cuda_core/tests/test_utils.py | 6 +++--- 22 files changed, 56 insertions(+), 55 deletions(-) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9aaf23498f..114e9af296 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -13,6 +13,8 @@ from cuda import cuda as driver import cuda.core.experimental +from cuda.core import _device +from cuda.core._utils.cuda_utils import handle_return from cuda.core.experimental import ( Device, DeviceMemoryResource, @@ -21,9 +23,7 @@ ManagedMemoryResourceOptions, PinnedMemoryResource, PinnedMemoryResourceOptions, - _device, ) -from cuda.core.experimental._utils.cuda_utils import handle_return def skip_if_pinned_memory_unsupported(device): diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index b4d769eab3..eb02ea9c8b 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,8 +3,8 @@ import ctypes +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core.experimental import Buffer, Device, MemoryResource -from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . 
import libc diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6280ae0ec..0d847c914f 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -5,8 +5,8 @@ import pickle import re +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 20 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 87dc459ffc..aa18e4059e 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -4,8 +4,8 @@ import multiprocessing as mp import pytest +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..84f440ae99 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -8,10 +8,10 @@ across Device, Stream, Event, and Context objects. """ +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import StreamOptions from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import StreamOptions # ============================================================================ # Equality Contract Tests diff --git a/cuda_core/tests/test_context.py b/cuda_core/tests/test_context.py index 4fe35dc18d..133ebb4a5d 100644 --- a/cuda_core/tests/test_context.py +++ b/cuda_core/tests/test_context.py @@ -8,7 +8,7 @@ def test_context_init_disabled(): with pytest.raises(RuntimeError, match=r"^Context objects cannot be instantiated directly\."): - cuda.core.experimental._context.Context() # Ensure back door is locked. + cuda.core._context.Context() # Ensure back door is locked. 
# ============================================================================ diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index b0a0518652..c68f8fb841 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -4,7 +4,7 @@ import pytest from cuda.bindings import driver, runtime -from cuda.core.experimental._utils import cuda_utils +from cuda.core._utils import cuda_utils def test_driver_cu_result_explanations_health(): diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index ebdc3e3ac4..5e524d3d63 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -8,13 +8,13 @@ from cuda import cudart as runtime import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return from cuda.core.experimental import Device -from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return def test_device_init_disabled(): with pytest.raises(RuntimeError, match=r"^DeviceProperties cannot be instantiated directly\."): - cuda.core.experimental._device.DeviceProperties() # Ensure back door is locked. + cuda.core._device.DeviceProperties() # Ensure back door is locked. @pytest.fixture(scope="module") diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..a99a1448d5 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -17,7 +17,7 @@ def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): - cuda.core.experimental._event.Event() # Ensure back door is locked. + cuda.core._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index e988eeebf6..77a2ee2489 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,6 +11,7 @@ from cuda.bindings import nvrtc except ImportError: from cuda import nvrtc +from cuda.core._utils.cuda_utils import NVRTCError, handle_return from cuda.core.experimental import ( Device, GraphBuilder, @@ -22,7 +23,6 @@ ProgramOptions, launch, ) -from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return def _common_kernels(): diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..28a00605ce 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -12,10 +12,10 @@ 5. 
Hash/equality contract compliance (if a == b, then hash(a) must equal hash(b)) """ +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import Stream, StreamOptions from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ # Integration Tests diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index d2e0a89a28..405e27c111 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -12,6 +12,8 @@ cp = None import numpy as np import pytest +from cuda.core._memory import _SynchronousMemoryResource +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import ( Device, DeviceMemoryResource, @@ -21,8 +23,6 @@ ProgramOptions, launch, ) -from cuda.core.experimental._memory import _SynchronousMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers @@ -95,7 +95,7 @@ def test_launch_config_cluster_grid_conversion(init_cuda): def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" - from cuda.core.experimental._launch_config import _to_native_launch_config + from cuda.core._launch_config import _to_native_launch_config try: # Test case 1: 1D - Issue #867 example @@ -264,7 +264,7 @@ def test_cooperative_launch(): # # Commented out as this seems to be a sticky error... # config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) - # from cuda.core.experimental._utils.cuda_utils import CUDAError + # from cuda.core._utils.cuda_utils import CUDAError # with pytest.raises(CUDAError) as e: # s.sync() # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index b7af4b6ab7..b05aa7586b 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,9 +3,10 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import _linker +from cuda.core._module import ObjectCode +from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 23572014bb..2ff844ef93 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -17,6 +17,9 @@ import re import pytest +from cuda.core._dlpack import DLDeviceType +from cuda.core._memory import IPCBufferDescriptor +from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.experimental import ( Buffer, Device, @@ -34,9 +37,6 @@ from cuda.core.experimental import ( system as ccx_system, ) -from cuda.core.experimental._dlpack import DLDeviceType -from cuda.core.experimental._memory import IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import CUDAError, handle_return from cuda.core.experimental.utils import 
StridedMemoryView from helpers import IS_WINDOWS from helpers.buffers import DummyUnifiedMemoryResource @@ -149,7 +149,7 @@ def test_package_contents(): "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + exec("from cuda.core._memory import *", d) # noqa: S102 d = {k: v for k, v in d.items() if not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index d17cdfd089..4067eb857a 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -3,8 +3,8 @@ import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import DeviceMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer NBYTES = 1024 diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 25b8d5dd86..041fe2f8cc 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,8 +7,8 @@ import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: import numba @@ -41,17 +41,17 @@ def cuda12_4_prerequisite_check(): def test_kernel_attributes_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelAttributes cannot be instantiated directly\."): - cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. + cuda.core._module.KernelAttributes() # Ensure back door is locked. def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): - cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. + cuda.core._module.KernelOccupancy() # Ensure back door is locked. def test_kernel_init_disabled(): with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."): - cuda.core.experimental._module.Kernel() # Ensure back door is locked. + cuda.core._module.Kernel() # Ensure back door is locked. 
def test_object_code_init_disabled(): diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 8b490af233..0743b7f71d 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,11 +12,11 @@ import warnings from unittest.mock import patch +from cuda.core._event import _reduce_event +from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource +from cuda.core._memory._ipc import _reduce_allocation_handle +from cuda.core._utils.cuda_utils import reset_fork_warning from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions -from cuda.core.experimental._event import _reduce_event -from cuda.core.experimental._memory._device_memory_resource import _deep_reduce_device_memory_resource -from cuda.core.experimental._memory._ipc import _reduce_allocation_handle -from cuda.core.experimental._utils.cuda_utils import reset_fork_warning def test_warn_on_fork_method_device_memory_resource(ipc_device): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index f432e3f88d..9a9e4926ae 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,11 +6,11 @@ import warnings import pytest -from cuda.core.experimental import _linker -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core import _linker +from cuda.core._device import Device +from cuda.core._module import Kernel, ObjectCode +from cuda.core._program import Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -19,7 +19,7 @@ def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module _get_nvvm_module() return True @@ -32,7 +32,7 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc + from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -92,7 +92,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -140,7 +140,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -415,7 +415,7 @@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..f05a7d3b3a 100644 --- a/cuda_core/tests/test_stream.py +++ 
b/cuda_core/tests/test_stream.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +from cuda.core._event import Event +from cuda.core._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM +from cuda.core._utils.cuda_utils import driver from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM -from cuda.core.experimental._utils.cuda_utils import driver from helpers.misc import StreamWrapper diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py index c615365cf8..d897f78cfe 100644 --- a/cuda_core/tests/test_strided_layout.py +++ b/cuda_core/tests/test_strided_layout.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from cuda.core.experimental._layout import _StridedLayout +from cuda.core._layout import _StridedLayout from helpers.layout import ( DenseOrder, LayoutSpec, diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index da81bbec99..d52629ded7 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -7,8 +7,8 @@ from cuda import cuda as driver from cuda import cudart as runtime +from cuda.core._utils.cuda_utils import handle_return from cuda.core.experimental import Device, system -from cuda.core.experimental._utils.cuda_utils import handle_return def test_system_singleton(): diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 8bb66ef60d..d8c747bdb5 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -15,13 +15,13 @@ import cuda.core.experimental import numpy as np import pytest +from cuda.core._layout import _StridedLayout from cuda.core.experimental import Device -from cuda.core.experimental._layout import _StridedLayout from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory def test_cast_to_3_tuple_success(): - c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple + c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple assert c3t("", ()) == (1, 1, 1) assert c3t("", 2) == (2, 1, 1) assert c3t("", (2,)) == (2, 1, 1) @@ -45,7 +45,7 @@ def test_cast_to_3_tuple_success(): ) def test_cast_to_3_tuple_value_error(cfg, expected): with pytest.raises(ValueError, match=expected): - cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) + cuda.core._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) def convert_strides_to_counts(strides, itemsize):